The following changes since commit d9ccf33f9479201e5add8db0af68ca9ca8da358b:

  Merge remote-tracking branch 'remotes/lvivier-gitlab/tags/linux-user-for-7.0-pull-request' into staging (2022-03-09 20:01:17 +0000)

are available in the git repository at:

  https://github.com/jasowang/qemu.git tags/net-pull-request

for you to fetch changes up to eea40402ecf895ed345f8e8eb07dbb484f4542c5:

  vdpa: Expose VHOST_F_LOG_ALL on SVQ (2022-03-10 10:26:32 +0800)

----------------------------------------------------------------

----------------------------------------------------------------
Eugenio Pérez (14):
      vhost: Add VhostShadowVirtqueue
      vhost: Add Shadow VirtQueue kick forwarding capabilities
      vhost: Add Shadow VirtQueue call forwarding capabilities
      vhost: Add vhost_svq_valid_features to shadow vq
      virtio: Add vhost_svq_get_vring_addr
      vdpa: adapt vhost_ops callbacks to svq
      vhost: Shadow virtqueue buffers forwarding
      util: Add iova_tree_alloc_map
      util: add iova_tree_find_iova
      vhost: Add VhostIOVATree
      vdpa: Add custom IOTLB translations to SVQ
      vdpa: Adapt vhost_vdpa_get_vring_base to SVQ
      vdpa: Never set log_base addr if SVQ is enabled
      vdpa: Expose VHOST_F_LOG_ALL on SVQ

Jason Wang (1):
      virtio-net: fix map leaking on error during receive

 hw/net/virtio-net.c                |   1 +
 hw/virtio/meson.build              |   2 +-
 hw/virtio/vhost-iova-tree.c        | 110 +++++++
 hw/virtio/vhost-iova-tree.h        |  27 ++
 hw/virtio/vhost-shadow-virtqueue.c | 638 +++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  87 +++++
 hw/virtio/vhost-vdpa.c             | 525 +++++++++++++++++++++++++++++-
 include/hw/virtio/vhost-vdpa.h     |   8 +
 include/qemu/iova-tree.h           |  38 ++-
 util/iova-tree.c                   | 169 ++++++++++
 10 files changed, 1588 insertions(+), 17 deletions(-)
 create mode 100644 hw/virtio/vhost-iova-tree.c
 create mode 100644 hw/virtio/vhost-iova-tree.h
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.h

New patch | |||
Commit bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
tried to fix the use-after-free of the sg by caching the virtqueue
elements in an array and unmapping them at once after the packets have
been received, but it forgot to unmap the cached elements on the error
path, which leads to leaked mappings and other unexpected results.

Fix this by detaching the cached elements on error. This addresses
CVE-2022-26353.
9 | |||
10 | Reported-by: Victor Tom <vv474172261@gmail.com> | ||
11 | Cc: qemu-stable@nongnu.org | ||
12 | Fixes: CVE-2022-26353 | ||
13 | Fixes: bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg") | ||
14 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
15 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
16 | --- | ||
17 | hw/net/virtio-net.c | 1 + | ||
18 | 1 file changed, 1 insertion(+) | ||
19 | |||
20 | diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c | ||
21 | index XXXXXXX..XXXXXXX 100644 | ||
22 | --- a/hw/net/virtio-net.c | ||
23 | +++ b/hw/net/virtio-net.c | ||
24 | @@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, | ||
25 | |||
26 | err: | ||
27 | for (j = 0; j < i; j++) { | ||
28 | + virtqueue_detach_element(q->rx_vq, elems[j], lens[j]); | ||
29 | g_free(elems[j]); | ||
30 | } | ||
31 | |||
32 | -- | ||
33 | 2.7.4
New patch | |||
From: Eugenio Pérez <eperezma@redhat.com>

Vhost shadow virtqueue (SVQ) is an intermediate jump for virtqueue
notifications and buffers, allowing qemu to track them. While qemu is
forwarding the buffers and virtqueue changes, it is able to commit the
memory it is dirtying, the same way qemu's regular emulated VirtIO
devices do.

This commit only exposes basic SVQ allocation and free. Later patches
in the series add functionality such as notification and buffer
forwarding.
11 | |||
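As a point of reference for how the API introduced here is meant to be
used, below is a minimal sketch (assuming a QEMU build environment with
these headers; the helper name is made up for illustration) that mirrors
the per-queue allocation added later in this series:

    #include "qemu/osdep.h"
    #include "hw/virtio/vhost-shadow-virtqueue.h"

    /* Hypothetical helper: create one SVQ per queue. On failure, everything
     * allocated so far is released automatically by the g_auto* cleanups. */
    static GPtrArray *example_alloc_svqs(unsigned nvqs)
    {
        g_autoptr(GPtrArray) svqs = g_ptr_array_new_full(nvqs, vhost_svq_free);

        for (unsigned n = 0; n < nvqs; ++n) {
            g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();

            if (!svq) {
                return NULL;
            }
            g_ptr_array_add(svqs, g_steal_pointer(&svq));
        }

        return g_steal_pointer(&svqs);
    }

Using vhost_svq_free as the GPtrArray element destructor means every
early return frees both the array and any SVQs already created.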
12 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
14 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
15 | --- | ||
16 | hw/virtio/meson.build | 2 +- | ||
17 | hw/virtio/vhost-shadow-virtqueue.c | 62 ++++++++++++++++++++++++++++++++++++++ | ||
18 | hw/virtio/vhost-shadow-virtqueue.h | 28 +++++++++++++++++ | ||
19 | 3 files changed, 91 insertions(+), 1 deletion(-) | ||
20 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.c | ||
21 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.h | ||
22 | |||
23 | diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build | ||
24 | index XXXXXXX..XXXXXXX 100644 | ||
25 | --- a/hw/virtio/meson.build | ||
26 | +++ b/hw/virtio/meson.build | ||
27 | @@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) | ||
28 | |||
29 | virtio_ss = ss.source_set() | ||
30 | virtio_ss.add(files('virtio.c')) | ||
31 | -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c')) | ||
32 | +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c')) | ||
33 | virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) | ||
34 | virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) | ||
35 | virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) | ||
36 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
37 | new file mode 100644 | ||
38 | index XXXXXXX..XXXXXXX | ||
39 | --- /dev/null | ||
40 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
41 | @@ -XXX,XX +XXX,XX @@ | ||
42 | +/* | ||
43 | + * vhost shadow virtqueue | ||
44 | + * | ||
45 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 | ||
46 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
47 | + * | ||
48 | + * SPDX-License-Identifier: GPL-2.0-or-later | ||
49 | + */ | ||
50 | + | ||
51 | +#include "qemu/osdep.h" | ||
52 | +#include "hw/virtio/vhost-shadow-virtqueue.h" | ||
53 | + | ||
54 | +#include "qemu/error-report.h" | ||
55 | + | ||
56 | +/** | ||
57 | + * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
58 | + * shadow methods and file descriptors. | ||
59 | + * | ||
60 | + * Returns the new virtqueue or NULL. | ||
61 | + * | ||
62 | + * In case of error, reason is reported through error_report. | ||
63 | + */ | ||
64 | +VhostShadowVirtqueue *vhost_svq_new(void) | ||
65 | +{ | ||
66 | + g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); | ||
67 | + int r; | ||
68 | + | ||
69 | + r = event_notifier_init(&svq->hdev_kick, 0); | ||
70 | + if (r != 0) { | ||
71 | + error_report("Couldn't create kick event notifier: %s (%d)", | ||
72 | + g_strerror(errno), errno); | ||
73 | + goto err_init_hdev_kick; | ||
74 | + } | ||
75 | + | ||
76 | + r = event_notifier_init(&svq->hdev_call, 0); | ||
77 | + if (r != 0) { | ||
78 | + error_report("Couldn't create call event notifier: %s (%d)", | ||
79 | + g_strerror(errno), errno); | ||
80 | + goto err_init_hdev_call; | ||
81 | + } | ||
82 | + | ||
83 | + return g_steal_pointer(&svq); | ||
84 | + | ||
85 | +err_init_hdev_call: | ||
86 | + event_notifier_cleanup(&svq->hdev_kick); | ||
87 | + | ||
88 | +err_init_hdev_kick: | ||
89 | + return NULL; | ||
90 | +} | ||
91 | + | ||
92 | +/** | ||
93 | + * Free the resources of the shadow virtqueue. | ||
94 | + * | ||
95 | + * @pvq: gpointer to SVQ so it can be used by autofree functions. | ||
96 | + */ | ||
97 | +void vhost_svq_free(gpointer pvq) | ||
98 | +{ | ||
99 | + VhostShadowVirtqueue *vq = pvq; | ||
100 | + event_notifier_cleanup(&vq->hdev_kick); | ||
101 | + event_notifier_cleanup(&vq->hdev_call); | ||
102 | + g_free(vq); | ||
103 | +} | ||
104 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
105 | new file mode 100644 | ||
106 | index XXXXXXX..XXXXXXX | ||
107 | --- /dev/null | ||
108 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
109 | @@ -XXX,XX +XXX,XX @@ | ||
110 | +/* | ||
111 | + * vhost shadow virtqueue | ||
112 | + * | ||
113 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 | ||
114 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
115 | + * | ||
116 | + * SPDX-License-Identifier: GPL-2.0-or-later | ||
117 | + */ | ||
118 | + | ||
119 | +#ifndef VHOST_SHADOW_VIRTQUEUE_H | ||
120 | +#define VHOST_SHADOW_VIRTQUEUE_H | ||
121 | + | ||
122 | +#include "qemu/event_notifier.h" | ||
123 | + | ||
124 | +/* Shadow virtqueue to relay notifications */ | ||
125 | +typedef struct VhostShadowVirtqueue { | ||
126 | + /* Shadow kick notifier, sent to vhost */ | ||
127 | + EventNotifier hdev_kick; | ||
128 | + /* Shadow call notifier, sent to vhost */ | ||
129 | + EventNotifier hdev_call; | ||
130 | +} VhostShadowVirtqueue; | ||
131 | + | ||
132 | +VhostShadowVirtqueue *vhost_svq_new(void); | ||
133 | + | ||
134 | +void vhost_svq_free(gpointer vq); | ||
135 | +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); | ||
136 | + | ||
137 | +#endif | ||
138 | -- | ||
139 | 2.7.4 | ||
140 | |||
New patch | |||
From: Eugenio Pérez <eperezma@redhat.com>

In this mode no buffer forwarding is performed in SVQ: qemu just
forwards the guest's kicks to the device.

Host memory notifier regions are left out for simplicity, and they will
not be addressed in this series.
8 | |||
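For illustration only (this is not QEMU code), the relay set up by this
patch can be reduced to a self-contained Linux program using eventfd(2):
the guest-facing notifier is drained and the device-facing one is
signalled, which is all the kick handler does in this mode.

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
        int guest_kick = eventfd(0, 0); /* guest -> SVQ */
        int hdev_kick = eventfd(0, 0);  /* SVQ -> device */
        uint64_t one = 1, v;

        /* The guest kicks its (shadowed) virtqueue */
        if (write(guest_kick, &one, sizeof(one)) != sizeof(one)) {
            return 1;
        }

        /* What the SVQ kick handler does: drain one side, signal the other */
        if (read(guest_kick, &v, sizeof(v)) == sizeof(v)) {
            if (write(hdev_kick, &one, sizeof(one)) != sizeof(one)) {
                return 1;
            }
        }

        if (read(hdev_kick, &v, sizeof(v)) == sizeof(v)) {
            printf("device observed %llu kick(s)\n", (unsigned long long)v);
        }

        close(guest_kick);
        close(hdev_kick);
        return 0;
    }

The real handler additionally copes with the guest rebinding or
unbinding its kick fd at runtime (VHOST_FILE_UNBIND), as the hunk below
shows.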
9 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
10 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
11 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
12 | --- | ||
13 | hw/virtio/vhost-shadow-virtqueue.c | 56 ++++++++++++++ | ||
14 | hw/virtio/vhost-shadow-virtqueue.h | 14 ++++ | ||
15 | hw/virtio/vhost-vdpa.c | 145 ++++++++++++++++++++++++++++++++++++- | ||
16 | include/hw/virtio/vhost-vdpa.h | 4 + | ||
17 | 4 files changed, 217 insertions(+), 2 deletions(-) | ||
18 | |||
19 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
22 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
23 | @@ -XXX,XX +XXX,XX @@ | ||
24 | #include "hw/virtio/vhost-shadow-virtqueue.h" | ||
25 | |||
26 | #include "qemu/error-report.h" | ||
27 | +#include "qemu/main-loop.h" | ||
28 | +#include "linux-headers/linux/vhost.h" | ||
29 | + | ||
30 | +/** | ||
31 | + * Forward guest notifications. | ||
32 | + * | ||
33 | + * @n: guest kick event notifier, the one that guest set to notify svq. | ||
34 | + */ | ||
35 | +static void vhost_handle_guest_kick(EventNotifier *n) | ||
36 | +{ | ||
37 | + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, | ||
38 | + svq_kick); | ||
39 | + event_notifier_test_and_clear(n); | ||
40 | + event_notifier_set(&svq->hdev_kick); | ||
41 | +} | ||
42 | + | ||
43 | +/** | ||
44 | + * Set a new file descriptor for the guest to kick the SVQ and notify for avail | ||
45 | + * | ||
46 | + * @svq: The svq | ||
47 | + * @svq_kick_fd: The svq kick fd | ||
48 | + * | ||
49 | + * Note that the SVQ will never close the old file descriptor. | ||
50 | + */ | ||
51 | +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
52 | +{ | ||
53 | + EventNotifier *svq_kick = &svq->svq_kick; | ||
54 | + bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick); | ||
55 | + bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND; | ||
56 | + | ||
57 | + if (poll_stop) { | ||
58 | + event_notifier_set_handler(svq_kick, NULL); | ||
59 | + } | ||
60 | + | ||
61 | + /* | ||
62 | + * event_notifier_set_handler already checks for guest's notifications if | ||
63 | + * they arrive at the new file descriptor in the switch, so there is no | ||
64 | + * need to explicitly check for them. | ||
65 | + */ | ||
66 | + if (poll_start) { | ||
67 | + event_notifier_init_fd(svq_kick, svq_kick_fd); | ||
68 | + event_notifier_set(svq_kick); | ||
69 | + event_notifier_set_handler(svq_kick, vhost_handle_guest_kick); | ||
70 | + } | ||
71 | +} | ||
72 | + | ||
73 | +/** | ||
74 | + * Stop the shadow virtqueue operation. | ||
75 | + * @svq: Shadow Virtqueue | ||
76 | + */ | ||
77 | +void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
78 | +{ | ||
79 | + event_notifier_set_handler(&svq->svq_kick, NULL); | ||
80 | +} | ||
81 | |||
82 | /** | ||
83 | * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
84 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
85 | goto err_init_hdev_call; | ||
86 | } | ||
87 | |||
88 | + event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); | ||
89 | return g_steal_pointer(&svq); | ||
90 | |||
91 | err_init_hdev_call: | ||
92 | @@ -XXX,XX +XXX,XX @@ err_init_hdev_kick: | ||
93 | void vhost_svq_free(gpointer pvq) | ||
94 | { | ||
95 | VhostShadowVirtqueue *vq = pvq; | ||
96 | + vhost_svq_stop(vq); | ||
97 | event_notifier_cleanup(&vq->hdev_kick); | ||
98 | event_notifier_cleanup(&vq->hdev_call); | ||
99 | g_free(vq); | ||
100 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
101 | index XXXXXXX..XXXXXXX 100644 | ||
102 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
103 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
104 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
105 | EventNotifier hdev_kick; | ||
106 | /* Shadow call notifier, sent to vhost */ | ||
107 | EventNotifier hdev_call; | ||
108 | + | ||
109 | + /* | ||
110 | + * Borrowed virtqueue's guest to host notifier. To borrow it in this event | ||
111 | + * notifier allows to recover the VhostShadowVirtqueue from the event loop | ||
112 | + * easily. If we use the VirtQueue's one, we don't have an easy way to | ||
113 | + * retrieve VhostShadowVirtqueue. | ||
114 | + * | ||
115 | + * So shadow virtqueue must not clean it, or we would lose VirtQueue one. | ||
116 | + */ | ||
117 | + EventNotifier svq_kick; | ||
118 | } VhostShadowVirtqueue; | ||
119 | |||
120 | +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
121 | + | ||
122 | +void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
123 | + | ||
124 | VhostShadowVirtqueue *vhost_svq_new(void); | ||
125 | |||
126 | void vhost_svq_free(gpointer vq); | ||
127 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
128 | index XXXXXXX..XXXXXXX 100644 | ||
129 | --- a/hw/virtio/vhost-vdpa.c | ||
130 | +++ b/hw/virtio/vhost-vdpa.c | ||
131 | @@ -XXX,XX +XXX,XX @@ | ||
132 | #include "hw/virtio/vhost.h" | ||
133 | #include "hw/virtio/vhost-backend.h" | ||
134 | #include "hw/virtio/virtio-net.h" | ||
135 | +#include "hw/virtio/vhost-shadow-virtqueue.h" | ||
136 | #include "hw/virtio/vhost-vdpa.h" | ||
137 | #include "exec/address-spaces.h" | ||
138 | #include "qemu/main-loop.h" | ||
139 | #include "cpu.h" | ||
140 | #include "trace.h" | ||
141 | #include "qemu-common.h" | ||
142 | +#include "qapi/error.h" | ||
143 | |||
144 | /* | ||
145 | * Return one past the end of the end of section. Be careful with uint64_t | ||
146 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) | ||
147 | return v->index != 0; | ||
148 | } | ||
149 | |||
150 | +static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
151 | + Error **errp) | ||
152 | +{ | ||
153 | + g_autoptr(GPtrArray) shadow_vqs = NULL; | ||
154 | + | ||
155 | + if (!v->shadow_vqs_enabled) { | ||
156 | + return 0; | ||
157 | + } | ||
158 | + | ||
159 | + shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
160 | + for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
161 | + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
162 | + | ||
163 | + if (unlikely(!svq)) { | ||
164 | + error_setg(errp, "Cannot create svq %u", n); | ||
165 | + return -1; | ||
166 | + } | ||
167 | + g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq)); | ||
168 | + } | ||
169 | + | ||
170 | + v->shadow_vqs = g_steal_pointer(&shadow_vqs); | ||
171 | + return 0; | ||
172 | +} | ||
173 | + | ||
174 | static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
175 | { | ||
176 | struct vhost_vdpa *v; | ||
177 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
178 | dev->opaque = opaque ; | ||
179 | v->listener = vhost_vdpa_memory_listener; | ||
180 | v->msg_type = VHOST_IOTLB_MSG_V2; | ||
181 | + ret = vhost_vdpa_init_svq(dev, v, errp); | ||
182 | + if (ret) { | ||
183 | + goto err; | ||
184 | + } | ||
185 | |||
186 | vhost_vdpa_get_iova_range(v); | ||
187 | |||
188 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
189 | VIRTIO_CONFIG_S_DRIVER); | ||
190 | |||
191 | return 0; | ||
192 | + | ||
193 | +err: | ||
194 | + ram_block_discard_disable(false); | ||
195 | + return ret; | ||
196 | } | ||
197 | |||
198 | static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev, | ||
199 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n) | ||
200 | |||
201 | static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev) | ||
202 | { | ||
203 | + struct vhost_vdpa *v = dev->opaque; | ||
204 | int i; | ||
205 | |||
206 | + if (v->shadow_vqs_enabled) { | ||
207 | + /* FIXME SVQ is not compatible with host notifiers mr */ | ||
208 | + return; | ||
209 | + } | ||
210 | + | ||
211 | for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) { | ||
212 | if (vhost_vdpa_host_notifier_init(dev, i)) { | ||
213 | goto err; | ||
214 | @@ -XXX,XX +XXX,XX @@ err: | ||
215 | return; | ||
216 | } | ||
217 | |||
218 | +static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev) | ||
219 | +{ | ||
220 | + struct vhost_vdpa *v = dev->opaque; | ||
221 | + size_t idx; | ||
222 | + | ||
223 | + if (!v->shadow_vqs) { | ||
224 | + return; | ||
225 | + } | ||
226 | + | ||
227 | + for (idx = 0; idx < v->shadow_vqs->len; ++idx) { | ||
228 | + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx)); | ||
229 | + } | ||
230 | + g_ptr_array_free(v->shadow_vqs, true); | ||
231 | +} | ||
232 | + | ||
233 | static int vhost_vdpa_cleanup(struct vhost_dev *dev) | ||
234 | { | ||
235 | struct vhost_vdpa *v; | ||
236 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_cleanup(struct vhost_dev *dev) | ||
237 | trace_vhost_vdpa_cleanup(dev, v); | ||
238 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | ||
239 | memory_listener_unregister(&v->listener); | ||
240 | + vhost_vdpa_svq_cleanup(dev); | ||
241 | |||
242 | dev->opaque = NULL; | ||
243 | ram_block_discard_disable(false); | ||
244 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev, | ||
245 | return ret; | ||
246 | } | ||
247 | |||
248 | +static void vhost_vdpa_reset_svq(struct vhost_vdpa *v) | ||
249 | +{ | ||
250 | + if (!v->shadow_vqs_enabled) { | ||
251 | + return; | ||
252 | + } | ||
253 | + | ||
254 | + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { | ||
255 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
256 | + vhost_svq_stop(svq); | ||
257 | + } | ||
258 | +} | ||
259 | + | ||
260 | static int vhost_vdpa_reset_device(struct vhost_dev *dev) | ||
261 | { | ||
262 | + struct vhost_vdpa *v = dev->opaque; | ||
263 | int ret; | ||
264 | uint8_t status = 0; | ||
265 | |||
266 | + vhost_vdpa_reset_svq(v); | ||
267 | + | ||
268 | ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); | ||
269 | trace_vhost_vdpa_reset_device(dev, status); | ||
270 | return ret; | ||
271 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, | ||
272 | return ret; | ||
273 | } | ||
274 | |||
275 | +static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
276 | + struct vhost_vring_file *file) | ||
277 | +{ | ||
278 | + trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd); | ||
279 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
280 | +} | ||
281 | + | ||
282 | +/** | ||
283 | + * Set the shadow virtqueue descriptors to the device | ||
284 | + * | ||
285 | + * @dev: The vhost device model | ||
286 | + * @svq: The shadow virtqueue | ||
287 | + * @idx: The index of the virtqueue in the vhost device | ||
288 | + * @errp: Error | ||
289 | + */ | ||
290 | +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
291 | + VhostShadowVirtqueue *svq, | ||
292 | + unsigned idx, | ||
293 | + Error **errp) | ||
294 | +{ | ||
295 | + struct vhost_vring_file file = { | ||
296 | + .index = dev->vq_index + idx, | ||
297 | + }; | ||
298 | + const EventNotifier *event_notifier = &svq->hdev_kick; | ||
299 | + int r; | ||
300 | + | ||
301 | + file.fd = event_notifier_get_fd(event_notifier); | ||
302 | + r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
303 | + if (unlikely(r != 0)) { | ||
304 | + error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
305 | + } | ||
306 | + | ||
307 | + return r == 0; | ||
308 | +} | ||
309 | + | ||
310 | +static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) | ||
311 | +{ | ||
312 | + struct vhost_vdpa *v = dev->opaque; | ||
313 | + Error *err = NULL; | ||
314 | + unsigned i; | ||
315 | + | ||
316 | + if (!v->shadow_vqs) { | ||
317 | + return true; | ||
318 | + } | ||
319 | + | ||
320 | + for (i = 0; i < v->shadow_vqs->len; ++i) { | ||
321 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
322 | + bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); | ||
323 | + if (unlikely(!ok)) { | ||
324 | + error_reportf_err(err, "Cannot setup SVQ %u: ", i); | ||
325 | + return false; | ||
326 | + } | ||
327 | + } | ||
328 | + | ||
329 | + return true; | ||
330 | +} | ||
331 | + | ||
332 | static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) | ||
333 | { | ||
334 | struct vhost_vdpa *v = dev->opaque; | ||
335 | + bool ok; | ||
336 | trace_vhost_vdpa_dev_start(dev, started); | ||
337 | |||
338 | if (started) { | ||
339 | vhost_vdpa_host_notifiers_init(dev); | ||
340 | + ok = vhost_vdpa_svqs_start(dev); | ||
341 | + if (unlikely(!ok)) { | ||
342 | + return -1; | ||
343 | + } | ||
344 | vhost_vdpa_set_vring_ready(dev); | ||
345 | } else { | ||
346 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | ||
347 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, | ||
348 | static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, | ||
349 | struct vhost_vring_file *file) | ||
350 | { | ||
351 | - trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd); | ||
352 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
353 | + struct vhost_vdpa *v = dev->opaque; | ||
354 | + int vdpa_idx = file->index - dev->vq_index; | ||
355 | + | ||
356 | + if (v->shadow_vqs_enabled) { | ||
357 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); | ||
358 | + vhost_svq_set_svq_kick_fd(svq, file->fd); | ||
359 | + return 0; | ||
360 | + } else { | ||
361 | + return vhost_vdpa_set_vring_dev_kick(dev, file); | ||
362 | + } | ||
363 | } | ||
364 | |||
365 | static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | ||
366 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | ||
367 | index XXXXXXX..XXXXXXX 100644 | ||
368 | --- a/include/hw/virtio/vhost-vdpa.h | ||
369 | +++ b/include/hw/virtio/vhost-vdpa.h | ||
370 | @@ -XXX,XX +XXX,XX @@ | ||
371 | #ifndef HW_VIRTIO_VHOST_VDPA_H | ||
372 | #define HW_VIRTIO_VHOST_VDPA_H | ||
373 | |||
374 | +#include <gmodule.h> | ||
375 | + | ||
376 | #include "hw/virtio/virtio.h" | ||
377 | #include "standard-headers/linux/vhost_types.h" | ||
378 | |||
379 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | ||
380 | bool iotlb_batch_begin_sent; | ||
381 | MemoryListener listener; | ||
382 | struct vhost_vdpa_iova_range iova_range; | ||
383 | + bool shadow_vqs_enabled; | ||
384 | + GPtrArray *shadow_vqs; | ||
385 | struct vhost_dev *dev; | ||
386 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; | ||
387 | } VhostVDPA; | ||
388 | -- | ||
389 | 2.7.4 | ||
390 | |||
New patch | |||
From: Eugenio Pérez <eperezma@redhat.com>

This will make qemu aware of the buffers the device has used, allowing
it to write their contents to guest memory if needed.
5 | |||
6 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
7 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
9 | --- | ||
10 | hw/virtio/vhost-shadow-virtqueue.c | 38 ++++++++++++++++++++++++++++++++++++++ | ||
11 | hw/virtio/vhost-shadow-virtqueue.h | 4 ++++ | ||
12 | hw/virtio/vhost-vdpa.c | 31 +++++++++++++++++++++++++++++-- | ||
13 | 3 files changed, 71 insertions(+), 2 deletions(-) | ||
14 | |||
15 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
18 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
19 | @@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(EventNotifier *n) | ||
20 | } | ||
21 | |||
22 | /** | ||
23 | + * Forward vhost notifications | ||
24 | + * | ||
25 | + * @n: hdev call event notifier, the one that device set to notify svq. | ||
26 | + */ | ||
27 | +static void vhost_svq_handle_call(EventNotifier *n) | ||
28 | +{ | ||
29 | + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, | ||
30 | + hdev_call); | ||
31 | + event_notifier_test_and_clear(n); | ||
32 | + event_notifier_set(&svq->svq_call); | ||
33 | +} | ||
34 | + | ||
35 | +/** | ||
36 | + * Set the call notifier for the SVQ to call the guest | ||
37 | + * | ||
38 | + * @svq: Shadow virtqueue | ||
39 | + * @call_fd: call notifier | ||
40 | + * | ||
41 | + * Called on BQL context. | ||
42 | + */ | ||
43 | +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) | ||
44 | +{ | ||
45 | + if (call_fd == VHOST_FILE_UNBIND) { | ||
46 | + /* | ||
47 | + * Fail event_notifier_set if called handling device call. | ||
48 | + * | ||
49 | + * SVQ still needs device notifications, since it needs to keep | ||
50 | + * forwarding used buffers even with the unbind. | ||
51 | + */ | ||
52 | + memset(&svq->svq_call, 0, sizeof(svq->svq_call)); | ||
53 | + } else { | ||
54 | + event_notifier_init_fd(&svq->svq_call, call_fd); | ||
55 | + } | ||
56 | +} | ||
57 | + | ||
58 | +/** | ||
59 | * Set a new file descriptor for the guest to kick the SVQ and notify for avail | ||
60 | * | ||
61 | * @svq: The svq | ||
62 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
63 | } | ||
64 | |||
65 | event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); | ||
66 | + event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); | ||
67 | return g_steal_pointer(&svq); | ||
68 | |||
69 | err_init_hdev_call: | ||
70 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_free(gpointer pvq) | ||
71 | VhostShadowVirtqueue *vq = pvq; | ||
72 | vhost_svq_stop(vq); | ||
73 | event_notifier_cleanup(&vq->hdev_kick); | ||
74 | + event_notifier_set_handler(&vq->hdev_call, NULL); | ||
75 | event_notifier_cleanup(&vq->hdev_call); | ||
76 | g_free(vq); | ||
77 | } | ||
78 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
79 | index XXXXXXX..XXXXXXX 100644 | ||
80 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
81 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
82 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
83 | * So shadow virtqueue must not clean it, or we would lose VirtQueue one. | ||
84 | */ | ||
85 | EventNotifier svq_kick; | ||
86 | + | ||
87 | + /* Guest's call notifier, where the SVQ calls guest. */ | ||
88 | + EventNotifier svq_call; | ||
89 | } VhostShadowVirtqueue; | ||
90 | |||
91 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
92 | +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
93 | |||
94 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
95 | |||
96 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
97 | index XXXXXXX..XXXXXXX 100644 | ||
98 | --- a/hw/virtio/vhost-vdpa.c | ||
99 | +++ b/hw/virtio/vhost-vdpa.c | ||
100 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
101 | return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
102 | } | ||
103 | |||
104 | +static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, | ||
105 | + struct vhost_vring_file *file) | ||
106 | +{ | ||
107 | + trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd); | ||
108 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | ||
109 | +} | ||
110 | + | ||
111 | /** | ||
112 | * Set the shadow virtqueue descriptors to the device | ||
113 | * | ||
114 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
115 | * @svq: The shadow virtqueue | ||
116 | * @idx: The index of the virtqueue in the vhost device | ||
117 | * @errp: Error | ||
118 | + * | ||
119 | + * Note that this function does not rewind kick file descriptor if cannot set | ||
120 | + * call one. | ||
121 | */ | ||
122 | static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
123 | VhostShadowVirtqueue *svq, | ||
124 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
125 | r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
126 | if (unlikely(r != 0)) { | ||
127 | error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
128 | + return false; | ||
129 | + } | ||
130 | + | ||
131 | + event_notifier = &svq->hdev_call; | ||
132 | + file.fd = event_notifier_get_fd(event_notifier); | ||
133 | + r = vhost_vdpa_set_vring_dev_call(dev, &file); | ||
134 | + if (unlikely(r != 0)) { | ||
135 | + error_setg_errno(errp, -r, "Can't set device call fd"); | ||
136 | } | ||
137 | |||
138 | return r == 0; | ||
139 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, | ||
140 | static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | ||
141 | struct vhost_vring_file *file) | ||
142 | { | ||
143 | - trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd); | ||
144 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | ||
145 | + struct vhost_vdpa *v = dev->opaque; | ||
146 | + | ||
147 | + if (v->shadow_vqs_enabled) { | ||
148 | + int vdpa_idx = file->index - dev->vq_index; | ||
149 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); | ||
150 | + | ||
151 | + vhost_svq_set_svq_call_fd(svq, file->fd); | ||
152 | + return 0; | ||
153 | + } else { | ||
154 | + return vhost_vdpa_set_vring_dev_call(dev, file); | ||
155 | + } | ||
156 | } | ||
157 | |||
158 | static int vhost_vdpa_get_features(struct vhost_dev *dev, | ||
159 | -- | ||
160 | 2.7.4 | ||
161 | |||
New patch | |||
From: Eugenio Pérez <eperezma@redhat.com>

This allows SVQ to negotiate features with the guest and the device.
For the device, SVQ acts as a driver. While this function passes all
non-transport features through unchanged, it needs to disable the
transport features that SVQ does not support when forwarding buffers:
the packed vq layout, indirect descriptors, and event idx.
8 | |||
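As a standalone sketch (not QEMU code, and only a subset of the checks;
it assumes <linux/virtio_config.h> from reasonably recent kernel
headers), the policy can be pictured like this: VERSION_1 and
ACCESS_PLATFORM must be offered, while the packed ring layout is
refused.

    #include <linux/virtio_config.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Subset of the SVQ policy: VERSION_1 and ACCESS_PLATFORM are mandatory,
     * the packed ring layout is not supported. */
    static bool example_svq_ok(uint64_t features)
    {
        return (features & (1ULL << VIRTIO_F_VERSION_1)) &&
               (features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)) &&
               !(features & (1ULL << VIRTIO_F_RING_PACKED));
    }

    int main(void)
    {
        uint64_t offer = (1ULL << VIRTIO_F_VERSION_1) |
                         (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
                         (1ULL << VIRTIO_F_RING_PACKED);

        /* Prints "no": the packed layout cannot be forwarded by SVQ */
        printf("acceptable? %s\n", example_svq_ok(offer) ? "yes" : "no");
        return 0;
    }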
9 | Future changes can add support to offer more features to the guest, | ||
10 | since the use of VirtQueue gives this for free. This is left out at the | ||
11 | moment for simplicity. | ||
12 | |||
13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
14 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
15 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
16 | --- | ||
17 | hw/virtio/vhost-shadow-virtqueue.c | 44 ++++++++++++++++++++++++++++++++++++++ | ||
18 | hw/virtio/vhost-shadow-virtqueue.h | 2 ++ | ||
19 | hw/virtio/vhost-vdpa.c | 15 +++++++++++++ | ||
20 | 3 files changed, 61 insertions(+) | ||
21 | |||
22 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
23 | index XXXXXXX..XXXXXXX 100644 | ||
24 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
25 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
26 | @@ -XXX,XX +XXX,XX @@ | ||
27 | #include "hw/virtio/vhost-shadow-virtqueue.h" | ||
28 | |||
29 | #include "qemu/error-report.h" | ||
30 | +#include "qapi/error.h" | ||
31 | #include "qemu/main-loop.h" | ||
32 | #include "linux-headers/linux/vhost.h" | ||
33 | |||
34 | /** | ||
35 | + * Validate the transport device features that both guests can use with the SVQ | ||
36 | + * and SVQs can use with the device. | ||
37 | + * | ||
38 | + * @dev_features: The features | ||
39 | + * @errp: Error pointer | ||
40 | + */ | ||
41 | +bool vhost_svq_valid_features(uint64_t features, Error **errp) | ||
42 | +{ | ||
43 | + bool ok = true; | ||
44 | + uint64_t svq_features = features; | ||
45 | + | ||
46 | + for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END; | ||
47 | + ++b) { | ||
48 | + switch (b) { | ||
49 | + case VIRTIO_F_ANY_LAYOUT: | ||
50 | + continue; | ||
51 | + | ||
52 | + case VIRTIO_F_ACCESS_PLATFORM: | ||
53 | + /* SVQ trust in the host's IOMMU to translate addresses */ | ||
54 | + case VIRTIO_F_VERSION_1: | ||
55 | + /* SVQ trust that the guest vring is little endian */ | ||
56 | + if (!(svq_features & BIT_ULL(b))) { | ||
57 | + set_bit(b, &svq_features); | ||
58 | + ok = false; | ||
59 | + } | ||
60 | + continue; | ||
61 | + | ||
62 | + default: | ||
63 | + if (svq_features & BIT_ULL(b)) { | ||
64 | + clear_bit(b, &svq_features); | ||
65 | + ok = false; | ||
66 | + } | ||
67 | + } | ||
68 | + } | ||
69 | + | ||
70 | + if (!ok) { | ||
71 | + error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64 | ||
72 | + ", ok: 0x%"PRIx64, features, svq_features); | ||
73 | + } | ||
74 | + return ok; | ||
75 | +} | ||
76 | + | ||
77 | +/** | ||
78 | * Forward guest notifications. | ||
79 | * | ||
80 | * @n: guest kick event notifier, the one that guest set to notify svq. | ||
81 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
82 | index XXXXXXX..XXXXXXX 100644 | ||
83 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
84 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
85 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
86 | EventNotifier svq_call; | ||
87 | } VhostShadowVirtqueue; | ||
88 | |||
89 | +bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
90 | + | ||
91 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
92 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
93 | |||
94 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
95 | index XXXXXXX..XXXXXXX 100644 | ||
96 | --- a/hw/virtio/vhost-vdpa.c | ||
97 | +++ b/hw/virtio/vhost-vdpa.c | ||
98 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
99 | Error **errp) | ||
100 | { | ||
101 | g_autoptr(GPtrArray) shadow_vqs = NULL; | ||
102 | + uint64_t dev_features, svq_features; | ||
103 | + int r; | ||
104 | + bool ok; | ||
105 | |||
106 | if (!v->shadow_vqs_enabled) { | ||
107 | return 0; | ||
108 | } | ||
109 | |||
110 | + r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features); | ||
111 | + if (r != 0) { | ||
112 | + error_setg_errno(errp, -r, "Can't get vdpa device features"); | ||
113 | + return r; | ||
114 | + } | ||
115 | + | ||
116 | + svq_features = dev_features; | ||
117 | + ok = vhost_svq_valid_features(svq_features, errp); | ||
118 | + if (unlikely(!ok)) { | ||
119 | + return -1; | ||
120 | + } | ||
121 | + | ||
122 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
123 | for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
124 | g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
125 | -- | ||
126 | 2.7.4 | ||
127 | |||
New patch | |||
From: Eugenio Pérez <eperezma@redhat.com>

This reports the shadow virtqueue addresses in qemu's virtual address
space.

Since these differ from the guest's vaddr, but the device can access
them, SVQ takes special care about their alignment and about not
exposing garbage data. It assumes that the IOMMU works in
host_page_size ranges for that.
8 | |||
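As a worked example (assuming, for illustration, a 256-entry split ring
and a 4 KiB host page size): the descriptor table takes 16 * 256 = 4096
bytes and the avail ring takes 4 + 2 * 256 = 516 bytes, so the driver
area is ROUND_UP(4612, 4096) = 8192 bytes; the used ring takes
4 + 8 * 256 = 2052 bytes, so the device area is ROUND_UP(2052, 4096) =
4096 bytes. The page-sized padding means mapping each area in
host-page-sized chunks does not expose unrelated qemu memory to the
device.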
9 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
10 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
11 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
12 | --- | ||
13 | hw/virtio/vhost-shadow-virtqueue.c | 29 +++++++++++++++++++++++++++++ | ||
14 | hw/virtio/vhost-shadow-virtqueue.h | 9 +++++++++ | ||
15 | 2 files changed, 38 insertions(+) | ||
16 | |||
17 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
18 | index XXXXXXX..XXXXXXX 100644 | ||
19 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
20 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
21 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) | ||
22 | } | ||
23 | |||
24 | /** | ||
25 | + * Get the shadow vq vring address. | ||
26 | + * @svq: Shadow virtqueue | ||
27 | + * @addr: Destination to store address | ||
28 | + */ | ||
29 | +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
30 | + struct vhost_vring_addr *addr) | ||
31 | +{ | ||
32 | + addr->desc_user_addr = (uint64_t)svq->vring.desc; | ||
33 | + addr->avail_user_addr = (uint64_t)svq->vring.avail; | ||
34 | + addr->used_user_addr = (uint64_t)svq->vring.used; | ||
35 | +} | ||
36 | + | ||
37 | +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq) | ||
38 | +{ | ||
39 | + size_t desc_size = sizeof(vring_desc_t) * svq->vring.num; | ||
40 | + size_t avail_size = offsetof(vring_avail_t, ring) + | ||
41 | + sizeof(uint16_t) * svq->vring.num; | ||
42 | + | ||
43 | + return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size); | ||
44 | +} | ||
45 | + | ||
46 | +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq) | ||
47 | +{ | ||
48 | + size_t used_size = offsetof(vring_used_t, ring) + | ||
49 | + sizeof(vring_used_elem_t) * svq->vring.num; | ||
50 | + return ROUND_UP(used_size, qemu_real_host_page_size); | ||
51 | +} | ||
52 | + | ||
53 | +/** | ||
54 | * Set a new file descriptor for the guest to kick the SVQ and notify for avail | ||
55 | * | ||
56 | * @svq: The svq | ||
57 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
58 | index XXXXXXX..XXXXXXX 100644 | ||
59 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
60 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
61 | @@ -XXX,XX +XXX,XX @@ | ||
62 | #define VHOST_SHADOW_VIRTQUEUE_H | ||
63 | |||
64 | #include "qemu/event_notifier.h" | ||
65 | +#include "hw/virtio/virtio.h" | ||
66 | +#include "standard-headers/linux/vhost_types.h" | ||
67 | |||
68 | /* Shadow virtqueue to relay notifications */ | ||
69 | typedef struct VhostShadowVirtqueue { | ||
70 | + /* Shadow vring */ | ||
71 | + struct vring vring; | ||
72 | + | ||
73 | /* Shadow kick notifier, sent to vhost */ | ||
74 | EventNotifier hdev_kick; | ||
75 | /* Shadow call notifier, sent to vhost */ | ||
76 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
77 | |||
78 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
79 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
80 | +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
81 | + struct vhost_vring_addr *addr); | ||
82 | +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); | ||
83 | +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); | ||
84 | |||
85 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
86 | |||
87 | -- | ||
88 | 2.7.4 | ||
89 | |||
New patch | |||
From: Eugenio Pérez <eperezma@redhat.com>

First half of the buffer forwarding part: prepare the vhost-vdpa
callbacks to offer it to SVQ. QEMU cannot enable it yet, so this is
effectively dead code at the moment, but it helps to keep the patch
size down.
7 | |||
8 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
10 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
11 | --- | ||
12 | hw/virtio/vhost-vdpa.c | 48 +++++++++++++++++++++++++++++++++++++++++------- | ||
13 | 1 file changed, 41 insertions(+), 7 deletions(-) | ||
14 | |||
15 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/hw/virtio/vhost-vdpa.c | ||
18 | +++ b/hw/virtio/vhost-vdpa.c | ||
19 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, | ||
20 | return ret; | ||
21 | } | ||
22 | |||
23 | +static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev, | ||
24 | + struct vhost_vring_state *ring) | ||
25 | +{ | ||
26 | + trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); | ||
27 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); | ||
28 | +} | ||
29 | + | ||
30 | static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
31 | struct vhost_vring_file *file) | ||
32 | { | ||
33 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, | ||
34 | return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | ||
35 | } | ||
36 | |||
37 | +static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, | ||
38 | + struct vhost_vring_addr *addr) | ||
39 | +{ | ||
40 | + trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, | ||
41 | + addr->desc_user_addr, addr->used_user_addr, | ||
42 | + addr->avail_user_addr, | ||
43 | + addr->log_guest_addr); | ||
44 | + | ||
45 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); | ||
46 | + | ||
47 | +} | ||
48 | + | ||
49 | /** | ||
50 | * Set the shadow virtqueue descriptors to the device | ||
51 | * | ||
52 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, | ||
53 | static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev, | ||
54 | struct vhost_vring_addr *addr) | ||
55 | { | ||
56 | - trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, | ||
57 | - addr->desc_user_addr, addr->used_user_addr, | ||
58 | - addr->avail_user_addr, | ||
59 | - addr->log_guest_addr); | ||
60 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); | ||
61 | + struct vhost_vdpa *v = dev->opaque; | ||
62 | + | ||
63 | + if (v->shadow_vqs_enabled) { | ||
64 | + /* | ||
65 | + * Device vring addr was set at device start. SVQ base is handled by | ||
66 | + * VirtQueue code. | ||
67 | + */ | ||
68 | + return 0; | ||
69 | + } | ||
70 | + | ||
71 | + return vhost_vdpa_set_vring_dev_addr(dev, addr); | ||
72 | } | ||
73 | |||
74 | static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, | ||
75 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, | ||
76 | static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, | ||
77 | struct vhost_vring_state *ring) | ||
78 | { | ||
79 | - trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); | ||
80 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); | ||
81 | + struct vhost_vdpa *v = dev->opaque; | ||
82 | + | ||
83 | + if (v->shadow_vqs_enabled) { | ||
84 | + /* | ||
85 | + * Device vring base was set at device start. SVQ base is handled by | ||
86 | + * VirtQueue code. | ||
87 | + */ | ||
88 | + return 0; | ||
89 | + } | ||
90 | + | ||
91 | + return vhost_vdpa_set_dev_vring_base(dev, ring); | ||
92 | } | ||
93 | |||
94 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, | ||
95 | -- | ||
96 | 2.7.4 | ||
97 | |||
New patch | |||
From: Eugenio Pérez <eperezma@redhat.com>

Initial version of a shadow virtqueue that actually forwards buffers.
There is no iommu support at the moment, and that will be addressed in
future patches of this series. Since all vhost-vdpa devices use forced
IOMMU, this means that SVQ is not usable on any device at this point of
the series.

For simplicity it only supports modern devices, which expect the vring
in little endian, with a split ring and no event idx or indirect
descriptors. Support for those will not be added in this series.

It reuses the VirtQueue code for the device part. The driver part is
based on Linux's virtio_ring driver, but with functionality and
optimizations stripped out so it is easier to review.

However, forwarding buffers has some particular pieces. One of the most
unexpected is that a guest buffer can expand into more than one
descriptor in SVQ. While this is handled gracefully by qemu's emulated
virtio devices, it may cause an unexpected SVQ queue-full condition.
This patch also solves that by checking for the condition at both guest
kicks and device calls. The code may be more elegant in the future if
SVQ code runs in its own iocontext.
24 | |||
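The queue-full condition described above can be sketched outside QEMU as
follows (illustrative names, not QEMU code); it mirrors the free-running
index arithmetic used by the patch:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct example_svq {
        uint16_t num;              /* shadow vring size (power of two) */
        uint16_t shadow_avail_idx; /* next index SVQ will make available */
        uint16_t shadow_used_idx;  /* last used index SVQ has consumed */
    };

    /* Descriptors the SVQ can still make available to the device */
    static uint16_t example_available_slots(const struct example_svq *svq)
    {
        /* Both indexes are free-running; their difference is in-flight descs */
        return svq->num - (svq->shadow_avail_idx - svq->shadow_used_idx);
    }

    /* A guest element spanning out_num + in_num qemu-VA iovecs only fits if
     * that many descriptors are free; otherwise it must wait for used bufs. */
    static bool example_element_fits(const struct example_svq *svq,
                                     unsigned out_num, unsigned in_num)
    {
        return out_num + in_num <= example_available_slots(svq);
    }

    int main(void)
    {
        struct example_svq svq = { .num = 256, .shadow_avail_idx = 300,
                                   .shadow_used_idx = 60 };

        /* 256 - (300 - 60) = 16 free slots: a 20-iovec element must wait */
        printf("fits? %s\n", example_element_fits(&svq, 12, 8) ? "yes" : "no");
        return 0;
    }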
25 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
26 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
27 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
28 | --- | ||
29 | hw/virtio/vhost-shadow-virtqueue.c | 354 ++++++++++++++++++++++++++++++++++++- | ||
30 | hw/virtio/vhost-shadow-virtqueue.h | 26 +++ | ||
31 | hw/virtio/vhost-vdpa.c | 159 ++++++++++++++++- | ||
32 | 3 files changed, 527 insertions(+), 12 deletions(-) | ||
33 | |||
34 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
35 | index XXXXXXX..XXXXXXX 100644 | ||
36 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
37 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
38 | @@ -XXX,XX +XXX,XX @@ | ||
39 | #include "qemu/error-report.h" | ||
40 | #include "qapi/error.h" | ||
41 | #include "qemu/main-loop.h" | ||
42 | +#include "qemu/log.h" | ||
43 | +#include "qemu/memalign.h" | ||
44 | #include "linux-headers/linux/vhost.h" | ||
45 | |||
46 | /** | ||
47 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp) | ||
48 | } | ||
49 | |||
50 | /** | ||
51 | - * Forward guest notifications. | ||
52 | + * Number of descriptors that the SVQ can make available from the guest. | ||
53 | + * | ||
54 | + * @svq: The svq | ||
55 | + */ | ||
56 | +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) | ||
57 | +{ | ||
58 | + return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); | ||
59 | +} | ||
60 | + | ||
61 | +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, | ||
62 | + const struct iovec *iovec, | ||
63 | + size_t num, bool more_descs, bool write) | ||
64 | +{ | ||
65 | + uint16_t i = svq->free_head, last = svq->free_head; | ||
66 | + unsigned n; | ||
67 | + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; | ||
68 | + vring_desc_t *descs = svq->vring.desc; | ||
69 | + | ||
70 | + if (num == 0) { | ||
71 | + return; | ||
72 | + } | ||
73 | + | ||
74 | + for (n = 0; n < num; n++) { | ||
75 | + if (more_descs || (n + 1 < num)) { | ||
76 | + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); | ||
77 | + } else { | ||
78 | + descs[i].flags = flags; | ||
79 | + } | ||
80 | + descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base); | ||
81 | + descs[i].len = cpu_to_le32(iovec[n].iov_len); | ||
82 | + | ||
83 | + last = i; | ||
84 | + i = cpu_to_le16(descs[i].next); | ||
85 | + } | ||
86 | + | ||
87 | + svq->free_head = le16_to_cpu(descs[last].next); | ||
88 | +} | ||
89 | + | ||
90 | +static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
91 | + VirtQueueElement *elem, | ||
92 | + unsigned *head) | ||
93 | +{ | ||
94 | + unsigned avail_idx; | ||
95 | + vring_avail_t *avail = svq->vring.avail; | ||
96 | + | ||
97 | + *head = svq->free_head; | ||
98 | + | ||
99 | + /* We need some descriptors here */ | ||
100 | + if (unlikely(!elem->out_num && !elem->in_num)) { | ||
101 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
102 | + "Guest provided element with no descriptors"); | ||
103 | + return false; | ||
104 | + } | ||
105 | + | ||
106 | + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, | ||
107 | + elem->in_num > 0, false); | ||
108 | + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); | ||
109 | + | ||
110 | + /* | ||
111 | + * Put the entry in the available array (but don't update avail->idx until | ||
112 | + * they do sync). | ||
113 | + */ | ||
114 | + avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1); | ||
115 | + avail->ring[avail_idx] = cpu_to_le16(*head); | ||
116 | + svq->shadow_avail_idx++; | ||
117 | + | ||
118 | + /* Update the avail index after write the descriptor */ | ||
119 | + smp_wmb(); | ||
120 | + avail->idx = cpu_to_le16(svq->shadow_avail_idx); | ||
121 | + | ||
122 | + return true; | ||
123 | +} | ||
124 | + | ||
125 | +static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) | ||
126 | +{ | ||
127 | + unsigned qemu_head; | ||
128 | + bool ok = vhost_svq_add_split(svq, elem, &qemu_head); | ||
129 | + if (unlikely(!ok)) { | ||
130 | + return false; | ||
131 | + } | ||
132 | + | ||
133 | + svq->ring_id_maps[qemu_head] = elem; | ||
134 | + return true; | ||
135 | +} | ||
136 | + | ||
137 | +static void vhost_svq_kick(VhostShadowVirtqueue *svq) | ||
138 | +{ | ||
139 | + /* | ||
140 | + * We need to expose the available array entries before checking the used | ||
141 | + * flags | ||
142 | + */ | ||
143 | + smp_mb(); | ||
144 | + if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) { | ||
145 | + return; | ||
146 | + } | ||
147 | + | ||
148 | + event_notifier_set(&svq->hdev_kick); | ||
149 | +} | ||
150 | + | ||
151 | +/** | ||
152 | + * Forward available buffers. | ||
153 | + * | ||
154 | + * @svq: Shadow VirtQueue | ||
155 | + * | ||
156 | + * Note that this function does not guarantee that all guest's available | ||
157 | + * buffers are available to the device in SVQ avail ring. The guest may have | ||
158 | + * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in | ||
159 | + * qemu vaddr. | ||
160 | + * | ||
161 | + * If that happens, guest's kick notifications will be disabled until the | ||
162 | + * device uses some buffers. | ||
163 | + */ | ||
164 | +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) | ||
165 | +{ | ||
166 | + /* Clear event notifier */ | ||
167 | + event_notifier_test_and_clear(&svq->svq_kick); | ||
168 | + | ||
169 | + /* Forward to the device as many available buffers as possible */ | ||
170 | + do { | ||
171 | + virtio_queue_set_notification(svq->vq, false); | ||
172 | + | ||
173 | + while (true) { | ||
174 | + VirtQueueElement *elem; | ||
175 | + bool ok; | ||
176 | + | ||
177 | + if (svq->next_guest_avail_elem) { | ||
178 | + elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
179 | + } else { | ||
180 | + elem = virtqueue_pop(svq->vq, sizeof(*elem)); | ||
181 | + } | ||
182 | + | ||
183 | + if (!elem) { | ||
184 | + break; | ||
185 | + } | ||
186 | + | ||
187 | + if (elem->out_num + elem->in_num > | ||
188 | + vhost_svq_available_slots(svq)) { | ||
189 | + /* | ||
190 | + * This condition is possible since a contiguous buffer in GPA | ||
191 | + * does not imply a contiguous buffer in qemu's VA | ||
192 | + * scatter-gather segments. If that happens, the buffer exposed | ||
193 | + * to the device needs to be a chain of descriptors at this | ||
194 | + * moment. | ||
195 | + * | ||
196 | + * SVQ cannot hold more available buffers if we are here: | ||
197 | + * queue the current guest descriptor and ignore further kicks | ||
198 | + * until some elements are used. | ||
199 | + */ | ||
200 | + svq->next_guest_avail_elem = elem; | ||
201 | + return; | ||
202 | + } | ||
203 | + | ||
204 | + ok = vhost_svq_add(svq, elem); | ||
205 | + if (unlikely(!ok)) { | ||
206 | + /* VQ is broken, just return and ignore any other kicks */ | ||
207 | + return; | ||
208 | + } | ||
209 | + vhost_svq_kick(svq); | ||
210 | + } | ||
211 | + | ||
212 | + virtio_queue_set_notification(svq->vq, true); | ||
213 | + } while (!virtio_queue_empty(svq->vq)); | ||
214 | +} | ||
215 | + | ||
216 | +/** | ||
217 | + * Handle guest's kick. | ||
218 | * | ||
219 | * @n: guest kick event notifier, the one that guest set to notify svq. | ||
220 | */ | ||
221 | -static void vhost_handle_guest_kick(EventNotifier *n) | ||
222 | +static void vhost_handle_guest_kick_notifier(EventNotifier *n) | ||
223 | { | ||
224 | VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, | ||
225 | svq_kick); | ||
226 | event_notifier_test_and_clear(n); | ||
227 | - event_notifier_set(&svq->hdev_kick); | ||
228 | + vhost_handle_guest_kick(svq); | ||
229 | +} | ||
230 | + | ||
231 | +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq) | ||
232 | +{ | ||
233 | + if (svq->last_used_idx != svq->shadow_used_idx) { | ||
234 | + return true; | ||
235 | + } | ||
236 | + | ||
237 | + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); | ||
238 | + | ||
239 | + return svq->last_used_idx != svq->shadow_used_idx; | ||
240 | } | ||
241 | |||
242 | /** | ||
243 | - * Forward vhost notifications | ||
244 | + * Enable vhost device calls after disable them. | ||
245 | + * | ||
246 | + * @svq: The svq | ||
247 | + * | ||
248 | + * It returns false if there are pending used buffers from the vhost device, | ||
249 | + * avoiding the possible races between SVQ checking for more work and enabling | ||
250 | + * callbacks. True if SVQ used vring has no more pending buffers. | ||
251 | + */ | ||
252 | +static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq) | ||
253 | +{ | ||
254 | + svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); | ||
255 | + /* Make sure the flag is written before the read of used_idx */ | ||
256 | + smp_mb(); | ||
257 | + return !vhost_svq_more_used(svq); | ||
258 | +} | ||
259 | + | ||
260 | +static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq) | ||
261 | +{ | ||
262 | + svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); | ||
263 | +} | ||
264 | + | ||
265 | +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, | ||
266 | + uint32_t *len) | ||
267 | +{ | ||
268 | + vring_desc_t *descs = svq->vring.desc; | ||
269 | + const vring_used_t *used = svq->vring.used; | ||
270 | + vring_used_elem_t used_elem; | ||
271 | + uint16_t last_used; | ||
272 | + | ||
273 | + if (!vhost_svq_more_used(svq)) { | ||
274 | + return NULL; | ||
275 | + } | ||
276 | + | ||
277 | + /* Only get used array entries after they have been exposed by dev */ | ||
278 | + smp_rmb(); | ||
279 | + last_used = svq->last_used_idx & (svq->vring.num - 1); | ||
280 | + used_elem.id = le32_to_cpu(used->ring[last_used].id); | ||
281 | + used_elem.len = le32_to_cpu(used->ring[last_used].len); | ||
282 | + | ||
283 | + svq->last_used_idx++; | ||
284 | + if (unlikely(used_elem.id >= svq->vring.num)) { | ||
285 | + qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used", | ||
286 | + svq->vdev->name, used_elem.id); | ||
287 | + return NULL; | ||
288 | + } | ||
289 | + | ||
290 | + if (unlikely(!svq->ring_id_maps[used_elem.id])) { | ||
291 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
292 | + "Device %s says index %u is used, but it was not available", | ||
293 | + svq->vdev->name, used_elem.id); | ||
294 | + return NULL; | ||
295 | + } | ||
296 | + | ||
297 | + descs[used_elem.id].next = svq->free_head; | ||
298 | + svq->free_head = used_elem.id; | ||
299 | + | ||
300 | + *len = used_elem.len; | ||
301 | + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); | ||
302 | +} | ||
303 | + | ||
304 | +static void vhost_svq_flush(VhostShadowVirtqueue *svq, | ||
305 | + bool check_for_avail_queue) | ||
306 | +{ | ||
307 | + VirtQueue *vq = svq->vq; | ||
308 | + | ||
309 | + /* Forward as many used buffers as possible. */ | ||
310 | + do { | ||
311 | + unsigned i = 0; | ||
312 | + | ||
313 | + vhost_svq_disable_notification(svq); | ||
314 | + while (true) { | ||
315 | + uint32_t len; | ||
316 | + g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len); | ||
317 | + if (!elem) { | ||
318 | + break; | ||
319 | + } | ||
320 | + | ||
321 | + if (unlikely(i >= svq->vring.num)) { | ||
322 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
323 | + "More than %u used buffers obtained in a %u size SVQ", | ||
324 | + i, svq->vring.num); | ||
325 | + virtqueue_fill(vq, elem, len, i); | ||
326 | + virtqueue_flush(vq, i); | ||
327 | + return; | ||
328 | + } | ||
329 | + virtqueue_fill(vq, elem, len, i++); | ||
330 | + } | ||
331 | + | ||
332 | + virtqueue_flush(vq, i); | ||
333 | + event_notifier_set(&svq->svq_call); | ||
334 | + | ||
335 | + if (check_for_avail_queue && svq->next_guest_avail_elem) { | ||
336 | + /* | ||
337 | + * Avail ring was full when vhost_svq_flush was called, so it's a | ||
338 | + * good moment to make more descriptors available if possible. | ||
339 | + */ | ||
340 | + vhost_handle_guest_kick(svq); | ||
341 | + } | ||
342 | + } while (!vhost_svq_enable_notification(svq)); | ||
343 | +} | ||
344 | + | ||
345 | +/** | ||
346 | + * Forward used buffers. | ||
347 | * | ||
348 | * @n: hdev call event notifier, the one that device set to notify svq. | ||
349 | + * | ||
350 | + * Note that we are not making any buffers available in the loop, so there | ||
351 | + * is no way it runs more than virtqueue size times. | ||
352 | */ | ||
353 | static void vhost_svq_handle_call(EventNotifier *n) | ||
354 | { | ||
355 | VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, | ||
356 | hdev_call); | ||
357 | event_notifier_test_and_clear(n); | ||
358 | - event_notifier_set(&svq->svq_call); | ||
359 | + vhost_svq_flush(svq, true); | ||
360 | } | ||
361 | |||
362 | /** | ||
363 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
364 | if (poll_start) { | ||
365 | event_notifier_init_fd(svq_kick, svq_kick_fd); | ||
366 | event_notifier_set(svq_kick); | ||
367 | - event_notifier_set_handler(svq_kick, vhost_handle_guest_kick); | ||
368 | + event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier); | ||
369 | + } | ||
370 | +} | ||
371 | + | ||
372 | +/** | ||
373 | + * Start the shadow virtqueue operation. | ||
374 | + * | ||
375 | + * @svq: Shadow Virtqueue | ||
376 | + * @vdev: VirtIO device | ||
377 | + * @vq: Virtqueue to shadow | ||
378 | + */ | ||
379 | +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
380 | + VirtQueue *vq) | ||
381 | +{ | ||
382 | + size_t desc_size, driver_size, device_size; | ||
383 | + | ||
384 | + svq->next_guest_avail_elem = NULL; | ||
385 | + svq->shadow_avail_idx = 0; | ||
386 | + svq->shadow_used_idx = 0; | ||
387 | + svq->last_used_idx = 0; | ||
388 | + svq->vdev = vdev; | ||
389 | + svq->vq = vq; | ||
390 | + | ||
391 | + svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq)); | ||
392 | + driver_size = vhost_svq_driver_area_size(svq); | ||
393 | + device_size = vhost_svq_device_area_size(svq); | ||
394 | + svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size); | ||
395 | + desc_size = sizeof(vring_desc_t) * svq->vring.num; | ||
396 | + svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size); | ||
397 | + memset(svq->vring.desc, 0, driver_size); | ||
398 | + svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size); | ||
399 | + memset(svq->vring.used, 0, device_size); | ||
400 | + svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num); | ||
401 | + for (unsigned i = 0; i < svq->vring.num - 1; i++) { | ||
402 | + svq->vring.desc[i].next = cpu_to_le16(i + 1); | ||
403 | } | ||
404 | } | ||
405 | |||
406 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
407 | void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
408 | { | ||
409 | event_notifier_set_handler(&svq->svq_kick, NULL); | ||
410 | + g_autofree VirtQueueElement *next_avail_elem = NULL; | ||
411 | + | ||
412 | + if (!svq->vq) { | ||
413 | + return; | ||
414 | + } | ||
415 | + | ||
416 | + /* Send all pending used descriptors to guest */ | ||
417 | + vhost_svq_flush(svq, false); | ||
418 | + | ||
419 | + for (unsigned i = 0; i < svq->vring.num; ++i) { | ||
420 | + g_autofree VirtQueueElement *elem = NULL; | ||
421 | + elem = g_steal_pointer(&svq->ring_id_maps[i]); | ||
422 | + if (elem) { | ||
423 | + virtqueue_detach_element(svq->vq, elem, 0); | ||
424 | + } | ||
425 | + } | ||
426 | + | ||
427 | + next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
428 | + if (next_avail_elem) { | ||
429 | + virtqueue_detach_element(svq->vq, next_avail_elem, 0); | ||
430 | + } | ||
431 | + svq->vq = NULL; | ||
432 | + g_free(svq->ring_id_maps); | ||
433 | + qemu_vfree(svq->vring.desc); | ||
434 | + qemu_vfree(svq->vring.used); | ||
435 | } | ||
436 | |||
437 | /** | ||
438 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
439 | index XXXXXXX..XXXXXXX 100644 | ||
440 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
441 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
442 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
443 | |||
444 | /* Guest's call notifier, where the SVQ calls guest. */ | ||
445 | EventNotifier svq_call; | ||
446 | + | ||
447 | + /* Virtio queue being shadowed */ | ||
448 | + VirtQueue *vq; | ||
449 | + | ||
450 | + /* Virtio device */ | ||
451 | + VirtIODevice *vdev; | ||
452 | + | ||
453 | + /* Map to track the guest's descriptors in use */ | ||
454 | + VirtQueueElement **ring_id_maps; | ||
455 | + | ||
456 | + /* Next VirtQueue element that guest made available */ | ||
457 | + VirtQueueElement *next_guest_avail_elem; | ||
458 | + | ||
459 | + /* Next head to expose to the device */ | ||
460 | + uint16_t shadow_avail_idx; | ||
461 | + | ||
462 | + /* Next free descriptor */ | ||
463 | + uint16_t free_head; | ||
464 | + | ||
465 | + /* Last seen used idx */ | ||
466 | + uint16_t shadow_used_idx; | ||
467 | + | ||
468 | + /* Next head to consume from the device */ | ||
469 | + uint16_t last_used_idx; | ||
470 | } VhostShadowVirtqueue; | ||
471 | |||
472 | bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
473 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
474 | size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); | ||
475 | size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); | ||
476 | |||
477 | +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
478 | + VirtQueue *vq); | ||
479 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
480 | |||
481 | VhostShadowVirtqueue *vhost_svq_new(void); | ||
482 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
483 | index XXXXXXX..XXXXXXX 100644 | ||
484 | --- a/hw/virtio/vhost-vdpa.c | ||
485 | +++ b/hw/virtio/vhost-vdpa.c | ||
486 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, | ||
487 | * Note that this function does not rewind kick file descriptor if cannot set | ||
488 | * call one. | ||
489 | */ | ||
490 | -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
491 | - VhostShadowVirtqueue *svq, | ||
492 | - unsigned idx, | ||
493 | - Error **errp) | ||
494 | +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, | ||
495 | + VhostShadowVirtqueue *svq, | ||
496 | + unsigned idx, | ||
497 | + Error **errp) | ||
498 | { | ||
499 | struct vhost_vring_file file = { | ||
500 | .index = dev->vq_index + idx, | ||
501 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
502 | r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
503 | if (unlikely(r != 0)) { | ||
504 | error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
505 | - return false; | ||
506 | + return r; | ||
507 | } | ||
508 | |||
509 | event_notifier = &svq->hdev_call; | ||
510 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
511 | error_setg_errno(errp, -r, "Can't set device call fd"); | ||
512 | } | ||
513 | |||
514 | + return r; | ||
515 | +} | ||
516 | + | ||
517 | +/** | ||
518 | + * Unmap a SVQ area in the device | ||
519 | + */ | ||
520 | +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, | ||
521 | + hwaddr size) | ||
522 | +{ | ||
523 | + int r; | ||
524 | + | ||
525 | + size = ROUND_UP(size, qemu_real_host_page_size); | ||
526 | + r = vhost_vdpa_dma_unmap(v, iova, size); | ||
527 | + return r == 0; | ||
528 | +} | ||
529 | + | ||
530 | +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, | ||
531 | + const VhostShadowVirtqueue *svq) | ||
532 | +{ | ||
533 | + struct vhost_vdpa *v = dev->opaque; | ||
534 | + struct vhost_vring_addr svq_addr; | ||
535 | + size_t device_size = vhost_svq_device_area_size(svq); | ||
536 | + size_t driver_size = vhost_svq_driver_area_size(svq); | ||
537 | + bool ok; | ||
538 | + | ||
539 | + vhost_svq_get_vring_addr(svq, &svq_addr); | ||
540 | + | ||
541 | + ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size); | ||
542 | + if (unlikely(!ok)) { | ||
543 | + return false; | ||
544 | + } | ||
545 | + | ||
546 | + return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size); | ||
547 | +} | ||
548 | + | ||
549 | +/** | ||
550 | + * Map the shadow virtqueue rings in the device | ||
551 | + * | ||
552 | + * @dev: The vhost device | ||
553 | + * @svq: The shadow virtqueue | ||
554 | + * @addr: Assigned IOVA addresses | ||
555 | + * @errp: Error pointer | ||
556 | + */ | ||
557 | +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, | ||
558 | + const VhostShadowVirtqueue *svq, | ||
559 | + struct vhost_vring_addr *addr, | ||
560 | + Error **errp) | ||
561 | +{ | ||
562 | + struct vhost_vdpa *v = dev->opaque; | ||
563 | + size_t device_size = vhost_svq_device_area_size(svq); | ||
564 | + size_t driver_size = vhost_svq_driver_area_size(svq); | ||
565 | + int r; | ||
566 | + | ||
567 | + ERRP_GUARD(); | ||
568 | + vhost_svq_get_vring_addr(svq, addr); | ||
569 | + | ||
570 | + r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, | ||
571 | + (void *)addr->desc_user_addr, true); | ||
572 | + if (unlikely(r != 0)) { | ||
573 | + error_setg_errno(errp, -r, "Cannot create vq driver region: "); | ||
574 | + return false; | ||
575 | + } | ||
576 | + | ||
577 | + r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, | ||
578 | + (void *)addr->used_user_addr, false); | ||
579 | + if (unlikely(r != 0)) { | ||
580 | + error_setg_errno(errp, -r, "Cannot create vq device region: "); | ||
581 | + } | ||
582 | + | ||
583 | + return r == 0; | ||
584 | +} | ||
585 | + | ||
586 | +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
587 | + VhostShadowVirtqueue *svq, | ||
588 | + unsigned idx, | ||
589 | + Error **errp) | ||
590 | +{ | ||
591 | + uint16_t vq_index = dev->vq_index + idx; | ||
592 | + struct vhost_vring_state s = { | ||
593 | + .index = vq_index, | ||
594 | + }; | ||
595 | + int r; | ||
596 | + | ||
597 | + r = vhost_vdpa_set_dev_vring_base(dev, &s); | ||
598 | + if (unlikely(r)) { | ||
599 | + error_setg_errno(errp, -r, "Cannot set vring base"); | ||
600 | + return false; | ||
601 | + } | ||
602 | + | ||
603 | + r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp); | ||
604 | return r == 0; | ||
605 | } | ||
606 | |||
607 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) | ||
608 | } | ||
609 | |||
610 | for (i = 0; i < v->shadow_vqs->len; ++i) { | ||
611 | + VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i); | ||
612 | VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
613 | + struct vhost_vring_addr addr = { | ||
614 | + .index = i, | ||
615 | + }; | ||
616 | + int r; | ||
617 | bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); | ||
618 | if (unlikely(!ok)) { | ||
619 | - error_reportf_err(err, "Cannot setup SVQ %u: ", i); | ||
620 | + goto err; | ||
621 | + } | ||
622 | + | ||
623 | + vhost_svq_start(svq, dev->vdev, vq); | ||
624 | + ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err); | ||
625 | + if (unlikely(!ok)) { | ||
626 | + goto err_map; | ||
627 | + } | ||
628 | + | ||
629 | + /* Override vring GPA set by vhost subsystem */ | ||
630 | + r = vhost_vdpa_set_vring_dev_addr(dev, &addr); | ||
631 | + if (unlikely(r != 0)) { | ||
632 | + error_setg_errno(&err, -r, "Cannot set device address"); | ||
633 | + goto err_set_addr; | ||
634 | + } | ||
635 | + } | ||
636 | + | ||
637 | + return true; | ||
638 | + | ||
639 | +err_set_addr: | ||
640 | + vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i)); | ||
641 | + | ||
642 | +err_map: | ||
643 | + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i)); | ||
644 | + | ||
645 | +err: | ||
646 | + error_reportf_err(err, "Cannot setup SVQ %u: ", i); | ||
647 | + for (unsigned j = 0; j < i; ++j) { | ||
648 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j); | ||
649 | + vhost_vdpa_svq_unmap_rings(dev, svq); | ||
650 | + vhost_svq_stop(svq); | ||
651 | + } | ||
652 | + | ||
653 | + return false; | ||
654 | +} | ||
655 | + | ||
656 | +static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev) | ||
657 | +{ | ||
658 | + struct vhost_vdpa *v = dev->opaque; | ||
659 | + | ||
660 | + if (!v->shadow_vqs) { | ||
661 | + return true; | ||
662 | + } | ||
663 | + | ||
664 | + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { | ||
665 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, | ||
666 | + i); | ||
667 | + bool ok = vhost_vdpa_svq_unmap_rings(dev, svq); | ||
668 | + if (unlikely(!ok)) { | ||
669 | return false; | ||
670 | } | ||
671 | } | ||
672 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) | ||
673 | } | ||
674 | vhost_vdpa_set_vring_ready(dev); | ||
675 | } else { | ||
676 | + ok = vhost_vdpa_svqs_stop(dev); | ||
677 | + if (unlikely(!ok)) { | ||
678 | + return -1; | ||
679 | + } | ||
680 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | ||
681 | } | ||
682 | |||
683 | -- | ||
684 | 2.7.4 | ||
685 | |||
686 | diff view generated by jsdifflib |
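The shadow virtqueue lifecycle added by the hunks above can be summarized with a short sketch (illustration only, not part of the series); the helper name is made up, and `vdev`/`vq` are assumed to be the VirtIODevice and the guest virtqueue being shadowed:

    /* Illustration only: drive one shadow virtqueue through the API above. */
    static void example_svq_lifecycle(VirtIODevice *vdev, VirtQueue *vq)
    {
        VhostShadowVirtqueue *svq = vhost_svq_new();

        /* Allocate the shadow rings and reset avail/used indexes */
        vhost_svq_start(svq, vdev, vq);

        /*
         * From here on, guest kicks are forwarded by
         * vhost_handle_guest_kick_notifier() and device completions are
         * returned to the guest by vhost_svq_handle_call().
         */

        /* Flush pending used buffers, detach in-flight elements, free rings */
        vhost_svq_stop(svq);
    }
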
New patch | |||
---|---|---|---|
1 | 1 | From: Eugenio Pérez <eperezma@redhat.com> | |
2 | |||
3 | This iova tree function looks for a hole in the allocated regions and | ||
4 | returns a totally new translation for a given translated | ||
5 | address. | ||
6 | |||
7 | Its main use is to allow devices to access qemu's address space, | ||
8 | remapping the guest's address space into a new iova space where qemu can | ||
9 | add chunks of addresses. | ||
10 | |||
11 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
12 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
13 | Reviewed-by: Peter Xu <peterx@redhat.com> | ||
14 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
15 | --- | ||
16 | include/qemu/iova-tree.h | 18 +++++++ | ||
17 | util/iova-tree.c | 135 +++++++++++++++++++++++++++++++++++++++++++++++ | ||
18 | 2 files changed, 153 insertions(+) | ||
19 | |||
20 | diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h | ||
21 | index XXXXXXX..XXXXXXX 100644 | ||
22 | --- a/include/qemu/iova-tree.h | ||
23 | +++ b/include/qemu/iova-tree.h | ||
24 | @@ -XXX,XX +XXX,XX @@ | ||
25 | #define IOVA_OK (0) | ||
26 | #define IOVA_ERR_INVALID (-1) /* Invalid parameters */ | ||
27 | #define IOVA_ERR_OVERLAP (-2) /* IOVA range overlapped */ | ||
28 | +#define IOVA_ERR_NOMEM (-3) /* Cannot allocate */ | ||
29 | |||
30 | typedef struct IOVATree IOVATree; | ||
31 | typedef struct DMAMap { | ||
32 | @@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova); | ||
33 | void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator); | ||
34 | |||
35 | /** | ||
36 | + * iova_tree_alloc_map: | ||
37 | + * | ||
38 | + * @tree: the iova tree to allocate from | ||
39 | + * @map: the new map (as translated addr & size) to allocate in the iova region | ||
40 | + * @iova_begin: the minimum address of the allocation | ||
41 | + * @iova_end: the maximum address the allocation may reach (inclusive) | ||
42 | + * | ||
43 | + * Allocates a new region of a given size, between iova_begin and iova_end. | ||
44 | + * | ||
45 | + * Return: Same as iova_tree_insert, except it cannot overlap an existing map | ||
46 | + * and returns IOVA_ERR_NOMEM if the iova tree has no free contiguous range | ||
47 | + * large enough. On success the caller gets the assigned iova in map->iova. | ||
48 | + */ | ||
49 | +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin, | ||
50 | + hwaddr iova_end); | ||
51 | + | ||
52 | +/** | ||
53 | * iova_tree_destroy: | ||
54 | * | ||
55 | * @tree: the iova tree to destroy | ||
56 | diff --git a/util/iova-tree.c b/util/iova-tree.c | ||
57 | index XXXXXXX..XXXXXXX 100644 | ||
58 | --- a/util/iova-tree.c | ||
59 | +++ b/util/iova-tree.c | ||
60 | @@ -XXX,XX +XXX,XX @@ struct IOVATree { | ||
61 | GTree *tree; | ||
62 | }; | ||
63 | |||
64 | +/* Args to pass to iova_tree_alloc foreach function. */ | ||
65 | +struct IOVATreeAllocArgs { | ||
66 | + /* Size of the desired allocation */ | ||
67 | + size_t new_size; | ||
68 | + | ||
69 | + /* The minimum address allowed in the allocation */ | ||
70 | + hwaddr iova_begin; | ||
71 | + | ||
72 | + /* Map at the left of the hole, can be NULL if "this" is the first one */ | ||
73 | + const DMAMap *prev; | ||
74 | + | ||
75 | + /* Map at the right of the hole, can be NULL if "prev" is the last one */ | ||
76 | + const DMAMap *this; | ||
77 | + | ||
78 | + /* If found, we fill in the IOVA here */ | ||
79 | + hwaddr iova_result; | ||
80 | + | ||
81 | + /* Whether we have found a valid IOVA */ | ||
82 | + bool iova_found; | ||
83 | +}; | ||
84 | + | ||
85 | +/** | ||
86 | + * Iterate args to the next hole | ||
87 | + * | ||
88 | + * @args: The alloc arguments | ||
89 | + * @next: The next mapping in the tree. Can be NULL to signal the last one | ||
90 | + */ | ||
91 | +static void iova_tree_alloc_args_iterate(struct IOVATreeAllocArgs *args, | ||
92 | + const DMAMap *next) { | ||
93 | + args->prev = args->this; | ||
94 | + args->this = next; | ||
95 | +} | ||
96 | + | ||
97 | static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data) | ||
98 | { | ||
99 | const DMAMap *m1 = a, *m2 = b; | ||
100 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map) | ||
101 | return IOVA_OK; | ||
102 | } | ||
103 | |||
104 | +/** | ||
105 | + * Try to find an unallocated IOVA range between prev and this elements. | ||
106 | + * | ||
107 | + * @args: Arguments to allocation | ||
108 | + * | ||
109 | + * Cases: | ||
110 | + * | ||
111 | + * (1) !prev, !this: No entries allocated, always succeed | ||
112 | + * | ||
113 | + * (2) !prev, this: We're iterating at the 1st element. | ||
114 | + * | ||
115 | + * (3) prev, !this: We're iterating at the last element. | ||
116 | + * | ||
117 | + * (4) prev, this: this is the most common case, we'll try to find a hole | ||
118 | + * between "prev" and "this" mapping. | ||
119 | + * | ||
120 | + * Note that this function assumes the last valid iova is HWADDR_MAX, but it | ||
121 | + * searches linearly so it's easy to discard the result if it's not the case. | ||
122 | + */ | ||
123 | +static void iova_tree_alloc_map_in_hole(struct IOVATreeAllocArgs *args) | ||
124 | +{ | ||
125 | + const DMAMap *prev = args->prev, *this = args->this; | ||
126 | + uint64_t hole_start, hole_last; | ||
127 | + | ||
128 | + if (this && this->iova + this->size < args->iova_begin) { | ||
129 | + return; | ||
130 | + } | ||
131 | + | ||
132 | + hole_start = MAX(prev ? prev->iova + prev->size + 1 : 0, args->iova_begin); | ||
133 | + hole_last = this ? this->iova : HWADDR_MAX; | ||
134 | + | ||
135 | + if (hole_last - hole_start > args->new_size) { | ||
136 | + args->iova_result = hole_start; | ||
137 | + args->iova_found = true; | ||
138 | + } | ||
139 | +} | ||
140 | + | ||
141 | +/** | ||
142 | + * For each dma node in the tree, check whether there is a hole between the | ||
143 | + * previous node (or the minimum iova address allowed) and the node itself. | ||
144 | + * | ||
145 | + * @key: Node iterating | ||
146 | + * @value: Node iterating | ||
147 | + * @pargs: Struct to communicate with the outside world | ||
148 | + * | ||
149 | + * Return: false to keep iterating, true to stop the traversal. | ||
150 | + */ | ||
151 | +static gboolean iova_tree_alloc_traverse(gpointer key, gpointer value, | ||
152 | + gpointer pargs) | ||
153 | +{ | ||
154 | + struct IOVATreeAllocArgs *args = pargs; | ||
155 | + DMAMap *node = value; | ||
156 | + | ||
157 | + assert(key == value); | ||
158 | + | ||
159 | + iova_tree_alloc_args_iterate(args, node); | ||
160 | + iova_tree_alloc_map_in_hole(args); | ||
161 | + return args->iova_found; | ||
162 | +} | ||
163 | + | ||
164 | +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin, | ||
165 | + hwaddr iova_last) | ||
166 | +{ | ||
167 | + struct IOVATreeAllocArgs args = { | ||
168 | + .new_size = map->size, | ||
169 | + .iova_begin = iova_begin, | ||
170 | + }; | ||
171 | + | ||
172 | + if (unlikely(iova_last < iova_begin)) { | ||
173 | + return IOVA_ERR_INVALID; | ||
174 | + } | ||
175 | + | ||
176 | + /* | ||
177 | + * Find a valid hole for the mapping | ||
178 | + * | ||
179 | + * Assuming low iova_begin, so no need to do a binary search to | ||
180 | + * locate the first node. | ||
181 | + * | ||
182 | + * TODO: Replace all this with g_tree_node_first/next/last when available | ||
183 | + * (from glib since 2.68). To do it with g_tree_foreach complicates the | ||
184 | + * code a lot. | ||
185 | + * | ||
186 | + */ | ||
187 | + g_tree_foreach(tree->tree, iova_tree_alloc_traverse, &args); | ||
188 | + if (!args.iova_found) { | ||
189 | + /* | ||
190 | + * Either the tree is empty or the last hole has not been checked yet. | ||
191 | + * g_tree_foreach does not compare (last, iova_last] range, so we check | ||
192 | + * it here. | ||
193 | + */ | ||
194 | + iova_tree_alloc_args_iterate(&args, NULL); | ||
195 | + iova_tree_alloc_map_in_hole(&args); | ||
196 | + } | ||
197 | + | ||
198 | + if (!args.iova_found || args.iova_result + map->size > iova_last) { | ||
199 | + return IOVA_ERR_NOMEM; | ||
200 | + } | ||
201 | + | ||
202 | + map->iova = args.iova_result; | ||
203 | + return iova_tree_insert(tree, map); | ||
204 | +} | ||
205 | + | ||
206 | void iova_tree_destroy(IOVATree *tree) | ||
207 | { | ||
208 | g_tree_destroy(tree->tree); | ||
209 | -- | ||
210 | 2.7.4 | ||
211 | |||
212 | diff view generated by jsdifflib |
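As a usage sketch of the new allocator (not taken from the patch: the helper name, the 0x1000 lower bound, and treating DMAMap.size as an inclusive, length-minus-one value are illustrative assumptions):

    /* Sketch: ask the tree for a free IOVA range covering a qemu VA chunk. */
    static hwaddr example_alloc_iova(IOVATree *tree, void *buf, size_t len)
    {
        DMAMap map = {
            .translated_addr = (hwaddr)(uintptr_t)buf, /* qemu VA to expose */
            .size = len - 1,                           /* inclusive size */
            .perm = IOMMU_RW,
        };

        if (iova_tree_alloc_map(tree, &map, 0x1000, HWADDR_MAX) != IOVA_OK) {
            return HWADDR_MAX;  /* no free contiguous range large enough */
        }
        return map.iova;        /* the IOVA assigned to this chunk */
    }
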
1 | From: Mikhail Sennikovsky <mikhail.sennikovskii@cloud.ionos.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Currently offloads disabled by guest via the VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET | 3 | This function does the reverse operation of iova_tree_find: To look for |
4 | command are not preserved on VM migration. | 4 | a mapping that matches a translated address so we can do the reverse. |
5 | Instead all offloads reported by guest features (via VIRTIO_PCI_GUEST_FEATURES) | ||
6 | get enabled. | ||
7 | What happens is: first the VirtIONet::curr_guest_offloads gets restored and offloads | ||
8 | are getting set correctly: | 6 | This has linear complexity instead of logarithmic, but it supports |
9 | 5 | ||
10 | #0 qemu_set_offload (nc=0x555556a11400, csum=1, tso4=0, tso6=0, ecn=0, ufo=0) at net/net.c:474 | 6 | This have linear complexity instead of logarithmic, but it supports |
11 | #1 virtio_net_apply_guest_offloads (n=0x555557701ca0) at hw/net/virtio-net.c:720 | 7 | overlapping HVA. Future developments could reduce it. |
12 | #2 virtio_net_post_load_device (opaque=0x555557701ca0, version_id=11) at hw/net/virtio-net.c:2334 | ||
13 | #3 vmstate_load_state (f=0x5555569dc010, vmsd=0x555556577c80 <vmstate_virtio_net_device>, opaque=0x555557701ca0, version_id=11) | ||
14 | at migration/vmstate.c:168 | ||
15 | #4 virtio_load (vdev=0x555557701ca0, f=0x5555569dc010, version_id=11) at hw/virtio/virtio.c:2197 | ||
16 | #5 virtio_device_get (f=0x5555569dc010, opaque=0x555557701ca0, size=0, field=0x55555668cd00 <__compound_literal.5>) at hw/virtio/virtio.c:2036 | ||
17 | #6 vmstate_load_state (f=0x5555569dc010, vmsd=0x555556577ce0 <vmstate_virtio_net>, opaque=0x555557701ca0, version_id=11) at migration/vmstate.c:143 | ||
18 | #7 vmstate_load (f=0x5555569dc010, se=0x5555578189e0) at migration/savevm.c:829 | ||
19 | #8 qemu_loadvm_section_start_full (f=0x5555569dc010, mis=0x5555569eee20) at migration/savevm.c:2211 | ||
20 | #9 qemu_loadvm_state_main (f=0x5555569dc010, mis=0x5555569eee20) at migration/savevm.c:2395 | ||
21 | #10 qemu_loadvm_state (f=0x5555569dc010) at migration/savevm.c:2467 | ||
22 | #11 process_incoming_migration_co (opaque=0x0) at migration/migration.c:449 | ||
23 | 8 | ||
24 | However later on the features are getting restored, and offloads get reset to | 9 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
25 | everything supported by features: | 10 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
26 | |||
27 | #0 qemu_set_offload (nc=0x555556a11400, csum=1, tso4=1, tso6=1, ecn=0, ufo=0) at net/net.c:474 | ||
28 | #1 virtio_net_apply_guest_offloads (n=0x555557701ca0) at hw/net/virtio-net.c:720 | ||
29 | #2 virtio_net_set_features (vdev=0x555557701ca0, features=5104441767) at hw/net/virtio-net.c:773 | ||
30 | #3 virtio_set_features_nocheck (vdev=0x555557701ca0, val=5104441767) at hw/virtio/virtio.c:2052 | ||
31 | #4 virtio_load (vdev=0x555557701ca0, f=0x5555569dc010, version_id=11) at hw/virtio/virtio.c:2220 | ||
32 | #5 virtio_device_get (f=0x5555569dc010, opaque=0x555557701ca0, size=0, field=0x55555668cd00 <__compound_literal.5>) at hw/virtio/virtio.c:2036 | ||
33 | #6 vmstate_load_state (f=0x5555569dc010, vmsd=0x555556577ce0 <vmstate_virtio_net>, opaque=0x555557701ca0, version_id=11) at migration/vmstate.c:143 | ||
34 | #7 vmstate_load (f=0x5555569dc010, se=0x5555578189e0) at migration/savevm.c:829 | ||
35 | #8 qemu_loadvm_section_start_full (f=0x5555569dc010, mis=0x5555569eee20) at migration/savevm.c:2211 | ||
36 | #9 qemu_loadvm_state_main (f=0x5555569dc010, mis=0x5555569eee20) at migration/savevm.c:2395 | ||
37 | #10 qemu_loadvm_state (f=0x5555569dc010) at migration/savevm.c:2467 | ||
38 | #11 process_incoming_migration_co (opaque=0x0) at migration/migration.c:449 | ||
39 | |||
40 | Fix this by preserving the state in saved_guest_offloads field and | ||
41 | pushing out offload initialization to the new post load hook. | ||
42 | |||
43 | Cc: qemu-stable@nongnu.org | ||
44 | Signed-off-by: Mikhail Sennikovsky <mikhail.sennikovskii@cloud.ionos.com> | ||
45 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
46 | --- | 12 | --- |
47 | hw/net/virtio-net.c | 27 ++++++++++++++++++++++++--- | 13 | include/qemu/iova-tree.h | 20 +++++++++++++++++++- |
48 | include/hw/virtio/virtio-net.h | 2 ++ | 14 | util/iova-tree.c | 34 ++++++++++++++++++++++++++++++++++ |
49 | 2 files changed, 26 insertions(+), 3 deletions(-) | 15 | 2 files changed, 53 insertions(+), 1 deletion(-) |
50 | 16 | ||
51 | diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c | 17 | diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h |
52 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
53 | --- a/hw/net/virtio-net.c | 19 | --- a/include/qemu/iova-tree.h |
54 | +++ b/hw/net/virtio-net.c | 20 | +++ b/include/qemu/iova-tree.h |
55 | @@ -XXX,XX +XXX,XX @@ static int virtio_net_post_load_device(void *opaque, int version_id) | 21 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map); |
56 | n->curr_guest_offloads = virtio_net_supported_guest_offloads(n); | 22 | * @tree: the iova tree to search from |
57 | } | 23 | * @map: the mapping to search |
58 | 24 | * | |
59 | - if (peer_has_vnet_hdr(n)) { | 25 | - * Search for a mapping in the iova tree that overlaps with the |
60 | - virtio_net_apply_guest_offloads(n); | 26 | + * Search for a mapping in the iova tree that iova overlaps with the |
61 | - } | 27 | * mapping range specified. Only the first found mapping will be |
62 | + /* | 28 | * returned. |
63 | + * curr_guest_offloads will be later overwritten by the | 29 | * |
64 | + * virtio_set_features_nocheck call done from the virtio_load. | 30 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map); |
65 | + * Here we make sure it is preserved and restored accordingly | 31 | const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map); |
66 | + * in the virtio_net_post_load_virtio callback. | 32 | |
67 | + */ | 33 | /** |
68 | + n->saved_guest_offloads = n->curr_guest_offloads; | 34 | + * iova_tree_find_iova: |
69 | 35 | + * | |
70 | virtio_net_set_queues(n); | 36 | + * @tree: the iova tree to search from |
71 | 37 | + * @map: the mapping to search | |
72 | @@ -XXX,XX +XXX,XX @@ static int virtio_net_post_load_device(void *opaque, int version_id) | 38 | + * |
73 | return 0; | 39 | + * Search for a mapping in the iova tree that translated_addr overlaps with the |
40 | + * mapping range specified. Only the first found mapping will be | ||
41 | + * returned. | ||
42 | + * | ||
43 | + * Return: DMAMap pointer if found, or NULL if not found. Note that | ||
44 | + * the returned DMAMap pointer is maintained internally. User should | ||
45 | + * only read the content but never modify or free the content. Also, | ||
46 | + * user is responsible to make sure the pointer is valid (say, no | ||
47 | + * concurrent deletion in progress). | ||
48 | + */ | ||
49 | +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map); | ||
50 | + | ||
51 | +/** | ||
52 | * iova_tree_find_address: | ||
53 | * | ||
54 | * @tree: the iova tree to search from | ||
55 | diff --git a/util/iova-tree.c b/util/iova-tree.c | ||
56 | index XXXXXXX..XXXXXXX 100644 | ||
57 | --- a/util/iova-tree.c | ||
58 | +++ b/util/iova-tree.c | ||
59 | @@ -XXX,XX +XXX,XX @@ struct IOVATreeAllocArgs { | ||
60 | bool iova_found; | ||
61 | }; | ||
62 | |||
63 | +typedef struct IOVATreeFindIOVAArgs { | ||
64 | + const DMAMap *needle; | ||
65 | + const DMAMap *result; | ||
66 | +} IOVATreeFindIOVAArgs; | ||
67 | + | ||
68 | /** | ||
69 | * Iterate args to the next hole | ||
70 | * | ||
71 | @@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map) | ||
72 | return g_tree_lookup(tree->tree, map); | ||
74 | } | 73 | } |
75 | 74 | ||
76 | +static int virtio_net_post_load_virtio(VirtIODevice *vdev) | 75 | +static gboolean iova_tree_find_address_iterator(gpointer key, gpointer value, |
76 | + gpointer data) | ||
77 | +{ | 77 | +{ |
78 | + VirtIONet *n = VIRTIO_NET(vdev); | 78 | + const DMAMap *map = key; |
79 | + /* | 79 | + IOVATreeFindIOVAArgs *args = data; |
80 | + * The actual needed state is now in saved_guest_offloads, | 80 | + const DMAMap *needle; |
81 | + * see virtio_net_post_load_device for detail. | 81 | + |
82 | + * Restore it back and apply the desired offloads. | 82 | + g_assert(key == value); |
83 | + */ | 83 | + |
84 | + n->curr_guest_offloads = n->saved_guest_offloads; | 84 | + needle = args->needle; |
85 | + if (peer_has_vnet_hdr(n)) { | 85 | + if (map->translated_addr + map->size < needle->translated_addr || |
86 | + virtio_net_apply_guest_offloads(n); | 86 | + needle->translated_addr + needle->size < map->translated_addr) { |
87 | + return false; | ||
87 | + } | 88 | + } |
88 | + | 89 | + |
89 | + return 0; | 90 | + args->result = map; |
91 | + return true; | ||
90 | +} | 92 | +} |
91 | + | 93 | + |
92 | /* tx_waiting field of a VirtIONetQueue */ | 94 | +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map) |
93 | static const VMStateDescription vmstate_virtio_net_queue_tx_waiting = { | 95 | +{ |
94 | .name = "virtio-net-queue-tx_waiting", | 96 | + IOVATreeFindIOVAArgs args = { |
95 | @@ -XXX,XX +XXX,XX @@ static void virtio_net_class_init(ObjectClass *klass, void *data) | 97 | + .needle = map, |
96 | vdc->guest_notifier_mask = virtio_net_guest_notifier_mask; | 98 | + }; |
97 | vdc->guest_notifier_pending = virtio_net_guest_notifier_pending; | 99 | + |
98 | vdc->legacy_features |= (0x1 << VIRTIO_NET_F_GSO); | 100 | + g_tree_foreach(tree->tree, iova_tree_find_address_iterator, &args); |
99 | + vdc->post_load = virtio_net_post_load_virtio; | 101 | + return args.result; |
100 | vdc->vmsd = &vmstate_virtio_net_device; | 102 | +} |
101 | } | 103 | + |
102 | 104 | const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova) | |
103 | diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h | 105 | { |
104 | index XXXXXXX..XXXXXXX 100644 | 106 | const DMAMap map = { .iova = iova, .size = 0 }; |
105 | --- a/include/hw/virtio/virtio-net.h | ||
106 | +++ b/include/hw/virtio/virtio-net.h | ||
107 | @@ -XXX,XX +XXX,XX @@ struct VirtIONet { | ||
108 | char *netclient_name; | ||
109 | char *netclient_type; | ||
110 | uint64_t curr_guest_offloads; | ||
111 | + /* used on saved state restore phase to preserve the curr_guest_offloads */ | ||
112 | + uint64_t saved_guest_offloads; | ||
113 | AnnounceTimer announce_timer; | ||
114 | bool needs_vnet_hdr_swap; | ||
115 | bool mtu_bypass_backend; | ||
116 | -- | 107 | -- |
117 | 2.5.0 | 108 | 2.7.4 |
118 | 109 | ||
119 | 110 | diff view generated by jsdifflib |
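A matching sketch for the reverse direction (again illustrative, not from the patch): given a qemu VA that was previously inserted into the tree, look up the mapping and recover the IOVA, keeping the offset inside the matched chunk:

    /* Sketch: reverse-translate a qemu VA to its IOVA with iova_tree_find_iova(). */
    static hwaddr example_va_to_iova(const IOVATree *tree, void *addr)
    {
        const DMAMap needle = {
            .translated_addr = (hwaddr)(uintptr_t)addr,
            .size = 0,                          /* single-address lookup */
        };
        const DMAMap *found = iova_tree_find_iova(tree, &needle);

        if (!found) {
            return HWADDR_MAX;                  /* address is not mapped */
        }
        return found->iova + (needle.translated_addr - found->translated_addr);
    }
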
1 | From: Sven Schnelle <svens@stackframe.org> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | This adds the basic functionality to emulate a Tulip NIC. | 3 | This tree is able to look for a translated address from an IOVA address. |
4 | 4 | ||
5 | Implemented are: | 5 | At first glance it is similar to util/iova-tree. However, SVQ working on |
6 | devices with limited IOVA space need more capabilities, like allocating | ||
7 | IOVA chunks or performing reverse translations (qemu addresses to iova). | ||
6 | 8 | ||
7 | - RX and TX functionality | 9 | The allocation capability, as "assign a free IOVA address to this chunk |
8 | - Perfect Frame Filtering | 10 | of memory in qemu's address space" allows shadow virtqueue to create a |
9 | - Big/Little Endian descriptor support | 11 | new address space that is not restricted by guest's addressable one, so |
10 | - 93C46 EEPROM support | 12 | we can allocate shadow vqs vrings outside of it. |
11 | - LXT970 PHY | ||
12 | 13 | ||
13 | Not implemented, mostly because i had no OS using these functions: | 14 | It duplicates the tree so it can search efficiently in both directions, |
15 | and it will signal overlap if iova or the translated address is present | ||
16 | in any tree. | ||
14 | 17 | ||
15 | - Imperfect frame filtering | 18 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
16 | - General Purpose Timer | 19 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
17 | - Transmit automatic polling | ||
18 | - Boot ROM support | ||
19 | - SIA interface | ||
20 | - Big/Little Endian data buffer conversion | ||
21 | |||
22 | Successfully tested with the following Operating Systems: | ||
23 | |||
24 | - MSDOS with Microsoft Network Client 3.0 and DEC ODI drivers | ||
25 | - HPPA Linux | ||
26 | - Windows XP | ||
27 | - HP-UX | ||
28 | |||
29 | Signed-off-by: Sven Schnelle <svens@stackframe.org> | ||
30 | Message-Id: <20191022155413.4619-1-svens@stackframe.org> | ||
31 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | ||
32 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 20 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
33 | --- | 21 | --- |
34 | MAINTAINERS | 6 + | 22 | hw/virtio/meson.build | 2 +- |
35 | hw/net/Kconfig | 5 + | 23 | hw/virtio/vhost-iova-tree.c | 110 ++++++++++++++++++++++++++++++++++++++++++++ |
36 | hw/net/Makefile.objs | 1 + | 24 | hw/virtio/vhost-iova-tree.h | 27 +++++++++++ |
37 | hw/net/trace-events | 14 + | 25 | 3 files changed, 138 insertions(+), 1 deletion(-) |
38 | hw/net/tulip.c | 1029 ++++++++++++++++++++++++++++++++++++++++++++++ | 26 | create mode 100644 hw/virtio/vhost-iova-tree.c |
39 | hw/net/tulip.h | 267 ++++++++++++ | 27 | create mode 100644 hw/virtio/vhost-iova-tree.h |
40 | include/hw/pci/pci_ids.h | 1 + | ||
41 | 7 files changed, 1323 insertions(+) | ||
42 | create mode 100644 hw/net/tulip.c | ||
43 | create mode 100644 hw/net/tulip.h | ||
44 | 28 | ||
45 | diff --git a/MAINTAINERS b/MAINTAINERS | 29 | diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build |
46 | index XXXXXXX..XXXXXXX 100644 | 30 | index XXXXXXX..XXXXXXX 100644 |
47 | --- a/MAINTAINERS | 31 | --- a/hw/virtio/meson.build |
48 | +++ b/MAINTAINERS | 32 | +++ b/hw/virtio/meson.build |
49 | @@ -XXX,XX +XXX,XX @@ M: Stefan Weil <sw@weilnetz.de> | 33 | @@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) |
50 | S: Maintained | 34 | |
51 | F: hw/net/eepro100.c | 35 | virtio_ss = ss.source_set() |
52 | 36 | virtio_ss.add(files('virtio.c')) | |
53 | +tulip | 37 | -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c')) |
54 | +M: Sven Schnelle <svens@stackframe.org> | 38 | +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c')) |
55 | +S: Maintained | 39 | virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) |
56 | +F: hw/net/tulip.c | 40 | virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) |
57 | +F: hw/net/tulip.h | 41 | virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) |
58 | + | 42 | diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c |
59 | Generic Loader | ||
60 | M: Alistair Francis <alistair@alistair23.me> | ||
61 | S: Maintained | ||
62 | diff --git a/hw/net/Kconfig b/hw/net/Kconfig | ||
63 | index XXXXXXX..XXXXXXX 100644 | ||
64 | --- a/hw/net/Kconfig | ||
65 | +++ b/hw/net/Kconfig | ||
66 | @@ -XXX,XX +XXX,XX @@ config PCNET_PCI | ||
67 | config PCNET_COMMON | ||
68 | bool | ||
69 | |||
70 | +config TULIP | ||
71 | + bool | ||
72 | + default y if PCI_DEVICES | ||
73 | + depends on PCI | ||
74 | + | ||
75 | config E1000_PCI | ||
76 | bool | ||
77 | default y if PCI_DEVICES | ||
78 | diff --git a/hw/net/Makefile.objs b/hw/net/Makefile.objs | ||
79 | index XXXXXXX..XXXXXXX 100644 | ||
80 | --- a/hw/net/Makefile.objs | ||
81 | +++ b/hw/net/Makefile.objs | ||
82 | @@ -XXX,XX +XXX,XX @@ common-obj-$(CONFIG_E1000E_PCI_EXPRESS) += e1000e.o e1000e_core.o e1000x_common. | ||
83 | common-obj-$(CONFIG_RTL8139_PCI) += rtl8139.o | ||
84 | common-obj-$(CONFIG_VMXNET3_PCI) += net_tx_pkt.o net_rx_pkt.o | ||
85 | common-obj-$(CONFIG_VMXNET3_PCI) += vmxnet3.o | ||
86 | +common-obj-$(CONFIG_TULIP) += tulip.o | ||
87 | |||
88 | common-obj-$(CONFIG_SMC91C111) += smc91c111.o | ||
89 | common-obj-$(CONFIG_LAN9118) += lan9118.o | ||
90 | diff --git a/hw/net/trace-events b/hw/net/trace-events | ||
91 | index XXXXXXX..XXXXXXX 100644 | ||
92 | --- a/hw/net/trace-events | ||
93 | +++ b/hw/net/trace-events | ||
94 | @@ -XXX,XX +XXX,XX @@ virtio_net_announce_notify(void) "" | ||
95 | virtio_net_announce_timer(int round) "%d" | ||
96 | virtio_net_handle_announce(int round) "%d" | ||
97 | virtio_net_post_load_device(void) | ||
98 | + | ||
99 | +# tulip.c | ||
100 | +tulip_reg_write(uint64_t addr, const char *name, int size, uint64_t val) "addr 0x%02"PRIx64" (%s) size %d value 0x%08"PRIx64 | ||
101 | +tulip_reg_read(uint64_t addr, const char *name, int size, uint64_t val) "addr 0x%02"PRIx64" (%s) size %d value 0x%08"PRIx64 | ||
102 | +tulip_receive(const uint8_t *buf, size_t len) "buf %p size %zu" | ||
103 | +tulip_descriptor(const char *prefix, uint32_t addr, uint32_t status, uint32_t control, uint32_t len1, uint32_t len2, uint32_t buf1, uint32_t buf2) "%s 0x%08x: status 0x%08x control 0x%03x len1 %4d len2 %4d buf1 0x%08x buf2 0x%08x" | ||
104 | +tulip_rx_state(const char *state) "RX %s" | ||
105 | +tulip_tx_state(const char *state) "TX %s" | ||
106 | +tulip_irq(uint32_t mask, uint32_t en, const char *state) "mask 0x%08x ie 0x%08x %s" | ||
107 | +tulip_mii_write(int phy, int reg, uint16_t data) "phy 0x%x reg 0x%x data 0x%04x" | ||
108 | +tulip_mii_read(int phy, int reg, uint16_t data) "phy 0x%x, reg 0x%x data 0x%04x" | ||
109 | +tulip_reset(void) "" | ||
110 | +tulip_setup_frame(void) "" | ||
111 | +tulip_setup_filter(int n, uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f) "%d: %02x:%02x:%02x:%02x:%02x:%02x" | ||
112 | diff --git a/hw/net/tulip.c b/hw/net/tulip.c | ||
113 | new file mode 100644 | 43 | new file mode 100644 |
114 | index XXXXXXX..XXXXXXX | 44 | index XXXXXXX..XXXXXXX |
115 | --- /dev/null | 45 | --- /dev/null |
116 | +++ b/hw/net/tulip.c | 46 | +++ b/hw/virtio/vhost-iova-tree.c |
117 | @@ -XXX,XX +XXX,XX @@ | 47 | @@ -XXX,XX +XXX,XX @@ |
118 | +/* | 48 | +/* |
119 | + * QEMU TULIP Emulation | 49 | + * vhost software live migration iova tree |
120 | + * | 50 | + * |
121 | + * Copyright (c) 2019 Sven Schnelle <svens@stackframe.org> | 51 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 |
52 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
122 | + * | 53 | + * |
123 | + * This work is licensed under the GNU GPL license version 2 or later. | 54 | + * SPDX-License-Identifier: GPL-2.0-or-later |
124 | + */ | 55 | + */ |
125 | + | 56 | + |
126 | +#include "qemu/osdep.h" | 57 | +#include "qemu/osdep.h" |
127 | +#include "qemu/log.h" | 58 | +#include "qemu/iova-tree.h" |
128 | +#include "hw/irq.h" | 59 | +#include "vhost-iova-tree.h" |
129 | +#include "hw/pci/pci.h" | ||
130 | +#include "hw/qdev-properties.h" | ||
131 | +#include "hw/nvram/eeprom93xx.h" | ||
132 | +#include "migration/vmstate.h" | ||
133 | +#include "sysemu/sysemu.h" | ||
134 | +#include "tulip.h" | ||
135 | +#include "trace.h" | ||
136 | +#include "net/eth.h" | ||
137 | + | 60 | + |
138 | +typedef struct TULIPState { | 61 | +#define iova_min_addr qemu_real_host_page_size |
139 | + PCIDevice dev; | ||
140 | + MemoryRegion io; | ||
141 | + MemoryRegion memory; | ||
142 | + NICConf c; | ||
143 | + qemu_irq irq; | ||
144 | + NICState *nic; | ||
145 | + eeprom_t *eeprom; | ||
146 | + uint32_t csr[16]; | ||
147 | + | 62 | + |
148 | + /* state for MII */ | 63 | +/** |
149 | + uint32_t old_csr9; | 64 | + * VhostIOVATree, able to: |
150 | + uint32_t mii_word; | 65 | + * - Translate iova address |
151 | + uint32_t mii_bitcnt; | 66 | + * - Reverse translate iova address (from translated to iova) |
67 | + * - Allocate IOVA regions for translated range (linear operation) | ||
68 | + */ | ||
69 | +struct VhostIOVATree { | ||
70 | + /* First addressable iova address in the device */ | ||
71 | + uint64_t iova_first; | ||
152 | + | 72 | + |
153 | + hwaddr current_rx_desc; | 73 | + /* Last addressable iova address in the device */ |
154 | + hwaddr current_tx_desc; | 74 | + uint64_t iova_last; |
155 | + | 75 | + |
156 | + uint8_t rx_frame[2048]; | 76 | + /* IOVA address to qemu memory maps. */ |
157 | + uint8_t tx_frame[2048]; | 77 | + IOVATree *iova_taddr_map; |
158 | + uint16_t tx_frame_len; | ||
159 | + uint16_t rx_frame_len; | ||
160 | + uint16_t rx_frame_size; | ||
161 | + | ||
162 | + uint32_t rx_status; | ||
163 | + uint8_t filter[16][6]; | ||
164 | +} TULIPState; | ||
165 | + | ||
166 | +static const VMStateDescription vmstate_pci_tulip = { | ||
167 | + .name = "tulip", | ||
168 | + .fields = (VMStateField[]) { | ||
169 | + VMSTATE_PCI_DEVICE(dev, TULIPState), | ||
170 | + VMSTATE_UINT32_ARRAY(csr, TULIPState, 16), | ||
171 | + VMSTATE_UINT32(old_csr9, TULIPState), | ||
172 | + VMSTATE_UINT32(mii_word, TULIPState), | ||
173 | + VMSTATE_UINT32(mii_bitcnt, TULIPState), | ||
174 | + VMSTATE_UINT64(current_rx_desc, TULIPState), | ||
175 | + VMSTATE_UINT64(current_tx_desc, TULIPState), | ||
176 | + VMSTATE_BUFFER(rx_frame, TULIPState), | ||
177 | + VMSTATE_BUFFER(tx_frame, TULIPState), | ||
178 | + VMSTATE_UINT16(rx_frame_len, TULIPState), | ||
179 | + VMSTATE_UINT16(tx_frame_len, TULIPState), | ||
180 | + VMSTATE_UINT16(rx_frame_size, TULIPState), | ||
181 | + VMSTATE_UINT32(rx_status, TULIPState), | ||
182 | + VMSTATE_UINT8_2DARRAY(filter, TULIPState, 16, 6), | ||
183 | + VMSTATE_END_OF_LIST() | ||
184 | + } | ||
185 | +}; | 78 | +}; |
186 | + | 79 | + |
187 | +static void tulip_desc_read(TULIPState *s, hwaddr p, | 80 | +/** |
188 | + struct tulip_descriptor *desc) | 81 | + * Create a new IOVA tree |
82 | + * | ||
83 | + * Returns the new IOVA tree | ||
84 | + */ | ||
85 | +VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last) | ||
189 | +{ | 86 | +{ |
190 | + if (s->csr[0] & CSR0_DBO) { | 87 | + VhostIOVATree *tree = g_new(VhostIOVATree, 1); |
191 | + desc->status = ldl_be_pci_dma(&s->dev, p); | 88 | + |
192 | + desc->control = ldl_be_pci_dma(&s->dev, p + 4); | 89 | + /* Some devices do not like 0 addresses */ |
193 | + desc->buf_addr1 = ldl_be_pci_dma(&s->dev, p + 8); | 90 | + tree->iova_first = MAX(iova_first, iova_min_addr); |
194 | + desc->buf_addr2 = ldl_be_pci_dma(&s->dev, p + 12); | 91 | + tree->iova_last = iova_last; |
195 | + } else { | 92 | + |
196 | + desc->status = ldl_le_pci_dma(&s->dev, p); | 93 | + tree->iova_taddr_map = iova_tree_new(); |
197 | + desc->control = ldl_le_pci_dma(&s->dev, p + 4); | 94 | + return tree; |
198 | + desc->buf_addr1 = ldl_le_pci_dma(&s->dev, p + 8); | ||
199 | + desc->buf_addr2 = ldl_le_pci_dma(&s->dev, p + 12); | ||
200 | + } | ||
201 | +} | 95 | +} |
202 | + | 96 | + |
203 | +static void tulip_desc_write(TULIPState *s, hwaddr p, | 97 | +/** |
204 | + struct tulip_descriptor *desc) | 98 | + * Delete an iova tree |
99 | + */ | ||
100 | +void vhost_iova_tree_delete(VhostIOVATree *iova_tree) | ||
205 | +{ | 101 | +{ |
206 | + if (s->csr[0] & CSR0_DBO) { | 102 | + iova_tree_destroy(iova_tree->iova_taddr_map); |
207 | + stl_be_pci_dma(&s->dev, p, desc->status); | 103 | + g_free(iova_tree); |
208 | + stl_be_pci_dma(&s->dev, p + 4, desc->control); | ||
209 | + stl_be_pci_dma(&s->dev, p + 8, desc->buf_addr1); | ||
210 | + stl_be_pci_dma(&s->dev, p + 12, desc->buf_addr2); | ||
211 | + } else { | ||
212 | + stl_le_pci_dma(&s->dev, p, desc->status); | ||
213 | + stl_le_pci_dma(&s->dev, p + 4, desc->control); | ||
214 | + stl_le_pci_dma(&s->dev, p + 8, desc->buf_addr1); | ||
215 | + stl_le_pci_dma(&s->dev, p + 12, desc->buf_addr2); | ||
216 | + } | ||
217 | +} | 104 | +} |
218 | + | 105 | + |
219 | +static void tulip_update_int(TULIPState *s) | 106 | +/** |
107 | + * Find the IOVA address stored from a memory address | ||
108 | + * | ||
109 | + * @tree: The iova tree | ||
110 | + * @map: The map with the memory address | ||
111 | + * | ||
112 | + * Return the stored mapping, or NULL if not found. | ||
113 | + */ | ||
114 | +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree, | ||
115 | + const DMAMap *map) | ||
220 | +{ | 116 | +{ |
221 | + uint32_t ie = s->csr[5] & s->csr[7]; | 117 | + return iova_tree_find_iova(tree->iova_taddr_map, map); |
222 | + bool assert = false; | 118 | +} |
223 | + | 119 | + |
224 | + s->csr[5] &= ~(CSR5_AIS | CSR5_NIS); | 120 | +/** |
121 | + * Allocate a new mapping | ||
122 | + * | ||
123 | + * @tree: The iova tree | ||
124 | + * @map: The iova map | ||
125 | + * | ||
126 | + * Returns: | ||
127 | + * - IOVA_OK if the map fits in the container | ||
128 | + * - IOVA_ERR_INVALID if the map does not make sense (like size overflow) | ||
129 | + * - IOVA_ERR_NOMEM if tree cannot allocate more space. | ||
130 | + * | ||
131 | + * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK. | ||
132 | + */ | ||
133 | +int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map) | ||
134 | +{ | ||
135 | + /* Some vhost devices do not like addr 0. Skip first page */ | ||
136 | + hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size; | ||
225 | + | 137 | + |
226 | + if (ie & (CSR5_TI | CSR5_TU | CSR5_RI | CSR5_GTE | CSR5_ERI)) { | 138 | + if (map->translated_addr + map->size < map->translated_addr || |
227 | + s->csr[5] |= CSR5_NIS; | 139 | + map->perm == IOMMU_NONE) { |
140 | + return IOVA_ERR_INVALID; | ||
228 | + } | 141 | + } |
229 | + | 142 | + |
230 | + if (ie & (CSR5_LC | CSR5_GPI | CSR5_FBE | CSR5_LNF | CSR5_ETI | CSR5_RWT | | 143 | + /* Allocate a node in IOVA address */ |
231 | + CSR5_RPS | CSR5_RU | CSR5_UNF | CSR5_LNP_ANC | CSR5_TJT | | 144 | + return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first, |
232 | + CSR5_TPS)) { | 145 | + tree->iova_last); |
233 | + s->csr[5] |= CSR5_AIS; | ||
234 | + } | ||
235 | + | ||
236 | + assert = s->csr[5] & s->csr[7] & (CSR5_AIS | CSR5_NIS); | ||
237 | + trace_tulip_irq(s->csr[5], s->csr[7], assert ? "assert" : "deassert"); | ||
238 | + qemu_set_irq(s->irq, assert); | ||
239 | +} | 146 | +} |
240 | + | 147 | + |
241 | +static bool tulip_rx_stopped(TULIPState *s) | 148 | +/** |
149 | + * Remove existing mappings from iova tree | ||
150 | + * | ||
151 | + * @iova_tree: The vhost iova tree | ||
152 | + * @map: The map to remove | ||
153 | + */ | ||
154 | +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map) | ||
242 | +{ | 155 | +{ |
243 | + return ((s->csr[5] >> CSR5_RS_SHIFT) & CSR5_RS_MASK) == CSR5_RS_STOPPED; | 156 | + iova_tree_remove(iova_tree->iova_taddr_map, map); |
244 | +} | 157 | +} |
245 | + | 158 | diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h |
246 | +static void tulip_dump_tx_descriptor(TULIPState *s, | ||
247 | + struct tulip_descriptor *desc) | ||
248 | +{ | ||
249 | + trace_tulip_descriptor("TX ", s->current_tx_desc, | ||
250 | + desc->status, desc->control >> 22, | ||
251 | + desc->control & 0x7ff, (desc->control >> 11) & 0x7ff, | ||
252 | + desc->buf_addr1, desc->buf_addr2); | ||
253 | +} | ||
254 | + | ||
255 | +static void tulip_dump_rx_descriptor(TULIPState *s, | ||
256 | + struct tulip_descriptor *desc) | ||
257 | +{ | ||
258 | + trace_tulip_descriptor("RX ", s->current_rx_desc, | ||
259 | + desc->status, desc->control >> 22, | ||
260 | + desc->control & 0x7ff, (desc->control >> 11) & 0x7ff, | ||
261 | + desc->buf_addr1, desc->buf_addr2); | ||
262 | +} | ||
263 | + | ||
264 | +static void tulip_next_rx_descriptor(TULIPState *s, | ||
265 | + struct tulip_descriptor *desc) | ||
266 | +{ | ||
267 | + if (desc->control & RDES1_RER) { | ||
268 | + s->current_rx_desc = s->csr[3]; | ||
269 | + } else if (desc->control & RDES1_RCH) { | ||
270 | + s->current_rx_desc = desc->buf_addr2; | ||
271 | + } else { | ||
272 | + s->current_rx_desc += sizeof(struct tulip_descriptor) + | ||
273 | + (((s->csr[0] >> CSR0_DSL_SHIFT) & CSR0_DSL_MASK) << 2); | ||
274 | + } | ||
275 | + s->current_rx_desc &= ~3ULL; | ||
276 | +} | ||
277 | + | ||
278 | +static void tulip_copy_rx_bytes(TULIPState *s, struct tulip_descriptor *desc) | ||
279 | +{ | ||
280 | + int len1 = (desc->control >> RDES1_BUF1_SIZE_SHIFT) & RDES1_BUF1_SIZE_MASK; | ||
281 | + int len2 = (desc->control >> RDES1_BUF2_SIZE_SHIFT) & RDES1_BUF2_SIZE_MASK; | ||
282 | + int len; | ||
283 | + | ||
284 | + if (s->rx_frame_len && len1) { | ||
285 | + if (s->rx_frame_len > len1) { | ||
286 | + len = len1; | ||
287 | + } else { | ||
288 | + len = s->rx_frame_len; | ||
289 | + } | ||
290 | + pci_dma_write(&s->dev, desc->buf_addr1, s->rx_frame + | ||
291 | + (s->rx_frame_size - s->rx_frame_len), len); | ||
292 | + s->rx_frame_len -= len; | ||
293 | + } | ||
294 | + | ||
295 | + if (s->rx_frame_len && len2) { | ||
296 | + if (s->rx_frame_len > len2) { | ||
297 | + len = len2; | ||
298 | + } else { | ||
299 | + len = s->rx_frame_len; | ||
300 | + } | ||
301 | + pci_dma_write(&s->dev, desc->buf_addr2, s->rx_frame + | ||
302 | + (s->rx_frame_size - s->rx_frame_len), len); | ||
303 | + s->rx_frame_len -= len; | ||
304 | + } | ||
305 | +} | ||
306 | + | ||
307 | +static bool tulip_filter_address(TULIPState *s, const uint8_t *addr) | ||
308 | +{ | ||
309 | + static const char broadcast[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; | ||
310 | + bool ret = false; | ||
311 | + int i; | ||
312 | + | ||
313 | + for (i = 0; i < 16 && ret == false; i++) { | ||
314 | + if (!memcmp(&s->filter[i], addr, ETH_ALEN)) { | ||
315 | + ret = true; | ||
316 | + } | ||
317 | + } | ||
318 | + | ||
319 | + if (!memcmp(addr, broadcast, ETH_ALEN)) { | ||
320 | + return true; | ||
321 | + } | ||
322 | + | ||
323 | + if (s->csr[6] & (CSR6_PR | CSR6_RA)) { | ||
324 | + /* Promiscuous mode enabled */ | ||
325 | + s->rx_status |= RDES0_FF; | ||
326 | + return true; | ||
327 | + } | ||
328 | + | ||
329 | + if ((s->csr[6] & CSR6_PM) && (addr[0] & 1)) { | ||
330 | + /* Pass all Multicast enabled */ | ||
331 | + s->rx_status |= RDES0_MF; | ||
332 | + return true; | ||
333 | + } | ||
334 | + | ||
335 | + if (s->csr[6] & CSR6_IF) { | ||
336 | + ret ^= true; | ||
337 | + } | ||
338 | + return ret; | ||
339 | +} | ||
340 | + | ||
341 | +static ssize_t tulip_receive(TULIPState *s, const uint8_t *buf, size_t size) | ||
342 | +{ | ||
343 | + struct tulip_descriptor desc; | ||
344 | + | ||
345 | + trace_tulip_receive(buf, size); | ||
346 | + | ||
347 | + if (size < 14 || size > 2048 || s->rx_frame_len || tulip_rx_stopped(s)) { | ||
348 | + return 0; | ||
349 | + } | ||
350 | + | ||
351 | + if (!tulip_filter_address(s, buf)) { | ||
352 | + return size; | ||
353 | + } | ||
354 | + | ||
355 | + do { | ||
356 | + tulip_desc_read(s, s->current_rx_desc, &desc); | ||
357 | + tulip_dump_rx_descriptor(s, &desc); | ||
358 | + | ||
359 | + if (!(desc.status & RDES0_OWN)) { | ||
360 | + s->csr[5] |= CSR5_RU; | ||
361 | + tulip_update_int(s); | ||
362 | + return s->rx_frame_size - s->rx_frame_len; | ||
363 | + } | ||
364 | + desc.status = 0; | ||
365 | + | ||
366 | + if (!s->rx_frame_len) { | ||
367 | + s->rx_frame_size = size + 4; | ||
368 | + s->rx_status = RDES0_LS | | ||
369 | + ((s->rx_frame_size & RDES0_FL_MASK) << RDES0_FL_SHIFT); | ||
370 | + desc.status |= RDES0_FS; | ||
371 | + memcpy(s->rx_frame, buf, size); | ||
372 | + s->rx_frame_len = s->rx_frame_size; | ||
373 | + } | ||
374 | + | ||
375 | + tulip_copy_rx_bytes(s, &desc); | ||
376 | + | ||
377 | + if (!s->rx_frame_len) { | ||
378 | + desc.status |= s->rx_status; | ||
379 | + s->csr[5] |= CSR5_RI; | ||
380 | + tulip_update_int(s); | ||
381 | + } | ||
382 | + tulip_dump_rx_descriptor(s, &desc); | ||
383 | + tulip_desc_write(s, s->current_rx_desc, &desc); | ||
384 | + tulip_next_rx_descriptor(s, &desc); | ||
385 | + } while (s->rx_frame_len); | ||
386 | + return size; | ||
387 | +} | ||
388 | + | ||
389 | +static ssize_t tulip_receive_nc(NetClientState *nc, | ||
390 | + const uint8_t *buf, size_t size) | ||
391 | +{ | ||
392 | + return tulip_receive(qemu_get_nic_opaque(nc), buf, size); | ||
393 | +} | ||
394 | + | ||
395 | + | ||
396 | +static NetClientInfo net_tulip_info = { | ||
397 | + .type = NET_CLIENT_DRIVER_NIC, | ||
398 | + .size = sizeof(NICState), | ||
399 | + .receive = tulip_receive_nc, | ||
400 | +}; | ||
401 | + | ||
402 | +static const char *tulip_reg_name(const hwaddr addr) | ||
403 | +{ | ||
404 | + switch (addr) { | ||
405 | + case CSR(0): | ||
406 | + return "CSR0"; | ||
407 | + | ||
408 | + case CSR(1): | ||
409 | + return "CSR1"; | ||
410 | + | ||
411 | + case CSR(2): | ||
412 | + return "CSR2"; | ||
413 | + | ||
414 | + case CSR(3): | ||
415 | + return "CSR3"; | ||
416 | + | ||
417 | + case CSR(4): | ||
418 | + return "CSR4"; | ||
419 | + | ||
420 | + case CSR(5): | ||
421 | + return "CSR5"; | ||
422 | + | ||
423 | + case CSR(6): | ||
424 | + return "CSR6"; | ||
425 | + | ||
426 | + case CSR(7): | ||
427 | + return "CSR7"; | ||
428 | + | ||
429 | + case CSR(8): | ||
430 | + return "CSR8"; | ||
431 | + | ||
432 | + case CSR(9): | ||
433 | + return "CSR9"; | ||
434 | + | ||
435 | + case CSR(10): | ||
436 | + return "CSR10"; | ||
437 | + | ||
438 | + case CSR(11): | ||
439 | + return "CSR11"; | ||
440 | + | ||
441 | + case CSR(12): | ||
442 | + return "CSR12"; | ||
443 | + | ||
444 | + case CSR(13): | ||
445 | + return "CSR13"; | ||
446 | + | ||
447 | + case CSR(14): | ||
448 | + return "CSR14"; | ||
449 | + | ||
450 | + case CSR(15): | ||
451 | + return "CSR15"; | ||
452 | + | ||
453 | + default: | ||
454 | + break; | ||
455 | + } | ||
456 | + return ""; | ||
457 | +} | ||
458 | + | ||
459 | +static const char *tulip_rx_state_name(int state) | ||
460 | +{ | ||
461 | + switch (state) { | ||
462 | + case CSR5_RS_STOPPED: | ||
463 | + return "STOPPED"; | ||
464 | + | ||
465 | + case CSR5_RS_RUNNING_FETCH: | ||
466 | + return "RUNNING/FETCH"; | ||
467 | + | ||
468 | + case CSR5_RS_RUNNING_CHECK_EOR: | ||
469 | + return "RUNNING/CHECK EOR"; | ||
470 | + | ||
471 | + case CSR5_RS_RUNNING_WAIT_RECEIVE: | ||
472 | + return "WAIT RECEIVE"; | ||
473 | + | ||
474 | + case CSR5_RS_SUSPENDED: | ||
475 | + return "SUSPENDED"; | ||
476 | + | ||
477 | + case CSR5_RS_RUNNING_CLOSE: | ||
478 | + return "RUNNING/CLOSE"; | ||
479 | + | ||
480 | + case CSR5_RS_RUNNING_FLUSH: | ||
481 | + return "RUNNING/FLUSH"; | ||
482 | + | ||
483 | + case CSR5_RS_RUNNING_QUEUE: | ||
484 | + return "RUNNING/QUEUE"; | ||
485 | + | ||
486 | + default: | ||
487 | + break; | ||
488 | + } | ||
489 | + return ""; | ||
490 | +} | ||
491 | + | ||
492 | +static const char *tulip_tx_state_name(int state) | ||
493 | +{ | ||
494 | + switch (state) { | ||
495 | + case CSR5_TS_STOPPED: | ||
496 | + return "STOPPED"; | ||
497 | + | ||
498 | + case CSR5_TS_RUNNING_FETCH: | ||
499 | + return "RUNNING/FETCH"; | ||
500 | + | ||
501 | + case CSR5_TS_RUNNING_WAIT_EOT: | ||
502 | + return "RUNNING/WAIT EOT"; | ||
503 | + | ||
504 | + case CSR5_TS_RUNNING_READ_BUF: | ||
505 | + return "RUNNING/READ BUF"; | ||
506 | + | ||
507 | + case CSR5_TS_RUNNING_SETUP: | ||
508 | + return "RUNNING/SETUP"; | ||
509 | + | ||
510 | + case CSR5_TS_SUSPENDED: | ||
511 | + return "SUSPENDED"; | ||
512 | + | ||
513 | + case CSR5_TS_RUNNING_CLOSE: | ||
514 | + return "RUNNING/CLOSE"; | ||
515 | + | ||
516 | + default: | ||
517 | + break; | ||
518 | + } | ||
519 | + return ""; | ||
520 | +} | ||
521 | + | ||
522 | +static void tulip_update_rs(TULIPState *s, int state) | ||
523 | +{ | ||
524 | + s->csr[5] &= ~(CSR5_RS_MASK << CSR5_RS_SHIFT); | ||
525 | + s->csr[5] |= (state & CSR5_RS_MASK) << CSR5_RS_SHIFT; | ||
526 | + trace_tulip_rx_state(tulip_rx_state_name(state)); | ||
527 | +} | ||
528 | + | ||
529 | +static uint16_t tulip_mdi_default[] = { | ||
530 | + /* MDI Registers 0 - 6, 7 */ | ||
531 | + 0x3100, 0xf02c, 0x7810, 0x0000, 0x0501, 0x4181, 0x0000, 0x0000, | ||
532 | + /* MDI Registers 8 - 15 */ | ||
533 | + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||
534 | + /* MDI Registers 16 - 31 */ | ||
535 | + 0x0003, 0x0000, 0x0001, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||
536 | + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||
537 | +}; | ||
538 | + | ||
539 | +/* Mask of writable bits in the MDI (PHY) registers */ | ||
540 | +static const uint16_t tulip_mdi_mask[] = { | ||
541 | + 0x0000, 0xffff, 0xffff, 0xffff, 0xc01f, 0xffff, 0xffff, 0x0000, | ||
542 | + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||
543 | + 0x0fff, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, | ||
544 | + 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||
545 | +}; | ||
546 | + | ||
547 | +static uint16_t tulip_mii_read(TULIPState *s, int phy, int reg) | ||
548 | +{ | ||
549 | + uint16_t ret = 0; | ||
550 | + if (phy == 1) { | ||
551 | + ret = tulip_mdi_default[reg]; | ||
552 | + } | ||
553 | + trace_tulip_mii_read(phy, reg, ret); | ||
554 | + return ret; | ||
555 | +} | ||
556 | + | ||
557 | +static void tulip_mii_write(TULIPState *s, int phy, int reg, uint16_t data) | ||
558 | +{ | ||
559 | + trace_tulip_mii_write(phy, reg, data); | ||
560 | + | ||
561 | + if (phy != 1) { | ||
562 | + return; | ||
563 | + } | ||
564 | + | ||
565 | + tulip_mdi_default[reg] &= ~tulip_mdi_mask[reg]; | ||
566 | + tulip_mdi_default[reg] |= (data & tulip_mdi_mask[reg]); | ||
567 | +} | ||
568 | + | ||
569 | +static void tulip_mii(TULIPState *s) | ||
570 | +{ | ||
571 | + uint32_t changed = s->old_csr9 ^ s->csr[9]; | ||
572 | + uint16_t data; | ||
573 | + int op, phy, reg; | ||
574 | + | ||
575 | + if (!(changed & CSR9_MDC)) { | ||
576 | + return; | ||
577 | + } | ||
578 | + | ||
579 | + if (!(s->csr[9] & CSR9_MDC)) { | ||
580 | + return; | ||
581 | + } | ||
582 | + | ||
583 | + s->mii_bitcnt++; | ||
584 | + s->mii_word <<= 1; | ||
585 | + | ||
586 | + if (s->csr[9] & CSR9_MDO && (s->mii_bitcnt < 16 || | ||
587 | + !(s->csr[9] & CSR9_MII))) { | ||
588 | + /* write op or address bits */ | ||
589 | + s->mii_word |= 1; | ||
590 | + } | ||
591 | + | ||
592 | + if (s->mii_bitcnt >= 16 && (s->csr[9] & CSR9_MII)) { | ||
593 | + if (s->mii_word & 0x8000) { | ||
594 | + s->csr[9] |= CSR9_MDI; | ||
595 | + } else { | ||
596 | + s->csr[9] &= ~CSR9_MDI; | ||
597 | + } | ||
598 | + } | ||
599 | + | ||
600 | + if (s->mii_word == 0xffffffff) { | ||
601 | + s->mii_bitcnt = 0; | ||
602 | + } else if (s->mii_bitcnt == 16) { | ||
603 | + op = (s->mii_word >> 12) & 0x0f; | ||
604 | + phy = (s->mii_word >> 7) & 0x1f; | ||
605 | + reg = (s->mii_word >> 2) & 0x1f; | ||
606 | + | ||
607 | + if (op == 6) { | ||
608 | + s->mii_word = tulip_mii_read(s, phy, reg); | ||
609 | + } | ||
610 | + } else if (s->mii_bitcnt == 32) { | ||
611 | + op = (s->mii_word >> 28) & 0x0f; | ||
612 | + phy = (s->mii_word >> 23) & 0x1f; | ||
613 | + reg = (s->mii_word >> 18) & 0x1f; | ||
614 | + data = s->mii_word & 0xffff; | ||
615 | + | ||
616 | + if (op == 5) { | ||
617 | + tulip_mii_write(s, phy, reg, data); | ||
618 | + } | ||
619 | + } | ||
620 | +} | ||
621 | + | ||
622 | +static uint32_t tulip_csr9_read(TULIPState *s) | ||
623 | +{ | ||
624 | + if (s->csr[9] & CSR9_SR) { | ||
625 | + if (eeprom93xx_read(s->eeprom)) { | ||
626 | + s->csr[9] |= CSR9_SR_DO; | ||
627 | + } else { | ||
628 | + s->csr[9] &= ~CSR9_SR_DO; | ||
629 | + } | ||
630 | + } | ||
631 | + | ||
632 | + tulip_mii(s); | ||
633 | + return s->csr[9]; | ||
634 | +} | ||
635 | + | ||
636 | +static void tulip_update_ts(TULIPState *s, int state) | ||
637 | +{ | ||
638 | + s->csr[5] &= ~(CSR5_TS_MASK << CSR5_TS_SHIFT); | ||
639 | + s->csr[5] |= (state & CSR5_TS_MASK) << CSR5_TS_SHIFT; | ||
640 | + trace_tulip_tx_state(tulip_tx_state_name(state)); | ||
641 | +} | ||
642 | + | ||
643 | +static uint64_t tulip_read(void *opaque, hwaddr addr, | ||
644 | + unsigned size) | ||
645 | +{ | ||
646 | + TULIPState *s = opaque; | ||
647 | + uint64_t data = 0; | ||
648 | + | ||
649 | + switch (addr) { | ||
650 | + case CSR(9): | ||
651 | + data = tulip_csr9_read(s); | ||
652 | + break; | ||
653 | + | ||
654 | + case CSR(12): | ||
655 | + /* Fake autonegotiation complete until we have PHY emulation */ | ||
656 | + data = 5 << CSR12_ANS_SHIFT; | ||
657 | + break; | ||
658 | + | ||
659 | + default: | ||
660 | + if (addr & 7) { | ||
661 | + qemu_log_mask(LOG_GUEST_ERROR, "%s: read access at unknown address" | ||
662 | + " 0x%"PRIx64"\n", __func__, addr); | ||
663 | + } else { | ||
664 | + data = s->csr[addr >> 3]; | ||
665 | + } | ||
666 | + break; | ||
667 | + } | ||
668 | + trace_tulip_reg_read(addr, tulip_reg_name(addr), size, data); | ||
669 | + return data; | ||
670 | +} | ||
671 | + | ||
672 | +static void tulip_tx(TULIPState *s, struct tulip_descriptor *desc) | ||
673 | +{ | ||
674 | + if (s->tx_frame_len) { | ||
675 | + if ((s->csr[6] >> CSR6_OM_SHIFT) & CSR6_OM_MASK) { | ||
676 | + /* Internal or external Loopback */ | ||
677 | + tulip_receive(s, s->tx_frame, s->tx_frame_len); | ||
678 | + } else { | ||
679 | + qemu_send_packet(qemu_get_queue(s->nic), | ||
680 | + s->tx_frame, s->tx_frame_len); | ||
681 | + } | ||
682 | + } | ||
683 | + | ||
684 | + if (desc->control & TDES1_IC) { | ||
685 | + s->csr[5] |= CSR5_TI; | ||
686 | + tulip_update_int(s); | ||
687 | + } | ||
688 | +} | ||
689 | + | ||
690 | +static void tulip_copy_tx_buffers(TULIPState *s, struct tulip_descriptor *desc) | ||
691 | +{ | ||
692 | + int len1 = (desc->control >> TDES1_BUF1_SIZE_SHIFT) & TDES1_BUF1_SIZE_MASK; | ||
693 | + int len2 = (desc->control >> TDES1_BUF2_SIZE_SHIFT) & TDES1_BUF2_SIZE_MASK; | ||
694 | + | ||
695 | + if (len1) { | ||
696 | + pci_dma_read(&s->dev, desc->buf_addr1, | ||
697 | + s->tx_frame + s->tx_frame_len, len1); | ||
698 | + s->tx_frame_len += len1; | ||
699 | + } | ||
700 | + | ||
701 | + if (len2) { | ||
702 | + pci_dma_read(&s->dev, desc->buf_addr2, | ||
703 | + s->tx_frame + s->tx_frame_len, len2); | ||
704 | + s->tx_frame_len += len2; | ||
705 | + } | ||
706 | + desc->status = (len1 + len2) ? 0 : 0x7fffffff; | ||
707 | +} | ||
708 | + | ||
709 | +static void tulip_setup_filter_addr(TULIPState *s, uint8_t *buf, int n) | ||
710 | +{ | ||
711 | + int offset = n * 12; | ||
712 | + | ||
713 | + s->filter[n][0] = buf[offset]; | ||
714 | + s->filter[n][1] = buf[offset + 1]; | ||
715 | + | ||
716 | + s->filter[n][2] = buf[offset + 4]; | ||
717 | + s->filter[n][3] = buf[offset + 5]; | ||
718 | + | ||
719 | + s->filter[n][4] = buf[offset + 8]; | ||
720 | + s->filter[n][5] = buf[offset + 9]; | ||
721 | + | ||
722 | + trace_tulip_setup_filter(n, s->filter[n][5], s->filter[n][4], | ||
723 | + s->filter[n][3], s->filter[n][2], s->filter[n][1], s->filter[n][0]); | ||
724 | +} | ||
725 | + | ||
726 | +static void tulip_setup_frame(TULIPState *s, | ||
727 | + struct tulip_descriptor *desc) | ||
728 | +{ | ||
729 | + uint8_t buf[4096]; | ||
730 | + int len = (desc->control >> TDES1_BUF1_SIZE_SHIFT) & TDES1_BUF1_SIZE_MASK; | ||
731 | + int i; | ||
732 | + | ||
733 | + trace_tulip_setup_frame(); | ||
734 | + | ||
735 | + if (len == 192) { | ||
736 | + pci_dma_read(&s->dev, desc->buf_addr1, buf, len); | ||
737 | + for (i = 0; i < 16; i++) { | ||
738 | + tulip_setup_filter_addr(s, buf, i); | ||
739 | + } | ||
740 | + } | ||
741 | + | ||
742 | + desc->status = 0x7fffffff; | ||
743 | + | ||
744 | + if (desc->control & TDES1_IC) { | ||
745 | + s->csr[5] |= CSR5_TI; | ||
746 | + tulip_update_int(s); | ||
747 | + } | ||
748 | +} | ||
749 | + | ||
750 | +static void tulip_next_tx_descriptor(TULIPState *s, | ||
751 | + struct tulip_descriptor *desc) | ||
752 | +{ | ||
753 | + if (desc->control & TDES1_TER) { | ||
754 | + s->current_tx_desc = s->csr[4]; | ||
755 | + } else if (desc->control & TDES1_TCH) { | ||
756 | + s->current_tx_desc = desc->buf_addr2; | ||
757 | + } else { | ||
758 | + s->current_tx_desc += sizeof(struct tulip_descriptor) + | ||
759 | + (((s->csr[0] >> CSR0_DSL_SHIFT) & CSR0_DSL_MASK) << 2); | ||
760 | + } | ||
761 | + s->current_tx_desc &= ~3ULL; | ||
762 | +} | ||
763 | + | ||
764 | +static uint32_t tulip_ts(TULIPState *s) | ||
765 | +{ | ||
766 | + return (s->csr[5] >> CSR5_TS_SHIFT) & CSR5_TS_MASK; | ||
767 | +} | ||
768 | + | ||
769 | +static void tulip_xmit_list_update(TULIPState *s) | ||
770 | +{ | ||
771 | + struct tulip_descriptor desc; | ||
772 | + | ||
773 | + if (tulip_ts(s) != CSR5_TS_SUSPENDED) { | ||
774 | + return; | ||
775 | + } | ||
776 | + | ||
777 | + for (;;) { | ||
778 | + tulip_desc_read(s, s->current_tx_desc, &desc); | ||
779 | + tulip_dump_tx_descriptor(s, &desc); | ||
780 | + | ||
781 | + if (!(desc.status & TDES0_OWN)) { | ||
782 | + tulip_update_ts(s, CSR5_TS_SUSPENDED); | ||
783 | + s->csr[5] |= CSR5_TU; | ||
784 | + tulip_update_int(s); | ||
785 | + return; | ||
786 | + } | ||
787 | + | ||
788 | + if (desc.control & TDES1_SET) { | ||
789 | + tulip_setup_frame(s, &desc); | ||
790 | + } else { | ||
791 | + if (desc.control & TDES1_FS) { | ||
792 | + s->tx_frame_len = 0; | ||
793 | + } | ||
794 | + | ||
795 | + tulip_copy_tx_buffers(s, &desc); | ||
796 | + | ||
797 | + if (desc.control & TDES1_LS) { | ||
798 | + tulip_tx(s, &desc); | ||
799 | + } | ||
800 | + } | ||
801 | + tulip_desc_write(s, s->current_tx_desc, &desc); | ||
802 | + tulip_next_tx_descriptor(s, &desc); | ||
803 | + } | ||
804 | +} | ||
805 | + | ||
806 | +static void tulip_csr9_write(TULIPState *s, uint32_t old_val, | ||
807 | + uint32_t new_val) | ||
808 | +{ | ||
809 | + if (new_val & CSR9_SR) { | ||
810 | + eeprom93xx_write(s->eeprom, | ||
811 | + !!(new_val & CSR9_SR_CS), | ||
812 | + !!(new_val & CSR9_SR_SK), | ||
813 | + !!(new_val & CSR9_SR_DI)); | ||
814 | + } | ||
815 | +} | ||
816 | + | ||
817 | +static void tulip_reset(TULIPState *s) | ||
818 | +{ | ||
819 | + trace_tulip_reset(); | ||
820 | + | ||
821 | + s->csr[0] = 0xfe000000; | ||
822 | + s->csr[1] = 0xffffffff; | ||
823 | + s->csr[2] = 0xffffffff; | ||
824 | + s->csr[5] = 0xf0000000; | ||
825 | + s->csr[6] = 0x32000040; | ||
826 | + s->csr[7] = 0xf3fe0000; | ||
827 | + s->csr[8] = 0xe0000000; | ||
828 | + s->csr[9] = 0xfff483ff; | ||
829 | + s->csr[11] = 0xfffe0000; | ||
830 | + s->csr[12] = 0x000000c6; | ||
831 | + s->csr[13] = 0xffff0000; | ||
832 | + s->csr[14] = 0xffffffff; | ||
833 | + s->csr[15] = 0x8ff00000; | ||
834 | +} | ||
835 | + | ||
836 | +static void tulip_qdev_reset(DeviceState *dev) | ||
837 | +{ | ||
838 | + PCIDevice *d = PCI_DEVICE(dev); | ||
839 | + TULIPState *s = TULIP(d); | ||
840 | + | ||
841 | + tulip_reset(s); | ||
842 | +} | ||
843 | + | ||
844 | +static void tulip_write(void *opaque, hwaddr addr, | ||
845 | + uint64_t data, unsigned size) | ||
846 | +{ | ||
847 | + TULIPState *s = opaque; | ||
848 | + trace_tulip_reg_write(addr, tulip_reg_name(addr), size, data); | ||
849 | + | ||
850 | + switch (addr) { | ||
851 | + case CSR(0): | ||
852 | + s->csr[0] = data; | ||
853 | + if (data & CSR0_SWR) { | ||
854 | + tulip_reset(s); | ||
855 | + tulip_update_int(s); | ||
856 | + } | ||
857 | + break; | ||
858 | + | ||
859 | + case CSR(1): | ||
860 | + tulip_xmit_list_update(s); | ||
861 | + break; | ||
862 | + | ||
863 | + case CSR(2): | ||
864 | + qemu_flush_queued_packets(qemu_get_queue(s->nic)); | ||
865 | + break; | ||
866 | + | ||
867 | + case CSR(3): | ||
868 | + s->csr[3] = data & ~3ULL; | ||
869 | + s->current_rx_desc = s->csr[3]; | ||
870 | + qemu_flush_queued_packets(qemu_get_queue(s->nic)); | ||
871 | + break; | ||
872 | + | ||
873 | + case CSR(4): | ||
874 | + s->csr[4] = data & ~3ULL; | ||
875 | + s->current_tx_desc = s->csr[4]; | ||
876 | + tulip_xmit_list_update(s); | ||
877 | + break; | ||
878 | + | ||
879 | + case CSR(5): | ||
880 | + /* Status register; writing 1 clears the corresponding bits */ | ||
881 | + s->csr[5] &= ~(data & (CSR5_TI | CSR5_TPS | CSR5_TU | CSR5_TJT | | ||
882 | + CSR5_LNP_ANC | CSR5_UNF | CSR5_RI | CSR5_RU | | ||
883 | + CSR5_RPS | CSR5_RWT | CSR5_ETI | CSR5_GTE | | ||
884 | + CSR5_LNF | CSR5_FBE | CSR5_ERI | CSR5_AIS | | ||
885 | + CSR5_NIS | CSR5_GPI | CSR5_LC)); | ||
886 | + tulip_update_int(s); | ||
887 | + break; | ||
888 | + | ||
889 | + case CSR(6): | ||
890 | + s->csr[6] = data; | ||
891 | + if (s->csr[6] & CSR6_SR) { | ||
892 | + tulip_update_rs(s, CSR5_RS_RUNNING_WAIT_RECEIVE); | ||
893 | + qemu_flush_queued_packets(qemu_get_queue(s->nic)); | ||
894 | + } else { | ||
895 | + tulip_update_rs(s, CSR5_RS_STOPPED); | ||
896 | + } | ||
897 | + | ||
898 | + if (s->csr[6] & CSR6_ST) { | ||
899 | + tulip_update_ts(s, CSR5_TS_SUSPENDED); | ||
900 | + tulip_xmit_list_update(s); | ||
901 | + } else { | ||
902 | + tulip_update_ts(s, CSR5_TS_STOPPED); | ||
903 | + } | ||
904 | + break; | ||
905 | + | ||
906 | + case CSR(7): | ||
907 | + s->csr[7] = data; | ||
908 | + tulip_update_int(s); | ||
909 | + break; | ||
910 | + | ||
911 | + case CSR(8): | ||
912 | + s->csr[8] = data; | ||
913 | + break; | ||
914 | + | ||
915 | + case CSR(9): | ||
916 | + tulip_csr9_write(s, s->csr[9], data); | ||
917 | + /* don't clear MII read data */ | ||
918 | + s->csr[9] &= CSR9_MDI; | ||
919 | + s->csr[9] |= (data & ~CSR9_MDI); | ||
920 | + tulip_mii(s); | ||
921 | + s->old_csr9 = s->csr[9]; | ||
922 | + break; | ||
923 | + | ||
924 | + case CSR(10): | ||
925 | + s->csr[10] = data; | ||
926 | + break; | ||
927 | + | ||
928 | + case CSR(11): | ||
929 | + s->csr[11] = data; | ||
930 | + break; | ||
931 | + | ||
932 | + case CSR(12): | ||
933 | + /* SIA Status register, some bits are cleared by writing 1 */ | ||
934 | + s->csr[12] &= ~(data & (CSR12_MRA | CSR12_TRA | CSR12_ARA)); | ||
935 | + break; | ||
936 | + | ||
937 | + case CSR(13): | ||
938 | + s->csr[13] = data; | ||
939 | + break; | ||
940 | + | ||
941 | + case CSR(14): | ||
942 | + s->csr[14] = data; | ||
943 | + break; | ||
944 | + | ||
945 | + case CSR(15): | ||
946 | + s->csr[15] = data; | ||
947 | + break; | ||
948 | + | ||
949 | + default: | ||
950 | + qemu_log_mask(LOG_GUEST_ERROR, "%s: write to CSR at unknown address " | ||
951 | + "0x%"PRIx64"\n", __func__, addr); | ||
952 | + break; | ||
953 | + } | ||
954 | +} | ||
955 | + | ||
956 | +static const MemoryRegionOps tulip_ops = { | ||
957 | + .read = tulip_read, | ||
958 | + .write = tulip_write, | ||
959 | + .endianness = DEVICE_LITTLE_ENDIAN, | ||
960 | + .impl = { | ||
961 | + .min_access_size = 4, | ||
962 | + .max_access_size = 4, | ||
963 | + }, | ||
964 | +}; | ||
965 | + | ||
966 | +static void tulip_idblock_crc(TULIPState *s, uint16_t *srom) | ||
967 | +{ | ||
968 | + int word, n; | ||
969 | + int bit; | ||
970 | + unsigned char bitval, crc; | ||
971 | + const int len = 9; | ||
972 | + n = 0; | ||
973 | + crc = -1; | ||
974 | + | ||
975 | + for (word = 0; word < len; word++) { | ||
976 | + for (bit = 15; bit >= 0; bit--) { | ||
977 | + if ((word == (len - 1)) && (bit == 7)) { | ||
978 | + /* | ||
979 | + * Insert the correct CRC result into input data stream | ||
980 | + * in place. | ||
981 | + */ | ||
982 | + srom[len - 1] = (srom[len - 1] & 0xff00) | (unsigned short)crc; | ||
983 | + break; | ||
984 | + } | ||
985 | + n++; | ||
986 | + bitval = ((srom[word] >> bit) & 1) ^ ((crc >> 7) & 1); | ||
987 | + crc = crc << 1; | ||
988 | + if (bitval == 1) { | ||
989 | + crc ^= 6; | ||
990 | + crc |= 0x01; | ||
991 | + } | ||
992 | + } | ||
993 | + } | ||
994 | +} | ||
995 | + | ||
996 | +static uint16_t tulip_srom_crc(TULIPState *s, uint8_t *eeprom, size_t len) | ||
997 | +{ | ||
998 | + unsigned long crc = 0xffffffff; | ||
999 | + unsigned long flippedcrc = 0; | ||
1000 | + unsigned char currentbyte; | ||
1001 | + unsigned int msb, bit, i; | ||
1002 | + | ||
1003 | + for (i = 0; i < len; i++) { | ||
1004 | + currentbyte = eeprom[i]; | ||
1005 | + for (bit = 0; bit < 8; bit++) { | ||
1006 | + msb = (crc >> 31) & 1; | ||
1007 | + crc <<= 1; | ||
1008 | + if (msb ^ (currentbyte & 1)) { | ||
1009 | + crc ^= 0x04c11db6; | ||
1010 | + crc |= 0x00000001; | ||
1011 | + } | ||
1012 | + currentbyte >>= 1; | ||
1013 | + } | ||
1014 | + } | ||
1015 | + | ||
1016 | + for (i = 0; i < 32; i++) { | ||
1017 | + flippedcrc <<= 1; | ||
1018 | + bit = crc & 1; | ||
1019 | + crc >>= 1; | ||
1020 | + flippedcrc += bit; | ||
1021 | + } | ||
1022 | + return (flippedcrc ^ 0xffffffff) & 0xffff; | ||
1023 | +} | ||
1024 | + | ||
1025 | +static const uint8_t eeprom_default[128] = { | ||
1026 | + 0x3c, 0x10, 0x4f, 0x10, 0x00, 0x00, 0x00, 0x00, | ||
1027 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
1028 | + 0x56, 0x08, 0x04, 0x01, 0x00, 0x80, 0x48, 0xb3, | ||
1029 | + 0x0e, 0xa7, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x08, | ||
1030 | + 0x01, 0x8d, 0x03, 0x00, 0x00, 0x00, 0x00, 0x78, | ||
1031 | + 0xe0, 0x01, 0x00, 0x50, 0x00, 0x18, 0x00, 0x00, | ||
1032 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
1033 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
1034 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
1035 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
1036 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
1037 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0x6b, | ||
1038 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, | ||
1039 | + 0x48, 0xb3, 0x0e, 0xa7, 0x40, 0x00, 0x00, 0x00, | ||
1040 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
1041 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
1042 | +}; | ||
1043 | + | ||
1044 | +static void tulip_fill_eeprom(TULIPState *s) | ||
1045 | +{ | ||
1046 | + uint16_t *eeprom = eeprom93xx_data(s->eeprom); | ||
1047 | + memcpy(eeprom, eeprom_default, 128); | ||
1048 | + | ||
1049 | + /* patch in our mac address */ | ||
1050 | + eeprom[10] = cpu_to_le16(s->c.macaddr.a[0] | (s->c.macaddr.a[1] << 8)); | ||
1051 | + eeprom[11] = cpu_to_le16(s->c.macaddr.a[2] | (s->c.macaddr.a[3] << 8)); | ||
1052 | + eeprom[12] = cpu_to_le16(s->c.macaddr.a[4] | (s->c.macaddr.a[5] << 8)); | ||
1053 | + tulip_idblock_crc(s, eeprom); | ||
1054 | + eeprom[63] = cpu_to_le16(tulip_srom_crc(s, (uint8_t *)eeprom, 126)); | ||
1055 | +} | ||
1056 | + | ||
1057 | +static void pci_tulip_realize(PCIDevice *pci_dev, Error **errp) | ||
1058 | +{ | ||
1059 | + TULIPState *s = DO_UPCAST(TULIPState, dev, pci_dev); | ||
1060 | + uint8_t *pci_conf; | ||
1061 | + | ||
1062 | + pci_conf = s->dev.config; | ||
1063 | + pci_conf[PCI_INTERRUPT_PIN] = 1; /* interrupt pin A */ | ||
1064 | + | ||
1065 | + s->eeprom = eeprom93xx_new(&pci_dev->qdev, 64); | ||
1066 | + tulip_fill_eeprom(s); | ||
1067 | + | ||
1068 | + memory_region_init_io(&s->io, OBJECT(&s->dev), &tulip_ops, s, | ||
1069 | + "tulip-io", 128); | ||
1070 | + | ||
1071 | + memory_region_init_io(&s->memory, OBJECT(&s->dev), &tulip_ops, s, | ||
1072 | + "tulip-mem", 128); | ||
1073 | + | ||
1074 | + pci_register_bar(&s->dev, 0, PCI_BASE_ADDRESS_SPACE_IO, &s->io); | ||
1075 | + pci_register_bar(&s->dev, 1, PCI_BASE_ADDRESS_SPACE_MEMORY, &s->memory); | ||
1076 | + | ||
1077 | + s->irq = pci_allocate_irq(&s->dev); | ||
1078 | + | ||
1079 | + qemu_macaddr_default_if_unset(&s->c.macaddr); | ||
1080 | + | ||
1081 | + s->nic = qemu_new_nic(&net_tulip_info, &s->c, | ||
1082 | + object_get_typename(OBJECT(pci_dev)), | ||
1083 | + pci_dev->qdev.id, s); | ||
1084 | + qemu_format_nic_info_str(qemu_get_queue(s->nic), s->c.macaddr.a); | ||
1085 | +} | ||
1086 | + | ||
1087 | +static void pci_tulip_exit(PCIDevice *pci_dev) | ||
1088 | +{ | ||
1089 | + TULIPState *s = DO_UPCAST(TULIPState, dev, pci_dev); | ||
1090 | + | ||
1091 | + qemu_del_nic(s->nic); | ||
1092 | + qemu_free_irq(s->irq); | ||
1093 | + eeprom93xx_free(&pci_dev->qdev, s->eeprom); | ||
1094 | +} | ||
1095 | + | ||
1096 | +static void tulip_instance_init(Object *obj) | ||
1097 | +{ | ||
1098 | + PCIDevice *pci_dev = PCI_DEVICE(obj); | ||
1099 | + TULIPState *d = DO_UPCAST(TULIPState, dev, pci_dev); | ||
1100 | + | ||
1101 | + device_add_bootindex_property(obj, &d->c.bootindex, | ||
1102 | + "bootindex", "/ethernet-phy@0", | ||
1103 | + &pci_dev->qdev, NULL); | ||
1104 | +} | ||
1105 | + | ||
1106 | +static Property tulip_properties[] = { | ||
1107 | + DEFINE_NIC_PROPERTIES(TULIPState, c), | ||
1108 | + DEFINE_PROP_END_OF_LIST(), | ||
1109 | +}; | ||
1110 | + | ||
1111 | +static void tulip_class_init(ObjectClass *klass, void *data) | ||
1112 | +{ | ||
1113 | + DeviceClass *dc = DEVICE_CLASS(klass); | ||
1114 | + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); | ||
1115 | + | ||
1116 | + k->realize = pci_tulip_realize; | ||
1117 | + k->exit = pci_tulip_exit; | ||
1118 | + k->vendor_id = PCI_VENDOR_ID_DEC; | ||
1119 | + k->device_id = PCI_DEVICE_ID_DEC_21143; | ||
1120 | + k->subsystem_vendor_id = 0x103c; | ||
1121 | + k->subsystem_id = 0x104f; | ||
1122 | + k->class_id = PCI_CLASS_NETWORK_ETHERNET; | ||
1123 | + dc->vmsd = &vmstate_pci_tulip; | ||
1124 | + dc->props = tulip_properties; | ||
1125 | + dc->reset = tulip_qdev_reset; | ||
1126 | + set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); | ||
1127 | +} | ||
1128 | + | ||
1129 | +static const TypeInfo tulip_info = { | ||
1130 | + .name = TYPE_TULIP, | ||
1131 | + .parent = TYPE_PCI_DEVICE, | ||
1132 | + .instance_size = sizeof(TULIPState), | ||
1133 | + .class_init = tulip_class_init, | ||
1134 | + .instance_init = tulip_instance_init, | ||
1135 | + .interfaces = (InterfaceInfo[]) { | ||
1136 | + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, | ||
1137 | + { }, | ||
1138 | + }, | ||
1139 | +}; | ||
1140 | + | ||
1141 | +static void tulip_register_types(void) | ||
1142 | +{ | ||
1143 | + type_register_static(&tulip_info); | ||
1144 | +} | ||
1145 | + | ||
1146 | +type_init(tulip_register_types) | ||
1147 | diff --git a/hw/net/tulip.h b/hw/net/tulip.h | ||
1148 | new file mode 100644 | 159 | new file mode 100644 |
1149 | index XXXXXXX..XXXXXXX | 160 | index XXXXXXX..XXXXXXX |
1150 | --- /dev/null | 161 | --- /dev/null |
1151 | +++ b/hw/net/tulip.h | 162 | +++ b/hw/virtio/vhost-iova-tree.h |
1152 | @@ -XXX,XX +XXX,XX @@ | 163 | @@ -XXX,XX +XXX,XX @@ |
1153 | +#ifndef HW_TULIP_H | 164 | +/* |
1154 | +#define HW_TULIP_H | 165 | + * vhost software live migration iova tree |
166 | + * | ||
167 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 | ||
168 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
169 | + * | ||
170 | + * SPDX-License-Identifier: GPL-2.0-or-later | ||
171 | + */ | ||
1155 | + | 172 | + |
1156 | +#include "qemu/units.h" | 173 | +#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H |
1157 | +#include "net/net.h" | 174 | +#define HW_VIRTIO_VHOST_IOVA_TREE_H |
1158 | + | 175 | + |
1159 | +#define TYPE_TULIP "tulip" | 176 | +#include "qemu/iova-tree.h" |
1160 | +#define TULIP(obj) OBJECT_CHECK(TULIPState, (obj), TYPE_TULIP) | 177 | +#include "exec/memory.h" |
1161 | + | 178 | + |
1162 | +#define CSR(_x) ((_x) << 3) | 179 | +typedef struct VhostIOVATree VhostIOVATree; |
1163 | + | 180 | + |
1164 | +#define CSR0_SWR BIT(0) | 181 | +VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last); |
1165 | +#define CSR0_BAR BIT(1) | 182 | +void vhost_iova_tree_delete(VhostIOVATree *iova_tree); |
1166 | +#define CSR0_DSL_SHIFT 2 | 183 | +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete); |
1167 | +#define CSR0_DSL_MASK 0x1f | ||
1168 | +#define CSR0_BLE BIT(7) | ||
1169 | +#define CSR0_PBL_SHIFT 8 | ||
1170 | +#define CSR0_PBL_MASK 0x3f | ||
1171 | +#define CSR0_CAC_SHIFT 14 | ||
1172 | +#define CSR0_CAC_MASK 0x3 | ||
1173 | +#define CSR0_DAS 0x10000 | ||
1174 | +#define CSR0_TAP_SHIFT 17 | ||
1175 | +#define CSR0_TAP_MASK 0x7 | ||
1176 | +#define CSR0_DBO 0x100000 | ||
1177 | +#define CSR1_TPD 0x01 | ||
1178 | +#define CSR0_RLE BIT(23) | ||
1179 | +#define CSR0_WIE BIT(24) | ||
1180 | + | 184 | + |
1181 | +#define CSR2_RPD 0x01 | 185 | +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree, |
1182 | + | 186 | + const DMAMap *map); |
1183 | +#define CSR5_TI BIT(0) | 187 | +int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map); |
1184 | +#define CSR5_TPS BIT(1) | 188 | +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map); |
1185 | +#define CSR5_TU BIT(2) | ||
1186 | +#define CSR5_TJT BIT(3) | ||
1187 | +#define CSR5_LNP_ANC BIT(4) | ||
1188 | +#define CSR5_UNF BIT(5) | ||
1189 | +#define CSR5_RI BIT(6) | ||
1190 | +#define CSR5_RU BIT(7) | ||
1191 | +#define CSR5_RPS BIT(8) | ||
1192 | +#define CSR5_RWT BIT(9) | ||
1193 | +#define CSR5_ETI BIT(10) | ||
1194 | +#define CSR5_GTE BIT(11) | ||
1195 | +#define CSR5_LNF BIT(12) | ||
1196 | +#define CSR5_FBE BIT(13) | ||
1197 | +#define CSR5_ERI BIT(14) | ||
1198 | +#define CSR5_AIS BIT(15) | ||
1199 | +#define CSR5_NIS BIT(16) | ||
1200 | +#define CSR5_RS_SHIFT 17 | ||
1201 | +#define CSR5_RS_MASK 7 | ||
1202 | +#define CSR5_TS_SHIFT 20 | ||
1203 | +#define CSR5_TS_MASK 7 | ||
1204 | + | ||
1205 | +#define CSR5_TS_STOPPED 0 | ||
1206 | +#define CSR5_TS_RUNNING_FETCH 1 | ||
1207 | +#define CSR5_TS_RUNNING_WAIT_EOT 2 | ||
1208 | +#define CSR5_TS_RUNNING_READ_BUF 3 | ||
1209 | +#define CSR5_TS_RUNNING_SETUP 5 | ||
1210 | +#define CSR5_TS_SUSPENDED 6 | ||
1211 | +#define CSR5_TS_RUNNING_CLOSE 7 | ||
1212 | + | ||
1213 | +#define CSR5_RS_STOPPED 0 | ||
1214 | +#define CSR5_RS_RUNNING_FETCH 1 | ||
1215 | +#define CSR5_RS_RUNNING_CHECK_EOR 2 | ||
1216 | +#define CSR5_RS_RUNNING_WAIT_RECEIVE 3 | ||
1217 | +#define CSR5_RS_SUSPENDED 4 | ||
1218 | +#define CSR5_RS_RUNNING_CLOSE 5 | ||
1219 | +#define CSR5_RS_RUNNING_FLUSH 6 | ||
1220 | +#define CSR5_RS_RUNNING_QUEUE 7 | ||
1221 | + | ||
1222 | +#define CSR5_EB_SHIFT 23 | ||
1223 | +#define CSR5_EB_MASK 7 | ||
1224 | + | ||
1225 | +#define CSR5_GPI BIT(26) | ||
1226 | +#define CSR5_LC BIT(27) | ||
1227 | + | ||
1228 | +#define CSR6_HP BIT(0) | ||
1229 | +#define CSR6_SR BIT(1) | ||
1230 | +#define CSR6_HO BIT(2) | ||
1231 | +#define CSR6_PB BIT(3) | ||
1232 | +#define CSR6_IF BIT(4) | ||
1233 | +#define CSR6_SB BIT(5) | ||
1234 | +#define CSR6_PR BIT(6) | ||
1235 | +#define CSR6_PM BIT(7) | ||
1236 | +#define CSR6_FKD BIT(8) | ||
1237 | +#define CSR6_FD BIT(9) | ||
1238 | + | ||
1239 | +#define CSR6_OM_SHIFT 10 | ||
1240 | +#define CSR6_OM_MASK 3 | ||
1241 | +#define CSR6_OM_NORMAL 0 | ||
1242 | +#define CSR6_OM_INT_LOOPBACK 1 | ||
1243 | +#define CSR6_OM_EXT_LOOPBACK 2 | ||
1244 | + | ||
1245 | +#define CSR6_FC BIT(12) | ||
1246 | +#define CSR6_ST BIT(13) | ||
1247 | + | ||
1248 | + | ||
1249 | +#define CSR6_TR_SHIFT 14 | ||
1250 | +#define CSR6_TR_MASK 3 | ||
1251 | +#define CSR6_TR_72 0 | ||
1252 | +#define CSR6_TR_96 1 | ||
1253 | +#define CSR6_TR_128 2 | ||
1254 | +#define CSR6_TR_160 3 | ||
1255 | + | ||
1256 | +#define CSR6_CA BIT(17) | ||
1257 | +#define CSR6_RA BIT(30) | ||
1258 | +#define CSR6_SC BIT(31) | ||
1259 | + | ||
1260 | +#define CSR7_TIM BIT(0) | ||
1261 | +#define CSR7_TSM BIT(1) | ||
1262 | +#define CSR7_TUM BIT(2) | ||
1263 | +#define CSR7_TJM BIT(3) | ||
1264 | +#define CSR7_LPM BIT(4) | ||
1265 | +#define CSR7_UNM BIT(5) | ||
1266 | +#define CSR7_RIM BIT(6) | ||
1267 | +#define CSR7_RUM BIT(7) | ||
1268 | +#define CSR7_RSM BIT(8) | ||
1269 | +#define CSR7_RWM BIT(9) | ||
1270 | +#define CSR7_TMM BIT(11) | ||
1271 | +#define CSR7_LFM BIT(12) | ||
1272 | +#define CSR7_SEM BIT(13) | ||
1273 | +#define CSR7_ERM BIT(14) | ||
1274 | +#define CSR7_AIM BIT(15) | ||
1275 | +#define CSR7_NIM BIT(16) | ||
1276 | + | ||
1277 | +#define CSR8_MISSED_FRAME_OVL BIT(16) | ||
1278 | +#define CSR8_MISSED_FRAME_CNT_MASK 0xffff | ||
1279 | + | ||
1280 | +#define CSR9_DATA_MASK 0xff | ||
1281 | +#define CSR9_SR_CS BIT(0) | ||
1282 | +#define CSR9_SR_SK BIT(1) | ||
1283 | +#define CSR9_SR_DI BIT(2) | ||
1284 | +#define CSR9_SR_DO BIT(3) | ||
1285 | +#define CSR9_REG BIT(10) | ||
1286 | +#define CSR9_SR BIT(11) | ||
1287 | +#define CSR9_BR BIT(12) | ||
1288 | +#define CSR9_WR BIT(13) | ||
1289 | +#define CSR9_RD BIT(14) | ||
1290 | +#define CSR9_MOD BIT(15) | ||
1291 | +#define CSR9_MDC BIT(16) | ||
1292 | +#define CSR9_MDO BIT(17) | ||
1293 | +#define CSR9_MII BIT(18) | ||
1294 | +#define CSR9_MDI BIT(19) | ||
1295 | + | ||
1296 | +#define CSR11_CON BIT(16) | ||
1297 | +#define CSR11_TIMER_MASK 0xffff | ||
1298 | + | ||
1299 | +#define CSR12_MRA BIT(0) | ||
1300 | +#define CSR12_LS100 BIT(1) | ||
1301 | +#define CSR12_LS10 BIT(2) | ||
1302 | +#define CSR12_APS BIT(3) | ||
1303 | +#define CSR12_ARA BIT(8) | ||
1304 | +#define CSR12_TRA BIT(9) | ||
1305 | +#define CSR12_NSN BIT(10) | ||
1306 | +#define CSR12_TRF BIT(11) | ||
1307 | +#define CSR12_ANS_SHIFT 12 | ||
1308 | +#define CSR12_ANS_MASK 7 | ||
1309 | +#define CSR12_LPN BIT(15) | ||
1310 | +#define CSR12_LPC_SHIFT 16 | ||
1311 | +#define CSR12_LPC_MASK 0xffff | ||
1312 | + | ||
1313 | +#define CSR13_SRL BIT(0) | ||
1314 | +#define CSR13_CAC BIT(2) | ||
1315 | +#define CSR13_AUI BIT(3) | ||
1316 | +#define CSR13_SDM_SHIFT 4 | ||
1317 | +#define CSR13_SDM_MASK 0xfff | ||
1318 | + | ||
1319 | +#define CSR14_ECEN BIT(0) | ||
1320 | +#define CSR14_LBK BIT(1) | ||
1321 | +#define CSR14_DREN BIT(2) | ||
1322 | +#define CSR14_LSE BIT(3) | ||
1323 | +#define CSR14_CPEN_SHIFT 4 | ||
1324 | +#define CSR14_CPEN_MASK 3 | ||
1325 | +#define CSR14_MBO BIT(6) | ||
1326 | +#define CSR14_ANE BIT(7) | ||
1327 | +#define CSR14_RSQ BIT(8) | ||
1328 | +#define CSR14_CSQ BIT(9) | ||
1329 | +#define CSR14_CLD BIT(10) | ||
1330 | +#define CSR14_SQE BIT(11) | ||
1331 | +#define CSR14_LTE BIT(12) | ||
1332 | +#define CSR14_APE BIT(13) | ||
1333 | +#define CSR14_SPP BIT(14) | ||
1334 | +#define CSR14_TAS BIT(15) | ||
1335 | + | ||
1336 | +#define CSR15_JBD BIT(0) | ||
1337 | +#define CSR15_HUJ BIT(1) | ||
1338 | +#define CSR15_JCK BIT(2) | ||
1339 | +#define CSR15_ABM BIT(3) | ||
1340 | +#define CSR15_RWD BIT(4) | ||
1341 | +#define CSR15_RWR BIT(5) | ||
1342 | +#define CSR15_LE1 BIT(6) | ||
1343 | +#define CSR15_LV1 BIT(7) | ||
1344 | +#define CSR15_TSCK BIT(8) | ||
1345 | +#define CSR15_FUSQ BIT(9) | ||
1346 | +#define CSR15_FLF BIT(10) | ||
1347 | +#define CSR15_LSD BIT(11) | ||
1348 | +#define CSR15_DPST BIT(12) | ||
1349 | +#define CSR15_FRL BIT(13) | ||
1350 | +#define CSR15_LE2 BIT(14) | ||
1351 | +#define CSR15_LV2 BIT(15) | ||
1352 | + | ||
1353 | +#define RDES0_OF BIT(0) | ||
1354 | +#define RDES0_CE BIT(1) | ||
1355 | +#define RDES0_DB BIT(2) | ||
1356 | +#define RDES0_RJ BIT(4) | ||
1357 | +#define RDES0_FT BIT(5) | ||
1358 | +#define RDES0_CS BIT(6) | ||
1359 | +#define RDES0_TL BIT(7) | ||
1360 | +#define RDES0_LS BIT(8) | ||
1361 | +#define RDES0_FS BIT(9) | ||
1362 | +#define RDES0_MF BIT(10) | ||
1363 | +#define RDES0_RF BIT(11) | ||
1364 | +#define RDES0_DT_SHIFT 12 | ||
1365 | +#define RDES0_DT_MASK 3 | ||
1366 | +#define RDES0_LE BIT(14) | ||
1367 | +#define RDES0_ES BIT(15) | ||
1368 | +#define RDES0_FL_SHIFT 16 | ||
1369 | +#define RDES0_FL_MASK 0x3fff | ||
1370 | +#define RDES0_FF BIT(30) | ||
1371 | +#define RDES0_OWN BIT(31) | ||
1372 | + | ||
1373 | +#define RDES1_BUF1_SIZE_SHIFT 0 | ||
1374 | +#define RDES1_BUF1_SIZE_MASK 0x7ff | ||
1375 | + | ||
1376 | +#define RDES1_BUF2_SIZE_SHIFT 11 | ||
1377 | +#define RDES1_BUF2_SIZE_MASK 0x7ff | ||
1378 | +#define RDES1_RCH BIT(24) | ||
1379 | +#define RDES1_RER BIT(25) | ||
1380 | + | ||
1381 | +#define TDES0_DE BIT(0) | ||
1382 | +#define TDES0_UF BIT(1) | ||
1383 | +#define TDES0_LF BIT(2) | ||
1384 | +#define TDES0_CC_SHIFT 3 | ||
1385 | +#define TDES0_CC_MASK 0xf | ||
1386 | +#define TDES0_HF BIT(7) | ||
1387 | +#define TDES0_EC BIT(8) | ||
1388 | +#define TDES0_LC BIT(9) | ||
1389 | +#define TDES0_NC BIT(10) | ||
1390 | +#define TDES0_LO BIT(11) | ||
1391 | +#define TDES0_TO BIT(14) | ||
1392 | +#define TDES0_ES BIT(15) | ||
1393 | +#define TDES0_OWN BIT(31) | ||
1394 | + | ||
1395 | +#define TDES1_BUF1_SIZE_SHIFT 0 | ||
1396 | +#define TDES1_BUF1_SIZE_MASK 0x7ff | ||
1397 | + | ||
1398 | +#define TDES1_BUF2_SIZE_SHIFT 11 | ||
1399 | +#define TDES1_BUF2_SIZE_MASK 0x7ff | ||
1400 | + | ||
1401 | +#define TDES1_FT0 BIT(22) | ||
1402 | +#define TDES1_DPD BIT(23) | ||
1403 | +#define TDES1_TCH BIT(24) | ||
1404 | +#define TDES1_TER BIT(25) | ||
1405 | +#define TDES1_AC BIT(26) | ||
1406 | +#define TDES1_SET BIT(27) | ||
1407 | +#define TDES1_FT1 BIT(28) | ||
1408 | +#define TDES1_FS BIT(29) | ||
1409 | +#define TDES1_LS BIT(30) | ||
1410 | +#define TDES1_IC BIT(31) | ||
1411 | + | ||
1412 | +struct tulip_descriptor { | ||
1413 | + uint32_t status; | ||
1414 | + uint32_t control; | ||
1415 | + uint32_t buf_addr1; | ||
1416 | + uint32_t buf_addr2; | ||
1417 | +}; | ||
1418 | + | 189 | + |
1419 | +#endif | 190 | +#endif |
1420 | diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h | ||
1421 | index XXXXXXX..XXXXXXX 100644 | ||
1422 | --- a/include/hw/pci/pci_ids.h | ||
1423 | +++ b/include/hw/pci/pci_ids.h | ||
1424 | @@ -XXX,XX +XXX,XX @@ | ||
1425 | #define PCI_DEVICE_ID_LSI_SAS0079 0x0079 | ||
1426 | |||
1427 | #define PCI_VENDOR_ID_DEC 0x1011 | ||
1428 | +#define PCI_DEVICE_ID_DEC_21143 0x0019 | ||
1429 | #define PCI_DEVICE_ID_DEC_21154 0x0026 | ||
1430 | |||
1431 | #define PCI_VENDOR_ID_CIRRUS 0x1013 | ||
1432 | -- | 191 | -- |
1433 | 2.5.0 | 192 | 2.7.4 |
1434 | 193 | ||
1435 | 194 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | 1 | From: Eugenio Pérez <eperezma@redhat.com> | |
2 | |||
3 | Use translations added in VhostIOVATree in SVQ. | ||
4 | |||
5 | Only introduce usage here, not allocation and deallocation. As with | ||
6 | previous patches, we use the dead code paths of shadow_vqs_enabled to | ||
7 | avoid committing too many changes at once. These paths are still | ||
8 | unreachable at the moment. | ||
9 | |||
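
As a rough sketch of the translation being introduced here (illustrative only: the Map type and translate() helper below are invented stand-ins for the real DMAMap and vhost_iova_tree_find_iova()), the idea is to find the mapping that contains a qemu virtual address and rebase the buffer onto that mapping's IOVA, refusing buffers that spill past the mapped range:

    #include <stdint.h>
    #include <stdbool.h>

    typedef struct {
        uint64_t iova;            /* address the device will see */
        uint64_t translated_addr; /* qemu virtual address of the mapping */
        uint64_t size;            /* mapping size minus one, DMAMap-style */
    } Map;

    /* Rebase [vaddr, vaddr + len) onto map->iova; false if it does not fit. */
    static bool translate(const Map *map, uint64_t vaddr, uint64_t len,
                          uint64_t *iova)
    {
        uint64_t off;

        if (vaddr < map->translated_addr) {
            return false;
        }
        off = vaddr - map->translated_addr;
        if (off + len > map->size + 1) {
            /* buffer expands over the mapped range */
            return false;
        }
        *iova = map->iova + off;
        return true;
    }
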
10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
11 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
12 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
13 | --- | ||
14 | hw/virtio/vhost-shadow-virtqueue.c | 75 +++++++++++++++++++++-- | ||
15 | hw/virtio/vhost-shadow-virtqueue.h | 6 +- | ||
16 | hw/virtio/vhost-vdpa.c | 122 +++++++++++++++++++++++++++++++------ | ||
17 | include/hw/virtio/vhost-vdpa.h | 3 + | ||
18 | 4 files changed, 181 insertions(+), 25 deletions(-) | ||
19 | |||
20 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
21 | index XXXXXXX..XXXXXXX 100644 | ||
22 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
23 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
24 | @@ -XXX,XX +XXX,XX @@ static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) | ||
25 | return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); | ||
26 | } | ||
27 | |||
28 | +/** | ||
29 | + * Translate addresses between qemu's virtual addresses and the SVQ IOVA space | ||
30 | + * | ||
31 | + * @svq: Shadow VirtQueue | ||
32 | + * @addrs: Translated IOVA addresses | ||
33 | + * @iovec: Source qemu's VA addresses | ||
34 | + * @num: Length of iovec and minimum length of addrs | ||
35 | + */ | ||
36 | +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, | ||
37 | + void **addrs, const struct iovec *iovec, | ||
38 | + size_t num) | ||
39 | +{ | ||
40 | + if (num == 0) { | ||
41 | + return true; | ||
42 | + } | ||
43 | + | ||
44 | + for (size_t i = 0; i < num; ++i) { | ||
45 | + DMAMap needle = { | ||
46 | + .translated_addr = (hwaddr)iovec[i].iov_base, | ||
47 | + .size = iovec[i].iov_len, | ||
48 | + }; | ||
49 | + size_t off; | ||
50 | + | ||
51 | + const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle); | ||
52 | + /* | ||
53 | + * Map cannot be NULL since iova map contains all guest space and | ||
54 | + * qemu already has a physical address mapped | ||
55 | + */ | ||
56 | + if (unlikely(!map)) { | ||
57 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
58 | + "Invalid address 0x%"HWADDR_PRIx" given by guest", | ||
59 | + needle.translated_addr); | ||
60 | + return false; | ||
61 | + } | ||
62 | + | ||
63 | + off = needle.translated_addr - map->translated_addr; | ||
64 | + addrs[i] = (void *)(map->iova + off); | ||
65 | + | ||
66 | + if (unlikely(int128_gt(int128_add(needle.translated_addr, | ||
67 | + iovec[i].iov_len), | ||
68 | + map->translated_addr + map->size))) { | ||
69 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
70 | + "Guest buffer expands over iova range"); | ||
71 | + return false; | ||
72 | + } | ||
73 | + } | ||
74 | + | ||
75 | + return true; | ||
76 | +} | ||
77 | + | ||
78 | static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, | ||
79 | + void * const *sg, | ||
80 | const struct iovec *iovec, | ||
81 | size_t num, bool more_descs, bool write) | ||
82 | { | ||
83 | @@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, | ||
84 | } else { | ||
85 | descs[i].flags = flags; | ||
86 | } | ||
87 | - descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base); | ||
88 | + descs[i].addr = cpu_to_le64((hwaddr)sg[n]); | ||
89 | descs[i].len = cpu_to_le32(iovec[n].iov_len); | ||
90 | |||
91 | last = i; | ||
92 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
93 | { | ||
94 | unsigned avail_idx; | ||
95 | vring_avail_t *avail = svq->vring.avail; | ||
96 | + bool ok; | ||
97 | + g_autofree void **sgs = g_new(void *, MAX(elem->out_num, elem->in_num)); | ||
98 | |||
99 | *head = svq->free_head; | ||
100 | |||
101 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
102 | return false; | ||
103 | } | ||
104 | |||
105 | - vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, | ||
106 | + ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num); | ||
107 | + if (unlikely(!ok)) { | ||
108 | + return false; | ||
109 | + } | ||
110 | + vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, | ||
111 | elem->in_num > 0, false); | ||
112 | - vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); | ||
113 | + | ||
114 | + | ||
115 | + ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num); | ||
116 | + if (unlikely(!ok)) { | ||
117 | + return false; | ||
118 | + } | ||
119 | + | ||
120 | + vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true); | ||
121 | |||
122 | /* | ||
123 | * Put the entry in the available array (but don't update avail->idx until | ||
124 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
125 | * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
126 | * shadow methods and file descriptors. | ||
127 | * | ||
128 | + * @iova_tree: Tree to perform descriptors translations | ||
129 | + * | ||
130 | * Returns the new virtqueue or NULL. | ||
131 | * | ||
132 | * In case of error, reason is reported through error_report. | ||
133 | */ | ||
134 | -VhostShadowVirtqueue *vhost_svq_new(void) | ||
135 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree) | ||
136 | { | ||
137 | g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); | ||
138 | int r; | ||
139 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
140 | |||
141 | event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); | ||
142 | event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); | ||
143 | + svq->iova_tree = iova_tree; | ||
144 | return g_steal_pointer(&svq); | ||
145 | |||
146 | err_init_hdev_call: | ||
147 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
148 | index XXXXXXX..XXXXXXX 100644 | ||
149 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
150 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
151 | @@ -XXX,XX +XXX,XX @@ | ||
152 | #include "qemu/event_notifier.h" | ||
153 | #include "hw/virtio/virtio.h" | ||
154 | #include "standard-headers/linux/vhost_types.h" | ||
155 | +#include "hw/virtio/vhost-iova-tree.h" | ||
156 | |||
157 | /* Shadow virtqueue to relay notifications */ | ||
158 | typedef struct VhostShadowVirtqueue { | ||
159 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
160 | /* Virtio device */ | ||
161 | VirtIODevice *vdev; | ||
162 | |||
163 | + /* IOVA mapping */ | ||
164 | + VhostIOVATree *iova_tree; | ||
165 | + | ||
166 | /* Map for use the guest's descriptors */ | ||
167 | VirtQueueElement **ring_id_maps; | ||
168 | |||
169 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
170 | VirtQueue *vq); | ||
171 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
172 | |||
173 | -VhostShadowVirtqueue *vhost_svq_new(void); | ||
174 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree); | ||
175 | |||
176 | void vhost_svq_free(gpointer vq); | ||
177 | G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); | ||
178 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
179 | index XXXXXXX..XXXXXXX 100644 | ||
180 | --- a/hw/virtio/vhost-vdpa.c | ||
181 | +++ b/hw/virtio/vhost-vdpa.c | ||
182 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, | ||
183 | vaddr, section->readonly); | ||
184 | |||
185 | llsize = int128_sub(llend, int128_make64(iova)); | ||
186 | + if (v->shadow_vqs_enabled) { | ||
187 | + DMAMap mem_region = { | ||
188 | + .translated_addr = (hwaddr)vaddr, | ||
189 | + .size = int128_get64(llsize) - 1, | ||
190 | + .perm = IOMMU_ACCESS_FLAG(true, section->readonly), | ||
191 | + }; | ||
192 | + | ||
193 | + int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region); | ||
194 | + if (unlikely(r != IOVA_OK)) { | ||
195 | + error_report("Can't allocate a mapping (%d)", r); | ||
196 | + goto fail; | ||
197 | + } | ||
198 | + | ||
199 | + iova = mem_region.iova; | ||
200 | + } | ||
201 | |||
202 | vhost_vdpa_iotlb_batch_begin_once(v); | ||
203 | ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize), | ||
204 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener, | ||
205 | |||
206 | llsize = int128_sub(llend, int128_make64(iova)); | ||
207 | |||
208 | + if (v->shadow_vqs_enabled) { | ||
209 | + const DMAMap *result; | ||
210 | + const void *vaddr = memory_region_get_ram_ptr(section->mr) + | ||
211 | + section->offset_within_region + | ||
212 | + (iova - section->offset_within_address_space); | ||
213 | + DMAMap mem_region = { | ||
214 | + .translated_addr = (hwaddr)vaddr, | ||
215 | + .size = int128_get64(llsize) - 1, | ||
216 | + }; | ||
217 | + | ||
218 | + result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region); | ||
219 | + iova = result->iova; | ||
220 | + vhost_iova_tree_remove(v->iova_tree, &mem_region); | ||
221 | + } | ||
222 | vhost_vdpa_iotlb_batch_begin_once(v); | ||
223 | ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize)); | ||
224 | if (ret) { | ||
225 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
226 | |||
227 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
228 | for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
229 | - g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
230 | + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree); | ||
231 | |||
232 | if (unlikely(!svq)) { | ||
233 | error_setg(errp, "Cannot create svq %u", n); | ||
234 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, | ||
235 | /** | ||
236 | * Unmap a SVQ area in the device | ||
237 | */ | ||
238 | -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, | ||
239 | - hwaddr size) | ||
240 | +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, | ||
241 | + const DMAMap *needle) | ||
242 | { | ||
243 | + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle); | ||
244 | + hwaddr size; | ||
245 | int r; | ||
246 | |||
247 | - size = ROUND_UP(size, qemu_real_host_page_size); | ||
248 | - r = vhost_vdpa_dma_unmap(v, iova, size); | ||
249 | + if (unlikely(!result)) { | ||
250 | + error_report("Unable to find SVQ address to unmap"); | ||
251 | + return false; | ||
252 | + } | ||
253 | + | ||
254 | + size = ROUND_UP(result->size, qemu_real_host_page_size); | ||
255 | + r = vhost_vdpa_dma_unmap(v, result->iova, size); | ||
256 | return r == 0; | ||
257 | } | ||
258 | |||
259 | static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, | ||
260 | const VhostShadowVirtqueue *svq) | ||
261 | { | ||
262 | + DMAMap needle = {}; | ||
263 | struct vhost_vdpa *v = dev->opaque; | ||
264 | struct vhost_vring_addr svq_addr; | ||
265 | - size_t device_size = vhost_svq_device_area_size(svq); | ||
266 | - size_t driver_size = vhost_svq_driver_area_size(svq); | ||
267 | bool ok; | ||
268 | |||
269 | vhost_svq_get_vring_addr(svq, &svq_addr); | ||
270 | |||
271 | - ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size); | ||
272 | + needle.translated_addr = svq_addr.desc_user_addr; | ||
273 | + ok = vhost_vdpa_svq_unmap_ring(v, &needle); | ||
274 | if (unlikely(!ok)) { | ||
275 | return false; | ||
276 | } | ||
277 | |||
278 | - return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size); | ||
279 | + needle.translated_addr = svq_addr.used_user_addr; | ||
280 | + return vhost_vdpa_svq_unmap_ring(v, &needle); | ||
281 | +} | ||
282 | + | ||
283 | +/** | ||
284 | + * Map the SVQ area in the device | ||
285 | + * | ||
286 | + * @v: Vhost-vdpa device | ||
287 | + * @needle: The area to search an IOVA for | ||
288 | + * @errp: Error pointer | ||
289 | + */ | ||
290 | +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle, | ||
291 | + Error **errp) | ||
292 | +{ | ||
293 | + int r; | ||
294 | + | ||
295 | + r = vhost_iova_tree_map_alloc(v->iova_tree, needle); | ||
296 | + if (unlikely(r != IOVA_OK)) { | ||
297 | + error_setg(errp, "Cannot allocate iova (%d)", r); | ||
298 | + return false; | ||
299 | + } | ||
300 | + | ||
301 | + r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1, | ||
302 | + (void *)needle->translated_addr, | ||
303 | + needle->perm == IOMMU_RO); | ||
304 | + if (unlikely(r != 0)) { | ||
305 | + error_setg_errno(errp, -r, "Cannot map region to device"); | ||
306 | + vhost_iova_tree_remove(v->iova_tree, needle); | ||
307 | + } | ||
308 | + | ||
309 | + return r == 0; | ||
310 | } | ||
311 | |||
312 | /** | ||
313 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, | ||
314 | struct vhost_vring_addr *addr, | ||
315 | Error **errp) | ||
316 | { | ||
317 | + DMAMap device_region, driver_region; | ||
318 | + struct vhost_vring_addr svq_addr; | ||
319 | struct vhost_vdpa *v = dev->opaque; | ||
320 | size_t device_size = vhost_svq_device_area_size(svq); | ||
321 | size_t driver_size = vhost_svq_driver_area_size(svq); | ||
322 | - int r; | ||
323 | + size_t avail_offset; | ||
324 | + bool ok; | ||
325 | |||
326 | ERRP_GUARD(); | ||
327 | - vhost_svq_get_vring_addr(svq, addr); | ||
328 | + vhost_svq_get_vring_addr(svq, &svq_addr); | ||
329 | |||
330 | - r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, | ||
331 | - (void *)addr->desc_user_addr, true); | ||
332 | - if (unlikely(r != 0)) { | ||
333 | - error_setg_errno(errp, -r, "Cannot create vq driver region: "); | ||
334 | + driver_region = (DMAMap) { | ||
335 | + .translated_addr = svq_addr.desc_user_addr, | ||
336 | + .size = driver_size - 1, | ||
337 | + .perm = IOMMU_RO, | ||
338 | + }; | ||
339 | + ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp); | ||
340 | + if (unlikely(!ok)) { | ||
341 | + error_prepend(errp, "Cannot create vq driver region: "); | ||
342 | return false; | ||
343 | } | ||
344 | + addr->desc_user_addr = driver_region.iova; | ||
345 | + avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr; | ||
346 | + addr->avail_user_addr = driver_region.iova + avail_offset; | ||
347 | |||
348 | - r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, | ||
349 | - (void *)addr->used_user_addr, false); | ||
350 | - if (unlikely(r != 0)) { | ||
351 | - error_setg_errno(errp, -r, "Cannot create vq device region: "); | ||
352 | + device_region = (DMAMap) { | ||
353 | + .translated_addr = svq_addr.used_user_addr, | ||
354 | + .size = device_size - 1, | ||
355 | + .perm = IOMMU_RW, | ||
356 | + }; | ||
357 | + ok = vhost_vdpa_svq_map_ring(v, &device_region, errp); | ||
358 | + if (unlikely(!ok)) { | ||
359 | + error_prepend(errp, "Cannot create vq device region: "); | ||
360 | + vhost_vdpa_svq_unmap_ring(v, &driver_region); | ||
361 | } | ||
362 | + addr->used_user_addr = device_region.iova; | ||
363 | |||
364 | - return r == 0; | ||
365 | + return ok; | ||
366 | } | ||
367 | |||
368 | static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
369 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | ||
370 | index XXXXXXX..XXXXXXX 100644 | ||
371 | --- a/include/hw/virtio/vhost-vdpa.h | ||
372 | +++ b/include/hw/virtio/vhost-vdpa.h | ||
373 | @@ -XXX,XX +XXX,XX @@ | ||
374 | |||
375 | #include <gmodule.h> | ||
376 | |||
377 | +#include "hw/virtio/vhost-iova-tree.h" | ||
378 | #include "hw/virtio/virtio.h" | ||
379 | #include "standard-headers/linux/vhost_types.h" | ||
380 | |||
381 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | ||
382 | MemoryListener listener; | ||
383 | struct vhost_vdpa_iova_range iova_range; | ||
384 | bool shadow_vqs_enabled; | ||
385 | + /* IOVA mapping used by the Shadow Virtqueue */ | ||
386 | + VhostIOVATree *iova_tree; | ||
387 | GPtrArray *shadow_vqs; | ||
388 | struct vhost_dev *dev; | ||
389 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; | ||
390 | -- | ||
391 | 2.7.4 | ||
392 | |||
393 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | From: Eugenio Pérez <eperezma@redhat.com> | ||
1 | 2 | ||
3 | This is needed to achieve migration, so the destination can restore its | ||
4 | index. | ||
5 | |||
6 | Set the base to the last used idx, so the destination will see as | ||
7 | available all the entries that the device did not use, including the | ||
8 | ones still being processed in flight. | ||
9 | |||
10 | This is ok for networking, but other kinds of devices might have | ||
11 | problems with these retransmissions. | ||
12 | |||
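
As a toy example of what the reported base means for the destination (numbers invented, not QEMU code): if the guest made ten descriptors available and the shadow virtqueue saw seven of them used, reporting base = 7 makes the destination treat descriptors 7..9 as available again, so they are resubmitted after migration.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t shadow_avail_idx = 10; /* descriptors the guest made available */
        uint16_t last_used_idx = 7;     /* descriptors the device completed */
        uint16_t base = last_used_idx;  /* value the patched hook would report */

        printf("destination replays descriptors %u..%u\n",
               base, (uint16_t)(shadow_avail_idx - 1));
        return 0;
    }
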
13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
14 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
15 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
16 | --- | ||
17 | hw/virtio/vhost-vdpa.c | 17 +++++++++++++++++ | ||
18 | 1 file changed, 17 insertions(+) | ||
19 | |||
20 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
21 | index XXXXXXX..XXXXXXX 100644 | ||
22 | --- a/hw/virtio/vhost-vdpa.c | ||
23 | +++ b/hw/virtio/vhost-vdpa.c | ||
24 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, | ||
25 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, | ||
26 | struct vhost_vring_state *ring) | ||
27 | { | ||
28 | + struct vhost_vdpa *v = dev->opaque; | ||
29 | int ret; | ||
30 | |||
31 | + if (v->shadow_vqs_enabled) { | ||
32 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, | ||
33 | + ring->index); | ||
34 | + | ||
35 | + /* | ||
36 | + * Setting the base to the last used idx makes the destination see as | ||
37 | + * available all the entries that the device did not use, including the | ||
38 | + * ones still being processed in flight. | ||
39 | + * | ||
40 | + * TODO: This is ok for networking, but other kinds of devices might | ||
41 | + * have problems with these retransmissions. | ||
42 | + */ | ||
43 | + ring->num = svq->last_used_idx; | ||
44 | + return 0; | ||
45 | + } | ||
46 | + | ||
47 | ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); | ||
48 | trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); | ||
49 | return ret; | ||
50 | -- | ||
51 | 2.7.4 | ||
52 | |||
53 | diff view generated by jsdifflib |
1 | From: Fan Yang <Fan_Yang@sjtu.edu.cn> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | 'colo_mark_tcp_pkt' should return 'true' when packets are the same, and | 3 | Setting the log address would make the device start reporting invalid |
4 | 'false' otherwise. However, it returns 'true' when | 4 | dirty memory because the SVQ vrings are located in qemu's memory. |
5 | 'colo_compare_packet_payload' returns non-zero while | ||
6 | 'colo_compare_packet_payload' is just a 'memcmp'. The result is that | ||
7 | COLO-compare reports inconsistent TCP packets when they are actually | ||
8 | the same. | ||
9 | 5 | ||
10 | Fixes: f449c9e549c ("colo: compare the packet based on the tcp sequence number") | 6 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
11 | Cc: qemu-stable@nongnu.org | 7 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
12 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
13 | Signed-off-by: Fan Yang <Fan_Yang@sjtu.edu.cn> | ||
14 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 8 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
15 | --- | 9 | --- |
16 | net/colo-compare.c | 6 +++--- | 10 | hw/virtio/vhost-vdpa.c | 3 ++- |
17 | 1 file changed, 3 insertions(+), 3 deletions(-) | 11 | 1 file changed, 2 insertions(+), 1 deletion(-) |
18 | 12 | ||
19 | diff --git a/net/colo-compare.c b/net/colo-compare.c | 13 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
20 | index XXXXXXX..XXXXXXX 100644 | 14 | index XXXXXXX..XXXXXXX 100644 |
21 | --- a/net/colo-compare.c | 15 | --- a/hw/virtio/vhost-vdpa.c |
22 | +++ b/net/colo-compare.c | 16 | +++ b/hw/virtio/vhost-vdpa.c |
23 | @@ -XXX,XX +XXX,XX @@ static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt, | 17 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) |
24 | *mark = 0; | 18 | static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, |
25 | 19 | struct vhost_log *log) | |
26 | if (ppkt->tcp_seq == spkt->tcp_seq && ppkt->seq_end == spkt->seq_end) { | 20 | { |
27 | - if (colo_compare_packet_payload(ppkt, spkt, | 21 | - if (vhost_vdpa_one_time_request(dev)) { |
28 | + if (!colo_compare_packet_payload(ppkt, spkt, | 22 | + struct vhost_vdpa *v = dev->opaque; |
29 | ppkt->header_size, spkt->header_size, | 23 | + if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) { |
30 | ppkt->payload_size)) { | 24 | return 0; |
31 | *mark = COLO_COMPARE_FREE_SECONDARY | COLO_COMPARE_FREE_PRIMARY; | 25 | } |
32 | @@ -XXX,XX +XXX,XX @@ static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt, | 26 | |
33 | |||
34 | /* one part of secondary packet payload still need to be compared */ | ||
35 | if (!after(ppkt->seq_end, spkt->seq_end)) { | ||
36 | - if (colo_compare_packet_payload(ppkt, spkt, | ||
37 | + if (!colo_compare_packet_payload(ppkt, spkt, | ||
38 | ppkt->header_size + ppkt->offset, | ||
39 | spkt->header_size + spkt->offset, | ||
40 | ppkt->payload_size - ppkt->offset)) { | ||
41 | @@ -XXX,XX +XXX,XX @@ static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt, | ||
42 | /* primary packet is longer than secondary packet, compare | ||
43 | * the same part and mark the primary packet offset | ||
44 | */ | ||
45 | - if (colo_compare_packet_payload(ppkt, spkt, | ||
46 | + if (!colo_compare_packet_payload(ppkt, spkt, | ||
47 | ppkt->header_size + ppkt->offset, | ||
48 | spkt->header_size + spkt->offset, | ||
49 | spkt->payload_size - spkt->offset)) { | ||
50 | -- | 27 | -- |
51 | 2.5.0 | 28 | 2.7.4 |
52 | 29 | ||
53 | 30 | diff view generated by jsdifflib |
1 | From: "Michael S. Tsirkin" <mst@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Post load hook in virtio vmsd is called early while device is processed, | 3 | SVQ is able to log the dirty bits by itself, so let's use it to not |
4 | and when VirtIODevice core isn't fully initialized. Most device | 4 | block migration. |
5 | specific code isn't ready to deal with a device in such a state, and | ||
6 | behaves weirdly. | ||
7 | 5 | ||
8 | Add a new post_load hook in a device class instead. Devices should use | 6 | Also, ignore set and clear of VHOST_F_LOG_ALL on set_features if SVQ is |
9 | this unless they specifically want to verify the migration stream as | 7 | enabled. Even if the device supports it, the reports would be nonsense |
10 | it's processed, e.g. for bounds checking. | 8 | because SVQ memory is in the qemu region. |
11 | 9 | ||
12 | Cc: qemu-stable@nongnu.org | 10 | The log region is still allocated. Future changes might skip that, but |
13 | Suggested-by: "Dr. David Alan Gilbert" <dgilbert@redhat.com> | 11 | this series is already long enough. |
14 | Cc: Mikhail Sennikovsky <mikhail.sennikovskii@cloud.ionos.com> | 12 | |
15 | Signed-off-by: Michael S. Tsirkin <mst@redhat.com> | 13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
14 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
16 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
17 | --- | 16 | --- |
18 | hw/virtio/virtio.c | 7 +++++++ | 17 | hw/virtio/vhost-vdpa.c | 39 +++++++++++++++++++++++++++++++++++---- |
19 | include/hw/virtio/virtio.h | 6 ++++++ | 18 | include/hw/virtio/vhost-vdpa.h | 1 + |
20 | 2 files changed, 13 insertions(+) | 19 | 2 files changed, 36 insertions(+), 4 deletions(-) |
21 | 20 | ||
22 | diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c | 21 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
23 | index XXXXXXX..XXXXXXX 100644 | 22 | index XXXXXXX..XXXXXXX 100644 |
24 | --- a/hw/virtio/virtio.c | 23 | --- a/hw/virtio/vhost-vdpa.c |
25 | +++ b/hw/virtio/virtio.c | 24 | +++ b/hw/virtio/vhost-vdpa.c |
26 | @@ -XXX,XX +XXX,XX @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) | 25 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) |
26 | return v->index != 0; | ||
27 | } | ||
28 | |||
29 | +static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, | ||
30 | + uint64_t *features) | ||
31 | +{ | ||
32 | + int ret; | ||
33 | + | ||
34 | + ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); | ||
35 | + trace_vhost_vdpa_get_features(dev, *features); | ||
36 | + return ret; | ||
37 | +} | ||
38 | + | ||
39 | static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
40 | Error **errp) | ||
41 | { | ||
42 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
43 | return 0; | ||
27 | } | 44 | } |
28 | rcu_read_unlock(); | 45 | |
29 | 46 | - r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features); | |
30 | + if (vdc->post_load) { | 47 | + r = vhost_vdpa_get_dev_features(hdev, &dev_features); |
31 | + ret = vdc->post_load(vdev); | 48 | if (r != 0) { |
32 | + if (ret) { | 49 | error_setg_errno(errp, -r, "Can't get vdpa device features"); |
33 | + return ret; | 50 | return r; |
51 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_mem_table(struct vhost_dev *dev, | ||
52 | static int vhost_vdpa_set_features(struct vhost_dev *dev, | ||
53 | uint64_t features) | ||
54 | { | ||
55 | + struct vhost_vdpa *v = dev->opaque; | ||
56 | int ret; | ||
57 | |||
58 | if (vhost_vdpa_one_time_request(dev)) { | ||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | + if (v->shadow_vqs_enabled) { | ||
63 | + if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) { | ||
64 | + /* | ||
65 | + * QEMU is just trying to enable or disable logging. SVQ handles | ||
66 | + * this separately, so no need to forward this. | ||
67 | + */ | ||
68 | + v->acked_features = features; | ||
69 | + return 0; | ||
34 | + } | 70 | + } |
71 | + | ||
72 | + v->acked_features = features; | ||
73 | + | ||
74 | + /* We must not ack _F_LOG if SVQ is enabled */ | ||
75 | + features &= ~BIT_ULL(VHOST_F_LOG_ALL); | ||
35 | + } | 76 | + } |
36 | + | 77 | + |
37 | return 0; | 78 | trace_vhost_vdpa_set_features(dev, features); |
79 | ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features); | ||
80 | if (ret) { | ||
81 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | ||
82 | static int vhost_vdpa_get_features(struct vhost_dev *dev, | ||
83 | uint64_t *features) | ||
84 | { | ||
85 | - int ret; | ||
86 | + struct vhost_vdpa *v = dev->opaque; | ||
87 | + int ret = vhost_vdpa_get_dev_features(dev, features); | ||
88 | + | ||
89 | + if (ret == 0 && v->shadow_vqs_enabled) { | ||
90 | + /* Add SVQ logging capabilities */ | ||
91 | + *features |= BIT_ULL(VHOST_F_LOG_ALL); | ||
92 | + } | ||
93 | |||
94 | - ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); | ||
95 | - trace_vhost_vdpa_get_features(dev, *features); | ||
96 | return ret; | ||
38 | } | 97 | } |
39 | 98 | ||
40 | diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h | 99 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h |
41 | index XXXXXXX..XXXXXXX 100644 | 100 | index XXXXXXX..XXXXXXX 100644 |
42 | --- a/include/hw/virtio/virtio.h | 101 | --- a/include/hw/virtio/vhost-vdpa.h |
43 | +++ b/include/hw/virtio/virtio.h | 102 | +++ b/include/hw/virtio/vhost-vdpa.h |
44 | @@ -XXX,XX +XXX,XX @@ typedef struct VirtioDeviceClass { | 103 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { |
45 | */ | 104 | bool iotlb_batch_begin_sent; |
46 | void (*save)(VirtIODevice *vdev, QEMUFile *f); | 105 | MemoryListener listener; |
47 | int (*load)(VirtIODevice *vdev, QEMUFile *f, int version_id); | 106 | struct vhost_vdpa_iova_range iova_range; |
48 | + /* Post load hook in vmsd is called early while device is processed, and | 107 | + uint64_t acked_features; |
49 | + * when VirtIODevice isn't fully initialized. Devices should use this instead, | 108 | bool shadow_vqs_enabled; |
50 | + * unless they specifically want to verify the migration stream as it's | 109 | /* IOVA mapping used by the Shadow Virtqueue */ |
51 | + * processed, e.g. for bounds checking. | 110 | VhostIOVATree *iova_tree; |
52 | + */ | ||
53 | + int (*post_load)(VirtIODevice *vdev); | ||
54 | const VMStateDescription *vmsd; | ||
55 | } VirtioDeviceClass; | ||
56 | |||
57 | -- | 111 | -- |
58 | 2.5.0 | 112 | 2.7.4 |
59 | 113 | ||
60 | 114 | diff view generated by jsdifflib |
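A minimal sketch of the feature handling described above, assuming the bit numbering from the Linux vhost/virtio UAPI (VHOST_F_LOG_ALL is bit 26, VIRTIO_F_VERSION_1 is bit 32); only_log_bit_toggled is a helper name invented for this illustration, not a QEMU function:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VHOST_F_LOG_ALL    26
#define VIRTIO_F_VERSION_1 32
#define BIT_ULL(n)         (1ULL << (n))

/* True when the requested feature set differs from the previously acked
 * one only in VHOST_F_LOG_ALL, i.e. QEMU is merely turning dirty logging
 * on or off; in that case there is nothing to forward to the device. */
static bool only_log_bit_toggled(uint64_t acked, uint64_t requested)
{
    return (acked ^ requested) == BIT_ULL(VHOST_F_LOG_ALL);
}

int main(void)
{
    uint64_t acked = BIT_ULL(VIRTIO_F_VERSION_1);

    /* Pure logging toggle: handled by SVQ, not forwarded. */
    printf("%d\n", only_log_bit_toggled(acked, acked | BIT_ULL(VHOST_F_LOG_ALL)));

    /* Any other feature change still reaches the device. */
    printf("%d\n", only_log_bit_toggled(acked, acked | BIT_ULL(33)));
    return 0;
}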