The following changes since commit 352998df1c53b366413690d95b35f76d0721ebed:

  Merge tag 'i2c-20220314' of https://github.com/philmd/qemu into staging (2022-03-14 14:39:33 +0000)

are available in the git repository at:

  https://github.com/jasowang/qemu.git tags/net-pull-request

for you to fetch changes up to 12a195fa343aae2ead1301ce04727bd0ae25eb15:

  vdpa: Expose VHOST_F_LOG_ALL on SVQ (2022-03-15 13:57:44 +0800)

----------------------------------------------------------------

Changes since V2:
- fix 32bit build errors

----------------------------------------------------------------
Eugenio Pérez (14):
      vhost: Add VhostShadowVirtqueue
      vhost: Add Shadow VirtQueue kick forwarding capabilities
      vhost: Add Shadow VirtQueue call forwarding capabilities
      vhost: Add vhost_svq_valid_features to shadow vq
      virtio: Add vhost_svq_get_vring_addr
      vdpa: adapt vhost_ops callbacks to svq
      vhost: Shadow virtqueue buffers forwarding
      util: Add iova_tree_alloc_map
      util: add iova_tree_find_iova
      vhost: Add VhostIOVATree
      vdpa: Add custom IOTLB translations to SVQ
      vdpa: Adapt vhost_vdpa_get_vring_base to SVQ
      vdpa: Never set log_base addr if SVQ is enabled
      vdpa: Expose VHOST_F_LOG_ALL on SVQ

Jason Wang (1):
      virtio-net: fix map leaking on error during receive

 hw/net/virtio-net.c                |   1 +
 hw/virtio/meson.build              |   2 +-
 hw/virtio/vhost-iova-tree.c        | 110 +++++++
 hw/virtio/vhost-iova-tree.h        |  27 ++
 hw/virtio/vhost-shadow-virtqueue.c | 636 +++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  87 +++++
 hw/virtio/vhost-vdpa.c             | 522 +++++++++++++++++++++++++++++-
 include/hw/virtio/vhost-vdpa.h     |   8 +
 include/qemu/iova-tree.h           |  38 ++-
 util/iova-tree.c                   | 170 ++++++++++
 10 files changed, 1584 insertions(+), 17 deletions(-)
 create mode 100644 hw/virtio/vhost-iova-tree.c
 create mode 100644 hw/virtio/vhost-iova-tree.h
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.h
Commit bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
tries to fix the use-after-free of the sg by caching the virtqueue
elements in an array and unmapping them at once after receiving the
packets, but it forgot to unmap the cached elements on error, which
leads to leaked mappings and other unexpected results.

Fix this by detaching the cached elements on error. This addresses
CVE-2022-26353.

Reported-by: Victor Tom <vv474172261@gmail.com>
Cc: qemu-stable@nongnu.org
Fixes: CVE-2022-26353
Fixes: bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/virtio-net.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,

 err:
     for (j = 0; j < i; j++) {
+        virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
         g_free(elems[j]);
     }

--
2.7.4
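For illustration, the invariant the fix restores can be reduced to the following sketch (a hypothetical helper, not the actual virtio-net code; `elems` and `lens` mirror the arrays used by `virtio_net_receive_rcu()` above): every element popped from the RX ring must eventually be either pushed back as used or detached, otherwise its DMA mappings stay live.

```c
/* Hypothetical helper showing the fixed error path: return every cached
 * element to the guest ring, unmapping it in the process. */
static void drain_cached_elements(VirtQueue *rx_vq, VirtQueueElement **elems,
                                  size_t *lens, int n)
{
    for (int j = 0; j < n; j++) {
        /* virtqueue_detach_element() unmaps the element's sg lists and
         * makes its descriptors available to the guest again. */
        virtqueue_detach_element(rx_vq, elems[j], lens[j]);
        g_free(elems[j]);
    }
}
```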
From: Eugenio Pérez <eperezma@redhat.com>

Vhost shadow virtqueue (SVQ) is an intermediate jump for virtqueue
notifications and buffers, allowing qemu to track them. While qemu is
forwarding the buffers and virtqueue changes, it is able to track the
memory that is being dirtied, the same way regular qemu VirtIO devices
do.

This commit only exposes basic SVQ allocation and freeing. The next
patches of the series add functionality such as notification and buffer
forwarding.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/meson.build              |  2 +-
 hw/virtio/vhost-shadow-virtqueue.c | 62 ++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h | 28 +++++++++++++++++
 3 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.h

diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))

 virtio_ss = ss.source_set()
 virtio_ss.add(files('virtio.c'))
-virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c'))
+virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
 virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * vhost shadow virtqueue
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
+
+#include "qemu/error-report.h"
+
+/**
+ * Creates vhost shadow virtqueue, and instructs the vhost device to use the
+ * shadow methods and file descriptors.
+ *
+ * Returns the new virtqueue or NULL.
+ *
+ * In case of error, reason is reported through error_report.
+ */
+VhostShadowVirtqueue *vhost_svq_new(void)
+{
+    g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
+    int r;
+
+    r = event_notifier_init(&svq->hdev_kick, 0);
+    if (r != 0) {
+        error_report("Couldn't create kick event notifier: %s (%d)",
+                     g_strerror(errno), errno);
+        goto err_init_hdev_kick;
+    }
+
+    r = event_notifier_init(&svq->hdev_call, 0);
+    if (r != 0) {
+        error_report("Couldn't create call event notifier: %s (%d)",
+                     g_strerror(errno), errno);
+        goto err_init_hdev_call;
+    }
+
+    return g_steal_pointer(&svq);
+
+err_init_hdev_call:
+    event_notifier_cleanup(&svq->hdev_kick);
+
+err_init_hdev_kick:
+    return NULL;
+}
+
+/**
+ * Free the resources of the shadow virtqueue.
+ *
+ * @pvq: gpointer to SVQ so it can be used by autofree functions.
+ */
+void vhost_svq_free(gpointer pvq)
+{
+    VhostShadowVirtqueue *vq = pvq;
+    event_notifier_cleanup(&vq->hdev_kick);
+    event_notifier_cleanup(&vq->hdev_call);
+    g_free(vq);
+}
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * vhost shadow virtqueue
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef VHOST_SHADOW_VIRTQUEUE_H
+#define VHOST_SHADOW_VIRTQUEUE_H
+
+#include "qemu/event_notifier.h"
+
+/* Shadow virtqueue to relay notifications */
+typedef struct VhostShadowVirtqueue {
+    /* Shadow kick notifier, sent to vhost */
+    EventNotifier hdev_kick;
+    /* Shadow call notifier, sent to vhost */
+    EventNotifier hdev_call;
+} VhostShadowVirtqueue;
+
+VhostShadowVirtqueue *vhost_svq_new(void);
+
+void vhost_svq_free(gpointer vq);
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
+
+#endif
--
2.7.4
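A brief usage sketch of the new API (not part of the patch): because `vhost_svq_free()` doubles as a `GDestroyNotify` and is registered with `G_DEFINE_AUTOPTR_CLEANUP_FUNC`, SVQ lifetimes compose with GLib auto-cleanup, which is how the later patches in this series manage one SVQ per queue.

```c
#include "qemu/osdep.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"

/* Sketch: allocate one SVQ per virtqueue. On any failure the partially
 * filled array (and the event notifiers of every SVQ already in it) is
 * released automatically through vhost_svq_free(). */
static GPtrArray *alloc_svqs_sketch(unsigned nvqs)
{
    g_autoptr(GPtrArray) svqs = g_ptr_array_new_full(nvqs, vhost_svq_free);

    for (unsigned n = 0; n < nvqs; ++n) {
        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();

        if (!svq) {
            return NULL;
        }
        g_ptr_array_add(svqs, g_steal_pointer(&svq));
    }
    return g_steal_pointer(&svqs);
}
```

This mirrors what `vhost_vdpa_init_svq()` does two patches later in the series.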
From: Eugenio Pérez <eperezma@redhat.com>

In this mode no buffer forwarding is performed by the SVQ: qemu just
forwards the guest's kicks to the device.

Host memory notifier regions are left out for simplicity, and they will
not be addressed in this series.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c |  55 ++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  14 ++++
 hw/virtio/vhost-vdpa.c             | 144 ++++++++++++++++++++++++++++++++++++-
 include/hw/virtio/vhost-vdpa.h     |   4 ++
 4 files changed, 215 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/virtio/vhost-shadow-virtqueue.h"

 #include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "linux-headers/linux/vhost.h"
+
+/**
+ * Forward guest notifications.
+ *
+ * @n: guest kick event notifier, the one that guest set to notify svq.
+ */
+static void vhost_handle_guest_kick(EventNotifier *n)
+{
+    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
+    event_notifier_test_and_clear(n);
+    event_notifier_set(&svq->hdev_kick);
+}
+
+/**
+ * Set a new file descriptor for the guest to kick the SVQ and notify for avail
+ *
+ * @svq: The svq
+ * @svq_kick_fd: The svq kick fd
+ *
+ * Note that the SVQ will never close the old file descriptor.
+ */
+void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
+{
+    EventNotifier *svq_kick = &svq->svq_kick;
+    bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
+    bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;
+
+    if (poll_stop) {
+        event_notifier_set_handler(svq_kick, NULL);
+    }
+
+    /*
+     * event_notifier_set_handler already checks for guest's notifications if
+     * they arrive at the new file descriptor in the switch, so there is no
+     * need to explicitly check for them.
+     */
+    if (poll_start) {
+        event_notifier_init_fd(svq_kick, svq_kick_fd);
+        event_notifier_set(svq_kick);
+        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick);
+    }
+}
+
+/**
+ * Stop the shadow virtqueue operation.
+ * @svq: Shadow Virtqueue
+ */
+void vhost_svq_stop(VhostShadowVirtqueue *svq)
+{
+    event_notifier_set_handler(&svq->svq_kick, NULL);
+}

 /**
  * Creates vhost shadow virtqueue, and instructs the vhost device to use the
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
         goto err_init_hdev_call;
     }

+    event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
     return g_steal_pointer(&svq);

 err_init_hdev_call:
@@ -XXX,XX +XXX,XX @@ err_init_hdev_kick:
 void vhost_svq_free(gpointer pvq)
 {
     VhostShadowVirtqueue *vq = pvq;
+    vhost_svq_stop(vq);
     event_notifier_cleanup(&vq->hdev_kick);
     event_notifier_cleanup(&vq->hdev_call);
     g_free(vq);
 }
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
     EventNotifier hdev_kick;
     /* Shadow call notifier, sent to vhost */
     EventNotifier hdev_call;
+
+    /*
+     * Borrowed virtqueue's guest to host notifier. To borrow it in this event
+     * notifier allows to recover the VhostShadowVirtqueue from the event loop
+     * easily. If we use the VirtQueue's one, we don't have an easy way to
+     * retrieve VhostShadowVirtqueue.
+     *
+     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
+     */
+    EventNotifier svq_kick;
 } VhostShadowVirtqueue;

+void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
+
+void vhost_svq_stop(VhostShadowVirtqueue *svq);
+
 VhostShadowVirtqueue *vhost_svq_new(void);

 void vhost_svq_free(gpointer vq);
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/virtio/vhost.h"
 #include "hw/virtio/vhost-backend.h"
 #include "hw/virtio/virtio-net.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
 #include "hw/virtio/vhost-vdpa.h"
 #include "exec/address-spaces.h"
 #include "qemu/main-loop.h"
 #include "cpu.h"
 #include "trace.h"
 #include "qemu-common.h"
+#include "qapi/error.h"

 /*
  * Return one past the end of the end of section. Be careful with uint64_t
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
     return v->index != 0;
 }

+static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
+                               Error **errp)
+{
+    g_autoptr(GPtrArray) shadow_vqs = NULL;
+
+    if (!v->shadow_vqs_enabled) {
+        return 0;
+    }
+
+    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
+    for (unsigned n = 0; n < hdev->nvqs; ++n) {
+        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
+
+        if (unlikely(!svq)) {
+            error_setg(errp, "Cannot create svq %u", n);
+            return -1;
+        }
+        g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
+    }
+
+    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
+    return 0;
+}
+
 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
 {
     struct vhost_vdpa *v;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
     dev->opaque = opaque ;
     v->listener = vhost_vdpa_memory_listener;
     v->msg_type = VHOST_IOTLB_MSG_V2;
+    ret = vhost_vdpa_init_svq(dev, v, errp);
+    if (ret) {
+        goto err;
+    }

     vhost_vdpa_get_iova_range(v);

@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
                                VIRTIO_CONFIG_S_DRIVER);

     return 0;
+
+err:
+    ram_block_discard_disable(false);
+    return ret;
 }

 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)

 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int i;

+    if (v->shadow_vqs_enabled) {
+        /* FIXME SVQ is not compatible with host notifiers mr */
+        return;
+    }
+
     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
         if (vhost_vdpa_host_notifier_init(dev, i)) {
             goto err;
@@ -XXX,XX +XXX,XX @@ err:
     return;
 }

+static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    size_t idx;
+
+    if (!v->shadow_vqs) {
+        return;
+    }
+
+    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
+        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
+    }
+    g_ptr_array_free(v->shadow_vqs, true);
+}
+
 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
 {
     struct vhost_vdpa *v;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_cleanup(struct vhost_dev *dev)
     trace_vhost_vdpa_cleanup(dev, v);
     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
     memory_listener_unregister(&v->listener);
+    vhost_vdpa_svq_cleanup(dev);

     dev->opaque = NULL;
     ram_block_discard_disable(false);
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
     return ret;
 }

+static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
+{
+    if (!v->shadow_vqs_enabled) {
+        return;
+    }
+
+    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+        vhost_svq_stop(svq);
+    }
+}
+
 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int ret;
     uint8_t status = 0;

+    vhost_vdpa_reset_svq(v);
+
     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
     trace_vhost_vdpa_reset_device(dev, status);
     return ret;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
     return ret;
 }

+static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
+                                         struct vhost_vring_file *file)
+{
+    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
+}
+
+/**
+ * Set the shadow virtqueue descriptors to the device
+ *
+ * @dev: The vhost device model
+ * @svq: The shadow virtqueue
+ * @idx: The index of the virtqueue in the vhost device
+ * @errp: Error
+ */
+static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
+                                 VhostShadowVirtqueue *svq, unsigned idx,
+                                 Error **errp)
+{
+    struct vhost_vring_file file = {
+        .index = dev->vq_index + idx,
+    };
+    const EventNotifier *event_notifier = &svq->hdev_kick;
+    int r;
+
+    file.fd = event_notifier_get_fd(event_notifier);
+    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Can't set device kick fd");
+    }
+
+    return r == 0;
+}
+
+static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    Error *err = NULL;
+    unsigned i;
+
+    if (!v->shadow_vqs) {
+        return true;
+    }
+
+    for (i = 0; i < v->shadow_vqs->len; ++i) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
+        if (unlikely(!ok)) {
+            error_reportf_err(err, "Cannot setup SVQ %u: ", i);
+            return false;
+        }
+    }
+
+    return true;
+}
+
 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
 {
     struct vhost_vdpa *v = dev->opaque;
+    bool ok;
     trace_vhost_vdpa_dev_start(dev, started);

     if (started) {
         vhost_vdpa_host_notifiers_init(dev);
+        ok = vhost_vdpa_svqs_start(dev);
+        if (unlikely(!ok)) {
+            return -1;
+        }
         vhost_vdpa_set_vring_ready(dev);
     } else {
         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                      struct vhost_vring_file *file)
 {
-    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
+    struct vhost_vdpa *v = dev->opaque;
+    int vdpa_idx = file->index - dev->vq_index;
+
+    if (v->shadow_vqs_enabled) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
+        vhost_svq_set_svq_kick_fd(svq, file->fd);
+        return 0;
+    } else {
+        return vhost_vdpa_set_vring_dev_kick(dev, file);
+    }
 }

 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@
 #ifndef HW_VIRTIO_VHOST_VDPA_H
 #define HW_VIRTIO_VHOST_VDPA_H

+#include <gmodule.h>
+
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"

@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
     bool iotlb_batch_begin_sent;
     MemoryListener listener;
     struct vhost_vdpa_iova_range iova_range;
+    bool shadow_vqs_enabled;
+    GPtrArray *shadow_vqs;
     struct vhost_dev *dev;
     VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
 } VhostVDPA;
--
2.7.4
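The core of the kick path above is a plain eventfd relay. A self-contained sketch (field names follow the patch; this is not the actual SVQ code):

```c
#include "qemu/osdep.h"
#include "qemu/event_notifier.h"

typedef struct KickRelay {
    EventNotifier svq_kick;  /* guest-to-SVQ notifier (guest's kick fd) */
    EventNotifier hdev_kick; /* SVQ-to-device notifier polled by vhost  */
} KickRelay;

/* Handler run by the event loop when the guest kicks: consume the event
 * and re-raise it on the notifier the vhost device is polling. */
static void relay_guest_kick(EventNotifier *n)
{
    KickRelay *r = container_of(n, KickRelay, svq_kick);

    event_notifier_test_and_clear(n);
    event_notifier_set(&r->hdev_kick);
}
```

Note also the `event_notifier_set(svq_kick)` in `vhost_svq_set_svq_kick_fd()` above: raising the notification once after switching file descriptors means a kick that raced with the switch is never lost.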
From: Eugenio Pérez <eperezma@redhat.com>

This makes qemu aware of the device's used buffers, allowing it to
write their contents to guest memory when needed.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 38 ++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  4 ++++
 hw/virtio/vhost-vdpa.c             | 31 +++++++++++++++++++++++++++++--
 3 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(EventNotifier *n)
 }

 /**
+ * Forward vhost notifications
+ *
+ * @n: hdev call event notifier, the one that device set to notify svq.
+ */
+static void vhost_svq_handle_call(EventNotifier *n)
+{
+    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
+                                             hdev_call);
+    event_notifier_test_and_clear(n);
+    event_notifier_set(&svq->svq_call);
+}
+
+/**
+ * Set the call notifier for the SVQ to call the guest
+ *
+ * @svq: Shadow virtqueue
+ * @call_fd: call notifier
+ *
+ * Called on BQL context.
+ */
+void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
+{
+    if (call_fd == VHOST_FILE_UNBIND) {
+        /*
+         * Fail event_notifier_set if called handling device call.
+         *
+         * SVQ still needs device notifications, since it needs to keep
+         * forwarding used buffers even with the unbind.
+         */
+        memset(&svq->svq_call, 0, sizeof(svq->svq_call));
+    } else {
+        event_notifier_init_fd(&svq->svq_call, call_fd);
+    }
+}
+
+/**
  * Set a new file descriptor for the guest to kick the SVQ and notify for avail
  *
  * @svq: The svq
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
     }

     event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
+    event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
     return g_steal_pointer(&svq);

 err_init_hdev_call:
@@ -XXX,XX +XXX,XX @@ void vhost_svq_free(gpointer pvq)
     VhostShadowVirtqueue *vq = pvq;
     vhost_svq_stop(vq);
     event_notifier_cleanup(&vq->hdev_kick);
+    event_notifier_set_handler(&vq->hdev_call, NULL);
     event_notifier_cleanup(&vq->hdev_call);
     g_free(vq);
 }
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
      * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
      */
     EventNotifier svq_kick;
+
+    /* Guest's call notifier, where the SVQ calls guest. */
+    EventNotifier svq_call;
 } VhostShadowVirtqueue;

 void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
+void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);

 void vhost_svq_stop(VhostShadowVirtqueue *svq);

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
 }

+static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
+                                         struct vhost_vring_file *file)
+{
+    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
+}
+
 /**
  * Set the shadow virtqueue descriptors to the device
  *
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
  * @svq: The shadow virtqueue
  * @idx: The index of the virtqueue in the vhost device
  * @errp: Error
+ *
+ * Note that this function does not rewind kick file descriptor if cannot set
+ * call one.
  */
 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
     if (unlikely(r != 0)) {
         error_setg_errno(errp, -r, "Can't set device kick fd");
+        return false;
+    }
+
+    event_notifier = &svq->hdev_call;
+    file.fd = event_notifier_get_fd(event_notifier);
+    r = vhost_vdpa_set_vring_dev_call(dev, &file);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Can't set device call fd");
     }

     return r == 0;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                      struct vhost_vring_file *file)
 {
-    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (v->shadow_vqs_enabled) {
+        int vdpa_idx = file->index - dev->vq_index;
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
+
+        vhost_svq_set_svq_call_fd(svq, file->fd);
+        return 0;
+    } else {
+        return vhost_vdpa_set_vring_dev_call(dev, file);
+    }
 }

 static int vhost_vdpa_get_features(struct vhost_dev *dev,
--
2.7.4
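The call direction is the mirror image of the kick relay, with one subtlety around `VHOST_FILE_UNBIND`: zeroing `svq_call` leaves the notifier uninitialized, so notifying the guest becomes a harmless failure while the device's events still get consumed. A sketch of that behaviour (hypothetical names; this is not the SVQ code itself):

```c
#include "qemu/osdep.h"
#include "qemu/event_notifier.h"

typedef struct CallRelay {
    EventNotifier hdev_call; /* device-to-SVQ notifier, polled by QEMU  */
    EventNotifier svq_call;  /* SVQ-to-guest notifier (guest's call fd) */
} CallRelay;

/* Consume the device's event, then try to notify the guest. If the
 * guest unbound its call fd, svq_call is a zeroed notifier and
 * event_notifier_set() fails harmlessly -- but the device event has
 * still been drained, so used-buffer processing (added later in this
 * series) keeps running. */
static void relay_device_call(EventNotifier *n)
{
    CallRelay *r = container_of(n, CallRelay, hdev_call);

    event_notifier_test_and_clear(n);
    event_notifier_set(&r->svq_call);
}
```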
From: Eugenio Pérez <eperezma@redhat.com>

This allows SVQ to negotiate features with the guest and the device. For
the device, SVQ is a driver. While this function bypasses all
non-transport features, it needs to disable the features that SVQ does
not support when forwarding buffers. This includes packed vq layout,
indirect descriptors or event idx.

Future changes can add support to offer more features to the guest,
since the use of VirtQueue gives this for free. This is left out at the
moment for simplicity.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 44 ++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  2 ++
 hw/virtio/vhost-vdpa.c             | 15 +++++++++++++
 3 files changed, 61 insertions(+)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/virtio/vhost-shadow-virtqueue.h"

 #include "qemu/error-report.h"
+#include "qapi/error.h"
 #include "qemu/main-loop.h"
 #include "linux-headers/linux/vhost.h"

 /**
+ * Validate the transport device features that both guests can use with the SVQ
+ * and SVQs can use with the device.
+ *
+ * @dev_features: The features
+ * @errp: Error pointer
+ */
+bool vhost_svq_valid_features(uint64_t features, Error **errp)
+{
+    bool ok = true;
+    uint64_t svq_features = features;
+
+    for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END;
+         ++b) {
+        switch (b) {
+        case VIRTIO_F_ANY_LAYOUT:
+            continue;
+
+        case VIRTIO_F_ACCESS_PLATFORM:
+            /* SVQ trust in the host's IOMMU to translate addresses */
+        case VIRTIO_F_VERSION_1:
+            /* SVQ trust that the guest vring is little endian */
+            if (!(svq_features & BIT_ULL(b))) {
+                svq_features |= BIT_ULL(b);
+                ok = false;
+            }
+            continue;
+
+        default:
+            if (svq_features & BIT_ULL(b)) {
+                svq_features &= ~BIT_ULL(b);
+                ok = false;
+            }
+        }
+    }
+
+    if (!ok) {
+        error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64
+                   ", ok: 0x%"PRIx64, features, svq_features);
+    }
+    return ok;
+}
+
+/**
  * Forward guest notifications.
  *
  * @n: guest kick event notifier, the one that guest set to notify svq.
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
     EventNotifier svq_call;
 } VhostShadowVirtqueue;

+bool vhost_svq_valid_features(uint64_t features, Error **errp);
+
 void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
 void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                                Error **errp)
 {
     g_autoptr(GPtrArray) shadow_vqs = NULL;
+    uint64_t dev_features, svq_features;
+    int r;
+    bool ok;

     if (!v->shadow_vqs_enabled) {
         return 0;
     }

+    r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features);
+    if (r != 0) {
+        error_setg_errno(errp, -r, "Can't get vdpa device features");
+        return r;
+    }
+
+    svq_features = dev_features;
+    ok = vhost_svq_valid_features(svq_features, errp);
+    if (unlikely(!ok)) {
+        return -1;
+    }
+
     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
     for (unsigned n = 0; n < hdev->nvqs; ++n) {
         g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
--
2.7.4
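To make the accept/require/reject behaviour concrete, here is a small hypothetical self-test (illustrative only, not part of the patch; `VIRTIO_F_*` are the standard virtio feature bits from QEMU's linux standard headers):

```c
#include "qemu/osdep.h"
#include "qemu/bitops.h"
#include "qapi/error.h"
#include "standard-headers/linux/virtio_config.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"

static void svq_features_selftest(void)
{
    Error *err = NULL;
    /* Required transport bits: little-endian split ring + platform IOMMU. */
    uint64_t good = BIT_ULL(VIRTIO_F_VERSION_1) |
                    BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
    /* Any other transport bit, e.g. the packed ring, must be rejected. */
    uint64_t bad = good | BIT_ULL(VIRTIO_F_RING_PACKED);

    assert(vhost_svq_valid_features(good, &err));
    assert(!vhost_svq_valid_features(bad, &err));
    error_free(err);
}
```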
From: Eugenio Pérez <eperezma@redhat.com>

It reports the shadow virtqueue addresses in qemu's virtual address
space.

Since these differ from the guest's vaddr, but the device can access
them, SVQ takes special care about their alignment and lack of garbage
data. It assumes that the IOMMU works in host_page_size ranges for that.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 29 +++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  9 +++++++++
 2 files changed, 38 insertions(+)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
 }

 /**
+ * Get the shadow vq vring address.
+ * @svq: Shadow virtqueue
+ * @addr: Destination to store address
+ */
+void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
+                              struct vhost_vring_addr *addr)
+{
+    addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc;
+    addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail;
+    addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used;
+}
+
+size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
+{
+    size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
+    size_t avail_size = offsetof(vring_avail_t, ring) +
+                        sizeof(uint16_t) * svq->vring.num;
+
+    return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size);
+}
+
+size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
+{
+    size_t used_size = offsetof(vring_used_t, ring) +
+                       sizeof(vring_used_elem_t) * svq->vring.num;
+    return ROUND_UP(used_size, qemu_real_host_page_size);
+}
+
+/**
  * Set a new file descriptor for the guest to kick the SVQ and notify for avail
  *
  * @svq: The svq
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
 #define VHOST_SHADOW_VIRTQUEUE_H

 #include "qemu/event_notifier.h"
+#include "hw/virtio/virtio.h"
+#include "standard-headers/linux/vhost_types.h"

 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
+    /* Shadow vring */
+    struct vring vring;
+
     /* Shadow kick notifier, sent to vhost */
     EventNotifier hdev_kick;
     /* Shadow call notifier, sent to vhost */
@@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp);

 void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
 void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
+void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
+                              struct vhost_vring_addr *addr);
+size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
+size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);

 void vhost_svq_stop(VhostShadowVirtqueue *svq);

--
2.7.4
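For illustration, here is the arithmetic of the two size helpers for a 256-entry split ring on a host with 4 KiB pages (a worked example; the element sizes come from the virtio 1.x split ring layout used above):

```c
/*
 * Worked example for svq->vring.num == 256, 4 KiB host pages:
 *
 *   desc:  sizeof(vring_desc_t) * 256 = 16 * 256 = 4096 bytes
 *   avail: offsetof(vring_avail_t, ring) + 2 * 256 = 4 + 512 =  516 bytes
 *   used:  offsetof(vring_used_t, ring)  + 8 * 256 = 4 + 2048 = 2052 bytes
 *
 *   vhost_svq_driver_area_size() -> ROUND_UP(4096 + 516, 4096) = 8192
 *   vhost_svq_device_area_size() -> ROUND_UP(2052, 4096)       = 4096
 *
 * Page-aligning both areas keeps the device-visible mappings free of
 * unrelated QEMU data, which is what the commit message means by
 * "alignment & lack of garbage data".
 */
```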
From: Eugenio Pérez <eperezma@redhat.com>

First half of the buffer forwarding part: this prepares the vhost-vdpa
callbacks to offer SVQ. QEMU cannot enable it yet, so this is
effectively dead code at the moment, but it helps to reduce patch size.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-vdpa.c | 48 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
     return ret;
 }

+static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
+                                         struct vhost_vring_state *ring)
+{
+    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
+}
+
 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                          struct vhost_vring_file *file)
 {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
 }

+static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
+                                         struct vhost_vring_addr *addr)
+{
+    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
+                                    addr->desc_user_addr, addr->used_user_addr,
+                                    addr->avail_user_addr,
+                                    addr->log_guest_addr);
+
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
+
+}
+
 /**
  * Set the shadow virtqueue descriptors to the device
  *
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                      struct vhost_vring_addr *addr)
 {
-    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
-                                    addr->desc_user_addr, addr->used_user_addr,
-                                    addr->avail_user_addr,
-                                    addr->log_guest_addr);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (v->shadow_vqs_enabled) {
+        /*
+         * Device vring addr was set at device start. SVQ base is handled by
+         * VirtQueue code.
+         */
+        return 0;
+    }
+
+    return vhost_vdpa_set_vring_dev_addr(dev, addr);
 }

 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                      struct vhost_vring_state *ring)
 {
-    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (v->shadow_vqs_enabled) {
+        /*
+         * Device vring base was set at device start. SVQ base is handled by
+         * VirtQueue code.
+         */
+        return 0;
+    }
+
+    return vhost_vdpa_set_dev_vring_base(dev, ring);
 }

 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
--
2.7.4
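Both converted callbacks share one shape, condensed below (a sketch mirroring the patch above, not new functionality): the guest-facing vhost op consults `shadow_vqs_enabled` and either stops at QEMU, because the device was already configured with the shadow ring at start time, or forwards to the device-facing `_dev_` variant verbatim.

```c
/* Sketch of the dispatch pattern introduced above. */
static int set_vring_base_sketch(struct vhost_dev *dev,
                                 struct vhost_vring_state *ring)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /* The device sees SVQ's ring, not the guest's: nothing to send. */
        return 0;
    }
    return vhost_vdpa_set_dev_vring_base(dev, ring);
}
```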
From: Eugenio Pérez <eperezma@redhat.com>

Initial version of the shadow virtqueue that actually forwards buffers.
There is no IOMMU support at the moment, and that will be addressed in
future patches of this series. Since all vhost-vdpa devices use a
forced IOMMU, this means that SVQ is not usable on any device at this
point of the series.

For simplicity it only supports modern devices, which expect a vring in
little endian, with a split ring and no event idx or indirect
descriptors. Support for them will not be added in this series.

It reuses the VirtQueue code for the device part. The driver part is
based on Linux's virtio_ring driver, but with stripped functionality
and optimizations so it's easier to review.

However, forwarding buffers has some particular pieces: one of the most
unexpected is that a guest's buffer can span more than one descriptor
in the SVQ. While this is handled gracefully by qemu's emulated virtio
devices, it may cause an unexpected SVQ queue-full condition. This
patch also solves that by checking for it at both the guest's kicks and
the device's calls. The code may be more elegant in the future if SVQ
code runs in its own iocontext.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 352 ++++++++++++++++++++++++++++++-
 hw/virtio/vhost-shadow-virtqueue.h |  26 +++
 hw/virtio/vhost-vdpa.c             | 155 +++++++++++++-
 3 files changed, 522 insertions(+), 11 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/error-report.h"
 #include "qapi/error.h"
 #include "qemu/main-loop.h"
+#include "qemu/log.h"
+#include "qemu/memalign.h"
 #include "linux-headers/linux/vhost.h"

 /**
@@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp)
 }

 /**
- * Forward guest notifications.
+ * Number of descriptors that the SVQ can make available from the guest.
+ *
+ * @svq: The svq
+ */
56
+static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
140
VMSTATE_INT8(tx.props.ip, E1000State),
57
+{
141
VMSTATE_INT8(tx.props.tcp, E1000State),
58
+ return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
142
VMSTATE_BUFFER(tx.header, E1000State),
59
+}
143
diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
60
+
144
index XXXXXXX..XXXXXXX 100644
61
+static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
145
--- a/hw/net/e1000e.c
62
+ const struct iovec *iovec, size_t num,
146
+++ b/hw/net/e1000e.c
63
+ bool more_descs, bool write)
147
@@ -XXX,XX +XXX,XX @@ static const VMStateDescription e1000e_vmstate_tx = {
64
+{
148
.version_id = 1,
65
+ uint16_t i = svq->free_head, last = svq->free_head;
149
.minimum_version_id = 1,
66
+ unsigned n;
150
.fields = (VMStateField[]) {
67
+ uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
151
- VMSTATE_UINT8(props.sum_needed, struct e1000e_tx),
68
+ vring_desc_t *descs = svq->vring.desc;
152
+ VMSTATE_UINT8(sum_needed, struct e1000e_tx),
69
+
153
VMSTATE_UINT8(props.ipcss, struct e1000e_tx),
70
+ if (num == 0) {
154
VMSTATE_UINT8(props.ipcso, struct e1000e_tx),
71
+ return;
155
VMSTATE_UINT16(props.ipcse, struct e1000e_tx),
72
+ }
156
@@ -XXX,XX +XXX,XX @@ static const VMStateDescription e1000e_vmstate_tx = {
73
+
157
VMSTATE_INT8(props.ip, struct e1000e_tx),
74
+ for (n = 0; n < num; n++) {
158
VMSTATE_INT8(props.tcp, struct e1000e_tx),
75
+ if (more_descs || (n + 1 < num)) {
159
VMSTATE_BOOL(props.tse, struct e1000e_tx),
76
+ descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
160
- VMSTATE_BOOL(props.cptse, struct e1000e_tx),
77
+ } else {
161
+ VMSTATE_BOOL(cptse, struct e1000e_tx),
78
+ descs[i].flags = flags;
162
VMSTATE_BOOL(skip_cp, struct e1000e_tx),
79
+ }
163
VMSTATE_END_OF_LIST()
80
+ descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base);
164
}
81
+ descs[i].len = cpu_to_le32(iovec[n].iov_len);
165
diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
82
+
166
index XXXXXXX..XXXXXXX 100644
83
+ last = i;
167
--- a/hw/net/e1000e_core.c
84
+ i = cpu_to_le16(descs[i].next);
168
+++ b/hw/net/e1000e_core.c
85
+ }
169
@@ -XXX,XX +XXX,XX @@ e1000e_rss_parse_packet(E1000ECore *core,
86
+
170
static void
87
+ svq->free_head = le16_to_cpu(descs[last].next);
171
e1000e_setup_tx_offloads(E1000ECore *core, struct e1000e_tx *tx)
88
+}
89
+
90
+static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
91
+ VirtQueueElement *elem, unsigned *head)
92
+{
93
+ unsigned avail_idx;
94
+ vring_avail_t *avail = svq->vring.avail;
95
+
96
+ *head = svq->free_head;
97
+
98
+ /* We need some descriptors here */
99
+ if (unlikely(!elem->out_num && !elem->in_num)) {
100
+ qemu_log_mask(LOG_GUEST_ERROR,
101
+ "Guest provided element with no descriptors");
102
+ return false;
103
+ }
104
+
105
+ vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0,
106
+ false);
107
+ vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
108
+
109
+ /*
110
+ * Put the entry in the available array (but don't update avail->idx until
111
+ * they do sync).
112
+ */
113
+ avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
114
+ avail->ring[avail_idx] = cpu_to_le16(*head);
115
+ svq->shadow_avail_idx++;
116
+
117
+ /* Update the avail index after write the descriptor */
118
+ smp_wmb();
119
+ avail->idx = cpu_to_le16(svq->shadow_avail_idx);
120
+
121
+ return true;
122
+}
123
+
124
+static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
125
+{
126
+ unsigned qemu_head;
127
+ bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
128
+ if (unlikely(!ok)) {
129
+ return false;
130
+ }
131
+
132
+ svq->ring_id_maps[qemu_head] = elem;
133
+ return true;
134
+}
135
+
136
+static void vhost_svq_kick(VhostShadowVirtqueue *svq)
137
+{
138
+ /*
139
+ * We need to expose the available array entries before checking the used
140
+ * flags
141
+ */
142
+ smp_mb();
143
+ if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
144
+ return;
145
+ }
146
+
147
+ event_notifier_set(&svq->hdev_kick);
148
+}
149
+
150
+/**
151
+ * Forward available buffers.
152
+ *
153
+ * @svq: Shadow VirtQueue
154
+ *
155
+ * Note that this function does not guarantee that all guest's available
156
+ * buffers are available to the device in SVQ avail ring. The guest may have
157
+ * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in
158
+ * qemu vaddr.
159
+ *
160
+ * If that happens, guest's kick notifications will be disabled until the
161
+ * device uses some buffers.
162
+ */
163
+static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
164
+{
165
+ /* Clear event notifier */
166
+ event_notifier_test_and_clear(&svq->svq_kick);
167
+
168
+ /* Forward to the device as many available buffers as possible */
169
+ do {
170
+ virtio_queue_set_notification(svq->vq, false);
171
+
172
+ while (true) {
173
+ VirtQueueElement *elem;
174
+ bool ok;
175
+
176
+ if (svq->next_guest_avail_elem) {
177
+ elem = g_steal_pointer(&svq->next_guest_avail_elem);
178
+ } else {
179
+ elem = virtqueue_pop(svq->vq, sizeof(*elem));
180
+ }
181
+
182
+ if (!elem) {
183
+ break;
184
+ }
185
+
186
+ if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) {
187
+ /*
188
+ * This condition is possible since a contiguous buffer in GPA
189
+ * does not imply a contiguous buffer in qemu's VA
190
+ * scatter-gather segments. If that happens, the buffer exposed
191
+ * to the device needs to be a chain of descriptors at this
192
+ * moment.
193
+ *
194
+ * SVQ cannot hold more available buffers if we are here:
195
+ * queue the current guest descriptor and ignore further kicks
196
+ * until some elements are used.
197
+ */
198
+ svq->next_guest_avail_elem = elem;
199
+ return;
200
+ }
201
+
202
+ ok = vhost_svq_add(svq, elem);
203
+ if (unlikely(!ok)) {
204
+ /* VQ is broken, just return and ignore any other kicks */
205
+ return;
206
+ }
207
+ vhost_svq_kick(svq);
208
+ }
209
+
210
+ virtio_queue_set_notification(svq->vq, true);
211
+ } while (!virtio_queue_empty(svq->vq));
212
+}
213
+
214
+/**
215
+ * Handle guest's kick.
216
*
217
* @n: guest kick event notifier, the one that guest set to notify svq.
218
*/
219
-static void vhost_handle_guest_kick(EventNotifier *n)
220
+static void vhost_handle_guest_kick_notifier(EventNotifier *n)
172
{
221
{
173
- if (tx->props.tse && tx->props.cptse) {
222
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
174
+ if (tx->props.tse && tx->cptse) {
223
event_notifier_test_and_clear(n);
175
net_tx_pkt_build_vheader(tx->tx_pkt, true, true, tx->props.mss);
224
- event_notifier_set(&svq->hdev_kick);
176
net_tx_pkt_update_ip_checksums(tx->tx_pkt);
225
+ vhost_handle_guest_kick(svq);
177
e1000x_inc_reg_if_not_full(core->mac, TSCTC);
226
+}
178
return;
227
+
179
}
228
+static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
180
229
+{
181
- if (tx->props.sum_needed & E1000_TXD_POPTS_TXSM) {
230
+ if (svq->last_used_idx != svq->shadow_used_idx) {
182
+ if (tx->sum_needed & E1000_TXD_POPTS_TXSM) {
231
+ return true;
183
net_tx_pkt_build_vheader(tx->tx_pkt, false, true, 0);
232
+ }
184
}
233
+
185
234
+ svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
186
- if (tx->props.sum_needed & E1000_TXD_POPTS_IXSM) {
235
+
187
+ if (tx->sum_needed & E1000_TXD_POPTS_IXSM) {
236
+ return svq->last_used_idx != svq->shadow_used_idx;
188
net_tx_pkt_update_ip_hdr_checksum(tx->tx_pkt);
237
}
238
239
/**
240
- * Forward vhost notifications
241
+ * Enable vhost device calls after disabling them.
242
+ *
243
+ * @svq: The svq
244
+ *
245
+ * It returns false if there are pending used buffers from the vhost device,
246
+ * avoiding the possible races between SVQ checking for more work and enabling
247
+ * callbacks. True if SVQ used vring has no more pending buffers.
248
+ */
249
+static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
250
+{
251
+ svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
252
+ /* Make sure the flag is written before the read of used_idx */
253
+ smp_mb();
254
+ return !vhost_svq_more_used(svq);
255
+}
256
+
257
+static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
258
+{
259
+ svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
260
+}
261
+
262
+static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
263
+ uint32_t *len)
264
+{
265
+ vring_desc_t *descs = svq->vring.desc;
266
+ const vring_used_t *used = svq->vring.used;
267
+ vring_used_elem_t used_elem;
268
+ uint16_t last_used;
269
+
270
+ if (!vhost_svq_more_used(svq)) {
271
+ return NULL;
272
+ }
273
+
274
+ /* Only get used array entries after they have been exposed by dev */
275
+ smp_rmb();
276
+ last_used = svq->last_used_idx & (svq->vring.num - 1);
277
+ used_elem.id = le32_to_cpu(used->ring[last_used].id);
278
+ used_elem.len = le32_to_cpu(used->ring[last_used].len);
279
+
280
+ svq->last_used_idx++;
281
+ if (unlikely(used_elem.id >= svq->vring.num)) {
282
+ qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
283
+ svq->vdev->name, used_elem.id);
284
+ return NULL;
285
+ }
286
+
287
+ if (unlikely(!svq->ring_id_maps[used_elem.id])) {
288
+ qemu_log_mask(LOG_GUEST_ERROR,
289
+ "Device %s says index %u is used, but it was not available",
290
+ svq->vdev->name, used_elem.id);
291
+ return NULL;
292
+ }
293
+
294
+ descs[used_elem.id].next = svq->free_head;
295
+ svq->free_head = used_elem.id;
296
+
297
+ *len = used_elem.len;
298
+ return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
299
+}
300
+
301
+static void vhost_svq_flush(VhostShadowVirtqueue *svq,
302
+ bool check_for_avail_queue)
303
+{
304
+ VirtQueue *vq = svq->vq;
305
+
306
+ /* Forward as many used buffers as possible. */
307
+ do {
308
+ unsigned i = 0;
309
+
310
+ vhost_svq_disable_notification(svq);
311
+ while (true) {
312
+ uint32_t len;
313
+ g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
314
+ if (!elem) {
315
+ break;
316
+ }
317
+
318
+ if (unlikely(i >= svq->vring.num)) {
319
+ qemu_log_mask(LOG_GUEST_ERROR,
320
+ "More than %u used buffers obtained in a %u size SVQ",
321
+ i, svq->vring.num);
322
+ virtqueue_fill(vq, elem, len, i);
323
+ virtqueue_flush(vq, i);
324
+ return;
325
+ }
326
+ virtqueue_fill(vq, elem, len, i++);
327
+ }
328
+
329
+ virtqueue_flush(vq, i);
330
+ event_notifier_set(&svq->svq_call);
331
+
332
+ if (check_for_avail_queue && svq->next_guest_avail_elem) {
333
+ /*
334
+ * Avail ring was full when vhost_svq_flush was called, so it's a
335
+ * good moment to make more descriptors available if possible.
336
+ */
337
+ vhost_handle_guest_kick(svq);
338
+ }
339
+ } while (!vhost_svq_enable_notification(svq));
340
+}
341
+
342
+/**
343
+ * Forward used buffers.
344
*
345
* @n: hdev call event notifier, the one that device set to notify svq.
346
+ *
347
+ * Note that we are not making any buffers available in the loop, there is no
348
+ * way that it runs more than virtqueue size times.
349
*/
350
static void vhost_svq_handle_call(EventNotifier *n)
351
{
352
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
353
hdev_call);
354
event_notifier_test_and_clear(n);
355
- event_notifier_set(&svq->svq_call);
356
+ vhost_svq_flush(svq, true);
357
}
358
359
/**
360
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
361
if (poll_start) {
362
event_notifier_init_fd(svq_kick, svq_kick_fd);
363
event_notifier_set(svq_kick);
364
- event_notifier_set_handler(svq_kick, vhost_handle_guest_kick);
365
+ event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
366
+ }
367
+}
368
+
369
+/**
370
+ * Start the shadow virtqueue operation.
371
+ *
372
+ * @svq: Shadow Virtqueue
373
+ * @vdev: VirtIO device
374
+ * @vq: Virtqueue to shadow
375
+ */
376
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
377
+ VirtQueue *vq)
378
+{
379
+ size_t desc_size, driver_size, device_size;
380
+
381
+ svq->next_guest_avail_elem = NULL;
382
+ svq->shadow_avail_idx = 0;
383
+ svq->shadow_used_idx = 0;
384
+ svq->last_used_idx = 0;
385
+ svq->vdev = vdev;
386
+ svq->vq = vq;
387
+
388
+ svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
389
+ driver_size = vhost_svq_driver_area_size(svq);
390
+ device_size = vhost_svq_device_area_size(svq);
391
+ svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
392
+ desc_size = sizeof(vring_desc_t) * svq->vring.num;
393
+ svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
394
+ memset(svq->vring.desc, 0, driver_size);
395
+ svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
396
+ memset(svq->vring.used, 0, device_size);
397
+ svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
398
+ for (unsigned i = 0; i < svq->vring.num - 1; i++) {
399
+ svq->vring.desc[i].next = cpu_to_le16(i + 1);
189
}
400
}
190
}
401
}
191
@@ -XXX,XX +XXX,XX @@ e1000e_process_tx_desc(E1000ECore *core,
402
192
return;
403
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
193
} else if (dtype == (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D)) {
404
void vhost_svq_stop(VhostShadowVirtqueue *svq)
194
/* data descriptor */
405
{
195
- tx->props.sum_needed = le32_to_cpu(dp->upper.data) >> 8;
406
event_notifier_set_handler(&svq->svq_kick, NULL);
196
- tx->props.cptse = (txd_lower & E1000_TXD_CMD_TSE) ? 1 : 0;
407
+ g_autofree VirtQueueElement *next_avail_elem = NULL;
197
+ tx->sum_needed = le32_to_cpu(dp->upper.data) >> 8;
408
+
198
+ tx->cptse = (txd_lower & E1000_TXD_CMD_TSE) ? 1 : 0;
409
+ if (!svq->vq) {
199
e1000e_process_ts_option(core, dp);
410
+ return;
411
+ }
412
+
413
+ /* Send all pending used descriptors to guest */
414
+ vhost_svq_flush(svq, false);
415
+
416
+ for (unsigned i = 0; i < svq->vring.num; ++i) {
417
+ g_autofree VirtQueueElement *elem = NULL;
418
+ elem = g_steal_pointer(&svq->ring_id_maps[i]);
419
+ if (elem) {
420
+ virtqueue_detach_element(svq->vq, elem, 0);
421
+ }
422
+ }
423
+
424
+ next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
425
+ if (next_avail_elem) {
426
+ virtqueue_detach_element(svq->vq, next_avail_elem, 0);
427
+ }
428
+ svq->vq = NULL;
429
+ g_free(svq->ring_id_maps);
430
+ qemu_vfree(svq->vring.desc);
431
+ qemu_vfree(svq->vring.used);
432
}
433
434
/**
435
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
436
index XXXXXXX..XXXXXXX 100644
437
--- a/hw/virtio/vhost-shadow-virtqueue.h
438
+++ b/hw/virtio/vhost-shadow-virtqueue.h
439
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
440
441
/* Guest's call notifier, where the SVQ calls guest. */
442
EventNotifier svq_call;
443
+
444
+ /* Virtio queue shadowing */
445
+ VirtQueue *vq;
446
+
447
+ /* Virtio device */
448
+ VirtIODevice *vdev;
449
+
450
+ /* Map between SVQ descriptor indexes and the guest's descriptors */
451
+ VirtQueueElement **ring_id_maps;
452
+
453
+ /* Next VirtQueue element that guest made available */
454
+ VirtQueueElement *next_guest_avail_elem;
455
+
456
+ /* Next head to expose to the device */
457
+ uint16_t shadow_avail_idx;
458
+
459
+ /* Next free descriptor */
460
+ uint16_t free_head;
461
+
462
+ /* Last seen used idx */
463
+ uint16_t shadow_used_idx;
464
+
465
+ /* Next head to consume from the device */
466
+ uint16_t last_used_idx;
467
} VhostShadowVirtqueue;
468
469
bool vhost_svq_valid_features(uint64_t features, Error **errp);
470
@@ -XXX,XX +XXX,XX @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
471
size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
472
size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
473
474
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
475
+ VirtQueue *vq);
476
void vhost_svq_stop(VhostShadowVirtqueue *svq);
477
478
VhostShadowVirtqueue *vhost_svq_new(void);
479
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
480
index XXXXXXX..XXXXXXX 100644
481
--- a/hw/virtio/vhost-vdpa.c
482
+++ b/hw/virtio/vhost-vdpa.c
483
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
484
* Note that this function does not rewind kick file descriptor if cannot set
485
* call one.
486
*/
487
-static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
488
- VhostShadowVirtqueue *svq, unsigned idx,
489
- Error **errp)
490
+static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
491
+ VhostShadowVirtqueue *svq, unsigned idx,
492
+ Error **errp)
493
{
494
struct vhost_vring_file file = {
495
.index = dev->vq_index + idx,
496
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
497
r = vhost_vdpa_set_vring_dev_kick(dev, &file);
498
if (unlikely(r != 0)) {
499
error_setg_errno(errp, -r, "Can't set device kick fd");
500
- return false;
501
+ return r;
502
}
503
504
event_notifier = &svq->hdev_call;
505
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
506
error_setg_errno(errp, -r, "Can't set device call fd");
507
}
508
509
+ return r;
510
+}
511
+
512
+/**
513
+ * Unmap a SVQ area in the device
514
+ */
515
+static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
516
+ hwaddr size)
517
+{
518
+ int r;
519
+
520
+ size = ROUND_UP(size, qemu_real_host_page_size);
521
+ r = vhost_vdpa_dma_unmap(v, iova, size);
522
+ return r == 0;
523
+}
524
+
525
+static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
526
+ const VhostShadowVirtqueue *svq)
527
+{
528
+ struct vhost_vdpa *v = dev->opaque;
529
+ struct vhost_vring_addr svq_addr;
530
+ size_t device_size = vhost_svq_device_area_size(svq);
531
+ size_t driver_size = vhost_svq_driver_area_size(svq);
532
+ bool ok;
533
+
534
+ vhost_svq_get_vring_addr(svq, &svq_addr);
535
+
536
+ ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
537
+ if (unlikely(!ok)) {
538
+ return false;
539
+ }
540
+
541
+ return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
542
+}
543
+
544
+/**
545
+ * Map the shadow virtqueue rings in the device
546
+ *
547
+ * @dev: The vhost device
548
+ * @svq: The shadow virtqueue
549
+ * @addr: Assigned IOVA addresses
550
+ * @errp: Error pointer
551
+ */
552
+static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
553
+ const VhostShadowVirtqueue *svq,
554
+ struct vhost_vring_addr *addr,
555
+ Error **errp)
556
+{
557
+ struct vhost_vdpa *v = dev->opaque;
558
+ size_t device_size = vhost_svq_device_area_size(svq);
559
+ size_t driver_size = vhost_svq_driver_area_size(svq);
560
+ int r;
561
+
562
+ ERRP_GUARD();
563
+ vhost_svq_get_vring_addr(svq, addr);
564
+
565
+ r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
566
+ (void *)(uintptr_t)addr->desc_user_addr, true);
567
+ if (unlikely(r != 0)) {
568
+ error_setg_errno(errp, -r, "Cannot create vq driver region: ");
569
+ return false;
570
+ }
571
+
572
+ r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
573
+ (void *)(intptr_t)addr->used_user_addr, false);
574
+ if (unlikely(r != 0)) {
575
+ error_setg_errno(errp, -r, "Cannot create vq device region: ");
576
+ }
577
+
578
+ return r == 0;
579
+}
580
+
581
+static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
582
+ VhostShadowVirtqueue *svq, unsigned idx,
583
+ Error **errp)
584
+{
585
+ uint16_t vq_index = dev->vq_index + idx;
586
+ struct vhost_vring_state s = {
587
+ .index = vq_index,
588
+ };
589
+ int r;
590
+
591
+ r = vhost_vdpa_set_dev_vring_base(dev, &s);
592
+ if (unlikely(r)) {
593
+ error_setg_errno(errp, -r, "Cannot set vring base");
594
+ return false;
595
+ }
596
+
597
+ r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
598
return r == 0;
599
}
600
601
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
602
}
603
604
for (i = 0; i < v->shadow_vqs->len; ++i) {
605
+ VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
606
VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
607
+ struct vhost_vring_addr addr = {
608
+ .index = i,
609
+ };
610
+ int r;
611
bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
612
if (unlikely(!ok)) {
613
- error_reportf_err(err, "Cannot setup SVQ %u: ", i);
614
+ goto err;
615
+ }
616
+
617
+ vhost_svq_start(svq, dev->vdev, vq);
618
+ ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
619
+ if (unlikely(!ok)) {
620
+ goto err_map;
621
+ }
622
+
623
+ /* Override vring GPA set by vhost subsystem */
624
+ r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
625
+ if (unlikely(r != 0)) {
626
+ error_setg_errno(&err, -r, "Cannot set device address");
627
+ goto err_set_addr;
628
+ }
629
+ }
630
+
631
+ return true;
632
+
633
+err_set_addr:
634
+ vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
635
+
636
+err_map:
637
+ vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
638
+
639
+err:
640
+ error_reportf_err(err, "Cannot setup SVQ %u: ", i);
641
+ for (unsigned j = 0; j < i; ++j) {
642
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
643
+ vhost_vdpa_svq_unmap_rings(dev, svq);
644
+ vhost_svq_stop(svq);
645
+ }
646
+
647
+ return false;
648
+}
649
+
650
+static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
651
+{
652
+ struct vhost_vdpa *v = dev->opaque;
653
+
654
+ if (!v->shadow_vqs) {
655
+ return true;
656
+ }
657
+
658
+ for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
659
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
660
+ bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
661
+ if (unlikely(!ok)) {
662
return false;
663
}
664
}
665
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
666
}
667
vhost_vdpa_set_vring_ready(dev);
200
} else {
668
} else {
201
/* legacy descriptor */
669
+ ok = vhost_vdpa_svqs_stop(dev);
202
e1000e_process_ts_option(core, dp);
670
+ if (unlikely(!ok)) {
203
- tx->props.cptse = 0;
671
+ return -1;
204
+ tx->cptse = 0;
672
+ }
673
vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
205
}
674
}
206
675
207
addr = le64_to_cpu(dp->buffer_addr);
208
@@ -XXX,XX +XXX,XX @@ e1000e_process_tx_desc(E1000ECore *core,
209
tx->skip_cp = false;
210
net_tx_pkt_reset(tx->tx_pkt);
211
212
- tx->props.sum_needed = 0;
213
- tx->props.cptse = 0;
214
+ tx->sum_needed = 0;
215
+ tx->cptse = 0;
216
}
217
}
218
219
diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
220
index XXXXXXX..XXXXXXX 100644
221
--- a/hw/net/e1000e_core.h
222
+++ b/hw/net/e1000e_core.h
223
@@ -XXX,XX +XXX,XX @@ struct E1000Core {
224
e1000x_txd_props props;
225
226
bool skip_cp;
227
+ unsigned char sum_needed;
228
+ bool cptse;
229
struct NetTxPkt *tx_pkt;
230
} tx[E1000E_NUM_QUEUES];
231
232
diff --git a/hw/net/e1000x_common.h b/hw/net/e1000x_common.h
233
index XXXXXXX..XXXXXXX 100644
234
--- a/hw/net/e1000x_common.h
235
+++ b/hw/net/e1000x_common.h
236
@@ -XXX,XX +XXX,XX @@ void e1000x_update_regs_on_autoneg_done(uint32_t *mac, uint16_t *phy);
237
void e1000x_increase_size_stats(uint32_t *mac, const int *size_regs, int size);
238
239
typedef struct e1000x_txd_props {
240
- unsigned char sum_needed;
241
uint8_t ipcss;
242
uint8_t ipcso;
243
uint16_t ipcse;
244
@@ -XXX,XX +XXX,XX @@ typedef struct e1000x_txd_props {
245
int8_t ip;
246
int8_t tcp;
247
bool tse;
248
- bool cptse;
249
} e1000x_txd_props;
250
251
void e1000x_read_tx_ctx_descr(struct e1000_context_desc *d,
252
--
2.7.4
From: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>

Instead of sunhme_crc32_le() using its own implementation, we can simply call
net_crc32_le() directly and apply the bit shift inline. A sketch of the
resulting inline form appears below.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
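As a standalone sketch: the CRC routine below mirrors the bit-serial
little-endian code this patch removes, and the register layout of the
hash filter is simplified for the example:

    #include <stdbool.h>
    #include <stdint.h>

    #define ETH_ALEN 6
    #define POLYNOMIAL_LE 0xedb88320

    /* Bit-serial little-endian CRC32, equivalent to net_crc32_le(). */
    static uint32_t crc32_le(const uint8_t *p, int len)
    {
        uint32_t crc = 0xffffffff;
        for (int i = 0; i < len; i++) {
            uint8_t b = p[i];
            for (int j = 0; j < 8; j++) {
                int carry = (crc & 0x1) ^ (b & 0x01);
                crc >>= 1;
                b >>= 1;
                if (carry) {
                    crc ^= POLYNOMIAL_LE;
                }
            }
        }
        return crc;
    }

    /* The filter check then reduces to the inline shift used in the patch. */
    static bool hash_filter_match(const uint16_t hashtab[4], const uint8_t *buf)
    {
        int mcast_idx = crc32_le(buf, ETH_ALEN) >> 26;   /* top 6 bits */
        /* 64 filter bits in four 16-bit registers, highest word first */
        return hashtab[3 - (mcast_idx >> 4)] & (1 << (mcast_idx & 0xf));
    }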
From: Eugenio Pérez <eperezma@redhat.com>

This iova tree function allows the caller to look for a hole in
allocated regions and returns a totally new translation for a given
translated address.

Its usage is mainly to allow devices to access qemu address space,
remapping guest's one into a new iova space where qemu can add chunks
of addresses. A simplified sketch of the hole search appears below.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
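A standalone sketch of the search for a hole between two neighboring
mappings. The boundary arithmetic mirrors the patch; the types, the
helper name and the example values are invented:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* size holds the last byte offset, as in DMAMap */
    typedef struct { uint64_t iova, size; } map_t;

    /* Check the free range between the mapping left of a hole (prev, may be
     * NULL) and the one right of it (this, may be NULL). */
    static bool hole_fits(const map_t *prev, const map_t *this,
                          uint64_t new_size, uint64_t iova_begin,
                          uint64_t *result)
    {
        uint64_t hole_start = prev ? prev->iova + prev->size + 1 : 0;
        uint64_t hole_last = this ? this->iova : UINT64_MAX;

        if (hole_start < iova_begin) {
            hole_start = iova_begin;
        }
        if (hole_last - hole_start > new_size) {
            *result = hole_start;
            return true;
        }
        return false;
    }

    int main(void)
    {
        map_t a = { .iova = 0x1000, .size = 0xfff }; /* 0x1000..0x1fff */
        map_t b = { .iova = 0x9000, .size = 0xfff };
        uint64_t iova;
        if (hole_fits(&a, &b, 0x2fff, 0x1000, &iova)) {
            printf("allocated at 0x%llx\n", (unsigned long long)iova); /* 0x2000 */
        }
        return 0;
    }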
---
 hw/net/sunhme.c | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

---
 include/qemu/iova-tree.h |  18 +++++++
 util/iova-tree.c         | 136 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+)
20
diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
15
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
16
--- a/hw/net/sunhme.c
22
--- a/include/qemu/iova-tree.h
17
+++ b/hw/net/sunhme.c
23
+++ b/include/qemu/iova-tree.h
18
@@ -XXX,XX +XXX,XX @@ static inline void sunhme_set_rx_ring_nr(SunHMEState *s, int i)
24
@@ -XXX,XX +XXX,XX @@
19
s->erxregs[HME_ERXI_RING >> 2] = ring;
25
#define IOVA_OK (0)
26
#define IOVA_ERR_INVALID (-1) /* Invalid parameters */
27
#define IOVA_ERR_OVERLAP (-2) /* IOVA range overlapped */
28
+#define IOVA_ERR_NOMEM (-3) /* Cannot allocate */
29
30
typedef struct IOVATree IOVATree;
31
typedef struct DMAMap {
32
@@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova);
33
void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator);
34
35
/**
36
+ * iova_tree_alloc_map:
37
+ *
38
+ * @tree: the iova tree to allocate from
39
+ * @map: the new map (as translated addr & size) to allocate in the iova region
40
+ * @iova_begin: the minimum address of the allocation
41
+ * @iova_end: the maximum addressable direction of the allocation
42
+ *
43
+ * Allocates a new region of a given size, between iova_min and iova_max.
44
+ *
45
+ * Return: Same as iova_tree_insert, but cannot overlap and can return error if
46
+ * iova tree is out of free contiguous range. The caller gets the assigned iova
47
+ * in map->iova.
48
+ */
49
+int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin,
50
+ hwaddr iova_end);
51
+
52
+/**
53
* iova_tree_destroy:
54
*
55
* @tree: the iova tree to destroy
56
diff --git a/util/iova-tree.c b/util/iova-tree.c
57
index XXXXXXX..XXXXXXX 100644
58
--- a/util/iova-tree.c
59
+++ b/util/iova-tree.c
60
@@ -XXX,XX +XXX,XX @@ struct IOVATree {
61
GTree *tree;
62
};
63
64
+/* Args to pass to iova_tree_alloc foreach function. */
65
+struct IOVATreeAllocArgs {
66
+ /* Size of the desired allocation */
67
+ size_t new_size;
68
+
69
+ /* The minimum address allowed in the allocation */
70
+ hwaddr iova_begin;
71
+
72
+ /* Map at the left of the hole, can be NULL if "this" is first one */
73
+ const DMAMap *prev;
74
+
75
+ /* Map at the right of the hole, can be NULL if "prev" is the last one */
76
+ const DMAMap *this;
77
+
78
+ /* If found, we fill in the IOVA here */
79
+ hwaddr iova_result;
80
+
81
+ /* Whether have we found a valid IOVA */
82
+ bool iova_found;
83
+};
84
+
85
+/**
86
+ * Iterate args to the next hole
87
+ *
88
+ * @args: The alloc arguments
89
+ * @next: The next mapping in the tree. Can be NULL to signal the last one
90
+ */
91
+static void iova_tree_alloc_args_iterate(struct IOVATreeAllocArgs *args,
92
+ const DMAMap *next)
93
+{
94
+ args->prev = args->this;
95
+ args->this = next;
96
+}
97
+
98
static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
99
{
100
const DMAMap *m1 = a, *m2 = b;
101
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map)
102
return IOVA_OK;
20
}
103
}
21
104
22
-#define POLYNOMIAL_LE 0xedb88320
105
+/**
23
-static uint32_t sunhme_crc32_le(const uint8_t *p, int len)
106
+ * Try to find an unallocated IOVA range between prev and this elements.
24
-{
107
+ *
25
- uint32_t crc;
108
+ * @args: Arguments to allocation
26
- int carry, i, j;
109
+ *
27
- uint8_t b;
110
+ * Cases:
28
-
111
+ *
29
- crc = 0xffffffff;
112
+ * (1) !prev, !this: No entries allocated, always succeed
30
- for (i = 0; i < len; i++) {
113
+ *
31
- b = *p++;
114
+ * (2) !prev, this: We're iterating at the 1st element.
32
- for (j = 0; j < 8; j++) {
115
+ *
33
- carry = (crc & 0x1) ^ (b & 0x01);
116
+ * (3) prev, !this: We're iterating at the last element.
34
- crc >>= 1;
117
+ *
35
- b >>= 1;
118
+ * (4) prev, this: this is the most common case, we'll try to find a hole
36
- if (carry) {
119
+ * between "prev" and "this" mapping.
37
- crc = crc ^ POLYNOMIAL_LE;
120
+ *
38
- }
121
+ * Note that this function assumes the last valid iova is HWADDR_MAX, but it
39
- }
122
+ * searches linearly so it's easy to discard the result if it's not the case.
40
- }
123
+ */
41
-
124
+static void iova_tree_alloc_map_in_hole(struct IOVATreeAllocArgs *args)
42
- return crc;
125
+{
43
-}
126
+ const DMAMap *prev = args->prev, *this = args->this;
44
-
127
+ uint64_t hole_start, hole_last;
45
#define MIN_BUF_SIZE 60
128
+
46
129
+ if (this && this->iova + this->size < args->iova_begin) {
47
static ssize_t sunhme_receive(NetClientState *nc, const uint8_t *buf,
130
+ return;
48
@@ -XXX,XX +XXX,XX @@ static ssize_t sunhme_receive(NetClientState *nc, const uint8_t *buf,
131
+ }
49
trace_sunhme_rx_filter_bcast_match();
132
+
50
} else if (s->macregs[HME_MACI_RXCFG >> 2] & HME_MAC_RXCFG_HENABLE) {
133
+ hole_start = MAX(prev ? prev->iova + prev->size + 1 : 0, args->iova_begin);
51
/* Didn't match local address, check hash filter */
134
+ hole_last = this ? this->iova : HWADDR_MAX;
52
- int mcast_idx = sunhme_crc32_le(buf, 6) >> 26;
135
+
53
+ int mcast_idx = net_crc32_le(buf, ETH_ALEN) >> 26;
136
+ if (hole_last - hole_start > args->new_size) {
54
if (!(s->macregs[(HME_MACI_HASHTAB0 >> 2) - (mcast_idx >> 4)] &
137
+ args->iova_result = hole_start;
55
(1 << (mcast_idx & 0xf)))) {
138
+ args->iova_found = true;
56
/* Didn't match hash filter */
139
+ }
140
+}
141
+
142
+/**
143
+ * Foreach dma node in the tree, compare if there is a hole with its previous
144
+ * node (or minimum iova address allowed) and the node.
145
+ *
146
+ * @key: Node iterating
147
+ * @value: Node iterating
148
+ * @pargs: Struct to communicate with the outside world
149
+ *
150
+ * Return: false to keep iterating, true if needs break.
151
+ */
152
+static gboolean iova_tree_alloc_traverse(gpointer key, gpointer value,
153
+ gpointer pargs)
154
+{
155
+ struct IOVATreeAllocArgs *args = pargs;
156
+ DMAMap *node = value;
157
+
158
+ assert(key == value);
159
+
160
+ iova_tree_alloc_args_iterate(args, node);
161
+ iova_tree_alloc_map_in_hole(args);
162
+ return args->iova_found;
163
+}
164
+
165
+int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin,
166
+ hwaddr iova_last)
167
+{
168
+ struct IOVATreeAllocArgs args = {
169
+ .new_size = map->size,
170
+ .iova_begin = iova_begin,
171
+ };
172
+
173
+ if (unlikely(iova_last < iova_begin)) {
174
+ return IOVA_ERR_INVALID;
175
+ }
176
+
177
+ /*
178
+ * Find a valid hole for the mapping
179
+ *
180
+ * Assuming low iova_begin, so no need to do a binary search to
181
+ * locate the first node.
182
+ *
183
+ * TODO: Replace all this with g_tree_node_first/next/last when available
184
+ * (from glib since 2.68). To do it with g_tree_foreach complicates the
185
+ * code a lot.
186
+ *
187
+ */
188
+ g_tree_foreach(tree->tree, iova_tree_alloc_traverse, &args);
189
+ if (!args.iova_found) {
190
+ /*
191
+ * Either tree is empty or the last hole is still not checked.
192
+ * g_tree_foreach does not compare (last, iova_last] range, so we check
193
+ * it here.
194
+ */
195
+ iova_tree_alloc_args_iterate(&args, NULL);
196
+ iova_tree_alloc_map_in_hole(&args);
197
+ }
198
+
199
+ if (!args.iova_found || args.iova_result + map->size > iova_last) {
200
+ return IOVA_ERR_NOMEM;
201
+ }
202
+
203
+ map->iova = args.iova_result;
204
+ return iova_tree_insert(tree, map);
205
+}
206
+
207
void iova_tree_destroy(IOVATree *tree)
208
{
209
g_tree_destroy(tree->tree);
57
--
2.7.4
From: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>

Separate out the standard ethernet CRC32 calculation into a new net_crc32()
function, renaming the constant POLYNOMIAL to POLYNOMIAL_BE to make it clear
that this is a big-endian CRC32 calculation.

As part of the constant rename, remove the duplicate definition of POLYNOMIAL
from eepro100.c and use the new POLYNOMIAL_BE constant instead.

Once this is complete, remove the existing CRC32 implementation from
compute_mcast_idx() and call the new net_crc32() function in its place,
as sketched below.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
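A standalone sketch of the big-endian CRC and the 6-bit multicast index
derived from it, mirroring the code this patch moves into net_crc32();
the example MAC address is arbitrary:

    #include <stdint.h>
    #include <stdio.h>

    #define POLYNOMIAL_BE 0x04c11db6
    #define ETH_ALEN 6

    /* Bit-serial big-endian CRC32, as in the net_crc32() this patch adds. */
    static uint32_t crc32_be(const uint8_t *p, int len)
    {
        uint32_t crc = 0xffffffff;
        for (int i = 0; i < len; i++) {
            uint8_t b = p[i];
            for (int j = 0; j < 8; j++) {
                int carry = ((crc & 0x80000000UL) ? 1 : 0) ^ (b & 0x01);
                crc <<= 1;
                b >>= 1;
                if (carry) {
                    crc = (crc ^ POLYNOMIAL_BE) | carry;
                }
            }
        }
        return crc;
    }

    int main(void)
    {
        const uint8_t mac[ETH_ALEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };
        /* NICs keep only the top 6 bits as the multicast filter index. */
        printf("mcast_idx = %u\n", crc32_be(mac, ETH_ALEN) >> 26);
        return 0;
    }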
From: Eugenio Pérez <eperezma@redhat.com>

This function does the reverse operation of iova_tree_find: it looks
for a mapping that matches a translated address, so we can translate
back from qemu addresses to iova.

This has linear complexity instead of logarithmic, but it supports
overlapping HVA. Future developments could reduce it.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
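A hedged usage sketch, not part of the patch: how a caller could
recover an IOVA from a qemu VA range with the new function. The helper
name and the error convention are invented; the size convention follows
the DMAMap usage elsewhere in this series:

    #include "qemu/iova-tree.h"

    static hwaddr qemu_va_to_iova(const IOVATree *tree, void *va, size_t len)
    {
        const DMAMap needle = {
            .translated_addr = (hwaddr)(uintptr_t)va,
            .size = len,
        };
        const DMAMap *map = iova_tree_find_iova(tree, &needle);

        if (!map) {
            return HWADDR_MAX;   /* range is not mapped */
        }
        return map->iova + (needle.translated_addr - map->translated_addr);
    }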
---
 hw/net/eepro100.c |  4 +---
 include/net/net.h |  3 ++-
 net/net.c         | 16 +++++++++++-----
 3 files changed, 14 insertions(+), 9 deletions(-)

---
 include/qemu/iova-tree.h | 20 +++++++++++++++++++-
 util/iova-tree.c         | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)
diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c
17
diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
23
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
24
--- a/hw/net/eepro100.c
19
--- a/include/qemu/iova-tree.h
25
+++ b/hw/net/eepro100.c
20
+++ b/include/qemu/iova-tree.h
26
@@ -XXX,XX +XXX,XX @@ static const uint16_t eepro100_mdi_mask[] = {
21
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map);
27
0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
22
* @tree: the iova tree to search from
23
* @map: the mapping to search
24
*
25
- * Search for a mapping in the iova tree that overlaps with the
26
+ * Search for a mapping in the iova tree that iova overlaps with the
27
* mapping range specified. Only the first found mapping will be
28
* returned.
29
*
30
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map);
31
const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map);
32
33
/**
34
+ * iova_tree_find_iova:
35
+ *
36
+ * @tree: the iova tree to search from
37
+ * @map: the mapping to search
38
+ *
39
+ * Search for a mapping in the iova tree that translated_addr overlaps with the
40
+ * mapping range specified. Only the first found mapping will be
41
+ * returned.
42
+ *
43
+ * Return: DMAMap pointer if found, or NULL if not found. Note that
44
+ * the returned DMAMap pointer is maintained internally. User should
45
+ * only read the content but never modify or free the content. Also,
46
+ * user is responsible to make sure the pointer is valid (say, no
47
+ * concurrent deletion in progress).
48
+ */
49
+const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map);
50
+
51
+/**
52
* iova_tree_find_address:
53
*
54
* @tree: the iova tree to search from
55
diff --git a/util/iova-tree.c b/util/iova-tree.c
56
index XXXXXXX..XXXXXXX 100644
57
--- a/util/iova-tree.c
58
+++ b/util/iova-tree.c
59
@@ -XXX,XX +XXX,XX @@ struct IOVATreeAllocArgs {
60
bool iova_found;
28
};
61
};
29
62
30
-#define POLYNOMIAL 0x04c11db6
63
+typedef struct IOVATreeFindIOVAArgs {
31
-
64
+ const DMAMap *needle;
32
static E100PCIDeviceInfo *eepro100_get_class(EEPRO100State *s);
65
+ const DMAMap *result;
33
66
+} IOVATreeFindIOVAArgs;
34
/* From FreeBSD (locally modified). */
35
@@ -XXX,XX +XXX,XX @@ static unsigned e100_compute_mcast_idx(const uint8_t *ep)
36
crc <<= 1;
37
b >>= 1;
38
if (carry) {
39
- crc = ((crc ^ POLYNOMIAL) | carry);
40
+ crc = ((crc ^ POLYNOMIAL_BE) | carry);
41
}
42
}
43
}
44
diff --git a/include/net/net.h b/include/net/net.h
45
index XXXXXXX..XXXXXXX 100644
46
--- a/include/net/net.h
47
+++ b/include/net/net.h
48
@@ -XXX,XX +XXX,XX @@ NetClientState *net_hub_port_find(int hub_id);
49
50
void qdev_set_nic_properties(DeviceState *dev, NICInfo *nd);
51
52
-#define POLYNOMIAL 0x04c11db6
53
+#define POLYNOMIAL_BE 0x04c11db6
54
+uint32_t net_crc32(const uint8_t *p, int len);
55
unsigned compute_mcast_idx(const uint8_t *ep);
56
57
#define vmstate_offset_macaddr(_state, _field) \
58
diff --git a/net/net.c b/net/net.c
59
index XXXXXXX..XXXXXXX 100644
60
--- a/net/net.c
61
+++ b/net/net.c
62
@@ -XXX,XX +XXX,XX @@ int net_client_parse(QemuOptsList *opts_list, const char *optarg)
63
64
/* From FreeBSD */
65
/* XXX: optimize */
66
-unsigned compute_mcast_idx(const uint8_t *ep)
67
+uint32_t net_crc32(const uint8_t *p, int len)
68
{
69
uint32_t crc;
70
int carry, i, j;
71
uint8_t b;
72
73
crc = 0xffffffff;
74
- for (i = 0; i < 6; i++) {
75
- b = *ep++;
76
+ for (i = 0; i < len; i++) {
77
+ b = *p++;
78
for (j = 0; j < 8; j++) {
79
carry = ((crc & 0x80000000L) ? 1 : 0) ^ (b & 0x01);
80
crc <<= 1;
81
b >>= 1;
82
if (carry) {
83
- crc = ((crc ^ POLYNOMIAL) | carry);
84
+ crc = ((crc ^ POLYNOMIAL_BE) | carry);
85
}
86
}
87
}
88
- return crc >> 26;
89
+
67
+
90
+ return crc;
68
/**
69
* Iterate args to the next hole
70
*
71
@@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map)
72
return g_tree_lookup(tree->tree, map);
73
}
74
75
+static gboolean iova_tree_find_address_iterator(gpointer key, gpointer value,
76
+ gpointer data)
77
+{
78
+ const DMAMap *map = key;
79
+ IOVATreeFindIOVAArgs *args = data;
80
+ const DMAMap *needle;
81
+
82
+ g_assert(key == value);
83
+
84
+ needle = args->needle;
85
+ if (map->translated_addr + map->size < needle->translated_addr ||
86
+ needle->translated_addr + needle->size < map->translated_addr) {
87
+ return false;
88
+ }
89
+
90
+ args->result = map;
91
+ return true;
91
+}
92
+}
92
+
93
+
93
+unsigned compute_mcast_idx(const uint8_t *ep)
94
+const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map)
94
+{
95
+{
95
+ return net_crc32(ep, ETH_ALEN) >> 26;
96
+ IOVATreeFindIOVAArgs args = {
96
}
97
+ .needle = map,
97
98
+ };
98
QemuOptsList qemu_netdev_opts = {
99
+
100
+ g_tree_foreach(tree->tree, iova_tree_find_address_iterator, &args);
101
+ return args.result;
102
+}
103
+
104
const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova)
105
{
106
const DMAMap map = { .iova = iova, .size = 0 };
99
--
2.7.4
From: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>

This makes it much easier to compare the multicast CRC calculation endian and
bitshift against the Linux driver implementation.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: Jason Wang <jasowang@redhat.com>
From: Eugenio Pérez <eperezma@redhat.com>

This tree is able to look for a translated address from an IOVA address.

At first glance it is similar to util/iova-tree. However, SVQ working on
devices with limited IOVA space needs more capabilities, like allocating
IOVA chunks or performing reverse translations (qemu addresses to iova).

The allocation capability, as "assign a free IOVA address to this chunk
of memory in qemu's address space", allows the shadow virtqueue to
create a new address space that is not restricted by the guest's
addressable one, so we can allocate shadow vqs vrings outside of it.

It duplicates the tree so it can search efficiently in both directions,
and it will signal overlap if the iova or the translated address is
present in either tree. A short usage sketch follows.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
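A short usage sketch under the same assumptions. The helper is
invented; the API calls are the ones declared in this patch, and the
inclusive size convention is assumed from the allocator:

    #include "hw/virtio/vhost-iova-tree.h"

    /* Allocate an IOVA for a qemu buffer, then check that the reverse
     * translation finds it again. */
    static bool map_buffer(VhostIOVATree *tree, void *buf, size_t size)
    {
        DMAMap map = {
            .translated_addr = (hwaddr)(uintptr_t)buf,
            .size = size - 1,        /* assumed inclusive last offset */
            .perm = IOMMU_RW,
        };

        if (vhost_iova_tree_map_alloc(tree, &map) != IOVA_OK) {
            return false;            /* no free contiguous IOVA range */
        }
        /* map.iova now holds the assigned address */
        return vhost_iova_tree_find_iova(tree, &map) != NULL;
    }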
---
 hw/net/ftgmac100.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

---
 hw/virtio/meson.build       |   2 +-
 hw/virtio/vhost-iova-tree.c | 110 ++++++++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-iova-tree.h |  27 +++++++++++
 3 files changed, 138 insertions(+), 1 deletion(-)
 create mode 100644 hw/virtio/vhost-iova-tree.c
 create mode 100644 hw/virtio/vhost-iova-tree.h
diff --git a/hw/net/ftgmac100.c b/hw/net/ftgmac100.c
29
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
13
index XXXXXXX..XXXXXXX 100644
30
index XXXXXXX..XXXXXXX 100644
14
--- a/hw/net/ftgmac100.c
31
--- a/hw/virtio/meson.build
15
+++ b/hw/net/ftgmac100.c
32
+++ b/hw/virtio/meson.build
16
@@ -XXX,XX +XXX,XX @@ static int ftgmac100_filter(FTGMAC100State *s, const uint8_t *buf, size_t len)
33
@@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
17
}
34
18
35
virtio_ss = ss.source_set()
19
/* TODO: this does not seem to work for ftgmac100 */
36
virtio_ss.add(files('virtio.c'))
20
- mcast_idx = compute_mcast_idx(buf);
37
-virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
21
+ mcast_idx = net_crc32(buf, ETH_ALEN) >> 26;
38
+virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c'))
22
if (!(s->math[mcast_idx / 32] & (1 << (mcast_idx % 32)))) {
39
virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
23
return 0;
40
virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
24
}
41
virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
42
diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
43
new file mode 100644
44
index XXXXXXX..XXXXXXX
45
--- /dev/null
46
+++ b/hw/virtio/vhost-iova-tree.c
47
@@ -XXX,XX +XXX,XX @@
48
+/*
49
+ * vhost software live migration iova tree
50
+ *
51
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
52
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
53
+ *
54
+ * SPDX-License-Identifier: GPL-2.0-or-later
55
+ */
56
+
57
+#include "qemu/osdep.h"
58
+#include "qemu/iova-tree.h"
59
+#include "vhost-iova-tree.h"
60
+
61
+#define iova_min_addr qemu_real_host_page_size
62
+
63
+/**
64
+ * VhostIOVATree, able to:
65
+ * - Translate iova address
66
+ * - Reverse translate iova address (from translated to iova)
67
+ * - Allocate IOVA regions for translated range (linear operation)
68
+ */
69
+struct VhostIOVATree {
70
+ /* First addressable iova address in the device */
71
+ uint64_t iova_first;
72
+
73
+ /* Last addressable iova address in the device */
74
+ uint64_t iova_last;
75
+
76
+ /* IOVA address to qemu memory maps. */
77
+ IOVATree *iova_taddr_map;
78
+};
79
+
80
+/**
81
+ * Create a new IOVA tree
82
+ *
83
+ * Returns the new IOVA tree
84
+ */
85
+VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last)
86
+{
87
+ VhostIOVATree *tree = g_new(VhostIOVATree, 1);
88
+
89
+ /* Some devices do not like 0 addresses */
90
+ tree->iova_first = MAX(iova_first, iova_min_addr);
91
+ tree->iova_last = iova_last;
92
+
93
+ tree->iova_taddr_map = iova_tree_new();
94
+ return tree;
95
+}
96
+
97
+/**
98
+ * Delete an iova tree
99
+ */
100
+void vhost_iova_tree_delete(VhostIOVATree *iova_tree)
101
+{
102
+ iova_tree_destroy(iova_tree->iova_taddr_map);
103
+ g_free(iova_tree);
104
+}
105
+
106
+/**
107
+ * Find the IOVA address stored from a memory address
108
+ *
109
+ * @tree: The iova tree
110
+ * @map: The map with the memory address
111
+ *
112
+ * Return the stored mapping, or NULL if not found.
113
+ */
114
+const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree,
115
+ const DMAMap *map)
116
+{
117
+ return iova_tree_find_iova(tree->iova_taddr_map, map);
118
+}
119
+
120
+/**
121
+ * Allocate a new mapping
122
+ *
123
+ * @tree: The iova tree
124
+ * @map: The iova map
125
+ *
126
+ * Returns:
127
+ * - IOVA_OK if the map fits in the container
128
+ * - IOVA_ERR_INVALID if the map does not make sense (like size overflow)
129
+ * - IOVA_ERR_NOMEM if tree cannot allocate more space.
130
+ *
131
+ * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK.
132
+ */
133
+int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map)
134
+{
135
+ /* Some vhost devices do not like addr 0. Skip first page */
136
+ hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size;
137
+
138
+ if (map->translated_addr + map->size < map->translated_addr ||
139
+ map->perm == IOMMU_NONE) {
140
+ return IOVA_ERR_INVALID;
141
+ }
142
+
143
+ /* Allocate a node in IOVA address */
144
+ return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first,
145
+ tree->iova_last);
146
+}
147
+
148
+/**
149
+ * Remove existing mappings from iova tree
150
+ *
151
+ * @iova_tree: The vhost iova tree
152
+ * @map: The map to remove
153
+ */
154
+void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map)
155
+{
156
+ iova_tree_remove(iova_tree->iova_taddr_map, map);
157
+}
158
diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
159
new file mode 100644
160
index XXXXXXX..XXXXXXX
161
--- /dev/null
162
+++ b/hw/virtio/vhost-iova-tree.h
163
@@ -XXX,XX +XXX,XX @@
164
+/*
165
+ * vhost software live migration iova tree
166
+ *
167
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
168
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
169
+ *
170
+ * SPDX-License-Identifier: GPL-2.0-or-later
171
+ */
172
+
173
+#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H
174
+#define HW_VIRTIO_VHOST_IOVA_TREE_H
175
+
176
+#include "qemu/iova-tree.h"
177
+#include "exec/memory.h"
178
+
179
+typedef struct VhostIOVATree VhostIOVATree;
180
+
181
+VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last);
182
+void vhost_iova_tree_delete(VhostIOVATree *iova_tree);
183
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete);
184
+
185
+const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree,
186
+ const DMAMap *map);
187
+int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map);
188
+void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map);
189
+
190
+#endif
25
--
2.7.4
From: Ed Swierk via Qemu-devel <qemu-devel@nongnu.org>

The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.

Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.

One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.) The offset arithmetic is illustrated
below.

Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.

Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>

From: Eugenio Pérez <eperezma@redhat.com>

Use translations added in VhostIOVATree in SVQ.

Only introduce usage here, not allocation and deallocation. As with
previous patches, we use the dead code paths of shadow_vqs_enabled to
avoid committing too many changes at once. These are impossible to take
at the moment.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
12
Signed-off-by: Jason Wang <jasowang@redhat.com>
36
---
13
---
37
hw/net/e1000.c | 70 +++++++++++++++++++++++++++++++++-------------------------
14
hw/virtio/vhost-shadow-virtqueue.c | 86 +++++++++++++++++++++++---
38
1 file changed, 40 insertions(+), 30 deletions(-)
15
hw/virtio/vhost-shadow-virtqueue.h | 6 +-
39
16
hw/virtio/vhost-vdpa.c | 122 +++++++++++++++++++++++++++++++------
40
diff --git a/hw/net/e1000.c b/hw/net/e1000.c
17
include/hw/virtio/vhost-vdpa.h | 3 +
18
4 files changed, 187 insertions(+), 30 deletions(-)
19
20
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
41
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
42
--- a/hw/net/e1000.c
22
--- a/hw/virtio/vhost-shadow-virtqueue.c
43
+++ b/hw/net/e1000.c
23
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
     return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
 }
 
-static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
+/**
+ * Translate addresses between qemu's virtual address space and the SVQ IOVA
+ * space.
+ *
+ * @svq: Shadow VirtQueue
+ * @addrs: Destination array for the translated IOVA addresses
+ * @iovec: Source qemu's VA addresses
+ * @num: Length of iovec and minimum length of addrs
+ */
+static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
+                                     hwaddr *addrs, const struct iovec *iovec,
+                                     size_t num)
+{
+    if (num == 0) {
+        return true;
+    }
+
+    for (size_t i = 0; i < num; ++i) {
+        DMAMap needle = {
+            .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
+            .size = iovec[i].iov_len,
+        };
+        Int128 needle_last, map_last;
+        size_t off;
+
+        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
+        /*
+         * Map cannot be NULL, since the iova map contains all guest space
+         * and qemu already has a physical address mapped
+         */
+        if (unlikely(!map)) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
+                          needle.translated_addr);
+            return false;
+        }
+
+        off = needle.translated_addr - map->translated_addr;
+        addrs[i] = map->iova + off;
+
+        needle_last = int128_add(int128_make64(needle.translated_addr),
+                                 int128_make64(iovec[i].iov_len));
+        map_last = int128_make64(map->translated_addr + map->size);
+        if (unlikely(int128_gt(needle_last, map_last))) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "Guest buffer expands over iova range");
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
                                     const struct iovec *iovec, size_t num,
                                     bool more_descs, bool write)
 {
@@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
         } else {
             descs[i].flags = flags;
         }
-        descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base);
+        descs[i].addr = cpu_to_le64(sg[n]);
         descs[i].len = cpu_to_le32(iovec[n].iov_len);
 
         last = i;
@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
 {
     unsigned avail_idx;
     vring_avail_t *avail = svq->vring.avail;
+    bool ok;
+    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num));
 
     *head = svq->free_head;
 
@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
         return false;
     }
 
-    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0,
-                            false);
-    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
+    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
+    if (unlikely(!ok)) {
+        return false;
+    }
+    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
+                            elem->in_num > 0, false);
+
+    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
+    if (unlikely(!ok)) {
+        return false;
+    }
+
+    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
 
     /*
      * Put the entry in the available array (but don't update avail->idx until
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
 void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
                               struct vhost_vring_addr *addr)
 {
-    addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc;
-    addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail;
-    addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used;
+    addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc;
+    addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail;
+    addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used;
 }
 
 size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
@@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
  * Creates vhost shadow virtqueue, and instructs the vhost device to use the
  * shadow methods and file descriptors.
  *
+ * @iova_tree: Tree to perform descriptor translations
+ *
  * Returns the new virtqueue or NULL.
  *
  * In case of error, reason is reported through error_report.
  */
-VhostShadowVirtqueue *vhost_svq_new(void)
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
 {
     g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
     int r;
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
 
     event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
     event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
+    svq->iova_tree = iova_tree;
     return g_steal_pointer(&svq);
 
 err_init_hdev_call:
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
 #include "qemu/event_notifier.h"
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"
+#include "hw/virtio/vhost-iova-tree.h"
 
 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
     /* Virtio device */
     VirtIODevice *vdev;
 
+    /* IOVA mapping */
+    VhostIOVATree *iova_tree;
+
     /* Map for use the guest's descriptors */
     VirtQueueElement **ring_id_maps;
 
@@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                      VirtQueue *vq);
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
-VhostShadowVirtqueue *vhost_svq_new(void);
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
 
 void vhost_svq_free(gpointer vq);
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                          vaddr, section->readonly);
 
     llsize = int128_sub(llend, int128_make64(iova));
+    if (v->shadow_vqs_enabled) {
+        DMAMap mem_region = {
+            .translated_addr = (hwaddr)(uintptr_t)vaddr,
+            .size = int128_get64(llsize) - 1,
+            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
+        };
+
+        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
+        if (unlikely(r != IOVA_OK)) {
+            error_report("Can't allocate a mapping (%d)", r);
+            goto fail;
+        }
+
+        iova = mem_region.iova;
+    }
 
     vhost_vdpa_iotlb_batch_begin_once(v);
     ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
 
     llsize = int128_sub(llend, int128_make64(iova));
 
+    if (v->shadow_vqs_enabled) {
+        const DMAMap *result;
+        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
+            section->offset_within_region +
+            (iova - section->offset_within_address_space);
+        DMAMap mem_region = {
+            .translated_addr = (hwaddr)(uintptr_t)vaddr,
+            .size = int128_get64(llsize) - 1,
+        };
+
+        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
+        iova = result->iova;
+        vhost_iova_tree_remove(v->iova_tree, &mem_region);
+    }
     vhost_vdpa_iotlb_batch_begin_once(v);
     ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
     if (ret) {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
 
     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
     for (unsigned n = 0; n < hdev->nvqs; ++n) {
-        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
+        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
 
         if (unlikely(!svq)) {
             error_setg(errp, "Cannot create svq %u", n);
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
 /**
  * Unmap a SVQ area in the device
  */
-static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
-                                      hwaddr size)
+static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
+                                      const DMAMap *needle)
 {
+    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
+    hwaddr size;
     int r;
 
-    size = ROUND_UP(size, qemu_real_host_page_size);
-    r = vhost_vdpa_dma_unmap(v, iova, size);
+    if (unlikely(!result)) {
+        error_report("Unable to find SVQ address to unmap");
+        return false;
+    }
+
+    size = ROUND_UP(result->size, qemu_real_host_page_size);
+    r = vhost_vdpa_dma_unmap(v, result->iova, size);
     return r == 0;
 }
 
 static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                        const VhostShadowVirtqueue *svq)
 {
+    DMAMap needle = {};
     struct vhost_vdpa *v = dev->opaque;
     struct vhost_vring_addr svq_addr;
-    size_t device_size = vhost_svq_device_area_size(svq);
-    size_t driver_size = vhost_svq_driver_area_size(svq);
     bool ok;
 
     vhost_svq_get_vring_addr(svq, &svq_addr);
 
-    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
+    needle.translated_addr = svq_addr.desc_user_addr;
+    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
     if (unlikely(!ok)) {
         return false;
     }
 
-    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
+    needle.translated_addr = svq_addr.used_user_addr;
+    return vhost_vdpa_svq_unmap_ring(v, &needle);
+}
+
+/**
+ * Map the SVQ area in the device
+ *
+ * @v: Vhost-vdpa device
+ * @needle: The area to allocate an IOVA for and map
+ * @errp: Error pointer
+ */
+static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
+                                    Error **errp)
+{
+    int r;
+
+    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
+    if (unlikely(r != IOVA_OK)) {
+        error_setg(errp, "Cannot allocate iova (%d)", r);
+        return false;
+    }
+
+    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
+                           (void *)(uintptr_t)needle->translated_addr,
+                           needle->perm == IOMMU_RO);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Cannot map region to device");
+        vhost_iova_tree_remove(v->iova_tree, needle);
+    }
+
+    return r == 0;
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                      struct vhost_vring_addr *addr,
                                      Error **errp)
 {
+    DMAMap device_region, driver_region;
+    struct vhost_vring_addr svq_addr;
     struct vhost_vdpa *v = dev->opaque;
     size_t device_size = vhost_svq_device_area_size(svq);
     size_t driver_size = vhost_svq_driver_area_size(svq);
-    int r;
+    size_t avail_offset;
+    bool ok;
 
     ERRP_GUARD();
-    vhost_svq_get_vring_addr(svq, addr);
+    vhost_svq_get_vring_addr(svq, &svq_addr);
 
-    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
-                           (void *)(uintptr_t)addr->desc_user_addr, true);
-    if (unlikely(r != 0)) {
-        error_setg_errno(errp, -r, "Cannot create vq driver region: ");
+    driver_region = (DMAMap) {
+        .translated_addr = svq_addr.desc_user_addr,
+        .size = driver_size - 1,
+        .perm = IOMMU_RO,
+    };
+    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
+    if (unlikely(!ok)) {
+        error_prepend(errp, "Cannot create vq driver region: ");
         return false;
     }
+    addr->desc_user_addr = driver_region.iova;
+    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
+    addr->avail_user_addr = driver_region.iova + avail_offset;
 
-    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
-                           (void *)(intptr_t)addr->used_user_addr, false);
-    if (unlikely(r != 0)) {
-        error_setg_errno(errp, -r, "Cannot create vq device region: ");
+    device_region = (DMAMap) {
+        .translated_addr = svq_addr.used_user_addr,
+        .size = device_size - 1,
+        .perm = IOMMU_RW,
+    };
+    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
+    if (unlikely(!ok)) {
+        error_prepend(errp, "Cannot create vq device region: ");
+        vhost_vdpa_svq_unmap_ring(v, &driver_region);
     }
+    addr->used_user_addr = device_region.iova;
 
-    return r == 0;
+    return ok;
 }
 
 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@
 
 #include <gmodule.h>
 
+#include "hw/virtio/vhost-iova-tree.h"
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"
 
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
     MemoryListener listener;
     struct vhost_vdpa_iova_range iova_range;
     bool shadow_vqs_enabled;
+    /* IOVA mapping used by the Shadow Virtqueue */
+    VhostIOVATree *iova_tree;
     GPtrArray *shadow_vqs;
     struct vhost_dev *dev;
     VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
--
2.7.4

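The translation above boils down to an interval lookup plus an offset into
the matched range. Below is a stand-alone sketch of the same computation
that vhost_svq_translate_addr() performs through vhost_iova_tree_find_iova().
It is illustration only, not QEMU code: every demo_* name and the sample
addresses are invented, and a linear scan stands in for the IOVA tree.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        uint64_t translated_addr;   /* start of the host VA range */
        uint64_t size;              /* range size minus 1, as in DMAMap */
        uint64_t iova;              /* IOVA the device uses for this range */
    } DemoMap;

    /* A linear scan stands in for the vhost_iova_tree_find_iova() lookup */
    static const DemoMap *demo_find(const DemoMap *maps, size_t n, uint64_t va)
    {
        for (size_t i = 0; i < n; i++) {
            if (va >= maps[i].translated_addr &&
                va <= maps[i].translated_addr + maps[i].size) {
                return &maps[i];
            }
        }
        return NULL;
    }

    static bool demo_translate(const DemoMap *maps, size_t n,
                               uint64_t va, uint64_t *iova)
    {
        const DemoMap *map = demo_find(maps, n, va);

        if (!map) {
            return false;   /* address not covered by any mapping */
        }
        /* Same offset computation as the patch: iova + (va - range start) */
        *iova = map->iova + (va - map->translated_addr);
        return true;
    }

    int main(void)
    {
        const DemoMap maps[] = {
            { .translated_addr = 0x7f0000000000, .size = 0xfffff, .iova = 0x1000 },
        };
        uint64_t iova;

        if (demo_translate(maps, 1, 0x7f0000000abc, &iova)) {
            printf("iova = 0x%llx\n", (unsigned long long)iova);   /* 0x1abc */
        }
        return 0;
    }

Translating 0x7f0000000abc against a mapping that starts at 0x7f0000000000
with IOVA 0x1000 yields 0x1abc, which is the address the SVQ would write
into the descriptor.
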
Deleted patch
From: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>

Instead of lnc_mchash() using its own implementation, we can simply call
net_crc32_le() directly and apply the bit shift inline.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/pcnet.c | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/hw/net/pcnet.c b/hw/net/pcnet.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/pcnet.c
+++ b/hw/net/pcnet.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "hw/qdev.h"
 #include "net/net.h"
+#include "net/eth.h"
 #include "qemu/timer.h"
 #include "qemu/sockets.h"
 #include "sysemu/sysemu.h"
@@ -XXX,XX +XXX,XX @@ static inline void pcnet_rmd_store(PCNetState *s, struct pcnet_RMD *rmd,
            be16_to_cpu(hdr->ether_type)); \
 } while (0)
 
-#define MULTICAST_FILTER_LEN 8
-
-static inline uint32_t lnc_mchash(const uint8_t *ether_addr)
-{
-#define LNC_POLYNOMIAL 0xEDB88320UL
-    uint32_t crc = 0xFFFFFFFF;
-    int idx, bit;
-    uint8_t data;
-
-    for (idx = 0; idx < 6; idx++) {
-        for (data = *ether_addr++, bit = 0; bit < MULTICAST_FILTER_LEN; bit++) {
-            crc = (crc >> 1) ^ (((crc ^ data) & 1) ? LNC_POLYNOMIAL : 0);
-            data >>= 1;
-        }
-    }
-    return crc;
-#undef LNC_POLYNOMIAL
-}
-
 #define CRC(crc, ch)     (crc = (crc >> 8) ^ crctab[(crc ^ (ch)) & 0xff])
 
 /* generated using the AUTODIN II polynomial
@@ -XXX,XX +XXX,XX @@ static inline int ladr_match(PCNetState *s, const uint8_t *buf, int size)
         s->csr[10] & 0xff, s->csr[10] >> 8,
         s->csr[11] & 0xff, s->csr[11] >> 8
     };
-    int index = lnc_mchash(hdr->ether_dhost) >> 26;
+    int index = net_crc32_le(hdr->ether_dhost, ETH_ALEN) >> 26;
     return !!(ladr[index >> 3] & (1 << (index & 7)));
 }
 return 0;
--
2.7.4

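For reference, net_crc32_le() is the reflected CRC-32 that the removed
lnc_mchash() computed by hand. Below is a stand-alone sketch of that
calculation, assuming polynomial 0xEDB88320, initial value 0xFFFFFFFF and
no final inversion (the same parameters as the deleted code above); the
demo_ name and the sample MAC address are invented:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Reflected ("little-endian") CRC-32: poly 0xEDB88320, init 0xFFFFFFFF,
     * no final inversion, matching the deleted lnc_mchash() above. */
    static uint32_t demo_crc32_le(const uint8_t *buf, size_t len)
    {
        uint32_t crc = 0xffffffff;

        for (size_t i = 0; i < len; i++) {
            crc ^= buf[i];
            for (int bit = 0; bit < 8; bit++) {
                crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320 : 0);
            }
        }
        return crc;
    }

    int main(void)
    {
        const uint8_t mac[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };
        /* The top 6 bits of the CRC select one of 64 filter bits */
        unsigned idx = demo_crc32_le(mac, sizeof(mac)) >> 26;

        printf("multicast filter index: %u\n", idx);
        return 0;
    }

The >> 26 keeps the six most significant CRC bits, which is exactly the
filter index the pcnet hunk computes inline.
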
Deleted patch
From: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>

Instead of e100_compute_mcast_idx() using its own implementation, we can
simply call net_crc32() directly and apply the bit shift inline.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Reviewed-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/eepro100.c | 28 ++++------------------------
 1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/eepro100.c
+++ b/hw/net/eepro100.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/hw.h"
 #include "hw/pci/pci.h"
 #include "net/net.h"
+#include "net/eth.h"
 #include "hw/nvram/eeprom93xx.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/dma.h"
@@ -XXX,XX +XXX,XX @@ static const uint16_t eepro100_mdi_mask[] = {
 
 static E100PCIDeviceInfo *eepro100_get_class(EEPRO100State *s);
 
-/* From FreeBSD (locally modified). */
-static unsigned e100_compute_mcast_idx(const uint8_t *ep)
-{
-    uint32_t crc;
-    int carry, i, j;
-    uint8_t b;
-
-    crc = 0xffffffff;
-    for (i = 0; i < 6; i++) {
-        b = *ep++;
-        for (j = 0; j < 8; j++) {
-            carry = ((crc & 0x80000000L) ? 1 : 0) ^ (b & 0x01);
-            crc <<= 1;
-            b >>= 1;
-            if (carry) {
-                crc = ((crc ^ POLYNOMIAL_BE) | carry);
-            }
-        }
-    }
-    return (crc & BITS(7, 2)) >> 2;
-}
-
 /* Read a 16 bit control/status (CSR) register. */
 static uint16_t e100_read_reg2(EEPRO100State *s, E100RegisterOffset addr)
 {
@@ -XXX,XX +XXX,XX @@ static void set_multicast_list(EEPRO100State *s)
         uint8_t multicast_addr[6];
         pci_dma_read(&s->dev, s->cb_address + 10 + i, multicast_addr, 6);
         TRACE(OTHER, logout("multicast entry %s\n", nic_dump(multicast_addr, 6)));
-        unsigned mcast_idx = e100_compute_mcast_idx(multicast_addr);
+        unsigned mcast_idx = (net_crc32(multicast_addr, ETH_ALEN) &
+                              BITS(7, 2)) >> 2;
         assert(mcast_idx < 64);
         s->mult[mcast_idx >> 3] |= (1 << (mcast_idx & 7));
     }
@@ -XXX,XX +XXX,XX @@ static ssize_t nic_receive(NetClientState *nc, const uint8_t * buf, size_t size)
         if (s->configuration[21] & BIT(3)) {
             /* Multicast all bit is set, receive all multicast frames. */
         } else {
-            unsigned mcast_idx = e100_compute_mcast_idx(buf);
+            unsigned mcast_idx = (net_crc32(buf, ETH_ALEN) & BITS(7, 2)) >> 2;
             assert(mcast_idx < 64);
             if (s->mult[mcast_idx >> 3] & (1 << (mcast_idx & 7))) {
                 /* Multicast frame is allowed in hash table. */
--
2.7.4

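Unlike the devices that take the top six bits of the CRC, eepro100 takes
its filter index from bits 7..2, as the deleted e100_compute_mcast_idx()
shows. A tiny stand-alone illustration of that mask-and-shift follows;
DEMO_BITS mirrors what BITS(7, 2) is assumed to expand to (mask 0xfc), and
the CRC value is invented:

    #include <stdint.h>
    #include <stdio.h>

    /* DEMO_BITS(7, 2) selects bit positions 7 down to 2, i.e. mask 0xfc */
    #define DEMO_BITS(hi, lo) (((1u << ((hi) - (lo) + 1)) - 1) << (lo))

    int main(void)
    {
        uint32_t crc = 0x8f3b2c6d;      /* stand-in for a net_crc32() result */
        unsigned mcast_idx = (crc & DEMO_BITS(7, 2)) >> 2;  /* 6-bit index */

        printf("mask=0x%x index=%u\n", DEMO_BITS(7, 2), mcast_idx);
        return 0;
    }

Either way a 6-bit value in 0..63 comes out, which is why the assert in the
patch checks mcast_idx < 64.
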
From: Eugenio Pérez <eperezma@redhat.com>

This is needed to achieve migration, so the destination can restore its
index.

Setting base as last used idx, so destination will see as available all
the entries that the device did not use, including the in-flight
processing ones.

This is ok for networking, but other kinds of devices might have
problems with these retransmissions.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-vdpa.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                      struct vhost_vring_state *ring)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int ret;
 
+    if (v->shadow_vqs_enabled) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
+                                                      ring->index);
+
+        /*
+         * Setting base as last used idx, so destination will see as available
+         * all the entries that the device did not use, including the
+         * in-flight processing ones.
+         *
+         * TODO: This is ok for networking, but other kinds of devices might
+         * have problems with these retransmissions.
+         */
+        ring->num = svq->last_used_idx;
+        return 0;
+    }
+
     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
     return ret;
--
2.7.4

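A quick stand-alone illustration (not QEMU code) of why reporting
last_used_idx as the base re-exposes in-flight work: with free-running
16-bit ring indices, the entries in [used_idx, avail_idx) have been posted
by the driver but not yet consumed by the device, so a destination that
resumes from used_idx offers exactly those entries again. The numbers below
are invented:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Invented snapshot of a ring at migration time */
        uint16_t avail_idx = 107;   /* driver has made 107 buffers available */
        uint16_t used_idx  = 103;   /* device has marked 103 of them as used */

        /* Free-running indices: the subtraction wraps correctly at 2^16 */
        uint16_t in_flight = avail_idx - used_idx;

        printf("base=%u: %u in-flight entries re-exposed after migration\n",
               (unsigned)used_idx, (unsigned)in_flight);
        return 0;
    }

For a NIC those four entries are simply retransmitted, which is why the
commit message flags the approach as networking-safe but possibly
problematic for other device classes.
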
From: Eugenio Pérez <eperezma@redhat.com>

Setting the log address would make the device start reporting invalid
dirty memory because the SVQ vrings are located in qemu's memory.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-vdpa.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                    struct vhost_log *log)
 {
-    if (vhost_vdpa_one_time_request(dev)) {
+    struct vhost_vdpa *v = dev->opaque;
+    if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) {
         return 0;
     }
 
--
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

SVQ is able to log the dirty bits by itself, so let's use it to not
block migration.

Also, ignore set and clear of VHOST_F_LOG_ALL on set_features if SVQ is
enabled. Even if the device supports it, the reports would be nonsense
because SVQ memory is in the qemu region.

The log region is still allocated. Future changes might skip that, but
this series is already long enough.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-vdpa.c         | 39 +++++++++++++++++++++++++++++++++++----
 include/hw/virtio/vhost-vdpa.h |  1 +
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
     return v->index != 0;
 }
 
+static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
+                                       uint64_t *features)
+{
+    int ret;
+
+    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
+    trace_vhost_vdpa_get_features(dev, *features);
+    return ret;
+}
+
 static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                                Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
         return 0;
     }
 
-    r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features);
+    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
     if (r != 0) {
         error_setg_errno(errp, -r, "Can't get vdpa device features");
         return r;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
 static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                    uint64_t features)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int ret;
 
     if (vhost_vdpa_one_time_request(dev)) {
         return 0;
     }
 
+    if (v->shadow_vqs_enabled) {
+        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
+            /*
+             * QEMU is just trying to enable or disable logging. SVQ handles
+             * this separately, so no need to forward this.
+             */
+            v->acked_features = features;
+            return 0;
+        }
+
+        v->acked_features = features;
+
+        /* We must not ack _F_LOG if SVQ is enabled */
+        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
+    }
+
     trace_vhost_vdpa_set_features(dev, features);
     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
     if (ret) {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
 static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                    uint64_t *features)
 {
-    int ret;
+    struct vhost_vdpa *v = dev->opaque;
+    int ret = vhost_vdpa_get_dev_features(dev, features);
+
+    if (ret == 0 && v->shadow_vqs_enabled) {
+        /* Add SVQ logging capabilities */
+        *features |= BIT_ULL(VHOST_F_LOG_ALL);
+    }
 
-    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
-    trace_vhost_vdpa_get_features(dev, *features);
     return ret;
 }
 
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
     bool iotlb_batch_begin_sent;
     MemoryListener listener;
     struct vhost_vdpa_iova_range iova_range;
+    uint64_t acked_features;
     bool shadow_vqs_enabled;
     /* IOVA mapping used by the Shadow Virtqueue */
     VhostIOVATree *iova_tree;
--
2.7.4

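The (v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL) test above
works because XOR yields exactly the set of changed bits, so the comparison
is true only when the dirty-log bit is the sole difference. A stand-alone
sketch of that check follows; the demo_ names are invented, and
VHOST_F_LOG_ALL is assumed to be bit 26, as in the vhost uapi headers:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_BIT(n) (1ULL << (n))
    #define DEMO_F_LOG_ALL 26   /* assumed value of VHOST_F_LOG_ALL */

    /* XOR yields exactly the changed bits, so this is true iff the only
     * difference between the two feature sets is the dirty-log bit. */
    static bool only_log_toggled(uint64_t acked, uint64_t features)
    {
        return (acked ^ features) == DEMO_BIT(DEMO_F_LOG_ALL);
    }

    int main(void)
    {
        uint64_t acked = DEMO_BIT(32);          /* e.g. VIRTIO_F_VERSION_1 */
        uint64_t log_on = acked | DEMO_BIT(DEMO_F_LOG_ALL);

        printf("log toggled only: %d\n", only_log_toggled(acked, log_on));
        printf("log + other bit:  %d\n",
               only_log_toggled(acked, log_on | DEMO_BIT(0)));
        return 0;
    }

The first case prints 1 (logging is the only change, so the patch short
circuits without calling into the device); the second prints 0, so the full
feature set, minus the log bit, is forwarded instead.
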
Deleted patch
From: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>

This makes it much easier to compare the endianness and bit shift of the
multicast CRC calculation against the Linux driver implementation.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/eepro100.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/eepro100.c
+++ b/hw/net/eepro100.c
@@ -XXX,XX +XXX,XX @@ static ssize_t nic_receive(NetClientState *nc, const uint8_t * buf, size_t size)
         rfd_status |= 0x0004;
     } else if (s->configuration[20] & BIT(6)) {
         /* Multiple IA bit set. */
-        unsigned mcast_idx = compute_mcast_idx(buf);
+        unsigned mcast_idx = net_crc32(buf, ETH_ALEN) >> 26;
         assert(mcast_idx < 64);
         if (s->mult[mcast_idx >> 3] & (1 << (mcast_idx & 7))) {
             TRACE(RXTX, logout("%p accepted, multiple IA bit set\n", s));
--
2.7.4