The following changes since commit 352998df1c53b366413690d95b35f76d0721ebed:

  Merge tag 'i2c-20220314' of https://github.com/philmd/qemu into staging (2022-03-14 14:39:33 +0000)

are available in the git repository at:

  https://github.com/jasowang/qemu.git tags/net-pull-request

for you to fetch changes up to 12a195fa343aae2ead1301ce04727bd0ae25eb15:

  vdpa: Expose VHOST_F_LOG_ALL on SVQ (2022-03-15 13:57:44 +0800)

----------------------------------------------------------------

Changes since V2:
- fix 32-bit build errors

----------------------------------------------------------------
Eugenio Pérez (14):
      vhost: Add VhostShadowVirtqueue
      vhost: Add Shadow VirtQueue kick forwarding capabilities
      vhost: Add Shadow VirtQueue call forwarding capabilities
      vhost: Add vhost_svq_valid_features to shadow vq
      virtio: Add vhost_svq_get_vring_addr
      vdpa: adapt vhost_ops callbacks to svq
      vhost: Shadow virtqueue buffers forwarding
      util: Add iova_tree_alloc_map
      util: add iova_tree_find_iova
      vhost: Add VhostIOVATree
      vdpa: Add custom IOTLB translations to SVQ
      vdpa: Adapt vhost_vdpa_get_vring_base to SVQ
      vdpa: Never set log_base addr if SVQ is enabled
      vdpa: Expose VHOST_F_LOG_ALL on SVQ

Jason Wang (1):
      virtio-net: fix map leaking on error during receive

 hw/net/virtio-net.c                |   1 +
 hw/virtio/meson.build              |   2 +-
 hw/virtio/vhost-iova-tree.c        | 110 +++++++
 hw/virtio/vhost-iova-tree.h        |  27 ++
 hw/virtio/vhost-shadow-virtqueue.c | 636 +++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  87 +++++
 hw/virtio/vhost-vdpa.c             | 522 +++++++++++++++++++++++++++++-
 include/hw/virtio/vhost-vdpa.h     |   8 +
 include/qemu/iova-tree.h           |  38 ++-
 util/iova-tree.c                   | 170 ++++++++++
 10 files changed, 1584 insertions(+), 17 deletions(-)
 create mode 100644 hw/virtio/vhost-iova-tree.c
 create mode 100644 hw/virtio/vhost-iova-tree.h
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.h
Commit bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
tried to fix the use-after-free of the sg by caching the virtqueue
elements in an array and unmapping them at once after receiving the
packets, but it forgot to unmap the cached elements on error, which
leads to leaked mappings and other unexpected results.

Fix this by detaching the cached elements on error. This addresses
CVE-2022-26353.

Reported-by: Victor Tom <vv474172261@gmail.com>
Cc: qemu-stable@nongnu.org
Fixes: CVE-2022-26353
Fixes: bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/virtio-net.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
 
 err:
     for (j = 0; j < i; j++) {
+        virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
         g_free(elems[j]);
     }
 
-- 
2.7.4
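
To make the bug class concrete outside QEMU: the pattern is "cache now,
release in one pass later", and the error path must release exactly the
prefix that was cached. Below is a minimal standalone C sketch of that
shape. Every name in it (Elem, pop_and_map, unmap, receive_all) is a
hypothetical stand-in, not a QEMU API; only the err-loop structure
mirrors the patched function.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical element carrying a DMA mapping; stands in for QEMU's
 * VirtQueueElement plus its mapped buffers. */
typedef struct Elem {
    void *mapping;
} Elem;

/* Pop an element and map its buffers; may fail. */
static Elem *pop_and_map(int fail)
{
    Elem *e;

    if (fail) {
        return NULL;
    }
    e = malloc(sizeof(*e));
    e->mapping = malloc(16);   /* the resource that must not leak */
    return e;
}

/* Release the mapping; stands in for virtqueue_detach_element(). */
static void unmap(Elem *e)
{
    free(e->mapping);
}

/* Cache elements so they can all be unmapped after the packets are
 * received. On error, the already-cached prefix must be unmapped as
 * well as freed -- omitting unmap() here is the CVE-2022-26353 leak. */
static int receive_all(Elem **elems, int n, int fail_at)
{
    int i, j;

    for (i = 0; i < n; i++) {
        elems[i] = pop_and_map(i == fail_at);
        if (!elems[i]) {
            goto err;
        }
    }
    for (j = 0; j < n; j++) {
        unmap(elems[j]);
        free(elems[j]);
    }
    return 0;

err:
    for (j = 0; j < i; j++) {
        unmap(elems[j]);       /* the step the one-line fix adds */
        free(elems[j]);
    }
    return -1;
}

int main(void)
{
    Elem *elems[4];

    printf("success path: %d\n", receive_all(elems, 4, -1));
    printf("error path:   %d\n", receive_all(elems, 4, 2));
    return 0;
}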
From: Eugenio Pérez <eperezma@redhat.com>

A vhost shadow virtqueue (SVQ) is an intermediate hop for virtqueue
notifications and buffers, allowing QEMU to track them. While QEMU
forwards the buffers and virtqueue changes, it can commit the memory
being dirtied, the same way regular QEMU VirtIO devices do.

This commit only exposes basic SVQ allocation and freeing. Later
patches in the series add functionality such as notification and
buffer forwarding.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/meson.build              |  2 +-
 hw/virtio/vhost-shadow-virtqueue.c | 62 ++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h | 28 +++++++++++++++++
 3 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.h

diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
 
 virtio_ss = ss.source_set()
 virtio_ss.add(files('virtio.c'))
-virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c'))
+virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
 virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * vhost shadow virtqueue
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
+
+#include "qemu/error-report.h"
+
+/**
+ * Creates vhost shadow virtqueue, and instructs the vhost device to use the
+ * shadow methods and file descriptors.
+ *
+ * Returns the new virtqueue or NULL.
+ *
+ * In case of error, reason is reported through error_report.
+ */
+VhostShadowVirtqueue *vhost_svq_new(void)
+{
+    g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
+    int r;
+
+    r = event_notifier_init(&svq->hdev_kick, 0);
+    if (r != 0) {
+        error_report("Couldn't create kick event notifier: %s (%d)",
+                     g_strerror(errno), errno);
+        goto err_init_hdev_kick;
+    }
+
+    r = event_notifier_init(&svq->hdev_call, 0);
+    if (r != 0) {
+        error_report("Couldn't create call event notifier: %s (%d)",
+                     g_strerror(errno), errno);
+        goto err_init_hdev_call;
+    }
+
+    return g_steal_pointer(&svq);
+
+err_init_hdev_call:
+    event_notifier_cleanup(&svq->hdev_kick);
+
+err_init_hdev_kick:
+    return NULL;
+}
+
+/**
+ * Free the resources of the shadow virtqueue.
+ *
+ * @pvq: gpointer to SVQ so it can be used by autofree functions.
+ */
+void vhost_svq_free(gpointer pvq)
+{
+    VhostShadowVirtqueue *vq = pvq;
+    event_notifier_cleanup(&vq->hdev_kick);
+    event_notifier_cleanup(&vq->hdev_call);
+    g_free(vq);
+}
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * vhost shadow virtqueue
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef VHOST_SHADOW_VIRTQUEUE_H
+#define VHOST_SHADOW_VIRTQUEUE_H
+
+#include "qemu/event_notifier.h"
+
+/* Shadow virtqueue to relay notifications */
+typedef struct VhostShadowVirtqueue {
+    /* Shadow kick notifier, sent to vhost */
+    EventNotifier hdev_kick;
+    /* Shadow call notifier, sent to vhost */
+    EventNotifier hdev_call;
+} VhostShadowVirtqueue;
+
+VhostShadowVirtqueue *vhost_svq_new(void);
+
+void vhost_svq_free(gpointer vq);
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
+
+#endif
-- 
2.7.4
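
Two GLib idioms carry the error handling in vhost_svq_new() above:
g_autofree arms an automatic g_free() on every early return, and
g_steal_pointer() disarms it once ownership moves to the caller. A
compilable sketch of the same constructor shape follows; Obj,
resource_init and resource_cleanup are hypothetical stand-ins for the
SVQ struct and event_notifier_init()/event_notifier_cleanup(). Build
it with: gcc sketch.c $(pkg-config --cflags --libs glib-2.0).

#include <glib.h>

/* Hypothetical two-resource object, mirroring the kick/call notifier
 * pair inside VhostShadowVirtqueue. */
typedef struct Obj {
    int res_a;
    int res_b;
} Obj;

static int resource_init(int *res, gboolean fail)
{
    *res = 1;                           /* pretend to acquire something */
    return fail ? -1 : 0;
}

static void resource_cleanup(int *res)
{
    *res = 0;
}

/* Same shape as vhost_svq_new(): g_autofree frees the allocation on
 * every early return; g_steal_pointer() hands it to the caller on
 * success and disarms the automatic free. */
static Obj *obj_new(gboolean fail_a, gboolean fail_b)
{
    g_autofree Obj *obj = g_new0(Obj, 1);

    if (resource_init(&obj->res_a, fail_a) != 0) {
        return NULL;                    /* obj freed automatically */
    }
    if (resource_init(&obj->res_b, fail_b) != 0) {
        resource_cleanup(&obj->res_a);  /* unwind only what succeeded */
        return NULL;                    /* obj freed automatically */
    }
    return g_steal_pointer(&obj);
}

int main(void)
{
    Obj *obj = obj_new(FALSE, FALSE);

    g_assert(obj != NULL);
    g_free(obj);
    g_assert(obj_new(TRUE, FALSE) == NULL);
    g_assert(obj_new(FALSE, TRUE) == NULL);
    return 0;
}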
From: Eugenio Pérez <eperezma@redhat.com>

In this mode no buffer forwarding is performed by the SVQ: QEMU just
forwards the guest's kicks to the device.

Host memory notifier regions are left out for simplicity; they are
not addressed in this series.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c |  55 ++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  14 ++++
 hw/virtio/vhost-vdpa.c             | 144 ++++++++++++++++++++++++++++++++++++-
 include/hw/virtio/vhost-vdpa.h     |   4 ++
 4 files changed, 215 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/virtio/vhost-shadow-virtqueue.h"
 
 #include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "linux-headers/linux/vhost.h"
+
+/**
+ * Forward guest notifications.
+ *
+ * @n: guest kick event notifier, the one that guest set to notify svq.
+ */
+static void vhost_handle_guest_kick(EventNotifier *n)
+{
+    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
+    event_notifier_test_and_clear(n);
+    event_notifier_set(&svq->hdev_kick);
+}
+
+/**
+ * Set a new file descriptor for the guest to kick the SVQ and notify for avail
+ *
+ * @svq: The svq
+ * @svq_kick_fd: The svq kick fd
+ *
+ * Note that the SVQ will never close the old file descriptor.
+ */
+void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
+{
+    EventNotifier *svq_kick = &svq->svq_kick;
+    bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
+    bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;
+
+    if (poll_stop) {
+        event_notifier_set_handler(svq_kick, NULL);
+    }
+
+    /*
+     * event_notifier_set_handler already checks for guest's notifications if
+     * they arrive at the new file descriptor in the switch, so there is no
+     * need to explicitly check for them.
+     */
+    if (poll_start) {
+        event_notifier_init_fd(svq_kick, svq_kick_fd);
+        event_notifier_set(svq_kick);
+        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick);
+    }
+}
+
+/**
+ * Stop the shadow virtqueue operation.
+ * @svq: Shadow Virtqueue
+ */
+void vhost_svq_stop(VhostShadowVirtqueue *svq)
+{
+    event_notifier_set_handler(&svq->svq_kick, NULL);
+}
 
 /**
  * Creates vhost shadow virtqueue, and instructs the vhost device to use the
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
         goto err_init_hdev_call;
     }
 
+    event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
     return g_steal_pointer(&svq);
 
 err_init_hdev_call:
@@ -XXX,XX +XXX,XX @@ err_init_hdev_kick:
 void vhost_svq_free(gpointer pvq)
 {
     VhostShadowVirtqueue *vq = pvq;
+    vhost_svq_stop(vq);
     event_notifier_cleanup(&vq->hdev_kick);
     event_notifier_cleanup(&vq->hdev_call);
     g_free(vq);
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
     EventNotifier hdev_kick;
     /* Shadow call notifier, sent to vhost */
     EventNotifier hdev_call;
+
+    /*
+     * Borrowed virtqueue's guest to host notifier. To borrow it in this event
+     * notifier allows to recover the VhostShadowVirtqueue from the event loop
+     * easily. If we use the VirtQueue's one, we don't have an easy way to
+     * retrieve VhostShadowVirtqueue.
+     *
+     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
+     */
+    EventNotifier svq_kick;
 } VhostShadowVirtqueue;
 
+void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
+
+void vhost_svq_stop(VhostShadowVirtqueue *svq);
+
 VhostShadowVirtqueue *vhost_svq_new(void);
 
 void vhost_svq_free(gpointer vq);
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/virtio/vhost.h"
 #include "hw/virtio/vhost-backend.h"
 #include "hw/virtio/virtio-net.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
 #include "hw/virtio/vhost-vdpa.h"
 #include "exec/address-spaces.h"
 #include "qemu/main-loop.h"
 #include "cpu.h"
 #include "trace.h"
 #include "qemu-common.h"
+#include "qapi/error.h"
 
 /*
  * Return one past the end of the end of section. Be careful with uint64_t
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
     return v->index != 0;
 }
 
+static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
+                               Error **errp)
+{
+    g_autoptr(GPtrArray) shadow_vqs = NULL;
+
+    if (!v->shadow_vqs_enabled) {
+        return 0;
+    }
+
+    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
+    for (unsigned n = 0; n < hdev->nvqs; ++n) {
+        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
+
+        if (unlikely(!svq)) {
+            error_setg(errp, "Cannot create svq %u", n);
+            return -1;
+        }
+        g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
+    }
+
+    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
+    return 0;
+}
+
 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
 {
     struct vhost_vdpa *v;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
     dev->opaque = opaque;
     v->listener = vhost_vdpa_memory_listener;
     v->msg_type = VHOST_IOTLB_MSG_V2;
+    ret = vhost_vdpa_init_svq(dev, v, errp);
+    if (ret) {
+        goto err;
+    }
 
     vhost_vdpa_get_iova_range(v);
 
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
                                VIRTIO_CONFIG_S_DRIVER);
 
     return 0;
+
+err:
+    ram_block_discard_disable(false);
+    return ret;
 }
 
 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
 
 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int i;
 
+    if (v->shadow_vqs_enabled) {
+        /* FIXME SVQ is not compatible with host notifiers mr */
+        return;
+    }
+
     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
         if (vhost_vdpa_host_notifier_init(dev, i)) {
             goto err;
@@ -XXX,XX +XXX,XX @@ err:
     return;
 }
 
+static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    size_t idx;
+
+    if (!v->shadow_vqs) {
+        return;
+    }
+
+    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
+        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
+    }
+    g_ptr_array_free(v->shadow_vqs, true);
+}
+
 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
 {
     struct vhost_vdpa *v;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_cleanup(struct vhost_dev *dev)
     trace_vhost_vdpa_cleanup(dev, v);
     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
     memory_listener_unregister(&v->listener);
+    vhost_vdpa_svq_cleanup(dev);
 
     dev->opaque = NULL;
     ram_block_discard_disable(false);
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
     return ret;
 }
 
+static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
+{
+    if (!v->shadow_vqs_enabled) {
+        return;
+    }
+
+    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+        vhost_svq_stop(svq);
+    }
+}
+
 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int ret;
     uint8_t status = 0;
 
+    vhost_vdpa_reset_svq(v);
+
     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
     trace_vhost_vdpa_reset_device(dev, status);
     return ret;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
     return ret;
 }
 
+static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
+                                         struct vhost_vring_file *file)
+{
+    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
+}
+
+/**
+ * Set the shadow virtqueue descriptors to the device
+ *
+ * @dev: The vhost device model
+ * @svq: The shadow virtqueue
+ * @idx: The index of the virtqueue in the vhost device
+ * @errp: Error
+ */
+static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
+                                 VhostShadowVirtqueue *svq, unsigned idx,
+                                 Error **errp)
+{
+    struct vhost_vring_file file = {
+        .index = dev->vq_index + idx,
+    };
+    const EventNotifier *event_notifier = &svq->hdev_kick;
+    int r;
+
+    file.fd = event_notifier_get_fd(event_notifier);
+    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Can't set device kick fd");
+    }
+
+    return r == 0;
+}
+
+static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    Error *err = NULL;
+    unsigned i;
+
+    if (!v->shadow_vqs) {
+        return true;
+    }
+
+    for (i = 0; i < v->shadow_vqs->len; ++i) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
+        if (unlikely(!ok)) {
+            error_reportf_err(err, "Cannot setup SVQ %u: ", i);
+            return false;
+        }
+    }
+
+    return true;
+}
+
 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
 {
     struct vhost_vdpa *v = dev->opaque;
+    bool ok;
     trace_vhost_vdpa_dev_start(dev, started);
 
     if (started) {
         vhost_vdpa_host_notifiers_init(dev);
+        ok = vhost_vdpa_svqs_start(dev);
+        if (unlikely(!ok)) {
+            return -1;
+        }
         vhost_vdpa_set_vring_ready(dev);
     } else {
         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                      struct vhost_vring_file *file)
 {
-    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
+    struct vhost_vdpa *v = dev->opaque;
+    int vdpa_idx = file->index - dev->vq_index;
+
+    if (v->shadow_vqs_enabled) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
+        vhost_svq_set_svq_kick_fd(svq, file->fd);
+        return 0;
+    } else {
+        return vhost_vdpa_set_vring_dev_kick(dev, file);
+    }
 }
 
 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@
 #ifndef HW_VIRTIO_VHOST_VDPA_H
 #define HW_VIRTIO_VHOST_VDPA_H
 
+#include <gmodule.h>
+
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"
 
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
     bool iotlb_batch_begin_sent;
     MemoryListener listener;
     struct vhost_vdpa_iova_range iova_range;
+    bool shadow_vqs_enabled;
+    GPtrArray *shadow_vqs;
     struct vhost_dev *dev;
     VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
 } VhostVDPA;
-- 
2.7.4
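
The forwarding step at the heart of this patch,
vhost_handle_guest_kick(), is "clear one notifier, set another". QEMU's
EventNotifier wraps a Linux eventfd, so the same dataflow can be shown
with raw eventfds. The sketch below is illustrative only — no event
loop, minimal error handling, and plain file descriptors instead of
EventNotifier — not QEMU code:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/eventfd.h>

/* Drain the guest-facing eventfd (event_notifier_test_and_clear), then
 * raise the device-facing one (event_notifier_set). */
static void forward_kick(int guest_kick_fd, int device_kick_fd)
{
    uint64_t cnt;

    if (read(guest_kick_fd, &cnt, sizeof(cnt)) != sizeof(cnt)) {
        return;                           /* nothing pending */
    }
    cnt = 1;
    if (write(device_kick_fd, &cnt, sizeof(cnt)) != sizeof(cnt)) {
        perror("write device kick");
    }
}

int main(void)
{
    int guest_fd = eventfd(0, EFD_NONBLOCK);
    int device_fd = eventfd(0, 0);
    uint64_t cnt = 1, seen = 0;

    /* The guest "kicks"; the device-facing fd ends up signalled. */
    if (write(guest_fd, &cnt, sizeof(cnt)) != sizeof(cnt)) {
        perror("write guest kick");
    }
    forward_kick(guest_fd, device_fd);
    if (read(device_fd, &seen, sizeof(seen)) == sizeof(seen)) {
        printf("device saw %llu kick(s)\n", (unsigned long long)seen);
    }

    close(guest_fd);
    close(device_fd);
    return 0;
}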
From: Eugenio Pérez <eperezma@redhat.com>

This makes QEMU aware of the device's used buffers, allowing it to
write the guest memory with their contents if needed.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 38 ++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  4 ++++
 hw/virtio/vhost-vdpa.c             | 31 +++++++++++++++++++++++++++++--
 3 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(EventNotifier *n)
 }
 
 /**
+ * Forward vhost notifications
+ *
+ * @n: hdev call event notifier, the one that device set to notify svq.
+ */
+static void vhost_svq_handle_call(EventNotifier *n)
+{
+    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
+                                             hdev_call);
+    event_notifier_test_and_clear(n);
+    event_notifier_set(&svq->svq_call);
+}
+
+/**
+ * Set the call notifier for the SVQ to call the guest
+ *
+ * @svq: Shadow virtqueue
+ * @call_fd: call notifier
+ *
+ * Called on BQL context.
+ */
+void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
+{
+    if (call_fd == VHOST_FILE_UNBIND) {
+        /*
+         * Fail event_notifier_set if called handling device call.
+         *
+         * SVQ still needs device notifications, since it needs to keep
+         * forwarding used buffers even with the unbind.
+         */
+        memset(&svq->svq_call, 0, sizeof(svq->svq_call));
+    } else {
+        event_notifier_init_fd(&svq->svq_call, call_fd);
+    }
+}
+
+/**
  * Set a new file descriptor for the guest to kick the SVQ and notify for avail
  *
  * @svq: The svq
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
     }
 
     event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
+    event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
     return g_steal_pointer(&svq);
 
 err_init_hdev_call:
@@ -XXX,XX +XXX,XX @@ void vhost_svq_free(gpointer pvq)
     VhostShadowVirtqueue *vq = pvq;
     vhost_svq_stop(vq);
     event_notifier_cleanup(&vq->hdev_kick);
+    event_notifier_set_handler(&vq->hdev_call, NULL);
     event_notifier_cleanup(&vq->hdev_call);
     g_free(vq);
 }
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
      * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
      */
     EventNotifier svq_kick;
+
+    /* Guest's call notifier, where the SVQ calls guest. */
+    EventNotifier svq_call;
 } VhostShadowVirtqueue;
 
 void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
+void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
 
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
 }
 
+static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
+                                         struct vhost_vring_file *file)
+{
+    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
+}
+
 /**
  * Set the shadow virtqueue descriptors to the device
  *
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
  * @svq: The shadow virtqueue
  * @idx: The index of the virtqueue in the vhost device
  * @errp: Error
+ *
+ * Note that this function does not rewind kick file descriptor if cannot set
+ * call one.
  */
 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
     if (unlikely(r != 0)) {
         error_setg_errno(errp, -r, "Can't set device kick fd");
+        return false;
+    }
+
+    event_notifier = &svq->hdev_call;
+    file.fd = event_notifier_get_fd(event_notifier);
+    r = vhost_vdpa_set_vring_dev_call(dev, &file);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Can't set device call fd");
     }
 
     return r == 0;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                      struct vhost_vring_file *file)
 {
-    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (v->shadow_vqs_enabled) {
+        int vdpa_idx = file->index - dev->vq_index;
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
+
+        vhost_svq_set_svq_call_fd(svq, file->fd);
+        return 0;
+    } else {
+        return vhost_vdpa_set_vring_dev_call(dev, file);
+    }
 }
 
 static int vhost_vdpa_get_features(struct vhost_dev *dev,
-- 
2.7.4
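
On the vhost-vdpa side, the patch mirrors the kick routing introduced
earlier in the series: with SVQ enabled, the guest's call fd stays
inside QEMU for the SVQ to signal; otherwise it is handed straight to
the device as before. A condensed sketch of that routing decision —
VdpaDev, SVQ and dev_set_call_fd are simplified stand-ins, not QEMU's
types or API:

#include <glib.h>
#include <stdio.h>

typedef struct SVQ {
    int guest_call_fd;          /* the svq_call notifier, reduced to an fd */
} SVQ;

typedef struct VdpaDev {
    gboolean shadow_vqs_enabled;
    GPtrArray *shadow_vqs;      /* one SVQ per virtqueue */
    int vq_index;               /* first vq index owned by this device */
} VdpaDev;

/* Stand-in for the VHOST_SET_VRING_CALL path to the device. */
static int dev_set_call_fd(int index, int fd)
{
    printf("device: vq %d will call via fd %d\n", index, fd);
    return 0;
}

/* With SVQ enabled the guest's call fd never reaches the device: the
 * SVQ keeps it and signals it when forwarding used buffers. */
static int set_vring_call(VdpaDev *v, int index, int fd)
{
    if (v->shadow_vqs_enabled) {
        SVQ *svq = g_ptr_array_index(v->shadow_vqs, index - v->vq_index);

        svq->guest_call_fd = fd;
        return 0;
    }
    return dev_set_call_fd(index, fd);
}

int main(void)
{
    SVQ vq0 = { -1 };
    VdpaDev v = { TRUE, g_ptr_array_new(), 0 };

    g_ptr_array_add(v.shadow_vqs, &vq0);
    set_vring_call(&v, 0, 7);
    printf("svq holds guest call fd %d\n", vq0.guest_call_fd);

    v.shadow_vqs_enabled = FALSE;
    set_vring_call(&v, 0, 7);   /* passed straight through */

    g_ptr_array_free(v.shadow_vqs, TRUE);
    return 0;
}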
370 | 162 | diff view generated by jsdifflib |
1 | From: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Add a property "jumbo-max-len", which sets default value of jumbo frames | 3 | This allows SVQ to negotiate features with the guest and the device. For |
4 | up to 16,383 bytes. Add Frame length checks for standard and jumbo | 4 | the device, SVQ is a driver. While this function bypasses all |
5 | frames. | 5 | non-transport features, it needs to disable the features that SVQ does |
6 | not support when forwarding buffers. This includes packed vq layout, | ||
7 | indirect descriptors or event idx. | ||
6 | 8 | ||
7 | Signed-off-by: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | 9 | Future changes can add support to offer more features to the guest, |
8 | Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | 10 | since the use of VirtQueue gives this for free. This is left out at the |
11 | moment for simplicity. | ||
12 | |||
13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
14 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
10 | --- | 16 | --- |
11 | hw/net/cadence_gem.c | 51 +++++++++++++++++++++++++++++++++++++++----- | 17 | hw/virtio/vhost-shadow-virtqueue.c | 44 ++++++++++++++++++++++++++++++++++++++ |
12 | include/hw/net/cadence_gem.h | 4 +++- | 18 | hw/virtio/vhost-shadow-virtqueue.h | 2 ++ |
13 | 2 files changed, 49 insertions(+), 6 deletions(-) | 19 | hw/virtio/vhost-vdpa.c | 15 +++++++++++++ |
20 | 3 files changed, 61 insertions(+) | ||
14 | 21 | ||
15 | diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c | 22 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
16 | index XXXXXXX..XXXXXXX 100644 | 23 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/hw/net/cadence_gem.c | 24 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
18 | +++ b/hw/net/cadence_gem.c | 25 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
19 | @@ -XXX,XX +XXX,XX @@ | 26 | @@ -XXX,XX +XXX,XX @@ |
20 | #define GEM_TXPAUSE (0x0000003C / 4) /* TX Pause Time reg */ | 27 | #include "hw/virtio/vhost-shadow-virtqueue.h" |
21 | #define GEM_TXPARTIALSF (0x00000040 / 4) /* TX Partial Store and Forward */ | 28 | |
22 | #define GEM_RXPARTIALSF (0x00000044 / 4) /* RX Partial Store and Forward */ | 29 | #include "qemu/error-report.h" |
23 | +#define GEM_JUMBO_MAX_LEN (0x00000048 / 4) /* Max Jumbo Frame Size */ | 30 | +#include "qapi/error.h" |
24 | #define GEM_HASHLO (0x00000080 / 4) /* Hash Low address reg */ | 31 | #include "qemu/main-loop.h" |
25 | #define GEM_HASHHI (0x00000084 / 4) /* Hash High address reg */ | 32 | #include "linux-headers/linux/vhost.h" |
26 | #define GEM_SPADDR1LO (0x00000088 / 4) /* Specific addr 1 low reg */ | 33 | |
27 | @@ -XXX,XX +XXX,XX @@ | 34 | /** |
28 | #define GEM_NWCFG_LERR_DISC 0x00010000 /* Discard RX frames with len err */ | 35 | + * Validate the transport device features that both guests can use with the SVQ |
29 | #define GEM_NWCFG_BUFF_OFST_M 0x0000C000 /* Receive buffer offset mask */ | 36 | + * and SVQs can use with the device. |
30 | #define GEM_NWCFG_BUFF_OFST_S 14 /* Receive buffer offset shift */ | 37 | + * |
31 | +#define GEM_NWCFG_RCV_1538 0x00000100 /* Receive 1538 bytes frame */ | 38 | + * @dev_features: The features |
32 | #define GEM_NWCFG_UCAST_HASH 0x00000080 /* accept unicast if hash match */ | 39 | + * @errp: Error pointer |
33 | #define GEM_NWCFG_MCAST_HASH 0x00000040 /* accept multicast if hash match */ | 40 | + */ |
34 | #define GEM_NWCFG_BCAST_REJ 0x00000020 /* Reject broadcast packets */ | 41 | +bool vhost_svq_valid_features(uint64_t features, Error **errp) |
35 | #define GEM_NWCFG_PROMISC 0x00000010 /* Accept all packets */ | ||
36 | +#define GEM_NWCFG_JUMBO_FRAME 0x00000008 /* Jumbo Frames enable */ | ||
37 | |||
38 | #define GEM_DMACFG_ADDR_64B (1U << 30) | ||
39 | #define GEM_DMACFG_TX_BD_EXT (1U << 29) | ||
40 | @@ -XXX,XX +XXX,XX @@ | ||
41 | |||
42 | /* GEM_ISR GEM_IER GEM_IDR GEM_IMR */ | ||
43 | #define GEM_INT_TXCMPL 0x00000080 /* Transmit Complete */ | ||
44 | +#define GEM_INT_AMBA_ERR 0x00000040 | ||
45 | #define GEM_INT_TXUSED 0x00000008 | ||
46 | #define GEM_INT_RXUSED 0x00000004 | ||
47 | #define GEM_INT_RXCMPL 0x00000002 | ||
48 | @@ -XXX,XX +XXX,XX @@ static inline void rx_desc_set_sar(uint32_t *desc, int sar_idx) | ||
49 | /* The broadcast MAC address: 0xFFFFFFFFFFFF */ | ||
50 | static const uint8_t broadcast_addr[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; | ||
51 | |||
52 | +static uint32_t gem_get_max_buf_len(CadenceGEMState *s, bool tx) | ||
53 | +{ | 42 | +{ |
54 | + uint32_t size; | 43 | + bool ok = true; |
55 | + if (s->regs[GEM_NWCFG] & GEM_NWCFG_JUMBO_FRAME) { | 44 | + uint64_t svq_features = features; |
56 | + size = s->regs[GEM_JUMBO_MAX_LEN]; | 45 | + |
57 | + if (size > s->jumbo_max_len) { | 46 | + for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END; |
58 | + size = s->jumbo_max_len; | 47 | + ++b) { |
59 | + qemu_log_mask(LOG_GUEST_ERROR, "GEM_JUMBO_MAX_LEN reg cannot be" | 48 | + switch (b) { |
60 | + " greater than 0x%" PRIx32 "\n", s->jumbo_max_len); | 49 | + case VIRTIO_F_ANY_LAYOUT: |
50 | + continue; | ||
51 | + | ||
52 | + case VIRTIO_F_ACCESS_PLATFORM: | ||
53 | + /* SVQ trust in the host's IOMMU to translate addresses */ | ||
54 | + case VIRTIO_F_VERSION_1: | ||
55 | + /* SVQ trust that the guest vring is little endian */ | ||
56 | + if (!(svq_features & BIT_ULL(b))) { | ||
57 | + svq_features |= BIT_ULL(b); | ||
58 | + ok = false; | ||
59 | + } | ||
60 | + continue; | ||
61 | + | ||
62 | + default: | ||
63 | + if (svq_features & BIT_ULL(b)) { | ||
64 | + svq_features &= ~BIT_ULL(b); | ||
65 | + ok = false; | ||
66 | + } | ||
61 | + } | 67 | + } |
62 | + } else if (tx) { | ||
63 | + size = 1518; | ||
64 | + } else { | ||
65 | + size = s->regs[GEM_NWCFG] & GEM_NWCFG_RCV_1538 ? 1538 : 1518; | ||
66 | + } | 68 | + } |
67 | + return size; | 69 | + |
70 | + if (!ok) { | ||
71 | + error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64 | ||
72 | + ", ok: 0x%"PRIx64, features, svq_features); | ||
73 | + } | ||
74 | + return ok; | ||
68 | +} | 75 | +} |
69 | + | 76 | + |
70 | static void gem_set_isr(CadenceGEMState *s, int q, uint32_t flag) | 77 | +/** |
78 | * Forward guest notifications. | ||
79 | * | ||
80 | * @n: guest kick event notifier, the one that guest set to notify svq. | ||
81 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
82 | index XXXXXXX..XXXXXXX 100644 | ||
83 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
84 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
85 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
86 | EventNotifier svq_call; | ||
87 | } VhostShadowVirtqueue; | ||
88 | |||
89 | +bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
90 | + | ||
91 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
92 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
93 | |||
94 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
95 | index XXXXXXX..XXXXXXX 100644 | ||
96 | --- a/hw/virtio/vhost-vdpa.c | ||
97 | +++ b/hw/virtio/vhost-vdpa.c | ||
98 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
99 | Error **errp) | ||
71 | { | 100 | { |
72 | if (q == 0) { | 101 | g_autoptr(GPtrArray) shadow_vqs = NULL; |
73 | @@ -XXX,XX +XXX,XX @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size) | 102 | + uint64_t dev_features, svq_features; |
74 | /* Find which queue we are targeting */ | 103 | + int r; |
75 | q = get_queue_from_screen(s, rxbuf_ptr, rxbufsize); | 104 | + bool ok; |
76 | 105 | ||
77 | + if (size > gem_get_max_buf_len(s, false)) { | 106 | if (!v->shadow_vqs_enabled) { |
78 | + qemu_log_mask(LOG_GUEST_ERROR, "rx frame too long\n"); | 107 | return 0; |
79 | + gem_set_isr(s, q, GEM_INT_AMBA_ERR); | 108 | } |
109 | |||
110 | + r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features); | ||
111 | + if (r != 0) { | ||
112 | + error_setg_errno(errp, -r, "Can't get vdpa device features"); | ||
113 | + return r; | ||
114 | + } | ||
115 | + | ||
116 | + svq_features = dev_features; | ||
117 | + ok = vhost_svq_valid_features(svq_features, errp); | ||
118 | + if (unlikely(!ok)) { | ||
80 | + return -1; | 119 | + return -1; |
81 | + } | 120 | + } |
82 | + | 121 | + |
83 | while (bytes_to_copy) { | 122 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); |
84 | hwaddr desc_addr; | 123 | for (unsigned n = 0; n < hdev->nvqs; ++n) { |
85 | 124 | g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | |
86 | @@ -XXX,XX +XXX,XX @@ static void gem_transmit(CadenceGEMState *s) | ||
87 | break; | ||
88 | } | ||
89 | |||
90 | - if (tx_desc_get_length(desc) > MAX_FRAME_SIZE - | ||
91 | + if (tx_desc_get_length(desc) > gem_get_max_buf_len(s, true) - | ||
92 | (p - s->tx_packet)) { | ||
93 | - DB_PRINT("TX descriptor @ 0x%" HWADDR_PRIx \ | ||
94 | - " too large: size 0x%x space 0x%zx\n", | ||
95 | + qemu_log_mask(LOG_GUEST_ERROR, "TX descriptor @ 0x%" \ | ||
96 | + HWADDR_PRIx " too large: size 0x%x space 0x%zx\n", | ||
97 | packet_desc_addr, tx_desc_get_length(desc), | ||
98 | - MAX_FRAME_SIZE - (p - s->tx_packet)); | ||
99 | + gem_get_max_buf_len(s, true) - (p - s->tx_packet)); | ||
100 | + gem_set_isr(s, q, GEM_INT_AMBA_ERR); | ||
101 | break; | ||
102 | } | ||
103 | |||
104 | @@ -XXX,XX +XXX,XX @@ static void gem_reset(DeviceState *d) | ||
105 | s->regs[GEM_RXPARTIALSF] = 0x000003ff; | ||
106 | s->regs[GEM_MODID] = s->revision; | ||
107 | s->regs[GEM_DESCONF] = 0x02500111; | ||
108 | - s->regs[GEM_DESCONF2] = 0x2ab13fff; | ||
109 | + s->regs[GEM_DESCONF2] = 0x2ab10000 | s->jumbo_max_len; | ||
110 | s->regs[GEM_DESCONF5] = 0x002f2045; | ||
111 | s->regs[GEM_DESCONF6] = GEM_DESCONF6_64B_MASK; | ||
112 | + s->regs[GEM_JUMBO_MAX_LEN] = s->jumbo_max_len; | ||
113 | |||
114 | if (s->num_priority_queues > 1) { | ||
115 | queues_mask = MAKE_64BIT_MASK(1, s->num_priority_queues - 1); | ||
116 | @@ -XXX,XX +XXX,XX @@ static void gem_write(void *opaque, hwaddr offset, uint64_t val, | ||
117 | s->regs[GEM_IMR] &= ~val; | ||
118 | gem_update_int_status(s); | ||
119 | break; | ||
120 | + case GEM_JUMBO_MAX_LEN: | ||
121 | + s->regs[GEM_JUMBO_MAX_LEN] = val & MAX_JUMBO_FRAME_SIZE_MASK; | ||
122 | + break; | ||
123 | case GEM_INT_Q1_ENABLE ... GEM_INT_Q7_ENABLE: | ||
124 | s->regs[GEM_INT_Q1_MASK + offset - GEM_INT_Q1_ENABLE] &= ~val; | ||
125 | gem_update_int_status(s); | ||
126 | @@ -XXX,XX +XXX,XX @@ static void gem_realize(DeviceState *dev, Error **errp) | ||
127 | |||
128 | s->nic = qemu_new_nic(&net_gem_info, &s->conf, | ||
129 | object_get_typename(OBJECT(dev)), dev->id, s); | ||
130 | + | ||
131 | + if (s->jumbo_max_len > MAX_FRAME_SIZE) { | ||
132 | + error_setg(errp, "jumbo-max-len is greater than %d", | ||
133 | + MAX_FRAME_SIZE); | ||
134 | + return; | ||
135 | + } | ||
136 | } | ||
137 | |||
138 | static void gem_init(Object *obj) | ||
139 | @@ -XXX,XX +XXX,XX @@ static Property gem_properties[] = { | ||
140 | num_type1_screeners, 4), | ||
141 | DEFINE_PROP_UINT8("num-type2-screeners", CadenceGEMState, | ||
142 | num_type2_screeners, 4), | ||
143 | + DEFINE_PROP_UINT16("jumbo-max-len", CadenceGEMState, | ||
144 | + jumbo_max_len, 10240), | ||
145 | DEFINE_PROP_END_OF_LIST(), | ||
146 | }; | ||
147 | |||
148 | diff --git a/include/hw/net/cadence_gem.h b/include/hw/net/cadence_gem.h | ||
149 | index XXXXXXX..XXXXXXX 100644 | ||
150 | --- a/include/hw/net/cadence_gem.h | ||
151 | +++ b/include/hw/net/cadence_gem.h | ||
152 | @@ -XXX,XX +XXX,XX @@ | ||
153 | #define MAX_TYPE1_SCREENERS 16 | ||
154 | #define MAX_TYPE2_SCREENERS 16 | ||
155 | |||
156 | -#define MAX_FRAME_SIZE 2048 | ||
157 | +#define MAX_JUMBO_FRAME_SIZE_MASK 0x3FFF | ||
158 | +#define MAX_FRAME_SIZE MAX_JUMBO_FRAME_SIZE_MASK | ||
159 | |||
160 | typedef struct CadenceGEMState { | ||
161 | /*< private >*/ | ||
162 | @@ -XXX,XX +XXX,XX @@ typedef struct CadenceGEMState { | ||
163 | uint8_t num_type1_screeners; | ||
164 | uint8_t num_type2_screeners; | ||
165 | uint32_t revision; | ||
166 | + uint16_t jumbo_max_len; | ||
167 | |||
168 | /* GEM registers backing store */ | ||
169 | uint32_t regs[CADENCE_GEM_MAXREG]; | ||
170 | -- | 125 | -- |
171 | 2.5.0 | 126 | 2.7.4 |
172 | 127 | ||
173 | 128 | diff view generated by jsdifflib |
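vhost_svq_valid_features() above is an instance of a common negotiation pattern: walk the transport-feature bit range, force on the bits the shadow layer requires, clear the bits it cannot forward, and report whether anything had to change. A compilable sketch of just that pattern; the two bit numbers match virtio's VERSION_1 and RING_PACKED but serve only as examples here:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define F_VERSION_1   32  /* must be offered        */
#define F_RING_PACKED 34  /* must not be offered    */

static bool valid_features(uint64_t feat, uint64_t *fixed)
{
    bool ok = true;
    *fixed = feat;

    if (!(*fixed & (1ULL << F_VERSION_1))) {
        *fixed |= 1ULL << F_VERSION_1;      /* mandatory bit missing */
        ok = false;
    }
    if (*fixed & (1ULL << F_RING_PACKED)) {
        *fixed &= ~(1ULL << F_RING_PACKED); /* unsupported bit set   */
        ok = false;
    }
    return ok;
}

int main(void)
{
    uint64_t fixed;
    bool ok = valid_features(1ULL << F_RING_PACKED, &fixed);
    printf("ok=%d fixed=0x%llx\n", ok, (unsigned long long)fixed);
    return 0;
}
```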
1 | From: Yuri Benditovich <yuri.benditovich@daynix.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Similar to VMSTATE_VARRAY_UINT32_ALLOC, but the size is a | 3 | It reports the shadow virtqueue address from qemu virtual address space. |
4 | 16-bit field. | ||
5 | 4 | ||
6 | Signed-off-by: Michael S. Tsirkin <mst@redhat.com> | 5 | Since this will be different from the guest's vaddr, but the device can |
7 | Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> | 6 | access it, SVQ takes special care about its alignment & lack of garbage |
7 | data. It assumes that IOMMU will work in host_page_size ranges for that. | ||
8 | |||
9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
9 | --- | 12 | --- |
10 | include/migration/vmstate.h | 10 ++++++++++ | 13 | hw/virtio/vhost-shadow-virtqueue.c | 29 +++++++++++++++++++++++++++++ |
11 | 1 file changed, 10 insertions(+) | 14 | hw/virtio/vhost-shadow-virtqueue.h | 9 +++++++++ |
15 | 2 files changed, 38 insertions(+) | ||
12 | 16 | ||
13 | diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h | 17 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
14 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/include/migration/vmstate.h | 19 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
16 | +++ b/include/migration/vmstate.h | 20 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
17 | @@ -XXX,XX +XXX,XX @@ extern const VMStateInfo vmstate_info_qlist; | 21 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) |
18 | .offset = vmstate_offset_pointer(_state, _field, _type), \ | ||
19 | } | 22 | } |
20 | 23 | ||
21 | +#define VMSTATE_VARRAY_UINT16_ALLOC(_field, _state, _field_num, _version, _info, _type) {\ | 24 | /** |
22 | + .name = (stringify(_field)), \ | 25 | + * Get the shadow vq vring address. |
23 | + .version_id = (_version), \ | 26 | + * @svq: Shadow virtqueue |
24 | + .num_offset = vmstate_offset_value(_state, _field_num, uint16_t),\ | 27 | + * @addr: Destination to store address |
25 | + .info = &(_info), \ | 28 | + */ |
26 | + .size = sizeof(_type), \ | 29 | +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, |
27 | + .flags = VMS_VARRAY_UINT16 | VMS_POINTER | VMS_ALLOC, \ | 30 | + struct vhost_vring_addr *addr) |
28 | + .offset = vmstate_offset_pointer(_state, _field, _type), \ | 31 | +{ |
32 | + addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc; | ||
33 | + addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail; | ||
34 | + addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used; | ||
29 | +} | 35 | +} |
30 | + | 36 | + |
31 | #define VMSTATE_VARRAY_UINT16_UNSAFE(_field, _state, _field_num, _version, _info, _type) {\ | 37 | +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq) |
32 | .name = (stringify(_field)), \ | 38 | +{ |
33 | .version_id = (_version), \ | 39 | + size_t desc_size = sizeof(vring_desc_t) * svq->vring.num; |
40 | + size_t avail_size = offsetof(vring_avail_t, ring) + | ||
41 | + sizeof(uint16_t) * svq->vring.num; | ||
42 | + | ||
43 | + return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size); | ||
44 | +} | ||
45 | + | ||
46 | +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq) | ||
47 | +{ | ||
48 | + size_t used_size = offsetof(vring_used_t, ring) + | ||
49 | + sizeof(vring_used_elem_t) * svq->vring.num; | ||
50 | + return ROUND_UP(used_size, qemu_real_host_page_size); | ||
51 | +} | ||
52 | + | ||
53 | +/** | ||
54 | * Set a new file descriptor for the guest to kick the SVQ and notify for avail | ||
55 | * | ||
56 | * @svq: The svq | ||
57 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
58 | index XXXXXXX..XXXXXXX 100644 | ||
59 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
60 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
61 | @@ -XXX,XX +XXX,XX @@ | ||
62 | #define VHOST_SHADOW_VIRTQUEUE_H | ||
63 | |||
64 | #include "qemu/event_notifier.h" | ||
65 | +#include "hw/virtio/virtio.h" | ||
66 | +#include "standard-headers/linux/vhost_types.h" | ||
67 | |||
68 | /* Shadow virtqueue to relay notifications */ | ||
69 | typedef struct VhostShadowVirtqueue { | ||
70 | + /* Shadow vring */ | ||
71 | + struct vring vring; | ||
72 | + | ||
73 | /* Shadow kick notifier, sent to vhost */ | ||
74 | EventNotifier hdev_kick; | ||
75 | /* Shadow call notifier, sent to vhost */ | ||
76 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
77 | |||
78 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
79 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
80 | +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
81 | + struct vhost_vring_addr *addr); | ||
82 | +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); | ||
83 | +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); | ||
84 | |||
85 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
86 | |||
34 | -- | 87 | -- |
35 | 2.5.0 | 88 | 2.7.4 |
36 | 89 | ||
37 | 90 | diff view generated by jsdifflib |
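The driver/device area sizes computed above follow directly from the virtio 1.x split-ring layout: 16 bytes per descriptor, a 4-byte avail header plus one u16 per entry, and a 4-byte used header plus one 8-byte used element per entry, each area rounded up to a host page. A standalone sketch of the same arithmetic, assuming a 4 KiB page standing in for qemu_real_host_page_size and a 256-entry ring:

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE 4096
#define ROUND_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
    unsigned num = 256;          /* ring entries                    */
    size_t desc  = 16 * num;     /* sizeof(vring_desc) per entry    */
    size_t avail = 4 + 2 * num;  /* flags + idx + u16 ring[num]     */
    size_t used  = 4 + 8 * num;  /* flags + idx + {id,len} ring[num]*/

    printf("driver area: %zu bytes\n", ROUND_UP(desc + avail, PAGE));
    printf("device area: %zu bytes\n", ROUND_UP(used, PAGE));
    return 0;
}
```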
1 | From: Yuri Benditovich <yuri.benditovich@daynix.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | If VIRTIO_NET_F_RSS is negotiated and RSS is enabled, process | 3 | First half of the buffer forwarding work, preparing the vhost-vdpa |
4 | incoming packets, calculate the packet's hash and place the | 4 | callbacks to offer SVQ. QEMU cannot enable it yet, so this is |
5 | packet into the respective RX virtqueue. | 5 | effectively dead code at the moment, but it helps to reduce |
6 | patch size. | ||
6 | 7 | ||
7 | Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> | 8 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
9 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 10 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
9 | --- | 11 | --- |
10 | hw/net/virtio-net.c | 88 +++++++++++++++++++++++++++++++++++++++++- | 12 | hw/virtio/vhost-vdpa.c | 48 +++++++++++++++++++++++++++++++++++++++++------- |
11 | include/hw/virtio/virtio-net.h | 1 + | 13 | 1 file changed, 41 insertions(+), 7 deletions(-) |
12 | 2 files changed, 87 insertions(+), 2 deletions(-) | ||
13 | 14 | ||
14 | diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c | 15 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
15 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/hw/net/virtio-net.c | 17 | --- a/hw/virtio/vhost-vdpa.c |
17 | +++ b/hw/net/virtio-net.c | 18 | +++ b/hw/virtio/vhost-vdpa.c |
18 | @@ -XXX,XX +XXX,XX @@ | 19 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, |
19 | #include "trace.h" | 20 | return ret; |
20 | #include "monitor/qdev.h" | 21 | } |
21 | #include "hw/pci/pci.h" | 22 | |
22 | +#include "net_rx_pkt.h" | 23 | +static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev, |
23 | 24 | + struct vhost_vring_state *ring) | |
24 | #define VIRTIO_NET_VM_VERSION 11 | ||
25 | |||
26 | @@ -XXX,XX +XXX,XX @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) | ||
27 | return 0; | ||
28 | } | ||
29 | |||
30 | +static uint8_t virtio_net_get_hash_type(bool isip4, | ||
31 | + bool isip6, | ||
32 | + bool isudp, | ||
33 | + bool istcp, | ||
34 | + uint32_t types) | ||
35 | +{ | 25 | +{ |
36 | + if (isip4) { | 26 | + trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); |
37 | + if (istcp && (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4)) { | 27 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); |
38 | + return NetPktRssIpV4Tcp; | ||
39 | + } | ||
40 | + if (isudp && (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4)) { | ||
41 | + return NetPktRssIpV4Udp; | ||
42 | + } | ||
43 | + if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) { | ||
44 | + return NetPktRssIpV4; | ||
45 | + } | ||
46 | + } else if (isip6) { | ||
47 | + uint32_t mask = VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | | ||
48 | + VIRTIO_NET_RSS_HASH_TYPE_TCPv6; | ||
49 | + | ||
50 | + if (istcp && (types & mask)) { | ||
51 | + return (types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) ? | ||
52 | + NetPktRssIpV6TcpEx : NetPktRssIpV6Tcp; | ||
53 | + } | ||
54 | + mask = VIRTIO_NET_RSS_HASH_TYPE_UDP_EX | VIRTIO_NET_RSS_HASH_TYPE_UDPv6; | ||
55 | + if (isudp && (types & mask)) { | ||
56 | + return (types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) ? | ||
57 | + NetPktRssIpV6UdpEx : NetPktRssIpV6Udp; | ||
58 | + } | ||
59 | + mask = VIRTIO_NET_RSS_HASH_TYPE_IP_EX | VIRTIO_NET_RSS_HASH_TYPE_IPv6; | ||
60 | + if (types & mask) { | ||
61 | + return (types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) ? | ||
62 | + NetPktRssIpV6Ex : NetPktRssIpV6; | ||
63 | + } | ||
64 | + } | ||
65 | + return 0xff; | ||
66 | +} | 28 | +} |
67 | + | 29 | + |
68 | +static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf, | 30 | static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, |
69 | + size_t size) | 31 | struct vhost_vring_file *file) |
32 | { | ||
33 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, | ||
34 | return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | ||
35 | } | ||
36 | |||
37 | +static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, | ||
38 | + struct vhost_vring_addr *addr) | ||
70 | +{ | 39 | +{ |
71 | + VirtIONet *n = qemu_get_nic_opaque(nc); | 40 | + trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, |
72 | + unsigned int index = nc->queue_index, new_index; | 41 | + addr->desc_user_addr, addr->used_user_addr, |
73 | + struct NetRxPkt *pkt = n->rx_pkt; | 42 | + addr->avail_user_addr, |
74 | + uint8_t net_hash_type; | 43 | + addr->log_guest_addr); |
75 | + uint32_t hash; | ||
76 | + bool isip4, isip6, isudp, istcp; | ||
77 | + | 44 | + |
78 | + net_rx_pkt_set_protocols(pkt, buf + n->host_hdr_len, | 45 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); |
79 | + size - n->host_hdr_len); | 46 | + |
80 | + net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp); | 47 | +} |
81 | + if (isip4 && (net_rx_pkt_get_ip4_info(pkt)->fragment)) { | 48 | + |
82 | + istcp = isudp = false; | 49 | /** |
83 | + } | 50 | * Set the shadow virtqueue descriptors to the device |
84 | + if (isip6 && (net_rx_pkt_get_ip6_info(pkt)->fragment)) { | 51 | * |
85 | + istcp = isudp = false; | 52 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, |
86 | + } | 53 | static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev, |
87 | + net_hash_type = virtio_net_get_hash_type(isip4, isip6, isudp, istcp, | 54 | struct vhost_vring_addr *addr) |
88 | + n->rss_data.hash_types); | 55 | { |
89 | + if (net_hash_type > NetPktRssIpV6UdpEx) { | 56 | - trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, |
90 | + return n->rss_data.default_queue; | 57 | - addr->desc_user_addr, addr->used_user_addr, |
58 | - addr->avail_user_addr, | ||
59 | - addr->log_guest_addr); | ||
60 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); | ||
61 | + struct vhost_vdpa *v = dev->opaque; | ||
62 | + | ||
63 | + if (v->shadow_vqs_enabled) { | ||
64 | + /* | ||
65 | + * Device vring addr was set at device start. SVQ base is handled by | ||
66 | + * VirtQueue code. | ||
67 | + */ | ||
68 | + return 0; | ||
91 | + } | 69 | + } |
92 | + | 70 | + |
93 | + hash = net_rx_pkt_calc_rss_hash(pkt, net_hash_type, n->rss_data.key); | 71 | + return vhost_vdpa_set_vring_dev_addr(dev, addr); |
94 | + new_index = hash & (n->rss_data.indirections_len - 1); | 72 | } |
95 | + new_index = n->rss_data.indirections_table[new_index]; | 73 | |
96 | + if (index == new_index) { | 74 | static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, |
97 | + return -1; | 75 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, |
98 | + } | 76 | static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, |
99 | + return new_index; | 77 | struct vhost_vring_state *ring) |
100 | +} | 78 | { |
79 | - trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); | ||
80 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); | ||
81 | + struct vhost_vdpa *v = dev->opaque; | ||
101 | + | 82 | + |
102 | static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, | 83 | + if (v->shadow_vqs_enabled) { |
103 | - size_t size) | 84 | + /* |
104 | + size_t size, bool no_rss) | 85 | + * Device vring base was set at device start. SVQ base is handled by |
105 | { | 86 | + * VirtQueue code. |
106 | VirtIONet *n = qemu_get_nic_opaque(nc); | 87 | + */ |
107 | VirtIONetQueue *q = virtio_net_get_subqueue(nc); | 88 | + return 0; |
108 | @@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, | ||
109 | return -1; | ||
110 | } | ||
111 | |||
112 | + if (!no_rss && n->rss_data.enabled) { | ||
113 | + int index = virtio_net_process_rss(nc, buf, size); | ||
114 | + if (index >= 0) { | ||
115 | + NetClientState *nc2 = qemu_get_subqueue(n->nic, index); | ||
116 | + return virtio_net_receive_rcu(nc2, buf, size, true); | ||
117 | + } | ||
118 | + } | 89 | + } |
119 | + | 90 | + |
120 | /* hdr_len refers to the header we supply to the guest */ | 91 | + return vhost_vdpa_set_dev_vring_base(dev, ring); |
121 | if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) { | ||
122 | return 0; | ||
123 | @@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_do_receive(NetClientState *nc, const uint8_t *buf, | ||
124 | { | ||
125 | RCU_READ_LOCK_GUARD(); | ||
126 | |||
127 | - return virtio_net_receive_rcu(nc, buf, size); | ||
128 | + return virtio_net_receive_rcu(nc, buf, size, false); | ||
129 | } | 92 | } |
130 | 93 | ||
131 | static void virtio_net_rsc_extract_unit4(VirtioNetRscChain *chain, | 94 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, |
132 | @@ -XXX,XX +XXX,XX @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) | ||
133 | |||
134 | QTAILQ_INIT(&n->rsc_chains); | ||
135 | n->qdev = dev; | ||
136 | + | ||
137 | + net_rx_pkt_init(&n->rx_pkt, false); | ||
138 | } | ||
139 | |||
140 | static void virtio_net_device_unrealize(DeviceState *dev) | ||
141 | @@ -XXX,XX +XXX,XX @@ static void virtio_net_device_unrealize(DeviceState *dev) | ||
142 | qemu_del_nic(n->nic); | ||
143 | virtio_net_rsc_cleanup(n); | ||
144 | g_free(n->rss_data.indirections_table); | ||
145 | + net_rx_pkt_uninit(n->rx_pkt); | ||
146 | virtio_cleanup(vdev); | ||
147 | } | ||
148 | |||
149 | diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h | ||
150 | index XXXXXXX..XXXXXXX 100644 | ||
151 | --- a/include/hw/virtio/virtio-net.h | ||
152 | +++ b/include/hw/virtio/virtio-net.h | ||
153 | @@ -XXX,XX +XXX,XX @@ struct VirtIONet { | ||
154 | DeviceListener primary_listener; | ||
155 | Notifier migration_state; | ||
156 | VirtioNetRssData rss_data; | ||
157 | + struct NetRxPkt *rx_pkt; | ||
158 | }; | ||
159 | |||
160 | void virtio_net_set_netclient_name(VirtIONet *n, const char *name, | ||
161 | -- | 95 | -- |
162 | 2.5.0 | 96 | 2.7.4 |
163 | 97 | ||
164 | 98 | diff view generated by jsdifflib |
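The RSS receive path above ends in a simple selection step: mask the packet hash into a power-of-two indirection table and read the destination queue out of it. A toy sketch of that final step, with a made-up hash value and table contents:

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t table[8] = { 0, 1, 2, 3, 0, 1, 2, 3 }; /* len must be 2^n */
    uint32_t hash = 0x9e3779b9;                     /* e.g. a Toeplitz hash */

    unsigned slot  = hash & (8 - 1);   /* mask, not modulo */
    unsigned queue = table[slot];

    printf("hash 0x%x -> slot %u -> rx queue %u\n", hash, slot, queue);
    return 0;
}
```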
1 | From: Lukas Straub <lukasstraub2@web.de> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | If the colo-compare object is removed before failover and a | 3 | Initial version of shadow virtqueue that actually forwards buffers. There |
4 | checkpoint happens, qemu crashes because it tries to lock | 4 | is no iommu support at the moment, and that will be addressed in future |
5 | the destroyed event_mtx in colo_notify_compares_event. | 5 | patches of this series. Since all vhost-vdpa devices use forced IOMMU, |
6 | 6 | this means that SVQ is not usable at this point of the series on any | |
7 | Fix this by introducing a new variable colo_compare_active, which | 7 | device. |
8 | tracks whether everything is initialized and is | 8 | |
9 | protected by a new mutex colo_compare_mutex. The new mutex | 9 | For simplicity it only supports modern devices that expect vrings |
10 | also protects against concurrent access of the net_compares | 10 | in little endian, with split ring and no event idx or indirect |
11 | list and makes sure that colo_notify_compares_event isn't | 11 | descriptors. Support for them will not be added in this series. |
12 | active while we destroy event_mtx and event_complete_cond. | 12 | |
13 | 13 | It reuses the VirtQueue code for the device part. The driver part is | |
14 | With this, it is also possible again to use colo without | 14 | based on Linux's virtio_ring driver, but with stripped functionality |
15 | colo-compare (periodic mode) and to use multiple colo-compare | 15 | and optimizations so it's easier to review. |
16 | for multiple network interfaces. | 16 | |
17 | 17 | However, forwarding buffers has some particularities: one of the most | |
18 | Signed-off-by: Lukas Straub <lukasstraub2@web.de> | 18 | unexpected ones is that a guest's buffer can span more than |
19 | Tested-by: Lukas Straub <lukasstraub2@web.de> | 19 | one descriptor in SVQ. While this is handled gracefully by qemu's |
20 | Reviewed-by: Zhang Chen <chen.zhang@intel.com> | 20 | emulated virtio devices, it may unexpectedly fill the SVQ queue. This |
21 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | 21 | patch also solves it by checking for this condition at both guest's |
22 | kicks and device's calls. The code may be more elegant in the future if | ||
23 | SVQ code runs in its own iocontext. | ||
24 | |||
25 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
26 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
22 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 27 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
23 | --- | 28 | --- |
24 | net/colo-compare.c | 35 +++++++++++++++++++++++++++++------ | 29 | hw/virtio/vhost-shadow-virtqueue.c | 352 ++++++++++++++++++++++++++++++++++++- |
25 | 1 file changed, 29 insertions(+), 6 deletions(-) | 30 | hw/virtio/vhost-shadow-virtqueue.h | 26 +++ |
26 | 31 | hw/virtio/vhost-vdpa.c | 155 +++++++++++++++- | |
27 | diff --git a/net/colo-compare.c b/net/colo-compare.c | 32 | 3 files changed, 522 insertions(+), 11 deletions(-) |
33 | |||
34 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
28 | index XXXXXXX..XXXXXXX 100644 | 35 | index XXXXXXX..XXXXXXX 100644 |
29 | --- a/net/colo-compare.c | 36 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
30 | +++ b/net/colo-compare.c | 37 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
31 | @@ -XXX,XX +XXX,XX @@ static NotifierList colo_compare_notifiers = | 38 | @@ -XXX,XX +XXX,XX @@ |
32 | #define REGULAR_PACKET_CHECK_MS 3000 | 39 | #include "qemu/error-report.h" |
33 | #define DEFAULT_TIME_OUT_MS 3000 | 40 | #include "qapi/error.h" |
34 | 41 | #include "qemu/main-loop.h" | |
35 | +static QemuMutex colo_compare_mutex; | 42 | +#include "qemu/log.h" |
36 | +static bool colo_compare_active; | 43 | +#include "qemu/memalign.h" |
37 | static QemuMutex event_mtx; | 44 | #include "linux-headers/linux/vhost.h" |
38 | static QemuCond event_complete_cond; | 45 | |
39 | static int event_unhandled_count; | 46 | /** |
40 | @@ -XXX,XX +XXX,XX @@ static void check_old_packet_regular(void *opaque) | 47 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp) |
41 | void colo_notify_compares_event(void *opaque, int event, Error **errp) | 48 | } |
49 | |||
50 | /** | ||
51 | - * Forward guest notifications. | ||
52 | + * Number of descriptors that the SVQ can make available from the guest. | ||
53 | + * | ||
54 | + * @svq: The svq | ||
55 | + */ | ||
56 | +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) | ||
57 | +{ | ||
58 | + return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); | ||
59 | +} | ||
60 | + | ||
61 | +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, | ||
62 | + const struct iovec *iovec, size_t num, | ||
63 | + bool more_descs, bool write) | ||
64 | +{ | ||
65 | + uint16_t i = svq->free_head, last = svq->free_head; | ||
66 | + unsigned n; | ||
67 | + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; | ||
68 | + vring_desc_t *descs = svq->vring.desc; | ||
69 | + | ||
70 | + if (num == 0) { | ||
71 | + return; | ||
72 | + } | ||
73 | + | ||
74 | + for (n = 0; n < num; n++) { | ||
75 | + if (more_descs || (n + 1 < num)) { | ||
76 | + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); | ||
77 | + } else { | ||
78 | + descs[i].flags = flags; | ||
79 | + } | ||
80 | + descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base); | ||
81 | + descs[i].len = cpu_to_le32(iovec[n].iov_len); | ||
82 | + | ||
83 | + last = i; | ||
84 | + i = cpu_to_le16(descs[i].next); | ||
85 | + } | ||
86 | + | ||
87 | + svq->free_head = le16_to_cpu(descs[last].next); | ||
88 | +} | ||
89 | + | ||
90 | +static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
91 | + VirtQueueElement *elem, unsigned *head) | ||
92 | +{ | ||
93 | + unsigned avail_idx; | ||
94 | + vring_avail_t *avail = svq->vring.avail; | ||
95 | + | ||
96 | + *head = svq->free_head; | ||
97 | + | ||
98 | + /* We need some descriptors here */ | ||
99 | + if (unlikely(!elem->out_num && !elem->in_num)) { | ||
100 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
101 | + "Guest provided element with no descriptors"); | ||
102 | + return false; | ||
103 | + } | ||
104 | + | ||
105 | + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0, | ||
106 | + false); | ||
107 | + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); | ||
108 | + | ||
109 | + /* | ||
110 | + * Put the entry in the available array (but don't update avail->idx until | ||
111 | + * they do sync). | ||
112 | + */ | ||
113 | + avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1); | ||
114 | + avail->ring[avail_idx] = cpu_to_le16(*head); | ||
115 | + svq->shadow_avail_idx++; | ||
116 | + | ||
117 | + /* Update the avail index after write the descriptor */ | ||
118 | + smp_wmb(); | ||
119 | + avail->idx = cpu_to_le16(svq->shadow_avail_idx); | ||
120 | + | ||
121 | + return true; | ||
122 | +} | ||
123 | + | ||
124 | +static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) | ||
125 | +{ | ||
126 | + unsigned qemu_head; | ||
127 | + bool ok = vhost_svq_add_split(svq, elem, &qemu_head); | ||
128 | + if (unlikely(!ok)) { | ||
129 | + return false; | ||
130 | + } | ||
131 | + | ||
132 | + svq->ring_id_maps[qemu_head] = elem; | ||
133 | + return true; | ||
134 | +} | ||
135 | + | ||
136 | +static void vhost_svq_kick(VhostShadowVirtqueue *svq) | ||
137 | +{ | ||
138 | + /* | ||
139 | + * We need to expose the available array entries before checking the used | ||
140 | + * flags | ||
141 | + */ | ||
142 | + smp_mb(); | ||
143 | + if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) { | ||
144 | + return; | ||
145 | + } | ||
146 | + | ||
147 | + event_notifier_set(&svq->hdev_kick); | ||
148 | +} | ||
149 | + | ||
150 | +/** | ||
151 | + * Forward available buffers. | ||
152 | + * | ||
153 | + * @svq: Shadow VirtQueue | ||
154 | + * | ||
155 | + * Note that this function does not guarantee that all guest's available | ||
156 | + * buffers are available to the device in SVQ avail ring. The guest may have | ||
157 | + * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in | ||
158 | + * qemu vaddr. | ||
159 | + * | ||
160 | + * If that happens, guest's kick notifications will be disabled until the | ||
161 | + * device uses some buffers. | ||
162 | + */ | ||
163 | +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) | ||
164 | +{ | ||
165 | + /* Clear event notifier */ | ||
166 | + event_notifier_test_and_clear(&svq->svq_kick); | ||
167 | + | ||
168 | + /* Forward to the device as many available buffers as possible */ | ||
169 | + do { | ||
170 | + virtio_queue_set_notification(svq->vq, false); | ||
171 | + | ||
172 | + while (true) { | ||
173 | + VirtQueueElement *elem; | ||
174 | + bool ok; | ||
175 | + | ||
176 | + if (svq->next_guest_avail_elem) { | ||
177 | + elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
178 | + } else { | ||
179 | + elem = virtqueue_pop(svq->vq, sizeof(*elem)); | ||
180 | + } | ||
181 | + | ||
182 | + if (!elem) { | ||
183 | + break; | ||
184 | + } | ||
185 | + | ||
186 | + if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) { | ||
187 | + /* | ||
188 | + * This condition is possible since a contiguous buffer in GPA | ||
189 | + * does not imply a contiguous buffer in qemu's VA | ||
190 | + * scatter-gather segments. If that happens, the buffer exposed | ||
191 | + * to the device needs to be a chain of descriptors at this | ||
192 | + * moment. | ||
193 | + * | ||
194 | + * SVQ cannot hold more available buffers if we are here: | ||
195 | + * queue the current guest descriptor and ignore further kicks | ||
196 | + * until some elements are used. | ||
197 | + */ | ||
198 | + svq->next_guest_avail_elem = elem; | ||
199 | + return; | ||
200 | + } | ||
201 | + | ||
202 | + ok = vhost_svq_add(svq, elem); | ||
203 | + if (unlikely(!ok)) { | ||
204 | + /* VQ is broken, just return and ignore any other kicks */ | ||
205 | + return; | ||
206 | + } | ||
207 | + vhost_svq_kick(svq); | ||
208 | + } | ||
209 | + | ||
210 | + virtio_queue_set_notification(svq->vq, true); | ||
211 | + } while (!virtio_queue_empty(svq->vq)); | ||
212 | +} | ||
213 | + | ||
214 | +/** | ||
215 | + * Handle guest's kick. | ||
216 | * | ||
217 | * @n: guest kick event notifier, the one that guest set to notify svq. | ||
218 | */ | ||
219 | -static void vhost_handle_guest_kick(EventNotifier *n) | ||
220 | +static void vhost_handle_guest_kick_notifier(EventNotifier *n) | ||
42 | { | 221 | { |
43 | CompareState *s; | 222 | VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick); |
44 | + qemu_mutex_lock(&colo_compare_mutex); | 223 | event_notifier_test_and_clear(n); |
45 | + | 224 | - event_notifier_set(&svq->hdev_kick); |
46 | + if (!colo_compare_active) { | 225 | + vhost_handle_guest_kick(svq); |
47 | + qemu_mutex_unlock(&colo_compare_mutex); | 226 | +} |
227 | + | ||
228 | +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq) | ||
229 | +{ | ||
230 | + if (svq->last_used_idx != svq->shadow_used_idx) { | ||
231 | + return true; | ||
232 | + } | ||
233 | + | ||
234 | + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); | ||
235 | + | ||
236 | + return svq->last_used_idx != svq->shadow_used_idx; | ||
237 | } | ||
238 | |||
239 | /** | ||
240 | - * Forward vhost notifications | ||
241 | + * Enable vhost device calls after disable them. | ||
242 | + * | ||
243 | + * @svq: The svq | ||
244 | + * | ||
245 | + * It returns false if there are pending used buffers from the vhost device, | ||
246 | + * avoiding the possible races between SVQ checking for more work and enabling | ||
247 | + * callbacks. True if SVQ used vring has no more pending buffers. | ||
248 | + */ | ||
249 | +static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq) | ||
250 | +{ | ||
251 | + svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); | ||
252 | + /* Make sure the flag is written before the read of used_idx */ | ||
253 | + smp_mb(); | ||
254 | + return !vhost_svq_more_used(svq); | ||
255 | +} | ||
256 | + | ||
257 | +static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq) | ||
258 | +{ | ||
259 | + svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); | ||
260 | +} | ||
261 | + | ||
262 | +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, | ||
263 | + uint32_t *len) | ||
264 | +{ | ||
265 | + vring_desc_t *descs = svq->vring.desc; | ||
266 | + const vring_used_t *used = svq->vring.used; | ||
267 | + vring_used_elem_t used_elem; | ||
268 | + uint16_t last_used; | ||
269 | + | ||
270 | + if (!vhost_svq_more_used(svq)) { | ||
271 | + return NULL; | ||
272 | + } | ||
273 | + | ||
274 | + /* Only get used array entries after they have been exposed by dev */ | ||
275 | + smp_rmb(); | ||
276 | + last_used = svq->last_used_idx & (svq->vring.num - 1); | ||
277 | + used_elem.id = le32_to_cpu(used->ring[last_used].id); | ||
278 | + used_elem.len = le32_to_cpu(used->ring[last_used].len); | ||
279 | + | ||
280 | + svq->last_used_idx++; | ||
281 | + if (unlikely(used_elem.id >= svq->vring.num)) { | ||
282 | + qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used", | ||
283 | + svq->vdev->name, used_elem.id); | ||
284 | + return NULL; | ||
285 | + } | ||
286 | + | ||
287 | + if (unlikely(!svq->ring_id_maps[used_elem.id])) { | ||
288 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
289 | + "Device %s says index %u is used, but it was not available", | ||
290 | + svq->vdev->name, used_elem.id); | ||
291 | + return NULL; | ||
292 | + } | ||
293 | + | ||
294 | + descs[used_elem.id].next = svq->free_head; | ||
295 | + svq->free_head = used_elem.id; | ||
296 | + | ||
297 | + *len = used_elem.len; | ||
298 | + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); | ||
299 | +} | ||
300 | + | ||
301 | +static void vhost_svq_flush(VhostShadowVirtqueue *svq, | ||
302 | + bool check_for_avail_queue) | ||
303 | +{ | ||
304 | + VirtQueue *vq = svq->vq; | ||
305 | + | ||
306 | + /* Forward as many used buffers as possible. */ | ||
307 | + do { | ||
308 | + unsigned i = 0; | ||
309 | + | ||
310 | + vhost_svq_disable_notification(svq); | ||
311 | + while (true) { | ||
312 | + uint32_t len; | ||
313 | + g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len); | ||
314 | + if (!elem) { | ||
315 | + break; | ||
316 | + } | ||
317 | + | ||
318 | + if (unlikely(i >= svq->vring.num)) { | ||
319 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
320 | + "More than %u used buffers obtained in a %u size SVQ", | ||
321 | + i, svq->vring.num); | ||
322 | + virtqueue_fill(vq, elem, len, i); | ||
323 | + virtqueue_flush(vq, i); | ||
324 | + return; | ||
325 | + } | ||
326 | + virtqueue_fill(vq, elem, len, i++); | ||
327 | + } | ||
328 | + | ||
329 | + virtqueue_flush(vq, i); | ||
330 | + event_notifier_set(&svq->svq_call); | ||
331 | + | ||
332 | + if (check_for_avail_queue && svq->next_guest_avail_elem) { | ||
333 | + /* | ||
334 | + * Avail ring was full when vhost_svq_flush was called, so it's a | ||
335 | + * good moment to make more descriptors available if possible. | ||
336 | + */ | ||
337 | + vhost_handle_guest_kick(svq); | ||
338 | + } | ||
339 | + } while (!vhost_svq_enable_notification(svq)); | ||
340 | +} | ||
341 | + | ||
342 | +/** | ||
343 | + * Forward used buffers. | ||
344 | * | ||
345 | * @n: hdev call event notifier, the one that device set to notify svq. | ||
346 | + * | ||
347 | + * Note that we are not making any buffers available in the loop, there is no | ||
348 | + * way that it runs more than virtqueue size times. | ||
349 | */ | ||
350 | static void vhost_svq_handle_call(EventNotifier *n) | ||
351 | { | ||
352 | VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, | ||
353 | hdev_call); | ||
354 | event_notifier_test_and_clear(n); | ||
355 | - event_notifier_set(&svq->svq_call); | ||
356 | + vhost_svq_flush(svq, true); | ||
357 | } | ||
358 | |||
359 | /** | ||
360 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
361 | if (poll_start) { | ||
362 | event_notifier_init_fd(svq_kick, svq_kick_fd); | ||
363 | event_notifier_set(svq_kick); | ||
364 | - event_notifier_set_handler(svq_kick, vhost_handle_guest_kick); | ||
365 | + event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier); | ||
366 | + } | ||
367 | +} | ||
368 | + | ||
369 | +/** | ||
370 | + * Start the shadow virtqueue operation. | ||
371 | + * | ||
372 | + * @svq: Shadow Virtqueue | ||
373 | + * @vdev: VirtIO device | ||
374 | + * @vq: Virtqueue to shadow | ||
375 | + */ | ||
376 | +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
377 | + VirtQueue *vq) | ||
378 | +{ | ||
379 | + size_t desc_size, driver_size, device_size; | ||
380 | + | ||
381 | + svq->next_guest_avail_elem = NULL; | ||
382 | + svq->shadow_avail_idx = 0; | ||
383 | + svq->shadow_used_idx = 0; | ||
384 | + svq->last_used_idx = 0; | ||
385 | + svq->vdev = vdev; | ||
386 | + svq->vq = vq; | ||
387 | + | ||
388 | + svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq)); | ||
389 | + driver_size = vhost_svq_driver_area_size(svq); | ||
390 | + device_size = vhost_svq_device_area_size(svq); | ||
391 | + svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size); | ||
392 | + desc_size = sizeof(vring_desc_t) * svq->vring.num; | ||
393 | + svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size); | ||
394 | + memset(svq->vring.desc, 0, driver_size); | ||
395 | + svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size); | ||
396 | + memset(svq->vring.used, 0, device_size); | ||
397 | + svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num); | ||
398 | + for (unsigned i = 0; i < svq->vring.num - 1; i++) { | ||
399 | + svq->vring.desc[i].next = cpu_to_le16(i + 1); | ||
400 | } | ||
401 | } | ||
402 | |||
403 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
404 | void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
405 | { | ||
406 | event_notifier_set_handler(&svq->svq_kick, NULL); | ||
407 | + g_autofree VirtQueueElement *next_avail_elem = NULL; | ||
408 | + | ||
409 | + if (!svq->vq) { | ||
48 | + return; | 410 | + return; |
49 | + } | 411 | + } |
50 | 412 | + | |
51 | qemu_mutex_lock(&event_mtx); | 413 | + /* Send all pending used descriptors to guest */ |
52 | QTAILQ_FOREACH(s, &net_compares, next) { | 414 | + vhost_svq_flush(svq, false); |
53 | @@ -XXX,XX +XXX,XX @@ void colo_notify_compares_event(void *opaque, int event, Error **errp) | 415 | + |
416 | + for (unsigned i = 0; i < svq->vring.num; ++i) { | ||
417 | + g_autofree VirtQueueElement *elem = NULL; | ||
418 | + elem = g_steal_pointer(&svq->ring_id_maps[i]); | ||
419 | + if (elem) { | ||
420 | + virtqueue_detach_element(svq->vq, elem, 0); | ||
421 | + } | ||
422 | + } | ||
423 | + | ||
424 | + next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
425 | + if (next_avail_elem) { | ||
426 | + virtqueue_detach_element(svq->vq, next_avail_elem, 0); | ||
427 | + } | ||
428 | + svq->vq = NULL; | ||
429 | + g_free(svq->ring_id_maps); | ||
430 | + qemu_vfree(svq->vring.desc); | ||
431 | + qemu_vfree(svq->vring.used); | ||
432 | } | ||
433 | |||
434 | /** | ||
435 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
436 | index XXXXXXX..XXXXXXX 100644 | ||
437 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
438 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
439 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
440 | |||
441 | /* Guest's call notifier, where the SVQ calls guest. */ | ||
442 | EventNotifier svq_call; | ||
443 | + | ||
444 | + /* Virtio queue shadowing */ | ||
445 | + VirtQueue *vq; | ||
446 | + | ||
447 | + /* Virtio device */ | ||
448 | + VirtIODevice *vdev; | ||
449 | + | ||
450 | + /* Map for use the guest's descriptors */ | ||
451 | + VirtQueueElement **ring_id_maps; | ||
452 | + | ||
453 | + /* Next VirtQueue element that guest made available */ | ||
454 | + VirtQueueElement *next_guest_avail_elem; | ||
455 | + | ||
456 | + /* Next head to expose to the device */ | ||
457 | + uint16_t shadow_avail_idx; | ||
458 | + | ||
459 | + /* Next free descriptor */ | ||
460 | + uint16_t free_head; | ||
461 | + | ||
462 | + /* Last seen used idx */ | ||
463 | + uint16_t shadow_used_idx; | ||
464 | + | ||
465 | + /* Next head to consume from the device */ | ||
466 | + uint16_t last_used_idx; | ||
467 | } VhostShadowVirtqueue; | ||
468 | |||
469 | bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
470 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
471 | size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); | ||
472 | size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); | ||
473 | |||
474 | +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
475 | + VirtQueue *vq); | ||
476 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
477 | |||
478 | VhostShadowVirtqueue *vhost_svq_new(void); | ||
479 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
480 | index XXXXXXX..XXXXXXX 100644 | ||
481 | --- a/hw/virtio/vhost-vdpa.c | ||
482 | +++ b/hw/virtio/vhost-vdpa.c | ||
483 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, | ||
484 | * Note that this function does not rewind kick file descriptor if cannot set | ||
485 | * call one. | ||
486 | */ | ||
487 | -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
488 | - VhostShadowVirtqueue *svq, unsigned idx, | ||
489 | - Error **errp) | ||
490 | +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, | ||
491 | + VhostShadowVirtqueue *svq, unsigned idx, | ||
492 | + Error **errp) | ||
493 | { | ||
494 | struct vhost_vring_file file = { | ||
495 | .index = dev->vq_index + idx, | ||
496 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
497 | r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
498 | if (unlikely(r != 0)) { | ||
499 | error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
500 | - return false; | ||
501 | + return r; | ||
54 | } | 502 | } |
55 | 503 | ||
56 | qemu_mutex_unlock(&event_mtx); | 504 | event_notifier = &svq->hdev_call; |
57 | + qemu_mutex_unlock(&colo_compare_mutex); | 505 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, |
506 | error_setg_errno(errp, -r, "Can't set device call fd"); | ||
507 | } | ||
508 | |||
509 | + return r; | ||
510 | +} | ||
511 | + | ||
512 | +/** | ||
513 | + * Unmap a SVQ area in the device | ||
514 | + */ | ||
515 | +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, | ||
516 | + hwaddr size) | ||
517 | +{ | ||
518 | + int r; | ||
519 | + | ||
520 | + size = ROUND_UP(size, qemu_real_host_page_size); | ||
521 | + r = vhost_vdpa_dma_unmap(v, iova, size); | ||
522 | + return r == 0; | ||
523 | +} | ||
524 | + | ||
525 | +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, | ||
526 | + const VhostShadowVirtqueue *svq) | ||
527 | +{ | ||
528 | + struct vhost_vdpa *v = dev->opaque; | ||
529 | + struct vhost_vring_addr svq_addr; | ||
530 | + size_t device_size = vhost_svq_device_area_size(svq); | ||
531 | + size_t driver_size = vhost_svq_driver_area_size(svq); | ||
532 | + bool ok; | ||
533 | + | ||
534 | + vhost_svq_get_vring_addr(svq, &svq_addr); | ||
535 | + | ||
536 | + ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size); | ||
537 | + if (unlikely(!ok)) { | ||
538 | + return false; | ||
539 | + } | ||
540 | + | ||
541 | + return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size); | ||
542 | +} | ||
543 | + | ||
544 | +/** | ||
545 | + * Map the shadow virtqueue rings in the device | ||
546 | + * | ||
547 | + * @dev: The vhost device | ||
548 | + * @svq: The shadow virtqueue | ||
549 | + * @addr: Assigned IOVA addresses | ||
550 | + * @errp: Error pointer | ||
551 | + */ | ||
552 | +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, | ||
553 | + const VhostShadowVirtqueue *svq, | ||
554 | + struct vhost_vring_addr *addr, | ||
555 | + Error **errp) | ||
556 | +{ | ||
557 | + struct vhost_vdpa *v = dev->opaque; | ||
558 | + size_t device_size = vhost_svq_device_area_size(svq); | ||
559 | + size_t driver_size = vhost_svq_driver_area_size(svq); | ||
560 | + int r; | ||
561 | + | ||
562 | + ERRP_GUARD(); | ||
563 | + vhost_svq_get_vring_addr(svq, addr); | ||
564 | + | ||
565 | + r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, | ||
566 | + (void *)(uintptr_t)addr->desc_user_addr, true); | ||
567 | + if (unlikely(r != 0)) { | ||
568 | + error_setg_errno(errp, -r, "Cannot create vq driver region: "); | ||
569 | + return false; | ||
570 | + } | ||
571 | + | ||
572 | + r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, | ||
573 | + (void *)(intptr_t)addr->used_user_addr, false); | ||
574 | + if (unlikely(r != 0)) { | ||
575 | + error_setg_errno(errp, -r, "Cannot create vq device region: "); | ||
576 | + } | ||
577 | + | ||
578 | + return r == 0; | ||
579 | +} | ||
580 | + | ||
581 | +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
582 | + VhostShadowVirtqueue *svq, unsigned idx, | ||
583 | + Error **errp) | ||
584 | +{ | ||
585 | + uint16_t vq_index = dev->vq_index + idx; | ||
586 | + struct vhost_vring_state s = { | ||
587 | + .index = vq_index, | ||
588 | + }; | ||
589 | + int r; | ||
590 | + | ||
591 | + r = vhost_vdpa_set_dev_vring_base(dev, &s); | ||
592 | + if (unlikely(r)) { | ||
593 | + error_setg_errno(errp, -r, "Cannot set vring base"); | ||
594 | + return false; | ||
595 | + } | ||
596 | + | ||
597 | + r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp); | ||
598 | return r == 0; | ||
58 | } | 599 | } |
59 | 600 | ||
60 | static void colo_compare_timer_init(CompareState *s) | 601 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) |
61 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) | ||
62 | s->vnet_hdr); | ||
63 | } | 602 | } |
64 | 603 | ||
65 | + qemu_mutex_lock(&colo_compare_mutex); | 604 | for (i = 0; i < v->shadow_vqs->len; ++i) { |
66 | + if (!colo_compare_active) { | 605 | + VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i); |
67 | + qemu_mutex_init(&event_mtx); | 606 | VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); |
68 | + qemu_cond_init(&event_complete_cond); | 607 | + struct vhost_vring_addr addr = { |
69 | + colo_compare_active = true; | 608 | + .index = i, |
70 | + } | 609 | + }; |
71 | QTAILQ_INSERT_TAIL(&net_compares, s, next); | 610 | + int r; |
72 | + qemu_mutex_unlock(&colo_compare_mutex); | 611 | bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); |
73 | 612 | if (unlikely(!ok)) { | |
74 | s->out_sendco.s = s; | 613 | - error_reportf_err(err, "Cannot setup SVQ %u: ", i); |
75 | s->out_sendco.chr = &s->chr_out; | 614 | + goto err; |
76 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) | 615 | + } |
77 | 616 | + | |
78 | g_queue_init(&s->conn_list); | 617 | + vhost_svq_start(svq, dev->vdev, vq); |
79 | 618 | + ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err); | |
80 | - qemu_mutex_init(&event_mtx); | 619 | + if (unlikely(!ok)) { |
81 | - qemu_cond_init(&event_complete_cond); | 620 | + goto err_map; |
82 | - | 621 | + } |
83 | s->connection_track_table = g_hash_table_new_full(connection_key_hash, | 622 | + |
84 | connection_key_equal, | 623 | + /* Override vring GPA set by vhost subsystem */ |
85 | g_free, | 624 | + r = vhost_vdpa_set_vring_dev_addr(dev, &addr); |
86 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_finalize(Object *obj) | 625 | + if (unlikely(r != 0)) { |
87 | 626 | + error_setg_errno(&err, -r, "Cannot set device address"); | |
88 | qemu_bh_delete(s->event_bh); | 627 | + goto err_set_addr; |
89 | 628 | + } | |
90 | + qemu_mutex_lock(&colo_compare_mutex); | 629 | + } |
91 | QTAILQ_FOREACH(tmp, &net_compares, next) { | 630 | + |
92 | if (tmp == s) { | 631 | + return true; |
93 | QTAILQ_REMOVE(&net_compares, s, next); | 632 | + |
94 | break; | 633 | +err_set_addr: |
634 | + vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i)); | ||
635 | + | ||
636 | +err_map: | ||
637 | + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i)); | ||
638 | + | ||
639 | +err: | ||
640 | + error_reportf_err(err, "Cannot setup SVQ %u: ", i); | ||
641 | + for (unsigned j = 0; j < i; ++j) { | ||
642 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j); | ||
643 | + vhost_vdpa_svq_unmap_rings(dev, svq); | ||
644 | + vhost_svq_stop(svq); | ||
645 | + } | ||
646 | + | ||
647 | + return false; | ||
648 | +} | ||
649 | + | ||
650 | +static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev) | ||
651 | +{ | ||
652 | + struct vhost_vdpa *v = dev->opaque; | ||
653 | + | ||
654 | + if (!v->shadow_vqs) { | ||
655 | + return true; | ||
656 | + } | ||
657 | + | ||
658 | + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { | ||
659 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
660 | + bool ok = vhost_vdpa_svq_unmap_rings(dev, svq); | ||
661 | + if (unlikely(!ok)) { | ||
662 | return false; | ||
95 | } | 663 | } |
96 | } | 664 | } |
97 | + if (QTAILQ_EMPTY(&net_compares)) { | 665 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) |
98 | + colo_compare_active = false; | 666 | } |
99 | + qemu_mutex_destroy(&event_mtx); | 667 | vhost_vdpa_set_vring_ready(dev); |
100 | + qemu_cond_destroy(&event_complete_cond); | 668 | } else { |
101 | + } | 669 | + ok = vhost_vdpa_svqs_stop(dev); |
102 | + qemu_mutex_unlock(&colo_compare_mutex); | 670 | + if (unlikely(!ok)) { |
103 | 671 | + return -1; | |
104 | AioContext *ctx = iothread_get_aio_context(s->iothread); | 672 | + } |
105 | aio_context_acquire(ctx); | 673 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); |
106 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_finalize(Object *obj) | ||
107 | object_unref(OBJECT(s->iothread)); | ||
108 | } | 674 | } |
109 | 675 | ||
110 | - qemu_mutex_destroy(&event_mtx); | ||
111 | - qemu_cond_destroy(&event_complete_cond); | ||
112 | - | ||
113 | g_free(s->pri_indev); | ||
114 | g_free(s->sec_indev); | ||
115 | g_free(s->outdev); | ||
116 | g_free(s->notify_dev); | ||
117 | } | ||
118 | |||
119 | +static void __attribute__((__constructor__)) colo_compare_init_globals(void) | ||
120 | +{ | ||
121 | + colo_compare_active = false; | ||
122 | + qemu_mutex_init(&colo_compare_mutex); | ||
123 | +} | ||
124 | + | ||
125 | static const TypeInfo colo_compare_info = { | ||
126 | .name = TYPE_COLO_COMPARE, | ||
127 | .parent = TYPE_OBJECT, | ||
128 | -- | 676 | -- |
129 | 2.5.0 | 677 | 2.7.4 |
130 | 678 | ||
131 | 679 | diff view generated by jsdifflib |
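
The error paths above follow the usual partial-initialization unwind: a failure at queue i tears down only what that queue completed (unmap, then stop), and the trailing loop releases queues 0..i-1 that were fully started. A minimal sketch of the pattern, with hypothetical setup_one()/teardown_one() helpers standing in for the real map/start and unmap/stop steps:

    #include <stdbool.h>

    bool setup_one(unsigned i);    /* stand-in for set_fds + start + map */
    void teardown_one(unsigned i); /* stand-in for unmap_rings + stop */

    static bool start_all(unsigned n)
    {
        unsigned i;

        for (i = 0; i < n; ++i) {
            if (!setup_one(i)) {
                goto err;
            }
        }
        return true;

    err:
        /* Only queues [0, i) completed setup; unwind them in order */
        for (unsigned j = 0; j < i; ++j) {
            teardown_one(j);
        }
        return false;
    }
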
1 | From: Thomas Huth <thuth@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Now that the "name" parameter is gone, there is hardly any difference | 3 | This iova tree function looks for a hole in the allocated
4 | between NetLegacy and Netdev anymore, so we can drop NetLegacy and always | 4 | regions and returns a totally new translation for a given translated
5 | use Netdev to simplify the code quite a bit. | 5 | address. |
6 | 6 | ||
7 | The only two differences that were really left between Netdev and NetLegacy: | 7 | Its usage is mainly to allow devices to access qemu address space,
8 | 8 | remapping the guest's address space into a new iova space where qemu can add chunks of
9 | 1) NetLegacy does not allow a "hubport" type. We can continue to block | 9 | addresses. |
10 | this with a simple check in net_client_init1() for this type. | 10 | |
11 | 11 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | |
12 | 2) The "id" parameter was optional in NetLegacy (and an internal id | 12 | Reviewed-by: Peter Xu <peterx@redhat.com> |
13 | was chosen via assign_name() during initialization), but it is mandatory | 13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
14 | for Netdev. To avoid that the visitor code bails out here, we have to | ||
15 | add an internal id to the QemuOpts already earlier now. | ||
16 | |||
17 | Signed-off-by: Thomas Huth <thuth@redhat.com> | ||
18 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
19 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 14 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
20 | --- | 15 | --- |
21 | net/net.c | 77 ++++++++++------------------------------------------------- | 16 | include/qemu/iova-tree.h | 18 +++++++ |
22 | qapi/net.json | 46 ----------------------------------- | 17 | util/iova-tree.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++ |
23 | 2 files changed, 13 insertions(+), 110 deletions(-) | 18 | 2 files changed, 154 insertions(+) |
24 | 19 | ||
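
As a usage sketch of the interface this patch introduces (the caller, the buffer and the (0, HWADDR_MAX) limits are hypothetical):

    #include "qemu/osdep.h"
    #include "qemu/iova-tree.h"

    /* Ask the tree for a free IOVA range covering 'size' bytes at 'host' */
    static int map_buffer(IOVATree *tree, void *host, size_t size, hwaddr *iova)
    {
        DMAMap map = {
            .translated_addr = (hwaddr)(uintptr_t)host,
            .size = size - 1,             /* DMAMap sizes are inclusive */
        };
        int r = iova_tree_alloc_map(tree, &map, 0, HWADDR_MAX);

        if (r != IOVA_OK) {
            return r;                     /* IOVA_ERR_INVALID or IOVA_ERR_NOMEM */
        }
        *iova = map.iova;                 /* assigned by the allocator */
        return IOVA_OK;
    }
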
25 | diff --git a/net/net.c b/net/net.c | 20 | diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h |
26 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
27 | --- a/net/net.c | 22 | --- a/include/qemu/iova-tree.h |
28 | +++ b/net/net.c | 23 | +++ b/include/qemu/iova-tree.h |
29 | @@ -XXX,XX +XXX,XX @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])( | 24 | @@ -XXX,XX +XXX,XX @@ |
25 | #define IOVA_OK (0) | ||
26 | #define IOVA_ERR_INVALID (-1) /* Invalid parameters */ | ||
27 | #define IOVA_ERR_OVERLAP (-2) /* IOVA range overlapped */ | ||
28 | +#define IOVA_ERR_NOMEM (-3) /* Cannot allocate */ | ||
29 | |||
30 | typedef struct IOVATree IOVATree; | ||
31 | typedef struct DMAMap { | ||
32 | @@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova); | ||
33 | void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator); | ||
34 | |||
35 | /** | ||
36 | + * iova_tree_alloc_map: | ||
37 | + * | ||
38 | + * @tree: the iova tree to allocate from | ||
39 | + * @map: the new map (as translated addr & size) to allocate in the iova region | ||
40 | + * @iova_begin: the minimum address of the allocation | ||
41 | + * @iova_end: the maximum addressable direction of the allocation | ||
42 | + * | ||
43 | + * Allocates a new region of a given size, between iova_min and iova_max. | ||
44 | + * | ||
45 | + * Return: Same as iova_tree_insert, but cannot overlap and can return error if | ||
46 | + * iova tree is out of free contiguous range. The caller gets the assigned iova | ||
47 | + * in map->iova. | ||
48 | + */ | ||
49 | +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin, | ||
50 | + hwaddr iova_end); | ||
51 | + | ||
52 | +/** | ||
53 | * iova_tree_destroy: | ||
54 | * | ||
55 | * @tree: the iova tree to destroy | ||
56 | diff --git a/util/iova-tree.c b/util/iova-tree.c | ||
57 | index XXXXXXX..XXXXXXX 100644 | ||
58 | --- a/util/iova-tree.c | ||
59 | +++ b/util/iova-tree.c | ||
60 | @@ -XXX,XX +XXX,XX @@ struct IOVATree { | ||
61 | GTree *tree; | ||
30 | }; | 62 | }; |
31 | 63 | ||
32 | 64 | +/* Args to pass to iova_tree_alloc foreach function. */ | |
33 | -static int net_client_init1(const void *object, bool is_netdev, Error **errp) | 65 | +struct IOVATreeAllocArgs { |
34 | +static int net_client_init1(const Netdev *netdev, bool is_netdev, Error **errp) | 66 | + /* Size of the desired allocation */ |
67 | + size_t new_size; | ||
68 | + | ||
69 | + /* The minimum address allowed in the allocation */ | ||
70 | + hwaddr iova_begin; | ||
71 | + | ||
72 | + /* Map at the left of the hole, can be NULL if "this" is first one */ | ||
73 | + const DMAMap *prev; | ||
74 | + | ||
75 | + /* Map at the right of the hole, can be NULL if "prev" is the last one */ | ||
76 | + const DMAMap *this; | ||
77 | + | ||
78 | + /* If found, we fill in the IOVA here */ | ||
79 | + hwaddr iova_result; | ||
80 | + | ||
81 | + /* Whether have we found a valid IOVA */ | ||
82 | + bool iova_found; | ||
83 | +}; | ||
84 | + | ||
85 | +/** | ||
86 | + * Iterate args to the next hole | ||
87 | + * | ||
88 | + * @args: The alloc arguments | ||
89 | + * @next: The next mapping in the tree. Can be NULL to signal the last one | ||
90 | + */ | ||
91 | +static void iova_tree_alloc_args_iterate(struct IOVATreeAllocArgs *args, | ||
92 | + const DMAMap *next) | ||
93 | +{ | ||
94 | + args->prev = args->this; | ||
95 | + args->this = next; | ||
96 | +} | ||
97 | + | ||
98 | static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data) | ||
35 | { | 99 | { |
36 | - Netdev legacy = {0}; | 100 | const DMAMap *m1 = a, *m2 = b; |
37 | - const Netdev *netdev; | 101 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map) |
38 | NetClientState *peer = NULL; | 102 | return IOVA_OK; |
39 | 103 | } | |
40 | if (is_netdev) { | 104 | |
41 | - netdev = object; | 105 | +/** |
42 | - | 106 | + * Try to find an unallocated IOVA range between prev and this elements. |
43 | if (netdev->type == NET_CLIENT_DRIVER_NIC || | 107 | + * |
44 | !net_client_init_fun[netdev->type]) { | 108 | + * @args: Arguments to allocation |
45 | error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "type", | 109 | + * |
46 | @@ -XXX,XX +XXX,XX @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp) | 110 | + * Cases: |
47 | return -1; | 111 | + * |
48 | } | 112 | + * (1) !prev, !this: No entries allocated, always succeed |
49 | } else { | 113 | + * |
50 | - const NetLegacy *net = object; | 114 | + * (2) !prev, this: We're iterating at the 1st element. |
51 | - const NetLegacyOptions *opts = net->opts; | 115 | + * |
52 | - legacy.id = net->id; | 116 | + * (3) prev, !this: We're iterating at the last element. |
53 | - netdev = &legacy; | 117 | + * |
54 | - | 118 | + * (4) prev, this: this is the most common case, we'll try to find a hole |
55 | - /* Map the old options to the new flat type */ | 119 | + * between "prev" and "this" mapping. |
56 | - switch (opts->type) { | 120 | + * |
57 | - case NET_LEGACY_OPTIONS_TYPE_NONE: | 121 | + * Note that this function assumes the last valid iova is HWADDR_MAX, but it |
58 | + if (netdev->type == NET_CLIENT_DRIVER_NONE) { | 122 | + * searches linearly so it's easy to discard the result if it's not the case. |
59 | return 0; /* nothing to do */ | 123 | + */ |
60 | - case NET_LEGACY_OPTIONS_TYPE_NIC: | 124 | +static void iova_tree_alloc_map_in_hole(struct IOVATreeAllocArgs *args) |
61 | - legacy.type = NET_CLIENT_DRIVER_NIC; | 125 | +{ |
62 | - legacy.u.nic = opts->u.nic; | 126 | + const DMAMap *prev = args->prev, *this = args->this; |
63 | - break; | 127 | + uint64_t hole_start, hole_last; |
64 | - case NET_LEGACY_OPTIONS_TYPE_USER: | 128 | + |
65 | - legacy.type = NET_CLIENT_DRIVER_USER; | 129 | + if (this && this->iova + this->size < args->iova_begin) { |
66 | - legacy.u.user = opts->u.user; | 130 | + return; |
67 | - break; | 131 | + } |
68 | - case NET_LEGACY_OPTIONS_TYPE_TAP: | 132 | + |
69 | - legacy.type = NET_CLIENT_DRIVER_TAP; | 133 | + hole_start = MAX(prev ? prev->iova + prev->size + 1 : 0, args->iova_begin); |
70 | - legacy.u.tap = opts->u.tap; | 134 | + hole_last = this ? this->iova : HWADDR_MAX; |
71 | - break; | 135 | + |
72 | - case NET_LEGACY_OPTIONS_TYPE_L2TPV3: | 136 | + if (hole_last - hole_start > args->new_size) { |
73 | - legacy.type = NET_CLIENT_DRIVER_L2TPV3; | 137 | + args->iova_result = hole_start; |
74 | - legacy.u.l2tpv3 = opts->u.l2tpv3; | 138 | + args->iova_found = true; |
75 | - break; | 139 | + } |
76 | - case NET_LEGACY_OPTIONS_TYPE_SOCKET: | 140 | +} |
77 | - legacy.type = NET_CLIENT_DRIVER_SOCKET; | 141 | + |
78 | - legacy.u.socket = opts->u.socket; | 142 | +/** |
79 | - break; | 143 | + * Foreach dma node in the tree, compare if there is a hole with its previous |
80 | - case NET_LEGACY_OPTIONS_TYPE_VDE: | 144 | + * node (or minimum iova address allowed) and the node. |
81 | - legacy.type = NET_CLIENT_DRIVER_VDE; | 145 | + * |
82 | - legacy.u.vde = opts->u.vde; | 146 | + * @key: Node iterating |
83 | - break; | 147 | + * @value: Node iterating |
84 | - case NET_LEGACY_OPTIONS_TYPE_BRIDGE: | 148 | + * @pargs: Struct to communicate with the outside world |
85 | - legacy.type = NET_CLIENT_DRIVER_BRIDGE; | 149 | + * |
86 | - legacy.u.bridge = opts->u.bridge; | 150 | + * Return: false to keep iterating, true if needs break. |
87 | - break; | 151 | + */ |
88 | - case NET_LEGACY_OPTIONS_TYPE_NETMAP: | 152 | +static gboolean iova_tree_alloc_traverse(gpointer key, gpointer value, |
89 | - legacy.type = NET_CLIENT_DRIVER_NETMAP; | 153 | + gpointer pargs) |
90 | - legacy.u.netmap = opts->u.netmap; | 154 | +{ |
91 | - break; | 155 | + struct IOVATreeAllocArgs *args = pargs; |
92 | - case NET_LEGACY_OPTIONS_TYPE_VHOST_USER: | 156 | + DMAMap *node = value; |
93 | - legacy.type = NET_CLIENT_DRIVER_VHOST_USER; | 157 | + |
94 | - legacy.u.vhost_user = opts->u.vhost_user; | 158 | + assert(key == value); |
95 | - break; | 159 | + |
96 | - default: | 160 | + iova_tree_alloc_args_iterate(args, node); |
97 | - abort(); | 161 | + iova_tree_alloc_map_in_hole(args); |
98 | } | 162 | + return args->iova_found; |
99 | - | 163 | +} |
100 | - if (!net_client_init_fun[netdev->type]) { | 164 | + |
101 | + if (netdev->type == NET_CLIENT_DRIVER_HUBPORT || | 165 | +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin, |
102 | + !net_client_init_fun[netdev->type]) { | 166 | + hwaddr iova_last) |
103 | error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "type", | 167 | +{ |
104 | "a net backend type (maybe it is not compiled " | 168 | + struct IOVATreeAllocArgs args = { |
105 | "into this binary)"); | 169 | + .new_size = map->size, |
106 | @@ -XXX,XX +XXX,XX @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp) | 170 | + .iova_begin = iova_begin, |
107 | 171 | + }; | |
108 | /* Do not add to a hub if it's a nic with a netdev= parameter. */ | 172 | + |
109 | if (netdev->type != NET_CLIENT_DRIVER_NIC || | 173 | + if (unlikely(iova_last < iova_begin)) { |
110 | - !opts->u.nic.has_netdev) { | 174 | + return IOVA_ERR_INVALID; |
111 | + !netdev->u.nic.has_netdev) { | 175 | + } |
112 | peer = net_hub_add_port(0, NULL, NULL); | 176 | + |
113 | } | 177 | + /* |
114 | } | 178 | + * Find a valid hole for the mapping |
115 | @@ -XXX,XX +XXX,XX @@ static void show_netdevs(void) | 179 | + * |
116 | static int net_client_init(QemuOpts *opts, bool is_netdev, Error **errp) | 180 | + * Assuming low iova_begin, so no need to do a binary search to |
181 | + * locate the first node. | ||
182 | + * | ||
183 | + * TODO: Replace all this with g_tree_node_first/next/last when available | ||
184 | + * (from glib since 2.68). To do it with g_tree_foreach complicates the | ||
185 | + * code a lot. | ||
186 | + * | ||
187 | + */ | ||
188 | + g_tree_foreach(tree->tree, iova_tree_alloc_traverse, &args); | ||
189 | + if (!args.iova_found) { | ||
190 | + /* | ||
191 | + * Either tree is empty or the last hole is still not checked. | ||
192 | + * g_tree_foreach does not compare (last, iova_last] range, so we check | ||
193 | + * it here. | ||
194 | + */ | ||
195 | + iova_tree_alloc_args_iterate(&args, NULL); | ||
196 | + iova_tree_alloc_map_in_hole(&args); | ||
197 | + } | ||
198 | + | ||
199 | + if (!args.iova_found || args.iova_result + map->size > iova_last) { | ||
200 | + return IOVA_ERR_NOMEM; | ||
201 | + } | ||
202 | + | ||
203 | + map->iova = args.iova_result; | ||
204 | + return iova_tree_insert(tree, map); | ||
205 | +} | ||
206 | + | ||
207 | void iova_tree_destroy(IOVATree *tree) | ||
117 | { | 208 | { |
118 | gchar **substrings = NULL; | 209 | g_tree_destroy(tree->tree); |
119 | - void *object = NULL; | ||
120 | + Netdev *object = NULL; | ||
121 | Error *err = NULL; | ||
122 | int ret = -1; | ||
123 | Visitor *v = opts_visitor_new(opts); | ||
124 | @@ -XXX,XX +XXX,XX @@ static int net_client_init(QemuOpts *opts, bool is_netdev, Error **errp) | ||
125 | } | ||
126 | } | ||
127 | |||
128 | - if (is_netdev) { | ||
129 | - visit_type_Netdev(v, NULL, (Netdev **)&object, &err); | ||
130 | - } else { | ||
131 | - visit_type_NetLegacy(v, NULL, (NetLegacy **)&object, &err); | ||
132 | + /* Create an ID for -net if the user did not specify one */ | ||
133 | + if (!is_netdev && !qemu_opts_id(opts)) { | ||
134 | + static int idx; | ||
135 | + qemu_opts_set_id(opts, g_strdup_printf("__org.qemu.net%i", idx++)); | ||
136 | } | ||
137 | |||
138 | + visit_type_Netdev(v, NULL, &object, &err); | ||
139 | + | ||
140 | if (!err) { | ||
141 | ret = net_client_init1(object, is_netdev, &err); | ||
142 | } | ||
143 | |||
144 | - if (is_netdev) { | ||
145 | - qapi_free_Netdev(object); | ||
146 | - } else { | ||
147 | - qapi_free_NetLegacy(object); | ||
148 | - } | ||
149 | + qapi_free_Netdev(object); | ||
150 | |||
151 | out: | ||
152 | error_propagate(errp, err); | ||
153 | diff --git a/qapi/net.json b/qapi/net.json | ||
154 | index XXXXXXX..XXXXXXX 100644 | ||
155 | --- a/qapi/net.json | ||
156 | +++ b/qapi/net.json | ||
157 | @@ -XXX,XX +XXX,XX @@ | ||
158 | 'vhost-user': 'NetdevVhostUserOptions' } } | ||
159 | |||
160 | ## | ||
161 | -# @NetLegacy: | ||
162 | -# | ||
163 | -# Captures the configuration of a network device; legacy. | ||
164 | -# | ||
165 | -# @id: identifier for monitor commands | ||
166 | -# | ||
167 | -# @opts: device type specific properties (legacy) | ||
168 | -# | ||
169 | -# Since: 1.2 | ||
170 | -## | ||
171 | -{ 'struct': 'NetLegacy', | ||
172 | - 'data': { | ||
173 | - '*id': 'str', | ||
174 | - 'opts': 'NetLegacyOptions' } } | ||
175 | - | ||
176 | -## | ||
177 | -# @NetLegacyOptionsType: | ||
178 | -# | ||
179 | -# Since: 1.2 | ||
180 | -## | ||
181 | -{ 'enum': 'NetLegacyOptionsType', | ||
182 | - 'data': ['none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde', | ||
183 | - 'bridge', 'netmap', 'vhost-user'] } | ||
184 | - | ||
185 | -## | ||
186 | -# @NetLegacyOptions: | ||
187 | -# | ||
188 | -# Like Netdev, but for use only by the legacy command line options | ||
189 | -# | ||
190 | -# Since: 1.2 | ||
191 | -## | ||
192 | -{ 'union': 'NetLegacyOptions', | ||
193 | - 'base': { 'type': 'NetLegacyOptionsType' }, | ||
194 | - 'discriminator': 'type', | ||
195 | - 'data': { | ||
196 | - 'nic': 'NetLegacyNicOptions', | ||
197 | - 'user': 'NetdevUserOptions', | ||
198 | - 'tap': 'NetdevTapOptions', | ||
199 | - 'l2tpv3': 'NetdevL2TPv3Options', | ||
200 | - 'socket': 'NetdevSocketOptions', | ||
201 | - 'vde': 'NetdevVdeOptions', | ||
202 | - 'bridge': 'NetdevBridgeOptions', | ||
203 | - 'netmap': 'NetdevNetmapOptions', | ||
204 | - 'vhost-user': 'NetdevVhostUserOptions' } } | ||
205 | - | ||
206 | -## | ||
207 | # @NetFilterDirection: | ||
208 | # | ||
209 | # Indicates whether a netfilter is attached to a netdev's transmit queue or | ||
210 | -- | 210 | -- |
211 | 2.5.0 | 211 | 2.7.4 |
212 | 212 | ||
213 | 213 | diff view generated by jsdifflib |
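
The allocator in the patch above visits the mappings in iova order and checks the gap between each node and its predecessor (or iova_begin). The same hole search over a plain sorted array, as a self-contained sketch (the Map type is a stand-in for DMAMap; the real code iterates a GTree with a foreach callback):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
        uint64_t iova, size; /* size is inclusive, as in DMAMap */
    } Map;

    static bool alloc_hole(const Map *maps, size_t n, uint64_t begin,
                           uint64_t last, uint64_t size, uint64_t *result)
    {
        uint64_t hole_start = begin;

        for (size_t i = 0; i <= n; ++i) {
            /* The hole ends at the next mapping, or at the address limit */
            uint64_t hole_last = (i == n) ? last : maps[i].iova;

            if (hole_last > hole_start && hole_last - hole_start > size) {
                *result = hole_start;
                return true;
            }
            if (i < n) {
                uint64_t next = maps[i].iova + maps[i].size + 1;
                hole_start = next > hole_start ? next : hole_start;
            }
        }
        return false;
    }
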
1 | From: Yuri Benditovich <yuri.benditovich@daynix.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Optionally report RSS feature. | 3 | This function does the reverse operation of iova_tree_find: to look for
4 | Handle RSS configuration command and keep RSS parameters | 4 | a mapping that matches a translated address so we can do the reverse.
5 | in virtio-net device context. | ||
6 | 5 | ||
7 | Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> | 6 | This has linear complexity instead of logarithmic, but it supports
7 | overlapping HVA. Future developments could reduce it. | ||
8 | |||
9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
9 | --- | 12 | --- |
10 | hw/net/trace-events | 3 + | 13 | include/qemu/iova-tree.h | 20 +++++++++++++++++++- |
11 | hw/net/virtio-net.c | 167 ++++++++++++++++++++++++++++++++++++++--- | 14 | util/iova-tree.c | 34 ++++++++++++++++++++++++++++++++++ |
12 | include/hw/virtio/virtio-net.h | 13 ++++ | 15 | 2 files changed, 53 insertions(+), 1 deletion(-) |
13 | 3 files changed, 174 insertions(+), 9 deletions(-) | ||
14 | 16 | ||
15 | diff --git a/hw/net/trace-events b/hw/net/trace-events | 17 | diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h |
16 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/hw/net/trace-events | 19 | --- a/include/qemu/iova-tree.h |
18 | +++ b/hw/net/trace-events | 20 | +++ b/include/qemu/iova-tree.h |
19 | @@ -XXX,XX +XXX,XX @@ virtio_net_announce_notify(void) "" | 21 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map); |
20 | virtio_net_announce_timer(int round) "%d" | 22 | * @tree: the iova tree to search from |
21 | virtio_net_handle_announce(int round) "%d" | 23 | * @map: the mapping to search |
22 | virtio_net_post_load_device(void) | 24 | * |
23 | +virtio_net_rss_disable(void) | 25 | - * Search for a mapping in the iova tree that overlaps with the |
24 | +virtio_net_rss_error(const char *msg, uint32_t value) "%s, value 0x%08x" | 26 | + * Search for a mapping in the iova tree that iova overlaps with the |
25 | +virtio_net_rss_enable(uint32_t p1, uint16_t p2, uint8_t p3) "hashes 0x%x, table of %d, key of %d" | 27 | * mapping range specified. Only the first found mapping will be |
26 | 28 | * returned. | |
27 | # tulip.c | 29 | * |
28 | tulip_reg_write(uint64_t addr, const char *name, int size, uint64_t val) "addr 0x%02"PRIx64" (%s) size %d value 0x%08"PRIx64 | 30 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map); |
29 | diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c | 31 | const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map); |
32 | |||
33 | /** | ||
34 | + * iova_tree_find_iova: | ||
35 | + * | ||
36 | + * @tree: the iova tree to search from | ||
37 | + * @map: the mapping to search | ||
38 | + * | ||
39 | + * Search for a mapping in the iova tree that translated_addr overlaps with the | ||
40 | + * mapping range specified. Only the first found mapping will be | ||
41 | + * returned. | ||
42 | + * | ||
43 | + * Return: DMAMap pointer if found, or NULL if not found. Note that | ||
44 | + * the returned DMAMap pointer is maintained internally. User should | ||
45 | + * only read the content but never modify or free the content. Also, | ||
46 | + * user is responsible to make sure the pointer is valid (say, no | ||
47 | + * concurrent deletion in progress). | ||
48 | + */ | ||
49 | +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map); | ||
50 | + | ||
51 | +/** | ||
52 | * iova_tree_find_address: | ||
53 | * | ||
54 | * @tree: the iova tree to search from | ||
55 | diff --git a/util/iova-tree.c b/util/iova-tree.c | ||
30 | index XXXXXXX..XXXXXXX 100644 | 56 | index XXXXXXX..XXXXXXX 100644 |
31 | --- a/hw/net/virtio-net.c | 57 | --- a/util/iova-tree.c |
32 | +++ b/hw/net/virtio-net.c | 58 | +++ b/util/iova-tree.c |
33 | @@ -XXX,XX +XXX,XX @@ | 59 | @@ -XXX,XX +XXX,XX @@ struct IOVATreeAllocArgs { |
34 | tso/gso/gro 'off'. */ | 60 | bool iova_found; |
35 | #define VIRTIO_NET_RSC_DEFAULT_INTERVAL 300000 | 61 | }; |
36 | 62 | ||
37 | +#define VIRTIO_NET_RSS_SUPPORTED_HASHES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \ | 63 | +typedef struct IOVATreeFindIOVAArgs { |
38 | + VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \ | 64 | + const DMAMap *needle; |
39 | + VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \ | 65 | + const DMAMap *result; |
40 | + VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \ | 66 | +} IOVATreeFindIOVAArgs; |
41 | + VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \ | ||
42 | + VIRTIO_NET_RSS_HASH_TYPE_UDPv6 | \ | ||
43 | + VIRTIO_NET_RSS_HASH_TYPE_IP_EX | \ | ||
44 | + VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | \ | ||
45 | + VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) | ||
46 | + | 67 | + |
47 | /* temporary until standard header include it */ | 68 | /** |
48 | #if !defined(VIRTIO_NET_HDR_F_RSC_INFO) | 69 | * Iterate args to the next hole |
49 | 70 | * | |
50 | @@ -XXX,XX +XXX,XX @@ static VirtIOFeature feature_sizes[] = { | 71 | @@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map) |
51 | .end = endof(struct virtio_net_config, mtu)}, | 72 | return g_tree_lookup(tree->tree, map); |
52 | {.flags = 1ULL << VIRTIO_NET_F_SPEED_DUPLEX, | ||
53 | .end = endof(struct virtio_net_config, duplex)}, | ||
54 | + {.flags = 1ULL << VIRTIO_NET_F_RSS, | ||
55 | + .end = endof(struct virtio_net_config, supported_hash_types)}, | ||
56 | {} | ||
57 | }; | ||
58 | |||
59 | @@ -XXX,XX +XXX,XX @@ static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config) | ||
60 | memcpy(netcfg.mac, n->mac, ETH_ALEN); | ||
61 | virtio_stl_p(vdev, &netcfg.speed, n->net_conf.speed); | ||
62 | netcfg.duplex = n->net_conf.duplex; | ||
63 | + netcfg.rss_max_key_size = VIRTIO_NET_RSS_MAX_KEY_SIZE; | ||
64 | + virtio_stw_p(vdev, &netcfg.rss_max_indirection_table_length, | ||
65 | + VIRTIO_NET_RSS_MAX_TABLE_LEN); | ||
66 | + virtio_stl_p(vdev, &netcfg.supported_hash_types, | ||
67 | + VIRTIO_NET_RSS_SUPPORTED_HASHES); | ||
68 | memcpy(config, &netcfg, n->config_size); | ||
69 | } | 73 | } |
70 | 74 | ||
71 | @@ -XXX,XX +XXX,XX @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, | 75 | +static gboolean iova_tree_find_address_iterator(gpointer key, gpointer value, |
72 | return features; | 76 | + gpointer data) |
73 | } | ||
74 | |||
75 | + virtio_clear_feature(&features, VIRTIO_NET_F_RSS); | ||
76 | features = vhost_net_get_features(get_vhost_net(nc->peer), features); | ||
77 | vdev->backend_features = features; | ||
78 | |||
79 | @@ -XXX,XX +XXX,XX @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) | ||
80 | } | ||
81 | |||
82 | virtio_net_set_multiqueue(n, | ||
83 | + virtio_has_feature(features, VIRTIO_NET_F_RSS) || | ||
84 | virtio_has_feature(features, VIRTIO_NET_F_MQ)); | ||
85 | |||
86 | virtio_net_set_mrg_rx_bufs(n, | ||
87 | @@ -XXX,XX +XXX,XX @@ static int virtio_net_handle_announce(VirtIONet *n, uint8_t cmd, | ||
88 | } | ||
89 | } | ||
90 | |||
91 | +static void virtio_net_disable_rss(VirtIONet *n) | ||
92 | +{ | 77 | +{ |
93 | + if (n->rss_data.enabled) { | 78 | + const DMAMap *map = key; |
94 | + trace_virtio_net_rss_disable(); | 79 | + IOVATreeFindIOVAArgs *args = data; |
80 | + const DMAMap *needle; | ||
81 | + | ||
82 | + g_assert(key == value); | ||
83 | + | ||
84 | + needle = args->needle; | ||
85 | + if (map->translated_addr + map->size < needle->translated_addr || | ||
86 | + needle->translated_addr + needle->size < map->translated_addr) { | ||
87 | + return false; | ||
95 | + } | 88 | + } |
96 | + n->rss_data.enabled = false; | 89 | + |
90 | + args->result = map; | ||
91 | + return true; | ||
97 | +} | 92 | +} |
98 | + | 93 | + |
99 | +static uint16_t virtio_net_handle_rss(VirtIONet *n, | 94 | +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map) |
100 | + struct iovec *iov, unsigned int iov_cnt) | ||
101 | +{ | 95 | +{ |
102 | + VirtIODevice *vdev = VIRTIO_DEVICE(n); | 96 | + IOVATreeFindIOVAArgs args = { |
103 | + struct virtio_net_rss_config cfg; | 97 | + .needle = map, |
104 | + size_t s, offset = 0, size_get; | 98 | + }; |
105 | + uint16_t queues, i; | ||
106 | + struct { | ||
107 | + uint16_t us; | ||
108 | + uint8_t b; | ||
109 | + } QEMU_PACKED temp; | ||
110 | + const char *err_msg = ""; | ||
111 | + uint32_t err_value = 0; | ||
112 | + | 99 | + |
113 | + if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_RSS)) { | 100 | + g_tree_foreach(tree->tree, iova_tree_find_address_iterator, &args); |
114 | + err_msg = "RSS is not negotiated"; | 101 | + return args.result; |
115 | + goto error; | ||
116 | + } | ||
117 | + size_get = offsetof(struct virtio_net_rss_config, indirection_table); | ||
118 | + s = iov_to_buf(iov, iov_cnt, offset, &cfg, size_get); | ||
119 | + if (s != size_get) { | ||
120 | + err_msg = "Short command buffer"; | ||
121 | + err_value = (uint32_t)s; | ||
122 | + goto error; | ||
123 | + } | ||
124 | + n->rss_data.hash_types = virtio_ldl_p(vdev, &cfg.hash_types); | ||
125 | + n->rss_data.indirections_len = | ||
126 | + virtio_lduw_p(vdev, &cfg.indirection_table_mask); | ||
127 | + n->rss_data.indirections_len++; | ||
128 | + if (!is_power_of_2(n->rss_data.indirections_len)) { | ||
129 | + err_msg = "Invalid size of indirection table"; | ||
130 | + err_value = n->rss_data.indirections_len; | ||
131 | + goto error; | ||
132 | + } | ||
133 | + if (n->rss_data.indirections_len > VIRTIO_NET_RSS_MAX_TABLE_LEN) { | ||
134 | + err_msg = "Too large indirection table"; | ||
135 | + err_value = n->rss_data.indirections_len; | ||
136 | + goto error; | ||
137 | + } | ||
138 | + n->rss_data.default_queue = | ||
139 | + virtio_lduw_p(vdev, &cfg.unclassified_queue); | ||
140 | + if (n->rss_data.default_queue >= n->max_queues) { | ||
141 | + err_msg = "Invalid default queue"; | ||
142 | + err_value = n->rss_data.default_queue; | ||
143 | + goto error; | ||
144 | + } | ||
145 | + offset += size_get; | ||
146 | + size_get = sizeof(uint16_t) * n->rss_data.indirections_len; | ||
147 | + g_free(n->rss_data.indirections_table); | ||
148 | + n->rss_data.indirections_table = g_malloc(size_get); | ||
149 | + if (!n->rss_data.indirections_table) { | ||
150 | + err_msg = "Can't allocate indirections table"; | ||
151 | + err_value = n->rss_data.indirections_len; | ||
152 | + goto error; | ||
153 | + } | ||
154 | + s = iov_to_buf(iov, iov_cnt, offset, | ||
155 | + n->rss_data.indirections_table, size_get); | ||
156 | + if (s != size_get) { | ||
157 | + err_msg = "Short indirection table buffer"; | ||
158 | + err_value = (uint32_t)s; | ||
159 | + goto error; | ||
160 | + } | ||
161 | + for (i = 0; i < n->rss_data.indirections_len; ++i) { | ||
162 | + uint16_t val = n->rss_data.indirections_table[i]; | ||
163 | + n->rss_data.indirections_table[i] = virtio_lduw_p(vdev, &val); | ||
164 | + } | ||
165 | + offset += size_get; | ||
166 | + size_get = sizeof(temp); | ||
167 | + s = iov_to_buf(iov, iov_cnt, offset, &temp, size_get); | ||
168 | + if (s != size_get) { | ||
169 | + err_msg = "Can't get queues"; | ||
170 | + err_value = (uint32_t)s; | ||
171 | + goto error; | ||
172 | + } | ||
173 | + queues = virtio_lduw_p(vdev, &temp.us); | ||
174 | + if (queues == 0 || queues > n->max_queues) { | ||
175 | + err_msg = "Invalid number of queues"; | ||
176 | + err_value = queues; | ||
177 | + goto error; | ||
178 | + } | ||
179 | + if (temp.b > VIRTIO_NET_RSS_MAX_KEY_SIZE) { | ||
180 | + err_msg = "Invalid key size"; | ||
181 | + err_value = temp.b; | ||
182 | + goto error; | ||
183 | + } | ||
184 | + if (!temp.b && n->rss_data.hash_types) { | ||
185 | + err_msg = "No key provided"; | ||
186 | + err_value = 0; | ||
187 | + goto error; | ||
188 | + } | ||
189 | + if (!temp.b && !n->rss_data.hash_types) { | ||
190 | + virtio_net_disable_rss(n); | ||
191 | + return queues; | ||
192 | + } | ||
193 | + offset += size_get; | ||
194 | + size_get = temp.b; | ||
195 | + s = iov_to_buf(iov, iov_cnt, offset, n->rss_data.key, size_get); | ||
196 | + if (s != size_get) { | ||
197 | + err_msg = "Can get key buffer"; | ||
198 | + err_value = (uint32_t)s; | ||
199 | + goto error; | ||
200 | + } | ||
201 | + n->rss_data.enabled = true; | ||
202 | + trace_virtio_net_rss_enable(n->rss_data.hash_types, | ||
203 | + n->rss_data.indirections_len, | ||
204 | + temp.b); | ||
205 | + return queues; | ||
206 | +error: | ||
207 | + trace_virtio_net_rss_error(err_msg, err_value); | ||
208 | + virtio_net_disable_rss(n); | ||
209 | + return 0; | ||
210 | +} | 102 | +} |
211 | + | 103 | + |
212 | static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, | 104 | const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova) |
213 | struct iovec *iov, unsigned int iov_cnt) | ||
214 | { | 105 | { |
215 | VirtIODevice *vdev = VIRTIO_DEVICE(n); | 106 | const DMAMap map = { .iova = iova, .size = 0 }; |
216 | - struct virtio_net_ctrl_mq mq; | ||
217 | - size_t s; | ||
218 | uint16_t queues; | ||
219 | |||
220 | - s = iov_to_buf(iov, iov_cnt, 0, &mq, sizeof(mq)); | ||
221 | - if (s != sizeof(mq)) { | ||
222 | - return VIRTIO_NET_ERR; | ||
223 | - } | ||
224 | + virtio_net_disable_rss(n); | ||
225 | + if (cmd == VIRTIO_NET_CTRL_MQ_RSS_CONFIG) { | ||
226 | + queues = virtio_net_handle_rss(n, iov, iov_cnt); | ||
227 | + } else if (cmd == VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET) { | ||
228 | + struct virtio_net_ctrl_mq mq; | ||
229 | + size_t s; | ||
230 | + if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ)) { | ||
231 | + return VIRTIO_NET_ERR; | ||
232 | + } | ||
233 | + s = iov_to_buf(iov, iov_cnt, 0, &mq, sizeof(mq)); | ||
234 | + if (s != sizeof(mq)) { | ||
235 | + return VIRTIO_NET_ERR; | ||
236 | + } | ||
237 | + queues = virtio_lduw_p(vdev, &mq.virtqueue_pairs); | ||
238 | |||
239 | - if (cmd != VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET) { | ||
240 | + } else { | ||
241 | return VIRTIO_NET_ERR; | ||
242 | } | ||
243 | |||
244 | - queues = virtio_lduw_p(vdev, &mq.virtqueue_pairs); | ||
245 | - | ||
246 | if (queues < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || | ||
247 | queues > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX || | ||
248 | queues > n->max_queues || | ||
249 | @@ -XXX,XX +XXX,XX @@ static void virtio_net_device_unrealize(DeviceState *dev) | ||
250 | g_free(n->vqs); | ||
251 | qemu_del_nic(n->nic); | ||
252 | virtio_net_rsc_cleanup(n); | ||
253 | + g_free(n->rss_data.indirections_table); | ||
254 | virtio_cleanup(vdev); | ||
255 | } | ||
256 | |||
257 | @@ -XXX,XX +XXX,XX @@ static Property virtio_net_properties[] = { | ||
258 | DEFINE_PROP_BIT64("ctrl_guest_offloads", VirtIONet, host_features, | ||
259 | VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, true), | ||
260 | DEFINE_PROP_BIT64("mq", VirtIONet, host_features, VIRTIO_NET_F_MQ, false), | ||
261 | + DEFINE_PROP_BIT64("rss", VirtIONet, host_features, | ||
262 | + VIRTIO_NET_F_RSS, false), | ||
263 | DEFINE_PROP_BIT64("guest_rsc_ext", VirtIONet, host_features, | ||
264 | VIRTIO_NET_F_RSC_EXT, false), | ||
265 | DEFINE_PROP_UINT32("rsc_interval", VirtIONet, rsc_timeout, | ||
266 | diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h | ||
267 | index XXXXXXX..XXXXXXX 100644 | ||
268 | --- a/include/hw/virtio/virtio-net.h | ||
269 | +++ b/include/hw/virtio/virtio-net.h | ||
270 | @@ -XXX,XX +XXX,XX @@ typedef struct VirtioNetRscChain { | ||
271 | /* Maximum packet size we can receive from tap device: header + 64k */ | ||
272 | #define VIRTIO_NET_MAX_BUFSIZE (sizeof(struct virtio_net_hdr) + (64 * KiB)) | ||
273 | |||
274 | +#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40 | ||
275 | +#define VIRTIO_NET_RSS_MAX_TABLE_LEN 128 | ||
276 | + | ||
277 | +typedef struct VirtioNetRssData { | ||
278 | + bool enabled; | ||
279 | + uint32_t hash_types; | ||
280 | + uint8_t key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; | ||
281 | + uint16_t indirections_len; | ||
282 | + uint16_t *indirections_table; | ||
283 | + uint16_t default_queue; | ||
284 | +} VirtioNetRssData; | ||
285 | + | ||
286 | typedef struct VirtIONetQueue { | ||
287 | VirtQueue *rx_vq; | ||
288 | VirtQueue *tx_vq; | ||
289 | @@ -XXX,XX +XXX,XX @@ struct VirtIONet { | ||
290 | bool failover; | ||
291 | DeviceListener primary_listener; | ||
292 | Notifier migration_state; | ||
293 | + VirtioNetRssData rss_data; | ||
294 | }; | ||
295 | |||
296 | void virtio_net_set_netclient_name(VirtIONet *n, const char *name, | ||
297 | -- | 107 | -- |
298 | 2.5.0 | 108 | 2.7.4 |
299 | 109 | ||
300 | 110 | diff view generated by jsdifflib |
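
As the commit message above notes, the reverse lookup is linear: every node is visited until one whose translated range overlaps the needle is found. The overlap test, reduced to a sketch over an array (Map is a stand-in for DMAMap):

    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
        uint64_t iova, translated_addr, size; /* size is inclusive */
    } Map;

    static const Map *find_iova(const Map *maps, size_t n,
                                uint64_t addr, uint64_t size)
    {
        for (size_t i = 0; i < n; ++i) {
            const Map *m = &maps[i];

            /* No overlap if either range ends before the other begins */
            if (m->translated_addr + m->size < addr ||
                addr + size < m->translated_addr) {
                continue;
            }
            return m; /* first overlapping mapping wins */
        }
        return NULL;
    }
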
Deleted patch | |||
---|---|---|---|
1 | From: Yuri Benditovich <yuri.benditovich@daynix.com> | ||
2 | 1 | ||
3 | Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> | ||
4 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
5 | --- | ||
6 | net/tap.c | 3 ++- | ||
7 | 1 file changed, 2 insertions(+), 1 deletion(-) | ||
8 | |||
9 | diff --git a/net/tap.c b/net/tap.c | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/net/tap.c | ||
12 | +++ b/net/tap.c | ||
13 | @@ -XXX,XX +XXX,XX @@ static void tap_set_vnet_hdr_len(NetClientState *nc, int len) | ||
14 | |||
15 | assert(nc->info->type == NET_CLIENT_DRIVER_TAP); | ||
16 | assert(len == sizeof(struct virtio_net_hdr_mrg_rxbuf) || | ||
17 | - len == sizeof(struct virtio_net_hdr)); | ||
18 | + len == sizeof(struct virtio_net_hdr) || | ||
19 | + len == sizeof(struct virtio_net_hdr_v1_hash)); | ||
20 | |||
21 | tap_fd_set_vnet_hdr_len(s->fd, len); | ||
22 | s->host_vnet_hdr_len = len; | ||
23 | -- | ||
24 | 2.5.0 | ||
25 | |||
26 | diff view generated by jsdifflib |
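
The assertion relaxed by the deleted patch admits exactly the three header layouts virtio-net can negotiate. A sketch with stand-in structs of the same sizes (10, 12 and 20 bytes; the real definitions live in standard-headers/linux/virtio_net.h):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    struct vnet_hdr         { uint8_t flags, gso_type;
                              uint16_t hdr_len, gso_size, csum_start, csum_offset; };
    struct vnet_hdr_mrg     { struct vnet_hdr hdr; uint16_t num_buffers; };
    struct vnet_hdr_v1_hash { struct vnet_hdr_mrg hdr; uint32_t hash_value;
                              uint16_t hash_report, padding; };

    static void set_vnet_hdr_len(size_t len)
    {
        /* Mirror of the relaxed assert: only negotiated layouts allowed */
        assert(len == sizeof(struct vnet_hdr) ||
               len == sizeof(struct vnet_hdr_mrg) ||
               len == sizeof(struct vnet_hdr_v1_hash));
    }
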
1 | From: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | During wrap around and reset, queues point to the initial base | 3 | This tree is able to look for a translated address from an IOVA address.
4 | address of queue 0, irrespective of what queue we are dealing with. | ||
5 | Fix it by assigning the proper base address every time. | 5 | At first glance it is similar to util/iova-tree. However, SVQ working on
6 | 6 | devices with limited IOVA space needs more capabilities, like allocating
7 | Signed-off-by: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | 5 | At first glance it is similar to util/iova-tree. However, SVQ working on |
8 | Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | 6 | devices with limited IOVA space need more capabilities, like allocating |
7 | IOVA chunks or performing reverse translations (qemu addresses to iova). | ||
8 | |||
9 | The allocation capability, as "assign a free IOVA address to this chunk | ||
10 | of memory in qemu's address space" allows shadow virtqueue to create a | ||
11 | new address space that is not restricted by guest's addressable one, so | ||
12 | we can allocate shadow vqs vrings outside of it. | ||
13 | |||
14 | It duplicates the tree so it can search efficiently in both directions, | ||
15 | and it will signal overlap if iova or the translated address is present | ||
16 | in any tree. | ||
17 | |||
18 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
19 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 20 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
10 | --- | 21 | --- |
11 | hw/net/cadence_gem.c | 37 +++++++++++++++++++++++++++++++++---- | 22 | hw/virtio/meson.build | 2 +- |
12 | 1 file changed, 33 insertions(+), 4 deletions(-) | 23 | hw/virtio/vhost-iova-tree.c | 110 ++++++++++++++++++++++++++++++++++++++++++++ |
24 | hw/virtio/vhost-iova-tree.h | 27 +++++++++++ | ||
25 | 3 files changed, 138 insertions(+), 1 deletion(-) | ||
26 | create mode 100644 hw/virtio/vhost-iova-tree.c | ||
27 | create mode 100644 hw/virtio/vhost-iova-tree.h | ||
13 | 28 | ||
14 | diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c | 29 | diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build |
15 | index XXXXXXX..XXXXXXX 100644 | 30 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/hw/net/cadence_gem.c | 31 | --- a/hw/virtio/meson.build |
17 | +++ b/hw/net/cadence_gem.c | 32 | +++ b/hw/virtio/meson.build |
18 | @@ -XXX,XX +XXX,XX @@ static int get_queue_from_screen(CadenceGEMState *s, uint8_t *rxbuf_ptr, | 33 | @@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) |
19 | return 0; | 34 | |
20 | } | 35 | virtio_ss = ss.source_set() |
21 | 36 | virtio_ss.add(files('virtio.c')) | |
22 | +static uint32_t gem_get_queue_base_addr(CadenceGEMState *s, bool tx, int q) | 37 | -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c')) |
38 | +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c')) | ||
39 | virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) | ||
40 | virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) | ||
41 | virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) | ||
42 | diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c | ||
43 | new file mode 100644 | ||
44 | index XXXXXXX..XXXXXXX | ||
45 | --- /dev/null | ||
46 | +++ b/hw/virtio/vhost-iova-tree.c | ||
47 | @@ -XXX,XX +XXX,XX @@ | ||
48 | +/* | ||
49 | + * vhost software live migration iova tree | ||
50 | + * | ||
51 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 | ||
52 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
53 | + * | ||
54 | + * SPDX-License-Identifier: GPL-2.0-or-later | ||
55 | + */ | ||
56 | + | ||
57 | +#include "qemu/osdep.h" | ||
58 | +#include "qemu/iova-tree.h" | ||
59 | +#include "vhost-iova-tree.h" | ||
60 | + | ||
61 | +#define iova_min_addr qemu_real_host_page_size | ||
62 | + | ||
63 | +/** | ||
64 | + * VhostIOVATree, able to: | ||
65 | + * - Translate iova address | ||
66 | + * - Reverse translate iova address (from translated to iova) | ||
67 | + * - Allocate IOVA regions for translated range (linear operation) | ||
68 | + */ | ||
69 | +struct VhostIOVATree { | ||
70 | + /* First addressable iova address in the device */ | ||
71 | + uint64_t iova_first; | ||
72 | + | ||
73 | + /* Last addressable iova address in the device */ | ||
74 | + uint64_t iova_last; | ||
75 | + | ||
76 | + /* IOVA address to qemu memory maps. */ | ||
77 | + IOVATree *iova_taddr_map; | ||
78 | +}; | ||
79 | + | ||
80 | +/** | ||
81 | + * Create a new IOVA tree | ||
82 | + * | ||
83 | + * Returns the new IOVA tree | ||
84 | + */ | ||
85 | +VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last) | ||
23 | +{ | 86 | +{ |
24 | + uint32_t base_addr = 0; | 87 | + VhostIOVATree *tree = g_new(VhostIOVATree, 1); |
25 | + | 88 | + |
26 | + switch (q) { | 89 | + /* Some devices do not like 0 addresses */ |
27 | + case 0: | 90 | + tree->iova_first = MAX(iova_first, iova_min_addr); |
28 | + base_addr = s->regs[tx ? GEM_TXQBASE : GEM_RXQBASE]; | 91 | + tree->iova_last = iova_last; |
29 | + break; | ||
30 | + case 1 ... (MAX_PRIORITY_QUEUES - 1): | ||
31 | + base_addr = s->regs[(tx ? GEM_TRANSMIT_Q1_PTR : | ||
32 | + GEM_RECEIVE_Q1_PTR) + q - 1]; | ||
33 | + break; | ||
34 | + default: | ||
35 | + g_assert_not_reached(); | ||
36 | + }; | ||
37 | + | 92 | + |
38 | + return base_addr; | 93 | + tree->iova_taddr_map = iova_tree_new(); |
94 | + return tree; | ||
39 | +} | 95 | +} |
40 | + | 96 | + |
41 | +static inline uint32_t gem_get_tx_queue_base_addr(CadenceGEMState *s, int q) | 97 | +/** |
98 | + * Delete an iova tree | ||
99 | + */ | ||
100 | +void vhost_iova_tree_delete(VhostIOVATree *iova_tree) | ||
42 | +{ | 101 | +{ |
43 | + return gem_get_queue_base_addr(s, true, q); | 102 | + iova_tree_destroy(iova_tree->iova_taddr_map); |
103 | + g_free(iova_tree); | ||
44 | +} | 104 | +} |
45 | + | 105 | + |
46 | +static inline uint32_t gem_get_rx_queue_base_addr(CadenceGEMState *s, int q) | 106 | +/** |
107 | + * Find the IOVA address stored from a memory address | ||
108 | + * | ||
109 | + * @tree: The iova tree | ||
110 | + * @map: The map with the memory address | ||
111 | + * | ||
112 | + * Return the stored mapping, or NULL if not found. | ||
113 | + */ | ||
114 | +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree, | ||
115 | + const DMAMap *map) | ||
47 | +{ | 116 | +{ |
48 | + return gem_get_queue_base_addr(s, false, q); | 117 | + return iova_tree_find_iova(tree->iova_taddr_map, map); |
49 | +} | 118 | +} |
50 | + | 119 | + |
51 | static hwaddr gem_get_desc_addr(CadenceGEMState *s, bool tx, int q) | 120 | +/** |
52 | { | 121 | + * Allocate a new mapping |
53 | hwaddr desc_addr = 0; | 122 | + * |
54 | @@ -XXX,XX +XXX,XX @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size) | 123 | + * @tree: The iova tree |
55 | /* Next descriptor */ | 124 | + * @map: The iova map |
56 | if (rx_desc_get_wrap(s->rx_desc[q])) { | 125 | + * |
57 | DB_PRINT("wrapping RX descriptor list\n"); | 126 | + * Returns: |
58 | - s->rx_desc_addr[q] = s->regs[GEM_RXQBASE]; | 127 | + * - IOVA_OK if the map fits in the container |
59 | + s->rx_desc_addr[q] = gem_get_rx_queue_base_addr(s, q); | 128 | + * - IOVA_ERR_INVALID if the map does not make sense (like size overflow) |
60 | } else { | 129 | + * - IOVA_ERR_NOMEM if tree cannot allocate more space. |
61 | DB_PRINT("incrementing RX descriptor list\n"); | 130 | + * |
62 | s->rx_desc_addr[q] += 4 * gem_get_desc_len(s, true); | 131 | + * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK. |
63 | @@ -XXX,XX +XXX,XX @@ static void gem_transmit(CadenceGEMState *s) | 132 | + */ |
64 | sizeof(desc_first)); | 133 | +int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map) |
65 | /* Advance the hardware current descriptor past this packet */ | 134 | +{ |
66 | if (tx_desc_get_wrap(desc)) { | 135 | + /* Some vhost devices do not like addr 0. Skip first page */ |
67 | - s->tx_desc_addr[q] = s->regs[GEM_TXQBASE]; | 136 | + hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size; |
68 | + s->tx_desc_addr[q] = gem_get_tx_queue_base_addr(s, q); | 137 | + |
69 | } else { | 138 | + if (map->translated_addr + map->size < map->translated_addr || |
70 | s->tx_desc_addr[q] = packet_desc_addr + | 139 | + map->perm == IOMMU_NONE) { |
71 | 4 * gem_get_desc_len(s, false); | 140 | + return IOVA_ERR_INVALID; |
72 | @@ -XXX,XX +XXX,XX @@ static void gem_transmit(CadenceGEMState *s) | 141 | + } |
73 | } else { | 142 | + |
74 | packet_desc_addr = 0; | 143 | + /* Allocate a node in IOVA address */ |
75 | } | 144 | + return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first, |
76 | - packet_desc_addr |= s->regs[GEM_TXQBASE]; | 145 | + tree->iova_last); |
77 | + packet_desc_addr |= gem_get_tx_queue_base_addr(s, q); | 146 | +} |
78 | } else { | 147 | + |
79 | packet_desc_addr += 4 * gem_get_desc_len(s, false); | 148 | +/** |
80 | } | 149 | + * Remove existing mappings from iova tree |
81 | @@ -XXX,XX +XXX,XX @@ static void gem_write(void *opaque, hwaddr offset, uint64_t val, | 150 | + * |
82 | if (!(val & GEM_NWCTRL_TXENA)) { | 151 | + * @iova_tree: The vhost iova tree |
83 | /* Reset to start of Q when transmit disabled. */ | 152 | + * @map: The map to remove |
84 | for (i = 0; i < s->num_priority_queues; i++) { | 153 | + */ |
85 | - s->tx_desc_addr[i] = s->regs[GEM_TXQBASE]; | 154 | +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map) |
86 | + s->tx_desc_addr[i] = gem_get_tx_queue_base_addr(s, i); | 155 | +{ |
87 | } | 156 | + iova_tree_remove(iova_tree->iova_taddr_map, map); |
88 | } | 157 | +} |
89 | if (gem_can_receive(qemu_get_queue(s->nic))) { | 158 | diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h |
159 | new file mode 100644 | ||
160 | index XXXXXXX..XXXXXXX | ||
161 | --- /dev/null | ||
162 | +++ b/hw/virtio/vhost-iova-tree.h | ||
163 | @@ -XXX,XX +XXX,XX @@ | ||
164 | +/* | ||
165 | + * vhost software live migration iova tree | ||
166 | + * | ||
167 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 | ||
168 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
169 | + * | ||
170 | + * SPDX-License-Identifier: GPL-2.0-or-later | ||
171 | + */ | ||
172 | + | ||
173 | +#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H | ||
174 | +#define HW_VIRTIO_VHOST_IOVA_TREE_H | ||
175 | + | ||
176 | +#include "qemu/iova-tree.h" | ||
177 | +#include "exec/memory.h" | ||
178 | + | ||
179 | +typedef struct VhostIOVATree VhostIOVATree; | ||
180 | + | ||
181 | +VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last); | ||
182 | +void vhost_iova_tree_delete(VhostIOVATree *iova_tree); | ||
183 | +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete); | ||
184 | + | ||
185 | +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree, | ||
186 | + const DMAMap *map); | ||
187 | +int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map); | ||
188 | +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map); | ||
189 | + | ||
190 | +#endif | ||
90 | -- | 191 | -- |
91 | 2.5.0 | 192 | 2.7.4 |
92 | 193 | ||
93 | 194 | diff view generated by jsdifflib |
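
Putting the new VhostIOVATree API together, a hypothetical calling sequence (a sketch only; it assumes the headers added above and a tree created with vhost_iova_tree_new(0, HWADDR_MAX)):

    #include "qemu/osdep.h"
    #include "hw/virtio/vhost-iova-tree.h"

    static void iova_tree_example(VhostIOVATree *tree, void *buf, size_t len)
    {
        DMAMap map = {
            .translated_addr = (hwaddr)(uintptr_t)buf,
            .size = len - 1,                   /* DMAMap sizes are inclusive */
            .perm = IOMMU_ACCESS_FLAG(true, true),
        };

        if (vhost_iova_tree_map_alloc(tree, &map) != IOVA_OK) {
            return;                            /* no free contiguous range */
        }
        /* map.iova now holds the device-visible address for buf */

        const DMAMap *found = vhost_iova_tree_find_iova(tree, &map);
        assert(found && found->iova == map.iova);

        vhost_iova_tree_remove(tree, &map);
    }
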
1 | From: Yuri Benditovich <yuri.benditovich@daynix.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Suggest VIRTIO_NET_F_HASH_REPORT if specified in device | 3 | Use translations added in VhostIOVATree in SVQ. |
4 | parameters. | 4 | |
5 | If the VIRTIO_NET_F_HASH_REPORT is set, | 5 | Only introduce usage here, not allocation and deallocation. As with |
6 | the device extends configuration space. If the feature | 6 | previous patches, we use the dead code paths of shadow_vqs_enabled to |
7 | is negotiated, the packet layout is extended to | 7 | avoid committing too many changes at once. These are impossible to take
8 | accommodate the hash information. In this case, deliver the | 8 | at the moment.
9 | packet's hash value and report type in the virtio header | 9 |
10 | extension. | 10 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
11 | Use the same procedure for configuration as already | 11 | Acked-by: Michael S. Tsirkin <mst@redhat.com>
12 | used for RSS. We add two fields in rss_data that | ||
13 | control what the device does with the calculated hash | ||
14 | if rss_data.enabled is set. If field 'populate' is set | ||
15 | the hash is set in the packet; if field 'redirect' is | ||
16 | set the hash is used to decide the queue to place the | ||
17 | packet to. | ||
18 | |||
19 | Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> | ||
20 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 12 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
21 | --- | 13 | --- |
22 | hw/net/virtio-net.c | 99 +++++++++++++++++++++++++++++++++--------- | 14 | hw/virtio/vhost-shadow-virtqueue.c | 86 +++++++++++++++++++++++--- |
23 | include/hw/virtio/virtio-net.h | 2 + | 15 | hw/virtio/vhost-shadow-virtqueue.h | 6 +- |
24 | 2 files changed, 81 insertions(+), 20 deletions(-) | 16 | hw/virtio/vhost-vdpa.c | 122 +++++++++++++++++++++++++++++++------ |
25 | 17 | include/hw/virtio/vhost-vdpa.h | 3 + | |
26 | diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c | 18 | 4 files changed, 187 insertions(+), 30 deletions(-) |
19 | |||
20 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
27 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
28 | --- a/hw/net/virtio-net.c | 22 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
29 | +++ b/hw/net/virtio-net.c | 23 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
30 | @@ -XXX,XX +XXX,XX @@ static VirtIOFeature feature_sizes[] = { | 24 | @@ -XXX,XX +XXX,XX @@ static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) |
31 | .end = endof(struct virtio_net_config, mtu)}, | 25 | return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); |
32 | {.flags = 1ULL << VIRTIO_NET_F_SPEED_DUPLEX, | ||
33 | .end = endof(struct virtio_net_config, duplex)}, | ||
34 | - {.flags = 1ULL << VIRTIO_NET_F_RSS, | ||
35 | + {.flags = (1ULL << VIRTIO_NET_F_RSS) | (1ULL << VIRTIO_NET_F_HASH_REPORT), | ||
36 | .end = endof(struct virtio_net_config, supported_hash_types)}, | ||
37 | {} | ||
38 | }; | ||
39 | @@ -XXX,XX +XXX,XX @@ static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config) | ||
40 | netcfg.duplex = n->net_conf.duplex; | ||
41 | netcfg.rss_max_key_size = VIRTIO_NET_RSS_MAX_KEY_SIZE; | ||
42 | virtio_stw_p(vdev, &netcfg.rss_max_indirection_table_length, | ||
43 | - VIRTIO_NET_RSS_MAX_TABLE_LEN); | ||
44 | + virtio_host_has_feature(vdev, VIRTIO_NET_F_RSS) ? | ||
45 | + VIRTIO_NET_RSS_MAX_TABLE_LEN : 1); | ||
46 | virtio_stl_p(vdev, &netcfg.supported_hash_types, | ||
47 | VIRTIO_NET_RSS_SUPPORTED_HASHES); | ||
48 | memcpy(config, &netcfg, n->config_size); | ||
49 | @@ -XXX,XX +XXX,XX @@ static int peer_has_ufo(VirtIONet *n) | ||
50 | } | 26 | } |
51 | 27 | ||
52 | static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, | 28 | -static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, |
53 | - int version_1) | 29 | +/** |
54 | + int version_1, int hash_report) | 30 | + * Translate addresses between the qemu's virtual address and the SVQ IOVA |
55 | { | 31 | + * |
56 | int i; | 32 | + * @svq: Shadow VirtQueue |
57 | NetClientState *nc; | 33 | + * @vaddr: Translated IOVA addresses |
58 | @@ -XXX,XX +XXX,XX @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, | 34 | + * @iovec: Source qemu's VA addresses |
59 | n->mergeable_rx_bufs = mergeable_rx_bufs; | 35 | + * @num: Length of iovec and minimum length of vaddr |
60 | 36 | + */ | |
61 | if (version_1) { | 37 | +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, |
62 | - n->guest_hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf); | 38 | + hwaddr *addrs, const struct iovec *iovec, |
63 | + n->guest_hdr_len = hash_report ? | 39 | + size_t num) |
64 | + sizeof(struct virtio_net_hdr_v1_hash) : | 40 | +{ |
65 | + sizeof(struct virtio_net_hdr_mrg_rxbuf); | 41 | + if (num == 0) { |
66 | + n->rss_data.populate_hash = !!hash_report; | 42 | + return true; |
67 | } else { | 43 | + } |
68 | n->guest_hdr_len = n->mergeable_rx_bufs ? | 44 | + |
69 | sizeof(struct virtio_net_hdr_mrg_rxbuf) : | 45 | + for (size_t i = 0; i < num; ++i) { |
70 | @@ -XXX,XX +XXX,XX @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, | 46 | + DMAMap needle = { |
71 | virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO4); | 47 | + .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base, |
72 | virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6); | 48 | + .size = iovec[i].iov_len, |
73 | virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN); | 49 | + }; |
74 | + | 50 | + Int128 needle_last, map_last; |
75 | + virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); | 51 | + size_t off; |
52 | + | ||
53 | + const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle); | ||
54 | + /* | ||
55 | + * Map cannot be NULL since iova map contains all guest space and | ||
56 | + * qemu already has a physical address mapped | ||
57 | + */ | ||
58 | + if (unlikely(!map)) { | ||
59 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
60 | + "Invalid address 0x%"HWADDR_PRIx" given by guest", | ||
61 | + needle.translated_addr); | ||
62 | + return false; | ||
63 | + } | ||
64 | + | ||
65 | + off = needle.translated_addr - map->translated_addr; | ||
66 | + addrs[i] = map->iova + off; | ||
67 | + | ||
68 | + needle_last = int128_add(int128_make64(needle.translated_addr), | ||
69 | + int128_make64(iovec[i].iov_len)); | ||
70 | + map_last = int128_make64(map->translated_addr + map->size); | ||
71 | + if (unlikely(int128_gt(needle_last, map_last))) { | ||
72 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
73 | + "Guest buffer expands over iova range"); | ||
74 | + return false; | ||
75 | + } | ||
76 | + } | ||
77 | + | ||
78 | + return true; | ||
79 | +} | ||
80 | + | ||
81 | +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, | ||
82 | const struct iovec *iovec, size_t num, | ||
83 | bool more_descs, bool write) | ||
84 | { | ||
85 | @@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, | ||
86 | } else { | ||
87 | descs[i].flags = flags; | ||
88 | } | ||
89 | - descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base); | ||
90 | + descs[i].addr = cpu_to_le64(sg[n]); | ||
91 | descs[i].len = cpu_to_le32(iovec[n].iov_len); | ||
92 | |||
93 | last = i; | ||
94 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
95 | { | ||
96 | unsigned avail_idx; | ||
97 | vring_avail_t *avail = svq->vring.avail; | ||
98 | + bool ok; | ||
99 | + g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num)); | ||
100 | |||
101 | *head = svq->free_head; | ||
102 | |||
103 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
104 | return false; | ||
76 | } | 105 | } |
77 | 106 | ||
78 | if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) { | 107 | - vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0, |
79 | @@ -XXX,XX +XXX,XX @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features, | 108 | - false); |
109 | - vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); | ||
110 | + ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num); | ||
111 | + if (unlikely(!ok)) { | ||
112 | + return false; | ||
113 | + } | ||
114 | + vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, | ||
115 | + elem->in_num > 0, false); | ||
116 | + | ||
117 | + | ||
118 | + ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num); | ||
119 | + if (unlikely(!ok)) { | ||
120 | + return false; | ||
121 | + } | ||
122 | + | ||
123 | + vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true); | ||
124 | |||
125 | /* | ||
126 | * Put the entry in the available array (but don't update avail->idx until | ||
127 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) | ||
128 | void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
129 | struct vhost_vring_addr *addr) | ||
130 | { | ||
131 | - addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc; | ||
132 | - addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail; | ||
133 | - addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used; | ||
134 | + addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc; | ||
135 | + addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail; | ||
136 | + addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used; | ||
137 | } | ||
138 | |||
139 | size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq) | ||
140 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
141 | * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
142 | * shadow methods and file descriptors. | ||
143 | * | ||
144 | + * @iova_tree: Tree to perform descriptors translations | ||
145 | + * | ||
146 | * Returns the new virtqueue or NULL. | ||
147 | * | ||
148 | * In case of error, reason is reported through error_report. | ||
149 | */ | ||
150 | -VhostShadowVirtqueue *vhost_svq_new(void) | ||
151 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree) | ||
152 | { | ||
153 | g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); | ||
154 | int r; | ||
155 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
156 | |||
157 | event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); | ||
158 | event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); | ||
159 | + svq->iova_tree = iova_tree; | ||
160 | return g_steal_pointer(&svq); | ||
161 | |||
162 | err_init_hdev_call: | ||
163 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
164 | index XXXXXXX..XXXXXXX 100644 | ||
165 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
166 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
167 | @@ -XXX,XX +XXX,XX @@ | ||
168 | #include "qemu/event_notifier.h" | ||
169 | #include "hw/virtio/virtio.h" | ||
170 | #include "standard-headers/linux/vhost_types.h" | ||
171 | +#include "hw/virtio/vhost-iova-tree.h" | ||
172 | |||
173 | /* Shadow virtqueue to relay notifications */ | ||
174 | typedef struct VhostShadowVirtqueue { | ||
175 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
176 | /* Virtio device */ | ||
177 | VirtIODevice *vdev; | ||
178 | |||
179 | + /* IOVA mapping */ | ||
180 | + VhostIOVATree *iova_tree; | ||
181 | + | ||
182 | /* Map for use the guest's descriptors */ | ||
183 | VirtQueueElement **ring_id_maps; | ||
184 | |||
185 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
186 | VirtQueue *vq); | ||
187 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
188 | |||
189 | -VhostShadowVirtqueue *vhost_svq_new(void); | ||
190 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree); | ||
191 | |||
192 | void vhost_svq_free(gpointer vq); | ||
193 | G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); | ||
194 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
195 | index XXXXXXX..XXXXXXX 100644 | ||
196 | --- a/hw/virtio/vhost-vdpa.c | ||
197 | +++ b/hw/virtio/vhost-vdpa.c | ||
198 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, | ||
199 | vaddr, section->readonly); | ||
200 | |||
201 | llsize = int128_sub(llend, int128_make64(iova)); | ||
202 | + if (v->shadow_vqs_enabled) { | ||
203 | + DMAMap mem_region = { | ||
204 | + .translated_addr = (hwaddr)(uintptr_t)vaddr, | ||
205 | + .size = int128_get64(llsize) - 1, | ||
206 | + .perm = IOMMU_ACCESS_FLAG(true, section->readonly), | ||
207 | + }; | ||
208 | + | ||
209 | + int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region); | ||
210 | + if (unlikely(r != IOVA_OK)) { | ||
211 | + error_report("Can't allocate a mapping (%d)", r); | ||
212 | + goto fail; | ||
213 | + } | ||
214 | + | ||
215 | + iova = mem_region.iova; | ||
216 | + } | ||
217 | |||
218 | vhost_vdpa_iotlb_batch_begin_once(v); | ||
219 | ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize), | ||
220 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener, | ||
221 | |||
222 | llsize = int128_sub(llend, int128_make64(iova)); | ||
223 | |||
224 | + if (v->shadow_vqs_enabled) { | ||
225 | + const DMAMap *result; | ||
226 | + const void *vaddr = memory_region_get_ram_ptr(section->mr) + | ||
227 | + section->offset_within_region + | ||
228 | + (iova - section->offset_within_address_space); | ||
229 | + DMAMap mem_region = { | ||
230 | + .translated_addr = (hwaddr)(uintptr_t)vaddr, | ||
231 | + .size = int128_get64(llsize) - 1, | ||
232 | + }; | ||
233 | + | ||
234 | + result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region); | ||
235 | + iova = result->iova; | ||
236 | + vhost_iova_tree_remove(v->iova_tree, &mem_region); | ||
237 | + } | ||
238 | vhost_vdpa_iotlb_batch_begin_once(v); | ||
239 | ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize)); | ||
240 | if (ret) { | ||
241 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
242 | |||
243 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
244 | for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
245 | - g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
246 | + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree); | ||
247 | |||
248 | if (unlikely(!svq)) { | ||
249 | error_setg(errp, "Cannot create svq %u", n); | ||
250 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, | ||
251 | /** | ||
252 | * Unmap a SVQ area in the device | ||
253 | */ | ||
254 | -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, | ||
255 | - hwaddr size) | ||
256 | +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, | ||
257 | + const DMAMap *needle) | ||
258 | { | ||
259 | + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle); | ||
260 | + hwaddr size; | ||
261 | int r; | ||
262 | |||
263 | - size = ROUND_UP(size, qemu_real_host_page_size); | ||
264 | - r = vhost_vdpa_dma_unmap(v, iova, size); | ||
265 | + if (unlikely(!result)) { | ||
266 | + error_report("Unable to find SVQ address to unmap"); | ||
267 | + return false; | ||
268 | + } | ||
269 | + | ||
270 | + size = ROUND_UP(result->size, qemu_real_host_page_size); | ||
271 | + r = vhost_vdpa_dma_unmap(v, result->iova, size); | ||
272 | return r == 0; | ||
273 | } | ||
274 | |||
275 | static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, | ||
276 | const VhostShadowVirtqueue *svq) | ||
277 | { | ||
278 | + DMAMap needle = {}; | ||
279 | struct vhost_vdpa *v = dev->opaque; | ||
280 | struct vhost_vring_addr svq_addr; | ||
281 | - size_t device_size = vhost_svq_device_area_size(svq); | ||
282 | - size_t driver_size = vhost_svq_driver_area_size(svq); | ||
283 | bool ok; | ||
284 | |||
285 | vhost_svq_get_vring_addr(svq, &svq_addr); | ||
286 | |||
287 | - ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size); | ||
288 | + needle.translated_addr = svq_addr.desc_user_addr; | ||
289 | + ok = vhost_vdpa_svq_unmap_ring(v, &needle); | ||
290 | if (unlikely(!ok)) { | ||
291 | return false; | ||
80 | } | 292 | } |
81 | 293 | ||
82 | virtio_clear_feature(&features, VIRTIO_NET_F_RSS); | 294 | - return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size); |
83 | + virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT); | 295 | + needle.translated_addr = svq_addr.used_user_addr; |
84 | features = vhost_net_get_features(get_vhost_net(nc->peer), features); | 296 | + return vhost_vdpa_svq_unmap_ring(v, &needle); |
85 | vdev->backend_features = features; | 297 | +} |
86 | 298 | + | |
87 | @@ -XXX,XX +XXX,XX @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features) | 299 | +/** |
88 | virtio_has_feature(features, | 300 | + * Map the SVQ area in the device |
89 | VIRTIO_NET_F_MRG_RXBUF), | 301 | + * |
90 | virtio_has_feature(features, | 302 | + * @v: Vhost-vdpa device |
91 | - VIRTIO_F_VERSION_1)); | 303 | + * @needle: The area to search iova |
92 | + VIRTIO_F_VERSION_1), | 304 | + * @errorp: Error pointer |
93 | + virtio_has_feature(features, | 305 | + */ |
94 | + VIRTIO_NET_F_HASH_REPORT)); | 306 | +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle, |
95 | 307 | + Error **errp) | |
96 | n->rsc4_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) && | 308 | +{ |
97 | virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO4); | 309 | + int r; |
98 | n->rsc6_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) && | 310 | + |
99 | virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO6); | 311 | + r = vhost_iova_tree_map_alloc(v->iova_tree, needle); |
100 | + n->rss_data.redirect = virtio_has_feature(features, VIRTIO_NET_F_RSS); | 312 | + if (unlikely(r != IOVA_OK)) { |
101 | 313 | + error_setg(errp, "Cannot allocate iova (%d)", r); | |
102 | if (n->has_vnet_hdr) { | 314 | + return false; |
103 | n->curr_guest_offloads = | 315 | + } |
104 | @@ -XXX,XX +XXX,XX @@ static void virtio_net_disable_rss(VirtIONet *n) | 316 | + |
317 | + r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1, | ||
318 | + (void *)(uintptr_t)needle->translated_addr, | ||
319 | + needle->perm == IOMMU_RO); | ||
320 | + if (unlikely(r != 0)) { | ||
321 | + error_setg_errno(errp, -r, "Cannot map region to device"); | ||
322 | + vhost_iova_tree_remove(v->iova_tree, needle); | ||
323 | + } | ||
324 | + | ||
325 | + return r == 0; | ||
105 | } | 326 | } |
106 | 327 | ||
107 | static uint16_t virtio_net_handle_rss(VirtIONet *n, | 328 | /** |
108 | - struct iovec *iov, unsigned int iov_cnt) | 329 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, |
109 | + struct iovec *iov, | 330 | struct vhost_vring_addr *addr, |
110 | + unsigned int iov_cnt, | 331 | Error **errp) |
111 | + bool do_rss) | 332 | { |
112 | { | 333 | + DMAMap device_region, driver_region; |
113 | VirtIODevice *vdev = VIRTIO_DEVICE(n); | 334 | + struct vhost_vring_addr svq_addr; |
114 | struct virtio_net_rss_config cfg; | 335 | struct vhost_vdpa *v = dev->opaque; |
115 | @@ -XXX,XX +XXX,XX @@ static uint16_t virtio_net_handle_rss(VirtIONet *n, | 336 | size_t device_size = vhost_svq_device_area_size(svq); |
116 | const char *err_msg = ""; | 337 | size_t driver_size = vhost_svq_driver_area_size(svq); |
117 | uint32_t err_value = 0; | 338 | - int r; |
118 | 339 | + size_t avail_offset; | |
119 | - if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_RSS)) { | 340 | + bool ok; |
120 | + if (do_rss && !virtio_vdev_has_feature(vdev, VIRTIO_NET_F_RSS)) { | 341 | |
121 | err_msg = "RSS is not negotiated"; | 342 | ERRP_GUARD(); |
122 | goto error; | 343 | - vhost_svq_get_vring_addr(svq, addr); |
344 | + vhost_svq_get_vring_addr(svq, &svq_addr); | ||
345 | |||
346 | - r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, | ||
347 | - (void *)(uintptr_t)addr->desc_user_addr, true); | ||
348 | - if (unlikely(r != 0)) { | ||
349 | - error_setg_errno(errp, -r, "Cannot create vq driver region: "); | ||
350 | + driver_region = (DMAMap) { | ||
351 | + .translated_addr = svq_addr.desc_user_addr, | ||
352 | + .size = driver_size - 1, | ||
353 | + .perm = IOMMU_RO, | ||
354 | + }; | ||
355 | + ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp); | ||
356 | + if (unlikely(!ok)) { | ||
357 | + error_prepend(errp, "Cannot create vq driver region: "); | ||
358 | return false; | ||
123 | } | 359 | } |
124 | + if (!do_rss && !virtio_vdev_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) { | 360 | + addr->desc_user_addr = driver_region.iova; |
125 | + err_msg = "Hash report is not negotiated"; | 361 | + avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr; |
126 | + goto error; | 362 | + addr->avail_user_addr = driver_region.iova + avail_offset; |
127 | + } | 363 | |
128 | size_get = offsetof(struct virtio_net_rss_config, indirection_table); | 364 | - r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, |
129 | s = iov_to_buf(iov, iov_cnt, offset, &cfg, size_get); | 365 | - (void *)(intptr_t)addr->used_user_addr, false); |
130 | if (s != size_get) { | 366 | - if (unlikely(r != 0)) { |
131 | @@ -XXX,XX +XXX,XX @@ static uint16_t virtio_net_handle_rss(VirtIONet *n, | 367 | - error_setg_errno(errp, -r, "Cannot create vq device region: "); |
132 | n->rss_data.indirections_len = | 368 | + device_region = (DMAMap) { |
133 | virtio_lduw_p(vdev, &cfg.indirection_table_mask); | 369 | + .translated_addr = svq_addr.used_user_addr, |
134 | n->rss_data.indirections_len++; | 370 | + .size = device_size - 1, |
135 | + if (!do_rss) { | 371 | + .perm = IOMMU_RW, |
136 | + n->rss_data.indirections_len = 1; | 372 | + }; |
137 | + } | 373 | + ok = vhost_vdpa_svq_map_ring(v, &device_region, errp); |
138 | if (!is_power_of_2(n->rss_data.indirections_len)) { | 374 | + if (unlikely(!ok)) { |
139 | err_msg = "Invalid size of indirection table"; | 375 | + error_prepend(errp, "Cannot create vq device region: "); |
140 | err_value = n->rss_data.indirections_len; | 376 | + vhost_vdpa_svq_unmap_ring(v, &driver_region); |
141 | @@ -XXX,XX +XXX,XX @@ static uint16_t virtio_net_handle_rss(VirtIONet *n, | ||
142 | err_value = n->rss_data.indirections_len; | ||
143 | goto error; | ||
144 | } | 377 | } |
145 | - n->rss_data.default_queue = | 378 | + addr->used_user_addr = device_region.iova; |
146 | - virtio_lduw_p(vdev, &cfg.unclassified_queue); | 379 | |
147 | + n->rss_data.default_queue = do_rss ? | 380 | - return r == 0; |
148 | + virtio_lduw_p(vdev, &cfg.unclassified_queue) : 0; | 381 | + return ok; |
149 | if (n->rss_data.default_queue >= n->max_queues) { | ||
150 | err_msg = "Invalid default queue"; | ||
151 | err_value = n->rss_data.default_queue; | ||
152 | @@ -XXX,XX +XXX,XX @@ static uint16_t virtio_net_handle_rss(VirtIONet *n, | ||
153 | err_value = (uint32_t)s; | ||
154 | goto error; | ||
155 | } | ||
156 | - queues = virtio_lduw_p(vdev, &temp.us); | ||
157 | + queues = do_rss ? virtio_lduw_p(vdev, &temp.us) : n->curr_queues; | ||
158 | if (queues == 0 || queues > n->max_queues) { | ||
159 | err_msg = "Invalid number of queues"; | ||
160 | err_value = queues; | ||
161 | @@ -XXX,XX +XXX,XX @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, | ||
162 | uint16_t queues; | ||
163 | |||
164 | virtio_net_disable_rss(n); | ||
165 | + if (cmd == VIRTIO_NET_CTRL_MQ_HASH_CONFIG) { | ||
166 | + queues = virtio_net_handle_rss(n, iov, iov_cnt, false); | ||
167 | + return queues ? VIRTIO_NET_OK : VIRTIO_NET_ERR; | ||
168 | + } | ||
169 | if (cmd == VIRTIO_NET_CTRL_MQ_RSS_CONFIG) { | ||
170 | - queues = virtio_net_handle_rss(n, iov, iov_cnt); | ||
171 | + queues = virtio_net_handle_rss(n, iov, iov_cnt, true); | ||
172 | } else if (cmd == VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET) { | ||
173 | struct virtio_net_ctrl_mq mq; | ||
174 | size_t s; | ||
175 | @@ -XXX,XX +XXX,XX @@ static uint8_t virtio_net_get_hash_type(bool isip4, | ||
176 | return 0xff; | ||
177 | } | 382 | } |
178 | 383 | ||
179 | +static void virtio_set_packet_hash(const uint8_t *buf, uint8_t report, | 384 | static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, |
180 | + uint32_t hash) | 385 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h |
181 | +{ | ||
182 | + struct virtio_net_hdr_v1_hash *hdr = (void *)buf; | ||
183 | + hdr->hash_value = hash; | ||
184 | + hdr->hash_report = report; | ||
185 | +} | ||
186 | + | ||
187 | static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf, | ||
188 | size_t size) | ||
189 | { | ||
190 | VirtIONet *n = qemu_get_nic_opaque(nc); | ||
191 | - unsigned int index = nc->queue_index, new_index; | ||
192 | + unsigned int index = nc->queue_index, new_index = index; | ||
193 | struct NetRxPkt *pkt = n->rx_pkt; | ||
194 | uint8_t net_hash_type; | ||
195 | uint32_t hash; | ||
196 | bool isip4, isip6, isudp, istcp; | ||
197 | + static const uint8_t reports[NetPktRssIpV6UdpEx + 1] = { | ||
198 | + VIRTIO_NET_HASH_REPORT_IPv4, | ||
199 | + VIRTIO_NET_HASH_REPORT_TCPv4, | ||
200 | + VIRTIO_NET_HASH_REPORT_TCPv6, | ||
201 | + VIRTIO_NET_HASH_REPORT_IPv6, | ||
202 | + VIRTIO_NET_HASH_REPORT_IPv6_EX, | ||
203 | + VIRTIO_NET_HASH_REPORT_TCPv6_EX, | ||
204 | + VIRTIO_NET_HASH_REPORT_UDPv4, | ||
205 | + VIRTIO_NET_HASH_REPORT_UDPv6, | ||
206 | + VIRTIO_NET_HASH_REPORT_UDPv6_EX | ||
207 | + }; | ||
208 | |||
209 | net_rx_pkt_set_protocols(pkt, buf + n->host_hdr_len, | ||
210 | size - n->host_hdr_len); | ||
211 | @@ -XXX,XX +XXX,XX @@ static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf, | ||
212 | net_hash_type = virtio_net_get_hash_type(isip4, isip6, isudp, istcp, | ||
213 | n->rss_data.hash_types); | ||
214 | if (net_hash_type > NetPktRssIpV6UdpEx) { | ||
215 | - return n->rss_data.default_queue; | ||
216 | + if (n->rss_data.populate_hash) { | ||
217 | + virtio_set_packet_hash(buf, VIRTIO_NET_HASH_REPORT_NONE, 0); | ||
218 | + } | ||
219 | + return n->rss_data.redirect ? n->rss_data.default_queue : -1; | ||
220 | } | ||
221 | |||
222 | hash = net_rx_pkt_calc_rss_hash(pkt, net_hash_type, n->rss_data.key); | ||
223 | - new_index = hash & (n->rss_data.indirections_len - 1); | ||
224 | - new_index = n->rss_data.indirections_table[new_index]; | ||
225 | - if (index == new_index) { | ||
226 | - return -1; | ||
227 | + | ||
228 | + if (n->rss_data.populate_hash) { | ||
229 | + virtio_set_packet_hash(buf, reports[net_hash_type], hash); | ||
230 | } | ||
231 | - return new_index; | ||
232 | + | ||
233 | + if (n->rss_data.redirect) { | ||
234 | + new_index = hash & (n->rss_data.indirections_len - 1); | ||
235 | + new_index = n->rss_data.indirections_table[new_index]; | ||
236 | + } | ||
237 | + | ||
238 | + return (index == new_index) ? -1 : new_index; | ||
239 | } | ||
240 | |||
241 | static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, | ||
242 | @@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, | ||
243 | } | ||
244 | |||
245 | receive_header(n, sg, elem->in_num, buf, size); | ||
246 | + if (n->rss_data.populate_hash) { | ||
247 | + offset = sizeof(mhdr); | ||
248 | + iov_from_buf(sg, elem->in_num, offset, | ||
249 | + buf + offset, n->host_hdr_len - sizeof(mhdr)); | ||
250 | + } | ||
251 | offset = n->host_hdr_len; | ||
252 | total += n->guest_hdr_len; | ||
253 | guest_offset = n->guest_hdr_len; | ||
254 | @@ -XXX,XX +XXX,XX @@ static int virtio_net_post_load_device(void *opaque, int version_id) | ||
255 | trace_virtio_net_post_load_device(); | ||
256 | virtio_net_set_mrg_rx_bufs(n, n->mergeable_rx_bufs, | ||
257 | virtio_vdev_has_feature(vdev, | ||
258 | - VIRTIO_F_VERSION_1)); | ||
259 | + VIRTIO_F_VERSION_1), | ||
260 | + virtio_vdev_has_feature(vdev, | ||
261 | + VIRTIO_NET_F_HASH_REPORT)); | ||
262 | |||
263 | /* MAC_TABLE_ENTRIES may be different from the saved image */ | ||
264 | if (n->mac_table.in_use > MAC_TABLE_ENTRIES) { | ||
265 | @@ -XXX,XX +XXX,XX @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) | ||
266 | |||
267 | n->vqs[0].tx_waiting = 0; | ||
268 | n->tx_burst = n->net_conf.txburst; | ||
269 | - virtio_net_set_mrg_rx_bufs(n, 0, 0); | ||
270 | + virtio_net_set_mrg_rx_bufs(n, 0, 0, 0); | ||
271 | n->promisc = 1; /* for compatibility */ | ||
272 | |||
273 | n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN); | ||
274 | @@ -XXX,XX +XXX,XX @@ static Property virtio_net_properties[] = { | ||
275 | DEFINE_PROP_BIT64("mq", VirtIONet, host_features, VIRTIO_NET_F_MQ, false), | ||
276 | DEFINE_PROP_BIT64("rss", VirtIONet, host_features, | ||
277 | VIRTIO_NET_F_RSS, false), | ||
278 | + DEFINE_PROP_BIT64("hash", VirtIONet, host_features, | ||
279 | + VIRTIO_NET_F_HASH_REPORT, false), | ||
280 | DEFINE_PROP_BIT64("guest_rsc_ext", VirtIONet, host_features, | ||
281 | VIRTIO_NET_F_RSC_EXT, false), | ||
282 | DEFINE_PROP_UINT32("rsc_interval", VirtIONet, rsc_timeout, | ||
283 | diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h | ||
284 | index XXXXXXX..XXXXXXX 100644 | 386 | index XXXXXXX..XXXXXXX 100644 |
285 | --- a/include/hw/virtio/virtio-net.h | 387 | --- a/include/hw/virtio/vhost-vdpa.h |
286 | +++ b/include/hw/virtio/virtio-net.h | 388 | +++ b/include/hw/virtio/vhost-vdpa.h |
287 | @@ -XXX,XX +XXX,XX @@ typedef struct VirtioNetRscChain { | 389 | @@ -XXX,XX +XXX,XX @@ |
288 | 390 | ||
289 | typedef struct VirtioNetRssData { | 391 | #include <gmodule.h> |
290 | bool enabled; | 392 | |
291 | + bool redirect; | 393 | +#include "hw/virtio/vhost-iova-tree.h" |
292 | + bool populate_hash; | 394 | #include "hw/virtio/virtio.h" |
293 | uint32_t hash_types; | 395 | #include "standard-headers/linux/vhost_types.h" |
294 | uint8_t key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; | 396 | |
295 | uint16_t indirections_len; | 397 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { |
398 | MemoryListener listener; | ||
399 | struct vhost_vdpa_iova_range iova_range; | ||
400 | bool shadow_vqs_enabled; | ||
401 | + /* IOVA mapping used by the Shadow Virtqueue */ | ||
402 | + VhostIOVATree *iova_tree; | ||
403 | GPtrArray *shadow_vqs; | ||
404 | struct vhost_dev *dev; | ||
405 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; | ||
296 | -- | 406 | -- |
297 | 2.5.0 | 407 | 2.7.4 |
298 | 408 | ||
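The heart of the hunks above is vhost_svq_translate_addr: every guest buffer must fall entirely inside a single IOVA mapping, or the translation fails. Below is a minimal standalone sketch of that containment check, using simplified stand-in types rather than the real QEMU API (in QEMU, DMAMap stores size as an inclusive size - 1; the sketch uses a plain byte size for clarity):

    #include <inttypes.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-in for QEMU's DMAMap (an assumption, not the real type). */
    typedef struct {
        uint64_t iova;            /* device-visible base of the mapping */
        uint64_t translated_addr; /* address range the mapping covers */
        uint64_t size;            /* plain byte size here, for clarity */
    } DMAMap;

    /* Translate one buffer through a single map, mirroring the
     * "Guest buffer expands over iova range" check in the patch. */
    static bool translate_addr(const DMAMap *map, uint64_t addr, uint64_t len,
                               uint64_t *iova)
    {
        uint64_t off;

        if (addr < map->translated_addr) {
            return false;
        }
        off = addr - map->translated_addr;
        if (off + len > map->size) {
            fprintf(stderr, "Guest buffer expands over iova range\n");
            return false;
        }
        *iova = map->iova + off;
        return true;
    }

    int main(void)
    {
        DMAMap map = { .iova = 0x1000, .translated_addr = 0x40000000,
                       .size = 0x2000 };
        uint64_t iova;

        if (translate_addr(&map, 0x40000800, 0x100, &iova)) {
            printf("iova = 0x%" PRIx64 "\n", iova); /* prints iova = 0x1800 */
        }
        return 0;
    }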
Deleted patch | |||
---|---|---|---|
1 | From: Yuri Benditovich <yuri.benditovich@daynix.com> | ||
2 | 1 | ||
3 | Remove the duplicated RSC definitions and rename the fields to the | ||
4 | ones defined in the Linux header. | ||
5 | |||
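For context, the RSC counters in the Linux header alias the checksum fields through a union, which is why the removed helpers below returned &hdr->csum_start and &hdr->csum_offset. A paraphrased sketch of that layout (from memory, not a verbatim copy of linux/virtio_net.h, where the fields use __u8/__virtio16 types):

    #include <stdint.h>

    struct virtio_net_hdr_v1_sketch {
        uint8_t flags;
        uint8_t gso_type;
        uint16_t hdr_len;
        uint16_t gso_size;
        union {
            struct {
                uint16_t csum_start;
                uint16_t csum_offset;
            };
            struct {
                uint16_t segments;   /* number of coalesced packets */
                uint16_t dup_acks;   /* number of duplicated ACKs */
            } rsc;
        };
        uint16_t num_buffers;
    };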
6 | Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com> | ||
7 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
8 | --- | ||
9 | hw/net/virtio-net.c | 28 ++++------------------------ | ||
10 | 1 file changed, 4 insertions(+), 24 deletions(-) | ||
11 | |||
12 | diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/hw/net/virtio-net.c | ||
15 | +++ b/hw/net/virtio-net.c | ||
16 | @@ -XXX,XX +XXX,XX @@ | ||
17 | VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | \ | ||
18 | VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) | ||
19 | |||
20 | -/* temporary until standard header include it */ | ||
21 | -#if !defined(VIRTIO_NET_HDR_F_RSC_INFO) | ||
22 | - | ||
23 | -#define VIRTIO_NET_HDR_F_RSC_INFO 4 /* rsc_ext data in csum_ fields */ | ||
24 | -#define VIRTIO_NET_F_RSC_EXT 61 | ||
25 | - | ||
26 | -#endif | ||
27 | - | ||
28 | -static inline __virtio16 *virtio_net_rsc_ext_num_packets( | ||
29 | - struct virtio_net_hdr *hdr) | ||
30 | -{ | ||
31 | - return &hdr->csum_start; | ||
32 | -} | ||
33 | - | ||
34 | -static inline __virtio16 *virtio_net_rsc_ext_num_dupacks( | ||
35 | - struct virtio_net_hdr *hdr) | ||
36 | -{ | ||
37 | - return &hdr->csum_offset; | ||
38 | -} | ||
39 | - | ||
40 | static VirtIOFeature feature_sizes[] = { | ||
41 | {.flags = 1ULL << VIRTIO_NET_F_MAC, | ||
42 | .end = endof(struct virtio_net_config, mac)}, | ||
43 | @@ -XXX,XX +XXX,XX @@ static size_t virtio_net_rsc_drain_seg(VirtioNetRscChain *chain, | ||
44 | VirtioNetRscSeg *seg) | ||
45 | { | ||
46 | int ret; | ||
47 | - struct virtio_net_hdr *h; | ||
48 | + struct virtio_net_hdr_v1 *h; | ||
49 | |||
50 | - h = (struct virtio_net_hdr *)seg->buf; | ||
51 | + h = (struct virtio_net_hdr_v1 *)seg->buf; | ||
52 | h->flags = 0; | ||
53 | h->gso_type = VIRTIO_NET_HDR_GSO_NONE; | ||
54 | |||
55 | if (seg->is_coalesced) { | ||
56 | - *virtio_net_rsc_ext_num_packets(h) = seg->packets; | ||
57 | - *virtio_net_rsc_ext_num_dupacks(h) = seg->dup_ack; | ||
58 | + h->rsc.segments = seg->packets; | ||
59 | + h->rsc.dup_acks = seg->dup_ack; | ||
60 | h->flags = VIRTIO_NET_HDR_F_RSC_INFO; | ||
61 | if (chain->proto == ETH_P_IP) { | ||
62 | h->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; | ||
63 | -- | ||
64 | 2.5.0 | ||
65 | |||
Deleted patch | |||
---|---|---|---|
1 | From: Helge Deller <deller@gmx.de> | ||
2 | 1 | ||
3 | The tulip network driver in a qemu-system-hppa emulation is broken in | ||
4 | the sense that bigger network packets are no longer received, and | ||
5 | thus even running e.g. "apt update" inside the VM fails. | ||
6 | |||
7 | The breakage was introduced by commit 8ffb7265af ("check frame size and | ||
8 | r/w data length") which added checks to prevent accesses outside of the | ||
9 | rx/tx buffers. | ||
10 | |||
11 | But the new checks were implemented wrong. The variable rx_frame_len | ||
12 | counts backwards, from rx_frame_size down to zero, and the variable len | ||
13 | is never bigger than rx_frame_len, so accesses just can't happen and the | ||
14 | checks are unnecessary. | ||
15 | On the contrary, the checks now prevented bigger packets from being | ||
16 | moved into the rx buffers. | ||
17 | |||
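To make the invariant concrete, here is a minimal standalone sketch of the copy arithmetic (an illustration, not the driver code), assuming rx_frame_size never exceeds the buffer size:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    #define RX_FRAME_SIZE 2048

    /* rx_frame_len counts down from rx_frame_size to zero, and len is
     * clamped to rx_frame_len, so the write always stays inside rx_frame. */
    static void copy_rx_bytes(uint8_t *rx_frame, unsigned rx_frame_size,
                              unsigned *rx_frame_len,
                              const uint8_t *buf, unsigned buf_len)
    {
        unsigned len = buf_len;

        if (len > *rx_frame_len) {
            len = *rx_frame_len;
        }
        /* offset + len = rx_frame_size - (*rx_frame_len - len)
         *             <= rx_frame_size <= RX_FRAME_SIZE */
        assert(rx_frame_size - *rx_frame_len + len <= RX_FRAME_SIZE);
        memcpy(rx_frame + (rx_frame_size - *rx_frame_len), buf, len);
        *rx_frame_len -= len;
    }

    int main(void)
    {
        static uint8_t rx_frame[RX_FRAME_SIZE];
        uint8_t buf[100] = { 0 };
        unsigned rx_frame_len = RX_FRAME_SIZE;

        copy_rx_bytes(rx_frame, RX_FRAME_SIZE, &rx_frame_len, buf, sizeof(buf));
        return 0;
    }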
18 | This patch reverts the wrong checks; the result was successfully tested | ||
19 | with a qemu-system-hppa emulation. | ||
20 | |||
21 | Fixes: 8ffb7265af ("check frame size and r/w data length") | ||
22 | Buglink: https://bugs.launchpad.net/bugs/1874539 | ||
23 | Signed-off-by: Helge Deller <deller@gmx.de> | ||
24 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
25 | --- | ||
26 | hw/net/tulip.c | 6 ------ | ||
27 | 1 file changed, 6 deletions(-) | ||
28 | |||
29 | diff --git a/hw/net/tulip.c b/hw/net/tulip.c | ||
30 | index XXXXXXX..XXXXXXX 100644 | ||
31 | --- a/hw/net/tulip.c | ||
32 | +++ b/hw/net/tulip.c | ||
33 | @@ -XXX,XX +XXX,XX @@ static void tulip_copy_rx_bytes(TULIPState *s, struct tulip_descriptor *desc) | ||
34 | len = s->rx_frame_len; | ||
35 | } | ||
36 | |||
37 | - if (s->rx_frame_len + len > sizeof(s->rx_frame)) { | ||
38 | - return; | ||
39 | - } | ||
40 | pci_dma_write(&s->dev, desc->buf_addr1, s->rx_frame + | ||
41 | (s->rx_frame_size - s->rx_frame_len), len); | ||
42 | s->rx_frame_len -= len; | ||
43 | @@ -XXX,XX +XXX,XX @@ static void tulip_copy_rx_bytes(TULIPState *s, struct tulip_descriptor *desc) | ||
44 | len = s->rx_frame_len; | ||
45 | } | ||
46 | |||
47 | - if (s->rx_frame_len + len > sizeof(s->rx_frame)) { | ||
48 | - return; | ||
49 | - } | ||
50 | pci_dma_write(&s->dev, desc->buf_addr2, s->rx_frame + | ||
51 | (s->rx_frame_size - s->rx_frame_len), len); | ||
52 | s->rx_frame_len -= len; | ||
53 | -- | ||
54 | 2.5.0 | ||
55 | |||
Deleted patch | |||
---|---|---|---|
1 | From: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
2 | 1 | ||
3 | Bit #14 is "DE" for 'Descriptor Error': | ||
4 | |||
5 | When set, indicates a frame truncation caused by a frame | ||
6 | that does not fit within the current descriptor buffers, | ||
7 | and that the 21143 does not own the next descriptor. | ||
8 | |||
9 | [Table 4-1. RDES0 Bit Fields Description] | ||
10 | |||
11 | Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
12 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
13 | --- | ||
14 | hw/net/tulip.h | 2 +- | ||
15 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
16 | |||
17 | diff --git a/hw/net/tulip.h b/hw/net/tulip.h | ||
18 | index XXXXXXX..XXXXXXX 100644 | ||
19 | --- a/hw/net/tulip.h | ||
20 | +++ b/hw/net/tulip.h | ||
21 | @@ -XXX,XX +XXX,XX @@ | ||
22 | #define RDES0_RF BIT(11) | ||
23 | #define RDES0_DT_SHIFT 12 | ||
24 | #define RDES0_DT_MASK 3 | ||
25 | -#define RDES0_LE BIT(14) | ||
26 | +#define RDES0_DE BIT(14) | ||
27 | #define RDES0_ES BIT(15) | ||
28 | #define RDES0_FL_SHIFT 16 | ||
29 | #define RDES0_FL_MASK 0x3fff | ||
30 | -- | ||
31 | 2.5.0 | ||
32 | |||
Deleted patch | |||
---|---|---|---|
1 | From: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
2 | 1 | ||
3 | Log with GUEST_ERROR what the guest is doing wrong. | ||
4 | |||
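These messages go through the guest-error log mask, so they become visible when QEMU is started with the "-d guest_errors" debug option.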
5 | Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
7 | --- | ||
8 | hw/net/tulip.c | 6 ++++++ | ||
9 | 1 file changed, 6 insertions(+) | ||
10 | |||
11 | diff --git a/hw/net/tulip.c b/hw/net/tulip.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/hw/net/tulip.c | ||
14 | +++ b/hw/net/tulip.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static int tulip_copy_tx_buffers(TULIPState *s, struct tulip_descriptor *desc) | ||
16 | int len2 = (desc->control >> TDES1_BUF2_SIZE_SHIFT) & TDES1_BUF2_SIZE_MASK; | ||
17 | |||
18 | if (s->tx_frame_len + len1 > sizeof(s->tx_frame)) { | ||
19 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
20 | + "%s: descriptor overflow (ofs: %u, len:%d, size:%zu)\n", | ||
21 | + __func__, s->tx_frame_len, len1, sizeof(s->tx_frame)); | ||
22 | return -1; | ||
23 | } | ||
24 | if (len1) { | ||
25 | @@ -XXX,XX +XXX,XX @@ static int tulip_copy_tx_buffers(TULIPState *s, struct tulip_descriptor *desc) | ||
26 | } | ||
27 | |||
28 | if (s->tx_frame_len + len2 > sizeof(s->tx_frame)) { | ||
29 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
30 | + "%s: descriptor overflow (ofs: %u, len:%d, size:%zu)\n", | ||
31 | + __func__, s->tx_frame_len, len2, sizeof(s->tx_frame)); | ||
32 | return -1; | ||
33 | } | ||
34 | if (len2) { | ||
35 | -- | ||
36 | 2.5.0 | ||
37 | |||
Deleted patch | |||
---|---|---|---|
1 | From: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
2 | 1 | ||
3 | Enabling debug breaks the build. Fix that, and make debug statements | ||
4 | always compilable. Fix a few statements to use sized integer casts. | ||
5 | |||
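The patch replaces the #ifdef'd DB_PRINT with an always-compiled form, so the compiler type-checks the format arguments even when debugging is off. A minimal sketch of the idiom outside QEMU (hypothetical macro and flag names):

    #include <stdio.h>

    #define MYDEV_DEBUG 0   /* flip to 1 to enable output */

    /* The branch is constant-folded away when MYDEV_DEBUG is 0, but the
     * printf arguments are still checked, unlike with an empty macro. */
    #define DB_PRINT(...) do {                    \
        if (MYDEV_DEBUG) {                        \
            fprintf(stderr, "%s: ", __func__);    \
            fprintf(stderr, __VA_ARGS__);         \
        }                                         \
    } while (0)

    int main(void)
    {
        DB_PRINT("frame size: %zu\n", (size_t)2048);
        return 0;
    }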
6 | Signed-off-by: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
7 | Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
9 | --- | ||
10 | hw/net/cadence_gem.c | 27 +++++++++++++-------------- | ||
11 | 1 file changed, 13 insertions(+), 14 deletions(-) | ||
12 | |||
13 | diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/hw/net/cadence_gem.c | ||
16 | +++ b/hw/net/cadence_gem.c | ||
17 | @@ -XXX,XX +XXX,XX @@ | ||
18 | #include "sysemu/dma.h" | ||
19 | #include "net/checksum.h" | ||
20 | |||
21 | -#ifdef CADENCE_GEM_ERR_DEBUG | ||
22 | -#define DB_PRINT(...) do { \ | ||
23 | - fprintf(stderr, ": %s: ", __func__); \ | ||
24 | - fprintf(stderr, ## __VA_ARGS__); \ | ||
25 | - } while (0) | ||
26 | -#else | ||
27 | - #define DB_PRINT(...) | ||
28 | -#endif | ||
29 | +#define CADENCE_GEM_ERR_DEBUG 0 | ||
30 | +#define DB_PRINT(...) do {\ | ||
31 | + if (CADENCE_GEM_ERR_DEBUG) { \ | ||
32 | + qemu_log(": %s: ", __func__); \ | ||
33 | + qemu_log(__VA_ARGS__); \ | ||
34 | + } \ | ||
35 | +} while (0) | ||
36 | |||
37 | #define GEM_NWCTRL (0x00000000/4) /* Network Control reg */ | ||
38 | #define GEM_NWCFG (0x00000004/4) /* Network Config reg */ | ||
39 | @@ -XXX,XX +XXX,XX @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size) | ||
40 | size += 4; | ||
41 | } | ||
42 | |||
43 | - DB_PRINT("config bufsize: %d packet size: %ld\n", rxbufsize, size); | ||
44 | + DB_PRINT("config bufsize: %u packet size: %zd\n", rxbufsize, size); | ||
45 | |||
46 | /* Find which queue we are targeting */ | ||
47 | q = get_queue_from_screen(s, rxbuf_ptr, rxbufsize); | ||
48 | @@ -XXX,XX +XXX,XX @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size) | ||
49 | return -1; | ||
50 | } | ||
51 | |||
52 | - DB_PRINT("copy %u bytes to 0x%" PRIx64 "\n", | ||
53 | - MIN(bytes_to_copy, rxbufsize), | ||
54 | - rx_desc_get_buffer(s, s->rx_desc[q])); | ||
55 | + DB_PRINT("copy %" PRIu32 " bytes to 0x%" PRIx64 "\n", | ||
56 | + MIN(bytes_to_copy, rxbufsize), | ||
57 | + rx_desc_get_buffer(s, s->rx_desc[q])); | ||
58 | |||
59 | /* Copy packet data to emulated DMA buffer */ | ||
60 | address_space_write(&s->dma_as, rx_desc_get_buffer(s, s->rx_desc[q]) + | ||
61 | @@ -XXX,XX +XXX,XX @@ static void gem_transmit(CadenceGEMState *s) | ||
62 | */ | ||
63 | if ((tx_desc_get_buffer(s, desc) == 0) || | ||
64 | (tx_desc_get_length(desc) == 0)) { | ||
65 | - DB_PRINT("Invalid TX descriptor @ 0x%x\n", | ||
66 | - (unsigned)packet_desc_addr); | ||
67 | + DB_PRINT("Invalid TX descriptor @ 0x%" HWADDR_PRIx "\n", | ||
68 | + packet_desc_addr); | ||
69 | break; | ||
70 | } | ||
71 | |||
72 | -- | ||
73 | 2.5.0 | ||
74 | |||
Deleted patch | |||
---|---|---|---|
1 | From: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
2 | 1 | ||
3 | Set IRQs specific to each queue; the present implementation sets the | ||
4 | q1 IRQ based on q0 status. | ||
5 | |||
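After the change, each line simply follows its own level: q0 mirrors GEM_ISR and queue i (for i >= 1) mirrors GEM_INT_Q1_STATUS + i - 1, as the diff below shows.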
6 | Signed-off-by: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
7 | Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
9 | --- | ||
10 | hw/net/cadence_gem.c | 25 +++---------------------- | ||
11 | 1 file changed, 3 insertions(+), 22 deletions(-) | ||
12 | |||
13 | diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/hw/net/cadence_gem.c | ||
16 | +++ b/hw/net/cadence_gem.c | ||
17 | @@ -XXX,XX +XXX,XX @@ static void gem_update_int_status(CadenceGEMState *s) | ||
18 | { | ||
19 | int i; | ||
20 | |||
21 | - if (!s->regs[GEM_ISR]) { | ||
22 | - /* ISR isn't set, clear all the interrupts */ | ||
23 | - for (i = 0; i < s->num_priority_queues; ++i) { | ||
24 | - qemu_set_irq(s->irq[i], 0); | ||
25 | - } | ||
26 | - return; | ||
27 | - } | ||
28 | + qemu_set_irq(s->irq[0], !!s->regs[GEM_ISR]); | ||
29 | |||
30 | - /* If we get here we know s->regs[GEM_ISR] is set, so we don't need to | ||
31 | - * check it again. | ||
32 | - */ | ||
33 | - if (s->num_priority_queues == 1) { | ||
34 | - /* No priority queues, just trigger the interrupt */ | ||
35 | - DB_PRINT("asserting int.\n"); | ||
36 | - qemu_set_irq(s->irq[0], 1); | ||
37 | - return; | ||
38 | - } | ||
39 | - | ||
40 | - for (i = 0; i < s->num_priority_queues; ++i) { | ||
41 | - if (s->regs[GEM_INT_Q1_STATUS + i]) { | ||
42 | - DB_PRINT("asserting int. (q=%d)\n", i); | ||
43 | - qemu_set_irq(s->irq[i], 1); | ||
44 | - } | ||
45 | + for (i = 1; i < s->num_priority_queues; ++i) { | ||
46 | + qemu_set_irq(s->irq[i], !!s->regs[GEM_INT_Q1_STATUS + i - 1]); | ||
47 | } | ||
48 | } | ||
49 | |||
50 | -- | ||
51 | 2.5.0 | ||
52 | |||
Deleted patch | |||
---|---|---|---|
1 | From: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
2 | 1 | ||
3 | Q1 to Q7 ISRs are clear-on-read, the IER/IDR registers | ||
4 | are write-only, and the mask registers are read-only. | ||
5 | |||
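These mask arrays are consumed by the device's register accessors. A hedged sketch of how such masks are typically applied on the read and write paths (illustrative logic, not the actual cadence_gem accessors):

    #include <stdint.h>

    /* Read path: clear-on-read bits are wiped after the value is sampled. */
    static uint32_t reg_read(uint32_t *regs, const uint32_t *regs_rtc, int idx)
    {
        uint32_t val = regs[idx];

        regs[idx] &= ~regs_rtc[idx];
        return val;
    }

    /* Write path: read-only bits are preserved; writing 1 to a
     * write-1-to-clear bit clears it. */
    static void reg_write(uint32_t *regs, const uint32_t *regs_ro,
                          const uint32_t *regs_w1c, int idx, uint32_t val)
    {
        uint32_t ro = regs_ro[idx];

        regs[idx] = (regs[idx] & ro) | (val & ~ro);
        regs[idx] &= ~(val & regs_w1c[idx]);
    }

    int main(void)
    {
        uint32_t regs[1] = { 0x0000FFFF };
        const uint32_t regs_ro[1] = { 0xFFFF0000 };
        const uint32_t regs_w1c[1] = { 0x000000FF };
        const uint32_t regs_rtc[1] = { 0x0000FF00 };

        reg_write(regs, regs_ro, regs_w1c, 0, 0xFFFF00FF);
        (void)reg_read(regs, regs_rtc, 0);
        return 0;
    }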
6 | Signed-off-by: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
7 | Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
9 | --- | ||
10 | hw/net/cadence_gem.c | 14 ++++++++++++++ | ||
11 | 1 file changed, 14 insertions(+) | ||
12 | |||
13 | diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/hw/net/cadence_gem.c | ||
16 | +++ b/hw/net/cadence_gem.c | ||
17 | @@ -XXX,XX +XXX,XX @@ static const uint8_t broadcast_addr[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; | ||
18 | */ | ||
19 | static void gem_init_register_masks(CadenceGEMState *s) | ||
20 | { | ||
21 | + unsigned int i; | ||
22 | /* Mask of register bits which are read only */ | ||
23 | memset(&s->regs_ro[0], 0, sizeof(s->regs_ro)); | ||
24 | s->regs_ro[GEM_NWCTRL] = 0xFFF80000; | ||
25 | @@ -XXX,XX +XXX,XX @@ static void gem_init_register_masks(CadenceGEMState *s) | ||
26 | s->regs_ro[GEM_ISR] = 0xFFFFFFFF; | ||
27 | s->regs_ro[GEM_IMR] = 0xFFFFFFFF; | ||
28 | s->regs_ro[GEM_MODID] = 0xFFFFFFFF; | ||
29 | + for (i = 0; i < s->num_priority_queues; i++) { | ||
30 | + s->regs_ro[GEM_INT_Q1_STATUS + i] = 0xFFFFFFFF; | ||
31 | + s->regs_ro[GEM_INT_Q1_ENABLE + i] = 0xFFFFF319; | ||
32 | + s->regs_ro[GEM_INT_Q1_DISABLE + i] = 0xFFFFF319; | ||
33 | + s->regs_ro[GEM_INT_Q1_MASK + i] = 0xFFFFFFFF; | ||
34 | + } | ||
35 | |||
36 | /* Mask of register bits which are clear on read */ | ||
37 | memset(&s->regs_rtc[0], 0, sizeof(s->regs_rtc)); | ||
38 | s->regs_rtc[GEM_ISR] = 0xFFFFFFFF; | ||
39 | + for (i = 0; i < s->num_priority_queues; i++) { | ||
40 | + s->regs_rtc[GEM_INT_Q1_STATUS + i] = 0x00000CE6; | ||
41 | + } | ||
42 | |||
43 | /* Mask of register bits which are write 1 to clear */ | ||
44 | memset(&s->regs_w1c[0], 0, sizeof(s->regs_w1c)); | ||
45 | @@ -XXX,XX +XXX,XX @@ static void gem_init_register_masks(CadenceGEMState *s) | ||
46 | s->regs_wo[GEM_NWCTRL] = 0x00073E60; | ||
47 | s->regs_wo[GEM_IER] = 0x07FFFFFF; | ||
48 | s->regs_wo[GEM_IDR] = 0x07FFFFFF; | ||
49 | + for (i = 0; i < s->num_priority_queues; i++) { | ||
50 | + s->regs_wo[GEM_INT_Q1_ENABLE + i] = 0x00000CE6; | ||
51 | + s->regs_wo[GEM_INT_Q1_DISABLE + i] = 0x00000CE6; | ||
52 | + } | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | -- | ||
57 | 2.5.0 | ||
58 | |||
Deleted patch | |||
---|---|---|---|
1 | From: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
2 | 1 | ||
3 | Move these buffers into CadenceGEMState, as their size will increase | ||
4 | further when jumbo frame support is added. | ||
5 | |||
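Presumably this also keeps multi-kilobyte arrays off the stack: once jumbo frames raise MAX_FRAME_SIZE well beyond 2048 bytes, stack-local tx/rx buffers would no longer be reasonable.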
6 | Signed-off-by: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
7 | Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
9 | --- | ||
10 | hw/net/cadence_gem.c | 38 +++++++++++++++++--------------------- | ||
11 | include/hw/net/cadence_gem.h | 4 ++++ | ||
12 | 2 files changed, 21 insertions(+), 21 deletions(-) | ||
13 | |||
14 | diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/hw/net/cadence_gem.c | ||
17 | +++ b/hw/net/cadence_gem.c | ||
18 | @@ -XXX,XX +XXX,XX @@ static void gem_get_rx_desc(CadenceGEMState *s, int q) | ||
19 | */ | ||
20 | static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size) | ||
21 | { | ||
22 | - CadenceGEMState *s; | ||
23 | + CadenceGEMState *s = qemu_get_nic_opaque(nc); | ||
24 | unsigned rxbufsize, bytes_to_copy; | ||
25 | unsigned rxbuf_offset; | ||
26 | - uint8_t rxbuf[2048]; | ||
27 | uint8_t *rxbuf_ptr; | ||
28 | bool first_desc = true; | ||
29 | int maf; | ||
30 | int q = 0; | ||
31 | |||
32 | - s = qemu_get_nic_opaque(nc); | ||
33 | - | ||
34 | /* Is this destination MAC address "for us" ? */ | ||
35 | maf = gem_mac_address_filter(s, buf); | ||
36 | if (maf == GEM_RX_REJECT) { | ||
37 | @@ -XXX,XX +XXX,XX @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size) | ||
38 | } else { | ||
39 | unsigned crc_val; | ||
40 | |||
41 | - if (size > sizeof(rxbuf) - sizeof(crc_val)) { | ||
42 | - size = sizeof(rxbuf) - sizeof(crc_val); | ||
43 | + if (size > MAX_FRAME_SIZE - sizeof(crc_val)) { | ||
44 | + size = MAX_FRAME_SIZE - sizeof(crc_val); | ||
45 | } | ||
46 | bytes_to_copy = size; | ||
47 | /* The application wants the FCS field, which QEMU does not provide. | ||
48 | * We must try and calculate one. | ||
49 | */ | ||
50 | |||
51 | - memcpy(rxbuf, buf, size); | ||
52 | - memset(rxbuf + size, 0, sizeof(rxbuf) - size); | ||
53 | - rxbuf_ptr = rxbuf; | ||
54 | - crc_val = cpu_to_le32(crc32(0, rxbuf, MAX(size, 60))); | ||
55 | - memcpy(rxbuf + size, &crc_val, sizeof(crc_val)); | ||
56 | + memcpy(s->rx_packet, buf, size); | ||
57 | + memset(s->rx_packet + size, 0, MAX_FRAME_SIZE - size); | ||
58 | + rxbuf_ptr = s->rx_packet; | ||
59 | + crc_val = cpu_to_le32(crc32(0, s->rx_packet, MAX(size, 60))); | ||
60 | + memcpy(s->rx_packet + size, &crc_val, sizeof(crc_val)); | ||
61 | |||
62 | bytes_to_copy += 4; | ||
63 | size += 4; | ||
64 | @@ -XXX,XX +XXX,XX @@ static void gem_transmit(CadenceGEMState *s) | ||
65 | { | ||
66 | uint32_t desc[DESC_MAX_NUM_WORDS]; | ||
67 | hwaddr packet_desc_addr; | ||
68 | - uint8_t tx_packet[2048]; | ||
69 | uint8_t *p; | ||
70 | unsigned total_bytes; | ||
71 | int q = 0; | ||
72 | @@ -XXX,XX +XXX,XX @@ static void gem_transmit(CadenceGEMState *s) | ||
73 | * Packets scattered across multiple descriptors are gathered to this | ||
74 | * one contiguous buffer first. | ||
75 | */ | ||
76 | - p = tx_packet; | ||
77 | + p = s->tx_packet; | ||
78 | total_bytes = 0; | ||
79 | |||
80 | for (q = s->num_priority_queues - 1; q >= 0; q--) { | ||
81 | @@ -XXX,XX +XXX,XX @@ static void gem_transmit(CadenceGEMState *s) | ||
82 | break; | ||
83 | } | ||
84 | |||
85 | - if (tx_desc_get_length(desc) > sizeof(tx_packet) - | ||
86 | - (p - tx_packet)) { | ||
87 | + if (tx_desc_get_length(desc) > MAX_FRAME_SIZE - | ||
88 | + (p - s->tx_packet)) { | ||
89 | DB_PRINT("TX descriptor @ 0x%" HWADDR_PRIx \ | ||
90 | " too large: size 0x%x space 0x%zx\n", | ||
91 | packet_desc_addr, tx_desc_get_length(desc), | ||
92 | - sizeof(tx_packet) - (p - tx_packet)); | ||
93 | + MAX_FRAME_SIZE - (p - s->tx_packet)); | ||
94 | break; | ||
95 | } | ||
96 | |||
97 | @@ -XXX,XX +XXX,XX @@ static void gem_transmit(CadenceGEMState *s) | ||
98 | |||
99 | /* Is checksum offload enabled? */ | ||
100 | if (s->regs[GEM_DMACFG] & GEM_DMACFG_TXCSUM_OFFL) { | ||
101 | - net_checksum_calculate(tx_packet, total_bytes); | ||
102 | + net_checksum_calculate(s->tx_packet, total_bytes); | ||
103 | } | ||
104 | |||
105 | /* Update MAC statistics */ | ||
106 | - gem_transmit_updatestats(s, tx_packet, total_bytes); | ||
107 | + gem_transmit_updatestats(s, s->tx_packet, total_bytes); | ||
108 | |||
109 | /* Send the packet somewhere */ | ||
110 | if (s->phy_loop || (s->regs[GEM_NWCTRL] & | ||
111 | GEM_NWCTRL_LOCALLOOP)) { | ||
112 | - gem_receive(qemu_get_queue(s->nic), tx_packet, | ||
113 | + gem_receive(qemu_get_queue(s->nic), s->tx_packet, | ||
114 | total_bytes); | ||
115 | } else { | ||
116 | - qemu_send_packet(qemu_get_queue(s->nic), tx_packet, | ||
117 | + qemu_send_packet(qemu_get_queue(s->nic), s->tx_packet, | ||
118 | total_bytes); | ||
119 | } | ||
120 | |||
121 | /* Prepare for next packet */ | ||
122 | - p = tx_packet; | ||
123 | + p = s->tx_packet; | ||
124 | total_bytes = 0; | ||
125 | } | ||
126 | |||
127 | diff --git a/include/hw/net/cadence_gem.h b/include/hw/net/cadence_gem.h | ||
128 | index XXXXXXX..XXXXXXX 100644 | ||
129 | --- a/include/hw/net/cadence_gem.h | ||
130 | +++ b/include/hw/net/cadence_gem.h | ||
131 | @@ -XXX,XX +XXX,XX @@ | ||
132 | #define MAX_TYPE1_SCREENERS 16 | ||
133 | #define MAX_TYPE2_SCREENERS 16 | ||
134 | |||
135 | +#define MAX_FRAME_SIZE 2048 | ||
136 | + | ||
137 | typedef struct CadenceGEMState { | ||
138 | /*< private >*/ | ||
139 | SysBusDevice parent_obj; | ||
140 | @@ -XXX,XX +XXX,XX @@ typedef struct CadenceGEMState { | ||
141 | |||
142 | uint8_t can_rx_state; /* Debug only */ | ||
143 | |||
144 | + uint8_t tx_packet[MAX_FRAME_SIZE]; | ||
145 | + uint8_t rx_packet[MAX_FRAME_SIZE]; | ||
146 | uint32_t rx_desc[MAX_PRIORITY_QUEUES][DESC_MAX_NUM_WORDS]; | ||
147 | |||
148 | bool sar_active[4]; | ||
149 | -- | ||
150 | 2.5.0 | ||
151 | |||
Deleted patch | |||
---|---|---|---|
1 | From: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
2 | 1 | ||
3 | Fix the code style for register definitions. | ||
4 | |||
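The (offset / 4) form in these definitions converts the byte offsets from the datasheet into indices into the device's array of 32-bit registers.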
5 | Signed-off-by: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
6 | Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | ||
7 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
8 | --- | ||
9 | hw/net/cadence_gem.c | 204 ++++++++++++++++++++++++++------------------------- | ||
10 | 1 file changed, 103 insertions(+), 101 deletions(-) | ||
11 | |||
12 | diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/hw/net/cadence_gem.c | ||
15 | +++ b/hw/net/cadence_gem.c | ||
16 | @@ -XXX,XX +XXX,XX @@ | ||
17 | } \ | ||
18 | } while (0) | ||
19 | |||
20 | -#define GEM_NWCTRL (0x00000000/4) /* Network Control reg */ | ||
21 | -#define GEM_NWCFG (0x00000004/4) /* Network Config reg */ | ||
22 | -#define GEM_NWSTATUS (0x00000008/4) /* Network Status reg */ | ||
23 | -#define GEM_USERIO (0x0000000C/4) /* User IO reg */ | ||
24 | -#define GEM_DMACFG (0x00000010/4) /* DMA Control reg */ | ||
25 | -#define GEM_TXSTATUS (0x00000014/4) /* TX Status reg */ | ||
26 | -#define GEM_RXQBASE (0x00000018/4) /* RX Q Base address reg */ | ||
27 | -#define GEM_TXQBASE (0x0000001C/4) /* TX Q Base address reg */ | ||
28 | -#define GEM_RXSTATUS (0x00000020/4) /* RX Status reg */ | ||
29 | -#define GEM_ISR (0x00000024/4) /* Interrupt Status reg */ | ||
30 | -#define GEM_IER (0x00000028/4) /* Interrupt Enable reg */ | ||
31 | -#define GEM_IDR (0x0000002C/4) /* Interrupt Disable reg */ | ||
32 | -#define GEM_IMR (0x00000030/4) /* Interrupt Mask reg */ | ||
33 | -#define GEM_PHYMNTNC (0x00000034/4) /* Phy Maintenance reg */ | ||
34 | -#define GEM_RXPAUSE (0x00000038/4) /* RX Pause Time reg */ | ||
35 | -#define GEM_TXPAUSE (0x0000003C/4) /* TX Pause Time reg */ | ||
36 | -#define GEM_TXPARTIALSF (0x00000040/4) /* TX Partial Store and Forward */ | ||
37 | -#define GEM_RXPARTIALSF (0x00000044/4) /* RX Partial Store and Forward */ | ||
38 | -#define GEM_HASHLO (0x00000080/4) /* Hash Low address reg */ | ||
39 | -#define GEM_HASHHI (0x00000084/4) /* Hash High address reg */ | ||
40 | -#define GEM_SPADDR1LO (0x00000088/4) /* Specific addr 1 low reg */ | ||
41 | -#define GEM_SPADDR1HI (0x0000008C/4) /* Specific addr 1 high reg */ | ||
42 | -#define GEM_SPADDR2LO (0x00000090/4) /* Specific addr 2 low reg */ | ||
43 | -#define GEM_SPADDR2HI (0x00000094/4) /* Specific addr 2 high reg */ | ||
44 | -#define GEM_SPADDR3LO (0x00000098/4) /* Specific addr 3 low reg */ | ||
45 | -#define GEM_SPADDR3HI (0x0000009C/4) /* Specific addr 3 high reg */ | ||
46 | -#define GEM_SPADDR4LO (0x000000A0/4) /* Specific addr 4 low reg */ | ||
47 | -#define GEM_SPADDR4HI (0x000000A4/4) /* Specific addr 4 high reg */ | ||
48 | -#define GEM_TIDMATCH1 (0x000000A8/4) /* Type ID1 Match reg */ | ||
49 | -#define GEM_TIDMATCH2 (0x000000AC/4) /* Type ID2 Match reg */ | ||
50 | -#define GEM_TIDMATCH3 (0x000000B0/4) /* Type ID3 Match reg */ | ||
51 | -#define GEM_TIDMATCH4 (0x000000B4/4) /* Type ID4 Match reg */ | ||
52 | -#define GEM_WOLAN (0x000000B8/4) /* Wake on LAN reg */ | ||
53 | -#define GEM_IPGSTRETCH (0x000000BC/4) /* IPG Stretch reg */ | ||
54 | -#define GEM_SVLAN (0x000000C0/4) /* Stacked VLAN reg */ | ||
55 | -#define GEM_MODID (0x000000FC/4) /* Module ID reg */ | ||
56 | -#define GEM_OCTTXLO (0x00000100/4) /* Octects transmitted Low reg */ | ||
57 | -#define GEM_OCTTXHI (0x00000104/4) /* Octects transmitted High reg */ | ||
58 | -#define GEM_TXCNT (0x00000108/4) /* Error-free Frames transmitted */ | ||
59 | -#define GEM_TXBCNT (0x0000010C/4) /* Error-free Broadcast Frames */ | ||
60 | -#define GEM_TXMCNT (0x00000110/4) /* Error-free Multicast Frame */ | ||
61 | -#define GEM_TXPAUSECNT (0x00000114/4) /* Pause Frames Transmitted */ | ||
62 | -#define GEM_TX64CNT (0x00000118/4) /* Error-free 64 TX */ | ||
63 | -#define GEM_TX65CNT (0x0000011C/4) /* Error-free 65-127 TX */ | ||
64 | -#define GEM_TX128CNT (0x00000120/4) /* Error-free 128-255 TX */ | ||
65 | -#define GEM_TX256CNT (0x00000124/4) /* Error-free 256-511 */ | ||
66 | -#define GEM_TX512CNT (0x00000128/4) /* Error-free 512-1023 TX */ | ||
67 | -#define GEM_TX1024CNT (0x0000012C/4) /* Error-free 1024-1518 TX */ | ||
68 | -#define GEM_TX1519CNT (0x00000130/4) /* Error-free larger than 1519 TX */ | ||
69 | -#define GEM_TXURUNCNT (0x00000134/4) /* TX under run error counter */ | ||
70 | -#define GEM_SINGLECOLLCNT (0x00000138/4) /* Single Collision Frames */ | ||
71 | -#define GEM_MULTCOLLCNT (0x0000013C/4) /* Multiple Collision Frames */ | ||
72 | -#define GEM_EXCESSCOLLCNT (0x00000140/4) /* Excessive Collision Frames */ | ||
73 | -#define GEM_LATECOLLCNT (0x00000144/4) /* Late Collision Frames */ | ||
74 | -#define GEM_DEFERTXCNT (0x00000148/4) /* Deferred Transmission Frames */ | ||
75 | -#define GEM_CSENSECNT (0x0000014C/4) /* Carrier Sense Error Counter */ | ||
76 | -#define GEM_OCTRXLO (0x00000150/4) /* Octects Received register Low */ | ||
77 | -#define GEM_OCTRXHI (0x00000154/4) /* Octects Received register High */ | ||
78 | -#define GEM_RXCNT (0x00000158/4) /* Error-free Frames Received */ | ||
79 | -#define GEM_RXBROADCNT (0x0000015C/4) /* Error-free Broadcast Frames RX */ | ||
80 | -#define GEM_RXMULTICNT (0x00000160/4) /* Error-free Multicast Frames RX */ | ||
81 | -#define GEM_RXPAUSECNT (0x00000164/4) /* Pause Frames Received Counter */ | ||
82 | -#define GEM_RX64CNT (0x00000168/4) /* Error-free 64 byte Frames RX */ | ||
83 | -#define GEM_RX65CNT (0x0000016C/4) /* Error-free 65-127B Frames RX */ | ||
84 | -#define GEM_RX128CNT (0x00000170/4) /* Error-free 128-255B Frames RX */ | ||
85 | -#define GEM_RX256CNT (0x00000174/4) /* Error-free 256-512B Frames RX */ | ||
86 | -#define GEM_RX512CNT (0x00000178/4) /* Error-free 512-1023B Frames RX */ | ||
87 | -#define GEM_RX1024CNT (0x0000017C/4) /* Error-free 1024-1518B Frames RX */ | ||
88 | -#define GEM_RX1519CNT (0x00000180/4) /* Error-free 1519-max Frames RX */ | ||
89 | -#define GEM_RXUNDERCNT (0x00000184/4) /* Undersize Frames Received */ | ||
90 | -#define GEM_RXOVERCNT (0x00000188/4) /* Oversize Frames Received */ | ||
91 | -#define GEM_RXJABCNT (0x0000018C/4) /* Jabbers Received Counter */ | ||
92 | -#define GEM_RXFCSCNT (0x00000190/4) /* Frame Check seq. Error Counter */ | ||
93 | -#define GEM_RXLENERRCNT (0x00000194/4) /* Length Field Error Counter */ | ||
94 | -#define GEM_RXSYMERRCNT (0x00000198/4) /* Symbol Error Counter */ | ||
95 | -#define GEM_RXALIGNERRCNT (0x0000019C/4) /* Alignment Error Counter */ | ||
96 | -#define GEM_RXRSCERRCNT (0x000001A0/4) /* Receive Resource Error Counter */ | ||
97 | -#define GEM_RXORUNCNT (0x000001A4/4) /* Receive Overrun Counter */ | ||
98 | -#define GEM_RXIPCSERRCNT (0x000001A8/4) /* IP header Checksum Error Counter */ | ||
99 | -#define GEM_RXTCPCCNT (0x000001AC/4) /* TCP Checksum Error Counter */ | ||
100 | -#define GEM_RXUDPCCNT (0x000001B0/4) /* UDP Checksum Error Counter */ | ||
101 | - | ||
102 | -#define GEM_1588S (0x000001D0/4) /* 1588 Timer Seconds */ | ||
103 | -#define GEM_1588NS (0x000001D4/4) /* 1588 Timer Nanoseconds */ | ||
104 | -#define GEM_1588ADJ (0x000001D8/4) /* 1588 Timer Adjust */ | ||
105 | -#define GEM_1588INC (0x000001DC/4) /* 1588 Timer Increment */ | ||
106 | -#define GEM_PTPETXS (0x000001E0/4) /* PTP Event Frame Transmitted (s) */ | ||
107 | -#define GEM_PTPETXNS (0x000001E4/4) /* PTP Event Frame Transmitted (ns) */ | ||
108 | -#define GEM_PTPERXS (0x000001E8/4) /* PTP Event Frame Received (s) */ | ||
109 | -#define GEM_PTPERXNS (0x000001EC/4) /* PTP Event Frame Received (ns) */ | ||
110 | -#define GEM_PTPPTXS (0x000001E0/4) /* PTP Peer Frame Transmitted (s) */ | ||
111 | -#define GEM_PTPPTXNS (0x000001E4/4) /* PTP Peer Frame Transmitted (ns) */ | ||
112 | -#define GEM_PTPPRXS (0x000001E8/4) /* PTP Peer Frame Received (s) */ | ||
113 | -#define GEM_PTPPRXNS (0x000001EC/4) /* PTP Peer Frame Received (ns) */ | ||
114 | +#define GEM_NWCTRL (0x00000000 / 4) /* Network Control reg */ | ||
115 | +#define GEM_NWCFG (0x00000004 / 4) /* Network Config reg */ | ||
116 | +#define GEM_NWSTATUS (0x00000008 / 4) /* Network Status reg */ | ||
117 | +#define GEM_USERIO (0x0000000C / 4) /* User IO reg */ | ||
118 | +#define GEM_DMACFG (0x00000010 / 4) /* DMA Control reg */ | ||
119 | +#define GEM_TXSTATUS (0x00000014 / 4) /* TX Status reg */ | ||
120 | +#define GEM_RXQBASE (0x00000018 / 4) /* RX Q Base address reg */ | ||
121 | +#define GEM_TXQBASE (0x0000001C / 4) /* TX Q Base address reg */ | ||
122 | +#define GEM_RXSTATUS (0x00000020 / 4) /* RX Status reg */ | ||
123 | +#define GEM_ISR (0x00000024 / 4) /* Interrupt Status reg */ | ||
124 | +#define GEM_IER (0x00000028 / 4) /* Interrupt Enable reg */ | ||
125 | +#define GEM_IDR (0x0000002C / 4) /* Interrupt Disable reg */ | ||
126 | +#define GEM_IMR (0x00000030 / 4) /* Interrupt Mask reg */ | ||
127 | +#define GEM_PHYMNTNC (0x00000034 / 4) /* Phy Maintenance reg */ | ||
128 | +#define GEM_RXPAUSE (0x00000038 / 4) /* RX Pause Time reg */ | ||
129 | +#define GEM_TXPAUSE (0x0000003C / 4) /* TX Pause Time reg */ | ||
130 | +#define GEM_TXPARTIALSF (0x00000040 / 4) /* TX Partial Store and Forward */ | ||
131 | +#define GEM_RXPARTIALSF (0x00000044 / 4) /* RX Partial Store and Forward */ | ||
132 | +#define GEM_HASHLO (0x00000080 / 4) /* Hash Low address reg */ | ||
133 | +#define GEM_HASHHI (0x00000084 / 4) /* Hash High address reg */ | ||
134 | +#define GEM_SPADDR1LO (0x00000088 / 4) /* Specific addr 1 low reg */ | ||
135 | +#define GEM_SPADDR1HI (0x0000008C / 4) /* Specific addr 1 high reg */ | ||
136 | +#define GEM_SPADDR2LO (0x00000090 / 4) /* Specific addr 2 low reg */ | ||
137 | +#define GEM_SPADDR2HI (0x00000094 / 4) /* Specific addr 2 high reg */ | ||
138 | +#define GEM_SPADDR3LO (0x00000098 / 4) /* Specific addr 3 low reg */ | ||
139 | +#define GEM_SPADDR3HI (0x0000009C / 4) /* Specific addr 3 high reg */ | ||
140 | +#define GEM_SPADDR4LO (0x000000A0 / 4) /* Specific addr 4 low reg */ | ||
141 | +#define GEM_SPADDR4HI (0x000000A4 / 4) /* Specific addr 4 high reg */ | ||
142 | +#define GEM_TIDMATCH1 (0x000000A8 / 4) /* Type ID1 Match reg */ | ||
143 | +#define GEM_TIDMATCH2 (0x000000AC / 4) /* Type ID2 Match reg */ | ||
144 | +#define GEM_TIDMATCH3 (0x000000B0 / 4) /* Type ID3 Match reg */ | ||
145 | +#define GEM_TIDMATCH4 (0x000000B4 / 4) /* Type ID4 Match reg */ | ||
146 | +#define GEM_WOLAN (0x000000B8 / 4) /* Wake on LAN reg */ | ||
147 | +#define GEM_IPGSTRETCH (0x000000BC / 4) /* IPG Stretch reg */ | ||
148 | +#define GEM_SVLAN (0x000000C0 / 4) /* Stacked VLAN reg */ | ||
149 | +#define GEM_MODID (0x000000FC / 4) /* Module ID reg */ | ||
150 | +#define GEM_OCTTXLO (0x00000100 / 4) /* Octects transmitted Low reg */ | ||
151 | +#define GEM_OCTTXHI (0x00000104 / 4) /* Octects transmitted High reg */ | ||
152 | +#define GEM_TXCNT (0x00000108 / 4) /* Error-free Frames transmitted */ | ||
153 | +#define GEM_TXBCNT (0x0000010C / 4) /* Error-free Broadcast Frames */ | ||
154 | +#define GEM_TXMCNT (0x00000110 / 4) /* Error-free Multicast Frame */ | ||
155 | +#define GEM_TXPAUSECNT (0x00000114 / 4) /* Pause Frames Transmitted */ | ||
156 | +#define GEM_TX64CNT (0x00000118 / 4) /* Error-free 64 TX */ | ||
157 | +#define GEM_TX65CNT (0x0000011C / 4) /* Error-free 65-127 TX */ | ||
158 | +#define GEM_TX128CNT (0x00000120 / 4) /* Error-free 128-255 TX */ | ||
159 | +#define GEM_TX256CNT (0x00000124 / 4) /* Error-free 256-511 */ | ||
160 | +#define GEM_TX512CNT (0x00000128 / 4) /* Error-free 512-1023 TX */ | ||
161 | +#define GEM_TX1024CNT (0x0000012C / 4) /* Error-free 1024-1518 TX */ | ||
162 | +#define GEM_TX1519CNT (0x00000130 / 4) /* Error-free larger than 1519 TX */ | ||
163 | +#define GEM_TXURUNCNT (0x00000134 / 4) /* TX under run error counter */ | ||
164 | +#define GEM_SINGLECOLLCNT (0x00000138 / 4) /* Single Collision Frames */ | ||
165 | +#define GEM_MULTCOLLCNT (0x0000013C / 4) /* Multiple Collision Frames */ | ||
166 | +#define GEM_EXCESSCOLLCNT (0x00000140 / 4) /* Excessive Collision Frames */ | ||
167 | +#define GEM_LATECOLLCNT (0x00000144 / 4) /* Late Collision Frames */ | ||
168 | +#define GEM_DEFERTXCNT (0x00000148 / 4) /* Deferred Transmission Frames */ | ||
169 | +#define GEM_CSENSECNT (0x0000014C / 4) /* Carrier Sense Error Counter */ | ||
170 | +#define GEM_OCTRXLO (0x00000150 / 4) /* Octects Received register Low */ | ||
171 | +#define GEM_OCTRXHI (0x00000154 / 4) /* Octects Received register High */ | ||
172 | +#define GEM_RXCNT (0x00000158 / 4) /* Error-free Frames Received */ | ||
173 | +#define GEM_RXBROADCNT (0x0000015C / 4) /* Error-free Broadcast Frames RX */ | ||
174 | +#define GEM_RXMULTICNT (0x00000160 / 4) /* Error-free Multicast Frames RX */ | ||
175 | +#define GEM_RXPAUSECNT (0x00000164 / 4) /* Pause Frames Received Counter */ | ||
176 | +#define GEM_RX64CNT (0x00000168 / 4) /* Error-free 64 byte Frames RX */ | ||
177 | +#define GEM_RX65CNT (0x0000016C / 4) /* Error-free 65-127B Frames RX */ | ||
178 | +#define GEM_RX128CNT (0x00000170 / 4) /* Error-free 128-255B Frames RX */ | ||
179 | +#define GEM_RX256CNT (0x00000174 / 4) /* Error-free 256-512B Frames RX */ | ||
180 | +#define GEM_RX512CNT (0x00000178 / 4) /* Error-free 512-1023B Frames RX */ | ||
181 | +#define GEM_RX1024CNT (0x0000017C / 4) /* Error-free 1024-1518B Frames RX */ | ||
182 | +#define GEM_RX1519CNT (0x00000180 / 4) /* Error-free 1519-max Frames RX */ | ||
183 | +#define GEM_RXUNDERCNT (0x00000184 / 4) /* Undersize Frames Received */ | ||
184 | +#define GEM_RXOVERCNT (0x00000188 / 4) /* Oversize Frames Received */ | ||
185 | +#define GEM_RXJABCNT (0x0000018C / 4) /* Jabbers Received Counter */ | ||
186 | +#define GEM_RXFCSCNT (0x00000190 / 4) /* Frame Check seq. Error Counter */ | ||
187 | +#define GEM_RXLENERRCNT (0x00000194 / 4) /* Length Field Error Counter */ | ||
188 | +#define GEM_RXSYMERRCNT (0x00000198 / 4) /* Symbol Error Counter */ | ||
189 | +#define GEM_RXALIGNERRCNT (0x0000019C / 4) /* Alignment Error Counter */ | ||
190 | +#define GEM_RXRSCERRCNT (0x000001A0 / 4) /* Receive Resource Error Counter */ | ||
191 | +#define GEM_RXORUNCNT (0x000001A4 / 4) /* Receive Overrun Counter */ | ||
192 | +#define GEM_RXIPCSERRCNT (0x000001A8 / 4) /* IP header Checksum Err Counter */ | ||
193 | +#define GEM_RXTCPCCNT (0x000001AC / 4) /* TCP Checksum Error Counter */ | ||
194 | +#define GEM_RXUDPCCNT (0x000001B0 / 4) /* UDP Checksum Error Counter */ | ||
195 | + | ||
196 | +#define GEM_1588S (0x000001D0 / 4) /* 1588 Timer Seconds */ | ||
197 | +#define GEM_1588NS (0x000001D4 / 4) /* 1588 Timer Nanoseconds */ | ||
198 | +#define GEM_1588ADJ (0x000001D8 / 4) /* 1588 Timer Adjust */ | ||
199 | +#define GEM_1588INC (0x000001DC / 4) /* 1588 Timer Increment */ | ||
200 | +#define GEM_PTPETXS (0x000001E0 / 4) /* PTP Event Frame Transmitted (s) */ | ||
201 | +#define GEM_PTPETXNS (0x000001E4 / 4) /* | ||
202 | + * PTP Event Frame Transmitted (ns) | ||
203 | + */ | ||
204 | +#define GEM_PTPERXS (0x000001E8 / 4) /* PTP Event Frame Received (s) */ | ||
205 | +#define GEM_PTPERXNS (0x000001EC / 4) /* PTP Event Frame Received (ns) */ | ||
206 | +#define GEM_PTPPTXS (0x000001E0 / 4) /* PTP Peer Frame Transmitted (s) */ | ||
207 | +#define GEM_PTPPTXNS (0x000001E4 / 4) /* PTP Peer Frame Transmitted (ns) */ | ||
208 | +#define GEM_PTPPRXS (0x000001E8 / 4) /* PTP Peer Frame Received (s) */ | ||
209 | +#define GEM_PTPPRXNS (0x000001EC / 4) /* PTP Peer Frame Received (ns) */ | ||
210 | |||
211 | /* Design Configuration Registers */ | ||
212 | -#define GEM_DESCONF (0x00000280/4) | ||
213 | -#define GEM_DESCONF2 (0x00000284/4) | ||
214 | -#define GEM_DESCONF3 (0x00000288/4) | ||
215 | -#define GEM_DESCONF4 (0x0000028C/4) | ||
216 | -#define GEM_DESCONF5 (0x00000290/4) | ||
217 | -#define GEM_DESCONF6 (0x00000294/4) | ||
218 | +#define GEM_DESCONF (0x00000280 / 4) | ||
219 | +#define GEM_DESCONF2 (0x00000284 / 4) | ||
220 | +#define GEM_DESCONF3 (0x00000288 / 4) | ||
221 | +#define GEM_DESCONF4 (0x0000028C / 4) | ||
222 | +#define GEM_DESCONF5 (0x00000290 / 4) | ||
223 | +#define GEM_DESCONF6 (0x00000294 / 4) | ||
224 | #define GEM_DESCONF6_64B_MASK (1U << 23) | ||
225 | -#define GEM_DESCONF7 (0x00000298/4) | ||
226 | +#define GEM_DESCONF7 (0x00000298 / 4) | ||
227 | |||
228 | #define GEM_INT_Q1_STATUS (0x00000400 / 4) | ||
229 | #define GEM_INT_Q1_MASK (0x00000640 / 4) | ||
230 | -- | ||
231 | 2.5.0 | ||
232 | |||
233 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
2 | 1 | ||
3 | Advertise support for clear-on-read on the ISR registers. | ||
4 | |||
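For context, the reset-value change below flips exactly one bit of GEM_DESCONF; the arithmetic can be checked in isolation (tying that bit to the clear-on-read capability is this patch's claim, not something verified here):

    /* sanity check: 0x02500111 -> 0x02D00111 toggles only bit 23 */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t old_val = 0x02500111, new_val = 0x02D00111;
        assert((old_val ^ new_val) == (UINT32_C(1) << 23));
        return 0;
    }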
5 | Signed-off-by: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
6 | Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | ||
7 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
8 | --- | ||
9 | hw/net/cadence_gem.c | 2 +- | ||
10 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
11 | |||
12 | diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/hw/net/cadence_gem.c | ||
15 | +++ b/hw/net/cadence_gem.c | ||
16 | @@ -XXX,XX +XXX,XX @@ static void gem_reset(DeviceState *d) | ||
17 | s->regs[GEM_TXPARTIALSF] = 0x000003ff; | ||
18 | s->regs[GEM_RXPARTIALSF] = 0x000003ff; | ||
19 | s->regs[GEM_MODID] = s->revision; | ||
20 | - s->regs[GEM_DESCONF] = 0x02500111; | ||
21 | + s->regs[GEM_DESCONF] = 0x02D00111; | ||
22 | s->regs[GEM_DESCONF2] = 0x2ab10000 | s->jumbo_max_len; | ||
23 | s->regs[GEM_DESCONF5] = 0x002f2045; | ||
24 | s->regs[GEM_DESCONF6] = GEM_DESCONF6_64B_MASK; | ||
25 | -- | ||
26 | 2.5.0 | ||
27 | |||
28 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
2 | 1 | ||
3 | Mask all interrupts on reset. | ||
4 | |||
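A minimal sketch of the mask semantics assumed here, namely that a bit set in an interrupt mask register suppresses the corresponding source (illustrative only, not the cadence_gem implementation):

    #include <stdint.h>

    static int queue_irq_pending(uint32_t int_status, uint32_t int_mask)
    {
        /* only unmasked status bits can raise the queue interrupt */
        return (int_status & ~int_mask) != 0;
    }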
5 | Signed-off-by: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
6 | Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | ||
7 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
8 | --- | ||
9 | hw/net/cadence_gem.c | 1 + | ||
10 | 1 file changed, 1 insertion(+) | ||
11 | |||
12 | diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/hw/net/cadence_gem.c | ||
15 | +++ b/hw/net/cadence_gem.c | ||
16 | @@ -XXX,XX +XXX,XX @@ static void gem_reset(DeviceState *d) | ||
17 | s->regs[GEM_DESCONF2] = 0x2ab10000 | s->jumbo_max_len; | ||
18 | s->regs[GEM_DESCONF5] = 0x002f2045; | ||
19 | s->regs[GEM_DESCONF6] = GEM_DESCONF6_64B_MASK; | ||
20 | + s->regs[GEM_INT_Q1_MASK] = 0x00000CE6; | ||
21 | s->regs[GEM_JUMBO_MAX_LEN] = s->jumbo_max_len; | ||
22 | |||
23 | if (s->num_priority_queues > 1) { | ||
24 | -- | ||
25 | 2.5.0 | ||
26 | |||
27 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
2 | 1 | ||
3 | The TX_LAST bit should not be set by hardware; it is set by the guest | ||
4 | to mark the last buffer descriptor (bd) of the frame. | ||
5 | |||
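A sketch of the guest's side of that contract, assuming the two-word descriptor layout of cadence_gem.c and a hypothetical bit position for DESC_1_TX_LAST:

    #include <stdint.h>

    #define DESC_1_TX_LAST (1u << 15) /* assumed position, see cadence_gem.c */

    /* the guest driver, not the device model, flags the frame's final bd */
    static void mark_last_bd(uint32_t *desc_ring, int last_bd_index)
    {
        desc_ring[2 * last_bd_index + 1] |= DESC_1_TX_LAST;
    }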
6 | Signed-off-by: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
7 | Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | ||
8 | Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
10 | --- | ||
11 | hw/net/cadence_gem.c | 6 ------ | ||
12 | 1 file changed, 6 deletions(-) | ||
13 | |||
14 | diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/hw/net/cadence_gem.c | ||
17 | +++ b/hw/net/cadence_gem.c | ||
18 | @@ -XXX,XX +XXX,XX @@ static inline unsigned tx_desc_get_last(uint32_t *desc) | ||
19 | return (desc[1] & DESC_1_TX_LAST) ? 1 : 0; | ||
20 | } | ||
21 | |||
22 | -static inline void tx_desc_set_last(uint32_t *desc) | ||
23 | -{ | ||
24 | - desc[1] |= DESC_1_TX_LAST; | ||
25 | -} | ||
26 | - | ||
27 | static inline unsigned tx_desc_get_length(uint32_t *desc) | ||
28 | { | ||
29 | return desc[1] & DESC_1_LENGTH; | ||
30 | @@ -XXX,XX +XXX,XX @@ static void gem_transmit(CadenceGEMState *s) | ||
31 | |||
32 | /* read next descriptor */ | ||
33 | if (tx_desc_get_wrap(desc)) { | ||
34 | - tx_desc_set_last(desc); | ||
35 | |||
36 | if (s->regs[GEM_DMACFG] & GEM_DMACFG_ADDR_64B) { | ||
37 | packet_desc_addr = s->regs[GEM_TBQPH]; | ||
38 | -- | ||
39 | 2.5.0 | ||
40 | |||
41 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Tong Ho <tong.ho@xilinx.com> | ||
2 | 1 | ||
3 | Two defects are fixed: | ||
4 | |||
5 | 1/ Detection of multicast frames | ||
6 | 2/ Treating drop of mis-addressed frames as non-error | ||
7 | |||
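On the first defect: an Ethernet address is multicast when the I/G bit, the least significant bit of the first octet, is set, which is what is_multicast_ether_addr() tests; comparing the whole first octet against 0x01 misses, for example, IPv6 multicast addresses starting with 33:33. A minimal equivalent of that test:

    #include <stdbool.h>
    #include <stdint.h>

    static bool eth_is_multicast(const uint8_t *addr)
    {
        return addr[0] & 1; /* I/G bit */
    }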
8 | Signed-off-by: Tong Ho <tong.ho@xilinx.com> | ||
9 | Signed-off-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | ||
10 | Signed-off-by: Sai Pavan Boddu <sai.pavan.boddu@xilinx.com> | ||
11 | Reviewed-by: Edgar E. Iglesias <edgar.iglesias@xilinx.com> | ||
12 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
13 | --- | ||
14 | hw/net/cadence_gem.c | 26 +++++++++++--------------- | ||
15 | 1 file changed, 11 insertions(+), 15 deletions(-) | ||
16 | |||
17 | diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c | ||
18 | index XXXXXXX..XXXXXXX 100644 | ||
19 | --- a/hw/net/cadence_gem.c | ||
20 | +++ b/hw/net/cadence_gem.c | ||
21 | @@ -XXX,XX +XXX,XX @@ | ||
22 | #include "qemu/module.h" | ||
23 | #include "sysemu/dma.h" | ||
24 | #include "net/checksum.h" | ||
25 | +#include "net/eth.h" | ||
26 | |||
27 | #define CADENCE_GEM_ERR_DEBUG 0 | ||
28 | #define DB_PRINT(...) do {\ | ||
29 | @@ -XXX,XX +XXX,XX @@ static unsigned calc_mac_hash(const uint8_t *mac) | ||
30 | static int gem_mac_address_filter(CadenceGEMState *s, const uint8_t *packet) | ||
31 | { | ||
32 | uint8_t *gem_spaddr; | ||
33 | - int i; | ||
34 | + int i, is_mc; | ||
35 | |||
36 | /* Promiscuous mode? */ | ||
37 | if (s->regs[GEM_NWCFG] & GEM_NWCFG_PROMISC) { | ||
38 | @@ -XXX,XX +XXX,XX @@ static int gem_mac_address_filter(CadenceGEMState *s, const uint8_t *packet) | ||
39 | } | ||
40 | |||
41 | /* Accept packets -w- hash match? */ | ||
42 | - if ((packet[0] == 0x01 && (s->regs[GEM_NWCFG] & GEM_NWCFG_MCAST_HASH)) || | ||
43 | - (packet[0] != 0x01 && (s->regs[GEM_NWCFG] & GEM_NWCFG_UCAST_HASH))) { | ||
44 | + is_mc = is_multicast_ether_addr(packet); | ||
45 | + if ((is_mc && (s->regs[GEM_NWCFG] & GEM_NWCFG_MCAST_HASH)) || | ||
46 | + (!is_mc && (s->regs[GEM_NWCFG] & GEM_NWCFG_UCAST_HASH))) { | ||
47 | + uint64_t buckets; | ||
48 | unsigned hash_index; | ||
49 | |||
50 | hash_index = calc_mac_hash(packet); | ||
51 | - if (hash_index < 32) { | ||
52 | - if (s->regs[GEM_HASHLO] & (1<<hash_index)) { | ||
53 | - return packet[0] == 0x01 ? GEM_RX_MULTICAST_HASH_ACCEPT : | ||
54 | - GEM_RX_UNICAST_HASH_ACCEPT; | ||
55 | - } | ||
56 | - } else { | ||
57 | - hash_index -= 32; | ||
58 | - if (s->regs[GEM_HASHHI] & (1<<hash_index)) { | ||
59 | - return packet[0] == 0x01 ? GEM_RX_MULTICAST_HASH_ACCEPT : | ||
60 | - GEM_RX_UNICAST_HASH_ACCEPT; | ||
61 | - } | ||
62 | + buckets = ((uint64_t)s->regs[GEM_HASHHI] << 32) | s->regs[GEM_HASHLO]; | ||
63 | + if ((buckets >> hash_index) & 1) { | ||
64 | + return is_mc ? GEM_RX_MULTICAST_HASH_ACCEPT | ||
65 | + : GEM_RX_UNICAST_HASH_ACCEPT; | ||
66 | } | ||
67 | } | ||
68 | |||
69 | @@ -XXX,XX +XXX,XX @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size) | ||
70 | /* Is this destination MAC address "for us" ? */ | ||
71 | maf = gem_mac_address_filter(s, buf); | ||
72 | if (maf == GEM_RX_REJECT) { | ||
73 | - return -1; | ||
74 | + return size; /* no, drop silently because it's not an error */ | ||
75 | } | ||
76 | |||
77 | /* Discard packets with receive length error enabled ? */ | ||
78 | -- | ||
79 | 2.5.0 | ||
80 | |||
81 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | The sender of the packet is checked in qemu_net_queue_purge(), but we | ||
2 | pass the NetClientState itself rather than its peer when purging the | ||
3 | incoming queue in qemu_flush_or_purge_queued_packets(). This triggers | ||
4 | the assert in virtio_net_reset since the sender check can never pass: | ||
5 | 1 | ||
6 | hw/net/virtio-net.c:533: void virtio_net_reset(VirtIODevice *): Assertion | ||
7 | `!virtio_net_get_subqueue(nc)->async_tx.elem' failed. | ||
8 | #9 0x55a33fa31b78 in virtio_net_reset hw/net/virtio-net.c:533:13 | ||
9 | #10 0x55a33fc88412 in virtio_reset hw/virtio/virtio.c:1919:9 | ||
10 | #11 0x55a341d82764 in virtio_bus_reset hw/virtio/virtio-bus.c:95:9 | ||
11 | #12 0x55a341dba2de in virtio_pci_reset hw/virtio/virtio-pci.c:1824:5 | ||
12 | #13 0x55a341db3e02 in virtio_pci_common_write hw/virtio/virtio-pci.c:1252:13 | ||
13 | #14 0x55a33f62117b in memory_region_write_accessor memory.c:496:5 | ||
14 | #15 0x55a33f6205e4 in access_with_adjusted_size memory.c:557:18 | ||
15 | #16 0x55a33f61e177 in memory_region_dispatch_write memory.c:1488:16 | ||
16 | |||
17 | Reproducer: | ||
18 | https://www.mail-archive.com/qemu-devel@nongnu.org/msg701914.html | ||
19 | |||
20 | Fix by using the peer. | ||
21 | |||
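A sketch of why the original call matched nothing; types and fields are illustrative, following the convention in net/queue.c that each queued packet records its sender:

    typedef struct NetClientState NetClientState;

    typedef struct QueuedPacket {
        NetClientState *sender; /* set to nc->peer when queued on nc */
    } QueuedPacket;

    static int purge_matches(const QueuedPacket *pkt, const NetClientState *from)
    {
        /* purging with nc itself never matches; nc->peer does */
        return pkt->sender == from;
    }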
22 | Reported-by: "Alexander Bulekov" <alxndr@bu.edu> | ||
23 | Acked-by: Alexander Bulekov <alxndr@bu.edu> | ||
24 | Fixes: ca77d85e1dbf9 ("net: complete all queued packets on VM stop") | ||
25 | Cc: qemu-stable@nongnu.org | ||
26 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
27 | --- | ||
28 | net/net.c | 2 +- | ||
29 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
30 | |||
31 | diff --git a/net/net.c b/net/net.c | ||
32 | index XXXXXXX..XXXXXXX 100644 | ||
33 | --- a/net/net.c | ||
34 | +++ b/net/net.c | ||
35 | @@ -XXX,XX +XXX,XX @@ void qemu_flush_or_purge_queued_packets(NetClientState *nc, bool purge) | ||
36 | qemu_notify_event(); | ||
37 | } else if (purge) { | ||
38 | /* Unable to empty the queue, purge remaining packets */ | ||
39 | - qemu_net_queue_purge(nc->incoming_queue, nc); | ||
40 | + qemu_net_queue_purge(nc->incoming_queue, nc->peer); | ||
41 | } | ||
42 | } | ||
43 | |||
44 | -- | ||
45 | 2.5.0 | ||
46 | |||
47 | diff view generated by jsdifflib |
1 | From: Derek Su <dereksu@qnap.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | This patch fixes the "pkt" memory leak in packet_enqueue(). | 3 | This is needed to achieve migration, so the destination can restore its |
4 | The allocated "pkt" must be freed when the colo-compare | 4 | index. |
5 | primary or secondary queue is already full. | 5 | |
6 | 5 | ||
7 | Replace the full-queue error_report with a trace event. | 7 | the entries that the device did not use, including the in-flight |
7 | the entries that the device did not use, including the in-flight | ||
8 | processing ones. | ||
8 | 9 | ||
9 | Signed-off-by: Derek Su <dereksu@qnap.com> | 10 | This is ok for networking, but other kinds of devices might have |
10 | Reviewed-by: Zhang Chen <chen.zhang@intel.com> | 11 | problems with these retransmissions. |
11 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 12 | |
12 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | 13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
14 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
13 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
14 | --- | 16 | --- |
15 | net/colo-compare.c | 23 +++++++++++++++-------- | 17 | hw/virtio/vhost-vdpa.c | 17 +++++++++++++++++ |
16 | net/trace-events | 1 + | 18 | 1 file changed, 17 insertions(+) |
17 | 2 files changed, 16 insertions(+), 8 deletions(-) | ||
18 | 19 | ||
19 | diff --git a/net/colo-compare.c b/net/colo-compare.c | 20 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
20 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
21 | --- a/net/colo-compare.c | 22 | --- a/hw/virtio/vhost-vdpa.c |
22 | +++ b/net/colo-compare.c | 23 | +++ b/hw/virtio/vhost-vdpa.c |
23 | @@ -XXX,XX +XXX,XX @@ enum { | 24 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, |
24 | SECONDARY_IN, | 25 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, |
25 | }; | 26 | struct vhost_vring_state *ring) |
26 | 27 | { | |
27 | +static const char *colo_mode[] = { | 28 | + struct vhost_vdpa *v = dev->opaque; |
28 | + [PRIMARY_IN] = "primary", | 29 | int ret; |
29 | + [SECONDARY_IN] = "secondary", | 30 | |
30 | +}; | 31 | + if (v->shadow_vqs_enabled) { |
31 | 32 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, | |
32 | static int compare_chr_send(CompareState *s, | 33 | + ring->index); |
33 | uint8_t *buf, | ||
34 | @@ -XXX,XX +XXX,XX @@ static int packet_enqueue(CompareState *s, int mode, Connection **con) | ||
35 | ConnectionKey key; | ||
36 | Packet *pkt = NULL; | ||
37 | Connection *conn; | ||
38 | + int ret; | ||
39 | |||
40 | if (mode == PRIMARY_IN) { | ||
41 | pkt = packet_new(s->pri_rs.buf, | ||
42 | @@ -XXX,XX +XXX,XX @@ static int packet_enqueue(CompareState *s, int mode, Connection **con) | ||
43 | } | ||
44 | |||
45 | if (mode == PRIMARY_IN) { | ||
46 | - if (!colo_insert_packet(&conn->primary_list, pkt, &conn->pack)) { | ||
47 | - error_report("colo compare primary queue size too big," | ||
48 | - "drop packet"); | ||
49 | - } | ||
50 | + ret = colo_insert_packet(&conn->primary_list, pkt, &conn->pack); | ||
51 | } else { | ||
52 | - if (!colo_insert_packet(&conn->secondary_list, pkt, &conn->sack)) { | ||
53 | - error_report("colo compare secondary queue size too big," | ||
54 | - "drop packet"); | ||
55 | - } | ||
56 | + ret = colo_insert_packet(&conn->secondary_list, pkt, &conn->sack); | ||
57 | } | ||
58 | + | 34 | + |
59 | + if (!ret) { | 35 | + /* |
60 | + trace_colo_compare_drop_packet(colo_mode[mode], | 36 | + * Setting base as last used idx, so destination will see as available |
61 | + "queue size too big, drop packet"); | 37 | + * all the entries that the device did not use, including the in-flight |
62 | + packet_destroy(pkt, NULL); | 38 | + * processing ones. |
63 | + pkt = NULL; | 39 | + * |
40 | + * TODO: This is ok for networking, but other kinds of devices might | ||
41 | + * have problems with these retransmissions. | ||
42 | + */ | ||
43 | + ring->num = svq->last_used_idx; | ||
44 | + return 0; | ||
64 | + } | 45 | + } |
65 | + | 46 | + |
66 | *con = conn; | 47 | ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); |
67 | 48 | trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); | |
68 | return 0; | 49 | return ret; |
69 | diff --git a/net/trace-events b/net/trace-events | ||
70 | index XXXXXXX..XXXXXXX 100644 | ||
71 | --- a/net/trace-events | ||
72 | +++ b/net/trace-events | ||
73 | @@ -XXX,XX +XXX,XX @@ colo_proxy_main(const char *chr) ": %s" | ||
74 | |||
75 | # colo-compare.c | ||
76 | colo_compare_main(const char *chr) ": %s" | ||
77 | +colo_compare_drop_packet(const char *queue, const char *chr) ": %s: %s" | ||
78 | colo_compare_udp_miscompare(const char *sta, int size) ": %s = %d" | ||
79 | colo_compare_icmp_miscompare(const char *sta, int size) ": %s = %d" | ||
80 | colo_compare_ip_info(int psize, const char *sta, const char *stb, int ssize, const char *stc, const char *std) "ppkt size = %d, ip_src = %s, ip_dst = %s, spkt size = %d, ip_src = %s, ip_dst = %s" | ||
81 | -- | 50 | -- |
82 | 2.5.0 | 51 | 2.7.4 |
83 | 52 | ||
84 | 53 | diff view generated by jsdifflib |
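A worked example of the vring-base choice described on the right-hand side above: with a last used index of 5 and an available index of 8, restoring base 5 on the destination re-exposes the three in-flight descriptors.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint16_t last_used_idx = 5, avail_idx = 8;
        /* entries 5, 6 and 7 will be seen as available again */
        assert((uint16_t)(avail_idx - last_used_idx) == 3);
        return 0;
    }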
1 | From: Philippe Mathieu-Daudé <f4bug@amsat.org> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | libFuzzer found the following crash using 'qemu-system-i386 -M q35': | 3 | Setting the log address would make the device start reporting invalid |
4 | dirty memory because the SVQ vrings are located in qemu's memory. | ||
4 | 5 | ||
5 | qemu: hardware error: e1000e: PSRCTL.BSIZE0 cannot be zero | 6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
6 | CPU #0: | 7 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
7 | EAX=00000000 EBX=00000000 ECX=00000000 EDX=00000663 | ||
8 | ESI=00000000 EDI=00000000 EBP=00000000 ESP=00000000 | ||
9 | EIP=0000fff0 EFL=00000002 [-------] CPL=0 II=0 A20=1 SMM=0 HLT=0 | ||
10 | ES =0000 00000000 0000ffff 00009300 | ||
11 | CS =f000 ffff0000 0000ffff 00009b00 | ||
12 | SS =0000 00000000 0000ffff 00009300 | ||
13 | DS =0000 00000000 0000ffff 00009300 | ||
14 | FS =0000 00000000 0000ffff 00009300 | ||
15 | GS =0000 00000000 0000ffff 00009300 | ||
16 | LDT=0000 00000000 0000ffff 00008200 | ||
17 | TR =0000 00000000 0000ffff 00008b00 | ||
18 | GDT= 00000000 0000ffff | ||
19 | IDT= 00000000 0000ffff | ||
20 | CR0=60000010 CR2=00000000 CR3=00000000 CR4=00000000 | ||
21 | DR0=00000000 DR1=00000000 DR2=00000000 DR3=00000000 | ||
22 | DR6=ffff0ff0 DR7=00000400 | ||
23 | EFER=0000000000000000 | ||
24 | FCW=037f FSW=0000 [ST=0] FTW=00 MXCSR=00001f80 | ||
25 | FPR0=0000000000000000 0000 FPR1=0000000000000000 0000 | ||
26 | FPR2=0000000000000000 0000 FPR3=0000000000000000 0000 | ||
27 | FPR4=0000000000000000 0000 FPR5=0000000000000000 0000 | ||
28 | FPR6=0000000000000000 0000 FPR7=0000000000000000 0000 | ||
29 | XMM00=00000000000000000000000000000000 XMM01=00000000000000000000000000000000 | ||
30 | XMM02=00000000000000000000000000000000 XMM03=00000000000000000000000000000000 | ||
31 | XMM04=00000000000000000000000000000000 XMM05=00000000000000000000000000000000 | ||
32 | XMM06=00000000000000000000000000000000 XMM07=00000000000000000000000000000000 | ||
33 | ==1988== ERROR: libFuzzer: deadly signal | ||
34 | #6 0x7fae4d3ea894 in __GI_abort (/lib64/libc.so.6+0x22894) | ||
35 | #7 0x563f4cc59a1d in hw_error (qemu-fuzz-i386+0xe8ca1d) | ||
36 | #8 0x563f4d7c93f2 in e1000e_set_psrctl (qemu-fuzz-i386+0x19fc3f2) | ||
37 | #9 0x563f4d7b798f in e1000e_core_write (qemu-fuzz-i386+0x19ea98f) | ||
38 | #10 0x563f4d7afc46 in e1000e_mmio_write (qemu-fuzz-i386+0x19e2c46) | ||
39 | #11 0x563f4cc9a0a7 in memory_region_write_accessor (qemu-fuzz-i386+0xecd0a7) | ||
40 | #12 0x563f4cc99c13 in access_with_adjusted_size (qemu-fuzz-i386+0xeccc13) | ||
41 | #13 0x563f4cc987b4 in memory_region_dispatch_write (qemu-fuzz-i386+0xecb7b4) | ||
42 | |||
43 | It simply sent the following two I/O commands to the e1000e | ||
44 | PCI BAR #2 I/O region: | ||
45 | |||
46 | writew 0x0100 0x0c00 # RCTL = E1000_RCTL_DTYP_MASK | ||
47 | writeb 0x2170 0x00 # PSRCTL = 0 | ||
48 | |||
49 | 2813 static void | ||
50 | 2814 e1000e_set_psrctl(E1000ECore *core, int index, uint32_t val) | ||
51 | 2815 { | ||
52 | 2816 if (core->mac[RCTL] & E1000_RCTL_DTYP_MASK) { | ||
53 | 2817 | ||
54 | 2818 if ((val & E1000_PSRCTL_BSIZE0_MASK) == 0) { | ||
55 | 2819 hw_error("e1000e: PSRCTL.BSIZE0 cannot be zero"); | ||
56 | 2820 } | ||
57 | |||
58 | Instead of calling hw_error(), which aborts the process (it is | ||
59 | meant for fatal CPU error conditions, not for device logging), | ||
60 | log the invalid request with qemu_log_mask(LOG_GUEST_ERROR) | ||
61 | and return, ignoring the request. | ||
62 | |||
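The general shape of the fix, sketched with QEMU's logging API; the register plumbing below is illustrative, not the e1000e code:

    /* requires "qemu/log.h"; validate a guest-programmed value by
     * logging and ignoring it, never by aborting the emulator */
    static void set_field_checked(uint32_t *reg, uint32_t val,
                                  uint32_t nonzero_mask)
    {
        if ((val & nonzero_mask) == 0) {
            qemu_log_mask(LOG_GUEST_ERROR, "field cannot be zero\n");
            return; /* bad guest input is not a host-fatal condition */
        }
        *reg = val;
    }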
63 | Cc: qemu-stable@nongnu.org | ||
64 | Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
65 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 8 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
66 | --- | 9 | --- |
67 | hw/net/e1000e_core.c | 10 +++++++--- | 10 | hw/virtio/vhost-vdpa.c | 3 ++- |
68 | 1 file changed, 7 insertions(+), 3 deletions(-) | 11 | 1 file changed, 2 insertions(+), 1 deletion(-) |
69 | 12 | ||
70 | diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c | 13 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
71 | index XXXXXXX..XXXXXXX 100644 | 14 | index XXXXXXX..XXXXXXX 100644 |
72 | --- a/hw/net/e1000e_core.c | 15 | --- a/hw/virtio/vhost-vdpa.c |
73 | +++ b/hw/net/e1000e_core.c | 16 | +++ b/hw/virtio/vhost-vdpa.c |
74 | @@ -XXX,XX +XXX,XX @@ | 17 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) |
75 | */ | 18 | static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, |
76 | 19 | struct vhost_log *log) | |
77 | #include "qemu/osdep.h" | 20 | { |
78 | +#include "qemu/log.h" | 21 | - if (vhost_vdpa_one_time_request(dev)) { |
79 | #include "net/net.h" | 22 | + struct vhost_vdpa *v = dev->opaque; |
80 | #include "net/tap.h" | 23 | + if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) { |
81 | -#include "hw/hw.h" | 24 | return 0; |
82 | #include "hw/pci/msi.h" | ||
83 | #include "hw/pci/msix.h" | ||
84 | #include "sysemu/runstate.h" | ||
85 | @@ -XXX,XX +XXX,XX @@ e1000e_set_psrctl(E1000ECore *core, int index, uint32_t val) | ||
86 | if (core->mac[RCTL] & E1000_RCTL_DTYP_MASK) { | ||
87 | |||
88 | if ((val & E1000_PSRCTL_BSIZE0_MASK) == 0) { | ||
89 | - hw_error("e1000e: PSRCTL.BSIZE0 cannot be zero"); | ||
90 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
91 | + "e1000e: PSRCTL.BSIZE0 cannot be zero"); | ||
92 | + return; | ||
93 | } | ||
94 | |||
95 | if ((val & E1000_PSRCTL_BSIZE1_MASK) == 0) { | ||
96 | - hw_error("e1000e: PSRCTL.BSIZE1 cannot be zero"); | ||
97 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
98 | + "e1000e: PSRCTL.BSIZE1 cannot be zero"); | ||
99 | + return; | ||
100 | } | ||
101 | } | 25 | } |
102 | 26 | ||
103 | -- | 27 | -- |
104 | 2.5.0 | 28 | 2.7.4 |
105 | 29 | ||
106 | 30 | diff view generated by jsdifflib |
1 | From: Lukas Straub <lukasstraub2@web.de> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | qemu_bh_new will set the bh to be executed in the main | 3 | SVQ is able to log the dirty bits by itself, so let's use it to not |
4 | loop. This causes crashes as colo_compare_handle_event assumes | 4 | block migration. |
5 | that it has exclusive access to the queues, which are also | 5 | |
6 | concurrently accessed in the iothread. | ||
7 | 5 | ||
8 | Create the bh with the AioContext of the iothread to fulfill | 6 | Also, ignore set and clear of VHOST_F_LOG_ALL on set_features if SVQ is |
9 | these assumptions and fix the crashes. This is safe, because | 7 | enabled. Even if the device supports it, the reports would be nonsense |
10 | the bh already takes the appropriate locks. | 8 | because SVQ memory is in the qemu region. |
11 | 9 | ||
12 | Signed-off-by: Lukas Straub <lukasstraub2@web.de> | 10 | The log region is still allocated. Future changes might skip that, but |
13 | Reviewed-by: Zhang Chen <chen.zhang@intel.com> | 11 | this series is already long enough. |
14 | Reviewed-by: Derek Su <dereksu@qnap.com> | 12 | |
15 | Tested-by: Derek Su <dereksu@qnap.com> | 13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
16 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | 14 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
17 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
18 | --- | 16 | --- |
19 | net/colo-compare.c | 3 ++- | 17 | hw/virtio/vhost-vdpa.c | 39 +++++++++++++++++++++++++++++++++++---- |
20 | 1 file changed, 2 insertions(+), 1 deletion(-) | 18 | include/hw/virtio/vhost-vdpa.h | 1 + |
19 | 2 files changed, 36 insertions(+), 4 deletions(-) | ||
21 | 20 | ||
22 | diff --git a/net/colo-compare.c b/net/colo-compare.c | 21 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
23 | index XXXXXXX..XXXXXXX 100644 | 22 | index XXXXXXX..XXXXXXX 100644 |
24 | --- a/net/colo-compare.c | 23 | --- a/hw/virtio/vhost-vdpa.c |
25 | +++ b/net/colo-compare.c | 24 | +++ b/hw/virtio/vhost-vdpa.c |
26 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_handle_event(void *opaque) | 25 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) |
27 | 26 | return v->index != 0; | |
28 | static void colo_compare_iothread(CompareState *s) | 27 | } |
28 | |||
29 | +static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, | ||
30 | + uint64_t *features) | ||
31 | +{ | ||
32 | + int ret; | ||
33 | + | ||
34 | + ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); | ||
35 | + trace_vhost_vdpa_get_features(dev, *features); | ||
36 | + return ret; | ||
37 | +} | ||
38 | + | ||
39 | static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
40 | Error **errp) | ||
29 | { | 41 | { |
30 | + AioContext *ctx = iothread_get_aio_context(s->iothread); | 42 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, |
31 | object_ref(OBJECT(s->iothread)); | 43 | return 0; |
32 | s->worker_context = iothread_get_g_main_context(s->iothread); | ||
33 | |||
34 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_iothread(CompareState *s) | ||
35 | } | 44 | } |
36 | 45 | ||
37 | colo_compare_timer_init(s); | 46 | - r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features); |
38 | - s->event_bh = qemu_bh_new(colo_compare_handle_event, s); | 47 | + r = vhost_vdpa_get_dev_features(hdev, &dev_features); |
39 | + s->event_bh = aio_bh_new(ctx, colo_compare_handle_event, s); | 48 | if (r != 0) { |
49 | error_setg_errno(errp, -r, "Can't get vdpa device features"); | ||
50 | return r; | ||
51 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_mem_table(struct vhost_dev *dev, | ||
52 | static int vhost_vdpa_set_features(struct vhost_dev *dev, | ||
53 | uint64_t features) | ||
54 | { | ||
55 | + struct vhost_vdpa *v = dev->opaque; | ||
56 | int ret; | ||
57 | |||
58 | if (vhost_vdpa_one_time_request(dev)) { | ||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | + if (v->shadow_vqs_enabled) { | ||
63 | + if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) { | ||
64 | + /* | ||
65 | + * QEMU is just trying to enable or disable logging. SVQ handles | ||
66 | + * this separately, so no need to forward this. | ||
67 | + */ | ||
68 | + v->acked_features = features; | ||
69 | + return 0; | ||
70 | + } | ||
71 | + | ||
72 | + v->acked_features = features; | ||
73 | + | ||
74 | + /* We must not ack _F_LOG if SVQ is enabled */ | ||
75 | + features &= ~BIT_ULL(VHOST_F_LOG_ALL); | ||
76 | + } | ||
77 | + | ||
78 | trace_vhost_vdpa_set_features(dev, features); | ||
79 | ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features); | ||
80 | if (ret) { | ||
81 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | ||
82 | static int vhost_vdpa_get_features(struct vhost_dev *dev, | ||
83 | uint64_t *features) | ||
84 | { | ||
85 | - int ret; | ||
86 | + struct vhost_vdpa *v = dev->opaque; | ||
87 | + int ret = vhost_vdpa_get_dev_features(dev, features); | ||
88 | + | ||
89 | + if (ret == 0 && v->shadow_vqs_enabled) { | ||
90 | + /* Add SVQ logging capabilities */ | ||
91 | + *features |= BIT_ULL(VHOST_F_LOG_ALL); | ||
92 | + } | ||
93 | |||
94 | - ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); | ||
95 | - trace_vhost_vdpa_get_features(dev, *features); | ||
96 | return ret; | ||
40 | } | 97 | } |
41 | 98 | ||
42 | static char *compare_get_pri_indev(Object *obj, Error **errp) | 99 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h |
100 | index XXXXXXX..XXXXXXX 100644 | ||
101 | --- a/include/hw/virtio/vhost-vdpa.h | ||
102 | +++ b/include/hw/virtio/vhost-vdpa.h | ||
103 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | ||
104 | bool iotlb_batch_begin_sent; | ||
105 | MemoryListener listener; | ||
106 | struct vhost_vdpa_iova_range iova_range; | ||
107 | + uint64_t acked_features; | ||
108 | bool shadow_vqs_enabled; | ||
109 | /* IOVA mapping used by the Shadow Virtqueue */ | ||
110 | VhostIOVATree *iova_tree; | ||
43 | -- | 111 | -- |
44 | 2.5.0 | 112 | 2.7.4 |
45 | 113 | ||
46 | 114 | diff view generated by jsdifflib |
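A worked check of the VHOST_F_LOG_ALL toggle test in the set_features hunk above: XOR of the previously acked feature set and the new one isolates exactly the bits that changed (bit number 26 per the vhost UAPI):

    #include <assert.h>
    #include <stdint.h>

    #define VHOST_F_LOG_ALL 26
    #define BIT_ULL(n) (1ULL << (n))

    int main(void)
    {
        uint64_t acked = (1ULL << 32) | 1ULL; /* arbitrary example set */
        uint64_t features = acked ^ BIT_ULL(VHOST_F_LOG_ALL);
        /* only the log bit differs, so the fast path applies */
        assert((acked ^ features) == BIT_ULL(VHOST_F_LOG_ALL));
        return 0;
    }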
Deleted patch | |||
---|---|---|---|
1 | From: Lukas Straub <lukasstraub2@web.de> | ||
2 | 1 | ||
3 | To be able to convert compare_chr_send to a coroutine in the | ||
4 | next commit, use qemu_co_sleep_ns when running in a coroutine. | ||
5 | |||
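The pattern at a glance, using the same QEMU APIs as the diff below (the wrapper name is illustrative): blocking with g_usleep() inside a coroutine would stall the whole event-loop thread, so yield instead.

    /* requires "qemu/coroutine.h" and glib */
    static void sleep_100us_coroutine_aware(void)
    {
        if (qemu_in_coroutine()) {
            qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000); /* yields */
        } else {
            g_usleep(100); /* no coroutine to starve */
        }
    }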
6 | Signed-off-by: Lukas Straub <lukasstraub2@web.de> | ||
7 | Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com> | ||
8 | Reviewed-by: Zhang Chen <chen.zhang@intel.com> | ||
9 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
10 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
11 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
12 | --- | ||
13 | chardev/char.c | 7 ++++++- | ||
14 | 1 file changed, 6 insertions(+), 1 deletion(-) | ||
15 | |||
16 | diff --git a/chardev/char.c b/chardev/char.c | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/chardev/char.c | ||
19 | +++ b/chardev/char.c | ||
20 | @@ -XXX,XX +XXX,XX @@ | ||
21 | #include "qemu/module.h" | ||
22 | #include "qemu/option.h" | ||
23 | #include "qemu/id.h" | ||
24 | +#include "qemu/coroutine.h" | ||
25 | |||
26 | #include "chardev/char-mux.h" | ||
27 | |||
28 | @@ -XXX,XX +XXX,XX @@ static int qemu_chr_write_buffer(Chardev *s, | ||
29 | retry: | ||
30 | res = cc->chr_write(s, buf + *offset, len - *offset); | ||
31 | if (res < 0 && errno == EAGAIN && write_all) { | ||
32 | - g_usleep(100); | ||
33 | + if (qemu_in_coroutine()) { | ||
34 | + qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000); | ||
35 | + } else { | ||
36 | + g_usleep(100); | ||
37 | + } | ||
38 | goto retry; | ||
39 | } | ||
40 | |||
41 | -- | ||
42 | 2.5.0 | ||
43 | |||
44 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Lukas Straub <lukasstraub2@web.de> | ||
2 | 1 | ||
3 | Otherwise the log will be flooded if there is a lot of network | ||
4 | traffic. | ||
5 | |||
6 | Signed-off-by: Lukas Straub <lukasstraub2@web.de> | ||
7 | Reviewed-by: Zhang Chen <chen.zhang@intel.com> | ||
8 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
9 | Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
10 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
11 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
12 | --- | ||
13 | net/colo-compare.c | 10 ++++++---- | ||
14 | 1 file changed, 6 insertions(+), 4 deletions(-) | ||
15 | |||
16 | diff --git a/net/colo-compare.c b/net/colo-compare.c | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/net/colo-compare.c | ||
19 | +++ b/net/colo-compare.c | ||
20 | @@ -XXX,XX +XXX,XX @@ sec: | ||
21 | g_queue_push_head(&conn->primary_list, ppkt); | ||
22 | g_queue_push_head(&conn->secondary_list, spkt); | ||
23 | |||
24 | - qemu_hexdump((char *)ppkt->data, stderr, | ||
25 | - "colo-compare ppkt", ppkt->size); | ||
26 | - qemu_hexdump((char *)spkt->data, stderr, | ||
27 | - "colo-compare spkt", spkt->size); | ||
28 | + if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) { | ||
29 | + qemu_hexdump((char *)ppkt->data, stderr, | ||
30 | + "colo-compare ppkt", ppkt->size); | ||
31 | + qemu_hexdump((char *)spkt->data, stderr, | ||
32 | + "colo-compare spkt", spkt->size); | ||
33 | + } | ||
34 | |||
35 | colo_compare_inconsistency_notify(s); | ||
36 | } | ||
37 | -- | ||
38 | 2.5.0 | ||
39 | |||
40 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Thomas Huth <thuth@redhat.com> | ||
2 | 1 | ||
3 | The 'name' parameter has been deprecated since QEMU v3.1, so it is | ||
4 | time to finally remove it. The "id" parameter can simply be used instead. | ||
5 | |||
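For example, a command line that was written as '-net nic,name=n1' under the old synonym is now spelled '-net nic,id=n1'.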
6 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
7 | Signed-off-by: Thomas Huth <thuth@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
9 | --- | ||
10 | docs/system/deprecated.rst | 15 +++++++++------ | ||
11 | net/net.c | 10 +--------- | ||
12 | qapi/net.json | 3 --- | ||
13 | 3 files changed, 10 insertions(+), 18 deletions(-) | ||
14 | |||
15 | diff --git a/docs/system/deprecated.rst b/docs/system/deprecated.rst | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/docs/system/deprecated.rst | ||
18 | +++ b/docs/system/deprecated.rst | ||
19 | @@ -XXX,XX +XXX,XX @@ The 'file' driver for drives is no longer appropriate for character or host | ||
20 | devices and will only accept regular files (S_IFREG). The correct driver | ||
21 | for these file types is 'host_cdrom' or 'host_device' as appropriate. | ||
22 | |||
23 | -``-net ...,name=``\ *name* (since 3.1) | ||
24 | -'''''''''''''''''''''''''''''''''''''' | ||
25 | - | ||
26 | -The ``name`` parameter of the ``-net`` option is a synonym | ||
27 | -for the ``id`` parameter, which should now be used instead. | ||
28 | - | ||
29 | ``-smp`` (invalid topologies) (since 3.1) | ||
30 | ''''''''''''''''''''''''''''''''''''''''' | ||
31 | |||
32 | @@ -XXX,XX +XXX,XX @@ What follows is a record of recently removed, formerly deprecated | ||
33 | features that serves as a record for users who have encountered | ||
34 | trouble after a recent upgrade. | ||
35 | |||
36 | +System emulator command line arguments | ||
37 | +-------------------------------------- | ||
38 | + | ||
39 | +``-net ...,name=``\ *name* (removed in 5.1) | ||
40 | +''''''''''''''''''''''''''''''''''''''''''' | ||
41 | + | ||
42 | +The ``name`` parameter of the ``-net`` option was a synonym | ||
43 | +for the ``id`` parameter, which should now be used instead. | ||
44 | + | ||
45 | QEMU Machine Protocol (QMP) commands | ||
46 | ------------------------------------ | ||
47 | |||
48 | diff --git a/net/net.c b/net/net.c | ||
49 | index XXXXXXX..XXXXXXX 100644 | ||
50 | --- a/net/net.c | ||
51 | +++ b/net/net.c | ||
52 | @@ -XXX,XX +XXX,XX @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp) | ||
53 | { | ||
54 | Netdev legacy = {0}; | ||
55 | const Netdev *netdev; | ||
56 | - const char *name; | ||
57 | NetClientState *peer = NULL; | ||
58 | |||
59 | if (is_netdev) { | ||
60 | netdev = object; | ||
61 | - name = netdev->id; | ||
62 | |||
63 | if (netdev->type == NET_CLIENT_DRIVER_NIC || | ||
64 | !net_client_init_fun[netdev->type]) { | ||
65 | @@ -XXX,XX +XXX,XX @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp) | ||
66 | const NetLegacyOptions *opts = net->opts; | ||
67 | legacy.id = net->id; | ||
68 | netdev = &legacy; | ||
69 | - /* missing optional values have been initialized to "all bits zero" */ | ||
70 | - name = net->has_id ? net->id : net->name; | ||
71 | - | ||
72 | - if (net->has_name) { | ||
73 | - warn_report("The 'name' parameter is deprecated, use 'id' instead"); | ||
74 | - } | ||
75 | |||
76 | /* Map the old options to the new flat type */ | ||
77 | switch (opts->type) { | ||
78 | @@ -XXX,XX +XXX,XX @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp) | ||
79 | } | ||
80 | } | ||
81 | |||
82 | - if (net_client_init_fun[netdev->type](netdev, name, peer, errp) < 0) { | ||
83 | + if (net_client_init_fun[netdev->type](netdev, netdev->id, peer, errp) < 0) { | ||
84 | /* FIXME drop when all init functions store an Error */ | ||
85 | if (errp && !*errp) { | ||
86 | error_setg(errp, QERR_DEVICE_INIT_FAILED, | ||
87 | diff --git a/qapi/net.json b/qapi/net.json | ||
88 | index XXXXXXX..XXXXXXX 100644 | ||
89 | --- a/qapi/net.json | ||
90 | +++ b/qapi/net.json | ||
91 | @@ -XXX,XX +XXX,XX @@ | ||
92 | # | ||
93 | # @id: identifier for monitor commands | ||
94 | # | ||
95 | -# @name: identifier for monitor commands, ignored if @id is present | ||
96 | -# | ||
97 | # @opts: device type specific properties (legacy) | ||
98 | # | ||
99 | # Since: 1.2 | ||
100 | @@ -XXX,XX +XXX,XX @@ | ||
101 | { 'struct': 'NetLegacy', | ||
102 | 'data': { | ||
103 | '*id': 'str', | ||
104 | - '*name': 'str', | ||
105 | 'opts': 'NetLegacyOptions' } } | ||
106 | |||
107 | ## | ||
108 | -- | ||
109 | 2.5.0 | ||
110 | |||
111 | diff view generated by jsdifflib |