Series comparison

-[PULL 00/23] Net patches
+[PULL V3 00/15] Net patches
-The following changes since commit e0175b71638cf4398903c0d25f93fe62e0606389:
+The following changes since commit 352998df1c53b366413690d95b35f76d0721ebed:
-  Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20200228' into staging (2020-02-28 16:39:27 +0000)
+  Merge tag 'i2c-20220314' of https://github.com/philmd/qemu into staging (2022-03-14 14:39:33 +0000)
 are available in the git repository at:
   https://github.com/jasowang/qemu.git tags/net-pull-request
-for you to fetch changes up to 41aa2e3f9b27fd259a13711545d933a20f1d2f16:
+for you to fetch changes up to 12a195fa343aae2ead1301ce04727bd0ae25eb15:
-  l2tpv3: fix RFC number typo in qemu-options.hx (2020-03-02 15:30:08 +0800)
+  vdpa: Expose VHOST_F_LOG_ALL on SVQ (2022-03-15 13:57:44 +0800)
 ----------------------------------------------------------------
+Changes since V2:
+- fix 32bit build errors
 ----------------------------------------------------------------
-Bin Meng (1):
+Eugenio Pérez (14):
-      hw: net: cadence_gem: Fix build errors in DB_PRINT()
+      vhost: Add VhostShadowVirtqueue
       vhost: Add Shadow VirtQueue kick forwarding capabilities
       vhost: Add Shadow VirtQueue call forwarding capabilities
       vhost: Add vhost_svq_valid_features to shadow vq
       virtio: Add vhost_svq_get_vring_addr
       vdpa: adapt vhost_ops callbacks to svq
       vhost: Shadow virtqueue buffers forwarding
       util: Add iova_tree_alloc_map
       util: add iova_tree_find_iova
       vhost: Add VhostIOVATree
       vdpa: Add custom IOTLB translations to SVQ
       vdpa: Adapt vhost_vdpa_get_vring_base to SVQ
       vdpa: Never set log_base addr if SVQ is enabled
       vdpa: Expose VHOST_F_LOG_ALL on SVQ
-Finn Thain (14):
+Jason Wang (1):
-      dp8393x: Mask EOL bit from descriptor addresses
+      virtio-net: fix map leaking on error during receive
       dp8393x: Always use 32-bit accesses
       dp8393x: Clean up endianness hacks
       dp8393x: Have dp8393x_receive() return the packet size
       dp8393x: Update LLFA and CRDA registers from rx descriptor
       dp8393x: Clear RRRA command register bit only when appropriate
       dp8393x: Implement packet size limit and RBAE interrupt
       dp8393x: Don't clobber packet checksum
       dp8393x: Use long-word-aligned RRA pointers in 32-bit mode
       dp8393x: Pad frames to word or long word boundary
       dp8393x: Clear descriptor in_use field to release packet
       dp8393x: Always update RRA pointers and sequence numbers
       dp8393x: Don't reset Silicon Revision register
       dp8393x: Don't stop reception upon RBE interrupt assertion
-Lukas Straub (4):
+ hw/net/virtio-net.c                |   1 +
-      block/replication.c: Ignore requests after failover
+ hw/virtio/meson.build              |   2 +-
-      tests/test-replication.c: Add test for for secondary node continuing replication
+ hw/virtio/vhost-iova-tree.c        | 110 +++++++
-      net/filter.c: Add Options to insert filters anywhere in the filter list
+ hw/virtio/vhost-iova-tree.h        |  27 ++
-      colo: Update Documentation for continuous replication
+ hw/virtio/vhost-shadow-virtqueue.c | 636 +++++++++++++++++++++++++++++++++++++
+ hw/virtio/vhost-shadow-virtqueue.h |  87 +++++
-Stefan Hajnoczi (1):
+ hw/virtio/vhost-vdpa.c             | 522 +++++++++++++++++++++++++++++-
-      l2tpv3: fix RFC number typo in qemu-options.hx
+ include/hw/virtio/vhost-vdpa.h     |   8 +
+ include/qemu/iova-tree.h           |  38 ++-
-Yuri Benditovich (3):
+ util/iova-tree.c                   | 170 ++++++++++
-      e1000e: Avoid hw_error if legacy mode used
+files changed, 1584 insertions(+), 17 deletions(-)
-      NetRxPkt: Introduce support for additional hash types
+ create mode 100644 hw/virtio/vhost-iova-tree.c
-      NetRxPkt: fix hash calculation of IPV6 TCP
+ create mode 100644 hw/virtio/vhost-iova-tree.h
+ create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
- block/replication.c        |  35 ++++++-
+ create mode 100644 hw/virtio/vhost-shadow-virtqueue.h
  docs/COLO-FT.txt           | 224 +++++++++++++++++++++++++++++++++------------
  docs/block-replication.txt |  28 ++++--
  hw/net/cadence_gem.c       |  11 ++-
  hw/net/dp8393x.c           | 200 ++++++++++++++++++++++++++--------------
  hw/net/e1000e_core.c       |  15 +--
  hw/net/net_rx_pkt.c        |  44 ++++++++-
  hw/net/net_rx_pkt.h        |   6 +-
  hw/net/trace-events        |   4 +
  include/net/filter.h       |   2 +
  net/filter.c               |  92 ++++++++++++++++++-
  qemu-options.hx            |  35 +++++--
  tests/test-replication.c   |  52 +++++++++++
 files changed, 591 insertions(+), 157 deletions(-)

-[PULL 01/23] dp8393x: Mask EOL bit from descriptor addresses
+Deleted patch
-From: Finn Thain <fthain@telegraphics.com.au>
-The Least Significant bit of a descriptor address register is used as
-an EOL flag. It has to be masked when the register value is to be used
-as an actual address for copying memory around. But when the registers
-are to be updated the EOL bit should not be masked.
-Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
-Tested-by: Laurent Vivier <laurent@vivier.eu>
----
- hw/net/dp8393x.c | 17 +++++++++++------
-file changed, 11 insertions(+), 6 deletions(-)
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
-index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
-+++ b/hw/net/dp8393x.c
-@@ -XXX,XX +XXX,XX @@ do { printf("sonic ERROR: %s: " fmt, __func__ , ## __VA_ARGS__); } while (0)
- #define SONIC_ISR_PINT   0x0800
- #define SONIC_ISR_LCD    0x1000
-+#define SONIC_DESC_EOL   0x0001
-+#define SONIC_DESC_ADDR  0xFFFE
-+
- #define TYPE_DP8393X "dp8393x"
- #define DP8393X(obj) OBJECT_CHECK(dp8393xState, (obj), TYPE_DP8393X)
-@@ -XXX,XX +XXX,XX @@ static uint32_t dp8393x_crba(dp8393xState *s)
- static uint32_t dp8393x_crda(dp8393xState *s)
- {
--    return (s->regs[SONIC_URDA] << 16) | s->regs[SONIC_CRDA];
-+    return (s->regs[SONIC_URDA] << 16) |
-+           (s->regs[SONIC_CRDA] & SONIC_DESC_ADDR);
- }
- static uint32_t dp8393x_rbwc(dp8393xState *s)
-@@ -XXX,XX +XXX,XX @@ static uint32_t dp8393x_tsa(dp8393xState *s)
- static uint32_t dp8393x_ttda(dp8393xState *s)
- {
--    return (s->regs[SONIC_UTDA] << 16) | s->regs[SONIC_TTDA];
-+    return (s->regs[SONIC_UTDA] << 16) |
-+           (s->regs[SONIC_TTDA] & SONIC_DESC_ADDR);
- }
- static uint32_t dp8393x_wt(dp8393xState *s)
-@@ -XXX,XX +XXX,XX @@ static void dp8393x_do_transmit_packets(dp8393xState *s)
-                                MEMTXATTRS_UNSPECIFIED, s->data,
-                                size);
-             s->regs[SONIC_CTDA] = dp8393x_get(s, width, 0) & ~0x1;
--            if (dp8393x_get(s, width, 0) & 0x1) {
-+            if (dp8393x_get(s, width, 0) & SONIC_DESC_EOL) {
-                 /* EOL detected */
-                 break;
-             }
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
-     /* XXX: Check byte ordering */
-     /* Check for EOL */
--    if (s->regs[SONIC_LLFA] & 0x1) {
-+    if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) {
-         /* Are we still in resource exhaustion? */
-         size = sizeof(uint16_t) * 1 * width;
-         address = dp8393x_crda(s) + sizeof(uint16_t) * 5 * width;
-         address_space_read(&s->as, address, MEMTXATTRS_UNSPECIFIED,
-                            s->data, size);
--        if (dp8393x_get(s, width, 0) & 0x1) {
-+        if (dp8393x_get(s, width, 0) & SONIC_DESC_EOL) {
-             /* Still EOL ; stop reception */
-             return -1;
-         } else {
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
-                        dp8393x_crda(s) + sizeof(uint16_t) * 5 * width,
-                        MEMTXATTRS_UNSPECIFIED, s->data, size);
-     s->regs[SONIC_LLFA] = dp8393x_get(s, width, 0);
--    if (s->regs[SONIC_LLFA] & 0x1) {
-+    if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) {
-         /* EOL detected */
-         s->regs[SONIC_ISR] |= SONIC_ISR_RDE;
-     } else {
---
-.5.0

-[PULL 15/23] e1000e: Avoid hw_error if legacy mode used
+[PULL V3 01/15] virtio-net: fix map leaking on error during receive
-From: Yuri Benditovich <yuri.benditovich@daynix.com>
+Commit bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
 tries to fix the use after free of the sg by caching the virtqueue
 elements in an array and unmap them at once after receiving the
 packets, but it forgot to unmap the cached elements on error which
 will lead to leaking of mapping and other unexpected results.
-https://bugzilla.redhat.com/show_bug.cgi?id=1787142
+Fixing this by detaching the cached elements on error. This addresses
-The emulation issues hw_error if PSRCTL register
+CVE-2022-26353.
 is written, for example, with zero value.
 Such configuration does not present any problem when
 DTYP bits of RCTL register define legacy format of
 transfer descriptors. Current commit discards check
 for BSIZE0 and BSIZE1 when legacy mode used.
-Acked-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
+Reported-by: Victor Tom <vv474172261@gmail.com>
-Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
+Cc: qemu-stable@nongnu.org
 Fixes: CVE-2022-26353
 Fixes: bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
 Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hw/net/e1000e_core.c | 13 ++++++++-----
+ hw/net/virtio-net.c | 1 +
-file changed, 8 insertions(+), 5 deletions(-)
+file changed, 1 insertion(+)
-diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
+diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/e1000e_core.c
+--- a/hw/net/virtio-net.c
-+++ b/hw/net/e1000e_core.c
++++ b/hw/net/virtio-net.c
-@@ -XXX,XX +XXX,XX @@ e1000e_set_eitr(E1000ECore *core, int index, uint32_t val)
+@@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
- static void
- e1000e_set_psrctl(E1000ECore *core, int index, uint32_t val)
+ err:
- {
+     for (j = 0; j < i; j++) {
--    if ((val & E1000_PSRCTL_BSIZE0_MASK) == 0) {
++        virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
--        hw_error("e1000e: PSRCTL.BSIZE0 cannot be zero");
+         g_free(elems[j]);
 -    }
 +    if (core->mac[RCTL] & E1000_RCTL_DTYP_MASK) {
 +
 +        if ((val & E1000_PSRCTL_BSIZE0_MASK) == 0) {
 +            hw_error("e1000e: PSRCTL.BSIZE0 cannot be zero");
 +        }
 -    if ((val & E1000_PSRCTL_BSIZE1_MASK) == 0) {
 -        hw_error("e1000e: PSRCTL.BSIZE1 cannot be zero");
 +        if ((val & E1000_PSRCTL_BSIZE1_MASK) == 0) {
 +            hw_error("e1000e: PSRCTL.BSIZE1 cannot be zero");
 +        }
      }
-     core->mac[PSRCTL] = val;
 --
-.5.0
+.7.4

-[PULL 10/23] dp8393x: Pad frames to word or long word boundary
+[PULL V3 02/15] vhost: Add VhostShadowVirtqueue
-From: Finn Thain <fthain@telegraphics.com.au>
+From: Eugenio Pérez <eperezma@redhat.com>
-The existing code has a bug where the Remaining Buffer Word Count (RBWC)
+Vhost shadow virtqueue (SVQ) is an intermediate jump for virtqueue
-is calculated with a truncating division, which gives the wrong result
+notifications and buffers, allowing qemu to track them. While qemu is
-for odd-sized packets.
+forwarding the buffers and virtqueue changes, it is able to commit the
 memory that is being dirtied, the same way regular qemu's VirtIO devices
 do.
-Section 1.4.1 of the datasheet says,
+This commit only exposes basic SVQ allocation and free. Next patches of
 the series add functionality like notifications and buffers forwarding.
-    Once the end of the packet has been reached, the serializer will
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
-    fill out the last word (16-bit mode) or long word (32-bit mode)
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
     if the last byte did not end on a word or long word boundary
     respectively. The fill byte will be 0FFh.
 Implement buffer padding so that buffer limits are correctly enforced.
 Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
 Tested-by: Laurent Vivier <laurent@vivier.eu>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hw/net/dp8393x.c | 39 ++++++++++++++++++++++++++++-----------
+ hw/virtio/meson.build              |  2 +-
-file changed, 28 insertions(+), 11 deletions(-)
+ hw/virtio/vhost-shadow-virtqueue.c | 62 ++++++++++++++++++++++++++++++++++++++
  hw/virtio/vhost-shadow-virtqueue.h | 28 +++++++++++++++++
 files changed, 91 insertions(+), 1 deletion(-)
  create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
  create mode 100644 hw/virtio/vhost-shadow-virtqueue.h
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
+diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
+--- a/hw/virtio/meson.build
-+++ b/hw/net/dp8393x.c
++++ b/hw/virtio/meson.build
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
+@@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
-     dp8393xState *s = qemu_get_nic_opaque(nc);
-     int packet_type;
+ virtio_ss = ss.source_set()
-     uint32_t available, address;
+ virtio_ss.add(files('virtio.c'))
--    int width, rx_len = pkt_size;
+-virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c'))
-+    int width, rx_len, padded_len;
++virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
-     uint32_t checksum;
+ virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
-     int size;
+ virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
+ virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
--    width = (s->regs[SONIC_DCR] & SONIC_DCR_DW) ? 2 : 1;
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
--
+new file mode 100644
-     s->regs[SONIC_RCR] &= ~(SONIC_RCR_PRX | SONIC_RCR_LBK | SONIC_RCR_FAER |
+index XXXXXXX..XXXXXXX
-         SONIC_RCR_CRCR | SONIC_RCR_LPKT | SONIC_RCR_BC | SONIC_RCR_MC);
+--- /dev/null
++++ b/hw/virtio/vhost-shadow-virtqueue.c
--    if (pkt_size + 4 > dp8393x_rbwc(s) * 2) {
+@@ -XXX,XX +XXX,XX @@
-+    rx_len = pkt_size + sizeof(checksum);
++/*
-+    if (s->regs[SONIC_DCR] & SONIC_DCR_DW) {
++ * vhost shadow virtqueue
-+        width = 2;
++ *
-+        padded_len = ((rx_len - 1) | 3) + 1;
++ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
-+    } else {
++ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
-+        width = 1;
++ *
-+        padded_len = ((rx_len - 1) | 1) + 1;
++ * SPDX-License-Identifier: GPL-2.0-or-later
 + */
 +
 +#include "qemu/osdep.h"
 +#include "hw/virtio/vhost-shadow-virtqueue.h"
 +
 +#include "qemu/error-report.h"
 +
 +/**
 + * Creates vhost shadow virtqueue, and instructs the vhost device to use the
 + * shadow methods and file descriptors.
 + *
 + * Returns the new virtqueue or NULL.
 + *
 + * In case of error, reason is reported through error_report.
 + */
 +VhostShadowVirtqueue *vhost_svq_new(void)
 +{
 +    g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
 +    int r;
 +
 +    r = event_notifier_init(&svq->hdev_kick, 0);
 +    if (r != 0) {
 +        error_report("Couldn't create kick event notifier: %s (%d)",
 +                     g_strerror(errno), errno);
 +        goto err_init_hdev_kick;
 +    }
 +
-+    if (padded_len > dp8393x_rbwc(s) * 2) {
++    r = event_notifier_init(&svq->hdev_call, 0);
-         DPRINTF("oversize packet, pkt_size is %d\n", pkt_size);
++    if (r != 0) {
-         s->regs[SONIC_ISR] |= SONIC_ISR_RBAE;
++        error_report("Couldn't create call event notifier: %s (%d)",
-         dp8393x_update_irq(s);
++                     g_strerror(errno), errno);
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
++        goto err_init_hdev_call;
      s->regs[SONIC_TRBA0] = s->regs[SONIC_CRBA0];
      /* Calculate the ethernet checksum */
 -    checksum = cpu_to_le32(crc32(0, buf, rx_len));
 +    checksum = cpu_to_le32(crc32(0, buf, pkt_size));
      /* Put packet into RBA */
      DPRINTF("Receive packet at %08x\n", dp8393x_crba(s));
      address = dp8393x_crba(s);
      address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED,
 -                        buf, rx_len);
 -    address += rx_len;
 +                        buf, pkt_size);
 +    address += pkt_size;
 +
 +    /* Put frame checksum into RBA */
      address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED,
 -                        &checksum, 4);
 -    address += 4;
 -    rx_len += 4;
 +                        &checksum, sizeof(checksum));
 +    address += sizeof(checksum);
 +
 +    /* Pad short packets to keep pointers aligned */
 +    if (rx_len < padded_len) {
 +        size = padded_len - rx_len;
 +        address_space_rw(&s->as, address, MEMTXATTRS_UNSPECIFIED,
 +            (uint8_t *)"\xFF\xFF\xFF", size, 1);
 +        address += size;
 +    }
 +
-     s->regs[SONIC_CRBA1] = address >> 16;
++    return g_steal_pointer(&svq);
-     s->regs[SONIC_CRBA0] = address & 0xffff;
++
-     available = dp8393x_rbwc(s);
++err_init_hdev_call:
--    available -= rx_len / 2;
++    event_notifier_cleanup(&svq->hdev_kick);
-+    available -= padded_len >> 1;
++
-     s->regs[SONIC_RBWC1] = available >> 16;
++err_init_hdev_kick:
-     s->regs[SONIC_RBWC0] = available & 0xffff;
++    return NULL;
++}
 +
 +/**
 + * Free the resources of the shadow virtqueue.
 + *
 + * @pvq: gpointer to SVQ so it can be used by autofree functions.
 + */
 +void vhost_svq_free(gpointer pvq)
 +{
 +    VhostShadowVirtqueue *vq = pvq;
 +    event_notifier_cleanup(&vq->hdev_kick);
 +    event_notifier_cleanup(&vq->hdev_call);
 +    g_free(vq);
 +}
 diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
 +/*
 + * vhost shadow virtqueue
 + *
 + * SPDX-FileCopyrightText: Red Hat, Inc. 2021
 + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
 + *
 + * SPDX-License-Identifier: GPL-2.0-or-later
 + */
 +
 +#ifndef VHOST_SHADOW_VIRTQUEUE_H
 +#define VHOST_SHADOW_VIRTQUEUE_H
 +
 +#include "qemu/event_notifier.h"
 +
 +/* Shadow virtqueue to relay notifications */
 +typedef struct VhostShadowVirtqueue {
 +    /* Shadow kick notifier, sent to vhost */
 +    EventNotifier hdev_kick;
 +    /* Shadow call notifier, sent to vhost */
 +    EventNotifier hdev_call;
 +} VhostShadowVirtqueue;
 +
 +VhostShadowVirtqueue *vhost_svq_new(void);
 +
 +void vhost_svq_free(gpointer vq);
 +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
 +
 +#endif
 --
-.5.0
+.7.4

-[PULL 19/23] block/replication.c: Ignore requests after failover
+[PULL V3 03/15] vhost: Add Shadow VirtQueue kick forwarding capabilities
-From: Lukas Straub <lukasstraub2@web.de>
+From: Eugenio Pérez <eperezma@redhat.com>
-After failover the Secondary side of replication shouldn't change state, because
+At this stage no buffer forwarding is performed in SVQ mode: QEMU
-it now functions as our primary disk.
+will just forward the guest's kicks to the device.
-In replication_start, replication_do_checkpoint, replication_stop, ignore
+Host memory notifiers regions are left out for simplicity, and they will
-the request if current state is BLOCK_REPLICATION_DONE (successful failover) or
+not be addressed in this series.
-BLOCK_REPLICATION_FAILOVER (failover in progress, i.e. currently merging active
-and hidden images into the base image).
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Lukas Straub <lukasstraub2@web.de>
 Reviewed-by: Zhang Chen <chen.zhang@intel.com>
 Acked-by: Max Reitz <mreitz@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- block/replication.c | 35 ++++++++++++++++++++++++++++++++++-
+ hw/virtio/vhost-shadow-virtqueue.c |  55 ++++++++++++++
-file changed, 34 insertions(+), 1 deletion(-)
+ hw/virtio/vhost-shadow-virtqueue.h |  14 ++++
+ hw/virtio/vhost-vdpa.c             | 144 ++++++++++++++++++++++++++++++++++++-
-diff --git a/block/replication.c b/block/replication.c
+ include/hw/virtio/vhost-vdpa.h     |   4 ++
 files changed, 215 insertions(+), 2 deletions(-)
 diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
 index XXXXXXX..XXXXXXX 100644
---- a/block/replication.c
+--- a/hw/virtio/vhost-shadow-virtqueue.c
-+++ b/block/replication.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
-@@ -XXX,XX +XXX,XX @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
+@@ -XXX,XX +XXX,XX @@
-     aio_context_acquire(aio_context);
+ #include "hw/virtio/vhost-shadow-virtqueue.h"
-     s = bs->opaque;
+ #include "qemu/error-report.h"
-+    if (s->stage == BLOCK_REPLICATION_DONE ||
++#include "qemu/main-loop.h"
-+        s->stage == BLOCK_REPLICATION_FAILOVER) {
++#include "linux-headers/linux/vhost.h"
-+        /*
++
-+         * This case happens when a secondary is promoted to primary.
++/**
-+         * Ignore the request because the secondary side of replication
++ * Forward guest notifications.
-+         * doesn't have to do anything anymore.
++ *
-+         */
++ * @n: guest kick event notifier, the one that guest set to notify svq.
-+        aio_context_release(aio_context);
++ */
 +static void vhost_handle_guest_kick(EventNotifier *n)
 +{
 +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
 +    event_notifier_test_and_clear(n);
 +    event_notifier_set(&svq->hdev_kick);
 +}
 +
 +/**
 + * Set a new file descriptor for the guest to kick the SVQ and notify for avail
 + *
 + * @svq: The svq
 + * @svq_kick_fd: The svq kick fd
 + *
 + * Note that the SVQ will never close the old file descriptor.
 + */
 +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
 +{
 +    EventNotifier *svq_kick = &svq->svq_kick;
 +    bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
 +    bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;
 +
 +    if (poll_stop) {
 +        event_notifier_set_handler(svq_kick, NULL);
 +    }
 +
 +    /*
 +     * event_notifier_set_handler already checks for guest's notifications if
 +     * they arrive at the new file descriptor in the switch, so there is no
 +     * need to explicitly check for them.
 +     */
 +    if (poll_start) {
 +        event_notifier_init_fd(svq_kick, svq_kick_fd);
 +        event_notifier_set(svq_kick);
 +        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick);
 +    }
 +}
 +
 +/**
 + * Stop the shadow virtqueue operation.
 + * @svq: Shadow Virtqueue
 + */
 +void vhost_svq_stop(VhostShadowVirtqueue *svq)
 +{
 +    event_notifier_set_handler(&svq->svq_kick, NULL);
 +}
  /**
   * Creates vhost shadow virtqueue, and instructs the vhost device to use the
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
          goto err_init_hdev_call;
      }
 +    event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
      return g_steal_pointer(&svq);
  err_init_hdev_call:
@@ -XXX,XX +XXX,XX @@ err_init_hdev_kick:
  void vhost_svq_free(gpointer pvq)
  {
      VhostShadowVirtqueue *vq = pvq;
 +    vhost_svq_stop(vq);
      event_notifier_cleanup(&vq->hdev_kick);
      event_notifier_cleanup(&vq->hdev_call);
      g_free(vq);
 diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/virtio/vhost-shadow-virtqueue.h
 +++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
      EventNotifier hdev_kick;
      /* Shadow call notifier, sent to vhost */
      EventNotifier hdev_call;
 +
 +    /*
 +     * Borrowed virtqueue's guest to host notifier. To borrow it in this event
 +     * notifier allows to recover the VhostShadowVirtqueue from the event loop
 +     * easily. If we use the VirtQueue's one, we don't have an easy way to
 +     * retrieve VhostShadowVirtqueue.
 +     *
 +     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
 +     */
 +    EventNotifier svq_kick;
  } VhostShadowVirtqueue;
 +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
 +
 +void vhost_svq_stop(VhostShadowVirtqueue *svq);
 +
  VhostShadowVirtqueue *vhost_svq_new(void);
  void vhost_svq_free(gpointer vq);
 diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/virtio/vhost-vdpa.c
 +++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@
  #include "hw/virtio/vhost.h"
  #include "hw/virtio/vhost-backend.h"
  #include "hw/virtio/virtio-net.h"
 +#include "hw/virtio/vhost-shadow-virtqueue.h"
  #include "hw/virtio/vhost-vdpa.h"
  #include "exec/address-spaces.h"
  #include "qemu/main-loop.h"
  #include "cpu.h"
  #include "trace.h"
  #include "qemu-common.h"
 +#include "qapi/error.h"
  /*
   * Return one past the end of the end of section. Be careful with uint64_t
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
      return v->index != 0;
  }
 +static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
 +                               Error **errp)
 +{
 +    g_autoptr(GPtrArray) shadow_vqs = NULL;
 +
 +    if (!v->shadow_vqs_enabled) {
 +        return 0;
 +    }
 +
 +    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
 +    for (unsigned n = 0; n < hdev->nvqs; ++n) {
 +        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
 +
 +        if (unlikely(!svq)) {
 +            error_setg(errp, "Cannot create svq %u", n);
 +            return -1;
 +        }
 +        g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
 +    }
 +
 +    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
 +    return 0;
 +}
 +
  static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
  {
      struct vhost_vdpa *v;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
      dev->opaque =  opaque ;
      v->listener = vhost_vdpa_memory_listener;
      v->msg_type = VHOST_IOTLB_MSG_V2;
 +    ret = vhost_vdpa_init_svq(dev, v, errp);
 +    if (ret) {
 +        goto err;
 +    }
      vhost_vdpa_get_iova_range(v);
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
                                 VIRTIO_CONFIG_S_DRIVER);
      return 0;
 +
 +err:
 +    ram_block_discard_disable(false);
 +    return ret;
  }
  static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
  static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
  {
 +    struct vhost_vdpa *v = dev->opaque;
      int i;
 +    if (v->shadow_vqs_enabled) {
 +        /* FIXME SVQ is not compatible with host notifiers mr */
 +        return;
 +    }
 +
-     if (s->stage != BLOCK_REPLICATION_NONE) {
+     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
-         error_setg(errp, "Block replication is running or done");
+         if (vhost_vdpa_host_notifier_init(dev, i)) {
-         aio_context_release(aio_context);
+             goto err;
-@@ -XXX,XX +XXX,XX @@ static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
+@@ -XXX,XX +XXX,XX @@ err:
-     aio_context_acquire(aio_context);
+     return;
-     s = bs->opaque;
+ }
-+    if (s->stage == BLOCK_REPLICATION_DONE ||
++static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
-+        s->stage == BLOCK_REPLICATION_FAILOVER) {
++{
-+        /*
++    struct vhost_vdpa *v = dev->opaque;
-+         * This case happens when a secondary was promoted to primary.
++    size_t idx;
-+         * Ignore the request because the secondary side of replication
++
-+         * doesn't have to do anything anymore.
++    if (!v->shadow_vqs) {
 +         */
 +        aio_context_release(aio_context);
 +        return;
 +    }
 +
-     if (s->mode == REPLICATION_MODE_SECONDARY) {
++    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
-         secondary_do_checkpoint(s, errp);
++        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
-     }
++    }
-@@ -XXX,XX +XXX,XX @@ static void replication_get_error(ReplicationState *rs, Error **errp)
++    g_ptr_array_free(v->shadow_vqs, true);
-     aio_context_acquire(aio_context);
++}
-     s = bs->opaque;
++
+ static int vhost_vdpa_cleanup(struct vhost_dev *dev)
--    if (s->stage != BLOCK_REPLICATION_RUNNING) {
+ {
-+    if (s->stage == BLOCK_REPLICATION_NONE) {
+     struct vhost_vdpa *v;
-         error_setg(errp, "Block replication is not running");
+@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_cleanup(struct vhost_dev *dev)
-         aio_context_release(aio_context);
+     trace_vhost_vdpa_cleanup(dev, v);
-         return;
+     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
-@@ -XXX,XX +XXX,XX @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
+     memory_listener_unregister(&v->listener);
-     aio_context_acquire(aio_context);
++    vhost_vdpa_svq_cleanup(dev);
-     s = bs->opaque;
+     dev->opaque = NULL;
-+    if (s->stage == BLOCK_REPLICATION_DONE ||
+     ram_block_discard_disable(false);
-+        s->stage == BLOCK_REPLICATION_FAILOVER) {
+@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
-+        /*
+     return ret;
-+         * This case happens when a secondary was promoted to primary.
+ }
-+         * Ignore the request because the secondary side of replication
-+         * doesn't have to do anything anymore.
++static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
-+         */
++{
-+        aio_context_release(aio_context);
++    if (!v->shadow_vqs_enabled) {
 +        return;
 +    }
 +
-     if (s->stage != BLOCK_REPLICATION_RUNNING) {
++    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
-         error_setg(errp, "Block replication is not running");
++        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
-         aio_context_release(aio_context);
++        vhost_svq_stop(svq);
 +    }
 +}
 +
  static int vhost_vdpa_reset_device(struct vhost_dev *dev)
  {
 +    struct vhost_vdpa *v = dev->opaque;
      int ret;
      uint8_t status = 0;
 +    vhost_vdpa_reset_svq(v);
 +
      ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
      trace_vhost_vdpa_reset_device(dev, status);
      return ret;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
      return ret;
   }
 +static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
 +                                         struct vhost_vring_file *file)
 +{
 +    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
 +    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
 +}
 +
 +/**
 + * Set the shadow virtqueue descriptors to the device
 + *
 + * @dev: The vhost device model
 + * @svq: The shadow virtqueue
 + * @idx: The index of the virtqueue in the vhost device
 + * @errp: Error
 + */
 +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
 +                                 VhostShadowVirtqueue *svq, unsigned idx,
 +                                 Error **errp)
 +{
 +    struct vhost_vring_file file = {
 +        .index = dev->vq_index + idx,
 +    };
 +    const EventNotifier *event_notifier = &svq->hdev_kick;
 +    int r;
 +
 +    file.fd = event_notifier_get_fd(event_notifier);
 +    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
 +    if (unlikely(r != 0)) {
 +        error_setg_errno(errp, -r, "Can't set device kick fd");
 +    }
 +
 +    return r == 0;
 +}
 +
 +static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
 +{
 +    struct vhost_vdpa *v = dev->opaque;
 +    Error *err = NULL;
 +    unsigned i;
 +
 +    if (!v->shadow_vqs) {
 +        return true;
 +    }
 +
 +    for (i = 0; i < v->shadow_vqs->len; ++i) {
 +        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
 +        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
 +        if (unlikely(!ok)) {
 +            error_reportf_err(err, "Cannot setup SVQ %u: ", i);
 +            return false;
 +        }
 +    }
 +
 +    return true;
 +}
 +
  static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
  {
      struct vhost_vdpa *v = dev->opaque;
 +    bool ok;
      trace_vhost_vdpa_dev_start(dev, started);
      if (started) {
          vhost_vdpa_host_notifiers_init(dev);
 +        ok = vhost_vdpa_svqs_start(dev);
 +        if (unlikely(!ok)) {
 +            return -1;
 +        }
          vhost_vdpa_set_vring_ready(dev);
      } else {
          vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
  static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
  {
 -    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
 -    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
 +    struct vhost_vdpa *v = dev->opaque;
 +    int vdpa_idx = file->index - dev->vq_index;
 +
 +    if (v->shadow_vqs_enabled) {
 +        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
 +        vhost_svq_set_svq_kick_fd(svq, file->fd);
 +        return 0;
 +    } else {
 +        return vhost_vdpa_set_vring_dev_kick(dev, file);
 +    }
  }
  static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
 diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/virtio/vhost-vdpa.h
 +++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@
  #ifndef HW_VIRTIO_VHOST_VDPA_H
  #define HW_VIRTIO_VHOST_VDPA_H
 +#include <gmodule.h>
 +
  #include "hw/virtio/virtio.h"
  #include "standard-headers/linux/vhost_types.h"
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
      bool iotlb_batch_begin_sent;
      MemoryListener listener;
      struct vhost_vdpa_iova_range iova_range;
 +    bool shadow_vqs_enabled;
 +    GPtrArray *shadow_vqs;
      struct vhost_dev *dev;
      VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
  } VhostVDPA;
 --
-.5.0
+.7.4

-[PULL 14/23] dp8393x: Don't stop reception upon RBE interrupt assertion
+[PULL V3 04/15] vhost: Add Shadow VirtQueue call forwarding capabilities
-From: Finn Thain <fthain@telegraphics.com.au>
+From: Eugenio Pérez <eperezma@redhat.com>
-Section 3.4.7 of the datasheet explains that,
+This will make qemu aware of the device used buffers, allowing it to
 write the guest memory with its contents if needed.
-    The RBE bit in the Interrupt Status register is set when the
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
-    SONIC finishes using the second to last receive buffer and reads
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
     the last RRA descriptor. Actually, the SONIC is not truly out of
     resources, but gives the system an early warning of an impending
     out of resources condition.
 RBE does not mean actual receive buffer exhaustion, and reception should
 not be stopped. This is important because Linux will not check and clear
 the RBE interrupt until it receives another packet. But that won't
 happen if can_receive returns false. This bug causes the SONIC to become
 deaf (until reset).
 Fix this with a new flag to indicate actual receive buffer exhaustion.
 Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
 Tested-by: Laurent Vivier <laurent@vivier.eu>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hw/net/dp8393x.c | 35 ++++++++++++++++++++++-------------
+ hw/virtio/vhost-shadow-virtqueue.c | 38 ++++++++++++++++++++++++++++++++++++++
-file changed, 22 insertions(+), 13 deletions(-)
+ hw/virtio/vhost-shadow-virtqueue.h |  4 ++++
  hw/virtio/vhost-vdpa.c             | 31 +++++++++++++++++++++++++++++--
 files changed, 71 insertions(+), 2 deletions(-)
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
+--- a/hw/virtio/vhost-shadow-virtqueue.c
-+++ b/hw/net/dp8393x.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
-@@ -XXX,XX +XXX,XX @@ typedef struct dp8393xState {
+@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(EventNotifier *n)
-     /* Hardware */
+ }
-     uint8_t it_shift;
-     bool big_endian;
+ /**
-+    bool last_rba_is_full;
++ * Forward vhost notifications
-     qemu_irq irq;
++ *
- #ifdef DEBUG_SONIC
++ * @n: hdev call event notifier, the one that device set to notify svq.
-     int irq_level;
++ */
-@@ -XXX,XX +XXX,XX @@ static void dp8393x_do_read_rra(dp8393xState *s)
++static void vhost_svq_handle_call(EventNotifier *n)
-         s->regs[SONIC_RRP] = s->regs[SONIC_RSA];
++{
 +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
 +                                             hdev_call);
 +    event_notifier_test_and_clear(n);
 +    event_notifier_set(&svq->svq_call);
 +}
 +
 +/**
 + * Set the call notifier for the SVQ to call the guest
 + *
 + * @svq: Shadow virtqueue
 + * @call_fd: call notifier
 + *
 + * Called on BQL context.
 + */
 +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
 +{
 +    if (call_fd == VHOST_FILE_UNBIND) {
 +        /*
 +         * Make event_notifier_set fail if called while handling a device call.
 +         *
 +         * SVQ still needs device notifications, since it needs to keep
 +         * forwarding used buffers even with the unbind.
 +         */
 +        memset(&svq->svq_call, 0, sizeof(svq->svq_call));
 +    } else {
 +        event_notifier_init_fd(&svq->svq_call, call_fd);
 +    }
 +}
 +
 +/**
   * Set a new file descriptor for the guest to kick the SVQ and notify for avail
   *
   * @svq: The svq
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
      }
--    /* Check resource exhaustion */
+     event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
-+    /* Warn the host if CRBA now has the last available resource */
++    event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
-     if (s->regs[SONIC_RRP] == s->regs[SONIC_RWP])
+     return g_steal_pointer(&svq);
-     {
-         s->regs[SONIC_ISR] |= SONIC_ISR_RBE;
+ err_init_hdev_call:
-         dp8393x_update_irq(s);
+@@ -XXX,XX +XXX,XX @@ void vhost_svq_free(gpointer pvq)
-     }
+     VhostShadowVirtqueue *vq = pvq;
      vhost_svq_stop(vq);
      event_notifier_cleanup(&vq->hdev_kick);
 +    event_notifier_set_handler(&vq->hdev_call, NULL);
      event_notifier_cleanup(&vq->hdev_call);
      g_free(vq);
  }
 diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/virtio/vhost-shadow-virtqueue.h
 +++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
       * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
       */
      EventNotifier svq_kick;
 +
-+    /* Allow packet reception */
++    /* Guest's call notifier, where the SVQ calls guest. */
-+    s->last_rba_is_full = false;
++    EventNotifier svq_call;
  } VhostShadowVirtqueue;
  void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
 +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
  void vhost_svq_stop(VhostShadowVirtqueue *svq);
 diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/virtio/vhost-vdpa.c
 +++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
      return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
  }
- static void dp8393x_do_software_reset(dp8393xState *s)
++static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
-@@ -XXX,XX +XXX,XX @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
++                                         struct vhost_vring_file *file)
-                 dp8393x_do_read_rra(s);
++{
-             }
++    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
-             dp8393x_update_irq(s);
++    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
--            if (dp8393x_can_receive(s->nic->ncs)) {
++}
--                qemu_flush_queued_packets(qemu_get_queue(s->nic));
++
--            }
+ /**
-             break;
+  * Set the shadow virtqueue descriptors to the device
-         /* The guest is required to store aligned pointers here */
+  *
-         case SONIC_RSA:
+@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
-@@ -XXX,XX +XXX,XX @@ static int dp8393x_can_receive(NetClientState *nc)
+  * @svq: The shadow virtqueue
+  * @idx: The index of the virtqueue in the vhost device
-     if (!(s->regs[SONIC_CR] & SONIC_CR_RXEN))
+  * @errp: Error
-         return 0;
++ *
--    if (s->regs[SONIC_ISR] & SONIC_ISR_RBE)
++ * Note that this function does not rewind the kick file descriptor if it
--        return 0;
++ * cannot set the call one.
-     return 1;
+  */
- }
+ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
+                                  VhostShadowVirtqueue *svq, unsigned idx,
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
+@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
-     s->regs[SONIC_RCR] &= ~(SONIC_RCR_PRX | SONIC_RCR_LBK | SONIC_RCR_FAER |
+     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
-         SONIC_RCR_CRCR | SONIC_RCR_LPKT | SONIC_RCR_BC | SONIC_RCR_MC);
+     if (unlikely(r != 0)) {
+         error_setg_errno(errp, -r, "Can't set device kick fd");
-+    if (s->last_rba_is_full) {
++        return false;
 +        return pkt_size;
 +    }
 +
-     rx_len = pkt_size + sizeof(checksum);
++    event_notifier = &svq->hdev_call;
-     if (s->regs[SONIC_DCR] & SONIC_DCR_DW) {
++    file.fd = event_notifier_get_fd(event_notifier);
-         width = 2;
++    r = vhost_vdpa_set_vring_dev_call(dev, &file);
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
++    if (unlikely(r != 0)) {
-         DPRINTF("oversize packet, pkt_size is %d\n", pkt_size);
++        error_setg_errno(errp, -r, "Can't set device call fd");
          s->regs[SONIC_ISR] |= SONIC_ISR_RBAE;
          dp8393x_update_irq(s);
 -        dp8393x_do_read_rra(s);
 -        return pkt_size;
 +        s->regs[SONIC_RCR] |= SONIC_RCR_LPKT;
 +        goto done;
      }
-     packet_type = dp8393x_receive_filter(s, buf, pkt_size);
+     return r == 0;
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
+@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
-         s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX;
+ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
-     }
+                                        struct vhost_vring_file *file)
+ {
-+    dp8393x_update_irq(s);
+-    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
 -    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
 +    struct vhost_vdpa *v = dev->opaque;
 +
-     s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) |
++    if (v->shadow_vqs_enabled) {
-                          ((s->regs[SONIC_RSC] + 1) & 0x00ff);
++        int vdpa_idx = file->index - dev->vq_index;
++        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
 +done:
 +
-     if (s->regs[SONIC_RCR] & SONIC_RCR_LPKT) {
++        vhost_svq_set_svq_call_fd(svq, file->fd);
--        /* Read next RRA */
++        return 0;
--        dp8393x_do_read_rra(s);
++    } else {
-+        if (s->regs[SONIC_RRP] == s->regs[SONIC_RWP]) {
++        return vhost_vdpa_set_vring_dev_call(dev, file);
-+            /* Stop packet reception */
++    }
 +            s->last_rba_is_full = true;
 +        } else {
 +            /* Read next resource */
 +            dp8393x_do_read_rra(s);
 +        }
      }
 -    /* Done */
 -    dp8393x_update_irq(s);
 -
      return pkt_size;
  }
+ static int vhost_vdpa_get_features(struct vhost_dev *dev,
 --
-.5.0
+.7.4

-[PULL 11/23] dp8393x: Clear descriptor in_use field to release packet
+[PULL V3 05/15] vhost: Add vhost_svq_valid_features to shadow vq
-From: Finn Thain <fthain@telegraphics.com.au>
+From: Eugenio Pérez <eperezma@redhat.com>
-When the SONIC receives a packet into the last available descriptor, it
+This allows SVQ to negotiate features with the guest and the device. For
-retains ownership of that descriptor for as long as necessary.
+the device, SVQ is a driver. While this function bypasses all
 non-transport features, it needs to disable the features that SVQ does
 not support when forwarding buffers. This includes packed vq layout,
 indirect descriptors or event idx.
-Section 3.4.7 of the datasheet says,
+Future changes can add support to offer more features to the guest,
 since the use of VirtQueue gives this for free. This is left out at the
 moment for simplicity.
-    When the system appends more descriptors, the SONIC releases ownership
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
-    of the descriptor after writing 0000h to the RXpkt.in_use field.
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
 The packet can now be processed by the host, so raise a PKTRX interrupt,
 just like the normal case.
 Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
 Tested-by: Laurent Vivier <laurent@vivier.eu>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hw/net/dp8393x.c | 10 ++++++++++
+ hw/virtio/vhost-shadow-virtqueue.c | 44 ++++++++++++++++++++++++++++++++++++++
-file changed, 10 insertions(+)
+ hw/virtio/vhost-shadow-virtqueue.h |  2 ++
  hw/virtio/vhost-vdpa.c             | 15 +++++++++++++
 files changed, 61 insertions(+)
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
+--- a/hw/virtio/vhost-shadow-virtqueue.c
-+++ b/hw/net/dp8393x.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
+@@ -XXX,XX +XXX,XX @@
-             return -1;
+ #include "hw/virtio/vhost-shadow-virtqueue.h"
-         }
-         /* Link has been updated by host */
+ #include "qemu/error-report.h"
 +#include "qapi/error.h"
  #include "qemu/main-loop.h"
  #include "linux-headers/linux/vhost.h"
  /**
 + * Validate the transport device features that both guests can use with the SVQ
 + * and SVQs can use with the device.
 + *
 + * @features: The features to validate
 + * @errp: Error pointer
 + */
 +bool vhost_svq_valid_features(uint64_t features, Error **errp)
 +{
 +    bool ok = true;
 +    uint64_t svq_features = features;
 +
-+        /* Clear in_use */
++    for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END;
-+        size = sizeof(uint16_t) * width;
++         ++b) {
-+        address = dp8393x_crda(s) + sizeof(uint16_t) * 6 * width;
++        switch (b) {
-+        dp8393x_put(s, width, 0, 0);
++        case VIRTIO_F_ANY_LAYOUT:
-+        address_space_rw(&s->as, address, MEMTXATTRS_UNSPECIFIED,
++            continue;
 +                         (uint8_t *)s->data, size, 1);
 +
-+        /* Move to next descriptor */
++        case VIRTIO_F_ACCESS_PLATFORM:
-         s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA];
++            /* SVQ trusts the host's IOMMU to translate addresses */
-+        s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX;
++        case VIRTIO_F_VERSION_1:
 +            /* SVQ trusts that the guest vring is little endian */
 +            if (!(svq_features & BIT_ULL(b))) {
 +                svq_features |= BIT_ULL(b);
 +                ok = false;
 +            }
 +            continue;
 +
 +        default:
 +            if (svq_features & BIT_ULL(b)) {
 +                svq_features &= ~BIT_ULL(b);
 +                ok = false;
 +            }
 +        }
 +    }
 +
 +    if (!ok) {
 +        error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64
 +                         ", ok: 0x%"PRIx64, features, svq_features);
 +    }
 +    return ok;
 +}
 +
 +/**
   * Forward guest notifications.
   *
   * @n: guest kick event notifier, the one that guest set to notify svq.
 diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/virtio/vhost-shadow-virtqueue.h
 +++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
      EventNotifier svq_call;
  } VhostShadowVirtqueue;
 +bool vhost_svq_valid_features(uint64_t features, Error **errp);
 +
  void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
  void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
 diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/virtio/vhost-vdpa.c
 +++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                                 Error **errp)
  {
      g_autoptr(GPtrArray) shadow_vqs = NULL;
 +    uint64_t dev_features, svq_features;
 +    int r;
 +    bool ok;
      if (!v->shadow_vqs_enabled) {
          return 0;
      }
-     /* Save current position */
++    r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features);
 +    if (r != 0) {
 +        error_setg_errno(errp, -r, "Can't get vdpa device features");
 +        return r;
 +    }
 +
 +    svq_features = dev_features;
 +    ok = vhost_svq_valid_features(svq_features, errp);
 +    if (unlikely(!ok)) {
 +        return -1;
 +    }
 +
      shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
      for (unsigned n = 0; n < hdev->nvqs; ++n) {
          g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
 --
-.5.0
+.7.4

-[PULL 16/23] NetRxPkt: Introduce support for additional hash types
+[PULL V3 06/15] virtio: Add vhost_svq_get_vring_addr
-From: Yuri Benditovich <yuri.benditovich@daynix.com>
+From: Eugenio Pérez <eperezma@redhat.com>
-Add support for following hash types:
+It reports the shadow virtqueue address from qemu virtual address space.
 IPV6 TCP with extension headers
 IPV4 UDP
 IPV6 UDP
 IPV6 UDP with extension headers
-Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
+Since this will be different from the guest's vaddr, but the device can
-Acked-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
+access it, SVQ takes special care about its alignment & lack of garbage
 data. It assumes that IOMMU will work in host_page_size ranges for that.
 Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
 Acked-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hw/net/net_rx_pkt.c | 42 ++++++++++++++++++++++++++++++++++++++++++
+ hw/virtio/vhost-shadow-virtqueue.c | 29 +++++++++++++++++++++++++++++
- hw/net/net_rx_pkt.h |  6 +++++-
+ hw/virtio/vhost-shadow-virtqueue.h |  9 +++++++++
- hw/net/trace-events |  4 ++++
+files changed, 38 insertions(+)
 files changed, 51 insertions(+), 1 deletion(-)
-diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/net_rx_pkt.c
+--- a/hw/virtio/vhost-shadow-virtqueue.c
-+++ b/hw/net/net_rx_pkt.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
-@@ -XXX,XX +XXX,XX @@ _net_rx_rss_prepare_tcp(uint8_t *rss_input,
+@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
                            &tcphdr->th_dport, sizeof(uint16_t));
  }
-+static inline void
+ /**
-+_net_rx_rss_prepare_udp(uint8_t *rss_input,
++ * Get the shadow vq vring address.
-+                        struct NetRxPkt *pkt,
++ * @svq: Shadow virtqueue
-+                        size_t *bytes_written)
++ * @addr: Destination to store address
 + */
 +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
 +                              struct vhost_vring_addr *addr)
 +{
-+    struct udp_header *udphdr = &pkt->l4hdr_info.hdr.udp;
++    addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc;
-+
++    addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail;
-+    _net_rx_rss_add_chunk(rss_input, bytes_written,
++    addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used;
 +                          &udphdr->uh_sport, sizeof(uint16_t));
 +
 +    _net_rx_rss_add_chunk(rss_input, bytes_written,
 +                          &udphdr->uh_dport, sizeof(uint16_t));
 +}
 +
- uint32_t
++size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
- net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt,
++{
-                          NetRxPktRssType type,
++    size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
-@@ -XXX,XX +XXX,XX @@ net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt,
++    size_t avail_size = offsetof(vring_avail_t, ring) +
-         trace_net_rx_pkt_rss_ip6_ex();
++                                             sizeof(uint16_t) * svq->vring.num;
-         _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
++
-         break;
++    return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size);
-+    case NetPktRssIpV6TcpEx:
++}
-+        assert(pkt->isip6);
++
-+        assert(pkt->istcp);
++size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
-+        trace_net_rx_pkt_rss_ip6_ex_tcp();
++{
-+        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
++    size_t used_size = offsetof(vring_used_t, ring) +
-+        _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length);
++                                    sizeof(vring_used_elem_t) * svq->vring.num;
-+        break;
++    return ROUND_UP(used_size, qemu_real_host_page_size);
-+    case NetPktRssIpV4Udp:
++}
-+        assert(pkt->isip4);
++
-+        assert(pkt->isudp);
++/**
-+        trace_net_rx_pkt_rss_ip4_udp();
+  * Set a new file descriptor for the guest to kick the SVQ and notify for avail
-+        _net_rx_rss_prepare_ip4(&rss_input[0], pkt, &rss_length);
+  *
-+        _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length);
+  * @svq: The svq
-+        break;
+diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
 +    case NetPktRssIpV6Udp:
 +        assert(pkt->isip6);
 +        assert(pkt->isudp);
 +        trace_net_rx_pkt_rss_ip6_udp();
 +        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length);
 +        _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length);
 +        break;
 +    case NetPktRssIpV6UdpEx:
 +        assert(pkt->isip6);
 +        assert(pkt->isudp);
 +        trace_net_rx_pkt_rss_ip6_ex_udp();
 +        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
 +        _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length);
 +        break;
      default:
          assert(false);
          break;
 diff --git a/hw/net/net_rx_pkt.h b/hw/net/net_rx_pkt.h
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/net_rx_pkt.h
+--- a/hw/virtio/vhost-shadow-virtqueue.h
-+++ b/hw/net/net_rx_pkt.h
++++ b/hw/virtio/vhost-shadow-virtqueue.h
-@@ -XXX,XX +XXX,XX @@ typedef enum {
+@@ -XXX,XX +XXX,XX @@
-     NetPktRssIpV4Tcp,
+ #define VHOST_SHADOW_VIRTQUEUE_H
-     NetPktRssIpV6Tcp,
-     NetPktRssIpV6,
+ #include "qemu/event_notifier.h"
--    NetPktRssIpV6Ex
++#include "hw/virtio/virtio.h"
-+    NetPktRssIpV6Ex,
++#include "standard-headers/linux/vhost_types.h"
-+    NetPktRssIpV6TcpEx,
-+    NetPktRssIpV4Udp,
+ /* Shadow virtqueue to relay notifications */
-+    NetPktRssIpV6Udp,
+ typedef struct VhostShadowVirtqueue {
-+    NetPktRssIpV6UdpEx,
++    /* Shadow vring */
- } NetRxPktRssType;
++    struct vring vring;
++
- /**
+     /* Shadow kick notifier, sent to vhost */
-diff --git a/hw/net/trace-events b/hw/net/trace-events
+     EventNotifier hdev_kick;
-index XXXXXXX..XXXXXXX 100644
+     /* Shadow call notifier, sent to vhost */
---- a/hw/net/trace-events
+@@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp);
-+++ b/hw/net/trace-events
-@@ -XXX,XX +XXX,XX @@ net_rx_pkt_l3_csum_validate_csum(size_t l3hdr_off, uint32_t csl, uint32_t cntr,
+ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
+ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
- net_rx_pkt_rss_ip4(void) "Calculating IPv4 RSS  hash"
++void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
- net_rx_pkt_rss_ip4_tcp(void) "Calculating IPv4/TCP RSS  hash"
++                              struct vhost_vring_addr *addr);
-+net_rx_pkt_rss_ip4_udp(void) "Calculating IPv4/UDP RSS  hash"
++size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
- net_rx_pkt_rss_ip6_tcp(void) "Calculating IPv6/TCP RSS  hash"
++size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
-+net_rx_pkt_rss_ip6_udp(void) "Calculating IPv6/UDP RSS  hash"
- net_rx_pkt_rss_ip6(void) "Calculating IPv6 RSS  hash"
+ void vhost_svq_stop(VhostShadowVirtqueue *svq);
  net_rx_pkt_rss_ip6_ex(void) "Calculating IPv6/EX RSS  hash"
 +net_rx_pkt_rss_ip6_ex_tcp(void) "Calculating IPv6/EX/TCP RSS  hash"
 +net_rx_pkt_rss_ip6_ex_udp(void) "Calculating IPv6/EX/UDP RSS  hash"
  net_rx_pkt_rss_hash(size_t rss_length, uint32_t rss_hash) "RSS hash for %zu bytes: 0x%X"
  net_rx_pkt_rss_add_chunk(void* ptr, size_t size, size_t input_offset) "Add RSS chunk %p, %zu bytes, RSS input offset %zu bytes"
 --
-.5.0
+.7.4

-[PULL 04/23] dp8393x: Have dp8393x_receive() return the packet size
+[PULL V3 07/15] vdpa: adapt vhost_ops callbacks to svq
-From: Finn Thain <fthain@telegraphics.com.au>
+From: Eugenio Pérez <eperezma@redhat.com>
-This function re-uses its 'size' argument as a scratch variable.
+First half of the buffers forwarding part, preparing vhost-vdpa
-Instead, declare a local 'size' variable for that purpose so that the
+callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so
-function result doesn't get messed up.
+this is effectively dead code at the moment, but it helps to reduce
 patch size.
-Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
 Tested-by: Laurent Vivier <laurent@vivier.eu>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hw/net/dp8393x.c | 9 +++++----
+ hw/virtio/vhost-vdpa.c | 48 +++++++++++++++++++++++++++++++++++++++++-------
-file changed, 5 insertions(+), 4 deletions(-)
+file changed, 41 insertions(+), 7 deletions(-)
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
+diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
+--- a/hw/virtio/vhost-vdpa.c
-+++ b/hw/net/dp8393x.c
++++ b/hw/virtio/vhost-vdpa.c
-@@ -XXX,XX +XXX,XX @@ static int dp8393x_receive_filter(dp8393xState *s, const uint8_t * buf,
+@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
      return ret;
   }
 +static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
 +                                         struct vhost_vring_state *ring)
 +{
 +    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
 +    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
 +}
 +
  static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                           struct vhost_vring_file *file)
  {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
      return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
  }
- static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
++static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
--                               size_t size)
++                                         struct vhost_vring_addr *addr)
-+                               size_t pkt_size)
++{
 +    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
 +                                addr->desc_user_addr, addr->used_user_addr,
 +                                addr->avail_user_addr,
 +                                addr->log_guest_addr);
 +
 +    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
 +
 +}
 +
  /**
   * Set the shadow virtqueue descriptors to the device
   *
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
  static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                         struct vhost_vring_addr *addr)
  {
-     dp8393xState *s = qemu_get_nic_opaque(nc);
+-    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
-     int packet_type;
+-                                    addr->desc_user_addr, addr->used_user_addr,
-     uint32_t available, address;
+-                                    addr->avail_user_addr,
--    int width, rx_len = size;
+-                                    addr->log_guest_addr);
-+    int width, rx_len = pkt_size;
+-    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
-     uint32_t checksum;
++    struct vhost_vdpa *v = dev->opaque;
-+    int size;
++
++    if (v->shadow_vqs_enabled) {
-     width = (s->regs[SONIC_DCR] & SONIC_DCR_DW) ? 2 : 1;
++        /*
++         * Device vring addr was set at device start. SVQ base is handled by
-     s->regs[SONIC_RCR] &= ~(SONIC_RCR_PRX | SONIC_RCR_LBK | SONIC_RCR_FAER |
++         * VirtQueue code.
-         SONIC_RCR_CRCR | SONIC_RCR_LPKT | SONIC_RCR_BC | SONIC_RCR_MC);
++         */
++        return 0;
--    packet_type = dp8393x_receive_filter(s, buf, size);
++    }
-+    packet_type = dp8393x_receive_filter(s, buf, pkt_size);
++
-     if (packet_type < 0) {
++    return vhost_vdpa_set_vring_dev_addr(dev, addr);
          DPRINTF("packet not for netcard\n");
          return -1;
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
      /* Done */
      dp8393x_update_irq(s);
 -    return size;
 +    return pkt_size;
  }
- static void dp8393x_reset(DeviceState *dev)
+ static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
  static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
  {
 -    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
 -    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
 +    struct vhost_vdpa *v = dev->opaque;
 +
 +    if (v->shadow_vqs_enabled) {
 +        /*
 +         * Device vring base was set at device start. SVQ base is handled by
 +         * VirtQueue code.
 +         */
 +        return 0;
 +    }
 +
 +    return vhost_vdpa_set_dev_vring_base(dev, ring);
  }
  static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
 --
-.5.0
+.7.4

-[PULL 02/23] dp8393x: Always use 32-bit accesses
+[PULL V3 08/15] vhost: Shadow virtqueue buffers forwarding
-From: Finn Thain <fthain@telegraphics.com.au>
+From: Eugenio Pérez <eperezma@redhat.com>
-The DP83932 and DP83934 have 32 data lines. The datasheet says,
+Initial version of shadow virtqueue that actually forwards buffers. There
+is no iommu support at the moment, and that will be addressed in future
-    Data Bus: These bidirectional lines are used to transfer data on the
+patches of this series. Since all vhost-vdpa devices use forced IOMMU,
-    system bus. When the SONIC is a bus master, 16-bit data is transferred
+this means that SVQ is not usable at this point of the series on any
-    on D15-D0 and 32-bit data is transferred on D31-D0. When the SONIC is
+device.
-    accessed as a slave, register data is driven onto lines D15-D0.
-    D31-D16 are held TRI-STATE if SONIC is in 16-bit mode. If SONIC is in
+For simplicity it only supports modern devices that expect the vring
--bit mode, they are driven, but invalid.
+in little endian, with split ring and no event idx or indirect
+descriptors. Support for them will not be added in this series.
-Always use 32-bit accesses both as bus master and bus slave.
+It reuses the VirtQueue code for the device part. The driver part is
-Force the MSW to zero in bus master mode.
+based on Linux's virtio_ring driver, but with stripped functionality
+and optimizations so it's easier to review.
-This gets the Linux 'jazzsonic' driver working, and avoids the need for
-prior hacks to make the NetBSD 'sn' driver work.
+However, forwarding buffers have some particular pieces: One of the most
+unexpected ones is that a guest's buffer can expand through more than
-Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
+one descriptor in SVQ. While this is handled gracefully by qemu's
-Tested-by: Laurent Vivier <laurent@vivier.eu>
+emulated virtio devices, it may cause unexpected SVQ queue full. This
 patch also solves it by checking for this condition at both guest's
 kicks and device's calls. The code may be more elegant in the future if
 SVQ code runs in its own iocontext.
 Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
 Acked-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hw/net/dp8393x.c | 47 +++++++++++++++++++++++++++++------------------
+ hw/virtio/vhost-shadow-virtqueue.c | 352 ++++++++++++++++++++++++++++++++++++-
-file changed, 29 insertions(+), 18 deletions(-)
+ hw/virtio/vhost-shadow-virtqueue.h |  26 +++
+ hw/virtio/vhost-vdpa.c             | 155 +++++++++++++++-
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
+files changed, 522 insertions(+), 11 deletions(-)
 diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
+--- a/hw/virtio/vhost-shadow-virtqueue.c
-+++ b/hw/net/dp8393x.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
-@@ -XXX,XX +XXX,XX @@ static void dp8393x_put(dp8393xState *s, int width, int offset,
+@@ -XXX,XX +XXX,XX @@
-                         uint16_t val)
+ #include "qemu/error-report.h"
  #include "qapi/error.h"
  #include "qemu/main-loop.h"
 +#include "qemu/log.h"
 +#include "qemu/memalign.h"
  #include "linux-headers/linux/vhost.h"
  /**
@@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp)
  }
  /**
 - * Forward guest notifications.
 + * Number of descriptors that the SVQ can make available from the guest.
 + *
 + * @svq: The svq
 + */
 +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
 +{
 +    /*
 +     * Avail ring entries exposed to the device and not yet seen as used.
 +     * Both indexes are free-running 16 bit counters, so the difference is
 +     * taken modulo 2^16.
 +     *
 +     * NOTE(review): this counts avail ring entries (one per element), while
 +     * the caller compares the result against a descriptor count. Confirm
 +     * that elements spanning several descriptors are accounted for.
 +     */
 +    uint16_t in_flight = svq->shadow_avail_idx - svq->shadow_used_idx;
 +
 +    return svq->vring.num - in_flight;
 +}
 +
 +/**
 + * Write a run of buffers into the SVQ descriptor table, taking descriptors
 + * from the free list that starts at svq->free_head.
 + *
 + * @svq: Shadow VirtQueue
 + * @iovec: Buffers to expose, one descriptor each
 + * @num: Number of entries in @iovec. Nothing is written if it is zero
 + * @more_descs: True if the last descriptor written must chain to another one
 + * @write: True if the buffers are device-writable
 + *
 + * On return svq->free_head points to the descriptor that follows the last
 + * one written.
 + */
 +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
 +                                    const struct iovec *iovec, size_t num,
 +                                    bool more_descs, bool write)
 +{
 +    uint16_t i = svq->free_head, last = svq->free_head;
 +    size_t n;
 +    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
 +    vring_desc_t *descs = svq->vring.desc;
 +
 +    if (num == 0) {
 +        return;
 +    }
 +
 +    for (n = 0; n < num; n++) {
 +        if (more_descs || (n + 1 < num)) {
 +            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
 +        } else {
 +            descs[i].flags = flags;
 +        }
 +        /* Go through uintptr_t so 32 bit pointers are not sign extended */
 +        descs[i].addr = cpu_to_le64((hwaddr)(uintptr_t)iovec[n].iov_base);
 +        descs[i].len = cpu_to_le32(iovec[n].iov_len);
 +
 +        last = i;
 +        /* The next field is stored little endian: convert to host order */
 +        i = le16_to_cpu(descs[i].next);
 +    }
 +
 +    svq->free_head = le16_to_cpu(descs[last].next);
 +}
 +
 +/**
 + * Expose a guest element to the device through the split SVQ vring.
 + *
 + * @svq: Shadow VirtQueue
 + * @elem: Guest element whose out_sg / in_sg buffers are written as a chain
 + * @head: Set to the descriptor index used as head of the chain, that is
 + *        svq->free_head at entry. It is written even if the call fails.
 + *
 + * Returns false, without touching the vring, if @elem has no descriptors.
 + */
 +static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
 +                                VirtQueueElement *elem, unsigned *head)
 +{
 +    unsigned avail_idx;
 +    vring_avail_t *avail = svq->vring.avail;
 +
 +    *head = svq->free_head;
 +
 +    /* We need some descriptors here */
 +    if (unlikely(!elem->out_num && !elem->in_num)) {
 +        qemu_log_mask(LOG_GUEST_ERROR,
 +                      "Guest provided element with no descriptors");
 +        return false;
 +    }
 +
 +    /* Device-readable buffers first, chained to the writable ones if any */
 +    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0,
 +                            false);
 +    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
 +
 +    /*
 +     * Put the entry in the available array, but do not publish it through
 +     * avail->idx until the descriptors are visible. The mask assumes that
 +     * vring.num is a power of two.
 +     */
 +    avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
 +    avail->ring[avail_idx] = cpu_to_le16(*head);
 +    svq->shadow_avail_idx++;
 +
 +    /* Update the avail index only after writing the descriptors */
 +    smp_wmb();
 +    avail->idx = cpu_to_le16(svq->shadow_avail_idx);
 +
 +    return true;
 +}
 +
 +/**
 + * Add an element to the SVQ and remember it by its head descriptor.
 + *
 + * @svq: Shadow VirtQueue
 + * @elem: Guest element to expose to the device
 + *
 + * Takes ownership of @elem. On success it is stored in svq->ring_id_maps
 + * until the device uses it. On failure it is freed here, since the caller
 + * only stops processing and would leak it otherwise.
 + *
 + * Returns true if the element was made available to the device.
 + */
 +static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
 +{
 +    unsigned qemu_head;
 +    bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
 +    if (unlikely(!ok)) {
 +        g_free(elem);
 +        return false;
 +    }
 +
 +    svq->ring_id_maps[qemu_head] = elem;
 +    return true;
 +}
 +
 +/**
 + * Notify the device that there are new available buffers, unless the device
 + * asked not to be notified through VRING_USED_F_NO_NOTIFY.
 + *
 + * @svq: Shadow VirtQueue
 + */
 +static void vhost_svq_kick(VhostShadowVirtqueue *svq)
 +{
 +    /*
 +     * We need to expose the available array entries before checking the used
 +     * flags
 +     */
 +    smp_mb();
 +    /* used->flags is little endian in the vring: compare in that order */
 +    if (svq->vring.used->flags & cpu_to_le16(VRING_USED_F_NO_NOTIFY)) {
 +        return;
 +    }
 +
 +    event_notifier_set(&svq->hdev_kick);
 +}
 +
 +/**
 + * Forward available buffers.
 + *
 + * @svq: Shadow VirtQueue
 + *
 + * Note that this function does not guarantee that all guest's available
 + * buffers are available to the device in SVQ avail ring. The guest may have
 + * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in
 + * qemu vaddr.
 + *
 + * If that happens, guest's kick notifications will be disabled until the
 + * device uses some buffers.
 + */
 +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
 +{
 +    /* Clear event notifier */
 +    event_notifier_test_and_clear(&svq->svq_kick);
 +
 +    /* Forward to the device as many available buffers as possible */
 +    do {
 +        /* No need for guest notifications while the queue is being drained */
 +        virtio_queue_set_notification(svq->vq, false);
 +
 +        while (true) {
 +            VirtQueueElement *elem;
 +            bool ok;
 +
 +            /* An element left over from a previous full SVQ goes first */
 +            if (svq->next_guest_avail_elem) {
 +                elem = g_steal_pointer(&svq->next_guest_avail_elem);
 +            } else {
 +                elem = virtqueue_pop(svq->vq, sizeof(*elem));
 +            }
 +
 +            if (!elem) {
 +                break;
 +            }
 +
 +            if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) {
 +                /*
 +                 * This condition is possible since a contiguous buffer in GPA
 +                 * does not imply a contiguous buffer in qemu's VA
 +                 * scatter-gather segments. If that happens, the buffer exposed
 +                 * to the device needs to be a chain of descriptors at this
 +                 * moment.
 +                 *
 +                 * SVQ cannot hold more available buffers if we are here:
 +                 * queue the current guest descriptor and ignore further kicks
 +                 * until some elements are used.
 +                 *
 +                 * Guest notifications stay disabled on this return path.
 +                 */
 +                svq->next_guest_avail_elem = elem;
 +                return;
 +            }
 +
 +            ok = vhost_svq_add(svq, elem);
 +            if (unlikely(!ok)) {
 +                /* VQ is broken, just return and ignore any other kicks */
 +                return;
 +            }
 +            vhost_svq_kick(svq);
 +        }
 +
 +        virtio_queue_set_notification(svq->vq, true);
 +        /* Loop again if the guest queue is not empty after re-enabling */
 +    } while (!virtio_queue_empty(svq->vq));
 +}
 +
 +/**
 + * Handle guest's kick.
   *
   * @n: guest kick event notifier, the one that guest set to notify svq.
   */
 -static void vhost_handle_guest_kick(EventNotifier *n)
 +static void vhost_handle_guest_kick_notifier(EventNotifier *n)
  {
-     if (s->big_endian) {
+     VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
--        s->data[offset * width + width - 1] = cpu_to_be16(val);
+     event_notifier_test_and_clear(n);
-+        if (width == 2) {
+-    event_notifier_set(&svq->hdev_kick);
-+            s->data[offset * 2] = 0;
++    vhost_handle_guest_kick(svq);
-+            s->data[offset * 2 + 1] = cpu_to_be16(val);
++}
-+        } else {
++
-+            s->data[offset] = cpu_to_be16(val);
++static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
-+        }
++{
-     } else {
++    if (svq->last_used_idx != svq->shadow_used_idx) {
--        s->data[offset * width] = cpu_to_le16(val);
++        return true;
-+        if (width == 2) {
++    }
-+            s->data[offset * 2] = cpu_to_le16(val);
++
-+            s->data[offset * 2 + 1] = 0;
++    svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
-+        } else {
++
-+            s->data[offset] = cpu_to_le16(val);
++    return svq->last_used_idx != svq->shadow_used_idx;
-+        }
+ }
  /**
 - * Forward vhost notifications
 + * Enable vhost device calls after disabling them.
 + *
 + * @svq: The svq
 + *
 + * It returns false if there are pending used buffers from the vhost device,
 + * avoiding the possible races between SVQ checking for more work and enabling
 + * callbacks. True if SVQ used vring has no more pending buffers.
 + */
 +static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
 +{
 +    const uint16_t no_interrupt = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
 +    bool pending;
 +
 +    svq->vring.avail->flags &= ~no_interrupt;
 +
 +    /* The flag must be written before used_idx is read */
 +    smp_mb();
 +    pending = vhost_svq_more_used(svq);
 +
 +    return !pending;
 +}
 +
 +/* Ask the device not to send calls by setting VRING_AVAIL_F_NO_INTERRUPT */
 +static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
 +{
 +    const uint16_t no_interrupt = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
 +
 +    svq->vring.avail->flags |= no_interrupt;
 +}
 +
 +/**
 + * Find the last descriptor of a chain by following the next links.
 + *
 + * @svq: Shadow VirtQueue
 + * @num: Number of descriptors in the chain
 + * @i: Head of the chain
 + */
 +static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq,
 +                                             uint16_t num, uint16_t i)
 +{
 +    const vring_desc_t *descs = svq->vring.desc;
 +
 +    for (uint16_t j = 0; j + 1 < num; ++j) {
 +        i = le16_to_cpu(descs[i].next);
 +    }
 +
 +    return i;
 +}
 +
 +/**
 + * Get the next guest element that the device has marked as used.
 + *
 + * @svq: Shadow VirtQueue
 + * @len: Set to the length reported by the device, only on success
 + *
 + * Returns the element, whose ownership passes to the caller, or NULL if
 + * there are no pending used buffers or the device reported an invalid id.
 + * The whole descriptor chain of the element goes back to the free list.
 + */
 +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
 +                                           uint32_t *len)
 +{
 +    vring_desc_t *descs = svq->vring.desc;
 +    const vring_used_t *used = svq->vring.used;
 +    const VirtQueueElement *elem;
 +    vring_used_elem_t used_elem;
 +    uint16_t last_used, last_used_chain, num;
 +
 +    if (!vhost_svq_more_used(svq)) {
 +        return NULL;
 +    }
 +
 +    /* Only get used array entries after they have been exposed by dev */
 +    smp_rmb();
 +    last_used = svq->last_used_idx & (svq->vring.num - 1);
 +    used_elem.id = le32_to_cpu(used->ring[last_used].id);
 +    used_elem.len = le32_to_cpu(used->ring[last_used].len);
 +
 +    svq->last_used_idx++;
 +    if (unlikely(used_elem.id >= svq->vring.num)) {
 +        qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
 +                      svq->vdev->name, used_elem.id);
 +        return NULL;
 +    }
 +
 +    elem = svq->ring_id_maps[used_elem.id];
 +    if (unlikely(!elem)) {
 +        qemu_log_mask(LOG_GUEST_ERROR,
 +            "Device %s says index %u is used, but it was not available",
 +            svq->vdev->name, used_elem.id);
 +        return NULL;
 +    }
 +
 +    /*
 +     * Relink the tail of the chain, not its head, to the free list: writing
 +     * the head's next would drop every other descriptor of the chain.
 +     */
 +    num = elem->out_num + elem->in_num;
 +    last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
 +    descs[last_used_chain].next = cpu_to_le16(svq->free_head);
 +    svq->free_head = used_elem.id;
 +
 +    *len = used_elem.len;
 +    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
 +}
 +
 +/**
 + * Return to the guest every buffer that the device has marked as used.
 + *
 + * @svq: Shadow VirtQueue
 + * @check_for_avail_queue: If true, and an element is waiting in
 + *                         svq->next_guest_avail_elem, try to forward
 + *                         available buffers again after each flush round.
 + */
 +static void vhost_svq_flush(VhostShadowVirtqueue *svq,
 +                            bool check_for_avail_queue)
 +{
 +    VirtQueue *vq = svq->vq;
 +
 +    /* Forward as many used buffers as possible. */
 +    do {
 +        /* Number of elements filled in the guest's vq in this round */
 +        unsigned i = 0;
 +
 +        vhost_svq_disable_notification(svq);
 +        while (true) {
 +            uint32_t len;
 +            /* Freed at the end of each iteration, after being filled */
 +            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
 +            if (!elem) {
 +                break;
 +            }
 +
 +            if (unlikely(i >= svq->vring.num)) {
 +                qemu_log_mask(LOG_GUEST_ERROR,
 +                         "More than %u used buffers obtained in a %u size SVQ",
 +                         i, svq->vring.num);
 +                virtqueue_fill(vq, elem, len, i);
 +                virtqueue_flush(vq, i);
 +                return;
 +            }
 +            virtqueue_fill(vq, elem, len, i++);
 +        }
 +
 +        virtqueue_flush(vq, i);
 +        /* Signal the guest through its call notifier */
 +        event_notifier_set(&svq->svq_call);
 +
 +        if (check_for_avail_queue && svq->next_guest_avail_elem) {
 +            /*
 +             * Avail ring was full when vhost_svq_flush was called, so it's a
 +             * good moment to make more descriptors available if possible.
 +             */
 +            vhost_handle_guest_kick(svq);
 +        }
 +        /* Loop again if the device used more buffers before re-enabling */
 +    } while (!vhost_svq_enable_notification(svq));
 +}
 +
 +/**
 + * Forward used buffers.
   *
   * @n: hdev call event notifier, the one that device set to notify svq.
 + *
 + * Note that we are not making any buffers available in the loop, there is no
 + * way that it runs more than virtqueue size times.
   */
  static void vhost_svq_handle_call(EventNotifier *n)
  {
      VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                               hdev_call);
      event_notifier_test_and_clear(n);
 -    event_notifier_set(&svq->svq_call);
 +    vhost_svq_flush(svq, true);
  }
  /**
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
      if (poll_start) {
          event_notifier_init_fd(svq_kick, svq_kick_fd);
          event_notifier_set(svq_kick);
 -        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick);
 +        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
 +    }
 +}
 +
 +/**
 + * Start the shadow virtqueue operation.
 + *
 + * @svq: Shadow Virtqueue
 + * @vdev: VirtIO device
 + * @vq: Virtqueue to shadow
 + */
 +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
 +                     VirtQueue *vq)
 +{
 +    size_t desc_size, driver_size, device_size;
 +
 +    /*
 +     * Reset the ring bookkeeping.
 +     * NOTE(review): free_head is not assigned here. Presumably it is still
 +     * zero from the allocation of svq; confirm for a start after a stop.
 +     */
 +    svq->next_guest_avail_elem = NULL;
 +    svq->shadow_avail_idx = 0;
 +    svq->shadow_used_idx = 0;
 +    svq->last_used_idx = 0;
 +    svq->vdev = vdev;
 +    svq->vq = vq;
 +
 +    /* The SVQ vring has as many entries as the guest's virtqueue */
 +    svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
 +    driver_size = vhost_svq_driver_area_size(svq);
 +    device_size = vhost_svq_device_area_size(svq);
 +    /* Descriptor table and avail ring share one zeroed, page aligned area */
 +    svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
 +    desc_size = sizeof(vring_desc_t) * svq->vring.num;
 +    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
 +    memset(svq->vring.desc, 0, driver_size);
 +    /* The used ring lives in its own zeroed, page aligned area */
 +    svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
 +    memset(svq->vring.used, 0, device_size);
 +    svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
 +    /* Build the initial free list: each descriptor links to the next one */
 +    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
 +        svq->vring.desc[i].next = cpu_to_le16(i + 1);
      }
  }
-@@ -XXX,XX +XXX,XX @@ static uint64_t dp8393x_read(void *opaque, hwaddr addr, unsigned int size)
+@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
+ void vhost_svq_stop(VhostShadowVirtqueue *svq)
-     DPRINTF("read 0x%04x from reg %s\n", val, reg_names[reg]);
+ {
+     event_notifier_set_handler(&svq->svq_kick, NULL);
--    return val;
++    g_autofree VirtQueueElement *next_avail_elem = NULL;
-+    return s->big_endian ? val << 16 : val;
++
 +    if (!svq->vq) {
 +        return;
 +    }
 +
 +    /* Send all pending used descriptors to guest */
 +    vhost_svq_flush(svq, false);
 +
 +    for (unsigned i = 0; i < svq->vring.num; ++i) {
 +        g_autofree VirtQueueElement *elem = NULL;
 +        elem = g_steal_pointer(&svq->ring_id_maps[i]);
 +        if (elem) {
 +            virtqueue_detach_element(svq->vq, elem, 0);
 +        }
 +    }
 +
 +    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
 +    if (next_avail_elem) {
 +        virtqueue_detach_element(svq->vq, next_avail_elem, 0);
 +    }
 +    svq->vq = NULL;
 +    g_free(svq->ring_id_maps);
 +    qemu_vfree(svq->vring.desc);
 +    qemu_vfree(svq->vring.used);
  }
- static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
+ /**
-@@ -XXX,XX +XXX,XX @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
+diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/virtio/vhost-shadow-virtqueue.h
 +++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
      /* Guest's call notifier, where the SVQ calls guest. */
      EventNotifier svq_call;
 +
 +    /* Virtio queue shadowing */
 +    VirtQueue *vq;
 +
 +    /* Virtio device */
 +    VirtIODevice *vdev;
 +
 +    /* Guest's element exposed at each SVQ descriptor head */
 +    VirtQueueElement **ring_id_maps;
 +
 +    /* Next VirtQueue element that guest made available */
 +    VirtQueueElement *next_guest_avail_elem;
 +
 +    /* Next head to expose to the device */
 +    uint16_t shadow_avail_idx;
 +
 +    /* Next free descriptor */
 +    uint16_t free_head;
 +
 +    /* Last seen used idx */
 +    uint16_t shadow_used_idx;
 +
 +    /* Next head to consume from the device */
 +    uint16_t last_used_idx;
  } VhostShadowVirtqueue;
  bool vhost_svq_valid_features(uint64_t features, Error **errp);
@@ -XXX,XX +XXX,XX @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
  size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
  size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
 +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
 +                     VirtQueue *vq);
  void vhost_svq_stop(VhostShadowVirtqueue *svq);
  VhostShadowVirtqueue *vhost_svq_new(void);
 diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/virtio/vhost-vdpa.c
 +++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
   * Note that this function does not rewind kick file descriptor if cannot set
   * call one.
   */
 -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
 -                                 VhostShadowVirtqueue *svq, unsigned idx,
 -                                 Error **errp)
 +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
 +                                  VhostShadowVirtqueue *svq, unsigned idx,
 +                                  Error **errp)
  {
-     dp8393xState *s = opaque;
+     struct vhost_vring_file file = {
-     int reg = addr >> s->it_shift;
+         .index = dev->vq_index + idx,
-+    uint32_t val = s->big_endian ? data >> 16 : data;
+@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
+     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
--    DPRINTF("write 0x%04x to reg %s\n", (uint16_t)data, reg_names[reg]);
+     if (unlikely(r != 0)) {
-+    DPRINTF("write 0x%04x to reg %s\n", (uint16_t)val, reg_names[reg]);
+         error_setg_errno(errp, -r, "Can't set device kick fd");
+-        return false;
-     switch (reg) {
++        return r;
          /* Command register */
          case SONIC_CR:
 -            dp8393x_do_command(s, data);
 +            dp8393x_do_command(s, val);
              break;
          /* Prevent write to read-only registers */
          case SONIC_CAP2:
@@ -XXX,XX +XXX,XX @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
          /* Accept write to some registers only when in reset mode */
          case SONIC_DCR:
              if (s->regs[SONIC_CR] & SONIC_CR_RST) {
 -                s->regs[reg] = data & 0xbfff;
 +                s->regs[reg] = val & 0xbfff;
              } else {
                  DPRINTF("writing to DCR invalid\n");
              }
              break;
          case SONIC_DCR2:
              if (s->regs[SONIC_CR] & SONIC_CR_RST) {
 -                s->regs[reg] = data & 0xf017;
 +                s->regs[reg] = val & 0xf017;
              } else {
                  DPRINTF("writing to DCR2 invalid\n");
              }
              break;
          /* 12 lower bytes are Read Only */
          case SONIC_TCR:
 -            s->regs[reg] = data & 0xf000;
 +            s->regs[reg] = val & 0xf000;
              break;
          /* 9 lower bytes are Read Only */
          case SONIC_RCR:
 -            s->regs[reg] = data & 0xffe0;
 +            s->regs[reg] = val & 0xffe0;
              break;
          /* Ignore most significant bit */
          case SONIC_IMR:
 -            s->regs[reg] = data & 0x7fff;
 +            s->regs[reg] = val & 0x7fff;
              dp8393x_update_irq(s);
              break;
          /* Clear bits by writing 1 to them */
          case SONIC_ISR:
 -            data &= s->regs[reg];
 -            s->regs[reg] &= ~data;
 -            if (data & SONIC_ISR_RBE) {
 +            val &= s->regs[reg];
 +            s->regs[reg] &= ~val;
 +            if (val & SONIC_ISR_RBE) {
                  dp8393x_do_read_rra(s);
              }
              dp8393x_update_irq(s);
@@ -XXX,XX +XXX,XX @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
          case SONIC_REA:
          case SONIC_RRP:
          case SONIC_RWP:
 -            s->regs[reg] = data & 0xfffe;
 +            s->regs[reg] = val & 0xfffe;
              break;
          /* Invert written value for some registers */
          case SONIC_CRCT:
          case SONIC_FAET:
          case SONIC_MPT:
 -            s->regs[reg] = data ^ 0xffff;
 +            s->regs[reg] = val ^ 0xffff;
              break;
          /* All other registers have no special contrainst */
          default:
 -            s->regs[reg] = data;
 +            s->regs[reg] = val;
      }
-     if (reg == SONIC_WT0 || reg == SONIC_WT1) {
+     event_notifier = &svq->hdev_call;
-@@ -XXX,XX +XXX,XX @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
+@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
- static const MemoryRegionOps dp8393x_ops = {
+         error_setg_errno(errp, -r, "Can't set device call fd");
-     .read = dp8393x_read,
+     }
-     .write = dp8393x_write,
--    .impl.min_access_size = 2,
++    return r;
--    .impl.max_access_size = 2,
++}
-+    .impl.min_access_size = 4,
++
-+    .impl.max_access_size = 4,
++/**
-     .endianness = DEVICE_NATIVE_ENDIAN,
++ * Unmap a SVQ area in the device
- };
++ */
 +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
 +                                      hwaddr size)
 +{
 +    /* The unmap size is rounded up to whole host pages */
 +    hwaddr aligned_size = ROUND_UP(size, qemu_real_host_page_size);
 +
 +    return vhost_vdpa_dma_unmap(v, iova, aligned_size) == 0;
 +}
 +
 +/**
 + * Unmap the driver and the device areas of a SVQ from the device.
 + *
 + * Returns true if both areas were unmapped. The device area is not touched
 + * if unmapping the driver area fails.
 + */
 +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
 +                                       const VhostShadowVirtqueue *svq)
 +{
 +    struct vhost_vdpa *v = dev->opaque;
 +    struct vhost_vring_addr svq_addr;
 +    size_t device_size = vhost_svq_device_area_size(svq);
 +    size_t driver_size = vhost_svq_driver_area_size(svq);
 +    bool driver_unmapped;
 +
 +    vhost_svq_get_vring_addr(svq, &svq_addr);
 +
 +    driver_unmapped = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr,
 +                                                driver_size);
 +
 +    return driver_unmapped &&
 +           vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
 +}
 +
 +/**
 + * Map the shadow virtqueue rings in the device
 + *
 + * @dev: The vhost device
 + * @svq: The shadow virtqueue
 + * @addr: Assigned IOVA addresses
 + * @errp: Error pointer
 + */
 +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
 +                                     const VhostShadowVirtqueue *svq,
 +                                     struct vhost_vring_addr *addr,
 +                                     Error **errp)
 +{
 +    struct vhost_vdpa *v = dev->opaque;
 +    size_t device_size = vhost_svq_device_area_size(svq);
 +    size_t driver_size = vhost_svq_driver_area_size(svq);
 +    int r;
 +
 +    ERRP_GUARD();
 +    vhost_svq_get_vring_addr(svq, addr);
 +
 +    /* The driver area (descriptors and avail ring) is read only for the dev */
 +    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
 +                           (void *)(uintptr_t)addr->desc_user_addr, true);
 +    if (unlikely(r != 0)) {
 +        /* error_setg_errno() appends ": <strerror>" by itself */
 +        error_setg_errno(errp, -r, "Cannot create vq driver region");
 +        return false;
 +    }
 +
 +    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
 +                           (void *)(uintptr_t)addr->used_user_addr, false);
 +    if (unlikely(r != 0)) {
 +        error_setg_errno(errp, -r, "Cannot create vq device region");
 +        /*
 +         * Do not leave the driver area mapped: on failure the caller only
 +         * stops the SVQ, it does not unmap the rings.
 +         */
 +        vhost_vdpa_svq_unmap_ring(v, addr->desc_user_addr, driver_size);
 +    }
 +
 +    return r == 0;
 +}
 +
 +/**
 + * Set the device vring base and the kick / call file descriptors of a SVQ.
 + *
 + * @dev: The vhost device
 + * @svq: The shadow virtqueue
 + * @idx: Index of the virtqueue, relative to dev->vq_index
 + * @errp: Error pointer
 + *
 + * Returns true on success.
 + */
 +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
 +                                 VhostShadowVirtqueue *svq, unsigned idx,
 +                                 Error **errp)
 +{
 +    uint16_t vq_index = dev->vq_index + idx;
 +    /* Only .index is set: .num, the base passed to the device, stays zero */
 +    struct vhost_vring_state s = {
 +        .index = vq_index,
 +    };
 +    int r;
 +
 +    r = vhost_vdpa_set_dev_vring_base(dev, &s);
 +    if (unlikely(r)) {
 +        error_setg_errno(errp, -r, "Cannot set vring base");
 +        return false;
 +    }
 +
 +    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
      return r == 0;
  }
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
      }
      for (i = 0; i < v->shadow_vqs->len; ++i) {
 +        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
          VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
 +        struct vhost_vring_addr addr = {
 +            .index = i,
 +        };
 +        int r;
          bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
          if (unlikely(!ok)) {
 -            error_reportf_err(err, "Cannot setup SVQ %u: ", i);
 +            goto err;
 +        }
 +
 +        vhost_svq_start(svq, dev->vdev, vq);
 +        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
 +        if (unlikely(!ok)) {
 +            goto err_map;
 +        }
 +
 +        /* Override vring GPA set by vhost subsystem */
 +        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
 +        if (unlikely(r != 0)) {
 +            error_setg_errno(&err, -r, "Cannot set device address");
 +            goto err_set_addr;
 +        }
 +    }
 +
 +    return true;
 +
 +err_set_addr:
 +    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
 +
 +err_map:
 +    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
 +
 +err:
 +    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
 +    for (unsigned j = 0; j < i; ++j) {
 +        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
 +        vhost_vdpa_svq_unmap_rings(dev, svq);
 +        vhost_svq_stop(svq);
 +    }
 +
 +    return false;
 +}
 +
 +static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
 +{
 +    struct vhost_vdpa *v = dev->opaque;
 +
 +    if (!v->shadow_vqs) {
 +        return true;
 +    }
 +
 +    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
 +        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
 +        bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
 +        if (unlikely(!ok)) {
              return false;
          }
      }
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
          }
          vhost_vdpa_set_vring_ready(dev);
      } else {
 +        ok = vhost_vdpa_svqs_stop(dev);
 +        if (unlikely(!ok)) {
 +            return -1;
 +        }
          vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
      }
 --
-.5.0
+.7.4

-[PULL 06/23] dp8393x: Clear RRRA command register bit only when appropriate
+[PULL V3 09/15] util: Add iova_tree_alloc_map
-From: Finn Thain <fthain@telegraphics.com.au>
+From: Eugenio Pérez <eperezma@redhat.com>
-It doesn't make sense to clear the command register bit unless the
+This iova tree function allows it to look for a hole in allocated
-command was actually issued.
+regions and return a totally new translation for a given translated
+address.
-Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Its usage is mainly to allow devices to access qemu address space,
-Tested-by: Laurent Vivier <laurent@vivier.eu>
+remapping guest's one into a new iova space where qemu can add chunks of
 addresses.
 Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
 Reviewed-by: Peter Xu <peterx@redhat.com>
 Acked-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hw/net/dp8393x.c | 7 +++----
+ include/qemu/iova-tree.h |  18 +++++++
-file changed, 3 insertions(+), 4 deletions(-)
+ util/iova-tree.c         | 136 +++++++++++++++++++++++++++++++++++++++++++++++
+files changed, 154 insertions(+)
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
 diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
+--- a/include/qemu/iova-tree.h
-+++ b/hw/net/dp8393x.c
++++ b/include/qemu/iova-tree.h
-@@ -XXX,XX +XXX,XX @@ static void dp8393x_do_read_rra(dp8393xState *s)
+@@ -XXX,XX +XXX,XX @@
-         s->regs[SONIC_ISR] |= SONIC_ISR_RBE;
+ #define  IOVA_OK           (0)
-         dp8393x_update_irq(s);
+ #define  IOVA_ERR_INVALID  (-1) /* Invalid parameters */
-     }
+ #define  IOVA_ERR_OVERLAP  (-2) /* IOVA range overlapped */
--
++#define  IOVA_ERR_NOMEM    (-3) /* Cannot allocate */
--    /* Done */
--    s->regs[SONIC_CR] &= ~SONIC_CR_RRRA;
+ typedef struct IOVATree IOVATree;
  typedef struct DMAMap {
@@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova);
  void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator);
  /**
 + * iova_tree_alloc_map:
 + *
 + * @tree: the iova tree to allocate from
 + * @map: the new map (as translated addr & size) to allocate in the iova region
 + * @iova_begin: the minimum address of the allocation
 + * @iova_end: the maximum address of the allocation
 + *
 + * Allocates a new region of a given size, between iova_begin and iova_end.
 + *
 + * Return: Same as iova_tree_insert, but cannot overlap and can return error if
 + * iova tree is out of free contiguous range. The caller gets the assigned iova
 + * in map->iova.
 + */
 +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin,
 +                        hwaddr iova_end);
 +
 +/**
   * iova_tree_destroy:
   *
   * @tree: the iova tree to destroy
 diff --git a/util/iova-tree.c b/util/iova-tree.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/iova-tree.c
 +++ b/util/iova-tree.c
@@ -XXX,XX +XXX,XX @@ struct IOVATree {
      GTree *tree;
  };
 +/* Args to pass to iova_tree_alloc foreach function. */
 +struct IOVATreeAllocArgs {
 +    /* Size of the desired allocation */
 +    size_t new_size;
 +
 +    /* The minimum address allowed in the allocation */
 +    hwaddr iova_begin;
 +
 +    /* Map at the left of the hole, can be NULL if "this" is first one */
 +    const DMAMap *prev;
 +
 +    /* Map at the right of the hole, can be NULL if "prev" is the last one */
 +    const DMAMap *this;
 +
 +    /* If found, we fill in the IOVA here */
 +    hwaddr iova_result;
 +
 +    /* Whether we have found a valid IOVA */
 +    bool iova_found;
 +};
 +
 +/**
 + * Iterate args to the next hole
 + *
 + * @args: The alloc arguments
 + * @next: The next mapping in the tree. Can be NULL to signal the last one
 + */
 +static void iova_tree_alloc_args_iterate(struct IOVATreeAllocArgs *args,
 +                                         const DMAMap *next)
 +{
 +    args->prev = args->this;
 +    args->this = next;
 +}
 +
  static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
  {
      const DMAMap *m1 = a, *m2 = b;
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map)
      return IOVA_OK;
  }
- static void dp8393x_do_software_reset(dp8393xState *s)
++/**
-@@ -XXX,XX +XXX,XX @@ static void dp8393x_do_command(dp8393xState *s, uint16_t command)
++ * Try to find an unallocated IOVA range between prev and this elements.
-         dp8393x_do_start_timer(s);
++ *
-     if (command & SONIC_CR_RST)
++ * @args: Arguments to allocation
-         dp8393x_do_software_reset(s);
++ *
--    if (command & SONIC_CR_RRRA)
++ * Cases:
-+    if (command & SONIC_CR_RRRA) {
++ *
-         dp8393x_do_read_rra(s);
++ * (1) !prev, !this: No entries allocated, always succeed
-+        s->regs[SONIC_CR] &= ~SONIC_CR_RRRA;
++ *
-+    }
++ * (2) !prev, this: We're iterating at the 1st element.
-     if (command & SONIC_CR_LCAM)
++ *
-         dp8393x_do_load_cam(s);
++ * (3) prev, !this: We're iterating at the last element.
- }
++ *
 + * (4) prev, this: this is the most common case, we'll try to find a hole
 + * between "prev" and "this" mapping.
 + *
 + * Note that this function assumes the last valid iova is HWADDR_MAX, but it
 + * searches linearly so it's easy to discard the result if it's not the case.
 + */
 +static void iova_tree_alloc_map_in_hole(struct IOVATreeAllocArgs *args)
 +{
 +    const DMAMap *prev = args->prev, *this = args->this;
 +    uint64_t hole_start, hole_last;
 +
 +    if (this && this->iova + this->size < args->iova_begin) {
 +        return;
 +    }
 +
 +    hole_start = MAX(prev ? prev->iova + prev->size + 1 : 0, args->iova_begin);
 +    hole_last = this ? this->iova : HWADDR_MAX;
 +
 +    if (hole_last - hole_start > args->new_size) {
 +        args->iova_result = hole_start;
 +        args->iova_found = true;
 +    }
 +}
 +
 +/**
 + * Foreach dma node in the tree, compare if there is a hole with its previous
 + * node (or minimum iova address allowed) and the node.
 + *
 + * @key: Node iterating
 + * @value: Node iterating
 + * @pargs: Struct to communicate with the outside world
 + *
 + * Return: false to keep iterating, true if needs break.
 + */
 +static gboolean iova_tree_alloc_traverse(gpointer key, gpointer value,
 +                                         gpointer pargs)
 +{
 +    struct IOVATreeAllocArgs *args = pargs;
 +    DMAMap *node = value;
 +
 +    assert(key == value);
 +
 +    iova_tree_alloc_args_iterate(args, node);
 +    iova_tree_alloc_map_in_hole(args);
 +    return args->iova_found;
 +}
 +
 +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin,
 +                        hwaddr iova_last)
 +{
 +    struct IOVATreeAllocArgs args = {
 +        .new_size = map->size,
 +        .iova_begin = iova_begin,
 +    };
 +
 +    if (unlikely(iova_last < iova_begin)) {
 +        return IOVA_ERR_INVALID;
 +    }
 +
 +    /*
 +     * Find a valid hole for the mapping
 +     *
 +     * Assuming low iova_begin, so no need to do a binary search to
 +     * locate the first node.
 +     *
 +     * TODO: Replace all this with g_tree_node_first/next/last when available
 +     * (from glib since 2.68). To do it with g_tree_foreach complicates the
 +     * code a lot.
 +     *
 +     */
 +    g_tree_foreach(tree->tree, iova_tree_alloc_traverse, &args);
 +    if (!args.iova_found) {
 +        /*
 +         * Either tree is empty or the last hole is still not checked.
 +         * g_tree_foreach does not compare (last, iova_last] range, so we check
 +         * it here.
 +         */
 +        iova_tree_alloc_args_iterate(&args, NULL);
 +        iova_tree_alloc_map_in_hole(&args);
 +    }
 +
 +    if (!args.iova_found || args.iova_result + map->size > iova_last) {
 +        return IOVA_ERR_NOMEM;
 +    }
 +
 +    map->iova = args.iova_result;
 +    return iova_tree_insert(tree, map);
 +}
 +
  void iova_tree_destroy(IOVATree *tree)
  {
      g_tree_destroy(tree->tree);
 --
-.5.0
+.7.4

-[PULL 20/23] tests/test-replication.c: Add test for for secondary node continuing replication
+[PULL V3 10/15] util: add iova_tree_find_iova
-From: Lukas Straub <lukasstraub2@web.de>
+From: Eugenio Pérez <eperezma@redhat.com>
-This simulates the case that happens when we resume COLO after failover.
+This function does the reverse operation of iova_tree_find: To look for
 a mapping that matches a translated address so we can do the reverse.
-Signed-off-by: Lukas Straub <lukasstraub2@web.de>
+This has linear complexity instead of logarithmic, but it supports
 overlapping HVA. Future developments could reduce it.
 Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
 Acked-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- tests/test-replication.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
+ include/qemu/iova-tree.h | 20 +++++++++++++++++++-
-file changed, 52 insertions(+)
+ util/iova-tree.c         | 34 ++++++++++++++++++++++++++++++++++
 files changed, 53 insertions(+), 1 deletion(-)
-diff --git a/tests/test-replication.c b/tests/test-replication.c
+diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
 index XXXXXXX..XXXXXXX 100644
---- a/tests/test-replication.c
+--- a/include/qemu/iova-tree.h
-+++ b/tests/test-replication.c
++++ b/include/qemu/iova-tree.h
-@@ -XXX,XX +XXX,XX @@ static void test_secondary_stop(void)
+@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map);
-     teardown_secondary();
+  * @tree: the iova tree to search from
   * @map: the mapping to search
   *
 - * Search for a mapping in the iova tree that overlaps with the
 + * Search for a mapping in the iova tree whose iova overlaps with the
   * mapping range specified.  Only the first found mapping will be
   * returned.
   *
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map);
  const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map);
  /**
 + * iova_tree_find_iova:
 + *
 + * @tree: the iova tree to search from
 + * @map: the mapping to search
 + *
 + * Search for a mapping in the iova tree whose translated_addr overlaps with the
 + * mapping range specified.  Only the first found mapping will be
 + * returned.
 + *
 + * Return: DMAMap pointer if found, or NULL if not found.  Note that
 + * the returned DMAMap pointer is maintained internally.  User should
 + * only read the content but never modify or free the content.  Also,
 + * user is responsible to make sure the pointer is valid (say, no
 + * concurrent deletion in progress).
 + */
 +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map);
 +
 +/**
   * iova_tree_find_address:
   *
   * @tree: the iova tree to search from
 diff --git a/util/iova-tree.c b/util/iova-tree.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/iova-tree.c
 +++ b/util/iova-tree.c
@@ -XXX,XX +XXX,XX @@ struct IOVATreeAllocArgs {
      bool iova_found;
  };
 +typedef struct IOVATreeFindIOVAArgs {
 +    const DMAMap *needle;
 +    const DMAMap *result;
 +} IOVATreeFindIOVAArgs;
 +
  /**
   * Iterate args to the next hole
   *
@@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map)
      return g_tree_lookup(tree->tree, map);
  }
-+static void test_secondary_continuous_replication(void)
++static gboolean iova_tree_find_address_iterator(gpointer key, gpointer value,
 +                                                gpointer data)
 +{
-+    BlockBackend *top_blk, *local_blk;
++    const DMAMap *map = key;
-+    Error *local_err = NULL;
++    IOVATreeFindIOVAArgs *args = data;
 +    const DMAMap *needle;
 +
-+    top_blk = start_secondary();
++    g_assert(key == value);
 +    replication_start_all(REPLICATION_MODE_SECONDARY, &local_err);
 +    g_assert(!local_err);
 +
-+    /* write 0x22 to s_local_disk (IMG_SIZE / 2, IMG_SIZE) */
++    needle = args->needle;
-+    local_blk = blk_by_name(S_LOCAL_DISK_ID);
++    if (map->translated_addr + map->size < needle->translated_addr ||
-+    test_blk_write(local_blk, 0x22, IMG_SIZE / 2, IMG_SIZE / 2, false);
++        needle->translated_addr + needle->size < map->translated_addr) {
 +        return false;
 +    }
 +
-+    /* replication will backup s_local_disk to s_hidden_disk */
++    args->result = map;
-+    test_blk_read(top_blk, 0x11, IMG_SIZE / 2,
++    return true;
 +                  IMG_SIZE / 2, 0, IMG_SIZE, false);
 +
 +    /* write 0x33 to s_active_disk (0, IMG_SIZE / 2) */
 +    test_blk_write(top_blk, 0x33, 0, IMG_SIZE / 2, false);
 +
 +    /* do failover (active commit) */
 +    replication_stop_all(true, &local_err);
 +    g_assert(!local_err);
 +
 +    /* it should ignore all requests from now on */
 +
 +    /* start after failover */
 +    replication_start_all(REPLICATION_MODE_PRIMARY, &local_err);
 +    g_assert(!local_err);
 +
 +    /* checkpoint */
 +    replication_do_checkpoint_all(&local_err);
 +    g_assert(!local_err);
 +
 +    /* stop */
 +    replication_stop_all(true, &local_err);
 +    g_assert(!local_err);
 +
 +    /* read from s_local_disk (0, IMG_SIZE / 2) */
 +    test_blk_read(top_blk, 0x33, 0, IMG_SIZE / 2,
 +                  0, IMG_SIZE / 2, false);
 +
 +
 +    /* read from s_local_disk (IMG_SIZE / 2, IMG_SIZE) */
 +    test_blk_read(top_blk, 0x22, IMG_SIZE / 2,
 +                  IMG_SIZE / 2, 0, IMG_SIZE, false);
 +
 +    teardown_secondary();
 +}
 +
- static void test_secondary_do_checkpoint(void)
++const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map)
 +{
 +    IOVATreeFindIOVAArgs args = {
 +        .needle = map,
 +    };
 +
 +    g_tree_foreach(tree->tree, iova_tree_find_address_iterator, &args);
 +    return args.result;
 +}
 +
  const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova)
  {
-     BlockBackend *top_blk, *local_blk;
+     const DMAMap map = { .iova = iova, .size = 0 };
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
      g_test_add_func("/replication/secondary/write", test_secondary_write);
      g_test_add_func("/replication/secondary/start", test_secondary_start);
      g_test_add_func("/replication/secondary/stop",  test_secondary_stop);
 +    g_test_add_func("/replication/secondary/continuous_replication",
 +                    test_secondary_continuous_replication);
      g_test_add_func("/replication/secondary/do_checkpoint",
                      test_secondary_do_checkpoint);
      g_test_add_func("/replication/secondary/get_error_all",
 --
-.5.0
+.7.4

-[PULL 21/23] net/filter.c: Add Options to insert filters anywhere in the filter list
+[PULL V3 11/15] vhost: Add VhostIOVATree
-From: Lukas Straub <lukasstraub2@web.de>
+From: Eugenio Pérez <eperezma@redhat.com>
-To switch the Secondary to Primary, we need to insert new filters
+This tree is able to look for a translated address from an IOVA address.
 before the filter-rewriter.
-Add the options insert= and position= to be able to insert filters
+At first glance it is similar to util/iova-tree. However, SVQ working on
-anywhere in the filter list.
+devices with limited IOVA space needs more capabilities, like allocating
 IOVA chunks or performing reverse translations (qemu addresses to iova).
-position should be "head" or "tail" to insert at the head or
+The allocation capability, as "assign a free IOVA address to this chunk
-tail of the filter list or it should be "id=<id>" to specify
+of memory in qemu's address space" allows shadow virtqueue to create a
-the id of another filter.
+new address space that is not restricted by guest's addressable one, so
-insert should be either "before" or "behind" to specify where to
+we can allocate shadow vqs vrings outside of it.
 insert the new filter relative to the one specified with position.
-Signed-off-by: Lukas Straub <lukasstraub2@web.de>
+It duplicates the tree so it can search efficiently in both directions,
-Reviewed-by: Zhang Chen <chen.zhang@intel.com>
+and it will signal overlap if iova or the translated address is present
 in any tree.
 Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
 Acked-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- include/net/filter.h |  2 ++
+ hw/virtio/meson.build       |   2 +-
- net/filter.c         | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++-
+ hw/virtio/vhost-iova-tree.c | 110 ++++++++++++++++++++++++++++++++++++++++++++
- qemu-options.hx      | 31 +++++++++++++++---
+ hw/virtio/vhost-iova-tree.h |  27 +++++++++++
-files changed, 119 insertions(+), 6 deletions(-)
+files changed, 138 insertions(+), 1 deletion(-)
  create mode 100644 hw/virtio/vhost-iova-tree.c
  create mode 100644 hw/virtio/vhost-iova-tree.h
-diff --git a/include/net/filter.h b/include/net/filter.h
+diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
 index XXXXXXX..XXXXXXX 100644
---- a/include/net/filter.h
+--- a/hw/virtio/meson.build
-+++ b/include/net/filter.h
++++ b/hw/virtio/meson.build
-@@ -XXX,XX +XXX,XX @@ struct NetFilterState {
+@@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
-     NetClientState *netdev;
-     NetFilterDirection direction;
+ virtio_ss = ss.source_set()
-     bool on;
+ virtio_ss.add(files('virtio.c'))
-+    char *position;
+-virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
-+    bool insert_before_flag;
++virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c'))
-     QTAILQ_ENTRY(NetFilterState) next;
+ virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
- };
+ virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
+ virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
-diff --git a/net/filter.c b/net/filter.c
+diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
-index XXXXXXX..XXXXXXX 100644
+new file mode 100644
---- a/net/filter.c
+index XXXXXXX..XXXXXXX
-+++ b/net/filter.c
+--- /dev/null
-@@ -XXX,XX +XXX,XX @@ static void netfilter_set_status(Object *obj, const char *str, Error **errp)
++++ b/hw/virtio/vhost-iova-tree.c
-     }
+@@ -XXX,XX +XXX,XX @@
- }
++/*
++ * vhost software live migration iova tree
-+static char *netfilter_get_position(Object *obj, Error **errp)
++ *
 + * SPDX-FileCopyrightText: Red Hat, Inc. 2021
 + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
 + *
 + * SPDX-License-Identifier: GPL-2.0-or-later
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qemu/iova-tree.h"
 +#include "vhost-iova-tree.h"
 +
 +#define iova_min_addr qemu_real_host_page_size
 +
 +/**
 + * VhostIOVATree, able to:
 + * - Translate iova address
 + * - Reverse translate iova address (from translated to iova)
 + * - Allocate IOVA regions for translated range (linear operation)
 + */
 +struct VhostIOVATree {
 +    /* First addressable iova address in the device */
 +    uint64_t iova_first;
 +
 +    /* Last addressable iova address in the device */
 +    uint64_t iova_last;
 +
 +    /* IOVA address to qemu memory maps. */
 +    IOVATree *iova_taddr_map;
 +};
 +
 +/**
 + * Create a new IOVA tree
 + *
 + * Returns the new IOVA tree
 + */
 +VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last)
 +{
-+    NetFilterState *nf = NETFILTER(obj);
++    VhostIOVATree *tree = g_new(VhostIOVATree, 1);
 +
-+    return g_strdup(nf->position);
++    /* Some devices do not like 0 addresses */
 +    tree->iova_first = MAX(iova_first, iova_min_addr);
 +    tree->iova_last = iova_last;
 +
 +    tree->iova_taddr_map = iova_tree_new();
 +    return tree;
 +}
 +
-+static void netfilter_set_position(Object *obj, const char *str, Error **errp)
++/**
 + * Delete an iova tree
 + */
 +void vhost_iova_tree_delete(VhostIOVATree *iova_tree)
 +{
-+    NetFilterState *nf = NETFILTER(obj);
++    iova_tree_destroy(iova_tree->iova_taddr_map);
-+
++    g_free(iova_tree);
 +    nf->position = g_strdup(str);
 +}
 +
-+static char *netfilter_get_insert(Object *obj, Error **errp)
++/**
 + * Find the IOVA address stored from a memory address
 + *
 + * @tree: The iova tree
 + * @map: The map with the memory address
 + *
 + * Return the stored mapping, or NULL if not found.
 + */
 +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree,
 +                                        const DMAMap *map)
 +{
-+    NetFilterState *nf = NETFILTER(obj);
++    return iova_tree_find_iova(tree->iova_taddr_map, map);
 +
 +    return nf->insert_before_flag ? g_strdup("before") : g_strdup("behind");
 +}
 +
-+static void netfilter_set_insert(Object *obj, const char *str, Error **errp)
++/**
 + * Allocate a new mapping
 + *
 + * @tree: The iova tree
 + * @map: The iova map
 + *
 + * Returns:
 + * - IOVA_OK if the map fits in the container
 + * - IOVA_ERR_INVALID if the map does not make sense (like size overflow)
 + * - IOVA_ERR_NOMEM if tree cannot allocate more space.
 + *
 + * It returns the assigned iova in map->iova if the return value is IOVA_OK.
 + */
 +int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map)
 +{
-+    NetFilterState *nf = NETFILTER(obj);
++    /* Some vhost devices do not like addr 0. Skip first page */
 +    hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size;
 +
-+    if (strcmp(str, "before") && strcmp(str, "behind")) {
++    if (map->translated_addr + map->size < map->translated_addr ||
-+        error_setg(errp, "Invalid value for netfilter insert, "
++        map->perm == IOMMU_NONE) {
-+                         "should be 'before' or 'behind'");
++        return IOVA_ERR_INVALID;
 +        return;
 +    }
 +
-+    nf->insert_before_flag = !strcmp(str, "before");
++    /* Allocate a node in IOVA address */
 +    return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first,
 +                               tree->iova_last);
 +}
 +
- static void netfilter_init(Object *obj)
++/**
- {
++ * Remove existing mappings from iova tree
-     NetFilterState *nf = NETFILTER(obj);
++ *
++ * @iova_tree: The vhost iova tree
-     nf->on = true;
++ * @map: The map to remove
-+    nf->insert_before_flag = false;
++ */
-+    nf->position = g_strdup("tail");
++void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map)
++{
-     object_property_add_str(obj, "netdev",
++    iova_tree_remove(iova_tree->iova_taddr_map, map);
-                             netfilter_get_netdev_id, netfilter_set_netdev_id,
++}
-@@ -XXX,XX +XXX,XX @@ static void netfilter_init(Object *obj)
+diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
-     object_property_add_str(obj, "status",
+new file mode 100644
-                             netfilter_get_status, netfilter_set_status,
+index XXXXXXX..XXXXXXX
-                             NULL);
+--- /dev/null
-+    object_property_add_str(obj, "position",
++++ b/hw/virtio/vhost-iova-tree.h
-+                            netfilter_get_position, netfilter_set_position,
+@@ -XXX,XX +XXX,XX @@
-+                            NULL);
++/*
-+    object_property_add_str(obj, "insert",
++ * vhost software live migration iova tree
-+                            netfilter_get_insert, netfilter_set_insert,
++ *
-+                            NULL);
++ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
- }
++ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
++ *
- static void netfilter_complete(UserCreatable *uc, Error **errp)
++ * SPDX-License-Identifier: GPL-2.0-or-later
- {
++ */
      NetFilterState *nf = NETFILTER(uc);
 +    NetFilterState *position = NULL;
      NetClientState *ncs[MAX_QUEUE_NUM];
      NetFilterClass *nfc = NETFILTER_GET_CLASS(uc);
      int queues;
@@ -XXX,XX +XXX,XX @@ static void netfilter_complete(UserCreatable *uc, Error **errp)
          return;
      }
 +    if (strcmp(nf->position, "head") && strcmp(nf->position, "tail")) {
 +        Object *container;
 +        Object *obj;
 +        char *position_id;
 +
-+        if (!g_str_has_prefix(nf->position, "id=")) {
++#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H
-+            error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "position",
++#define HW_VIRTIO_VHOST_IOVA_TREE_H
 +                       "'head', 'tail' or 'id=<id>'");
 +            return;
 +        }
 +
-+        /* get the id from the string */
++#include "qemu/iova-tree.h"
-+        position_id = g_strndup(nf->position + 3, strlen(nf->position) - 3);
++#include "exec/memory.h"
 +
-+        /* Search for the position to insert before/behind */
++typedef struct VhostIOVATree VhostIOVATree;
 +        container = object_get_objects_root();
 +        obj = object_resolve_path_component(container, position_id);
 +        if (!obj) {
 +            error_setg(errp, "filter '%s' not found", position_id);
 +            g_free(position_id);
 +            return;
 +        }
 +
-+        position = NETFILTER(obj);
++VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last);
 +void vhost_iova_tree_delete(VhostIOVATree *iova_tree);
 +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete);
 +
-+        if (position->netdev != ncs[0]) {
++const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree,
-+            error_setg(errp, "filter '%s' belongs to a different netdev",
++                                        const DMAMap *map);
-+                        position_id);
++int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map);
-+            g_free(position_id);
++void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map);
 +            return;
 +        }
 +
-+        g_free(position_id);
++#endif
 +    }
 +
      nf->netdev = ncs[0];
      if (nfc->setup) {
@@ -XXX,XX +XXX,XX @@ static void netfilter_complete(UserCreatable *uc, Error **errp)
              return;
          }
      }
 -    QTAILQ_INSERT_TAIL(&nf->netdev->filters, nf, next);
 +
 +    if (position) {
 +        if (nf->insert_before_flag) {
 +            QTAILQ_INSERT_BEFORE(position, nf, next);
 +        } else {
 +            QTAILQ_INSERT_AFTER(&nf->netdev->filters, position, nf, next);
 +        }
 +    } else if (!strcmp(nf->position, "head")) {
 +        QTAILQ_INSERT_HEAD(&nf->netdev->filters, nf, next);
 +    } else if (!strcmp(nf->position, "tail")) {
 +        QTAILQ_INSERT_TAIL(&nf->netdev->filters, nf, next);
 +    }
  }
  static void netfilter_finalize(Object *obj)
@@ -XXX,XX +XXX,XX @@ static void netfilter_finalize(Object *obj)
          QTAILQ_REMOVE(&nf->netdev->filters, nf, next);
      }
      g_free(nf->netdev_id);
 +    g_free(nf->position);
  }
  static void default_handle_event(NetFilterState *nf, int event, Error **errp)
 diff --git a/qemu-options.hx b/qemu-options.hx
 index XXXXXXX..XXXXXXX 100644
 --- a/qemu-options.hx
 +++ b/qemu-options.hx
@@ -XXX,XX +XXX,XX @@ applications, they can do this through this parameter. Its format is
  a gnutls priority string as described at
  @url{https://gnutls.org/manual/html_node/Priority-Strings.html}.
 -@item -object filter-buffer,id=@var{id},netdev=@var{netdevid},interval=@var{t}[,queue=@var{all|rx|tx}][,status=@var{on|off}]
 +@item -object filter-buffer,id=@var{id},netdev=@var{netdevid},interval=@var{t}[,queue=@var{all|rx|tx}][,status=@var{on|off}][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}]
  Interval @var{t} can't be 0, this filter batches the packet delivery: all
  packets arriving in a given interval on netdev @var{netdevid} are delayed
@@ -XXX,XX +XXX,XX @@ queue @var{all|rx|tx} is an option that can be applied to any netfilter.
  @option{tx}: the filter is attached to the transmit queue of the netdev,
               where it will receive packets sent by the netdev.
 -@item -object filter-mirror,id=@var{id},netdev=@var{netdevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support]
 +position @var{head|tail|id=<id>} is an option to specify where the
 +filter should be inserted in the filter list. It can be applied to any
 +netfilter.
 +
 +@option{head}: the filter is inserted at the head of the filter
 +               list, before any existing filters.
 +
 +@option{tail}: the filter is inserted at the tail of the filter
 +               list, behind any existing filters (default).
 +
 +@option{id=<id>}: the filter is inserted before or behind the filter
 +                  specified by <id>, see the insert option below.
 +
 +insert @var{behind|before} is an option to specify where to insert the
 +new filter relative to the one specified with position=id=<id>. It can
 +be applied to any netfilter.
 +
 +@option{before}: insert before the specified filter.
 +
 +@option{behind}: insert behind the specified filter (default).
 +
 +@item -object filter-mirror,id=@var{id},netdev=@var{netdevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}]
  filter-mirror on netdev @var{netdevid},mirror net packet to chardev@var{chardevid}, if it has the vnet_hdr_support flag, filter-mirror will mirror packet with vnet_hdr_len.
 -@item -object filter-redirector,id=@var{id},netdev=@var{netdevid},indev=@var{chardevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support]
 +@item -object filter-redirector,id=@var{id},netdev=@var{netdevid},indev=@var{chardevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}]
  filter-redirector on netdev @var{netdevid},redirect filter's net packet to chardev
  @var{chardevid},and redirect indev's packet to filter.if it has the vnet_hdr_support flag,
@@ -XXX,XX +XXX,XX @@ Create a filter-redirector we need to differ outdev id from indev id, id can not
  be the same. we can just use indev or outdev, but at least one of indev or outdev
  need to be specified.
 -@item -object filter-rewriter,id=@var{id},netdev=@var{netdevid},queue=@var{all|rx|tx},[vnet_hdr_support]
 +@item -object filter-rewriter,id=@var{id},netdev=@var{netdevid},queue=@var{all|rx|tx},[vnet_hdr_support][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}]
  Filter-rewriter is a part of COLO project.It will rewrite tcp packet to
  secondary from primary to keep secondary tcp connection,and rewrite
@@ -XXX,XX +XXX,XX @@ colo secondary:
  -object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1
  -object filter-rewriter,id=rew0,netdev=hn0,queue=all
 -@item -object filter-dump,id=@var{id},netdev=@var{dev}[,file=@var{filename}][,maxlen=@var{len}]
 +@item -object filter-dump,id=@var{id},netdev=@var{dev}[,file=@var{filename}][,maxlen=@var{len}][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}]
  Dump the network traffic on netdev @var{dev} to the file specified by
  @var{filename}. At most @var{len} bytes (64k by default) per packet are stored.
 --
-.5.0
+.7.4

-[PULL 03/23] dp8393x: Clean up endianness hacks
+[PULL V3 12/15] vdpa: Add custom IOTLB translations to SVQ
-From: Finn Thain <fthain@telegraphics.com.au>
+From: Eugenio Pérez <eperezma@redhat.com>
-According to the datasheet, section 3.4.4, "in 32-bit mode ... the SONIC
+Use translations added in VhostIOVATree in SVQ.
-always writes long words".
+Only introduce usage here, not allocation and deallocation. As with
-Therefore, use the same technique for the 'in_use' field that is used
+previous patches, we use the dead code paths of shadow_vqs_enabled to
-everywhere else, and write the full long word.
+avoid committing too many changes at once. These are impossible to take
+at the moment.
-Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
-Tested-by: Laurent Vivier <laurent@vivier.eu>
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hw/net/dp8393x.c | 17 ++++++-----------
+ hw/virtio/vhost-shadow-virtqueue.c |  86 +++++++++++++++++++++++---
-file changed, 6 insertions(+), 11 deletions(-)
+ hw/virtio/vhost-shadow-virtqueue.h |   6 +-
+ hw/virtio/vhost-vdpa.c             | 122 +++++++++++++++++++++++++++++++------
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
+ include/hw/virtio/vhost-vdpa.h     |   3 +
 files changed, 187 insertions(+), 30 deletions(-)
 diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
+--- a/hw/virtio/vhost-shadow-virtqueue.c
-+++ b/hw/net/dp8393x.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
+@@ -XXX,XX +XXX,XX @@ static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
-         return -1;
+     return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
  }
 -static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
 +/**
 + * Translate addresses between the qemu's virtual address and the SVQ IOVA
 + *
 + * @svq: Shadow VirtQueue
 + * @addrs: Translated IOVA addresses
 + * @iovec: Source qemu's VA addresses
 + * @num: Length of iovec and minimum length of addrs
 + */
 +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
 +                                     hwaddr *addrs, const struct iovec *iovec,
 +                                     size_t num)
 +{
 +    if (num == 0) {
 +        return true;
 +    }
 +
 +    for (size_t i = 0; i < num; ++i) {
 +        DMAMap needle = {
 +            .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
 +            .size = iovec[i].iov_len,
 +        };
 +        Int128 needle_last, map_last;
 +        size_t off;
 +
 +        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
 +        /*
 +         * Map cannot be NULL since iova map contains all guest space and
 +         * qemu already has a physical address mapped
 +         */
 +        if (unlikely(!map)) {
 +            qemu_log_mask(LOG_GUEST_ERROR,
 +                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
 +                          needle.translated_addr);
 +            return false;
 +        }
 +
 +        off = needle.translated_addr - map->translated_addr;
 +        addrs[i] = map->iova + off;
 +
 +        needle_last = int128_add(int128_make64(needle.translated_addr),
 +                                 int128_make64(iovec[i].iov_len));
 +        map_last = int128_make64(map->translated_addr + map->size);
 +        if (unlikely(int128_gt(needle_last, map_last))) {
 +            qemu_log_mask(LOG_GUEST_ERROR,
 +                          "Guest buffer expands over iova range");
 +            return false;
 +        }
 +    }
 +
 +    return true;
 +}
 +
 +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
                                      const struct iovec *iovec, size_t num,
                                      bool more_descs, bool write)
  {
@@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
          } else {
              descs[i].flags = flags;
          }
 -        descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base);
 +        descs[i].addr = cpu_to_le64(sg[n]);
          descs[i].len = cpu_to_le32(iovec[n].iov_len);
          last = i;
@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
  {
      unsigned avail_idx;
      vring_avail_t *avail = svq->vring.avail;
 +    bool ok;
 +    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num));
      *head = svq->free_head;
@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
          return false;
      }
--    /* XXX: Check byte ordering */
+-    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0,
--
+-                            false);
-     /* Check for EOL */
+-    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
-     if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) {
++    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
-         /* Are we still in resource exhaustion? */
++    if (unlikely(!ok)) {
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
++        return false;
-         /* EOL detected */
++    }
-         s->regs[SONIC_ISR] |= SONIC_ISR_RDE;
++    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
-     } else {
++                            elem->in_num > 0, false);
--        /* Clear in_use, but it is always 16bit wide */
++
--        int offset = dp8393x_crda(s) + sizeof(uint16_t) * 6 * width;
++
--        if (s->big_endian && width == 2) {
++    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
--            /* we need to adjust the offset of the 16bit field */
++    if (unlikely(!ok)) {
--            offset += sizeof(uint16_t);
++        return false;
--        }
++    }
--        s->data[0] = 0;
++
--        address_space_write(&s->as, offset, MEMTXATTRS_UNSPECIFIED,
++    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
--                            s->data, sizeof(uint16_t));
-+        /* Clear in_use */
+     /*
-+        size = sizeof(uint16_t) * width;
+      * Put the entry in the available array (but don't update avail->idx until
-+        address = dp8393x_crda(s) + sizeof(uint16_t) * 6 * width;
+@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
-+        dp8393x_put(s, width, 0, 0);
+ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
-+        address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED,
+                               struct vhost_vring_addr *addr)
-+                            s->data, size);
+ {
-         s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA];
+-    addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc;
-         s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX;
+-    addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail;
-         s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | (((s->regs[SONIC_RSC] & 0x00ff) + 1) & 0x00ff);
+-    addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used;
 +    addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc;
 +    addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail;
 +    addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used;
  }
  size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
@@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
   * Creates vhost shadow virtqueue, and instructs the vhost device to use the
   * shadow methods and file descriptors.
   *
 + * @iova_tree: Tree to perform descriptors translations
 + *
   * Returns the new virtqueue or NULL.
   *
   * In case of error, reason is reported through error_report.
   */
 -VhostShadowVirtqueue *vhost_svq_new(void)
 +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
  {
      g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
      int r;
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
      event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
      event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
 +    svq->iova_tree = iova_tree;
      return g_steal_pointer(&svq);
  err_init_hdev_call:
 diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/virtio/vhost-shadow-virtqueue.h
 +++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
  #include "qemu/event_notifier.h"
  #include "hw/virtio/virtio.h"
  #include "standard-headers/linux/vhost_types.h"
 +#include "hw/virtio/vhost-iova-tree.h"
  /* Shadow virtqueue to relay notifications */
  typedef struct VhostShadowVirtqueue {
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
      /* Virtio device */
      VirtIODevice *vdev;
 +    /* IOVA mapping */
 +    VhostIOVATree *iova_tree;
 +
      /* Map for use the guest's descriptors */
      VirtQueueElement **ring_id_maps;
@@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                       VirtQueue *vq);
  void vhost_svq_stop(VhostShadowVirtqueue *svq);
 -VhostShadowVirtqueue *vhost_svq_new(void);
 +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
  void vhost_svq_free(gpointer vq);
  G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
 diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
 --- a/hw/virtio/vhost-vdpa.c
 +++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                           vaddr, section->readonly);
      llsize = int128_sub(llend, int128_make64(iova));
 +    if (v->shadow_vqs_enabled) {
 +        DMAMap mem_region = {
 +            .translated_addr = (hwaddr)(uintptr_t)vaddr,
 +            .size = int128_get64(llsize) - 1,
 +            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
 +        };
 +
 +        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
 +        if (unlikely(r != IOVA_OK)) {
 +            error_report("Can't allocate a mapping (%d)", r);
 +            goto fail;
 +        }
 +
 +        iova = mem_region.iova;
 +    }
      vhost_vdpa_iotlb_batch_begin_once(v);
      ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
      llsize = int128_sub(llend, int128_make64(iova));
 +    if (v->shadow_vqs_enabled) {
 +        const DMAMap *result;
 +        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
 +            section->offset_within_region +
 +            (iova - section->offset_within_address_space);
 +        DMAMap mem_region = {
 +            .translated_addr = (hwaddr)(uintptr_t)vaddr,
 +            .size = int128_get64(llsize) - 1,
 +        };
 +
 +        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
 +        iova = result->iova;
 +        vhost_iova_tree_remove(v->iova_tree, &mem_region);
 +    }
      vhost_vdpa_iotlb_batch_begin_once(v);
      ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
      if (ret) {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
      shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
      for (unsigned n = 0; n < hdev->nvqs; ++n) {
 -        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
 +        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
          if (unlikely(!svq)) {
              error_setg(errp, "Cannot create svq %u", n);
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
  /**
   * Unmap a SVQ area in the device
   */
 -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
 -                                      hwaddr size)
 +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
 +                                      const DMAMap *needle)
  {
 +    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
 +    hwaddr size;
      int r;
 -    size = ROUND_UP(size, qemu_real_host_page_size);
 -    r = vhost_vdpa_dma_unmap(v, iova, size);
 +    if (unlikely(!result)) {
 +        error_report("Unable to find SVQ address to unmap");
 +        return false;
 +    }
 +
 +    size = ROUND_UP(result->size, qemu_real_host_page_size);
 +    r = vhost_vdpa_dma_unmap(v, result->iova, size);
      return r == 0;
  }
  static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                         const VhostShadowVirtqueue *svq)
  {
 +    DMAMap needle = {};
      struct vhost_vdpa *v = dev->opaque;
      struct vhost_vring_addr svq_addr;
 -    size_t device_size = vhost_svq_device_area_size(svq);
 -    size_t driver_size = vhost_svq_driver_area_size(svq);
      bool ok;
      vhost_svq_get_vring_addr(svq, &svq_addr);
 -    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
 +    needle.translated_addr = svq_addr.desc_user_addr;
 +    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
      if (unlikely(!ok)) {
          return false;
      }
 -    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
 +    needle.translated_addr = svq_addr.used_user_addr;
 +    return vhost_vdpa_svq_unmap_ring(v, &needle);
 +}
 +
 +/**
 + * Map the SVQ area in the device
 + *
 + * @v: Vhost-vdpa device
 + * @needle: The area to search iova
 + * @errp: Error pointer
 + */
 +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
 +                                    Error **errp)
 +{
 +    int r;
 +
 +    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
 +    if (unlikely(r != IOVA_OK)) {
 +        error_setg(errp, "Cannot allocate iova (%d)", r);
 +        return false;
 +    }
 +
 +    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
 +                           (void *)(uintptr_t)needle->translated_addr,
 +                           needle->perm == IOMMU_RO);
 +    if (unlikely(r != 0)) {
 +        error_setg_errno(errp, -r, "Cannot map region to device");
 +        vhost_iova_tree_remove(v->iova_tree, needle);
 +    }
 +
 +    return r == 0;
  }
  /**
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                       struct vhost_vring_addr *addr,
                                       Error **errp)
  {
 +    DMAMap device_region, driver_region;
 +    struct vhost_vring_addr svq_addr;
      struct vhost_vdpa *v = dev->opaque;
      size_t device_size = vhost_svq_device_area_size(svq);
      size_t driver_size = vhost_svq_driver_area_size(svq);
 -    int r;
 +    size_t avail_offset;
 +    bool ok;
      ERRP_GUARD();
 -    vhost_svq_get_vring_addr(svq, addr);
 +    vhost_svq_get_vring_addr(svq, &svq_addr);
 -    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
 -                           (void *)(uintptr_t)addr->desc_user_addr, true);
 -    if (unlikely(r != 0)) {
 -        error_setg_errno(errp, -r, "Cannot create vq driver region: ");
 +    driver_region = (DMAMap) {
 +        .translated_addr = svq_addr.desc_user_addr,
 +        .size = driver_size - 1,
 +        .perm = IOMMU_RO,
 +    };
 +    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
 +    if (unlikely(!ok)) {
 +        error_prepend(errp, "Cannot create vq driver region: ");
          return false;
      }
 +    addr->desc_user_addr = driver_region.iova;
 +    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
 +    addr->avail_user_addr = driver_region.iova + avail_offset;
 -    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
 -                           (void *)(intptr_t)addr->used_user_addr, false);
 -    if (unlikely(r != 0)) {
 -        error_setg_errno(errp, -r, "Cannot create vq device region: ");
 +    device_region = (DMAMap) {
 +        .translated_addr = svq_addr.used_user_addr,
 +        .size = device_size - 1,
 +        .perm = IOMMU_RW,
 +    };
 +    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
 +    if (unlikely(!ok)) {
 +        error_prepend(errp, "Cannot create vq device region: ");
 +        vhost_vdpa_svq_unmap_ring(v, &driver_region);
      }
 +    addr->used_user_addr = device_region.iova;
 -    return r == 0;
 +    return ok;
  }
  static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
 diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/virtio/vhost-vdpa.h
 +++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@
  #include <gmodule.h>
 +#include "hw/virtio/vhost-iova-tree.h"
  #include "hw/virtio/virtio.h"
  #include "standard-headers/linux/vhost_types.h"
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
      MemoryListener listener;
      struct vhost_vdpa_iova_range iova_range;
      bool shadow_vqs_enabled;
 +    /* IOVA mapping used by the Shadow Virtqueue */
 +    VhostIOVATree *iova_tree;
      GPtrArray *shadow_vqs;
      struct vhost_dev *dev;
      VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
 --
-.5.0
+.7.4

-[PULL 07/23] dp8393x: Implement packet size limit and RBAE interrupt
+[PULL V3 13/15] vdpa: Adapt vhost_vdpa_get_vring_base to SVQ
-From: Finn Thain <fthain@telegraphics.com.au>
+From: Eugenio Pérez <eperezma@redhat.com>
-Add a bounds check to prevent a large packet from causing a buffer
+This is needed to achieve migration, so the destination can restore its
-overflow. This is defensive programming -- I haven't actually tried
+index.
 sending an oversized packet or a jumbo ethernet frame.
-The SONIC handles packets that are too big for the buffer by raising
+Setting base as last used idx, so destination will see as available all
-the RBAE interrupt and dropping them. Linux uses that interrupt to
+the entries that the device did not use, including the in-flight
-count dropped packets.
+processing ones.
-Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
+This is ok for networking, but other kinds of devices might have
-Tested-by: Laurent Vivier <laurent@vivier.eu>
+problems with these retransmissions.
 Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
 Acked-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hw/net/dp8393x.c | 9 +++++++++
+ hw/virtio/vhost-vdpa.c | 17 +++++++++++++++++
-file changed, 9 insertions(+)
+file changed, 17 insertions(+)
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
+diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
+--- a/hw/virtio/vhost-vdpa.c
-+++ b/hw/net/dp8393x.c
++++ b/hw/virtio/vhost-vdpa.c
-@@ -XXX,XX +XXX,XX @@ do { printf("sonic ERROR: %s: " fmt, __func__ , ## __VA_ARGS__); } while (0)
+@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
- #define SONIC_TCR_CRCI   0x2000
+ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
- #define SONIC_TCR_PINT   0x8000
+                                        struct vhost_vring_state *ring)
+ {
-+#define SONIC_ISR_RBAE   0x0010
++    struct vhost_vdpa *v = dev->opaque;
- #define SONIC_ISR_RBE    0x0020
+     int ret;
- #define SONIC_ISR_RDE    0x0040
- #define SONIC_ISR_TC     0x0080
++    if (v->shadow_vqs_enabled) {
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
++        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
-     s->regs[SONIC_RCR] &= ~(SONIC_RCR_PRX | SONIC_RCR_LBK | SONIC_RCR_FAER |
++                                                      ring->index);
-         SONIC_RCR_CRCR | SONIC_RCR_LPKT | SONIC_RCR_BC | SONIC_RCR_MC);
++
++        /*
-+    if (pkt_size + 4 > dp8393x_rbwc(s) * 2) {
++         * Setting base as last used idx, so destination will see as available
-+        DPRINTF("oversize packet, pkt_size is %d\n", pkt_size);
++         * all the entries that the device did not use, including the in-flight
-+        s->regs[SONIC_ISR] |= SONIC_ISR_RBAE;
++         * processing ones.
-+        dp8393x_update_irq(s);
++         *
-+        dp8393x_do_read_rra(s);
++         * TODO: This is ok for networking, but other kinds of devices might
-+        return pkt_size;
++         * have problems with these retransmissions.
 +         */
 +        ring->num = svq->last_used_idx;
 +        return 0;
 +    }
 +
-     packet_type = dp8393x_receive_filter(s, buf, pkt_size);
+     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
-     if (packet_type < 0) {
+     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
-         DPRINTF("packet not for netcard\n");
+     return ret;
 --
-.5.0
+.7.4

-[PULL 12/23] dp8393x: Always update RRA pointers and sequence numbers
+[PULL V3 14/15] vdpa: Never set log_base addr if SVQ is enabled
-From: Finn Thain <fthain@telegraphics.com.au>
+From: Eugenio Pérez <eperezma@redhat.com>
-These operations need to take place regardless of whether or not
+Setting the log address would make the device start reporting invalid
-rx descriptors have been used up (that is, EOL flag was observed).
+dirty memory because the SVQ vrings are located in qemu's memory.
-The algorithm is now the same for a packet that was withheld as for
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
-a packet that was not.
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
 Tested-by: Laurent Vivier <laurent@vivier.eu>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hw/net/dp8393x.c | 12 +++++++-----
+ hw/virtio/vhost-vdpa.c | 3 ++-
-file changed, 7 insertions(+), 5 deletions(-)
+file changed, 2 insertions(+), 1 deletion(-)
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
+diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
+--- a/hw/virtio/vhost-vdpa.c
-+++ b/hw/net/dp8393x.c
++++ b/hw/virtio/vhost-vdpa.c
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
+@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
-         /* Move to next descriptor */
+ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
-         s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA];
+                                      struct vhost_log *log)
-         s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX;
+ {
--        s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | (((s->regs[SONIC_RSC] & 0x00ff) + 1) & 0x00ff);
+-    if (vhost_vdpa_one_time_request(dev)) {
-+    }
++    struct vhost_vdpa *v = dev->opaque;
++    if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) {
--        if (s->regs[SONIC_RCR] & SONIC_RCR_LPKT) {
+         return 0;
 -            /* Read next RRA */
 -            dp8393x_do_read_rra(s);
 -        }
 +    s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) |
 +                         ((s->regs[SONIC_RSC] + 1) & 0x00ff);
 +
 +    if (s->regs[SONIC_RCR] & SONIC_RCR_LPKT) {
 +        /* Read next RRA */
 +        dp8393x_do_read_rra(s);
      }
-     /* Done */
 --
-.5.0
+.7.4

-[PULL 05/23] dp8393x: Update LLFA and CRDA registers from rx descriptor
+[PULL V3 15/15] vdpa: Expose VHOST_F_LOG_ALL on SVQ
-From: Finn Thain <fthain@telegraphics.com.au>
+From: Eugenio Pérez <eperezma@redhat.com>
-Follow the algorithm given in the National Semiconductor DP83932C
+SVQ is able to log the dirty bits by itself, so let's use it to not
-datasheet in section 3.4.7:
+block migration.
-    At the next reception, the SONIC re-reads the last RXpkt.link field,
+Also, ignore set and clear of VHOST_F_LOG_ALL on set_features if SVQ is
-    and updates its CRDA register to point to the next descriptor.
+enabled. Even if the device supports it, the reports would be nonsense
 because SVQ memory is in the qemu region.
-The chip is designed to allow the host to provide a new list of
+The log region is still allocated. Future changes might skip that, but
-descriptors in this way.
+this series is already long enough.
-Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
-Tested-by: Laurent Vivier <laurent@vivier.eu>
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hw/net/dp8393x.c | 11 +++++++----
+ hw/virtio/vhost-vdpa.c         | 39 +++++++++++++++++++++++++++++++++++----
-file changed, 7 insertions(+), 4 deletions(-)
+ include/hw/virtio/vhost-vdpa.h |  1 +
 files changed, 36 insertions(+), 4 deletions(-)
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
+diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
+--- a/hw/virtio/vhost-vdpa.c
-+++ b/hw/net/dp8393x.c
++++ b/hw/virtio/vhost-vdpa.c
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
+@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
-         address = dp8393x_crda(s) + sizeof(uint16_t) * 5 * width;
+     return v->index != 0;
-         address_space_read(&s->as, address, MEMTXATTRS_UNSPECIFIED,
+ }
-                            s->data, size);
--        if (dp8393x_get(s, width, 0) & SONIC_DESC_EOL) {
++static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
-+        s->regs[SONIC_LLFA] = dp8393x_get(s, width, 0);
++                                       uint64_t *features)
-+        if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) {
++{
-             /* Still EOL ; stop reception */
++    int ret;
-             return -1;
++
--        } else {
++    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
--            s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA];
++    trace_vhost_vdpa_get_features(dev, *features);
-         }
++    return ret;
-+        /* Link has been updated by host */
++}
-+        s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA];
++
  static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                                 Error **errp)
  {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
          return 0;
      }
-     /* Save current position */
+-    r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features);
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
++    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
-                         MEMTXATTRS_UNSPECIFIED,
+     if (r != 0) {
-                         s->data, size);
+         error_setg_errno(errp, -r, "Can't get vdpa device features");
+         return r;
--    /* Move to next descriptor */
+@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
-+    /* Check link field */
+ static int vhost_vdpa_set_features(struct vhost_dev *dev,
-     size = sizeof(uint16_t) * width;
+                                    uint64_t features)
-     address_space_read(&s->as,
+ {
-                        dp8393x_crda(s) + sizeof(uint16_t) * 5 * width,
++    struct vhost_vdpa *v = dev->opaque;
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
+     int ret;
-         dp8393x_put(s, width, 0, 0);
-         address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED,
+     if (vhost_vdpa_one_time_request(dev)) {
-                             s->data, size);
+         return 0;
      }
 +    if (v->shadow_vqs_enabled) {
 +        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
 +            /*
 +             * QEMU is just trying to enable or disable logging. SVQ handles
 +             * this separately, so no need to forward this.
 +             */
 +            v->acked_features = features;
 +            return 0;
 +        }
 +
-+        /* Move to next descriptor */
++        v->acked_features = features;
-         s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA];
++
-         s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX;
++        /* We must not ack _F_LOG if SVQ is enabled */
-         s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | (((s->regs[SONIC_RSC] & 0x00ff) + 1) & 0x00ff);
++        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
 +    }
 +
      trace_vhost_vdpa_set_features(dev, features);
      ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
      if (ret) {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
  static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                       uint64_t *features)
  {
 -    int ret;
 +    struct vhost_vdpa *v = dev->opaque;
 +    int ret = vhost_vdpa_get_dev_features(dev, features);
 +
 +    if (ret == 0 && v->shadow_vqs_enabled) {
 +        /* Add SVQ logging capabilities */
 +        *features |= BIT_ULL(VHOST_F_LOG_ALL);
 +    }
 -    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
 -    trace_vhost_vdpa_get_features(dev, *features);
      return ret;
  }
 diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/virtio/vhost-vdpa.h
 +++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
      bool iotlb_batch_begin_sent;
      MemoryListener listener;
      struct vhost_vdpa_iova_range iova_range;
 +    uint64_t acked_features;
      bool shadow_vqs_enabled;
      /* IOVA mapping used by the Shadow Virtqueue */
      VhostIOVATree *iova_tree;
 --
-.5.0
+.7.4

-[PULL 08/23] dp8393x: Don't clobber packet checksum
+Deleted patch
-From: Finn Thain <fthain@telegraphics.com.au>
-A received packet consumes pkt_size bytes in the buffer and the frame
-checksum that's appended to it consumes another 4 bytes. The Receive
-Buffer Address register takes the former quantity into account but
-not the latter. So the next packet written to the buffer overwrites
-the frame checksum. Fix this.
-Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Tested-by: Laurent Vivier <laurent@vivier.eu>
-Signed-off-by: Jason Wang <jasowang@redhat.com>
----
- hw/net/dp8393x.c | 1 +
-file changed, 1 insertion(+)
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
-index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
-+++ b/hw/net/dp8393x.c
-@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
-     address += rx_len;
-     address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED,
-                         &checksum, 4);
-+    address += 4;
-     rx_len += 4;
-     s->regs[SONIC_CRBA1] = address >> 16;
-     s->regs[SONIC_CRBA0] = address & 0xffff;
---
-.5.0

-[PULL 09/23] dp8393x: Use long-word-aligned RRA pointers in 32-bit mode
+Deleted patch
-From: Finn Thain <fthain@telegraphics.com.au>
-Section 3.4.1 of the datasheet says,
-    The alignment of the RRA is confined to either word or long word
-    boundaries, depending upon the data width mode. In 16-bit mode,
-    the RRA must be aligned to a word boundary (A0 is always zero)
-    and in 32-bit mode, the RRA is aligned to a long word boundary
-    (A0 and A1 are always zero).
-This constraint has been implemented for 16-bit mode; implement it
-for 32-bit mode too.
-Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
-Tested-by: Laurent Vivier <laurent@vivier.eu>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Signed-off-by: Jason Wang <jasowang@redhat.com>
----
- hw/net/dp8393x.c | 8 ++++++--
-file changed, 6 insertions(+), 2 deletions(-)
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
-index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
-+++ b/hw/net/dp8393x.c
-@@ -XXX,XX +XXX,XX @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
-                 qemu_flush_queued_packets(qemu_get_queue(s->nic));
-             }
-             break;
--        /* Ignore least significant bit */
-+        /* The guest is required to store aligned pointers here */
-         case SONIC_RSA:
-         case SONIC_REA:
-         case SONIC_RRP:
-         case SONIC_RWP:
--            s->regs[reg] = val & 0xfffe;
-+            if (s->regs[SONIC_DCR] & SONIC_DCR_DW) {
-+                s->regs[reg] = val & 0xfffc;
-+            } else {
-+                s->regs[reg] = val & 0xfffe;
-+            }
-             break;
-         /* Invert written value for some registers */
-         case SONIC_CRCT:
---
-.5.0

-[PULL 13/23] dp8393x: Don't reset Silicon Revision register
+Deleted patch
-From: Finn Thain <fthain@telegraphics.com.au>
-The jazzsonic driver in Linux uses the Silicon Revision register value
-to probe the chip. The driver fails unless the SR register contains 4.
-Unfortunately, reading this register in QEMU usually returns 0 because
-the s->regs[] array gets wiped after a software reset.
-Fixes: bd8f1ebce4 ("net/dp8393x: fix hardware reset")
-Suggested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
-Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
-Signed-off-by: Jason Wang <jasowang@redhat.com>
----
- hw/net/dp8393x.c | 2 +-
-file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
-index XXXXXXX..XXXXXXX 100644
---- a/hw/net/dp8393x.c
-+++ b/hw/net/dp8393x.c
-@@ -XXX,XX +XXX,XX @@ static void dp8393x_reset(DeviceState *dev)
-     timer_del(s->watchdog);
-     memset(s->regs, 0, sizeof(s->regs));
-+    s->regs[SONIC_SR] = 0x0004; /* only revision recognized by Linux/mips */
-     s->regs[SONIC_CR] = SONIC_CR_RST | SONIC_CR_STP | SONIC_CR_RXDIS;
-     s->regs[SONIC_DCR] &= ~(SONIC_DCR_EXBUS | SONIC_DCR_LBR);
-     s->regs[SONIC_RCR] &= ~(SONIC_RCR_LB0 | SONIC_RCR_LB1 | SONIC_RCR_BRD | SONIC_RCR_RNT);
-@@ -XXX,XX +XXX,XX @@ static void dp8393x_realize(DeviceState *dev, Error **errp)
-     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
-     s->watchdog = timer_new_ns(QEMU_CLOCK_VIRTUAL, dp8393x_watchdog, s);
--    s->regs[SONIC_SR] = 0x0004; /* only revision recognized by Linux */
-     memory_region_init_ram(&s->prom, OBJECT(dev),
-                            "dp8393x-prom", SONIC_PROM_SIZE, &local_err);
---
-.5.0

-[PULL 17/23] NetRxPkt: fix hash calculation of IPV6 TCP
+Deleted patch
-From: Yuri Benditovich <yuri.benditovich@daynix.com>
-When requested to calculate the hash for TCPV6 packet,
-ignore overrides of source and destination addresses
-in extension headers.
-Use these overrides when new hash type NetPktRssIpV6TcpEx
-requested.
-Use this type in e1000e hash calculation for IPv6 TCP, which
-should take into account overrides of the addresses.
-Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
-Acked-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
-Signed-off-by: Jason Wang <jasowang@redhat.com>
----
- hw/net/e1000e_core.c | 2 +-
- hw/net/net_rx_pkt.c  | 2 +-
-files changed, 2 insertions(+), 2 deletions(-)
-diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
-index XXXXXXX..XXXXXXX 100644
---- a/hw/net/e1000e_core.c
-+++ b/hw/net/e1000e_core.c
-@@ -XXX,XX +XXX,XX @@ e1000e_rss_calc_hash(E1000ECore *core,
-         type = NetPktRssIpV4Tcp;
-         break;
-     case E1000_MRQ_RSS_TYPE_IPV6TCP:
--        type = NetPktRssIpV6Tcp;
-+        type = NetPktRssIpV6TcpEx;
-         break;
-     case E1000_MRQ_RSS_TYPE_IPV6:
-         type = NetPktRssIpV6;
-diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c
-index XXXXXXX..XXXXXXX 100644
---- a/hw/net/net_rx_pkt.c
-+++ b/hw/net/net_rx_pkt.c
-@@ -XXX,XX +XXX,XX @@ net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt,
-         assert(pkt->isip6);
-         assert(pkt->istcp);
-         trace_net_rx_pkt_rss_ip6_tcp();
--        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
-+        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length);
-         _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length);
-         break;
-     case NetPktRssIpV6:
---
-.5.0

-[PULL 18/23] hw: net: cadence_gem: Fix build errors in DB_PRINT()
+Deleted patch
-From: Bin Meng <bmeng.cn@gmail.com>
-When CADENCE_GEM_ERR_DEBUG is turned on, there are several
-compilation errors in DB_PRINT(). Fix them.
-While we are here, update to use appropriate modifiers in
-the same DB_PRINT() call.
-Signed-off-by: Bin Meng <bmeng.cn@gmail.com>
-Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
-Signed-off-by: Jason Wang <jasowang@redhat.com>
----
- hw/net/cadence_gem.c | 11 ++++++-----
-file changed, 6 insertions(+), 5 deletions(-)
-diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c
-index XXXXXXX..XXXXXXX 100644
---- a/hw/net/cadence_gem.c
-+++ b/hw/net/cadence_gem.c
-@@ -XXX,XX +XXX,XX @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size)
-             return -1;
-         }
--        DB_PRINT("copy %d bytes to 0x%x\n", MIN(bytes_to_copy, rxbufsize),
--                rx_desc_get_buffer(s->rx_desc[q]));
-+        DB_PRINT("copy %u bytes to 0x%" PRIx64 "\n",
-+                 MIN(bytes_to_copy, rxbufsize),
-+                 rx_desc_get_buffer(s, s->rx_desc[q]));
-         /* Copy packet data to emulated DMA buffer */
-         address_space_write(&s->dma_as, rx_desc_get_buffer(s, s->rx_desc[q]) +
-@@ -XXX,XX +XXX,XX @@ static void gem_transmit(CadenceGEMState *s)
-             if (tx_desc_get_length(desc) > sizeof(tx_packet) -
-                                                (p - tx_packet)) {
--                DB_PRINT("TX descriptor @ 0x%x too large: size 0x%x space " \
--                         "0x%x\n", (unsigned)packet_desc_addr,
--                         (unsigned)tx_desc_get_length(desc),
-+                DB_PRINT("TX descriptor @ 0x%" HWADDR_PRIx \
-+                         " too large: size 0x%x space 0x%zx\n",
-+                         packet_desc_addr, tx_desc_get_length(desc),
-                          sizeof(tx_packet) - (p - tx_packet));
-                 break;
-             }
---
-.5.0

-[PULL 22/23] colo: Update Documentation for continuous replication
+Deleted patch
-From: Lukas Straub <lukasstraub2@web.de>
-Document the qemu command-line and qmp commands for continuous replication
-Signed-off-by: Lukas Straub <lukasstraub2@web.de>
-Signed-off-by: Jason Wang <jasowang@redhat.com>
----
- docs/COLO-FT.txt           | 224 +++++++++++++++++++++++++++++++++------------
- docs/block-replication.txt |  28 ++++--
-files changed, 184 insertions(+), 68 deletions(-)
-diff --git a/docs/COLO-FT.txt b/docs/COLO-FT.txt
-index XXXXXXX..XXXXXXX 100644
---- a/docs/COLO-FT.txt
-+++ b/docs/COLO-FT.txt
-@@ -XXX,XX +XXX,XX @@ The diagram just shows the main qmp command, you can get the detail
- in test procedure.
- == Test procedure ==
--1. Startup qemu
--Primary:
--# qemu-system-x86_64 -accel kvm -m 2048 -smp 2 -qmp stdio -name primary \
--  -device piix3-usb-uhci -vnc :7 \
--  -device usb-tablet -netdev tap,id=hn0,vhost=off \
--  -device virtio-net-pci,id=net-pci0,netdev=hn0 \
--  -drive if=virtio,id=primary-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\
--         children.0.file.filename=1.raw,\
--         children.0.driver=raw -S
--Secondary:
--# qemu-system-x86_64 -accel kvm -m 2048 -smp 2 -qmp stdio -name secondary \
--  -device piix3-usb-uhci -vnc :7 \
--  -device usb-tablet -netdev tap,id=hn0,vhost=off \
--  -device virtio-net-pci,id=net-pci0,netdev=hn0 \
--  -drive if=none,id=secondary-disk0,file.filename=1.raw,driver=raw,node-name=node0 \
--  -drive if=virtio,id=active-disk0,driver=replication,mode=secondary,\
--         file.driver=qcow2,top-id=active-disk0,\
--         file.file.filename=/mnt/ramfs/active_disk.img,\
--         file.backing.driver=qcow2,\
--         file.backing.file.filename=/mnt/ramfs/hidden_disk.img,\
--         file.backing.backing=secondary-disk0 \
--  -incoming tcp:0:8888
--
--2. On Secondary VM's QEMU monitor, issue command
-+Note: Here we are running both instances on the same host for testing,
-+change the IP Addresses if you want to run it on two hosts. Initially
-+127.0.0.1 is the Primary Host and 127.0.0.2 is the Secondary Host.
-+
-+== Startup qemu ==
-+1. Primary:
-+Note: Initially, $imagefolder/primary.qcow2 needs to be copied to all hosts.
-+You don't need to change any IP's here, because 0.0.0.0 listens on any
-+interface. The chardev's with 127.0.0.1 IP's loopback to the local qemu
-+instance.
-+
-+# imagefolder="/mnt/vms/colo-test-primary"
-+
-+# qemu-system-x86_64 -enable-kvm -cpu qemu64,+kvmclock -m 512 -smp 1 -qmp stdio \
-+   -device piix3-usb-uhci -device usb-tablet -name primary \
-+   -netdev tap,id=hn0,vhost=off,helper=/usr/lib/qemu/qemu-bridge-helper \
-+   -device rtl8139,id=e0,netdev=hn0 \
-+   -chardev socket,id=mirror0,host=0.0.0.0,port=9003,server,nowait \
-+   -chardev socket,id=compare1,host=0.0.0.0,port=9004,server,wait \
-+   -chardev socket,id=compare0,host=127.0.0.1,port=9001,server,nowait \
-+   -chardev socket,id=compare0-0,host=127.0.0.1,port=9001 \
-+   -chardev socket,id=compare_out,host=127.0.0.1,port=9005,server,nowait \
-+   -chardev socket,id=compare_out0,host=127.0.0.1,port=9005 \
-+   -object filter-mirror,id=m0,netdev=hn0,queue=tx,outdev=mirror0 \
-+   -object filter-redirector,netdev=hn0,id=redire0,queue=rx,indev=compare_out \
-+   -object filter-redirector,netdev=hn0,id=redire1,queue=rx,outdev=compare0 \
-+   -object iothread,id=iothread1 \
-+   -object colo-compare,id=comp0,primary_in=compare0-0,secondary_in=compare1,\
-+outdev=compare_out0,iothread=iothread1 \
-+   -drive if=ide,id=colo-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\
-+children.0.file.filename=$imagefolder/primary.qcow2,children.0.driver=qcow2 -S
-+
-+2. Secondary:
-+Note: Active and hidden images need to be created only once and the
-+size should be the same as primary.qcow2. Again, you don't need to change
-+any IP's here, except for the $primary_ip variable.
-+
-+# imagefolder="/mnt/vms/colo-test-secondary"
-+# primary_ip=127.0.0.1
-+
-+# qemu-img create -f qcow2 $imagefolder/secondary-active.qcow2 10G
-+
-+# qemu-img create -f qcow2 $imagefolder/secondary-hidden.qcow2 10G
-+
-+# qemu-system-x86_64 -enable-kvm -cpu qemu64,+kvmclock -m 512 -smp 1 -qmp stdio \
-+   -device piix3-usb-uhci -device usb-tablet -name secondary \
-+   -netdev tap,id=hn0,vhost=off,helper=/usr/lib/qemu/qemu-bridge-helper \
-+   -device rtl8139,id=e0,netdev=hn0 \
-+   -chardev socket,id=red0,host=$primary_ip,port=9003,reconnect=1 \
-+   -chardev socket,id=red1,host=$primary_ip,port=9004,reconnect=1 \
-+   -object filter-redirector,id=f1,netdev=hn0,queue=tx,indev=red0 \
-+   -object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1 \
-+   -object filter-rewriter,id=rew0,netdev=hn0,queue=all \
-+   -drive if=none,id=parent0,file.filename=$imagefolder/primary.qcow2,driver=qcow2 \
-+   -drive if=none,id=childs0,driver=replication,mode=secondary,file.driver=qcow2,\
-+top-id=colo-disk0,file.file.filename=$imagefolder/secondary-active.qcow2,\
-+file.backing.driver=qcow2,file.backing.file.filename=$imagefolder/secondary-hidden.qcow2,\
-+file.backing.backing=parent0 \
-+   -drive if=ide,id=colo-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\
-+children.0=childs0 \
-+   -incoming tcp:0.0.0.0:9998
-+
-+
-+3. On Secondary VM's QEMU monitor, issue command
- {'execute':'qmp_capabilities'}
--{ 'execute': 'nbd-server-start',
--  'arguments': {'addr': {'type': 'inet', 'data': {'host': 'xx.xx.xx.xx', 'port': '8889'} } }
--}
--{'execute': 'nbd-server-add', 'arguments': {'device': 'secondary-disk0', 'writable': true } }
-+{'execute': 'nbd-server-start', 'arguments': {'addr': {'type': 'inet', 'data': {'host': '0.0.0.0', 'port': '9999'} } } }
-+{'execute': 'nbd-server-add', 'arguments': {'device': 'parent0', 'writable': true } }
- Note:
-   a. The qmp command nbd-server-start and nbd-server-add must be run
-      before running the qmp command migrate on primary QEMU
-   b. Active disk, hidden disk and nbd target's length should be the
-      same.
--  c. It is better to put active disk and hidden disk in ramdisk.
-+  c. It is better to put active disk and hidden disk in ramdisk. They
-+     will be merged into the parent disk on failover.
--3. On Primary VM's QEMU monitor, issue command:
-+4. On Primary VM's QEMU monitor, issue command:
- {'execute':'qmp_capabilities'}
--{ 'execute': 'human-monitor-command',
--  'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=xx.xx.xx.xx,file.port=8889,file.export=secondary-disk0,node-name=nbd_client0'}}
--{ 'execute':'x-blockdev-change', 'arguments':{'parent': 'primary-disk0', 'node': 'nbd_client0' } }
--{ 'execute': 'migrate-set-capabilities',
--      'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } }
--{ 'execute': 'migrate', 'arguments': {'uri': 'tcp:xx.xx.xx.xx:8888' } }
-+{'execute': 'human-monitor-command', 'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=127.0.0.2,file.port=9999,file.export=parent0,node-name=replication0'}}
-+{'execute': 'x-blockdev-change', 'arguments':{'parent': 'colo-disk0', 'node': 'replication0' } }
-+{'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } }
-+{'execute': 'migrate', 'arguments': {'uri': 'tcp:127.0.0.2:9998' } }
-   Note:
-   a. There should be only one NBD Client for each primary disk.
--  b. xx.xx.xx.xx is the secondary physical machine's hostname or IP
--  c. The qmp command line must be run after running qmp command line in
-+  b. The qmp command line must be run after running qmp command line in
-      secondary qemu.
--4. After the above steps, you will see, whenever you make changes to PVM, SVM will be synced.
-+5. After the above steps, you will see, whenever you make changes to PVM, SVM will be synced.
- You can issue command '{ "execute": "migrate-set-parameters" , "arguments":{ "x-checkpoint-delay": 2000 } }'
--to change the checkpoint period time
-+to change the idle checkpoint period time
-+
-+6. Failover test
-+You can kill one of the VMs and Failover on the surviving VM:
-+
-+If you killed the Secondary, then follow "Primary Failover". After that,
-+if you want to resume the replication, follow "Primary resume replication"
-+
-+If you killed the Primary, then follow "Secondary Failover". After that,
-+if you want to resume the replication, follow "Secondary resume replication"
-+
-+== Primary Failover ==
-+The Secondary died, resume on the Primary
-+
-+{'execute': 'x-blockdev-change', 'arguments':{ 'parent': 'colo-disk0', 'child': 'children.1'} }
-+{'execute': 'human-monitor-command', 'arguments':{ 'command-line': 'drive_del replication0' } }
-+{'execute': 'object-del', 'arguments':{ 'id': 'comp0' } }
-+{'execute': 'object-del', 'arguments':{ 'id': 'iothread1' } }
-+{'execute': 'object-del', 'arguments':{ 'id': 'm0' } }
-+{'execute': 'object-del', 'arguments':{ 'id': 'redire0' } }
-+{'execute': 'object-del', 'arguments':{ 'id': 'redire1' } }
-+{'execute': 'x-colo-lost-heartbeat' }
-+
-+== Secondary Failover ==
-+The Primary died, resume on the Secondary and prepare to become the new Primary
-+
-+{'execute': 'nbd-server-stop'}
-+{'execute': 'x-colo-lost-heartbeat'}
-+
-+{'execute': 'object-del', 'arguments':{ 'id': 'f2' } }
-+{'execute': 'object-del', 'arguments':{ 'id': 'f1' } }
-+{'execute': 'chardev-remove', 'arguments':{ 'id': 'red1' } }
-+{'execute': 'chardev-remove', 'arguments':{ 'id': 'red0' } }
-+
-+{'execute': 'chardev-add', 'arguments':{ 'id': 'mirror0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '0.0.0.0', 'port': '9003' } }, 'server': true } } } }
-+{'execute': 'chardev-add', 'arguments':{ 'id': 'compare1', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '0.0.0.0', 'port': '9004' } }, 'server': true } } } }
-+{'execute': 'chardev-add', 'arguments':{ 'id': 'compare0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9001' } }, 'server': true } } } }
-+{'execute': 'chardev-add', 'arguments':{ 'id': 'compare0-0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9001' } }, 'server': false } } } }
-+{'execute': 'chardev-add', 'arguments':{ 'id': 'compare_out', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9005' } }, 'server': true } } } }
-+{'execute': 'chardev-add', 'arguments':{ 'id': 'compare_out0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9005' } }, 'server': false } } } }
-+
-+== Primary resume replication ==
-+Resume replication after new Secondary is up.
-+
-+Start the new Secondary (Steps 2 and 3 above), then on the Primary:
-+{'execute': 'drive-mirror', 'arguments':{ 'device': 'colo-disk0', 'job-id': 'resync', 'target': 'nbd://127.0.0.2:9999/parent0', 'mode': 'existing', 'format': 'raw', 'sync': 'full'} }
-+
-+Wait until disk is synced, then:
-+{'execute': 'stop'}
-+{'execute': 'block-job-cancel', 'arguments':{ 'device': 'resync'} }
-+
-+{'execute': 'human-monitor-command', 'arguments':{ 'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=127.0.0.2,file.port=9999,file.export=parent0,node-name=replication0'}}
-+{'execute': 'x-blockdev-change', 'arguments':{ 'parent': 'colo-disk0', 'node': 'replication0' } }
-+
-+{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-mirror', 'id': 'm0', 'props': { 'netdev': 'hn0', 'queue': 'tx', 'outdev': 'mirror0' } } }
-+{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire0', 'props': { 'netdev': 'hn0', 'queue': 'rx', 'indev': 'compare_out' } } }
-+{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire1', 'props': { 'netdev': 'hn0', 'queue': 'rx', 'outdev': 'compare0' } } }
-+{'execute': 'object-add', 'arguments':{ 'qom-type': 'iothread', 'id': 'iothread1' } }
-+{'execute': 'object-add', 'arguments':{ 'qom-type': 'colo-compare', 'id': 'comp0', 'props': { 'primary_in': 'compare0-0', 'secondary_in': 'compare1', 'outdev': 'compare_out0', 'iothread': 'iothread1' } } }
-+
-+{'execute': 'migrate-set-capabilities', 'arguments':{ 'capabilities': [ {'capability': 'x-colo', 'state': true } ] } }
-+{'execute': 'migrate', 'arguments':{ 'uri': 'tcp:127.0.0.2:9998' } }
-+
-+Note:
-+If this Primary previously was a Secondary, then we need to insert the
-+filters before the filter-rewriter by using the
-+"'insert': 'before', 'position': 'id=rew0'" Options. See below.
-+
-+== Secondary resume replication ==
-+Become Primary and resume replication after new Secondary is up. Note
-+that now 127.0.0.1 is the Secondary and 127.0.0.2 is the Primary.
-+
-+Start the new Secondary (Steps 2 and 3 above, but with primary_ip=127.0.0.2),
-+then on the old Secondary:
-+{'execute': 'drive-mirror', 'arguments':{ 'device': 'colo-disk0', 'job-id': 'resync', 'target': 'nbd://127.0.0.1:9999/parent0', 'mode': 'existing', 'format': 'raw', 'sync': 'full'} }
-+
-+Wait until disk is synced, then:
-+{'execute': 'stop'}
-+{'execute': 'block-job-cancel', 'arguments':{ 'device': 'resync' } }
--5. Failover test
--You can kill Primary VM and run 'x_colo_lost_heartbeat' in Secondary VM's
--monitor at the same time, then SVM will failover and client will not detect this
--change.
-+{'execute': 'human-monitor-command', 'arguments':{ 'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=127.0.0.1,file.port=9999,file.export=parent0,node-name=replication0'}}
-+{'execute': 'x-blockdev-change', 'arguments':{ 'parent': 'colo-disk0', 'node': 'replication0' } }
--Before issuing '{ "execute": "x-colo-lost-heartbeat" }' command, we have to
--issue block related command to stop block replication.
--Primary:
--  Remove the nbd child from the quorum:
--  { 'execute': 'x-blockdev-change', 'arguments': {'parent': 'colo-disk0', 'child': 'children.1'}}
--  { 'execute': 'human-monitor-command','arguments': {'command-line': 'drive_del blk-buddy0'}}
--  Note: there is no qmp command to remove the blockdev now
-+{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-mirror', 'id': 'm0', 'props': { 'insert': 'before', 'position': 'id=rew0', 'netdev': 'hn0', 'queue': 'tx', 'outdev': 'mirror0' } } }
-+{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire0', 'props': { 'insert': 'before', 'position': 'id=rew0', 'netdev': 'hn0', 'queue': 'rx', 'indev': 'compare_out' } } }
-+{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire1', 'props': { 'insert': 'before', 'position': 'id=rew0', 'netdev': 'hn0', 'queue': 'rx', 'outdev': 'compare0' } } }
-+{'execute': 'object-add', 'arguments':{ 'qom-type': 'iothread', 'id': 'iothread1' } }
-+{'execute': 'object-add', 'arguments':{ 'qom-type': 'colo-compare', 'id': 'comp0', 'props': { 'primary_in': 'compare0-0', 'secondary_in': 'compare1', 'outdev': 'compare_out0', 'iothread': 'iothread1' } } }
--Secondary:
--  The primary host is down, so we should do the following thing:
--  { 'execute': 'nbd-server-stop' }
-+{'execute': 'migrate-set-capabilities', 'arguments':{ 'capabilities': [ {'capability': 'x-colo', 'state': true } ] } }
-+{'execute': 'migrate', 'arguments':{ 'uri': 'tcp:127.0.0.1:9998' } }
- == TODO ==
--1. Support continuous VM replication.
--2. Support shared storage.
--3. Develop the heartbeat part.
--4. Reduce checkpoint VM’s downtime while doing checkpoint.
-+1. Support shared storage.
-+2. Develop the heartbeat part.
-+3. Reduce checkpoint VM’s downtime while doing checkpoint.
-diff --git a/docs/block-replication.txt b/docs/block-replication.txt
-index XXXXXXX..XXXXXXX 100644
---- a/docs/block-replication.txt
-+++ b/docs/block-replication.txt
-@@ -XXX,XX +XXX,XX @@ blocks that are already in QEMU.
-              ^            ||                            .----------
-              |            ||                            | Secondary
-Quorum          ||                            '----------
--         /      \         ||
--        /        \        ||
--   Primary    2 filter
--     disk         ^                                                             virtio-blk
--                  |                                                                  ^
--                3 NBD  ------->  3 NBD                                               |
-+         /      \         ||                                                           virtio-blk
-+        /        \        ||                                                               ^
-+   Primary    2 filter                                                                     |
-+     disk         ^                                                                   7 Quorum
-+                  |                                                                    /
-+                3 NBD  ------->  3 NBD                                                /
-                 client    ||     server                                          2 filter
-                           ||        ^                                                ^
- --------.                 ||        |                                                |
-@@ -XXX,XX +XXX,XX @@ any state that would otherwise be lost by the speculative write-through
- of the NBD server into the secondary disk. So before block replication,
- the primary disk and secondary disk should contain the same data.
-+7) The secondary also has a quorum node, so after secondary failover it
-+can become the new primary and continue replication.
-+
-+
- == Failure Handling ==
- There are 7 internal errors when block replication is running:
-. I/O error on primary disk
-@@ -XXX,XX +XXX,XX @@ Primary:
-      leading whitespace.
-. The qmp command line must be run after running qmp command line in
-      secondary qemu.
--  6. After failover we need remove children.1 (replication driver).
-+  6. After primary failover we need to remove children.1 (replication driver).
- Secondary:
-   -drive if=none,driver=raw,file.filename=1.raw,id=colo1 \
--  -drive if=xxx,id=topxxx,driver=replication,mode=secondary,top-id=topxxx\
-+  -drive if=none,id=childs1,driver=replication,mode=secondary,top-id=childs1
-          file.file.filename=active_disk.qcow2,\
-          file.driver=qcow2,\
-          file.backing.file.filename=hidden_disk.qcow2,\
-          file.backing.driver=qcow2,\
-          file.backing.backing=colo1
-+  -drive if=xxx,driver=quorum,read-pattern=fifo,id=top-disk1,\
-+         vote-threshold=1,children.0=childs1
-   Then run qmp command in secondary qemu:
-     { 'execute': 'nbd-server-start',
-@@ -XXX,XX +XXX,XX @@ Secondary:
-   The primary host is down, so we should do the following thing:
-   { 'execute': 'nbd-server-stop' }
-+Promote Secondary to Primary:
-+  see COLO-FT.txt
-+
- TODO:
--1. Continuous block replication
--2. Shared disk
-+1. Shared disk
---
-.5.0

-[PULL 23/23] l2tpv3: fix RFC number typo in qemu-options.hx
+Deleted patch
-From: Stefan Hajnoczi <stefanha@redhat.com>
-The L2TPv3 RFC number is 3931:
-https://tools.ietf.org/html/rfc3931
-Reported-by: Henrik Johansson <henrikjohansson@rocketmail.com>
-Reviewed-by: Stefan Weil <sw@weilnetz.de>
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Signed-off-by: Jason Wang <jasowang@redhat.com>
----
- qemu-options.hx | 4 ++--
-file changed, 2 insertions(+), 2 deletions(-)
-diff --git a/qemu-options.hx b/qemu-options.hx
-index XXXXXXX..XXXXXXX 100644
---- a/qemu-options.hx
-+++ b/qemu-options.hx
-@@ -XXX,XX +XXX,XX @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
-     "                Linux kernel 3.3+ as well as most routers can talk\n"
-     "                L2TPv3. This transport allows connecting a VM to a VM,\n"
-     "                VM to a router and even VM to Host. It is a nearly-universal\n"
--    "                standard (RFC3391). Note - this implementation uses static\n"
-+    "                standard (RFC3931). Note - this implementation uses static\n"
-     "                pre-configured tunnels (same as the Linux kernel).\n"
-     "                use 'src=' to specify source address\n"
-     "                use 'dst=' to specify destination address\n"
-@@ -XXX,XX +XXX,XX @@ Example (send packets from host's 1.2.3.4):
- @end example
- @item -netdev l2tpv3,id=@var{id},src=@var{srcaddr},dst=@var{dstaddr}[,srcport=@var{srcport}][,dstport=@var{dstport}],txsession=@var{txsession}[,rxsession=@var{rxsession}][,ipv6][,udp][,cookie64][,counter][,pincounter][,txcookie=@var{txcookie}][,rxcookie=@var{rxcookie}][,offset=@var{offset}]
--Configure a L2TPv3 pseudowire host network backend. L2TPv3 (RFC3391) is a
-+Configure a L2TPv3 pseudowire host network backend. L2TPv3 (RFC3931) is a
- popular protocol to transport Ethernet (and other Layer 2) data frames between
- two systems. It is present in routers, firewalls and the Linux kernel
- (from version 3.3 onwards).
---
-.5.0

The following changes since commit e0175b71638cf4398903c0d25f93fe62e0606389:

Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20200228' into staging (2020-02-28 16:39:27 +0000)

are available in the git repository at:

https://github.com/jasowang/qemu.git tags/net-pull-request

for you to fetch changes up to 41aa2e3f9b27fd259a13711545d933a20f1d2f16:

l2tpv3: fix RFC number typo in qemu-options.hx (2020-03-02 15:30:08 +0800)

----------------------------------------------------------------

----------------------------------------------------------------
Bin Meng (1):
      hw: net: cadence_gem: Fix build errors in DB_PRINT()

Finn Thain (14):
      dp8393x: Mask EOL bit from descriptor addresses
      dp8393x: Always use 32-bit accesses
      dp8393x: Clean up endianness hacks
      dp8393x: Have dp8393x_receive() return the packet size
      dp8393x: Update LLFA and CRDA registers from rx descriptor
      dp8393x: Clear RRRA command register bit only when appropriate
      dp8393x: Implement packet size limit and RBAE interrupt
      dp8393x: Don't clobber packet checksum
      dp8393x: Use long-word-aligned RRA pointers in 32-bit mode
      dp8393x: Pad frames to word or long word boundary
      dp8393x: Clear descriptor in_use field to release packet
      dp8393x: Always update RRA pointers and sequence numbers
      dp8393x: Don't reset Silicon Revision register
      dp8393x: Don't stop reception upon RBE interrupt assertion

Lukas Straub (4):
      block/replication.c: Ignore requests after failover
      tests/test-replication.c: Add test for for secondary node continuing replication
      net/filter.c: Add Options to insert filters anywhere in the filter list
      colo: Update Documentation for continuous replication

Stefan Hajnoczi (1):
      l2tpv3: fix RFC number typo in qemu-options.hx

Yuri Benditovich (3):
      e1000e: Avoid hw_error if legacy mode used
      NetRxPkt: Introduce support for additional hash types
      NetRxPkt: fix hash calculation of IPV6 TCP

From: Finn Thain <fthain@telegraphics.com.au>

The Least Significant bit of a descriptor address register is used as
an EOL flag. It has to be masked when the register value is to be used
as an actual address for copying memory around. But when the registers
are to be updated the EOL bit should not be masked.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Laurent Vivier <laurent@vivier.eu>
---
 hw/net/dp8393x.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ do { printf("sonic ERROR: %s: " fmt, __func__ , ## __VA_ARGS__); } while (0)
 #define SONIC_ISR_PINT   0x0800
 #define SONIC_ISR_LCD    0x1000
 
+#define SONIC_DESC_EOL   0x0001
+#define SONIC_DESC_ADDR  0xFFFE
+
 #define TYPE_DP8393X "dp8393x"
 #define DP8393X(obj) OBJECT_CHECK(dp8393xState, (obj), TYPE_DP8393X)
 
@@ -XXX,XX +XXX,XX @@ static uint32_t dp8393x_crba(dp8393xState *s)
 
 static uint32_t dp8393x_crda(dp8393xState *s)
 {
-    return (s->regs[SONIC_URDA] << 16) | s->regs[SONIC_CRDA];
+    return (s->regs[SONIC_URDA] << 16) |
+           (s->regs[SONIC_CRDA] & SONIC_DESC_ADDR);
 }
 
 static uint32_t dp8393x_rbwc(dp8393xState *s)
@@ -XXX,XX +XXX,XX @@ static uint32_t dp8393x_tsa(dp8393xState *s)
 
 static uint32_t dp8393x_ttda(dp8393xState *s)
 {
-    return (s->regs[SONIC_UTDA] << 16) | s->regs[SONIC_TTDA];
+    return (s->regs[SONIC_UTDA] << 16) |
+           (s->regs[SONIC_TTDA] & SONIC_DESC_ADDR);
 }
 
 static uint32_t dp8393x_wt(dp8393xState *s)
@@ -XXX,XX +XXX,XX @@ static void dp8393x_do_transmit_packets(dp8393xState *s)
                                MEMTXATTRS_UNSPECIFIED, s->data,
                                size);
             s->regs[SONIC_CTDA] = dp8393x_get(s, width, 0) & ~0x1;
-            if (dp8393x_get(s, width, 0) & 0x1) {
+            if (dp8393x_get(s, width, 0) & SONIC_DESC_EOL) {
                 /* EOL detected */
                 break;
             }
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
     /* XXX: Check byte ordering */
 
     /* Check for EOL */
-    if (s->regs[SONIC_LLFA] & 0x1) {
+    if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) {
         /* Are we still in resource exhaustion? */
         size = sizeof(uint16_t) * 1 * width;
         address = dp8393x_crda(s) + sizeof(uint16_t) * 5 * width;
         address_space_read(&s->as, address, MEMTXATTRS_UNSPECIFIED,
                            s->data, size);
-        if (dp8393x_get(s, width, 0) & 0x1) {
+        if (dp8393x_get(s, width, 0) & SONIC_DESC_EOL) {
             /* Still EOL ; stop reception */
             return -1;
         } else {
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
                        dp8393x_crda(s) + sizeof(uint16_t) * 5 * width,
                        MEMTXATTRS_UNSPECIFIED, s->data, size);
     s->regs[SONIC_LLFA] = dp8393x_get(s, width, 0);
-    if (s->regs[SONIC_LLFA] & 0x1) {
+    if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) {
         /* EOL detected */
         s->regs[SONIC_ISR] |= SONIC_ISR_RDE;
     } else {
-- 
2.5.0

From: Finn Thain <fthain@telegraphics.com.au>

The DP83932 and DP83934 have 32 data lines. The datasheet says,

Data Bus: These bidirectional lines are used to transfer data on the
    system bus. When the SONIC is a bus master, 16-bit data is transferred
    on D15-D0 and 32-bit data is transferred on D31-D0. When the SONIC is
    accessed as a slave, register data is driven onto lines D15-D0.
    D31-D16 are held TRI-STATE if SONIC is in 16-bit mode. If SONIC is in
    32-bit mode, they are driven, but invalid.

Always use 32-bit accesses both as bus master and bus slave.

Force the MSW to zero in bus master mode.

This gets the Linux 'jazzsonic' driver working, and avoids the need for
prior hacks to make the NetBSD 'sn' driver work.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/dp8393x.c | 47 +++++++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 18 deletions(-)

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ static void dp8393x_put(dp8393xState *s, int width, int offset,
                         uint16_t val)
 {
     if (s->big_endian) {
-        s->data[offset * width + width - 1] = cpu_to_be16(val);
+        if (width == 2) {
+            s->data[offset * 2] = 0;
+            s->data[offset * 2 + 1] = cpu_to_be16(val);
+        } else {
+            s->data[offset] = cpu_to_be16(val);
+        }
     } else {
-        s->data[offset * width] = cpu_to_le16(val);
+        if (width == 2) {
+            s->data[offset * 2] = cpu_to_le16(val);
+            s->data[offset * 2 + 1] = 0;
+        } else {
+            s->data[offset] = cpu_to_le16(val);
+        }
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static uint64_t dp8393x_read(void *opaque, hwaddr addr, unsigned int size)
 
     DPRINTF("read 0x%04x from reg %s\n", val, reg_names[reg]);
 
-    return val;
+    return s->big_endian ? val << 16 : val;
 }
 
 static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
@@ -XXX,XX +XXX,XX @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
 {
     dp8393xState *s = opaque;
     int reg = addr >> s->it_shift;
+    uint32_t val = s->big_endian ? data >> 16 : data;
 
-    DPRINTF("write 0x%04x to reg %s\n", (uint16_t)data, reg_names[reg]);
+    DPRINTF("write 0x%04x to reg %s\n", (uint16_t)val, reg_names[reg]);
 
     switch (reg) {
         /* Command register */
         case SONIC_CR:
-            dp8393x_do_command(s, data);
+            dp8393x_do_command(s, val);
             break;
         /* Prevent write to read-only registers */
         case SONIC_CAP2:
@@ -XXX,XX +XXX,XX @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
         /* Accept write to some registers only when in reset mode */
         case SONIC_DCR:
             if (s->regs[SONIC_CR] & SONIC_CR_RST) {
-                s->regs[reg] = data & 0xbfff;
+                s->regs[reg] = val & 0xbfff;
             } else {
                 DPRINTF("writing to DCR invalid\n");
             }
             break;
         case SONIC_DCR2:
             if (s->regs[SONIC_CR] & SONIC_CR_RST) {
-                s->regs[reg] = data & 0xf017;
+                s->regs[reg] = val & 0xf017;
             } else {
                 DPRINTF("writing to DCR2 invalid\n");
             }
             break;
         /* 12 lower bytes are Read Only */
         case SONIC_TCR:
-            s->regs[reg] = data & 0xf000;
+            s->regs[reg] = val & 0xf000;
             break;
         /* 9 lower bytes are Read Only */
         case SONIC_RCR:
-            s->regs[reg] = data & 0xffe0;
+            s->regs[reg] = val & 0xffe0;
             break;
         /* Ignore most significant bit */
         case SONIC_IMR:
-            s->regs[reg] = data & 0x7fff;
+            s->regs[reg] = val & 0x7fff;
             dp8393x_update_irq(s);
             break;
         /* Clear bits by writing 1 to them */
         case SONIC_ISR:
-            data &= s->regs[reg];
-            s->regs[reg] &= ~data;
-            if (data & SONIC_ISR_RBE) {
+            val &= s->regs[reg];
+            s->regs[reg] &= ~val;
+            if (val & SONIC_ISR_RBE) {
                 dp8393x_do_read_rra(s);
             }
             dp8393x_update_irq(s);
@@ -XXX,XX +XXX,XX @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
         case SONIC_REA:
         case SONIC_RRP:
         case SONIC_RWP:
-            s->regs[reg] = data & 0xfffe;
+            s->regs[reg] = val & 0xfffe;
             break;
         /* Invert written value for some registers */
         case SONIC_CRCT:
         case SONIC_FAET:
         case SONIC_MPT:
-            s->regs[reg] = data ^ 0xffff;
+            s->regs[reg] = val ^ 0xffff;
             break;
         /* All other registers have no special contrainst */
         default:
-            s->regs[reg] = data;
+            s->regs[reg] = val;
     }
 
     if (reg == SONIC_WT0 || reg == SONIC_WT1) {
@@ -XXX,XX +XXX,XX @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
 static const MemoryRegionOps dp8393x_ops = {
     .read = dp8393x_read,
     .write = dp8393x_write,
-    .impl.min_access_size = 2,
-    .impl.max_access_size = 2,
+    .impl.min_access_size = 4,
+    .impl.max_access_size = 4,
     .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
-- 
2.5.0

From: Finn Thain <fthain@telegraphics.com.au>

According to the datasheet, section 3.4.4, "in 32-bit mode ... the SONIC
always writes long words".

Therefore, use the same technique for the 'in_use' field that is used
everywhere else, and write the full long word.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/dp8393x.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
         return -1;
     }
 
-    /* XXX: Check byte ordering */
-
     /* Check for EOL */
     if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) {
         /* Are we still in resource exhaustion? */
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
         /* EOL detected */
         s->regs[SONIC_ISR] |= SONIC_ISR_RDE;
     } else {
-        /* Clear in_use, but it is always 16bit wide */
-        int offset = dp8393x_crda(s) + sizeof(uint16_t) * 6 * width;
-        if (s->big_endian && width == 2) {
-            /* we need to adjust the offset of the 16bit field */
-            offset += sizeof(uint16_t);
-        }
-        s->data[0] = 0;
-        address_space_write(&s->as, offset, MEMTXATTRS_UNSPECIFIED,
-                            s->data, sizeof(uint16_t));
+        /* Clear in_use */
+        size = sizeof(uint16_t) * width;
+        address = dp8393x_crda(s) + sizeof(uint16_t) * 6 * width;
+        dp8393x_put(s, width, 0, 0);
+        address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED,
+                            s->data, size);
         s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA];
         s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX;
         s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | (((s->regs[SONIC_RSC] & 0x00ff) + 1) & 0x00ff);
-- 
2.5.0

From: Finn Thain <fthain@telegraphics.com.au>

This function re-uses its 'size' argument as a scratch variable.
Instead, declare a local 'size' variable for that purpose so that the
function result doesn't get messed up.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Tested-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/dp8393x.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ static int dp8393x_receive_filter(dp8393xState *s, const uint8_t * buf,
 }
 
 static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
-                               size_t size)
+                               size_t pkt_size)
 {
     dp8393xState *s = qemu_get_nic_opaque(nc);
     int packet_type;
     uint32_t available, address;
-    int width, rx_len = size;
+    int width, rx_len = pkt_size;
     uint32_t checksum;
+    int size;
 
     width = (s->regs[SONIC_DCR] & SONIC_DCR_DW) ? 2 : 1;
 
     s->regs[SONIC_RCR] &= ~(SONIC_RCR_PRX | SONIC_RCR_LBK | SONIC_RCR_FAER |
         SONIC_RCR_CRCR | SONIC_RCR_LPKT | SONIC_RCR_BC | SONIC_RCR_MC);
 
-    packet_type = dp8393x_receive_filter(s, buf, size);
+    packet_type = dp8393x_receive_filter(s, buf, pkt_size);
     if (packet_type < 0) {
         DPRINTF("packet not for netcard\n");
         return -1;
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
     /* Done */
     dp8393x_update_irq(s);
 
-    return size;
+    return pkt_size;
 }
 
 static void dp8393x_reset(DeviceState *dev)
-- 
2.5.0

From: Finn Thain <fthain@telegraphics.com.au>

Follow the algorithm given in the National Semiconductor DP83932C
datasheet in section 3.4.7:

At the next reception, the SONIC re-reads the last RXpkt.link field,
    and updates its CRDA register to point to the next descriptor.

The chip is designed to allow the host to provide a new list of
descriptors in this way.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/dp8393x.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
         address = dp8393x_crda(s) + sizeof(uint16_t) * 5 * width;
         address_space_read(&s->as, address, MEMTXATTRS_UNSPECIFIED,
                            s->data, size);
-        if (dp8393x_get(s, width, 0) & SONIC_DESC_EOL) {
+        s->regs[SONIC_LLFA] = dp8393x_get(s, width, 0);
+        if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) {
             /* Still EOL ; stop reception */
             return -1;
-        } else {
-            s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA];
         }
+        /* Link has been updated by host */
+        s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA];
     }
 
     /* Save current position */
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
                         MEMTXATTRS_UNSPECIFIED,
                         s->data, size);
 
-    /* Move to next descriptor */
+    /* Check link field */
     size = sizeof(uint16_t) * width;
     address_space_read(&s->as,
                        dp8393x_crda(s) + sizeof(uint16_t) * 5 * width,
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
         dp8393x_put(s, width, 0, 0);
         address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED,
                             s->data, size);
+
+        /* Move to next descriptor */
         s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA];
         s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX;
         s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | (((s->regs[SONIC_RSC] & 0x00ff) + 1) & 0x00ff);
-- 
2.5.0

From: Finn Thain <fthain@telegraphics.com.au>

It doesn't make sense to clear the command register bit unless the
command was actually issued.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Tested-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/dp8393x.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ static void dp8393x_do_read_rra(dp8393xState *s)
         s->regs[SONIC_ISR] |= SONIC_ISR_RBE;
         dp8393x_update_irq(s);
     }
-
-    /* Done */
-    s->regs[SONIC_CR] &= ~SONIC_CR_RRRA;
 }
 
 static void dp8393x_do_software_reset(dp8393xState *s)
@@ -XXX,XX +XXX,XX @@ static void dp8393x_do_command(dp8393xState *s, uint16_t command)
         dp8393x_do_start_timer(s);
     if (command & SONIC_CR_RST)
         dp8393x_do_software_reset(s);
-    if (command & SONIC_CR_RRRA)
+    if (command & SONIC_CR_RRRA) {
         dp8393x_do_read_rra(s);
+        s->regs[SONIC_CR] &= ~SONIC_CR_RRRA;
+    }
     if (command & SONIC_CR_LCAM)
         dp8393x_do_load_cam(s);
 }
-- 
2.5.0

From: Finn Thain <fthain@telegraphics.com.au>

Add a bounds check to prevent a large packet from causing a buffer
overflow. This is defensive programming -- I haven't actually tried
sending an oversized packet or a jumbo ethernet frame.

The SONIC handles packets that are too big for the buffer by raising
the RBAE interrupt and dropping them. Linux uses that interrupt to
count dropped packets.

From: Finn Thain <fthain@telegraphics.com.au>

A received packet consumes pkt_size bytes in the buffer and the frame
checksum that's appended to it consumes another 4 bytes. The Receive
Buffer Address register takes the former quantity into account but
not the latter. So the next packet written to the buffer overwrites
the frame checksum. Fix this.

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
     address += rx_len;
     address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED,
                         &checksum, 4);
+    address += 4;
     rx_len += 4;
     s->regs[SONIC_CRBA1] = address >> 16;
     s->regs[SONIC_CRBA0] = address & 0xffff;
-- 
2.5.0

From: Finn Thain <fthain@telegraphics.com.au>

Section 3.4.1 of the datasheet says,

The alignment of the RRA is confined to either word or long word
    boundaries, depending upon the data width mode. In 16-bit mode,
    the RRA must be aligned to a word boundary (A0 is always zero)
    and in 32-bit mode, the RRA is aligned to a long word boundary
    (A0 and A1 are always zero).

This constraint has been implemented for 16-bit mode; implement it
for 32-bit mode too.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/dp8393x.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
                 qemu_flush_queued_packets(qemu_get_queue(s->nic));
             }
             break;
-        /* Ignore least significant bit */
+        /* The guest is required to store aligned pointers here */
         case SONIC_RSA:
         case SONIC_REA:
         case SONIC_RRP:
         case SONIC_RWP:
-            s->regs[reg] = val & 0xfffe;
+            if (s->regs[SONIC_DCR] & SONIC_DCR_DW) {
+                s->regs[reg] = val & 0xfffc;
+            } else {
+                s->regs[reg] = val & 0xfffe;
+            }
             break;
         /* Invert written value for some registers */
         case SONIC_CRCT:
-- 
2.5.0

From: Finn Thain <fthain@telegraphics.com.au>

The existing code has a bug where the Remaining Buffer Word Count (RBWC)
is calculated with a truncating division, which gives the wrong result
for odd-sized packets.

Section 1.4.1 of the datasheet says,

Once the end of the packet has been reached, the serializer will
    fill out the last word (16-bit mode) or long word (32-bit mode)
    if the last byte did not end on a word or long word boundary
    respectively. The fill byte will be 0FFh.

Implement buffer padding so that buffer limits are correctly enforced.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/dp8393x.c | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
     dp8393xState *s = qemu_get_nic_opaque(nc);
     int packet_type;
     uint32_t available, address;
-    int width, rx_len = pkt_size;
+    int width, rx_len, padded_len;
     uint32_t checksum;
     int size;
 
-    width = (s->regs[SONIC_DCR] & SONIC_DCR_DW) ? 2 : 1;
-
     s->regs[SONIC_RCR] &= ~(SONIC_RCR_PRX | SONIC_RCR_LBK | SONIC_RCR_FAER |
         SONIC_RCR_CRCR | SONIC_RCR_LPKT | SONIC_RCR_BC | SONIC_RCR_MC);
 
-    if (pkt_size + 4 > dp8393x_rbwc(s) * 2) {
+    rx_len = pkt_size + sizeof(checksum);
+    if (s->regs[SONIC_DCR] & SONIC_DCR_DW) {
+        width = 2;
+        padded_len = ((rx_len - 1) | 3) + 1;
+    } else {
+        width = 1;
+        padded_len = ((rx_len - 1) | 1) + 1;
+    }
+
+    if (padded_len > dp8393x_rbwc(s) * 2) {
         DPRINTF("oversize packet, pkt_size is %d\n", pkt_size);
         s->regs[SONIC_ISR] |= SONIC_ISR_RBAE;
         dp8393x_update_irq(s);
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
     s->regs[SONIC_TRBA0] = s->regs[SONIC_CRBA0];
 
     /* Calculate the ethernet checksum */
-    checksum = cpu_to_le32(crc32(0, buf, rx_len));
+    checksum = cpu_to_le32(crc32(0, buf, pkt_size));
 
     /* Put packet into RBA */
     DPRINTF("Receive packet at %08x\n", dp8393x_crba(s));
     address = dp8393x_crba(s);
     address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED,
-                        buf, rx_len);
-    address += rx_len;
+                        buf, pkt_size);
+    address += pkt_size;
+
+    /* Put frame checksum into RBA */
     address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED,
-                        &checksum, 4);
-    address += 4;
-    rx_len += 4;
+                        &checksum, sizeof(checksum));
+    address += sizeof(checksum);
+
+    /* Pad short packets to keep pointers aligned */
+    if (rx_len < padded_len) {
+        size = padded_len - rx_len;
+        address_space_rw(&s->as, address, MEMTXATTRS_UNSPECIFIED,
+            (uint8_t *)"\xFF\xFF\xFF", size, 1);
+        address += size;
+    }
+
     s->regs[SONIC_CRBA1] = address >> 16;
     s->regs[SONIC_CRBA0] = address & 0xffff;
     available = dp8393x_rbwc(s);
-    available -= rx_len / 2;
+    available -= padded_len >> 1;
     s->regs[SONIC_RBWC1] = available >> 16;
     s->regs[SONIC_RBWC0] = available & 0xffff;
 
-- 
2.5.0

From: Finn Thain <fthain@telegraphics.com.au>

When the SONIC receives a packet into the last available descriptor, it
retains ownership of that descriptor for as long as necessary.

Section 3.4.7 of the datasheet says,

When the system appends more descriptors, the SONIC releases ownership
    of the descriptor after writing 0000h to the RXpkt.in_use field.

The packet can now be processed by the host, so raise a PKTRX interrupt,
just like the normal case.

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
             return -1;
         }
         /* Link has been updated by host */
+
+        /* Clear in_use */
+        size = sizeof(uint16_t) * width;
+        address = dp8393x_crda(s) + sizeof(uint16_t) * 6 * width;
+        dp8393x_put(s, width, 0, 0);
+        address_space_rw(&s->as, address, MEMTXATTRS_UNSPECIFIED,
+                         (uint8_t *)s->data, size, 1);
+
+        /* Move to next descriptor */
         s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA];
+        s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX;
     }
 
     /* Save current position */
-- 
2.5.0

From: Finn Thain <fthain@telegraphics.com.au>

These operations need to take place regardless of whether or not
rx descriptors have been used up (that is, EOL flag was observed).

The algorithm is now the same for a packet that was withheld as for
a packet that was not.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/dp8393x.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
         /* Move to next descriptor */
         s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA];
         s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX;
-        s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | (((s->regs[SONIC_RSC] & 0x00ff) + 1) & 0x00ff);
+    }
 
-        if (s->regs[SONIC_RCR] & SONIC_RCR_LPKT) {
-            /* Read next RRA */
-            dp8393x_do_read_rra(s);
-        }
+    s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) |
+                         ((s->regs[SONIC_RSC] + 1) & 0x00ff);
+
+    if (s->regs[SONIC_RCR] & SONIC_RCR_LPKT) {
+        /* Read next RRA */
+        dp8393x_do_read_rra(s);
     }
 
     /* Done */
-- 
2.5.0

From: Finn Thain <fthain@telegraphics.com.au>

The jazzsonic driver in Linux uses the Silicon Revision register value
to probe the chip. The driver fails unless the SR register contains 4.
Unfortunately, reading this register in QEMU usually returns 0 because
the s->regs[] array gets wiped after a software reset.

Fixes: bd8f1ebce4 ("net/dp8393x: fix hardware reset")
Suggested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/dp8393x.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ static void dp8393x_reset(DeviceState *dev)
     timer_del(s->watchdog);
 
     memset(s->regs, 0, sizeof(s->regs));
+    s->regs[SONIC_SR] = 0x0004; /* only revision recognized by Linux/mips */
     s->regs[SONIC_CR] = SONIC_CR_RST | SONIC_CR_STP | SONIC_CR_RXDIS;
     s->regs[SONIC_DCR] &= ~(SONIC_DCR_EXBUS | SONIC_DCR_LBR);
     s->regs[SONIC_RCR] &= ~(SONIC_RCR_LB0 | SONIC_RCR_LB1 | SONIC_RCR_BRD | SONIC_RCR_RNT);
@@ -XXX,XX +XXX,XX @@ static void dp8393x_realize(DeviceState *dev, Error **errp)
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
 
     s->watchdog = timer_new_ns(QEMU_CLOCK_VIRTUAL, dp8393x_watchdog, s);
-    s->regs[SONIC_SR] = 0x0004; /* only revision recognized by Linux */
 
     memory_region_init_ram(&s->prom, OBJECT(dev),
                            "dp8393x-prom", SONIC_PROM_SIZE, &local_err);
-- 
2.5.0

From: Finn Thain <fthain@telegraphics.com.au>

Section 3.4.7 of the datasheet explains that,

The RBE bit in the Interrupt Status register is set when the
    SONIC finishes using the second to last receive buffer and reads
    the last RRA descriptor. Actually, the SONIC is not truly out of
    resources, but gives the system an early warning of an impending
    out of resources condition.

RBE does not mean actual receive buffer exhaustion, and reception should
not be stopped. This is important because Linux will not check and clear
the RBE interrupt until it receives another packet. But that won't
happen if can_receive returns false. This bug causes the SONIC to become
deaf (until reset).

Fix this with a new flag to indicate actual receive buffer exhaustion.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/dp8393x.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/dp8393x.c
+++ b/hw/net/dp8393x.c
@@ -XXX,XX +XXX,XX @@ typedef struct dp8393xState {
     /* Hardware */
     uint8_t it_shift;
     bool big_endian;
+    bool last_rba_is_full;
     qemu_irq irq;
 #ifdef DEBUG_SONIC
     int irq_level;
@@ -XXX,XX +XXX,XX @@ static void dp8393x_do_read_rra(dp8393xState *s)
         s->regs[SONIC_RRP] = s->regs[SONIC_RSA];
     }
 
-    /* Check resource exhaustion */
+    /* Warn the host if CRBA now has the last available resource */
     if (s->regs[SONIC_RRP] == s->regs[SONIC_RWP])
     {
         s->regs[SONIC_ISR] |= SONIC_ISR_RBE;
         dp8393x_update_irq(s);
     }
+
+    /* Allow packet reception */
+    s->last_rba_is_full = false;
 }
 
 static void dp8393x_do_software_reset(dp8393xState *s)
@@ -XXX,XX +XXX,XX @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data,
                 dp8393x_do_read_rra(s);
             }
             dp8393x_update_irq(s);
-            if (dp8393x_can_receive(s->nic->ncs)) {
-                qemu_flush_queued_packets(qemu_get_queue(s->nic));
-            }
             break;
         /* The guest is required to store aligned pointers here */
         case SONIC_RSA:
@@ -XXX,XX +XXX,XX @@ static int dp8393x_can_receive(NetClientState *nc)
 
     if (!(s->regs[SONIC_CR] & SONIC_CR_RXEN))
         return 0;
-    if (s->regs[SONIC_ISR] & SONIC_ISR_RBE)
-        return 0;
     return 1;
 }
 
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
     s->regs[SONIC_RCR] &= ~(SONIC_RCR_PRX | SONIC_RCR_LBK | SONIC_RCR_FAER |
         SONIC_RCR_CRCR | SONIC_RCR_LPKT | SONIC_RCR_BC | SONIC_RCR_MC);
 
+    if (s->last_rba_is_full) {
+        return pkt_size;
+    }
+
     rx_len = pkt_size + sizeof(checksum);
     if (s->regs[SONIC_DCR] & SONIC_DCR_DW) {
         width = 2;
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
         DPRINTF("oversize packet, pkt_size is %d\n", pkt_size);
         s->regs[SONIC_ISR] |= SONIC_ISR_RBAE;
         dp8393x_update_irq(s);
-        dp8393x_do_read_rra(s);
-        return pkt_size;
+        s->regs[SONIC_RCR] |= SONIC_RCR_LPKT;
+        goto done;
     }
 
     packet_type = dp8393x_receive_filter(s, buf, pkt_size);
@@ -XXX,XX +XXX,XX @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf,
         s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX;
     }
 
+    dp8393x_update_irq(s);
+
     s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) |
                          ((s->regs[SONIC_RSC] + 1) & 0x00ff);
 
+done:
+
     if (s->regs[SONIC_RCR] & SONIC_RCR_LPKT) {
-        /* Read next RRA */
-        dp8393x_do_read_rra(s);
+        if (s->regs[SONIC_RRP] == s->regs[SONIC_RWP]) {
+            /* Stop packet reception */
+            s->last_rba_is_full = true;
+        } else {
+            /* Read next resource */
+            dp8393x_do_read_rra(s);
+        }
     }
 
-    /* Done */
-    dp8393x_update_irq(s);
-
     return pkt_size;
 }
 
-- 
2.5.0

From: Yuri Benditovich <yuri.benditovich@daynix.com>

https://bugzilla.redhat.com/show_bug.cgi?id=1787142
The emulation raises hw_error if the PSRCTL register
is written with, for example, a zero value.
Such a configuration does not present any problem when
the DTYP bits of the RCTL register select the legacy format
of transfer descriptors. This commit skips the checks
for BSIZE0 and BSIZE1 when legacy mode is used.

Acked-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/e1000e_core.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -XXX,XX +XXX,XX @@ e1000e_set_eitr(E1000ECore *core, int index, uint32_t val)
 static void
 e1000e_set_psrctl(E1000ECore *core, int index, uint32_t val)
 {
-    if ((val & E1000_PSRCTL_BSIZE0_MASK) == 0) {
-        hw_error("e1000e: PSRCTL.BSIZE0 cannot be zero");
-    }
+    if (core->mac[RCTL] & E1000_RCTL_DTYP_MASK) {
+
+        if ((val & E1000_PSRCTL_BSIZE0_MASK) == 0) {
+            hw_error("e1000e: PSRCTL.BSIZE0 cannot be zero");
+        }
 
-    if ((val & E1000_PSRCTL_BSIZE1_MASK) == 0) {
-        hw_error("e1000e: PSRCTL.BSIZE1 cannot be zero");
+        if ((val & E1000_PSRCTL_BSIZE1_MASK) == 0) {
+            hw_error("e1000e: PSRCTL.BSIZE1 cannot be zero");
+        }
     }
 
     core->mac[PSRCTL] = val;
-- 
2.5.0

From: Yuri Benditovich <yuri.benditovich@daynix.com>

Add support for the following hash types:
IPV6 TCP with extension headers
IPV4 UDP
IPV6 UDP
IPV6 UDP with extension headers

Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Acked-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/net_rx_pkt.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 hw/net/net_rx_pkt.h |  6 +++++-
 hw/net/trace-events |  4 ++++
 3 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/net_rx_pkt.c
+++ b/hw/net/net_rx_pkt.c
@@ -XXX,XX +XXX,XX @@ _net_rx_rss_prepare_tcp(uint8_t *rss_input,
                           &tcphdr->th_dport, sizeof(uint16_t));
 }
 
+static inline void
+_net_rx_rss_prepare_udp(uint8_t *rss_input,
+                        struct NetRxPkt *pkt,
+                        size_t *bytes_written)
+{
+    struct udp_header *udphdr = &pkt->l4hdr_info.hdr.udp;
+
+    _net_rx_rss_add_chunk(rss_input, bytes_written,
+                          &udphdr->uh_sport, sizeof(uint16_t));
+
+    _net_rx_rss_add_chunk(rss_input, bytes_written,
+                          &udphdr->uh_dport, sizeof(uint16_t));
+}
+
 uint32_t
 net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt,
                          NetRxPktRssType type,
@@ -XXX,XX +XXX,XX @@ net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt,
         trace_net_rx_pkt_rss_ip6_ex();
         _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
         break;
+    case NetPktRssIpV6TcpEx:
+        assert(pkt->isip6);
+        assert(pkt->istcp);
+        trace_net_rx_pkt_rss_ip6_ex_tcp();
+        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
+        _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length);
+        break;
+    case NetPktRssIpV4Udp:
+        assert(pkt->isip4);
+        assert(pkt->isudp);
+        trace_net_rx_pkt_rss_ip4_udp();
+        _net_rx_rss_prepare_ip4(&rss_input[0], pkt, &rss_length);
+        _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length);
+        break;
+    case NetPktRssIpV6Udp:
+        assert(pkt->isip6);
+        assert(pkt->isudp);
+        trace_net_rx_pkt_rss_ip6_udp();
+        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length);
+        _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length);
+        break;
+    case NetPktRssIpV6UdpEx:
+        assert(pkt->isip6);
+        assert(pkt->isudp);
+        trace_net_rx_pkt_rss_ip6_ex_udp();
+        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
+        _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length);
+        break;
     default:
         assert(false);
         break;
diff --git a/hw/net/net_rx_pkt.h b/hw/net/net_rx_pkt.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/net_rx_pkt.h
+++ b/hw/net/net_rx_pkt.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
     NetPktRssIpV4Tcp,
     NetPktRssIpV6Tcp,
     NetPktRssIpV6,
-    NetPktRssIpV6Ex
+    NetPktRssIpV6Ex,
+    NetPktRssIpV6TcpEx,
+    NetPktRssIpV4Udp,
+    NetPktRssIpV6Udp,
+    NetPktRssIpV6UdpEx,
 } NetRxPktRssType;
 
 /**
diff --git a/hw/net/trace-events b/hw/net/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -XXX,XX +XXX,XX @@ net_rx_pkt_l3_csum_validate_csum(size_t l3hdr_off, uint32_t csl, uint32_t cntr,
 
 net_rx_pkt_rss_ip4(void) "Calculating IPv4 RSS  hash"
 net_rx_pkt_rss_ip4_tcp(void) "Calculating IPv4/TCP RSS  hash"
+net_rx_pkt_rss_ip4_udp(void) "Calculating IPv4/UDP RSS  hash"
 net_rx_pkt_rss_ip6_tcp(void) "Calculating IPv6/TCP RSS  hash"
+net_rx_pkt_rss_ip6_udp(void) "Calculating IPv6/UDP RSS  hash"
 net_rx_pkt_rss_ip6(void) "Calculating IPv6 RSS  hash"
 net_rx_pkt_rss_ip6_ex(void) "Calculating IPv6/EX RSS  hash"
+net_rx_pkt_rss_ip6_ex_tcp(void) "Calculating IPv6/EX/TCP RSS  hash"
+net_rx_pkt_rss_ip6_ex_udp(void) "Calculating IPv6/EX/UDP RSS  hash"
 net_rx_pkt_rss_hash(size_t rss_length, uint32_t rss_hash) "RSS hash for %zu bytes: 0x%X"
 net_rx_pkt_rss_add_chunk(void* ptr, size_t size, size_t input_offset) "Add RSS chunk %p, %zu bytes, RSS input offset %zu bytes"
 
-- 
2.5.0

From: Yuri Benditovich <yuri.benditovich@daynix.com>

When requested to calculate the hash for a TCPv6 packet,
ignore overrides of source and destination addresses
in extension headers.
Use these overrides when the new hash type NetPktRssIpV6TcpEx
is requested.
Use this type in e1000e hash calculation for IPv6 TCP, which
should take into account overrides of the addresses.

Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Acked-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/e1000e_core.c | 2 +-
 hw/net/net_rx_pkt.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -XXX,XX +XXX,XX @@ e1000e_rss_calc_hash(E1000ECore *core,
         type = NetPktRssIpV4Tcp;
         break;
     case E1000_MRQ_RSS_TYPE_IPV6TCP:
-        type = NetPktRssIpV6Tcp;
+        type = NetPktRssIpV6TcpEx;
         break;
     case E1000_MRQ_RSS_TYPE_IPV6:
         type = NetPktRssIpV6;
diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/net_rx_pkt.c
+++ b/hw/net/net_rx_pkt.c
@@ -XXX,XX +XXX,XX @@ net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt,
         assert(pkt->isip6);
         assert(pkt->istcp);
         trace_net_rx_pkt_rss_ip6_tcp();
-        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length);
+        _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length);
         _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length);
         break;
     case NetPktRssIpV6:
-- 
2.5.0

From: Bin Meng <bmeng.cn@gmail.com>

When CADENCE_GEM_ERR_DEBUG is turned on, there are several
compilation errors in DB_PRINT(). Fix them.

While we are here, update to use appropriate modifiers in
the same DB_PRINT() call.

Signed-off-by: Bin Meng <bmeng.cn@gmail.com>
Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/cadence_gem.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/cadence_gem.c
+++ b/hw/net/cadence_gem.c
@@ -XXX,XX +XXX,XX @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size)
             return -1;
         }
 
-        DB_PRINT("copy %d bytes to 0x%x\n", MIN(bytes_to_copy, rxbufsize),
-                rx_desc_get_buffer(s->rx_desc[q]));
+        DB_PRINT("copy %u bytes to 0x%" PRIx64 "\n",
+                 MIN(bytes_to_copy, rxbufsize),
+                 rx_desc_get_buffer(s, s->rx_desc[q]));
 
         /* Copy packet data to emulated DMA buffer */
         address_space_write(&s->dma_as, rx_desc_get_buffer(s, s->rx_desc[q]) +
@@ -XXX,XX +XXX,XX @@ static void gem_transmit(CadenceGEMState *s)
 
             if (tx_desc_get_length(desc) > sizeof(tx_packet) -
                                                (p - tx_packet)) {
-                DB_PRINT("TX descriptor @ 0x%x too large: size 0x%x space " \
-                         "0x%x\n", (unsigned)packet_desc_addr,
-                         (unsigned)tx_desc_get_length(desc),
+                DB_PRINT("TX descriptor @ 0x%" HWADDR_PRIx \
+                         " too large: size 0x%x space 0x%zx\n",
+                         packet_desc_addr, tx_desc_get_length(desc),
                          sizeof(tx_packet) - (p - tx_packet));
                 break;
             }
-- 
2.5.0

From: Lukas Straub <lukasstraub2@web.de>

After failover the Secondary side of replication shouldn't change state, because
it now functions as our primary disk.

In replication_start, replication_do_checkpoint, replication_stop, ignore
the request if current state is BLOCK_REPLICATION_DONE (successful failover) or
BLOCK_REPLICATION_FAILOVER (failover in progress, i.e. currently merging active
and hidden images into the base image).

Signed-off-by: Lukas Straub <lukasstraub2@web.de>
Reviewed-by: Zhang Chen <chen.zhang@intel.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 block/replication.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/block/replication.c b/block/replication.c
index XXXXXXX..XXXXXXX 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -XXX,XX +XXX,XX @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
     aio_context_acquire(aio_context);
     s = bs->opaque;
 
+    if (s->stage == BLOCK_REPLICATION_DONE ||
+        s->stage == BLOCK_REPLICATION_FAILOVER) {
+        /*
+         * This case happens when a secondary is promoted to primary.
+         * Ignore the request because the secondary side of replication
+         * doesn't have to do anything anymore.
+         */
+        aio_context_release(aio_context);
+        return;
+    }
+
     if (s->stage != BLOCK_REPLICATION_NONE) {
         error_setg(errp, "Block replication is running or done");
         aio_context_release(aio_context);
@@ -XXX,XX +XXX,XX @@ static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
     aio_context_acquire(aio_context);
     s = bs->opaque;
 
+    if (s->stage == BLOCK_REPLICATION_DONE ||
+        s->stage == BLOCK_REPLICATION_FAILOVER) {
+        /*
+         * This case happens when a secondary was promoted to primary.
+         * Ignore the request because the secondary side of replication
+         * doesn't have to do anything anymore.
+         */
+        aio_context_release(aio_context);
+        return;
+    }
+
     if (s->mode == REPLICATION_MODE_SECONDARY) {
         secondary_do_checkpoint(s, errp);
     }
@@ -XXX,XX +XXX,XX @@ static void replication_get_error(ReplicationState *rs, Error **errp)
     aio_context_acquire(aio_context);
     s = bs->opaque;
 
-    if (s->stage != BLOCK_REPLICATION_RUNNING) {
+    if (s->stage == BLOCK_REPLICATION_NONE) {
         error_setg(errp, "Block replication is not running");
         aio_context_release(aio_context);
         return;
@@ -XXX,XX +XXX,XX @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
     aio_context_acquire(aio_context);
     s = bs->opaque;
 
+    if (s->stage == BLOCK_REPLICATION_DONE ||
+        s->stage == BLOCK_REPLICATION_FAILOVER) {
+        /*
+         * This case happens when a secondary was promoted to primary.
+         * Ignore the request because the secondary side of replication
+         * doesn't have to do anything anymore.
+         */
+        aio_context_release(aio_context);
+        return;
+    }
+
     if (s->stage != BLOCK_REPLICATION_RUNNING) {
         error_setg(errp, "Block replication is not running");
         aio_context_release(aio_context);
-- 
2.5.0

From: Lukas Straub <lukasstraub2@web.de>

This simulates the case that happens when we resume COLO after failover.

Signed-off-by: Lukas Straub <lukasstraub2@web.de>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 tests/test-replication.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/tests/test-replication.c b/tests/test-replication.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-replication.c
+++ b/tests/test-replication.c
@@ -XXX,XX +XXX,XX @@ static void test_secondary_stop(void)
     teardown_secondary();
 }
 
+static void test_secondary_continuous_replication(void)
+{
+    BlockBackend *top_blk, *local_blk;
+    Error *local_err = NULL;
+
+    top_blk = start_secondary();
+    replication_start_all(REPLICATION_MODE_SECONDARY, &local_err);
+    g_assert(!local_err);
+
+    /* write 0x22 to s_local_disk (IMG_SIZE / 2, IMG_SIZE) */
+    local_blk = blk_by_name(S_LOCAL_DISK_ID);
+    test_blk_write(local_blk, 0x22, IMG_SIZE / 2, IMG_SIZE / 2, false);
+
+    /* replication will backup s_local_disk to s_hidden_disk */
+    test_blk_read(top_blk, 0x11, IMG_SIZE / 2,
+                  IMG_SIZE / 2, 0, IMG_SIZE, false);
+
+    /* write 0x33 to s_active_disk (0, IMG_SIZE / 2) */
+    test_blk_write(top_blk, 0x33, 0, IMG_SIZE / 2, false);
+
+    /* do failover (active commit) */
+    replication_stop_all(true, &local_err);
+    g_assert(!local_err);
+
+    /* it should ignore all requests from now on */
+
+    /* start after failover */
+    replication_start_all(REPLICATION_MODE_PRIMARY, &local_err);
+    g_assert(!local_err);
+
+    /* checkpoint */
+    replication_do_checkpoint_all(&local_err);
+    g_assert(!local_err);
+
+    /* stop */
+    replication_stop_all(true, &local_err);
+    g_assert(!local_err);
+
+    /* read from s_local_disk (0, IMG_SIZE / 2) */
+    test_blk_read(top_blk, 0x33, 0, IMG_SIZE / 2,
+                  0, IMG_SIZE / 2, false);
+
+
+    /* read from s_local_disk (IMG_SIZE / 2, IMG_SIZE) */
+    test_blk_read(top_blk, 0x22, IMG_SIZE / 2,
+                  IMG_SIZE / 2, 0, IMG_SIZE, false);
+
+    teardown_secondary();
+}
+
 static void test_secondary_do_checkpoint(void)
 {
     BlockBackend *top_blk, *local_blk;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/replication/secondary/write", test_secondary_write);
     g_test_add_func("/replication/secondary/start", test_secondary_start);
     g_test_add_func("/replication/secondary/stop",  test_secondary_stop);
+    g_test_add_func("/replication/secondary/continuous_replication",
+                    test_secondary_continuous_replication);
     g_test_add_func("/replication/secondary/do_checkpoint",
                     test_secondary_do_checkpoint);
     g_test_add_func("/replication/secondary/get_error_all",
-- 
2.5.0

From: Lukas Straub <lukasstraub2@web.de>

To switch the Secondary to Primary, we need to insert new filters
before the filter-rewriter.

Add the options insert= and position= to be able to insert filters
anywhere in the filter list.

position should be "head" or "tail" to insert at the head or
tail of the filter list or it should be "id=<id>" to specify
the id of another filter.
insert should be either "before" or "behind" to specify where to
insert the new filter relative to the one specified with position.

Signed-off-by: Lukas Straub <lukasstraub2@web.de>
Reviewed-by: Zhang Chen <chen.zhang@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/net/filter.h |  2 ++
 net/filter.c         | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 qemu-options.hx      | 31 +++++++++++++++---
 3 files changed, 119 insertions(+), 6 deletions(-)

diff --git a/include/net/filter.h b/include/net/filter.h
index XXXXXXX..XXXXXXX 100644
--- a/include/net/filter.h
+++ b/include/net/filter.h
@@ -XXX,XX +XXX,XX @@ struct NetFilterState {
     NetClientState *netdev;
     NetFilterDirection direction;
     bool on;
+    char *position;
+    bool insert_before_flag;
     QTAILQ_ENTRY(NetFilterState) next;
 };
 
diff --git a/net/filter.c b/net/filter.c
index XXXXXXX..XXXXXXX 100644
--- a/net/filter.c
+++ b/net/filter.c
@@ -XXX,XX +XXX,XX @@ static void netfilter_set_status(Object *obj, const char *str, Error **errp)
     }
 }
 
+static char *netfilter_get_position(Object *obj, Error **errp)
+{
+    NetFilterState *nf = NETFILTER(obj);
+
+    return g_strdup(nf->position);
+}
+
+static void netfilter_set_position(Object *obj, const char *str, Error **errp)
+{
+    NetFilterState *nf = NETFILTER(obj);
+
+    nf->position = g_strdup(str);
+}
+
+static char *netfilter_get_insert(Object *obj, Error **errp)
+{
+    NetFilterState *nf = NETFILTER(obj);
+
+    return nf->insert_before_flag ? g_strdup("before") : g_strdup("behind");
+}
+
+static void netfilter_set_insert(Object *obj, const char *str, Error **errp)
+{
+    NetFilterState *nf = NETFILTER(obj);
+
+    if (strcmp(str, "before") && strcmp(str, "behind")) {
+        error_setg(errp, "Invalid value for netfilter insert, "
+                         "should be 'before' or 'behind'");
+        return;
+    }
+
+    nf->insert_before_flag = !strcmp(str, "before");
+}
+
 static void netfilter_init(Object *obj)
 {
     NetFilterState *nf = NETFILTER(obj);
 
     nf->on = true;
+    nf->insert_before_flag = false;
+    nf->position = g_strdup("tail");
 
     object_property_add_str(obj, "netdev",
                             netfilter_get_netdev_id, netfilter_set_netdev_id,
@@ -XXX,XX +XXX,XX @@ static void netfilter_init(Object *obj)
     object_property_add_str(obj, "status",
                             netfilter_get_status, netfilter_set_status,
                             NULL);
+    object_property_add_str(obj, "position",
+                            netfilter_get_position, netfilter_set_position,
+                            NULL);
+    object_property_add_str(obj, "insert",
+                            netfilter_get_insert, netfilter_set_insert,
+                            NULL);
 }
 
 static void netfilter_complete(UserCreatable *uc, Error **errp)
 {
     NetFilterState *nf = NETFILTER(uc);
+    NetFilterState *position = NULL;
     NetClientState *ncs[MAX_QUEUE_NUM];
     NetFilterClass *nfc = NETFILTER_GET_CLASS(uc);
     int queues;
@@ -XXX,XX +XXX,XX @@ static void netfilter_complete(UserCreatable *uc, Error **errp)
         return;
     }
 
+    if (strcmp(nf->position, "head") && strcmp(nf->position, "tail")) {
+        Object *container;
+        Object *obj;
+        char *position_id;
+
+        if (!g_str_has_prefix(nf->position, "id=")) {
+            error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "position",
+                       "'head', 'tail' or 'id=<id>'");
+            return;
+        }
+
+        /* get the id from the string */
+        position_id = g_strndup(nf->position + 3, strlen(nf->position) - 3);
+
+        /* Search for the position to insert before/behind */
+        container = object_get_objects_root();
+        obj = object_resolve_path_component(container, position_id);
+        if (!obj) {
+            error_setg(errp, "filter '%s' not found", position_id);
+            g_free(position_id);
+            return;
+        }
+
+        position = NETFILTER(obj);
+
+        if (position->netdev != ncs[0]) {
+            error_setg(errp, "filter '%s' belongs to a different netdev",
+                        position_id);
+            g_free(position_id);
+            return;
+        }
+
+        g_free(position_id);
+    }
+
     nf->netdev = ncs[0];
 
     if (nfc->setup) {
@@ -XXX,XX +XXX,XX @@ static void netfilter_complete(UserCreatable *uc, Error **errp)
             return;
         }
     }
-    QTAILQ_INSERT_TAIL(&nf->netdev->filters, nf, next);
+
+    if (position) {
+        if (nf->insert_before_flag) {
+            QTAILQ_INSERT_BEFORE(position, nf, next);
+        } else {
+            QTAILQ_INSERT_AFTER(&nf->netdev->filters, position, nf, next);
+        }
+    } else if (!strcmp(nf->position, "head")) {
+        QTAILQ_INSERT_HEAD(&nf->netdev->filters, nf, next);
+    } else if (!strcmp(nf->position, "tail")) {
+        QTAILQ_INSERT_TAIL(&nf->netdev->filters, nf, next);
+    }
 }
 
 static void netfilter_finalize(Object *obj)
@@ -XXX,XX +XXX,XX @@ static void netfilter_finalize(Object *obj)
         QTAILQ_REMOVE(&nf->netdev->filters, nf, next);
     }
     g_free(nf->netdev_id);
+    g_free(nf->position);
 }
 
 static void default_handle_event(NetFilterState *nf, int event, Error **errp)
diff --git a/qemu-options.hx b/qemu-options.hx
index XXXXXXX..XXXXXXX 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -XXX,XX +XXX,XX @@ applications, they can do this through this parameter. Its format is
 a gnutls priority string as described at
 @url{https://gnutls.org/manual/html_node/Priority-Strings.html}.
 
-@item -object filter-buffer,id=@var{id},netdev=@var{netdevid},interval=@var{t}[,queue=@var{all|rx|tx}][,status=@var{on|off}]
+@item -object filter-buffer,id=@var{id},netdev=@var{netdevid},interval=@var{t}[,queue=@var{all|rx|tx}][,status=@var{on|off}][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}]
 
 Interval @var{t} can't be 0, this filter batches the packet delivery: all
 packets arriving in a given interval on netdev @var{netdevid} are delayed
@@ -XXX,XX +XXX,XX @@ queue @var{all|rx|tx} is an option that can be applied to any netfilter.
 @option{tx}: the filter is attached to the transmit queue of the netdev,
              where it will receive packets sent by the netdev.
 
-@item -object filter-mirror,id=@var{id},netdev=@var{netdevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support]
+position @var{head|tail|id=<id>} is an option to specify where the
+filter should be inserted in the filter list. It can be applied to any
+netfilter.
+
+@option{head}: the filter is inserted at the head of the filter
+               list, before any existing filters.
+
+@option{tail}: the filter is inserted at the tail of the filter
+               list, behind any existing filters (default).
+
+@option{id=<id>}: the filter is inserted before or behind the filter
+                  specified by <id>, see the insert option below.
+
+insert @var{behind|before} is an option to specify where to insert the
+new filter relative to the one specified with position=id=<id>. It can
+be applied to any netfilter.
+
+@option{before}: insert before the specified filter.
+
+@option{behind}: insert behind the specified filter (default).
+
+@item -object filter-mirror,id=@var{id},netdev=@var{netdevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}]
 
 filter-mirror on netdev @var{netdevid},mirror net packet to chardev@var{chardevid}, if it has the vnet_hdr_support flag, filter-mirror will mirror packet with vnet_hdr_len.
 
-@item -object filter-redirector,id=@var{id},netdev=@var{netdevid},indev=@var{chardevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support]
+@item -object filter-redirector,id=@var{id},netdev=@var{netdevid},indev=@var{chardevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}]
 
 filter-redirector on netdev @var{netdevid},redirect filter's net packet to chardev
 @var{chardevid},and redirect indev's packet to filter.if it has the vnet_hdr_support flag,
@@ -XXX,XX +XXX,XX @@ Create a filter-redirector we need to differ outdev id from indev id, id can not
 be the same. we can just use indev or outdev, but at least one of indev or outdev
 need to be specified.
 
-@item -object filter-rewriter,id=@var{id},netdev=@var{netdevid},queue=@var{all|rx|tx},[vnet_hdr_support]
+@item -object filter-rewriter,id=@var{id},netdev=@var{netdevid},queue=@var{all|rx|tx},[vnet_hdr_support][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}]
 
 Filter-rewriter is a part of COLO project.It will rewrite tcp packet to
 secondary from primary to keep secondary tcp connection,and rewrite
@@ -XXX,XX +XXX,XX @@ colo secondary:
 -object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1
 -object filter-rewriter,id=rew0,netdev=hn0,queue=all
 
-@item -object filter-dump,id=@var{id},netdev=@var{dev}[,file=@var{filename}][,maxlen=@var{len}]
+@item -object filter-dump,id=@var{id},netdev=@var{dev}[,file=@var{filename}][,maxlen=@var{len}][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}]
 
 Dump the network traffic on netdev @var{dev} to the file specified by
 @var{filename}. At most @var{len} bytes (64k by default) per packet are stored.
-- 
2.5.0

From: Lukas Straub <lukasstraub2@web.de>

Document the qemu command-line and qmp commands for continuous replication

Signed-off-by: Lukas Straub <lukasstraub2@web.de>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 docs/COLO-FT.txt           | 224 +++++++++++++++++++++++++++++++++------------
 docs/block-replication.txt |  28 ++++--
 2 files changed, 184 insertions(+), 68 deletions(-)

diff --git a/docs/COLO-FT.txt b/docs/COLO-FT.txt
index XXXXXXX..XXXXXXX 100644
--- a/docs/COLO-FT.txt
+++ b/docs/COLO-FT.txt
@@ -XXX,XX +XXX,XX @@ The diagram just shows the main qmp command, you can get the detail
 in test procedure.
 
 == Test procedure ==
-1. Startup qemu
-Primary:
-# qemu-system-x86_64 -accel kvm -m 2048 -smp 2 -qmp stdio -name primary \
-  -device piix3-usb-uhci -vnc :7 \
-  -device usb-tablet -netdev tap,id=hn0,vhost=off \
-  -device virtio-net-pci,id=net-pci0,netdev=hn0 \
-  -drive if=virtio,id=primary-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\
-         children.0.file.filename=1.raw,\
-         children.0.driver=raw -S
-Secondary:
-# qemu-system-x86_64 -accel kvm -m 2048 -smp 2 -qmp stdio -name secondary \
-  -device piix3-usb-uhci -vnc :7 \
-  -device usb-tablet -netdev tap,id=hn0,vhost=off \
-  -device virtio-net-pci,id=net-pci0,netdev=hn0 \
-  -drive if=none,id=secondary-disk0,file.filename=1.raw,driver=raw,node-name=node0 \
-  -drive if=virtio,id=active-disk0,driver=replication,mode=secondary,\
-         file.driver=qcow2,top-id=active-disk0,\
-         file.file.filename=/mnt/ramfs/active_disk.img,\
-         file.backing.driver=qcow2,\
-         file.backing.file.filename=/mnt/ramfs/hidden_disk.img,\
-         file.backing.backing=secondary-disk0 \
-  -incoming tcp:0:8888
-
-2. On Secondary VM's QEMU monitor, issue command
+Note: Here we are running both instances on the same host for testing,
+change the IP Addresses if you want to run it on two hosts. Initially
+127.0.0.1 is the Primary Host and 127.0.0.2 is the Secondary Host.
+
+== Startup qemu ==
+1. Primary:
+Note: Initially, $imagefolder/primary.qcow2 needs to be copied to all hosts.
+You don't need to change any IP's here, because 0.0.0.0 listens on any
+interface. The chardev's with 127.0.0.1 IP's loopback to the local qemu
+instance.
+
+# imagefolder="/mnt/vms/colo-test-primary"
+
+# qemu-system-x86_64 -enable-kvm -cpu qemu64,+kvmclock -m 512 -smp 1 -qmp stdio \
+   -device piix3-usb-uhci -device usb-tablet -name primary \
+   -netdev tap,id=hn0,vhost=off,helper=/usr/lib/qemu/qemu-bridge-helper \
+   -device rtl8139,id=e0,netdev=hn0 \
+   -chardev socket,id=mirror0,host=0.0.0.0,port=9003,server,nowait \
+   -chardev socket,id=compare1,host=0.0.0.0,port=9004,server,wait \
+   -chardev socket,id=compare0,host=127.0.0.1,port=9001,server,nowait \
+   -chardev socket,id=compare0-0,host=127.0.0.1,port=9001 \
+   -chardev socket,id=compare_out,host=127.0.0.1,port=9005,server,nowait \
+   -chardev socket,id=compare_out0,host=127.0.0.1,port=9005 \
+   -object filter-mirror,id=m0,netdev=hn0,queue=tx,outdev=mirror0 \
+   -object filter-redirector,netdev=hn0,id=redire0,queue=rx,indev=compare_out \
+   -object filter-redirector,netdev=hn0,id=redire1,queue=rx,outdev=compare0 \
+   -object iothread,id=iothread1 \
+   -object colo-compare,id=comp0,primary_in=compare0-0,secondary_in=compare1,\
+outdev=compare_out0,iothread=iothread1 \
+   -drive if=ide,id=colo-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\
+children.0.file.filename=$imagefolder/primary.qcow2,children.0.driver=qcow2 -S
+
+2. Secondary:
+Note: Active and hidden images need to be created only once and the
+size should be the same as primary.qcow2. Again, you don't need to change
+any IP's here, except for the $primary_ip variable.
+
+# imagefolder="/mnt/vms/colo-test-secondary"
+# primary_ip=127.0.0.1
+
+# qemu-img create -f qcow2 $imagefolder/secondary-active.qcow2 10G
+
+# qemu-img create -f qcow2 $imagefolder/secondary-hidden.qcow2 10G
+
+# qemu-system-x86_64 -enable-kvm -cpu qemu64,+kvmclock -m 512 -smp 1 -qmp stdio \
+   -device piix3-usb-uhci -device usb-tablet -name secondary \
+   -netdev tap,id=hn0,vhost=off,helper=/usr/lib/qemu/qemu-bridge-helper \
+   -device rtl8139,id=e0,netdev=hn0 \
+   -chardev socket,id=red0,host=$primary_ip,port=9003,reconnect=1 \
+   -chardev socket,id=red1,host=$primary_ip,port=9004,reconnect=1 \
+   -object filter-redirector,id=f1,netdev=hn0,queue=tx,indev=red0 \
+   -object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1 \
+   -object filter-rewriter,id=rew0,netdev=hn0,queue=all \
+   -drive if=none,id=parent0,file.filename=$imagefolder/primary.qcow2,driver=qcow2 \
+   -drive if=none,id=childs0,driver=replication,mode=secondary,file.driver=qcow2,\
+top-id=colo-disk0,file.file.filename=$imagefolder/secondary-active.qcow2,\
+file.backing.driver=qcow2,file.backing.file.filename=$imagefolder/secondary-hidden.qcow2,\
+file.backing.backing=parent0 \
+   -drive if=ide,id=colo-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\
+children.0=childs0 \
+   -incoming tcp:0.0.0.0:9998
+
+
+3. On Secondary VM's QEMU monitor, issue command
 {'execute':'qmp_capabilities'}
-{ 'execute': 'nbd-server-start',
-  'arguments': {'addr': {'type': 'inet', 'data': {'host': 'xx.xx.xx.xx', 'port': '8889'} } }
-}
-{'execute': 'nbd-server-add', 'arguments': {'device': 'secondary-disk0', 'writable': true } }
+{'execute': 'nbd-server-start', 'arguments': {'addr': {'type': 'inet', 'data': {'host': '0.0.0.0', 'port': '9999'} } } }
+{'execute': 'nbd-server-add', 'arguments': {'device': 'parent0', 'writable': true } }
 
 Note:
   a. The qmp command nbd-server-start and nbd-server-add must be run
      before running the qmp command migrate on primary QEMU
   b. Active disk, hidden disk and nbd target's length should be the
      same.
-  c. It is better to put active disk and hidden disk in ramdisk.
+  c. It is better to put active disk and hidden disk in ramdisk. They
+     will be merged into the parent disk on failover.
 
-3. On Primary VM's QEMU monitor, issue command:
+4. On Primary VM's QEMU monitor, issue command:
 {'execute':'qmp_capabilities'}
-{ 'execute': 'human-monitor-command',
-  'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=xx.xx.xx.xx,file.port=8889,file.export=secondary-disk0,node-name=nbd_client0'}}
-{ 'execute':'x-blockdev-change', 'arguments':{'parent': 'primary-disk0', 'node': 'nbd_client0' } }
-{ 'execute': 'migrate-set-capabilities',
-      'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } }
-{ 'execute': 'migrate', 'arguments': {'uri': 'tcp:xx.xx.xx.xx:8888' } }
+{'execute': 'human-monitor-command', 'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=127.0.0.2,file.port=9999,file.export=parent0,node-name=replication0'}}
+{'execute': 'x-blockdev-change', 'arguments':{'parent': 'colo-disk0', 'node': 'replication0' } }
+{'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } }
+{'execute': 'migrate', 'arguments': {'uri': 'tcp:127.0.0.2:9998' } }
 
   Note:
   a. There should be only one NBD Client for each primary disk.
-  b. xx.xx.xx.xx is the secondary physical machine's hostname or IP
-  c. The qmp command line must be run after running qmp command line in
+  b. The qmp command line must be run after running qmp command line in
      secondary qemu.
 
-4. After the above steps, you will see, whenever you make changes to PVM, SVM will be synced.
+5. After the above steps, you will see, whenever you make changes to PVM, SVM will be synced.
 You can issue command '{ "execute": "migrate-set-parameters" , "arguments":{ "x-checkpoint-delay": 2000 } }'
-to change the checkpoint period time
+to change the idle checkpoint period time
+
+6. Failover test
+You can kill one of the VMs and Failover on the surviving VM:
+
+If you killed the Secondary, then follow "Primary Failover". After that,
+if you want to resume the replication, follow "Primary resume replication"
+
+If you killed the Primary, then follow "Secondary Failover". After that,
+if you want to resume the replication, follow "Secondary resume replication"
+
+== Primary Failover ==
+The Secondary died, resume on the Primary
+
+{'execute': 'x-blockdev-change', 'arguments':{ 'parent': 'colo-disk0', 'child': 'children.1'} }
+{'execute': 'human-monitor-command', 'arguments':{ 'command-line': 'drive_del replication0' } }
+{'execute': 'object-del', 'arguments':{ 'id': 'comp0' } }
+{'execute': 'object-del', 'arguments':{ 'id': 'iothread1' } }
+{'execute': 'object-del', 'arguments':{ 'id': 'm0' } }
+{'execute': 'object-del', 'arguments':{ 'id': 'redire0' } }
+{'execute': 'object-del', 'arguments':{ 'id': 'redire1' } }
+{'execute': 'x-colo-lost-heartbeat' }
+
+== Secondary Failover ==
+The Primary died, resume on the Secondary and prepare to become the new Primary
+
+{'execute': 'nbd-server-stop'}
+{'execute': 'x-colo-lost-heartbeat'}
+
+{'execute': 'object-del', 'arguments':{ 'id': 'f2' } }
+{'execute': 'object-del', 'arguments':{ 'id': 'f1' } }
+{'execute': 'chardev-remove', 'arguments':{ 'id': 'red1' } }
+{'execute': 'chardev-remove', 'arguments':{ 'id': 'red0' } }
+
+{'execute': 'chardev-add', 'arguments':{ 'id': 'mirror0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '0.0.0.0', 'port': '9003' } }, 'server': true } } } }
+{'execute': 'chardev-add', 'arguments':{ 'id': 'compare1', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '0.0.0.0', 'port': '9004' } }, 'server': true } } } }
+{'execute': 'chardev-add', 'arguments':{ 'id': 'compare0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9001' } }, 'server': true } } } }
+{'execute': 'chardev-add', 'arguments':{ 'id': 'compare0-0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9001' } }, 'server': false } } } }
+{'execute': 'chardev-add', 'arguments':{ 'id': 'compare_out', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9005' } }, 'server': true } } } }
+{'execute': 'chardev-add', 'arguments':{ 'id': 'compare_out0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9005' } }, 'server': false } } } }
+
+== Primary resume replication ==
+Resume replication after new Secondary is up.
+
+Start the new Secondary (Steps 2 and 3 above), then on the Primary:
+{'execute': 'drive-mirror', 'arguments':{ 'device': 'colo-disk0', 'job-id': 'resync', 'target': 'nbd://127.0.0.2:9999/parent0', 'mode': 'existing', 'format': 'raw', 'sync': 'full'} }
+
+Wait until disk is synced, then:
+{'execute': 'stop'}
+{'execute': 'block-job-cancel', 'arguments':{ 'device': 'resync'} }
+
+{'execute': 'human-monitor-command', 'arguments':{ 'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=127.0.0.2,file.port=9999,file.export=parent0,node-name=replication0'}}
+{'execute': 'x-blockdev-change', 'arguments':{ 'parent': 'colo-disk0', 'node': 'replication0' } }
+
+{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-mirror', 'id': 'm0', 'props': { 'netdev': 'hn0', 'queue': 'tx', 'outdev': 'mirror0' } } }
+{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire0', 'props': { 'netdev': 'hn0', 'queue': 'rx', 'indev': 'compare_out' } } }
+{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire1', 'props': { 'netdev': 'hn0', 'queue': 'rx', 'outdev': 'compare0' } } }
+{'execute': 'object-add', 'arguments':{ 'qom-type': 'iothread', 'id': 'iothread1' } }
+{'execute': 'object-add', 'arguments':{ 'qom-type': 'colo-compare', 'id': 'comp0', 'props': { 'primary_in': 'compare0-0', 'secondary_in': 'compare1', 'outdev': 'compare_out0', 'iothread': 'iothread1' } } }
+
+{'execute': 'migrate-set-capabilities', 'arguments':{ 'capabilities': [ {'capability': 'x-colo', 'state': true } ] } }
+{'execute': 'migrate', 'arguments':{ 'uri': 'tcp:127.0.0.2:9998' } }
+
+Note:
+If this Primary previously was a Secondary, then we need to insert the
+filters before the filter-rewriter by using the
+"'insert': 'before', 'position': 'id=rew0'" Options. See below.
+
+== Secondary resume replication ==
+Become Primary and resume replication after new Secondary is up. Note
+that now 127.0.0.1 is the Secondary and 127.0.0.2 is the Primary.
+
+Start the new Secondary (Steps 2 and 3 above, but with primary_ip=127.0.0.2),
+then on the old Secondary:
+{'execute': 'drive-mirror', 'arguments':{ 'device': 'colo-disk0', 'job-id': 'resync', 'target': 'nbd://127.0.0.1:9999/parent0', 'mode': 'existing', 'format': 'raw', 'sync': 'full'} }
+
+Wait until disk is synced, then:
+{'execute': 'stop'}
+{'execute': 'block-job-cancel', 'arguments':{ 'device': 'resync' } }
 
-5. Failover test
-You can kill Primary VM and run 'x_colo_lost_heartbeat' in Secondary VM's
-monitor at the same time, then SVM will failover and client will not detect this
-change.
+{'execute': 'human-monitor-command', 'arguments':{ 'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=127.0.0.1,file.port=9999,file.export=parent0,node-name=replication0'}}
+{'execute': 'x-blockdev-change', 'arguments':{ 'parent': 'colo-disk0', 'node': 'replication0' } }
 
-Before issuing '{ "execute": "x-colo-lost-heartbeat" }' command, we have to
-issue block related command to stop block replication.
-Primary:
-  Remove the nbd child from the quorum:
-  { 'execute': 'x-blockdev-change', 'arguments': {'parent': 'colo-disk0', 'child': 'children.1'}}
-  { 'execute': 'human-monitor-command','arguments': {'command-line': 'drive_del blk-buddy0'}}
-  Note: there is no qmp command to remove the blockdev now
+{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-mirror', 'id': 'm0', 'props': { 'insert': 'before', 'position': 'id=rew0', 'netdev': 'hn0', 'queue': 'tx', 'outdev': 'mirror0' } } }
+{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire0', 'props': { 'insert': 'before', 'position': 'id=rew0', 'netdev': 'hn0', 'queue': 'rx', 'indev': 'compare_out' } } }
+{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire1', 'props': { 'insert': 'before', 'position': 'id=rew0', 'netdev': 'hn0', 'queue': 'rx', 'outdev': 'compare0' } } }
+{'execute': 'object-add', 'arguments':{ 'qom-type': 'iothread', 'id': 'iothread1' } }
+{'execute': 'object-add', 'arguments':{ 'qom-type': 'colo-compare', 'id': 'comp0', 'props': { 'primary_in': 'compare0-0', 'secondary_in': 'compare1', 'outdev': 'compare_out0', 'iothread': 'iothread1' } } }
 
-Secondary:
-  The primary host is down, so we should do the following thing:
-  { 'execute': 'nbd-server-stop' }
+{'execute': 'migrate-set-capabilities', 'arguments':{ 'capabilities': [ {'capability': 'x-colo', 'state': true } ] } }
+{'execute': 'migrate', 'arguments':{ 'uri': 'tcp:127.0.0.1:9998' } }
 
 == TODO ==
-1. Support continuous VM replication.
-2. Support shared storage.
-3. Develop the heartbeat part.
-4. Reduce checkpoint VM’s downtime while doing checkpoint.
+1. Support shared storage.
+2. Develop the heartbeat part.
+3. Reduce checkpoint VM’s downtime while doing checkpoint.
diff --git a/docs/block-replication.txt b/docs/block-replication.txt
index XXXXXXX..XXXXXXX 100644
--- a/docs/block-replication.txt
+++ b/docs/block-replication.txt
@@ -XXX,XX +XXX,XX @@ blocks that are already in QEMU.
              ^            ||                            .----------
              |            ||                            | Secondary
         1 Quorum          ||                            '----------
-         /      \         ||
-        /        \        ||
-   Primary    2 filter
-     disk         ^                                                             virtio-blk
-                  |                                                                  ^
-                3 NBD  ------->  3 NBD                                               |
+         /      \         ||                                                           virtio-blk
+        /        \        ||                                                               ^
+   Primary    2 filter                                                                     |
+     disk         ^                                                                   7 Quorum
+                  |                                                                    /
+                3 NBD  ------->  3 NBD                                                /
                 client    ||     server                                          2 filter
                           ||        ^                                                ^
 --------.                 ||        |                                                |
@@ -XXX,XX +XXX,XX @@ any state that would otherwise be lost by the speculative write-through
 of the NBD server into the secondary disk. So before block replication,
 the primary disk and secondary disk should contain the same data.
 
+7) The secondary also has a quorum node, so after secondary failover it
+can become the new primary and continue replication.
+
+
 == Failure Handling ==
 There are 7 internal errors when block replication is running:
 1. I/O error on primary disk
@@ -XXX,XX +XXX,XX @@ Primary:
      leading whitespace.
   5. The qmp command line must be run after running qmp command line in
      secondary qemu.
-  6. After failover we need remove children.1 (replication driver).
+  6. After primary failover we need remove children.1 (replication driver).
 
 Secondary:
   -drive if=none,driver=raw,file.filename=1.raw,id=colo1 \
-  -drive if=xxx,id=topxxx,driver=replication,mode=secondary,top-id=topxxx\
+  -drive if=none,id=childs1,driver=replication,mode=secondary,top-id=childs1,\
          file.file.filename=active_disk.qcow2,\
          file.driver=qcow2,\
          file.backing.file.filename=hidden_disk.qcow2,\
          file.backing.driver=qcow2,\
          file.backing.backing=colo1
+  -drive if=xxx,driver=quorum,read-pattern=fifo,id=top-disk1,\
+         vote-threshold=1,children.0=childs1
 
   Then run qmp command in secondary qemu:
     { 'execute': 'nbd-server-start',
@@ -XXX,XX +XXX,XX @@ Secondary:
   The primary host is down, so we should do the following thing:
   { 'execute': 'nbd-server-stop' }
 
+Promote Secondary to Primary:
+  see COLO-FT.txt
+
 TODO:
-1. Continuous block replication
-2. Shared disk
+1. Shared disk
-- 
2.5.0

From: Stefan Hajnoczi <stefanha@redhat.com>

The L2TPv3 RFC number is 3931:
https://tools.ietf.org/html/rfc3931

Reported-by: Henrik Johansson <henrikjohansson@rocketmail.com>
Reviewed-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 qemu-options.hx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qemu-options.hx b/qemu-options.hx
index XXXXXXX..XXXXXXX 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -XXX,XX +XXX,XX @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
     "                Linux kernel 3.3+ as well as most routers can talk\n"
     "                L2TPv3. This transport allows connecting a VM to a VM,\n"
     "                VM to a router and even VM to Host. It is a nearly-universal\n"
-    "                standard (RFC3391). Note - this implementation uses static\n"
+    "                standard (RFC3931). Note - this implementation uses static\n"
     "                pre-configured tunnels (same as the Linux kernel).\n"
     "                use 'src=' to specify source address\n"
     "                use 'dst=' to specify destination address\n"
@@ -XXX,XX +XXX,XX @@ Example (send packets from host's 1.2.3.4):
 @end example
 
 @item -netdev l2tpv3,id=@var{id},src=@var{srcaddr},dst=@var{dstaddr}[,srcport=@var{srcport}][,dstport=@var{dstport}],txsession=@var{txsession}[,rxsession=@var{rxsession}][,ipv6][,udp][,cookie64][,counter][,pincounter][,txcookie=@var{txcookie}][,rxcookie=@var{rxcookie}][,offset=@var{offset}]
-Configure a L2TPv3 pseudowire host network backend. L2TPv3 (RFC3391) is a
+Configure a L2TPv3 pseudowire host network backend. L2TPv3 (RFC3931) is a
 popular protocol to transport Ethernet (and other Layer 2) data frames between
 two systems. It is present in routers, firewalls and the Linux kernel
 (from version 3.3 onwards).
-- 
2.5.0

The following changes since commit 352998df1c53b366413690d95b35f76d0721ebed:

Merge tag 'i2c-20220314' of https://github.com/philmd/qemu into staging (2022-03-14 14:39:33 +0000)

are available in the git repository at:

https://github.com/jasowang/qemu.git tags/net-pull-request

for you to fetch changes up to 12a195fa343aae2ead1301ce04727bd0ae25eb15:

vdpa: Expose VHOST_F_LOG_ALL on SVQ (2022-03-15 13:57:44 +0800)

----------------------------------------------------------------

Changes since V2:
- fix 32bit build errors

----------------------------------------------------------------
Eugenio Pérez (14):
      vhost: Add VhostShadowVirtqueue
      vhost: Add Shadow VirtQueue kick forwarding capabilities
      vhost: Add Shadow VirtQueue call forwarding capabilities
      vhost: Add vhost_svq_valid_features to shadow vq
      virtio: Add vhost_svq_get_vring_addr
      vdpa: adapt vhost_ops callbacks to svq
      vhost: Shadow virtqueue buffers forwarding
      util: Add iova_tree_alloc_map
      util: add iova_tree_find_iova
      vhost: Add VhostIOVATree
      vdpa: Add custom IOTLB translations to SVQ
      vdpa: Adapt vhost_vdpa_get_vring_base to SVQ
      vdpa: Never set log_base addr if SVQ is enabled
      vdpa: Expose VHOST_F_LOG_ALL on SVQ

Jason Wang (1):
      virtio-net: fix map leaking on error during receive

hw/net/virtio-net.c                |   1 +
 hw/virtio/meson.build              |   2 +-
 hw/virtio/vhost-iova-tree.c        | 110 +++++++
 hw/virtio/vhost-iova-tree.h        |  27 ++
 hw/virtio/vhost-shadow-virtqueue.c | 636 +++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  87 +++++
 hw/virtio/vhost-vdpa.c             | 522 +++++++++++++++++++++++++++++-
 include/hw/virtio/vhost-vdpa.h     |   8 +
 include/qemu/iova-tree.h           |  38 ++-
 util/iova-tree.c                   | 170 ++++++++++
 10 files changed, 1584 insertions(+), 17 deletions(-)
 create mode 100644 hw/virtio/vhost-iova-tree.c
 create mode 100644 hw/virtio/vhost-iova-tree.h
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.h

Commit bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
tries to fix the use after free of the sg by caching the virtqueue
elements in an array and unmapping them at once after receiving the
packets. But it forgot to unmap the cached elements on error, which
leads to leaked mappings and other unexpected results.

Fix this by detaching the cached elements on error. This addresses
CVE-2022-26353.

Reported-by: Victor Tom <vv474172261@gmail.com>
Cc: qemu-stable@nongnu.org
Fixes: CVE-2022-26353
Fixes: bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/virtio-net.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
 
 err:
     for (j = 0; j < i; j++) {
+        virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
         g_free(elems[j]);
     }
 
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

Vhost shadow virtqueue (SVQ) is an intermediate jump for virtqueue
notifications and buffers, allowing qemu to track them. While qemu is
forwarding the buffers and virtqueue changes, it is able to commit the
memory that is being dirtied, the same way regular qemu's VirtIO devices
do.

This commit only exposes basic SVQ allocation and free. Next patches of
the series add functionality like notifications and buffers forwarding.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/meson.build              |  2 +-
 hw/virtio/vhost-shadow-virtqueue.c | 62 ++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h | 28 +++++++++++++++++
 3 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.h

diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
 
 virtio_ss = ss.source_set()
 virtio_ss.add(files('virtio.c'))
-virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c'))
+virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
 virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * vhost shadow virtqueue
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
+
+#include "qemu/error-report.h"
+
+/**
+ * Creates vhost shadow virtqueue, and instructs the vhost device to use the
+ * shadow methods and file descriptors.
+ *
+ * Returns the new virtqueue or NULL.
+ *
+ * In case of error, reason is reported through error_report.
+ */
+VhostShadowVirtqueue *vhost_svq_new(void)
+{
+    g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
+    int r;
+
+    r = event_notifier_init(&svq->hdev_kick, 0);
+    if (r != 0) {
+        error_report("Couldn't create kick event notifier: %s (%d)",
+                     g_strerror(errno), errno);
+        goto err_init_hdev_kick;
+    }
+
+    r = event_notifier_init(&svq->hdev_call, 0);
+    if (r != 0) {
+        error_report("Couldn't create call event notifier: %s (%d)",
+                     g_strerror(errno), errno);
+        goto err_init_hdev_call;
+    }
+
+    return g_steal_pointer(&svq);
+
+err_init_hdev_call:
+    event_notifier_cleanup(&svq->hdev_kick);
+
+err_init_hdev_kick:
+    return NULL;
+}
+
+/**
+ * Free the resources of the shadow virtqueue.
+ *
+ * @pvq: gpointer to SVQ so it can be used by autofree functions.
+ */
+void vhost_svq_free(gpointer pvq)
+{
+    VhostShadowVirtqueue *vq = pvq;
+    event_notifier_cleanup(&vq->hdev_kick);
+    event_notifier_cleanup(&vq->hdev_call);
+    g_free(vq);
+}
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * vhost shadow virtqueue
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef VHOST_SHADOW_VIRTQUEUE_H
+#define VHOST_SHADOW_VIRTQUEUE_H
+
+#include "qemu/event_notifier.h"
+
+/* Shadow virtqueue to relay notifications */
+typedef struct VhostShadowVirtqueue {
+    /* Shadow kick notifier, sent to vhost */
+    EventNotifier hdev_kick;
+    /* Shadow call notifier, sent to vhost */
+    EventNotifier hdev_call;
+} VhostShadowVirtqueue;
+
+VhostShadowVirtqueue *vhost_svq_new(void);
+
+void vhost_svq_free(gpointer vq);
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
+
+#endif
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

In this mode no buffer forwarding is performed by SVQ: QEMU
will just forward the guest's kicks to the device.

Host memory notifier regions are left out for simplicity, and they will
not be addressed in this series.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c |  55 ++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  14 ++++
 hw/virtio/vhost-vdpa.c             | 144 ++++++++++++++++++++++++++++++++++++-
 include/hw/virtio/vhost-vdpa.h     |   4 ++
 4 files changed, 215 insertions(+), 2 deletions(-)

From: Eugenio Pérez <eperezma@redhat.com>

This will make qemu aware of the device used buffers, allowing it to
write the guest memory with its contents if needed.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 38 ++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  4 ++++
 hw/virtio/vhost-vdpa.c             | 31 +++++++++++++++++++++++++++++--
 3 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(EventNotifier *n)
 }
 
 /**
+ * Forward vhost notifications
+ *
+ * @n: hdev call event notifier, the one that device set to notify svq.
+ */
+static void vhost_svq_handle_call(EventNotifier *n)
+{
+    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
+                                             hdev_call);
+    event_notifier_test_and_clear(n);
+    event_notifier_set(&svq->svq_call);
+}
+
+/**
+ * Set the call notifier for the SVQ to call the guest
+ *
+ * @svq: Shadow virtqueue
+ * @call_fd: call notifier
+ *
+ * Called on BQL context.
+ */
+void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
+{
+    if (call_fd == VHOST_FILE_UNBIND) {
+        /*
+         * Fail event_notifier_set if called handling device call.
+         *
+         * SVQ still needs device notifications, since it needs to keep
+         * forwarding used buffers even with the unbind.
+         */
+        memset(&svq->svq_call, 0, sizeof(svq->svq_call));
+    } else {
+        event_notifier_init_fd(&svq->svq_call, call_fd);
+    }
+}
+
+/**
  * Set a new file descriptor for the guest to kick the SVQ and notify for avail
  *
  * @svq: The svq
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
     }
 
     event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
+    event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
     return g_steal_pointer(&svq);
 
 err_init_hdev_call:
@@ -XXX,XX +XXX,XX @@ void vhost_svq_free(gpointer pvq)
     VhostShadowVirtqueue *vq = pvq;
     vhost_svq_stop(vq);
     event_notifier_cleanup(&vq->hdev_kick);
+    event_notifier_set_handler(&vq->hdev_call, NULL);
     event_notifier_cleanup(&vq->hdev_call);
     g_free(vq);
 }
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
      * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
      */
     EventNotifier svq_kick;
+
+    /* Guest's call notifier, where the SVQ calls guest. */
+    EventNotifier svq_call;
 } VhostShadowVirtqueue;
 
 void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
+void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
 
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
 }
 
+static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
+                                         struct vhost_vring_file *file)
+{
+    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
+}
+
 /**
  * Set the shadow virtqueue descriptors to the device
  *
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
  * @svq: The shadow virtqueue
  * @idx: The index of the virtqueue in the vhost device
  * @errp: Error
+ *
+ * Note that this function does not rewind kick file descriptor if cannot set
+ * call one.
  */
 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
     if (unlikely(r != 0)) {
         error_setg_errno(errp, -r, "Can't set device kick fd");
+        return false;
+    }
+
+    event_notifier = &svq->hdev_call;
+    file.fd = event_notifier_get_fd(event_notifier);
+    r = vhost_vdpa_set_vring_dev_call(dev, &file);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Can't set device call fd");
     }
 
     return r == 0;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                        struct vhost_vring_file *file)
 {
-    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (v->shadow_vqs_enabled) {
+        int vdpa_idx = file->index - dev->vq_index;
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
+
+        vhost_svq_set_svq_call_fd(svq, file->fd);
+        return 0;
+    } else {
+        return vhost_vdpa_set_vring_dev_call(dev, file);
+    }
 }
 
 static int vhost_vdpa_get_features(struct vhost_dev *dev,
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

This allows SVQ to negotiate features with the guest and the device. For
the device, SVQ is a driver. While this function bypasses all
non-transport features, it needs to disable the features that SVQ does
not support when forwarding buffers. This includes packed vq layout,
indirect descriptors or event idx.

Future changes can add support to offer more features to the guest,
since the use of VirtQueue gives this for free. This is left out at the
moment for simplicity.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 44 ++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  2 ++
 hw/virtio/vhost-vdpa.c             | 15 +++++++++++++
 3 files changed, 61 insertions(+)

From: Eugenio Pérez <eperezma@redhat.com>

It reports the shadow virtqueue address from qemu virtual address space.

Since this will be different from the guest's vaddr, but the device can
access it, SVQ takes special care about its alignment & lack of garbage
data. It assumes that IOMMU will work in host_page_size ranges for that.

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
 }
 
 /**
+ * Get the shadow vq vring address.
+ * @svq: Shadow virtqueue
+ * @addr: Destination to store address
+ */
+void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
+                              struct vhost_vring_addr *addr)
+{
+    addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc;
+    addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail;
+    addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used;
+}
+
+size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
+{
+    size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
+    size_t avail_size = offsetof(vring_avail_t, ring) +
+                                             sizeof(uint16_t) * svq->vring.num;
+
+    return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size);
+}
+
+size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
+{
+    size_t used_size = offsetof(vring_used_t, ring) +
+                                    sizeof(vring_used_elem_t) * svq->vring.num;
+    return ROUND_UP(used_size, qemu_real_host_page_size);
+}
+
+/**
  * Set a new file descriptor for the guest to kick the SVQ and notify for avail
  *
  * @svq: The svq
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
 #define VHOST_SHADOW_VIRTQUEUE_H
 
 #include "qemu/event_notifier.h"
+#include "hw/virtio/virtio.h"
+#include "standard-headers/linux/vhost_types.h"
 
 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
+    /* Shadow vring */
+    struct vring vring;
+
     /* Shadow kick notifier, sent to vhost */
     EventNotifier hdev_kick;
     /* Shadow call notifier, sent to vhost */
@@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp);
 
 void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
 void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
+void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
+                              struct vhost_vring_addr *addr);
+size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
+size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
 
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

First half of the buffers forwarding part, preparing the vhost-vdpa
callbacks so that they can offer SVQ. QEMU cannot enable it yet, so
this is effectively dead code for now, but it helps to reduce
patch size.

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
     return ret;
  }
 
+static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
+                                         struct vhost_vring_state *ring)
+{
+    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
+}
+
 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                          struct vhost_vring_file *file)
 {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
 }
 
+static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
+                                         struct vhost_vring_addr *addr)
+{
+    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
+                                addr->desc_user_addr, addr->used_user_addr,
+                                addr->avail_user_addr,
+                                addr->log_guest_addr);
+
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
+
+}
+
 /**
  * Set the shadow virtqueue descriptors to the device
  *
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                        struct vhost_vring_addr *addr)
 {
-    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
-                                    addr->desc_user_addr, addr->used_user_addr,
-                                    addr->avail_user_addr,
-                                    addr->log_guest_addr);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (v->shadow_vqs_enabled) {
+        /*
+         * Device vring addr was set at device start. SVQ base is handled by
+         * VirtQueue code.
+         */
+        return 0;
+    }
+
+    return vhost_vdpa_set_vring_dev_addr(dev, addr);
 }
 
 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                        struct vhost_vring_state *ring)
 {
-    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (v->shadow_vqs_enabled) {
+        /*
+         * Device vring base was set at device start. SVQ base is handled by
+         * VirtQueue code.
+         */
+        return 0;
+    }
+
+    return vhost_vdpa_set_dev_vring_base(dev, ring);
 }
 
 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

Initial version of shadow virtqueue that actually forwards buffers. There
is no iommu support at the moment; that will be addressed in future
patches of this series. Since all vhost-vdpa devices use forced IOMMU,
this means that SVQ is not usable at this point of the series on any
device.

For simplicity it only supports modern devices, which expect the vring
in little endian, with split ring and no event idx or indirect
descriptors. Support for them will not be added in this series.

It reuses the VirtQueue code for the device part. The driver part is
based on Linux's virtio_ring driver, but with stripped functionality
and optimizations so it's easier to review.

However, forwarding buffers has some peculiarities: one of the most
unexpected ones is that a guest's buffer can span more than one
descriptor in SVQ. While this is handled gracefully by qemu's
emulated virtio devices, it may cause an unexpected SVQ queue full. This
patch also solves it by checking for this condition at both guest's
kicks and device's calls. The code may be more elegant in the future if
SVQ code runs in its own iocontext.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 352 ++++++++++++++++++++++++++++++++++++-
 hw/virtio/vhost-shadow-virtqueue.h |  26 +++
 hw/virtio/vhost-vdpa.c             | 155 +++++++++++++++-
 3 files changed, 522 insertions(+), 11 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/error-report.h"
 #include "qapi/error.h"
 #include "qemu/main-loop.h"
+#include "qemu/log.h"
+#include "qemu/memalign.h"
 #include "linux-headers/linux/vhost.h"
 
 /**
@@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp)
 }
 
 /**
- * Forward guest notifications.
+ * Number of descriptors that the SVQ can make available from the guest.
+ *
+ * @svq: The svq
+ */
+static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
+{
+    return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
+}
+
+/**
+ * Write a run of scatter-gather entries into the SVQ descriptor table.
+ *
+ * @svq: Shadow VirtQueue
+ * @iovec: Buffers to expose, one descriptor per entry
+ * @num: Number of entries in @iovec
+ * @more_descs: True if the chain continues after this run, so the last
+ *              descriptor written here also carries VRING_DESC_F_NEXT
+ * @write: True to mark the descriptors with VRING_DESC_F_WRITE
+ *
+ * Descriptors are taken from the free list starting at svq->free_head,
+ * following the "next" links already stored in the table, and
+ * svq->free_head is advanced past the ones consumed.
+ */
+static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
+                                    const struct iovec *iovec, size_t num,
+                                    bool more_descs, bool write)
+{
+    uint16_t i = svq->free_head, last = svq->free_head;
+    unsigned n;
+    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
+    vring_desc_t *descs = svq->vring.desc;
+
+    if (num == 0) {
+        return;
+    }
+
+    for (n = 0; n < num; n++) {
+        if (more_descs || (n + 1 < num)) {
+            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
+        } else {
+            descs[i].flags = flags;
+        }
+        /*
+         * Go through uintptr_t: a signed intptr_t would sign-extend high
+         * addresses when widened to hwaddr on 32-bit hosts.
+         */
+        descs[i].addr = cpu_to_le64((hwaddr)(uintptr_t)iovec[n].iov_base);
+        descs[i].len = cpu_to_le32(iovec[n].iov_len);
+
+        last = i;
+        /* "next" is stored little endian in the ring, convert to host */
+        i = le16_to_cpu(descs[i].next);
+    }
+
+    svq->free_head = le16_to_cpu(descs[last].next);
+}
+
+static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
+                                VirtQueueElement *elem, unsigned *head)
+{
+    unsigned avail_idx;
+    vring_avail_t *avail = svq->vring.avail;
+
+    *head = svq->free_head;
+
+    /* We need some descriptors here */
+    if (unlikely(!elem->out_num && !elem->in_num)) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "Guest provided element with no descriptors");
+        return false;
+    }
+
+    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0,
+                            false);
+    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
+
+    /*
+     * Put the entry in the available array (but don't update avail->idx until
+     * they do sync).
+     */
+    avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
+    avail->ring[avail_idx] = cpu_to_le16(*head);
+    svq->shadow_avail_idx++;
+
+    /* Update the avail index after writing the descriptor */
+    smp_wmb();
+    avail->idx = cpu_to_le16(svq->shadow_avail_idx);
+
+    return true;
+}
+
+/**
+ * Make a guest element available to the device through the SVQ and record
+ * it so it can be returned to the guest once the device uses it.
+ *
+ * @svq: Shadow VirtQueue
+ * @elem: Guest element to forward
+ *
+ * Return: true on success, in which case the SVQ keeps @elem in
+ * ring_id_maps. On failure @elem is freed here: the caller stops processing
+ * the queue and would otherwise leak it.
+ */
+static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+{
+    unsigned qemu_head;
+    bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
+    if (unlikely(!ok)) {
+        g_free(elem);
+        return false;
+    }
+
+    svq->ring_id_maps[qemu_head] = elem;
+    return true;
+}
+
+/**
+ * Notify the device that new buffers are available, unless the device has
+ * set VRING_USED_F_NO_NOTIFY in the used ring flags.
+ *
+ * @svq: Shadow VirtQueue
+ */
+static void vhost_svq_kick(VhostShadowVirtqueue *svq)
+{
+    /*
+     * We need to expose the available array entries before checking the used
+     * flags
+     */
+    smp_mb();
+    /* Ring fields are little endian: compare against a LE constant */
+    if (svq->vring.used->flags & cpu_to_le16(VRING_USED_F_NO_NOTIFY)) {
+        return;
+    }
+
+    event_notifier_set(&svq->hdev_kick);
+}
+
+/**
+ * Forward available buffers.
+ *
+ * @svq: Shadow VirtQueue
+ *
+ * Note that this function does not guarantee that all guest's available
+ * buffers are available to the device in SVQ avail ring. The guest may have
+ * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in
+ * qemu vaddr.
+ *
+ * If that happens, guest's kick notifications will be disabled until the
+ * device uses some buffers.
+ */
+static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
+{
+    /* Clear event notifier */
+    event_notifier_test_and_clear(&svq->svq_kick);
+
+    /* Forward to the device as many available buffers as possible */
+    do {
+        virtio_queue_set_notification(svq->vq, false);
+
+        while (true) {
+            VirtQueueElement *elem;
+            bool ok;
+
+            if (svq->next_guest_avail_elem) {
+                elem = g_steal_pointer(&svq->next_guest_avail_elem);
+            } else {
+                elem = virtqueue_pop(svq->vq, sizeof(*elem));
+            }
+
+            if (!elem) {
+                break;
+            }
+
+            if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) {
+                /*
+                 * This condition is possible since a contiguous buffer in GPA
+                 * does not imply a contiguous buffer in qemu's VA
+                 * scatter-gather segments. If that happens, the buffer exposed
+                 * to the device needs to be a chain of descriptors at this
+                 * moment.
+                 *
+                 * SVQ cannot hold more available buffers if we are here:
+                 * queue the current guest descriptor and ignore further kicks
+                 * until some elements are used.
+                 */
+                svq->next_guest_avail_elem = elem;
+                return;
+            }
+
+            ok = vhost_svq_add(svq, elem);
+            if (unlikely(!ok)) {
+                /* VQ is broken, just return and ignore any other kicks */
+                return;
+            }
+            vhost_svq_kick(svq);
+        }
+
+        virtio_queue_set_notification(svq->vq, true);
+    } while (!virtio_queue_empty(svq->vq));
+}
+
+/**
+ * Handle guest's kick.
  *
  * @n: guest kick event notifier, the one that guest set to notify svq.
  */
-static void vhost_handle_guest_kick(EventNotifier *n)
+static void vhost_handle_guest_kick_notifier(EventNotifier *n)
 {
     VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
     event_notifier_test_and_clear(n);
-    event_notifier_set(&svq->hdev_kick);
+    vhost_handle_guest_kick(svq);
+}
+
+/**
+ * Check whether the device has used buffers that SVQ has not consumed yet.
+ *
+ * @svq: Shadow VirtQueue
+ *
+ * The cached shadow_used_idx is checked first; the used ring index is only
+ * re-read from the ring when the cached value shows nothing pending.
+ */
+static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
+{
+    if (svq->last_used_idx != svq->shadow_used_idx) {
+        return true;
+    }
+
+    /* used->idx is little endian in the ring, convert to host order */
+    svq->shadow_used_idx = le16_to_cpu(svq->vring.used->idx);
+
+    return svq->last_used_idx != svq->shadow_used_idx;
 }
 
 /**
- * Forward vhost notifications
+ * Enable vhost device calls after disabling them.
+ *
+ * @svq: The svq
+ *
+ * It returns false if there are pending used buffers from the vhost device,
+ * avoiding the possible races between SVQ checking for more work and enabling
+ * callbacks. True if SVQ used vring has no more pending buffers.
+ */
+static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
+{
+    svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
+    /* Make sure the flag is written before the read of used_idx */
+    smp_mb();
+    return !vhost_svq_more_used(svq);
+}
+
+static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
+{
+    svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
+}
+
+/**
+ * Fetch the next buffer the device has marked as used.
+ *
+ * @svq: Shadow VirtQueue
+ * @len: Filled with the length reported by the device in the used entry
+ *
+ * Return: the guest element that was made available under the used id, with
+ * ownership passed to the caller, or NULL if there are no more used buffers
+ * or the device reported an id that is out of range or was never made
+ * available.
+ */
+static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
+                                           uint32_t *len)
+{
+    vring_desc_t *descs = svq->vring.desc;
+    const vring_used_t *used = svq->vring.used;
+    vring_used_elem_t used_elem;
+    uint16_t last_used, last_desc;
+    unsigned num;
+
+    if (!vhost_svq_more_used(svq)) {
+        return NULL;
+    }
+
+    /* Only get used array entries after they have been exposed by dev */
+    smp_rmb();
+    last_used = svq->last_used_idx & (svq->vring.num - 1);
+    used_elem.id = le32_to_cpu(used->ring[last_used].id);
+    used_elem.len = le32_to_cpu(used->ring[last_used].len);
+
+    svq->last_used_idx++;
+    if (unlikely(used_elem.id >= svq->vring.num)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
+                      svq->vdev->name, used_elem.id);
+        return NULL;
+    }
+
+    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+            "Device %s says index %u is used, but it was not available",
+            svq->vdev->name, used_elem.id);
+        return NULL;
+    }
+
+    /*
+     * The element was written as a chain of out_num + in_num descriptors.
+     * Walk to the tail of that chain so the whole chain, not only its head,
+     * is linked back into the free list.
+     */
+    num = svq->ring_id_maps[used_elem.id]->out_num +
+          svq->ring_id_maps[used_elem.id]->in_num;
+    last_desc = used_elem.id;
+    for (unsigned j = 1; j < num; j++) {
+        last_desc = le16_to_cpu(descs[last_desc].next);
+    }
+
+    /* "next" links are kept little endian, as the readers expect */
+    descs[last_desc].next = cpu_to_le16(svq->free_head);
+    svq->free_head = used_elem.id;
+
+    *len = used_elem.len;
+    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
+}
+
+static void vhost_svq_flush(VhostShadowVirtqueue *svq,
+                            bool check_for_avail_queue)
+{
+    VirtQueue *vq = svq->vq;
+
+    /* Forward as many used buffers as possible. */
+    do {
+        unsigned i = 0;
+
+        vhost_svq_disable_notification(svq);
+        while (true) {
+            uint32_t len;
+            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
+            if (!elem) {
+                break;
+            }
+
+            if (unlikely(i >= svq->vring.num)) {
+                qemu_log_mask(LOG_GUEST_ERROR,
+                         "More than %u used buffers obtained in a %u size SVQ",
+                         i, svq->vring.num);
+                virtqueue_fill(vq, elem, len, i);
+                virtqueue_flush(vq, i);
+                return;
+            }
+            virtqueue_fill(vq, elem, len, i++);
+        }
+
+        virtqueue_flush(vq, i);
+        event_notifier_set(&svq->svq_call);
+
+        if (check_for_avail_queue && svq->next_guest_avail_elem) {
+            /*
+             * Avail ring was full when vhost_svq_flush was called, so it's a
+             * good moment to make more descriptors available if possible.
+             */
+            vhost_handle_guest_kick(svq);
+        }
+    } while (!vhost_svq_enable_notification(svq));
+}
+
+/**
+ * Forward used buffers.
  *
  * @n: hdev call event notifier, the one that device set to notify svq.
+ *
+ * Note that we are not making any buffers available in the loop, there is no
+ * way that it runs more than virtqueue size times.
  */
 static void vhost_svq_handle_call(EventNotifier *n)
 {
     VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                              hdev_call);
     event_notifier_test_and_clear(n);
-    event_notifier_set(&svq->svq_call);
+    vhost_svq_flush(svq, true);
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
     if (poll_start) {
         event_notifier_init_fd(svq_kick, svq_kick_fd);
         event_notifier_set(svq_kick);
-        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick);
+        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
+    }
+}
+
+/**
+ * Start the shadow virtqueue operation.
+ *
+ * @svq: Shadow Virtqueue
+ * @vdev: VirtIO device
+ * @vq: Virtqueue to shadow
+ */
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
+                     VirtQueue *vq)
+{
+    size_t desc_size, driver_size, device_size;
+
+    svq->next_guest_avail_elem = NULL;
+    svq->shadow_avail_idx = 0;
+    svq->shadow_used_idx = 0;
+    svq->last_used_idx = 0;
+    svq->vdev = vdev;
+    svq->vq = vq;
+
+    svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
+    driver_size = vhost_svq_driver_area_size(svq);
+    device_size = vhost_svq_device_area_size(svq);
+    svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
+    desc_size = sizeof(vring_desc_t) * svq->vring.num;
+    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
+    memset(svq->vring.desc, 0, driver_size);
+    svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
+    memset(svq->vring.used, 0, device_size);
+    svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
+    for (unsigned i = 0; i < svq->vring.num - 1; i++) {
+        svq->vring.desc[i].next = cpu_to_le16(i + 1);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
 void vhost_svq_stop(VhostShadowVirtqueue *svq)
 {
     event_notifier_set_handler(&svq->svq_kick, NULL);
+    g_autofree VirtQueueElement *next_avail_elem = NULL;
+
+    if (!svq->vq) {
+        return;
+    }
+
+    /* Send all pending used descriptors to guest */
+    vhost_svq_flush(svq, false);
+
+    for (unsigned i = 0; i < svq->vring.num; ++i) {
+        g_autofree VirtQueueElement *elem = NULL;
+        elem = g_steal_pointer(&svq->ring_id_maps[i]);
+        if (elem) {
+            virtqueue_detach_element(svq->vq, elem, 0);
+        }
+    }
+
+    next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
+    if (next_avail_elem) {
+        virtqueue_detach_element(svq->vq, next_avail_elem, 0);
+    }
+    svq->vq = NULL;
+    g_free(svq->ring_id_maps);
+    qemu_vfree(svq->vring.desc);
+    qemu_vfree(svq->vring.used);
 }
 
 /**
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
 
     /* Guest's call notifier, where the SVQ calls guest. */
     EventNotifier svq_call;
+
+    /* Virtio queue shadowing */
+    VirtQueue *vq;
+
+    /* Virtio device */
+    VirtIODevice *vdev;
+
+    /* Map from SVQ descriptor head to the guest's element it carries */
+    VirtQueueElement **ring_id_maps;
+
+    /* Next VirtQueue element that guest made available */
+    VirtQueueElement *next_guest_avail_elem;
+
+    /* Next head to expose to the device */
+    uint16_t shadow_avail_idx;
+
+    /* Next free descriptor */
+    uint16_t free_head;
+
+    /* Last seen used idx */
+    uint16_t shadow_used_idx;
+
+    /* Next head to consume from the device */
+    uint16_t last_used_idx;
 } VhostShadowVirtqueue;
 
 bool vhost_svq_valid_features(uint64_t features, Error **errp);
@@ -XXX,XX +XXX,XX @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
 size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
 size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
 
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
+                     VirtQueue *vq);
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
 VhostShadowVirtqueue *vhost_svq_new(void);
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
  * Note that this function does not rewind kick file descriptor if cannot set
  * call one.
  */
-static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
-                                 VhostShadowVirtqueue *svq, unsigned idx,
-                                 Error **errp)
+static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
+                                  VhostShadowVirtqueue *svq, unsigned idx,
+                                  Error **errp)
 {
     struct vhost_vring_file file = {
         .index = dev->vq_index + idx,
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
     if (unlikely(r != 0)) {
         error_setg_errno(errp, -r, "Can't set device kick fd");
-        return false;
+        return r;
     }
 
     event_notifier = &svq->hdev_call;
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
         error_setg_errno(errp, -r, "Can't set device call fd");
     }
 
+    return r;
+}
+
+/**
+ * Unmap a SVQ area in the device
+ */
+static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
+                                      hwaddr size)
+{
+    int r;
+
+    size = ROUND_UP(size, qemu_real_host_page_size);
+    r = vhost_vdpa_dma_unmap(v, iova, size);
+    return r == 0;
+}
+
+static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
+                                       const VhostShadowVirtqueue *svq)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    struct vhost_vring_addr svq_addr;
+    size_t device_size = vhost_svq_device_area_size(svq);
+    size_t driver_size = vhost_svq_driver_area_size(svq);
+    bool ok;
+
+    vhost_svq_get_vring_addr(svq, &svq_addr);
+
+    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
+    if (unlikely(!ok)) {
+        return false;
+    }
+
+    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
+}
+
+/**
+ * Map the shadow virtqueue rings in the device
+ *
+ * @dev: The vhost device
+ * @svq: The shadow virtqueue
+ * @addr: Assigned IOVA addresses
+ * @errp: Error pointer
+ *
+ * Return: true if both the driver and the device areas were mapped. On
+ * failure @errp is set and the driver area mapping, if it was created, is
+ * rolled back (best effort).
+ */
+static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
+                                     const VhostShadowVirtqueue *svq,
+                                     struct vhost_vring_addr *addr,
+                                     Error **errp)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    size_t device_size = vhost_svq_device_area_size(svq);
+    size_t driver_size = vhost_svq_driver_area_size(svq);
+    int r;
+
+    ERRP_GUARD();
+    vhost_svq_get_vring_addr(svq, addr);
+
+    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
+                           (void *)(uintptr_t)addr->desc_user_addr, true);
+    if (unlikely(r != 0)) {
+        /* error_setg_errno() appends ": <strerror>" by itself */
+        error_setg_errno(errp, -r, "Cannot create vq driver region");
+        return false;
+    }
+
+    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
+                           (void *)(uintptr_t)addr->used_user_addr, false);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Cannot create vq device region");
+        /*
+         * Do not leave the driver area mapped: the caller only stops the
+         * SVQ on this failure, it does not unmap anything.
+         */
+        vhost_vdpa_svq_unmap_ring(v, addr->desc_user_addr, driver_size);
+    }
+
+    return r == 0;
+}
+
+static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
+                                 VhostShadowVirtqueue *svq, unsigned idx,
+                                 Error **errp)
+{
+    uint16_t vq_index = dev->vq_index + idx;
+    struct vhost_vring_state s = {
+        .index = vq_index,
+    };
+    int r;
+
+    r = vhost_vdpa_set_dev_vring_base(dev, &s);
+    if (unlikely(r)) {
+        error_setg_errno(errp, -r, "Cannot set vring base");
+        return false;
+    }
+
+    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
     return r == 0;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
     }
 
     for (i = 0; i < v->shadow_vqs->len; ++i) {
+        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+        struct vhost_vring_addr addr = {
+            .index = i,
+        };
+        int r;
         bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
         if (unlikely(!ok)) {
-            error_reportf_err(err, "Cannot setup SVQ %u: ", i);
+            goto err;
+        }
+
+        vhost_svq_start(svq, dev->vdev, vq);
+        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
+        if (unlikely(!ok)) {
+            goto err_map;
+        }
+
+        /* Override vring GPA set by vhost subsystem */
+        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
+        if (unlikely(r != 0)) {
+            error_setg_errno(&err, -r, "Cannot set device address");
+            goto err_set_addr;
+        }
+    }
+
+    return true;
+
+err_set_addr:
+    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
+
+err_map:
+    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
+
+err:
+    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
+    for (unsigned j = 0; j < i; ++j) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
+        vhost_vdpa_svq_unmap_rings(dev, svq);
+        vhost_svq_stop(svq);
+    }
+
+    return false;
+}
+
+static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
+{
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (!v->shadow_vqs) {
+        return true;
+    }
+
+    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+        bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
+        if (unlikely(!ok)) {
             return false;
         }
     }
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
         }
         vhost_vdpa_set_vring_ready(dev);
     } else {
+        ok = vhost_vdpa_svqs_stop(dev);
+        if (unlikely(!ok)) {
+            return -1;
+        }
         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
     }
 
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

This iova tree function looks for a hole in the allocated regions and
returns a totally new translation for a given translated address.

Its usage is mainly to allow devices to access qemu address space,
remapping guest's one into a new iova space where qemu can add chunks of
addresses.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/qemu/iova-tree.h |  18 +++++++
 util/iova-tree.c         | 136 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+)

diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/iova-tree.h
+++ b/include/qemu/iova-tree.h
@@ -XXX,XX +XXX,XX @@
 #define  IOVA_OK           (0)
 #define  IOVA_ERR_INVALID  (-1) /* Invalid parameters */
 #define  IOVA_ERR_OVERLAP  (-2) /* IOVA range overlapped */
+#define  IOVA_ERR_NOMEM    (-3) /* Cannot allocate */
 
 typedef struct IOVATree IOVATree;
 typedef struct DMAMap {
@@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova);
 void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator);
 
 /**
+ * iova_tree_alloc_map:
+ *
+ * @tree: the iova tree to allocate from
+ * @map: the new map (as translated addr & size) to allocate in the iova region
+ * @iova_begin: the minimum address of the allocation
+ * @iova_end: the maximum address of the allocation
+ *
+ * Allocates a new region of a given size, between iova_begin and iova_end.
+ *
+ * Return: Same as iova_tree_insert, but cannot overlap and can return error if
+ * iova tree is out of free contiguous range. The caller gets the assigned iova
+ * in map->iova.
+ */
+int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin,
+                        hwaddr iova_end);
+
+/**
  * iova_tree_destroy:
  *
  * @tree: the iova tree to destroy
diff --git a/util/iova-tree.c b/util/iova-tree.c
index XXXXXXX..XXXXXXX 100644
--- a/util/iova-tree.c
+++ b/util/iova-tree.c
@@ -XXX,XX +XXX,XX @@ struct IOVATree {
     GTree *tree;
 };
 
+/* Args to pass to iova_tree_alloc foreach function. */
+struct IOVATreeAllocArgs {
+    /* Size of the desired allocation */
+    size_t new_size;
+
+    /* The minimum address allowed in the allocation */
+    hwaddr iova_begin;
+
+    /* Map at the left of the hole, can be NULL if "this" is first one */
+    const DMAMap *prev;
+
+    /* Map at the right of the hole, can be NULL if "prev" is the last one */
+    const DMAMap *this;
+
+    /* If found, we fill in the IOVA here */
+    hwaddr iova_result;
+
+    /* Whether have we found a valid IOVA */
+    bool iova_found;
+};
+
+/**
+ * Iterate args to the next hole
+ *
+ * @args: The alloc arguments
+ * @next: The next mapping in the tree. Can be NULL to signal the last one
+ */
+static void iova_tree_alloc_args_iterate(struct IOVATreeAllocArgs *args,
+                                         const DMAMap *next)
+{
+    args->prev = args->this;
+    args->this = next;
+}
+
 static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
 {
     const DMAMap *m1 = a, *m2 = b;
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map)
     return IOVA_OK;
 }
 
+/**
+ * Try to find an unallocated IOVA range between prev and this elements.
+ *
+ * @args: Arguments to allocation
+ *
+ * Cases:
+ *
+ * (1) !prev, !this: No entries allocated, always succeed
+ *
+ * (2) !prev, this: We're iterating at the 1st element.
+ *
+ * (3) prev, !this: We're iterating at the last element.
+ *
+ * (4) prev, this: this is the most common case, we'll try to find a hole
+ * between "prev" and "this" mapping.
+ *
+ * On success args->iova_result holds the start of the hole and
+ * args->iova_found is set; otherwise args is left untouched.
+ *
+ * Note that this function assumes the last valid iova is HWADDR_MAX, but it
+ * searches linearly so it's easy to discard the result if it's not the case.
+ */
+static void iova_tree_alloc_map_in_hole(struct IOVATreeAllocArgs *args)
+{
+    const DMAMap *prev = args->prev, *this = args->this;
+    uint64_t hole_start, hole_last;
+
+    if (this && this->iova + this->size < args->iova_begin) {
+        return;
+    }
+
+    hole_start = MAX(prev ? prev->iova + prev->size + 1 : 0, args->iova_begin);
+    hole_last = this ? this->iova : HWADDR_MAX;
+
+    /*
+     * iova_begin may fall inside "this", leaving hole_last below hole_start.
+     * Check the order first so the unsigned subtraction cannot wrap around
+     * and report a hole that overlaps "this".
+     */
+    if (hole_last > hole_start && hole_last - hole_start > args->new_size) {
+        args->iova_result = hole_start;
+        args->iova_found = true;
+    }
+}
+
+/**
+ * Foreach dma node in the tree, compare if there is a hole with its previous
+ * node (or minimum iova address allowed) and the node.
+ *
+ * @key: Node iterating
+ * @value: Node iterating
+ * @pargs: Struct to communicate with the outside world
+ *
+ * Return: false to keep iterating, true if needs break.
+ */
+static gboolean iova_tree_alloc_traverse(gpointer key, gpointer value,
+                                         gpointer pargs)
+{
+    struct IOVATreeAllocArgs *args = pargs;
+    DMAMap *node = value;
+
+    assert(key == value);
+
+    iova_tree_alloc_args_iterate(args, node);
+    iova_tree_alloc_map_in_hole(args);
+    return args->iova_found;
+}
+
+int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin,
+                        hwaddr iova_last)
+{
+    struct IOVATreeAllocArgs args = {
+        .new_size = map->size,
+        .iova_begin = iova_begin,
+    };
+
+    if (unlikely(iova_last < iova_begin)) {
+        return IOVA_ERR_INVALID;
+    }
+
+    /*
+     * Find a valid hole for the mapping
+     *
+     * Assuming low iova_begin, so no need to do a binary search to
+     * locate the first node.
+     *
+     * TODO: Replace all this with g_tree_node_first/next/last when available
+     * (from glib since 2.68). To do it with g_tree_foreach complicates the
+     * code a lot.
+     *
+     */
+    g_tree_foreach(tree->tree, iova_tree_alloc_traverse, &args);
+    if (!args.iova_found) {
+        /*
+         * Either tree is empty or the last hole is still not checked.
+         * g_tree_foreach does not compare (last, iova_last] range, so we check
+         * it here.
+         */
+        iova_tree_alloc_args_iterate(&args, NULL);
+        iova_tree_alloc_map_in_hole(&args);
+    }
+
+    if (!args.iova_found || args.iova_result + map->size > iova_last) {
+        return IOVA_ERR_NOMEM;
+    }
+
+    map->iova = args.iova_result;
+    return iova_tree_insert(tree, map);
+}
+
 void iova_tree_destroy(IOVATree *tree)
 {
     g_tree_destroy(tree->tree);
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

This function does the reverse operation of iova_tree_find: it looks for
a mapping that matches a translated address, so we can do the reverse
translation.

This has linear complexity instead of logarithmic, but it supports
overlapping HVA. Future developments could reduce it.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/qemu/iova-tree.h | 20 +++++++++++++++++++-
 util/iova-tree.c         | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/iova-tree.h
+++ b/include/qemu/iova-tree.h
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map);
  * @tree: the iova tree to search from
  * @map: the mapping to search
  *
- * Search for a mapping in the iova tree that overlaps with the
+ * Search for a mapping in the iova tree whose iova overlaps with the
  * mapping range specified.  Only the first found mapping will be
  * returned.
  *
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map);
 const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map);
 
 /**
+ * iova_tree_find_iova:
+ *
+ * @tree: the iova tree to search from
+ * @map: the mapping to search
+ *
+ * Search for a mapping in the iova tree whose translated_addr overlaps
+ * with the mapping range specified.  Only the first found mapping will
+ * be returned.
+ *
+ * Return: DMAMap pointer if found, or NULL if not found.  Note that
+ * the returned DMAMap pointer is maintained internally.  User should
+ * only read the content but never modify or free the content.  Also,
+ * user is responsible to make sure the pointer is valid (say, no
+ * concurrent deletion in progress).
+ */
+const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map);
+
+/**
  * iova_tree_find_address:
  *
  * @tree: the iova tree to search from
diff --git a/util/iova-tree.c b/util/iova-tree.c
index XXXXXXX..XXXXXXX 100644
--- a/util/iova-tree.c
+++ b/util/iova-tree.c
@@ -XXX,XX +XXX,XX @@ struct IOVATreeAllocArgs {
     bool iova_found;
 };
 
+typedef struct IOVATreeFindIOVAArgs {
+    const DMAMap *needle;
+    const DMAMap *result;
+} IOVATreeFindIOVAArgs;
+
 /**
  * Iterate args to the next hole
  *
@@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map)
     return g_tree_lookup(tree->tree, map);
 }
 
+static gboolean iova_tree_find_address_iterator(gpointer key, gpointer value,
+                                                gpointer data)
+{
+    const DMAMap *map = key;
+    IOVATreeFindIOVAArgs *args = data;
+    const DMAMap *needle;
+
+    g_assert(key == value);
+
+    needle = args->needle;
+    if (map->translated_addr + map->size < needle->translated_addr ||
+        needle->translated_addr + needle->size < map->translated_addr) {
+        return false;
+    }
+
+    args->result = map;
+    return true;
+}
+
+const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map)
+{
+    IOVATreeFindIOVAArgs args = {
+        .needle = map,
+    };
+
+    g_tree_foreach(tree->tree, iova_tree_find_address_iterator, &args);
+    return args.result;
+}
+
 const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova)
 {
     const DMAMap map = { .iova = iova, .size = 0 };
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

This tree is able to look for a translated address from an IOVA address.

At first glance it is similar to util/iova-tree. However, SVQ working on
devices with limited IOVA space needs more capabilities, like allocating
IOVA chunks or performing reverse translations (qemu addresses to iova).

The allocation capability, as "assign a free IOVA address to this chunk
of memory in qemu's address space" allows shadow virtqueue to create a
new address space that is not restricted by guest's addressable one, so
we can allocate shadow vqs vrings outside of it.

It duplicates the tree so it can search efficiently in both directions,
and it will signal overlap if iova or the translated address is present
in any tree.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/meson.build       |   2 +-
 hw/virtio/vhost-iova-tree.c | 110 ++++++++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-iova-tree.h |  27 +++++++++++
 3 files changed, 138 insertions(+), 1 deletion(-)
 create mode 100644 hw/virtio/vhost-iova-tree.c
 create mode 100644 hw/virtio/vhost-iova-tree.h

diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
 
 virtio_ss = ss.source_set()
 virtio_ss.add(files('virtio.c'))
-virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
+virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
 virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/virtio/vhost-iova-tree.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * vhost software live migration iova tree
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/iova-tree.h"
+#include "vhost-iova-tree.h"
+
+#define iova_min_addr qemu_real_host_page_size
+
+/**
+ * VhostIOVATree, able to:
+ * - Translate iova address
+ * - Reverse translate iova address (from translated to iova)
+ * - Allocate IOVA regions for translated range (linear operation)
+ */
+struct VhostIOVATree {
+    /* First addressable iova address in the device */
+    uint64_t iova_first;
+
+    /* Last addressable iova address in the device */
+    uint64_t iova_last;
+
+    /* IOVA address to qemu memory maps. */
+    IOVATree *iova_taddr_map;
+};
+
+/**
+ * Create a new IOVA tree
+ *
+ * Returns the new IOVA tree
+ */
+VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last)
+{
+    VhostIOVATree *tree = g_new(VhostIOVATree, 1);
+
+    /* Some devices do not like 0 addresses */
+    tree->iova_first = MAX(iova_first, iova_min_addr);
+    tree->iova_last = iova_last;
+
+    tree->iova_taddr_map = iova_tree_new();
+    return tree;
+}
+
+/**
+ * Delete an iova tree
+ */
+void vhost_iova_tree_delete(VhostIOVATree *iova_tree)
+{
+    iova_tree_destroy(iova_tree->iova_taddr_map);
+    g_free(iova_tree);
+}
+
+/**
+ * Find the IOVA address stored from a memory address
+ *
+ * @tree: The iova tree
+ * @map: The map with the memory address
+ *
+ * Return the stored mapping, or NULL if not found.
+ */
+const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree,
+                                        const DMAMap *map)
+{
+    return iova_tree_find_iova(tree->iova_taddr_map, map);
+}
+
+/**
+ * Allocate a new mapping
+ *
+ * @tree: The iova tree
+ * @map: The iova map
+ *
+ * Returns:
+ * - IOVA_OK if the map fits in the container
+ * - IOVA_ERR_INVALID if the map does not make sense (like size overflow)
+ * - IOVA_ERR_NOMEM if tree cannot allocate more space.
+ *
+ * It returns the assigned iova in map->iova if the return value is IOVA_OK.
+ */
+int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map)
+{
+    /* Some vhost devices do not like addr 0. Skip first page */
+    hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size;
+
+    if (map->translated_addr + map->size < map->translated_addr ||
+        map->perm == IOMMU_NONE) {
+        return IOVA_ERR_INVALID;
+    }
+
+    /* Allocate a node in IOVA address */
+    return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first,
+                               tree->iova_last);
+}
+
+/**
+ * Remove existing mappings from iova tree
+ *
+ * @iova_tree: The vhost iova tree
+ * @map: The map to remove
+ */
+void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map)
+{
+    iova_tree_remove(iova_tree->iova_taddr_map, map);
+}
diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/virtio/vhost-iova-tree.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * vhost software live migration iova tree
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H
+#define HW_VIRTIO_VHOST_IOVA_TREE_H
+
+#include "qemu/iova-tree.h"
+#include "exec/memory.h"
+
+typedef struct VhostIOVATree VhostIOVATree;
+
+VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last);
+void vhost_iova_tree_delete(VhostIOVATree *iova_tree);
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete);
+
+const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree,
+                                        const DMAMap *map);
+int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map);
+void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map);
+
+#endif
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

Use translations added in VhostIOVATree in SVQ.

Only introduce usage here, not allocation and deallocation. As with
previous patches, we use the dead code paths of shadow_vqs_enabled to
avoid committing too many changes at once. These paths are impossible to
take at the moment.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c |  86 +++++++++++++++++++++++---
 hw/virtio/vhost-shadow-virtqueue.h |   6 +-
 hw/virtio/vhost-vdpa.c             | 122 +++++++++++++++++++++++++++++++------
 include/hw/virtio/vhost-vdpa.h     |   3 +
 4 files changed, 187 insertions(+), 30 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
     return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
 }
 
-static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
+/**
+ * Translate addresses between the qemu's virtual address and the SVQ IOVA
+ *
+ * @svq: Shadow VirtQueue
+ * @addrs: Translated IOVA addresses
+ * @iovec: Source qemu's VA addresses
+ * @num: Length of iovec and minimum length of addrs
+ */
+static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
+                                     hwaddr *addrs, const struct iovec *iovec,
+                                     size_t num)
+{
+    if (num == 0) {
+        return true;
+    }
+
+    for (size_t i = 0; i < num; ++i) {
+        DMAMap needle = {
+            .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
+            .size = iovec[i].iov_len,
+        };
+        Int128 needle_last, map_last;
+        size_t off;
+
+        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
+        /*
+         * Map cannot be NULL since iova map contains all guest space and
+         * qemu already has a physical address mapped
+         */
+        if (unlikely(!map)) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "Invalid address 0x%"HWADDR_PRIx" given by guest",
+                          needle.translated_addr);
+            return false;
+        }
+
+        off = needle.translated_addr - map->translated_addr;
+        addrs[i] = map->iova + off;
+
+        needle_last = int128_add(int128_make64(needle.translated_addr),
+                                 int128_make64(iovec[i].iov_len));
+        map_last = int128_make64(map->translated_addr + map->size);
+        if (unlikely(int128_gt(needle_last, map_last))) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "Guest buffer expands over iova range");
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
                                     const struct iovec *iovec, size_t num,
                                     bool more_descs, bool write)
 {
@@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
         } else {
             descs[i].flags = flags;
         }
-        descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base);
+        descs[i].addr = cpu_to_le64(sg[n]);
         descs[i].len = cpu_to_le32(iovec[n].iov_len);
 
         last = i;
@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
 {
     unsigned avail_idx;
     vring_avail_t *avail = svq->vring.avail;
+    bool ok;
+    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num));
 
     *head = svq->free_head;
 
@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
         return false;
     }
 
-    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0,
-                            false);
-    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
+    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
+    if (unlikely(!ok)) {
+        return false;
+    }
+    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
+                            elem->in_num > 0, false);
+
+
+    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
+    if (unlikely(!ok)) {
+        return false;
+    }
+
+    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
 
     /*
      * Put the entry in the available array (but don't update avail->idx until
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
 void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
                               struct vhost_vring_addr *addr)
 {
-    addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc;
-    addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail;
-    addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used;
+    addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc;
+    addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail;
+    addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used;
 }
 
 size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
@@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
  * Creates vhost shadow virtqueue, and instructs the vhost device to use the
  * shadow methods and file descriptors.
  *
+ * @iova_tree: Tree to perform descriptors translations
+ *
  * Returns the new virtqueue or NULL.
  *
  * In case of error, reason is reported through error_report.
  */
-VhostShadowVirtqueue *vhost_svq_new(void)
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
 {
     g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
     int r;
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
 
     event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
     event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
+    svq->iova_tree = iova_tree;
     return g_steal_pointer(&svq);
 
 err_init_hdev_call:
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
 #include "qemu/event_notifier.h"
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"
+#include "hw/virtio/vhost-iova-tree.h"
 
 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
     /* Virtio device */
     VirtIODevice *vdev;
 
+    /* IOVA mapping */
+    VhostIOVATree *iova_tree;
+
     /* Map for use the guest's descriptors */
     VirtQueueElement **ring_id_maps;
 
@@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                      VirtQueue *vq);
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
-VhostShadowVirtqueue *vhost_svq_new(void);
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
 
 void vhost_svq_free(gpointer vq);
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                          vaddr, section->readonly);
 
     llsize = int128_sub(llend, int128_make64(iova));
+    if (v->shadow_vqs_enabled) {
+        DMAMap mem_region = {
+            .translated_addr = (hwaddr)(uintptr_t)vaddr,
+            .size = int128_get64(llsize) - 1,
+            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
+        };
+
+        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
+        if (unlikely(r != IOVA_OK)) {
+            error_report("Can't allocate a mapping (%d)", r);
+            goto fail;
+        }
+
+        iova = mem_region.iova;
+    }
 
     vhost_vdpa_iotlb_batch_begin_once(v);
     ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
 
     llsize = int128_sub(llend, int128_make64(iova));
 
+    if (v->shadow_vqs_enabled) {
+        const DMAMap *result;
+        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
+            section->offset_within_region +
+            (iova - section->offset_within_address_space);
+        DMAMap mem_region = {
+            .translated_addr = (hwaddr)(uintptr_t)vaddr,
+            .size = int128_get64(llsize) - 1,
+        };
+
+        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
+        iova = result->iova;
+        vhost_iova_tree_remove(v->iova_tree, &mem_region);
+    }
     vhost_vdpa_iotlb_batch_begin_once(v);
     ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
     if (ret) {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
 
     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
     for (unsigned n = 0; n < hdev->nvqs; ++n) {
-        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
+        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
 
         if (unlikely(!svq)) {
             error_setg(errp, "Cannot create svq %u", n);
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
 /**
  * Unmap a SVQ area in the device
  */
-static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
-                                      hwaddr size)
+static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
+                                      const DMAMap *needle)
 {
+    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
+    hwaddr size;
     int r;
 
-    size = ROUND_UP(size, qemu_real_host_page_size);
-    r = vhost_vdpa_dma_unmap(v, iova, size);
+    if (unlikely(!result)) {
+        error_report("Unable to find SVQ address to unmap");
+        return false;
+    }
+
+    size = ROUND_UP(result->size, qemu_real_host_page_size);
+    r = vhost_vdpa_dma_unmap(v, result->iova, size);
     return r == 0;
 }
 
 static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                        const VhostShadowVirtqueue *svq)
 {
+    DMAMap needle = {};
     struct vhost_vdpa *v = dev->opaque;
     struct vhost_vring_addr svq_addr;
-    size_t device_size = vhost_svq_device_area_size(svq);
-    size_t driver_size = vhost_svq_driver_area_size(svq);
     bool ok;
 
     vhost_svq_get_vring_addr(svq, &svq_addr);
 
-    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
+    needle.translated_addr = svq_addr.desc_user_addr;
+    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
     if (unlikely(!ok)) {
         return false;
     }
 
-    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
+    needle.translated_addr = svq_addr.used_user_addr;
+    return vhost_vdpa_svq_unmap_ring(v, &needle);
+}
+
+/**
+ * Map the SVQ area in the device
+ *
+ * @v: Vhost-vdpa device
+ * @needle: The area to search iova
+ * @errp: Error pointer
+ */
+static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
+                                    Error **errp)
+{
+    int r;
+
+    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
+    if (unlikely(r != IOVA_OK)) {
+        error_setg(errp, "Cannot allocate iova (%d)", r);
+        return false;
+    }
+
+    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
+                           (void *)(uintptr_t)needle->translated_addr,
+                           needle->perm == IOMMU_RO);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Cannot map region to device");
+        vhost_iova_tree_remove(v->iova_tree, needle);
+    }
+
+    return r == 0;
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                      struct vhost_vring_addr *addr,
                                      Error **errp)
 {
+    DMAMap device_region, driver_region;
+    struct vhost_vring_addr svq_addr;
     struct vhost_vdpa *v = dev->opaque;
     size_t device_size = vhost_svq_device_area_size(svq);
     size_t driver_size = vhost_svq_driver_area_size(svq);
-    int r;
+    size_t avail_offset;
+    bool ok;
 
     ERRP_GUARD();
-    vhost_svq_get_vring_addr(svq, addr);
+    vhost_svq_get_vring_addr(svq, &svq_addr);
 
-    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
-                           (void *)(uintptr_t)addr->desc_user_addr, true);
-    if (unlikely(r != 0)) {
-        error_setg_errno(errp, -r, "Cannot create vq driver region: ");
+    driver_region = (DMAMap) {
+        .translated_addr = svq_addr.desc_user_addr,
+        .size = driver_size - 1,
+        .perm = IOMMU_RO,
+    };
+    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
+    if (unlikely(!ok)) {
+        error_prepend(errp, "Cannot create vq driver region: ");
         return false;
     }
+    addr->desc_user_addr = driver_region.iova;
+    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
+    addr->avail_user_addr = driver_region.iova + avail_offset;
 
-    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
-                           (void *)(intptr_t)addr->used_user_addr, false);
-    if (unlikely(r != 0)) {
-        error_setg_errno(errp, -r, "Cannot create vq device region: ");
+    device_region = (DMAMap) {
+        .translated_addr = svq_addr.used_user_addr,
+        .size = device_size - 1,
+        .perm = IOMMU_RW,
+    };
+    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
+    if (unlikely(!ok)) {
+        error_prepend(errp, "Cannot create vq device region: ");
+        vhost_vdpa_svq_unmap_ring(v, &driver_region);
     }
+    addr->used_user_addr = device_region.iova;
 
-    return r == 0;
+    return ok;
 }
 
 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@
 
 #include <gmodule.h>
 
+#include "hw/virtio/vhost-iova-tree.h"
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"
 
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
     MemoryListener listener;
     struct vhost_vdpa_iova_range iova_range;
     bool shadow_vqs_enabled;
+    /* IOVA mapping used by the Shadow Virtqueue */
+    VhostIOVATree *iova_tree;
     GPtrArray *shadow_vqs;
     struct vhost_dev *dev;
     VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

This is needed to achieve migration, so the destination can restore its
index.

Setting base as last used idx, so destination will see as available all
the entries that the device did not use, including the in-flight
processing ones.

This is ok for networking, but other kinds of devices might have
problems with these retransmissions.

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                        struct vhost_vring_state *ring)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int ret;
 
+    if (v->shadow_vqs_enabled) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
+                                                      ring->index);
+
+        /*
+         * Setting base as last used idx, so destination will see as available
+         * all the entries that the device did not use, including the in-flight
+         * processing ones.
+         *
+         * TODO: This is ok for networking, but other kinds of devices might
+         * have problems with these retransmissions.
+         */
+        ring->num = svq->last_used_idx;
+        return 0;
+    }
+
     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
     return ret;
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

SVQ is able to log the dirty bits by itself, so let's use it to not
block migration.

Also, ignore set and clear of VHOST_F_LOG_ALL on set_features if SVQ is
enabled. Even if the device supports it, the reports would be nonsense
because SVQ memory is in the qemu region.

The log region is still allocated. Future changes might skip that, but
this series is already long enough.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-vdpa.c         | 39 +++++++++++++++++++++++++++++++++++----
 include/hw/virtio/vhost-vdpa.h |  1 +
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
     return v->index != 0;
 }
 
+static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
+                                       uint64_t *features)
+{
+    int ret;
+
+    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
+    trace_vhost_vdpa_get_features(dev, *features);
+    return ret;
+}
+
 static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                                Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
         return 0;
     }
 
-    r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features);
+    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
     if (r != 0) {
         error_setg_errno(errp, -r, "Can't get vdpa device features");
         return r;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
 static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                    uint64_t features)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int ret;
 
     if (vhost_vdpa_one_time_request(dev)) {
         return 0;
     }
 
+    if (v->shadow_vqs_enabled) {
+        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
+            /*
+             * QEMU is just trying to enable or disable logging. SVQ handles
+             * this separately, so no need to forward this.
+             */
+            v->acked_features = features;
+            return 0;
+        }
+
+        v->acked_features = features;
+
+        /* We must not ack _F_LOG if SVQ is enabled */
+        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
+    }
+
     trace_vhost_vdpa_set_features(dev, features);
     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
     if (ret) {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
 static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                      uint64_t *features)
 {
-    int ret;
+    struct vhost_vdpa *v = dev->opaque;
+    int ret = vhost_vdpa_get_dev_features(dev, features);
+
+    if (ret == 0 && v->shadow_vqs_enabled) {
+        /* Add SVQ logging capabilities */
+        *features |= BIT_ULL(VHOST_F_LOG_ALL);
+    }
 
-    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
-    trace_vhost_vdpa_get_features(dev, *features);
     return ret;
 }
 
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
     bool iotlb_batch_begin_sent;
     MemoryListener listener;
     struct vhost_vdpa_iova_range iova_range;
+    uint64_t acked_features;
     bool shadow_vqs_enabled;
     /* IOVA mapping used by the Shadow Virtqueue */
     VhostIOVATree *iova_tree;
-- 
2.7.4