Series comparison

-[Qemu-devel] [PULL 0/6] Net patches
+[PULL V2 00/25] Net patches
-The following changes since commit e607bbee553cfe73072870cef458cfa4e78133e2:
+The following changes since commit d48125de38f48a61d6423ef6a01156d6dff9ee2c:
-  Merge remote-tracking branch 'remotes/edgar/tags/edgar/xilinx-next-2018-01-26.for-upstream' into staging (2018-01-26 14:24:25 +0000)
+  Merge tag 'kraxel-20220719-pull-request' of https://gitlab.com/kraxel/qemu into staging (2022-07-19 17:40:36 +0100)
 are available in the git repository at:
   https://github.com/jasowang/qemu.git tags/net-pull-request
-for you to fetch changes up to bf4835a4d5338bb7424827715df22570a8adc67c:
+for you to fetch changes up to 8bdab83b34efb0b598be4e5b98e4f466ca5f2f80:
-  MAINTAINERS: update Dmitry Fleytman email (2018-01-29 16:05:38 +0800)
+  net/colo.c: fix segmentation fault when packet is not parsed correctly (2022-07-20 16:58:08 +0800)
 ----------------------------------------------------------------
+Changes since V1:
+- Fix build errors of vhost-vdpa when virtio-net is not set
 ----------------------------------------------------------------
-Mao Zhongyi (2):
+Eugenio Pérez (21):
-      colo: modified the payload compare function
+      vhost: move descriptor translation to vhost_svq_vring_write_descs
-      colo: compare the packet based on the tcp sequence number
+      virtio-net: Expose MAC_TABLE_ENTRIES
       virtio-net: Expose ctrl virtqueue logic
       vdpa: Avoid compiler to squash reads to used idx
       vhost: Reorder vhost_svq_kick
       vhost: Move vhost_svq_kick call to vhost_svq_add
       vhost: Check for queue full at vhost_svq_add
       vhost: Decouple vhost_svq_add from VirtQueueElement
       vhost: Add SVQDescState
       vhost: Track number of descs in SVQDescState
       vhost: add vhost_svq_push_elem
       vhost: Expose vhost_svq_add
       vhost: add vhost_svq_poll
       vhost: Add svq avail_handler callback
       vdpa: Export vhost_vdpa_dma_map and unmap calls
       vhost-net-vdpa: add stubs for when no virtio-net device is present
       vdpa: manual forward CVQ buffers
       vdpa: Buffer CVQ support on shadow virtqueue
       vdpa: Extract get features part from vhost_vdpa_get_max_queue_pairs
       vdpa: Add device migration blocker
       vdpa: Add x-svq to NetdevVhostVDPAOptions
-Philippe Mathieu-Daudé (1):
+Zhang Chen (4):
-      MAINTAINERS: update Dmitry Fleytman email
+      softmmu/runstate.c: add RunStateTransition support from COLO to PRELAUNCH
       net/colo: Fix a "double free" crash to clear the conn_list
       net/colo.c: No need to track conn_list for filter-rewriter
       net/colo.c: fix segmentation fault when packet is not parsed correctly
-Thomas Huth (3):
+ hw/net/virtio-net.c                |  85 +++++----
-      net: Allow hubports to connect to other netdevs
+ hw/virtio/vhost-shadow-virtqueue.c | 210 +++++++++++++++-------
-      net: Allow netdevs to be used with 'hostfwd_add' and 'hostfwd_remove'
+ hw/virtio/vhost-shadow-virtqueue.h |  52 +++++-
-      qemu-doc: Get rid of "vlan=X" example in the documentation
+ hw/virtio/vhost-vdpa.c             |  26 ++-
+ include/hw/virtio/vhost-vdpa.h     |   8 +
- MAINTAINERS        |   8 +-
+ include/hw/virtio/virtio-net.h     |   7 +
- hmp-commands.hx    |   4 +-
+ net/colo-compare.c                 |   2 +-
- net/colo-compare.c | 411 +++++++++++++++++++++++++++++++++--------------------
+ net/colo.c                         |  11 +-
- net/colo.c         |   9 ++
+ net/filter-rewriter.c              |   2 +-
- net/colo.h         |  15 ++
+ net/meson.build                    |   3 +-
- net/hub.c          |  27 +++-
+ net/trace-events                   |   1 +
- net/hub.h          |   3 +-
+ net/vhost-vdpa-stub.c              |  21 +++
- net/net.c          |   2 +-
+ net/vhost-vdpa.c                   | 357 +++++++++++++++++++++++++++++++++++--
- net/slirp.c        |  33 +++--
+ qapi/net.json                      |   9 +-
- net/trace-events   |   2 +-
+ softmmu/runstate.c                 |   1 +
- qapi/net.json      |   4 +-
+files changed, 671 insertions(+), 124 deletions(-)
- qemu-options.hx    |  12 +-
+ create mode 100644 net/vhost-vdpa-stub.c
 files changed, 347 insertions(+), 183 deletions(-)

-New patch
+[PULL V2 01/25] vhost: move descriptor translation to vhost_svq_vring_write_descs
+From: Eugenio Pérez <eperezma@redhat.com>
+It's done for both in and out descriptors so it's better placed here.
+Acked-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 38 +++++++++++++++++++++++++++-----------
+file changed, 27 insertions(+), 11 deletions(-)
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
+     return true;
+ }
+-static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
+-                                    const struct iovec *iovec, size_t num,
+-                                    bool more_descs, bool write)
++/**
++ * Write descriptors to SVQ vring
++ *
++ * @svq: The shadow virtqueue
++ * @sg: Cache for hwaddr
++ * @iovec: The iovec from the guest
++ * @num: iovec length
++ * @more_descs: True if more descriptors come in the chain
++ * @write: True if they are writeable descriptors
++ *
++ * Return true if success, false otherwise and print error.
++ */
++static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
++                                        const struct iovec *iovec, size_t num,
++                                        bool more_descs, bool write)
+ {
+     uint16_t i = svq->free_head, last = svq->free_head;
+     unsigned n;
+     uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
+     vring_desc_t *descs = svq->vring.desc;
++    bool ok;
+     if (num == 0) {
+-        return;
++        return true;
++    }
++
++    ok = vhost_svq_translate_addr(svq, sg, iovec, num);
++    if (unlikely(!ok)) {
++        return false;
+     }
+     for (n = 0; n < num; n++) {
+@@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
+     }
+     svq->free_head = le16_to_cpu(svq->desc_next[last]);
++    return true;
+ }
+ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
+@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
+         return false;
+     }
+-    ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
++    ok = vhost_svq_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
++                                     elem->in_num > 0, false);
+     if (unlikely(!ok)) {
+         return false;
+     }
+-    vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
+-                            elem->in_num > 0, false);
+-
+-    ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
++    ok = vhost_svq_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false,
++                                     true);
+     if (unlikely(!ok)) {
+         return false;
+     }
+-    vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
+-
+     /*
+      * Put the entry in the available array (but don't update avail->idx until
+      * they do sync).
+--
+.7.4

-New patch
+[PULL V2 02/25] virtio-net: Expose MAC_TABLE_ENTRIES
+From: Eugenio Pérez <eperezma@redhat.com>
+vhost-vdpa control virtqueue needs to know the maximum entries supported
+by the virtio-net device, so we know if it is possible to apply the
+filter.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/net/virtio-net.c            | 1 -
+ include/hw/virtio/virtio-net.h | 3 +++
+files changed, 3 insertions(+), 1 deletion(-)
+diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/net/virtio-net.c
++++ b/hw/net/virtio-net.c
+@@ -XXX,XX +XXX,XX @@
+ #define VIRTIO_NET_VM_VERSION    11
+-#define MAC_TABLE_ENTRIES    64
+ #define MAX_VLAN    (1 << 12)   /* Per 802.1Q definition */
+ /* previously fixed value */
+diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/hw/virtio/virtio-net.h
++++ b/include/hw/virtio/virtio-net.h
+@@ -XXX,XX +XXX,XX @@ OBJECT_DECLARE_SIMPLE_TYPE(VirtIONet, VIRTIO_NET)
+  * and latency. */
+ #define TX_BURST 256
++/* Maximum VIRTIO_NET_CTRL_MAC_TABLE_SET unicast + multicast entries. */
++#define MAC_TABLE_ENTRIES    64
++
+ typedef struct virtio_net_conf
+ {
+     uint32_t txtimer;
+--
+.7.4

-[Qemu-devel] [PULL 6/6] MAINTAINERS: update Dmitry Fleytman email
+[PULL V2 03/25] virtio-net: Expose ctrl virtqueue logic
-From: Philippe Mathieu-Daudé <f4bug@amsat.org>
+From: Eugenio Pérez <eperezma@redhat.com>
-gently asked by his automatic reply :)
+This allows external vhost-net devices to modify the state of the
 VirtIO device model once the vhost-vdpa device has acknowledged the
 control commands.
-Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
 Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- MAINTAINERS | 8 ++++----
+ hw/net/virtio-net.c            | 84 ++++++++++++++++++++++++------------------
-file changed, 4 insertions(+), 4 deletions(-)
+ include/hw/virtio/virtio-net.h |  4 ++
 files changed, 53 insertions(+), 35 deletions(-)
-diff --git a/MAINTAINERS b/MAINTAINERS
+diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
 index XXXXXXX..XXXXXXX 100644
---- a/MAINTAINERS
+--- a/hw/net/virtio-net.c
-+++ b/MAINTAINERS
++++ b/hw/net/virtio-net.c
-@@ -XXX,XX +XXX,XX @@ F: hw/scsi/mfi.h
+@@ -XXX,XX +XXX,XX @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd,
- F: tests/megasas-test.c
+     return VIRTIO_NET_OK;
+ }
- Network packet abstractions
--M: Dmitry Fleytman <dmitry@daynix.com>
+-static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
-+M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
++size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
- S: Maintained
++                                  const struct iovec *in_sg, unsigned in_num,
- F: include/net/eth.h
++                                  const struct iovec *out_sg,
- F: net/eth.c
++                                  unsigned out_num)
-@@ -XXX,XX +XXX,XX @@ F: hw/net/net_rx_pkt*
+ {
- F: hw/net/net_tx_pkt*
+     VirtIONet *n = VIRTIO_NET(vdev);
+     struct virtio_net_ctrl_hdr ctrl;
- Vmware
+     virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
--M: Dmitry Fleytman <dmitry@daynix.com>
+-    VirtQueueElement *elem;
-+M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
+     size_t s;
- S: Maintained
+     struct iovec *iov, *iov2;
- F: hw/net/vmxnet*
+-    unsigned int iov_cnt;
- F: hw/scsi/vmw_pvscsi*
++
-@@ -XXX,XX +XXX,XX @@ F: hw/mem/nvdimm.c
++    if (iov_size(in_sg, in_num) < sizeof(status) ||
- F: include/hw/mem/nvdimm.h
++        iov_size(out_sg, out_num) < sizeof(ctrl)) {
++        virtio_error(vdev, "virtio-net ctrl missing headers");
- e1000x
++        return 0;
--M: Dmitry Fleytman <dmitry@daynix.com>
++    }
-+M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
++
- S: Maintained
++    iov2 = iov = g_memdup2(out_sg, sizeof(struct iovec) * out_num);
- F: hw/net/e1000x*
++    s = iov_to_buf(iov, out_num, 0, &ctrl, sizeof(ctrl));
++    iov_discard_front(&iov, &out_num, sizeof(ctrl));
- e1000e
++    if (s != sizeof(ctrl)) {
--M: Dmitry Fleytman <dmitry@daynix.com>
++        status = VIRTIO_NET_ERR;
-+M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
++    } else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
- S: Maintained
++        status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, out_num);
- F: hw/net/e1000e*
++    } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
 +        status = virtio_net_handle_mac(n, ctrl.cmd, iov, out_num);
 +    } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
 +        status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, out_num);
 +    } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
 +        status = virtio_net_handle_announce(n, ctrl.cmd, iov, out_num);
 +    } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
 +        status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
 +    } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
 +        status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
 +    }
 +
 +    s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
 +    assert(s == sizeof(status));
 +
 +    g_free(iov2);
 +    return sizeof(status);
 +}
 +
 +static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
 +{
 +    VirtQueueElement *elem;
      for (;;) {
 +        size_t written;
          elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
          if (!elem) {
              break;
          }
 -        if (iov_size(elem->in_sg, elem->in_num) < sizeof(status) ||
 -            iov_size(elem->out_sg, elem->out_num) < sizeof(ctrl)) {
 -            virtio_error(vdev, "virtio-net ctrl missing headers");
 +
 +        written = virtio_net_handle_ctrl_iov(vdev, elem->in_sg, elem->in_num,
 +                                             elem->out_sg, elem->out_num);
 +        if (written > 0) {
 +            virtqueue_push(vq, elem, written);
 +            virtio_notify(vdev, vq);
 +            g_free(elem);
 +        } else {
              virtqueue_detach_element(vq, elem, 0);
              g_free(elem);
              break;
          }
 -
 -        iov_cnt = elem->out_num;
 -        iov2 = iov = g_memdup2(elem->out_sg,
 -                               sizeof(struct iovec) * elem->out_num);
 -        s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl));
 -        iov_discard_front(&iov, &iov_cnt, sizeof(ctrl));
 -        if (s != sizeof(ctrl)) {
 -            status = VIRTIO_NET_ERR;
 -        } else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
 -            status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, iov_cnt);
 -        } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
 -            status = virtio_net_handle_mac(n, ctrl.cmd, iov, iov_cnt);
 -        } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
 -            status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, iov_cnt);
 -        } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
 -            status = virtio_net_handle_announce(n, ctrl.cmd, iov, iov_cnt);
 -        } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
 -            status = virtio_net_handle_mq(n, ctrl.cmd, iov, iov_cnt);
 -        } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
 -            status = virtio_net_handle_offloads(n, ctrl.cmd, iov, iov_cnt);
 -        }
 -
 -        s = iov_from_buf(elem->in_sg, elem->in_num, 0, &status, sizeof(status));
 -        assert(s == sizeof(status));
 -
 -        virtqueue_push(vq, elem, sizeof(status));
 -        virtio_notify(vdev, vq);
 -        g_free(iov2);
 -        g_free(elem);
      }
  }
 diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/virtio/virtio-net.h
 +++ b/include/hw/virtio/virtio-net.h
@@ -XXX,XX +XXX,XX @@ struct VirtIONet {
      struct EBPFRSSContext ebpf_rss;
  };
 +size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
 +                                  const struct iovec *in_sg, unsigned in_num,
 +                                  const struct iovec *out_sg,
 +                                  unsigned out_num);
  void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
                                     const char *type);
 --
 .7.4

-New patch
+[PULL V2 04/25] vdpa: Avoid compiler to squash reads to used idx
+From: Eugenio Pérez <eperezma@redhat.com>
+In the next patch we will allow busypolling of this value. The compiler
+has a running path where shadow_used_idx, last_used_idx, and vring used
+idx are not modified within the same thread busypolling.
+This was not an issue before since we always cleared device event
+notifier before checking it, and that could act as memory barrier.
+However, the busypoll needs something similar to kernel READ_ONCE.
+Let's add it here, separated from the polling.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 3 ++-
+file changed, 2 insertions(+), 1 deletion(-)
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick_notifier(EventNotifier *n)
+ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
+ {
++    uint16_t *used_idx = &svq->vring.used->idx;
+     if (svq->last_used_idx != svq->shadow_used_idx) {
+         return true;
+     }
+-    svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
++    svq->shadow_used_idx = cpu_to_le16(*(volatile uint16_t *)used_idx);
+     return svq->last_used_idx != svq->shadow_used_idx;
+ }
+--
+.7.4

-New patch
+[PULL V2 05/25] vhost: Reorder vhost_svq_kick
+From: Eugenio Pérez <eperezma@redhat.com>
+Future code needs to call it from vhost_svq_add.
+No functional change intended.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 28 ++++++++++++++--------------
+file changed, 14 insertions(+), 14 deletions(-)
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
+     return true;
+ }
++static void vhost_svq_kick(VhostShadowVirtqueue *svq)
++{
++    /*
++     * We need to expose the available array entries before checking the used
++     * flags
++     */
++    smp_mb();
++    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
++        return;
++    }
++
++    event_notifier_set(&svq->hdev_kick);
++}
++
+ /**
+  * Add an element to a SVQ.
+  *
+@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+     return true;
+ }
+-static void vhost_svq_kick(VhostShadowVirtqueue *svq)
+-{
+-    /*
+-     * We need to expose the available array entries before checking the used
+-     * flags
+-     */
+-    smp_mb();
+-    if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
+-        return;
+-    }
+-
+-    event_notifier_set(&svq->hdev_kick);
+-}
+-
+ /**
+  * Forward available buffers.
+  *
+--
+.7.4

-New patch
+[PULL V2 06/25] vhost: Move vhost_svq_kick call to vhost_svq_add
+From: Eugenio Pérez <eperezma@redhat.com>
+The series needs to expose vhost_svq_add with full functionality,
+including kick
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 2 +-
+file changed, 1 insertion(+), 1 deletion(-)
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+     }
+     svq->ring_id_maps[qemu_head] = elem;
++    vhost_svq_kick(svq);
+     return true;
+ }
+@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
+                 /* VQ is broken, just return and ignore any other kicks */
+                 return;
+             }
+-            vhost_svq_kick(svq);
+         }
+         virtio_queue_set_notification(svq->vq, true);
+--
+.7.4

-New patch
+[PULL V2 07/25] vhost: Check for queue full at vhost_svq_add
+From: Eugenio Pérez <eperezma@redhat.com>
+The series needs to expose vhost_svq_add with full functionality,
+including checking for full queue.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 59 +++++++++++++++++++++-----------------
+file changed, 33 insertions(+), 26 deletions(-)
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -XXX,XX +XXX,XX @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
+  * Add an element to a SVQ.
+  *
+  * The caller must check that there is enough slots for the new element. It
+- * takes ownership of the element: In case of failure, it is free and the SVQ
+- * is considered broken.
++ * takes ownership of the element: In case of failure not ENOSPC, it is free.
++ *
++ * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full
+  */
+-static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
++static int vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+ {
+     unsigned qemu_head;
+-    bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
++    unsigned ndescs = elem->in_num + elem->out_num;
++    bool ok;
++
++    if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
++        return -ENOSPC;
++    }
++
++    ok = vhost_svq_add_split(svq, elem, &qemu_head);
+     if (unlikely(!ok)) {
+         g_free(elem);
+-        return false;
++        return -EINVAL;
+     }
+     svq->ring_id_maps[qemu_head] = elem;
+     vhost_svq_kick(svq);
+-    return true;
++    return 0;
+ }
+ /**
+@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
+         while (true) {
+             VirtQueueElement *elem;
+-            bool ok;
++            int r;
+             if (svq->next_guest_avail_elem) {
+                 elem = g_steal_pointer(&svq->next_guest_avail_elem);
+@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
+                 break;
+             }
+-            if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) {
+-                /*
+-                 * This condition is possible since a contiguous buffer in GPA
+-                 * does not imply a contiguous buffer in qemu's VA
+-                 * scatter-gather segments. If that happens, the buffer exposed
+-                 * to the device needs to be a chain of descriptors at this
+-                 * moment.
+-                 *
+-                 * SVQ cannot hold more available buffers if we are here:
+-                 * queue the current guest descriptor and ignore further kicks
+-                 * until some elements are used.
+-                 */
+-                svq->next_guest_avail_elem = elem;
+-                return;
+-            }
+-
+-            ok = vhost_svq_add(svq, elem);
+-            if (unlikely(!ok)) {
+-                /* VQ is broken, just return and ignore any other kicks */
++            r = vhost_svq_add(svq, elem);
++            if (unlikely(r != 0)) {
++                if (r == -ENOSPC) {
++                    /*
++                     * This condition is possible since a contiguous buffer in
++                     * GPA does not imply a contiguous buffer in qemu's VA
++                     * scatter-gather segments. If that happens, the buffer
++                     * exposed to the device needs to be a chain of descriptors
++                     * at this moment.
++                     *
++                     * SVQ cannot hold more available buffers if we are here:
++                     * queue the current guest descriptor and ignore kicks
++                     * until some elements are used.
++                     */
++                    svq->next_guest_avail_elem = elem;
++                }
++
++                /* VQ is full or broken, just return and ignore kicks */
+                 return;
+             }
+         }
+--
+.7.4

-New patch
+[PULL V2 08/25] vhost: Decouple vhost_svq_add from VirtQueueElement
+From: Eugenio Pérez <eperezma@redhat.com>
+VirtQueueElement comes from the guest, but we're heading SVQ to be able
+to modify the element presented to the device without the guest's
+knowledge.
+To do so, make SVQ accept sg buffers directly, instead of using
+VirtQueueElement.
+Add vhost_svq_add_element to maintain element convenience.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 33 ++++++++++++++++++++++-----------
+file changed, 22 insertions(+), 11 deletions(-)
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
+ }
+ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
+-                                VirtQueueElement *elem, unsigned *head)
++                                const struct iovec *out_sg, size_t out_num,
++                                const struct iovec *in_sg, size_t in_num,
++                                unsigned *head)
+ {
+     unsigned avail_idx;
+     vring_avail_t *avail = svq->vring.avail;
+     bool ok;
+-    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num));
++    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));
+     *head = svq->free_head;
+     /* We need some descriptors here */
+-    if (unlikely(!elem->out_num && !elem->in_num)) {
++    if (unlikely(!out_num && !in_num)) {
+         qemu_log_mask(LOG_GUEST_ERROR,
+                       "Guest provided element with no descriptors");
+         return false;
+     }
+-    ok = vhost_svq_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
+-                                     elem->in_num > 0, false);
++    ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0,
++                                     false);
+     if (unlikely(!ok)) {
+         return false;
+     }
+-    ok = vhost_svq_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false,
+-                                     true);
++    ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true);
+     if (unlikely(!ok)) {
+         return false;
+     }
+@@ -XXX,XX +XXX,XX @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
+  *
+  * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full
+  */
+-static int vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
++static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
++                          size_t out_num, const struct iovec *in_sg,
++                          size_t in_num, VirtQueueElement *elem)
+ {
+     unsigned qemu_head;
+-    unsigned ndescs = elem->in_num + elem->out_num;
++    unsigned ndescs = in_num + out_num;
+     bool ok;
+     if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
+         return -ENOSPC;
+     }
+-    ok = vhost_svq_add_split(svq, elem, &qemu_head);
++    ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head);
+     if (unlikely(!ok)) {
+         g_free(elem);
+         return -EINVAL;
+@@ -XXX,XX +XXX,XX @@ static int vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+     return 0;
+ }
++/* Convenience wrapper to add a guest's element to SVQ */
++static int vhost_svq_add_element(VhostShadowVirtqueue *svq,
++                                 VirtQueueElement *elem)
++{
++    return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg,
++                         elem->in_num, elem);
++}
++
+ /**
+  * Forward available buffers.
+  *
+@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
+                 break;
+             }
+-            r = vhost_svq_add(svq, elem);
++            r = vhost_svq_add_element(svq, elem);
+             if (unlikely(r != 0)) {
+                 if (r == -ENOSPC) {
+                     /*
+--
+.7.4

-New patch
+[PULL V2 09/25] vhost: Add SVQDescState
+From: Eugenio Pérez <eperezma@redhat.com>
+This will allow SVQ to add context to the different queue elements.
+This patch only stores the actual element, no functional change intended.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 16 ++++++++--------
+ hw/virtio/vhost-shadow-virtqueue.h |  8 ++++++--
+files changed, 14 insertions(+), 10 deletions(-)
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -XXX,XX +XXX,XX @@ static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
+         return -EINVAL;
+     }
+-    svq->ring_id_maps[qemu_head] = elem;
++    svq->desc_state[qemu_head].elem = elem;
+     vhost_svq_kick(svq);
+     return 0;
+ }
+@@ -XXX,XX +XXX,XX @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
+         return NULL;
+     }
+-    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
++    if (unlikely(!svq->desc_state[used_elem.id].elem)) {
+         qemu_log_mask(LOG_GUEST_ERROR,
+             "Device %s says index %u is used, but it was not available",
+             svq->vdev->name, used_elem.id);
+         return NULL;
+     }
+-    num = svq->ring_id_maps[used_elem.id]->in_num +
+-          svq->ring_id_maps[used_elem.id]->out_num;
++    num = svq->desc_state[used_elem.id].elem->in_num +
++          svq->desc_state[used_elem.id].elem->out_num;
+     last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
+     svq->desc_next[last_used_chain] = svq->free_head;
+     svq->free_head = used_elem.id;
+     *len = used_elem.len;
+-    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
++    return g_steal_pointer(&svq->desc_state[used_elem.id].elem);
+ }
+ static void vhost_svq_flush(VhostShadowVirtqueue *svq,
+@@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
+     memset(svq->vring.desc, 0, driver_size);
+     svq->vring.used = qemu_memalign(qemu_real_host_page_size(), device_size);
+     memset(svq->vring.used, 0, device_size);
+-    svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
++    svq->desc_state = g_new0(SVQDescState, svq->vring.num);
+     svq->desc_next = g_new0(uint16_t, svq->vring.num);
+     for (unsigned i = 0; i < svq->vring.num - 1; i++) {
+         svq->desc_next[i] = cpu_to_le16(i + 1);
+@@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
+     for (unsigned i = 0; i < svq->vring.num; ++i) {
+         g_autofree VirtQueueElement *elem = NULL;
+-        elem = g_steal_pointer(&svq->ring_id_maps[i]);
++        elem = g_steal_pointer(&svq->desc_state[i].elem);
+         if (elem) {
+             virtqueue_detach_element(svq->vq, elem, 0);
+         }
+@@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
+     }
+     svq->vq = NULL;
+     g_free(svq->desc_next);
+-    g_free(svq->ring_id_maps);
++    g_free(svq->desc_state);
+     qemu_vfree(svq->vring.desc);
+     qemu_vfree(svq->vring.used);
+ }
+diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.h
++++ b/hw/virtio/vhost-shadow-virtqueue.h
+@@ -XXX,XX +XXX,XX @@
+ #include "standard-headers/linux/vhost_types.h"
+ #include "hw/virtio/vhost-iova-tree.h"
++typedef struct SVQDescState {
++    VirtQueueElement *elem;
++} SVQDescState;
++
+ /* Shadow virtqueue to relay notifications */
+ typedef struct VhostShadowVirtqueue {
+     /* Shadow vring */
+@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
+     /* IOVA mapping */
+     VhostIOVATree *iova_tree;
+-    /* Map for use the guest's descriptors */
+-    VirtQueueElement **ring_id_maps;
++    /* SVQ vring descriptors state */
++    SVQDescState *desc_state;
+     /* Next VirtQueue element that guest made available */
+     VirtQueueElement *next_guest_avail_elem;
+--
+.7.4

-New patch
+[PULL V2 10/25] vhost: Track number of descs in SVQDescState
+From: Eugenio Pérez <eperezma@redhat.com>
+A guest's buffer contiguous on GPA may need multiple descriptors on
+qemu's VA, so SVQ should track its length separately.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 4 ++--
+ hw/virtio/vhost-shadow-virtqueue.h | 6 ++++++
+files changed, 8 insertions(+), 2 deletions(-)
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -XXX,XX +XXX,XX @@ static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
+     }
+     svq->desc_state[qemu_head].elem = elem;
++    svq->desc_state[qemu_head].ndescs = ndescs;
+     vhost_svq_kick(svq);
+     return 0;
+ }
+@@ -XXX,XX +XXX,XX @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
+         return NULL;
+     }
+-    num = svq->desc_state[used_elem.id].elem->in_num +
+-          svq->desc_state[used_elem.id].elem->out_num;
++    num = svq->desc_state[used_elem.id].ndescs;
+     last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
+     svq->desc_next[last_used_chain] = svq->free_head;
+     svq->free_head = used_elem.id;
+diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.h
++++ b/hw/virtio/vhost-shadow-virtqueue.h
+@@ -XXX,XX +XXX,XX @@
+ typedef struct SVQDescState {
+     VirtQueueElement *elem;
++
++    /*
++     * Number of descriptors exposed to the device. May or may not match
++     * guest's
++     */
++    unsigned int ndescs;
+ } SVQDescState;
+ /* Shadow virtqueue to relay notifications */
+--
+.7.4

-New patch
+[PULL V2 11/25] vhost: add vhost_svq_push_elem
+From: Eugenio Pérez <eperezma@redhat.com>
+This function allows external SVQ users to return guest's available
+buffers.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 16 ++++++++++++++++
+ hw/virtio/vhost-shadow-virtqueue.h |  3 +++
+files changed, 19 insertions(+)
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -XXX,XX +XXX,XX @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
+     return g_steal_pointer(&svq->desc_state[used_elem.id].elem);
+ }
++/**
++ * Push an element to SVQ, returning it to the guest.
++ */
++void vhost_svq_push_elem(VhostShadowVirtqueue *svq,
++                         const VirtQueueElement *elem, uint32_t len)
++{
++    virtqueue_push(svq->vq, elem, len);
++    if (svq->next_guest_avail_elem) {
++        /*
++         * Avail ring was full when vhost_svq_flush was called, so it's a
++         * good moment to make more descriptors available if possible.
++         */
++        vhost_handle_guest_kick(svq);
++    }
++}
++
+ static void vhost_svq_flush(VhostShadowVirtqueue *svq,
+                             bool check_for_avail_queue)
+ {
+diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.h
++++ b/hw/virtio/vhost-shadow-virtqueue.h
+@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
+ bool vhost_svq_valid_features(uint64_t features, Error **errp);
++void vhost_svq_push_elem(VhostShadowVirtqueue *svq,
++                         const VirtQueueElement *elem, uint32_t len);
++
+ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
+ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
+ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
+--
+.7.4

-New patch
+[PULL V2 12/25] vhost: Expose vhost_svq_add
+From: Eugenio Pérez <eperezma@redhat.com>
+This allows external parts of SVQ to forward custom buffers to the
+device.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 6 +++---
+ hw/virtio/vhost-shadow-virtqueue.h | 3 +++
+files changed, 6 insertions(+), 3 deletions(-)
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -XXX,XX +XXX,XX @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
+  *
+  * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full
+  */
+-static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
+-                          size_t out_num, const struct iovec *in_sg,
+-                          size_t in_num, VirtQueueElement *elem)
++int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
++                  size_t out_num, const struct iovec *in_sg, size_t in_num,
++                  VirtQueueElement *elem)
+ {
+     unsigned qemu_head;
+     unsigned ndescs = in_num + out_num;
+diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.h
++++ b/hw/virtio/vhost-shadow-virtqueue.h
+@@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp);
+ void vhost_svq_push_elem(VhostShadowVirtqueue *svq,
+                          const VirtQueueElement *elem, uint32_t len);
++int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
++                  size_t out_num, const struct iovec *in_sg, size_t in_num,
++                  VirtQueueElement *elem);
+ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
+ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
+--
+.7.4

-New patch
+[PULL V2 13/25] vhost: add vhost_svq_poll
+From: Eugenio Pérez <eperezma@redhat.com>
+It allows the Shadow Control VirtQueue to wait for the device to use the
+available buffers.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 27 +++++++++++++++++++++++++++
+ hw/virtio/vhost-shadow-virtqueue.h |  1 +
+files changed, 28 insertions(+)
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -XXX,XX +XXX,XX @@ static void vhost_svq_flush(VhostShadowVirtqueue *svq,
+ }
+ /**
++ * Poll the SVQ for one device used buffer.
++ *
++ * This function races with main event loop SVQ polling, so extra
++ * synchronization is needed.
++ *
++ * Return the length written by the device.
++ */
++size_t vhost_svq_poll(VhostShadowVirtqueue *svq)
++{
++    int64_t start_us = g_get_monotonic_time();
++    do {
++        uint32_t len;
++        VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
++        if (elem) {
++            return len;
++        }
++
++        if (unlikely(g_get_monotonic_time() - start_us > 10e6)) {
++            return 0;
++        }
++
++        /* Make sure we read new used_idx */
++        smp_rmb();
++    } while (true);
++}
++
++/**
+  * Forward used buffers.
+  *
+  * @n: hdev call event notifier, the one that device set to notify svq.
+diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.h
++++ b/hw/virtio/vhost-shadow-virtqueue.h
+@@ -XXX,XX +XXX,XX @@ void vhost_svq_push_elem(VhostShadowVirtqueue *svq,
+ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
+                   size_t out_num, const struct iovec *in_sg, size_t in_num,
+                   VirtQueueElement *elem);
++size_t vhost_svq_poll(VhostShadowVirtqueue *svq);
+ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
+ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
+--
+.7.4

-New patch
+[PULL V2 14/25] vhost: Add svq avail_handler callback
+From: Eugenio Pérez <eperezma@redhat.com>
+This allows external handlers to be aware of new buffers that the guest
+places in the virtqueue.
+When this callback is defined the ownership of the guest's virtqueue
+element is transferred to the callback. This means that if the user
+wants to forward the descriptor it needs to manually inject it. The
+callback is also free to process the command by itself and use the
+element with svq_push.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-shadow-virtqueue.c | 14 ++++++++++++--
+ hw/virtio/vhost-shadow-virtqueue.h | 31 ++++++++++++++++++++++++++++++-
+ hw/virtio/vhost-vdpa.c             |  3 ++-
+files changed, 44 insertions(+), 4 deletions(-)
+diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.c
++++ b/hw/virtio/vhost-shadow-virtqueue.c
+@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
+                 break;
+             }
+-            r = vhost_svq_add_element(svq, elem);
++            if (svq->ops) {
++                r = svq->ops->avail_handler(svq, elem, svq->ops_opaque);
++            } else {
++                r = vhost_svq_add_element(svq, elem);
++            }
+             if (unlikely(r != 0)) {
+                 if (r == -ENOSPC) {
+                     /*
+@@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
+  * shadow methods and file descriptors.
+  *
+  * @iova_tree: Tree to perform descriptors translations
++ * @ops: SVQ owner callbacks
++ * @ops_opaque: ops opaque pointer
+  *
+  * Returns the new virtqueue or NULL.
+  *
+  * In case of error, reason is reported through error_report.
+  */
+-VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
++VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
++                                    const VhostShadowVirtqueueOps *ops,
++                                    void *ops_opaque)
+ {
+     g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
+     int r;
+@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
+     event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
+     event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
+     svq->iova_tree = iova_tree;
++    svq->ops = ops;
++    svq->ops_opaque = ops_opaque;
+     return g_steal_pointer(&svq);
+ err_init_hdev_call:
+diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-shadow-virtqueue.h
++++ b/hw/virtio/vhost-shadow-virtqueue.h
+@@ -XXX,XX +XXX,XX @@ typedef struct SVQDescState {
+     unsigned int ndescs;
+ } SVQDescState;
++typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
++
++/**
++ * Callback to handle an avail buffer.
++ *
++ * @svq:  Shadow virtqueue
++ * @elem:  Element placed in the queue by the guest
++ * @vq_callback_opaque:  Opaque
++ *
++ * Returns 0 if the vq is running as expected.
++ *
++ * Note that ownership of elem is transferred to the callback.
++ */
++typedef int (*VirtQueueAvailCallback)(VhostShadowVirtqueue *svq,
++                                      VirtQueueElement *elem,
++                                      void *vq_callback_opaque);
++
++typedef struct VhostShadowVirtqueueOps {
++    VirtQueueAvailCallback avail_handler;
++} VhostShadowVirtqueueOps;
++
+ /* Shadow virtqueue to relay notifications */
+ typedef struct VhostShadowVirtqueue {
+     /* Shadow vring */
+@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
+      */
+     uint16_t *desc_next;
++    /* Caller callbacks */
++    const VhostShadowVirtqueueOps *ops;
++
++    /* Caller callbacks opaque */
++    void *ops_opaque;
++
+     /* Next head to expose to the device */
+     uint16_t shadow_avail_idx;
+@@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
+                      VirtQueue *vq);
+ void vhost_svq_stop(VhostShadowVirtqueue *svq);
+-VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
++VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
++                                    const VhostShadowVirtqueueOps *ops,
++                                    void *ops_opaque);
+ void vhost_svq_free(gpointer vq);
+ G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
+diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-vdpa.c
++++ b/hw/virtio/vhost-vdpa.c
+@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
+     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
+     for (unsigned n = 0; n < hdev->nvqs; ++n) {
+-        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
++        g_autoptr(VhostShadowVirtqueue) svq;
++        svq = vhost_svq_new(v->iova_tree, NULL, NULL);
+         if (unlikely(!svq)) {
+             error_setg(errp, "Cannot create svq %u", n);
+             return -1;
+--
+.7.4

-New patch
+[PULL V2 15/25] vdpa: Export vhost_vdpa_dma_map and unmap calls
+From: Eugenio Pérez <eperezma@redhat.com>
+Shadow CVQ will copy buffers on qemu VA, so we avoid TOCTOU attacks from
+the guest that could set a different state in qemu device model and vdpa
+device.
+To do so, it needs to be able to map these new buffers to the device.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ hw/virtio/vhost-vdpa.c         | 7 +++----
+ include/hw/virtio/vhost-vdpa.h | 4 ++++
+files changed, 7 insertions(+), 4 deletions(-)
+diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
+index XXXXXXX..XXXXXXX 100644
+--- a/hw/virtio/vhost-vdpa.c
++++ b/hw/virtio/vhost-vdpa.c
+@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
+     return false;
+ }
+-static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
+-                              void *vaddr, bool readonly)
++int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
++                       void *vaddr, bool readonly)
+ {
+     struct vhost_msg_v2 msg = {};
+     int fd = v->device_fd;
+@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
+     return ret;
+ }
+-static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,
+-                                hwaddr size)
++int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
+ {
+     struct vhost_msg_v2 msg = {};
+     int fd = v->device_fd;
+diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/hw/virtio/vhost-vdpa.h
++++ b/include/hw/virtio/vhost-vdpa.h
+@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
+     VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
+ } VhostVDPA;
++int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
++                       void *vaddr, bool readonly);
++int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size);
++
+ #endif
+--
+.7.4

-New patch
+[PULL V2 16/25] vhost-net-vdpa: add stubs for when no virtio-net device is present
+From: Eugenio Pérez <eperezma@redhat.com>
+net/vhost-vdpa.c will need functions that are declared in
+vhost-shadow-virtqueue.c, which in turn needs functions of virtio-net.c.
+Copy the vhost-vdpa-stub.c code so only the constructor
+net_init_vhost_vdpa needs to be defined.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ net/meson.build       |  3 ++-
+ net/vhost-vdpa-stub.c | 21 +++++++++++++++++++++
+files changed, 23 insertions(+), 1 deletion(-)
+ create mode 100644 net/vhost-vdpa-stub.c
+diff --git a/net/meson.build b/net/meson.build
+index XXXXXXX..XXXXXXX 100644
+--- a/net/meson.build
++++ b/net/meson.build
+@@ -XXX,XX +XXX,XX @@ endif
+ softmmu_ss.add(when: 'CONFIG_POSIX', if_true: files(tap_posix))
+ softmmu_ss.add(when: 'CONFIG_WIN32', if_true: files('tap-win32.c'))
+ if have_vhost_net_vdpa
+-  softmmu_ss.add(files('vhost-vdpa.c'))
++  softmmu_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-vdpa.c'), if_false: files('vhost-vdpa-stub.c'))
++  softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-vdpa-stub.c'))
+ endif
+ vmnet_files = files(
+diff --git a/net/vhost-vdpa-stub.c b/net/vhost-vdpa-stub.c
+new file mode 100644
+index XXXXXXX..XXXXXXX
+--- /dev/null
++++ b/net/vhost-vdpa-stub.c
+@@ -XXX,XX +XXX,XX @@
++/*
++ * vhost-vdpa-stub.c
++ *
++ * Copyright (c) 2022 Red Hat, Inc.
++ *
++ * This work is licensed under the terms of the GNU GPL, version 2 or later.
++ * See the COPYING file in the top-level directory.
++ *
++ */
++
++#include "qemu/osdep.h"
++#include "clients.h"
++#include "net/vhost-vdpa.h"
++#include "qapi/error.h"
++
++int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
++                        NetClientState *peer, Error **errp)
++{
++    error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
++    return -1;
++}
+--
+.7.4

-[Qemu-devel] [PULL 5/6] qemu-doc: Get rid of "vlan=X" example in the documentation
+[PULL V2 17/25] vdpa: manual forward CVQ buffers
-From: Thomas Huth <thuth@redhat.com>
+From: Eugenio Pérez <eperezma@redhat.com>
-The vlan concept is marked as deprecated, so we should not use
+Do a simple forwarding of CVQ buffers, the same work SVQ could do but
-this for examples in the documentation anymore.
+through callbacks. No functional change intended.
-Signed-off-by: Thomas Huth <thuth@redhat.com>
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
 Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- qemu-options.hx | 4 ++--
+ hw/virtio/vhost-vdpa.c         |  3 ++-
-file changed, 2 insertions(+), 2 deletions(-)
+ include/hw/virtio/vhost-vdpa.h |  3 +++
  net/vhost-vdpa.c               | 58 ++++++++++++++++++++++++++++++++++++++++++
 files changed, 63 insertions(+), 1 deletion(-)
-diff --git a/qemu-options.hx b/qemu-options.hx
+diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
---- a/qemu-options.hx
+--- a/hw/virtio/vhost-vdpa.c
-+++ b/qemu-options.hx
++++ b/hw/virtio/vhost-vdpa.c
-@@ -XXX,XX +XXX,XX @@ qemu-system-i386 linux.img -net nic -net tap
+@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
- #launch a QEMU instance with two NICs, each one connected
+     for (unsigned n = 0; n < hdev->nvqs; ++n) {
- #to a TAP device
+         g_autoptr(VhostShadowVirtqueue) svq;
- qemu-system-i386 linux.img \
--                 -net nic,vlan=0 -net tap,vlan=0,ifname=tap0 \
+-        svq = vhost_svq_new(v->iova_tree, NULL, NULL);
--                 -net nic,vlan=1 -net tap,vlan=1,ifname=tap1
++        svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops,
-+        -netdev tap,id=nd0,ifname=tap0 -device e1000,netdev=nd0 \
++                            v->shadow_vq_ops_opaque);
-+        -netdev tap,id=nd1,ifname=tap1 -device rtl8139,netdev=nd1
+         if (unlikely(!svq)) {
- @end example
+             error_setg(errp, "Cannot create svq %u", n);
+             return -1;
- @example
+diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/virtio/vhost-vdpa.h
 +++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@
  #include <gmodule.h>
  #include "hw/virtio/vhost-iova-tree.h"
 +#include "hw/virtio/vhost-shadow-virtqueue.h"
  #include "hw/virtio/virtio.h"
  #include "standard-headers/linux/vhost_types.h"
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
      /* IOVA mapping used by the Shadow Virtqueue */
      VhostIOVATree *iova_tree;
      GPtrArray *shadow_vqs;
 +    const VhostShadowVirtqueueOps *shadow_vq_ops;
 +    void *shadow_vq_ops_opaque;
      struct vhost_dev *dev;
      VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
  } VhostVDPA;
 diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
 --- a/net/vhost-vdpa.c
 +++ b/net/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "clients.h"
 +#include "hw/virtio/virtio-net.h"
  #include "net/vhost_net.h"
  #include "net/vhost-vdpa.h"
  #include "hw/virtio/vhost-vdpa.h"
  #include "qemu/config-file.h"
  #include "qemu/error-report.h"
 +#include "qemu/log.h"
 +#include "qemu/memalign.h"
  #include "qemu/option.h"
  #include "qapi/error.h"
  #include <linux/vhost.h>
@@ -XXX,XX +XXX,XX @@ static NetClientInfo net_vhost_vdpa_info = {
          .check_peer_type = vhost_vdpa_check_peer_type,
  };
 +/**
 + * Forward buffer for the moment.
 + */
 +static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
 +                                            VirtQueueElement *elem,
 +                                            void *opaque)
 +{
 +    unsigned int n = elem->out_num + elem->in_num;
 +    g_autofree struct iovec *dev_buffers = g_new(struct iovec, n);
 +    size_t in_len, dev_written;
 +    virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
 +    int r;
 +
 +    memcpy(dev_buffers, elem->out_sg, elem->out_num);
 +    memcpy(dev_buffers + elem->out_num, elem->in_sg, elem->in_num);
 +
 +    r = vhost_svq_add(svq, &dev_buffers[0], elem->out_num, &dev_buffers[1],
 +                      elem->in_num, elem);
 +    if (unlikely(r != 0)) {
 +        if (unlikely(r == -ENOSPC)) {
 +            qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
 +                          __func__);
 +        }
 +        goto out;
 +    }
 +
 +    /*
 +     * We can poll here since we've had BQL from the time we sent the
 +     * descriptor. Also, we need to take the answer before SVQ pulls by itself,
 +     * when BQL is released
 +     */
 +    dev_written = vhost_svq_poll(svq);
 +    if (unlikely(dev_written < sizeof(status))) {
 +        error_report("Insufficient written data (%zu)", dev_written);
 +    }
 +
 +out:
 +    in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
 +                          sizeof(status));
 +    if (unlikely(in_len < sizeof(status))) {
 +        error_report("Bad device CVQ written length");
 +    }
 +    vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
 +    g_free(elem);
 +    return r;
 +}
 +
 +static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
 +    .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
 +};
 +
  static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                             const char *device,
                                             const char *name,
@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
      s->vhost_vdpa.device_fd = vdpa_device_fd;
      s->vhost_vdpa.index = queue_pair_index;
 +    if (!is_datapath) {
 +        s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
 +        s->vhost_vdpa.shadow_vq_ops_opaque = s;
 +    }
      ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
      if (ret) {
          qemu_del_net_client(nc);
 --
 .7.4

-[Qemu-devel] [PULL 4/6] net: Allow netdevs to be used with 'hostfwd_add' and 'hostfwd_remove'
+[PULL V2 18/25] vdpa: Buffer CVQ support on shadow virtqueue
-From: Thomas Huth <thuth@redhat.com>
+From: Eugenio Pérez <eperezma@redhat.com>
-It does not make much sense to limit these commands to the legacy 'vlan'
+Introduce the control virtqueue support for vDPA shadow virtqueue. This
-concept only, they should work with the modern netdevs, too. So now
+is needed for advanced networking features like rx filtering.
-it is possible to use this command with one, two or three parameters.
+Virtio-net control VQ copies the descriptors to qemu's VA, so we avoid
-With one parameter, the command installs a hostfwd rule on the default
+TOCTOU with the guest's or device's memory every time there is a device
-"user" network:
+model change.  Otherwise, the guest could change the memory content in
-    hostfwd_add tcp:...
+the time between qemu and the device reading it.
-With two parameters, the command installs a hostfwd rule on a netdev
+To demonstrate command handling, VIRTIO_NET_F_CTRL_MACADDR is
-(that's the new way of using this command):
+implemented.  If the virtio-net driver changes MAC the virtio-net device
-    hostfwd_add netdev_id tcp:...
+model will be updated with the new one, and a rx filtering change event
+will be raised.
-With three parameters, the command installs a rule on a 'vlan' (aka hub):
-    hostfwd_add hub_id name tcp:...
+More cvq commands could be added here straightforwardly but they have
+not been tested.
-Same applies to the hostfwd_remove command now.
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
-Signed-off-by: Thomas Huth <thuth@redhat.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- hmp-commands.hx |  4 ++--
+ net/vhost-vdpa.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
- net/slirp.c     | 33 +++++++++++++++++++++++----------
+file changed, 205 insertions(+), 8 deletions(-)
-files changed, 25 insertions(+), 12 deletions(-)
+diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
 diff --git a/hmp-commands.hx b/hmp-commands.hx
 index XXXXXXX..XXXXXXX 100644
---- a/hmp-commands.hx
+--- a/net/vhost-vdpa.c
-+++ b/hmp-commands.hx
++++ b/net/vhost-vdpa.c
-@@ -XXX,XX +XXX,XX @@ ETEXI
+@@ -XXX,XX +XXX,XX @@ typedef struct VhostVDPAState {
-     {
+     NetClientState nc;
-         .name       = "hostfwd_add",
+     struct vhost_vdpa vhost_vdpa;
-         .args_type  = "arg1:s,arg2:s?,arg3:s?",
+     VHostNetState *vhost_net;
--        .params     = "[vlan_id name] [tcp|udp]:[hostaddr]:hostport-[guestaddr]:guestport",
++
-+        .params     = "[hub_id name]|[netdev_id] [tcp|udp]:[hostaddr]:hostport-[guestaddr]:guestport",
++    /* Control commands shadow buffers */
-         .help       = "redirect TCP or UDP connections from host to guest (requires -net user)",
++    void *cvq_cmd_out_buffer, *cvq_cmd_in_buffer;
-         .cmd        = hmp_hostfwd_add,
+     bool started;
-     },
+ } VhostVDPAState;
-@@ -XXX,XX +XXX,XX @@ ETEXI
-     {
+@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_cleanup(NetClientState *nc)
-         .name       = "hostfwd_remove",
+ {
-         .args_type  = "arg1:s,arg2:s?,arg3:s?",
+     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
--        .params     = "[vlan_id name] [tcp|udp]:[hostaddr]:hostport",
-+        .params     = "[hub_id name]|[netdev_id] [tcp|udp]:[hostaddr]:hostport",
++    qemu_vfree(s->cvq_cmd_out_buffer);
-         .help       = "remove host-to-guest TCP or UDP redirection",
++    qemu_vfree(s->cvq_cmd_in_buffer);
-         .cmd        = hmp_hostfwd_remove,
+     if (s->vhost_net) {
-     },
+         vhost_net_cleanup(s->vhost_net);
-diff --git a/net/slirp.c b/net/slirp.c
+         g_free(s->vhost_net);
-index XXXXXXX..XXXXXXX 100644
+@@ -XXX,XX +XXX,XX @@ static NetClientInfo net_vhost_vdpa_info = {
---- a/net/slirp.c
+         .check_peer_type = vhost_vdpa_check_peer_type,
-+++ b/net/slirp.c
+ };
-@@ -XXX,XX +XXX,XX @@ error:
-     return -1;
++static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
 +{
 +    VhostIOVATree *tree = v->iova_tree;
 +    DMAMap needle = {
 +        /*
 +         * No need to specify size or to look for more translations since
 +         * this contiguous chunk was allocated by us.
 +         */
 +        .translated_addr = (hwaddr)(uintptr_t)addr,
 +    };
 +    const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
 +    int r;
 +
 +    if (unlikely(!map)) {
 +        error_report("Cannot locate expected map");
 +        return;
 +    }
 +
 +    r = vhost_vdpa_dma_unmap(v, map->iova, map->size + 1);
 +    if (unlikely(r != 0)) {
 +        error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
 +    }
 +
 +    vhost_iova_tree_remove(tree, map);
 +}
 +
 +static size_t vhost_vdpa_net_cvq_cmd_len(void)
 +{
 +    /*
 +     * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
 +     * In buffer is always 1 byte, so it should fit here
 +     */
 +    return sizeof(struct virtio_net_ctrl_hdr) +
 +           2 * sizeof(struct virtio_net_ctrl_mac) +
 +           MAC_TABLE_ENTRIES * ETH_ALEN;
 +}
 +
 +static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
 +{
 +    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
 +}
 +
 +/** Copy and map a guest buffer. */
 +static bool vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v,
 +                                   const struct iovec *out_data,
 +                                   size_t out_num, size_t data_len, void *buf,
 +                                   size_t *written, bool write)
 +{
 +    DMAMap map = {};
 +    int r;
 +
 +    if (unlikely(!data_len)) {
 +        qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid legnth of %s buffer\n",
 +                      __func__, write ? "in" : "out");
 +        return false;
 +    }
 +
 +    *written = iov_to_buf(out_data, out_num, 0, buf, data_len);
 +    map.translated_addr = (hwaddr)(uintptr_t)buf;
 +    map.size = vhost_vdpa_net_cvq_cmd_page_len() - 1;
 +    map.perm = write ? IOMMU_RW : IOMMU_RO,
 +    r = vhost_iova_tree_map_alloc(v->iova_tree, &map);
 +    if (unlikely(r != IOVA_OK)) {
 +        error_report("Cannot map injected element");
 +        return false;
 +    }
 +
 +    r = vhost_vdpa_dma_map(v, map.iova, vhost_vdpa_net_cvq_cmd_page_len(), buf,
 +                           !write);
 +    if (unlikely(r < 0)) {
 +        goto dma_map_err;
 +    }
 +
 +    return true;
 +
 +dma_map_err:
 +    vhost_iova_tree_remove(v->iova_tree, &map);
 +    return false;
 +}
 +
  /**
 - * Forward buffer for the moment.
 + * Copy the guest element into a dedicated buffer suitable to be sent to NIC
 + *
 + * @iov: [0] is the out buffer, [1] is the in one
 + */
 +static bool vhost_vdpa_net_cvq_map_elem(VhostVDPAState *s,
 +                                        VirtQueueElement *elem,
 +                                        struct iovec *iov)
 +{
 +    size_t in_copied;
 +    bool ok;
 +
 +    iov[0].iov_base = s->cvq_cmd_out_buffer;
 +    ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, elem->out_sg, elem->out_num,
 +                                vhost_vdpa_net_cvq_cmd_len(), iov[0].iov_base,
 +                                &iov[0].iov_len, false);
 +    if (unlikely(!ok)) {
 +        return false;
 +    }
 +
 +    iov[1].iov_base = s->cvq_cmd_in_buffer;
 +    ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, NULL, 0,
 +                                sizeof(virtio_net_ctrl_ack), iov[1].iov_base,
 +                                &in_copied, true);
 +    if (unlikely(!ok)) {
 +        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
 +        return false;
 +    }
 +
 +    iov[1].iov_len = sizeof(virtio_net_ctrl_ack);
 +    return true;
 +}
 +
 +/**
 + * Do not forward commands not supported by SVQ. Otherwise, the device could
 + * accept it and qemu would not know how to update the device model.
 + */
 +static bool vhost_vdpa_net_cvq_validate_cmd(const struct iovec *out,
 +                                            size_t out_num)
 +{
 +    struct virtio_net_ctrl_hdr ctrl;
 +    size_t n;
 +
 +    n = iov_to_buf(out, out_num, 0, &ctrl, sizeof(ctrl));
 +    if (unlikely(n < sizeof(ctrl))) {
 +        qemu_log_mask(LOG_GUEST_ERROR,
 +                      "%s: invalid legnth of out buffer %zu\n", __func__, n);
 +        return false;
 +    }
 +
 +    switch (ctrl.class) {
 +    case VIRTIO_NET_CTRL_MAC:
 +        switch (ctrl.cmd) {
 +        case VIRTIO_NET_CTRL_MAC_ADDR_SET:
 +            return true;
 +        default:
 +            qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid mac cmd %u\n",
 +                          __func__, ctrl.cmd);
 +        };
 +        break;
 +    default:
 +        qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid control class %u\n",
 +                      __func__, ctrl.class);
 +    };
 +
 +    return false;
 +}
 +
 +/**
 + * Validate and copy control virtqueue commands.
 + *
 + * Following QEMU guidelines, we offer a copy of the buffers to the device to
 + * prevent TOCTOU bugs.
   */
  static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
                                              VirtQueueElement *elem,
                                              void *opaque)
  {
 -    unsigned int n = elem->out_num + elem->in_num;
 -    g_autofree struct iovec *dev_buffers = g_new(struct iovec, n);
 +    VhostVDPAState *s = opaque;
      size_t in_len, dev_written;
      virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
 -    int r;
 +    /* out and in buffers sent to the device */
 +    struct iovec dev_buffers[2] = {
 +        { .iov_base = s->cvq_cmd_out_buffer },
 +        { .iov_base = s->cvq_cmd_in_buffer },
 +    };
 +    /* in buffer used for device model */
 +    const struct iovec in = {
 +        .iov_base = &status,
 +        .iov_len = sizeof(status),
 +    };
 +    int r = -EINVAL;
 +    bool ok;
 +
 +    ok = vhost_vdpa_net_cvq_map_elem(s, elem, dev_buffers);
 +    if (unlikely(!ok)) {
 +        goto out;
 +    }
 -    memcpy(dev_buffers, elem->out_sg, elem->out_num);
 -    memcpy(dev_buffers + elem->out_num, elem->in_sg, elem->in_num);
 +    ok = vhost_vdpa_net_cvq_validate_cmd(&dev_buffers[0], 1);
 +    if (unlikely(!ok)) {
 +        goto out;
 +    }
 -    r = vhost_svq_add(svq, &dev_buffers[0], elem->out_num, &dev_buffers[1],
 -                      elem->in_num, elem);
 +    r = vhost_svq_add(svq, &dev_buffers[0], 1, &dev_buffers[1], 1, elem);
      if (unlikely(r != 0)) {
          if (unlikely(r == -ENOSPC)) {
              qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
      dev_written = vhost_svq_poll(svq);
      if (unlikely(dev_written < sizeof(status))) {
          error_report("Insufficient written data (%zu)", dev_written);
 +        goto out;
 +    }
 +
 +    memcpy(&status, dev_buffers[1].iov_base, sizeof(status));
 +    if (status != VIRTIO_NET_OK) {
 +        goto out;
 +    }
 +
 +    status = VIRTIO_NET_ERR;
 +    virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, dev_buffers, 1);
 +    if (status != VIRTIO_NET_OK) {
 +        error_report("Bad CVQ processing in model");
      }
  out:
@@ -XXX,XX +XXX,XX @@ out:
      }
      vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
      g_free(elem);
 +    if (dev_buffers[0].iov_base) {
 +        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[0].iov_base);
 +    }
 +    if (dev_buffers[1].iov_base) {
 +        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[1].iov_base);
 +    }
      return r;
  }
--static SlirpState *slirp_lookup(Monitor *mon, const char *vlan,
+@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
--                                const char *stack)
+     s->vhost_vdpa.device_fd = vdpa_device_fd;
-+static SlirpState *slirp_lookup(Monitor *mon, const char *hub_id,
+     s->vhost_vdpa.index = queue_pair_index;
-+                                const char *name)
+     if (!is_datapath) {
- {
++        s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
--
++                                            vhost_vdpa_net_cvq_cmd_page_len());
--    if (vlan) {
++        memset(s->cvq_cmd_out_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
-+    if (name) {
++        s->cvq_cmd_in_buffer = qemu_memalign(qemu_real_host_page_size(),
-         NetClientState *nc;
++                                            vhost_vdpa_net_cvq_cmd_page_len());
--        nc = net_hub_find_client_by_name(strtol(vlan, NULL, 0), stack);
++        memset(s->cvq_cmd_in_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
--        if (!nc) {
++
--            monitor_printf(mon, "unrecognized (vlan-id, stackname) pair\n");
+         s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
--            return NULL;
+         s->vhost_vdpa.shadow_vq_ops_opaque = s;
-+        if (hub_id) {
+     }
 +            nc = net_hub_find_client_by_name(strtol(hub_id, NULL, 0), name);
 +            if (!nc) {
 +                monitor_printf(mon, "unrecognized (vlan-id, stackname) pair\n");
 +                return NULL;
 +            }
 +        } else {
 +            nc = qemu_find_netdev(name);
 +            if (!nc) {
 +                monitor_printf(mon, "unrecognized netdev id '%s'\n", name);
 +                return NULL;
 +            }
          }
          if (strcmp(nc->model, "user")) {
              monitor_printf(mon, "invalid device specified\n");
@@ -XXX,XX +XXX,XX @@ void hmp_hostfwd_remove(Monitor *mon, const QDict *qdict)
      const char *arg2 = qdict_get_try_str(qdict, "arg2");
      const char *arg3 = qdict_get_try_str(qdict, "arg3");
 -    if (arg2) {
 +    if (arg3) {
          s = slirp_lookup(mon, arg1, arg2);
          src_str = arg3;
 +    } else if (arg2) {
 +        s = slirp_lookup(mon, NULL, arg1);
 +        src_str = arg2;
      } else {
          s = slirp_lookup(mon, NULL, NULL);
          src_str = arg1;
@@ -XXX,XX +XXX,XX @@ void hmp_hostfwd_add(Monitor *mon, const QDict *qdict)
      const char *arg2 = qdict_get_try_str(qdict, "arg2");
      const char *arg3 = qdict_get_try_str(qdict, "arg3");
 -    if (arg2) {
 +    if (arg3) {
          s = slirp_lookup(mon, arg1, arg2);
          redir_str = arg3;
 +    } else if (arg2) {
 +        s = slirp_lookup(mon, NULL, arg1);
 +        redir_str = arg2;
      } else {
          s = slirp_lookup(mon, NULL, NULL);
          redir_str = arg1;
 --
 .7.4

-[Qemu-devel] [PULL 2/6] colo: compare the packet based on the tcp sequence number
+[PULL V2 19/25] vdpa: Extract get features part from vhost_vdpa_get_max_queue_pairs
-From: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
+From: Eugenio Pérez <eperezma@redhat.com>
-Packet size some time different or when network is busy.
+Knowing the device features is needed for CVQ SVQ, so SVQ knows if it
-Based on same payload size, but TCP protocol can not
+can handle all commands or not. Extract from
-guarantee send the same one packet in the same way,
+vhost_vdpa_get_max_queue_pairs so we can reuse it.
-like that:
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
-We send this payload:
+Acked-by: Jason Wang <jasowang@redhat.com>
-------------------------------
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
 | header |1|2|3|4|5|6|7|8|9|0|
 ------------------------------
 primary:
 ppkt1:
 ----------------
 | header |1|2|3|
 ----------------
 ppkt2:
 ------------------------
 | header |4|5|6|7|8|9|0|
 ------------------------
 secondary:
 spkt1:
 ------------------------------
 | header |1|2|3|4|5|6|7|8|9|0|
 ------------------------------
 In the original method, ppkt1 and ppkt2 each differ in size from
 spkt1, so they cannot be compared and a checkpoint is triggered.
 I have tested FTP get of 200M and 1G files many times, and found that
 the performance was less than 1% of native.
 Now I have reworked the comparison of TCP packets to be based on the
 TCP sequence number. First of all, ppkt1 and spkt1 have the same
 starting sequence number, so they can be compared even though their
 lengths differ. The smaller payload length (that of ppkt1) is used
 as the comparison length; if the payloads are the same, ppkt1 is sent
 out and the offset (the length of ppkt1's payload) is recorded
 in spkt1. In the next comparison, ppkt2 and spkt1 can be compared
 starting from the recorded position in spkt1.
 like that:
 ----------------
 | header |1|2|3| ppkt1
 ---------|-----|
          |     |
 ---------v-----v--------------
 | header |1|2|3|4|5|6|7|8|9|0| spkt1
 ---------------|\------------|
                | \offset     |
       ---------v-------------v
       | header |4|5|6|7|8|9|0| ppkt2
       ------------------------
 In this way, the performance can reach 20% of native in my multiple
 tests.
 Cc: Zhang Chen <zhangckid@gmail.com>
 Cc: Li Zhijian <lizhijian@cn.fujitsu.com>
 Cc: Jason Wang <jasowang@redhat.com>
 Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
 Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
 Signed-off-by: Zhang Chen <zhangckid@gmail.com>
 Reviewed-by: Zhang Chen <zhangckid@gmail.com>
 Tested-by: Zhang Chen <zhangckid@gmail.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- net/colo-compare.c | 343 +++++++++++++++++++++++++++++++++++------------------
+ net/vhost-vdpa.c | 30 ++++++++++++++++++++----------
- net/colo.c         |   9 ++
+file changed, 20 insertions(+), 10 deletions(-)
  net/colo.h         |  15 +++
  net/trace-events   |   2 +-
 files changed, 250 insertions(+), 119 deletions(-)
-diff --git a/net/colo-compare.c b/net/colo-compare.c
+diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
---- a/net/colo-compare.c
+--- a/net/vhost-vdpa.c
-+++ b/net/colo-compare.c
++++ b/net/vhost-vdpa.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
- #define COMPARE_READ_LEN_MAX NET_BUFSIZE
+     return nc;
  #define MAX_QUEUE_SIZE 1024
 +#define COLO_COMPARE_FREE_PRIMARY     0x01
 +#define COLO_COMPARE_FREE_SECONDARY   0x02
 +
  /* TODO: Should be configurable */
  #define REGULAR_PACKET_CHECK_MS 3000
@@ -XXX,XX +XXX,XX @@ static gint seq_sorter(Packet *a, Packet *b, gpointer data)
      return ntohl(atcp->th_seq) - ntohl(btcp->th_seq);
  }
-+static void fill_pkt_tcp_info(void *data, uint32_t *max_ack)
+-static int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp)
 +static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
 +{
-+    Packet *pkt = data;
++    int ret = ioctl(fd, VHOST_GET_FEATURES, features);
-+    struct tcphdr *tcphd;
++    if (unlikely(ret < 0)) {
-+
++        error_setg_errno(errp, errno,
-+    tcphd = (struct tcphdr *)pkt->transport_header;
++                         "Fail to query features from vhost-vDPA device");
-+
++    }
-+    pkt->tcp_seq = ntohl(tcphd->th_seq);
++    return ret;
 +    pkt->tcp_ack = ntohl(tcphd->th_ack);
 +    *max_ack = *max_ack > pkt->tcp_ack ? *max_ack : pkt->tcp_ack;
 +    pkt->header_size = pkt->transport_header - (uint8_t *)pkt->data
 +                       + (tcphd->th_off << 2) - pkt->vnet_hdr_len;
 +    pkt->payload_size = pkt->size - pkt->header_size;
 +    pkt->seq_end = pkt->tcp_seq + pkt->payload_size;
 +    pkt->flags = tcphd->th_flags;
 +}
 +
- /*
++static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
-  * Return 1 on success, if return 0 means the
++                                          int *has_cvq, Error **errp)
   * packet will be dropped
   */
 -static int colo_insert_packet(GQueue *queue, Packet *pkt)
 +static int colo_insert_packet(GQueue *queue, Packet *pkt, uint32_t *max_ack)
  {
-     if (g_queue_get_length(queue) <= MAX_QUEUE_SIZE) {
+     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
-         if (pkt->ip->ip_p == IPPROTO_TCP) {
+     g_autofree struct vhost_vdpa_config *config = NULL;
-+            fill_pkt_tcp_info(pkt, max_ack);
+     __virtio16 *max_queue_pairs;
-             g_queue_insert_sorted(queue,
+-    uint64_t features;
-                                   pkt,
+     int ret;
-                                   (GCompareDataFunc)seq_sorter,
-@@ -XXX,XX +XXX,XX @@ static int packet_enqueue(CompareState *s, int mode, Connection **con)
+-    ret = ioctl(fd, VHOST_GET_FEATURES, &features);
 -    if (ret) {
 -        error_setg(errp, "Fail to query features from vhost-vDPA device");
 -        return ret;
 -    }
 -
      if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
          *has_cvq = 1;
      } else {
@@ -XXX,XX +XXX,XX @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
                          NetClientState *peer, Error **errp)
  {
      const NetdevVhostVDPAOptions *opts;
 +    uint64_t features;
      int vdpa_device_fd;
      g_autofree NetClientState **ncs = NULL;
      NetClientState *nc;
 -    int queue_pairs, i, has_cvq = 0;
 +    int queue_pairs, r, i, has_cvq = 0;
      assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
      opts = &netdev->u.vhost_vdpa;
@@ -XXX,XX +XXX,XX @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
          return -errno;
      }
-     if (mode == PRIMARY_IN) {
+-    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd,
--        if (!colo_insert_packet(&conn->primary_list, pkt)) {
++    r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
-+        if (!colo_insert_packet(&conn->primary_list, pkt, &conn->pack)) {
++    if (unlikely(r < 0)) {
-             error_report("colo compare primary queue size too big,"
++        return r;
                           "drop packet");
          }
      } else {
 -        if (!colo_insert_packet(&conn->secondary_list, pkt)) {
 +        if (!colo_insert_packet(&conn->secondary_list, pkt, &conn->sack)) {
              error_report("colo compare secondary queue size too big,"
                           "drop packet");
          }
@@ -XXX,XX +XXX,XX @@ static int packet_enqueue(CompareState *s, int mode, Connection **con)
      return 0;
  }
 +static inline bool after(uint32_t seq1, uint32_t seq2)
 +{
 +        return (int32_t)(seq1 - seq2) > 0;
 +}
 +
 +static void colo_release_primary_pkt(CompareState *s, Packet *pkt)
 +{
 +    int ret;
 +    ret = compare_chr_send(s,
 +                           pkt->data,
 +                           pkt->size,
 +                           pkt->vnet_hdr_len);
 +    if (ret < 0) {
 +        error_report("colo send primary packet failed");
 +    }
 +    trace_colo_compare_main("packet same and release packet");
 +    packet_destroy(pkt, NULL);
 +}
 +
  /*
   * The IP packets sent by primary and secondary
   * will be compared in here
@@ -XXX,XX +XXX,XX @@ static int colo_compare_packet_payload(Packet *ppkt,
  }
  /*
 - * Called from the compare thread on the primary
 - * for compare tcp packet
 - * compare_tcp copied from Dr. David Alan Gilbert's branch
 - */
 -static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
 + * Returns true when the payload is consistent and the next
 + * comparison should be made; returns false when a checkpoint
 + * is needed.
 +*/
 +static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt,
 +                              int8_t *mark, uint32_t max_ack)
  {
 -    struct tcphdr *ptcp, *stcp;
 -    int res;
 +    *mark = 0;
 +
 +    if (ppkt->tcp_seq == spkt->tcp_seq && ppkt->seq_end == spkt->seq_end) {
 +        if (colo_compare_packet_payload(ppkt, spkt,
 +                                        ppkt->header_size, spkt->header_size,
 +                                        ppkt->payload_size)) {
 +            *mark = COLO_COMPARE_FREE_SECONDARY | COLO_COMPARE_FREE_PRIMARY;
 +            return true;
 +        }
 +    }
 +    if (ppkt->tcp_seq == spkt->tcp_seq && ppkt->seq_end == spkt->seq_end) {
 +        if (colo_compare_packet_payload(ppkt, spkt,
 +                                        ppkt->header_size, spkt->header_size,
 +                                        ppkt->payload_size)) {
 +            *mark = COLO_COMPARE_FREE_SECONDARY | COLO_COMPARE_FREE_PRIMARY;
 +            return true;
 +        }
 +    }
 +
-+    /* one part of secondary packet payload still need to be compared */
++    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
-+    if (!after(ppkt->seq_end, spkt->seq_end)) {
+                                                  &has_cvq, errp);
-+        if (colo_compare_packet_payload(ppkt, spkt,
+     if (queue_pairs < 0) {
-+                                        ppkt->header_size + ppkt->offset,
+         qemu_close(vdpa_device_fd);
 +                                        spkt->header_size + spkt->offset,
 +                                        ppkt->payload_size - ppkt->offset)) {
 +            if (!after(ppkt->tcp_ack, max_ack)) {
 +                *mark = COLO_COMPARE_FREE_PRIMARY;
 +                spkt->offset += ppkt->payload_size - ppkt->offset;
 +                return true;
 +            } else {
 +                /* secondary guest hasn't ack the data, don't send
 +                 * out this packet
 +                 */
 +                return false;
 +            }
 +        }
 +    } else {
 +        /* primary packet is longer than secondary packet, compare
 +         * the same part and mark the primary packet offset
 +         */
 +        if (colo_compare_packet_payload(ppkt, spkt,
 +                                        ppkt->header_size + ppkt->offset,
 +                                        spkt->header_size + spkt->offset,
 +                                        spkt->payload_size - spkt->offset)) {
 +            *mark = COLO_COMPARE_FREE_SECONDARY;
 +            ppkt->offset += spkt->payload_size - spkt->offset;
 +            return true;
 +        }
 +    }
 -    trace_colo_compare_main("compare tcp");
 +    return false;
 +}
 -    ptcp = (struct tcphdr *)ppkt->transport_header;
 -    stcp = (struct tcphdr *)spkt->transport_header;
 +static void colo_compare_tcp(CompareState *s, Connection *conn)
 +{
 +    Packet *ppkt = NULL, *spkt = NULL;
 +    int8_t mark;
      /*
 -     * The 'identification' field in the IP header is *very* random
 -     * it almost never matches.  Fudge this by ignoring differences in
 -     * unfragmented packets; they'll normally sort themselves out if different
 -     * anyway, and it should recover at the TCP level.
 -     * An alternative would be to get both the primary and secondary to rewrite
 -     * somehow; but that would need some sync traffic to sync the state
 -     */
 -    if (ntohs(ppkt->ip->ip_off) & IP_DF) {
 -        spkt->ip->ip_id = ppkt->ip->ip_id;
 -        /* and the sum will be different if the IDs were different */
 -        spkt->ip->ip_sum = ppkt->ip->ip_sum;
 +     * If ppkt and spkt have the same payload, but ppkt's ACK
 +     * is greater than spkt's ACK, in this case we can not
 +     * send the ppkt because it will cause the secondary guest
 +     * to miss sending some data in the next. Therefore, we
 +     * record the maximum ACK in the current queue at both
 +     * primary side and secondary side. Only when the ack is
 +     * less than the smaller of the two maximum ack, then we
 +     * can ensure that the packet's payload is acknowledged by
 +     * primary and secondary.
 +    */
 +    uint32_t min_ack = conn->pack > conn->sack ? conn->sack : conn->pack;
 +
 +pri:
 +    if (g_queue_is_empty(&conn->primary_list)) {
 +        return;
      }
 +    ppkt = g_queue_pop_head(&conn->primary_list);
 +sec:
 +    if (g_queue_is_empty(&conn->secondary_list)) {
 +        g_queue_push_head(&conn->primary_list, ppkt);
 +        return;
 +    }
 +    spkt = g_queue_pop_head(&conn->secondary_list);
 -    /*
 -     * Check tcp header length for tcp option field.
 -     * th_off > 5 means this tcp packet have options field.
 -     * The tcp options maybe always different.
 -     * for example:
 -     * From RFC 7323.
 -     * TCP Timestamps option (TSopt):
 -     * Kind: 8
 -     *
 -     * Length: 10 bytes
 -     *
 -     *    +-------+-------+---------------------+---------------------+
 -     *    |Kind=8 |  10   |   TS Value (TSval)  |TS Echo Reply (TSecr)|
 -     *    +-------+-------+---------------------+---------------------+
 -     *       1       1              4                     4
 -     *
 -     * In this case the primary guest's timestamp always different with
 -     * the secondary guest's timestamp. COLO just focus on payload,
 -     * so we just need skip this field.
 -     */
 +    if (ppkt->tcp_seq == ppkt->seq_end) {
 +        colo_release_primary_pkt(s, ppkt);
 +        ppkt = NULL;
 +    }
 -    ptrdiff_t ptcp_offset, stcp_offset;
 +    if (ppkt && conn->compare_seq && !after(ppkt->seq_end, conn->compare_seq)) {
 +        trace_colo_compare_main("pri: this packet has compared");
 +        colo_release_primary_pkt(s, ppkt);
 +        ppkt = NULL;
 +    }
 -    ptcp_offset = ppkt->transport_header - (uint8_t *)ppkt->data
 -                  + (ptcp->th_off << 2) - ppkt->vnet_hdr_len;
 -    stcp_offset = spkt->transport_header - (uint8_t *)spkt->data
 -                  + (stcp->th_off << 2) - spkt->vnet_hdr_len;
 -    if (ppkt->size - ptcp_offset == spkt->size - stcp_offset) {
 -        res = colo_compare_packet_payload(ppkt, spkt,
 -                                          ptcp_offset, stcp_offset,
 -                                          ppkt->size - ptcp_offset);
 +    if (spkt->tcp_seq == spkt->seq_end) {
 +        packet_destroy(spkt, NULL);
 +        if (!ppkt) {
 +            goto pri;
 +        } else {
 +            goto sec;
 +        }
      } else {
 -        trace_colo_compare_main("TCP: payload size of packets are different");
 -        res = -1;
 +        if (conn->compare_seq && !after(spkt->seq_end, conn->compare_seq)) {
 +            trace_colo_compare_main("sec: this packet has compared");
 +            packet_destroy(spkt, NULL);
 +            if (!ppkt) {
 +                goto pri;
 +            } else {
 +                goto sec;
 +            }
 +        }
 +        if (!ppkt) {
 +            g_queue_push_head(&conn->secondary_list, spkt);
 +            goto pri;
 +        }
      }
 -    if (res != 0 &&
 -        trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) {
 -        char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20];
 -
 -        strcpy(pri_ip_src, inet_ntoa(ppkt->ip->ip_src));
 -        strcpy(pri_ip_dst, inet_ntoa(ppkt->ip->ip_dst));
 -        strcpy(sec_ip_src, inet_ntoa(spkt->ip->ip_src));
 -        strcpy(sec_ip_dst, inet_ntoa(spkt->ip->ip_dst));
 -
 -        trace_colo_compare_ip_info(ppkt->size, pri_ip_src,
 -                                   pri_ip_dst, spkt->size,
 -                                   sec_ip_src, sec_ip_dst);
 -
 -        trace_colo_compare_tcp_info("pri tcp packet",
 -                                    ntohl(ptcp->th_seq),
 -                                    ntohl(ptcp->th_ack),
 -                                    res, ptcp->th_flags,
 -                                    ppkt->size);
 -
 -        trace_colo_compare_tcp_info("sec tcp packet",
 -                                    ntohl(stcp->th_seq),
 -                                    ntohl(stcp->th_ack),
 -                                    res, stcp->th_flags,
 -                                    spkt->size);
 +    if (colo_mark_tcp_pkt(ppkt, spkt, &mark, min_ack)) {
 +        trace_colo_compare_tcp_info("pri",
 +                                    ppkt->tcp_seq, ppkt->tcp_ack,
 +                                    ppkt->header_size, ppkt->payload_size,
 +                                    ppkt->offset, ppkt->flags);
 +
 +        trace_colo_compare_tcp_info("sec",
 +                                    spkt->tcp_seq, spkt->tcp_ack,
 +                                    spkt->header_size, spkt->payload_size,
 +                                    spkt->offset, spkt->flags);
 +
 +        if (mark == COLO_COMPARE_FREE_PRIMARY) {
 +            conn->compare_seq = ppkt->seq_end;
 +            colo_release_primary_pkt(s, ppkt);
 +            g_queue_push_head(&conn->secondary_list, spkt);
 +            goto pri;
 +        }
 +        if (mark == COLO_COMPARE_FREE_SECONDARY) {
 +            conn->compare_seq = spkt->seq_end;
 +            packet_destroy(spkt, NULL);
 +            goto sec;
 +        }
 +        if (mark == (COLO_COMPARE_FREE_PRIMARY | COLO_COMPARE_FREE_SECONDARY)) {
 +            conn->compare_seq = ppkt->seq_end;
 +            colo_release_primary_pkt(s, ppkt);
 +            packet_destroy(spkt, NULL);
 +            goto pri;
 +        }
 +    } else {
 +        g_queue_push_head(&conn->primary_list, ppkt);
 +        g_queue_push_head(&conn->secondary_list, spkt);
          qemu_hexdump((char *)ppkt->data, stderr,
                       "colo-compare ppkt", ppkt->size);
          qemu_hexdump((char *)spkt->data, stderr,
                       "colo-compare spkt", spkt->size);
 -    }
 -    return res;
 +        /*
 +         * colo_compare_inconsistent_notify();
 +         * TODO: notice to checkpoint();
 +         */
 +    }
  }
 +
  /*
   * Called from the compare thread on the primary
   * for compare udp packet
@@ -XXX,XX +XXX,XX @@ static void colo_old_packet_check(void *opaque)
                          (GCompareFunc)colo_old_packet_check_one_conn);
  }
 -/*
 - * Called from the compare thread on the primary
 - * for compare packet with secondary list of the
 - * specified connection when a new packet was
 - * queued to it.
 - */
 -static void colo_compare_connection(void *opaque, void *user_data)
 +static void colo_compare_packet(CompareState *s, Connection *conn,
 +                                int (*HandlePacket)(Packet *spkt,
 +                                Packet *ppkt))
  {
 -    CompareState *s = user_data;
 -    Connection *conn = opaque;
      Packet *pkt = NULL;
      GList *result = NULL;
 -    int ret;
      while (!g_queue_is_empty(&conn->primary_list) &&
             !g_queue_is_empty(&conn->secondary_list)) {
          pkt = g_queue_pop_head(&conn->primary_list);
 -        switch (conn->ip_proto) {
 -        case IPPROTO_TCP:
 -            result = g_queue_find_custom(&conn->secondary_list,
 -                     pkt, (GCompareFunc)colo_packet_compare_tcp);
 -            break;
 -        case IPPROTO_UDP:
 -            result = g_queue_find_custom(&conn->secondary_list,
 -                     pkt, (GCompareFunc)colo_packet_compare_udp);
 -            break;
 -        case IPPROTO_ICMP:
 -            result = g_queue_find_custom(&conn->secondary_list,
 -                     pkt, (GCompareFunc)colo_packet_compare_icmp);
 -            break;
 -        default:
 -            result = g_queue_find_custom(&conn->secondary_list,
 -                     pkt, (GCompareFunc)colo_packet_compare_other);
 -            break;
 -        }
 +        result = g_queue_find_custom(&conn->secondary_list,
 +                 pkt, (GCompareFunc)HandlePacket);
          if (result) {
 -            ret = compare_chr_send(s,
 -                                   pkt->data,
 -                                   pkt->size,
 -                                   pkt->vnet_hdr_len);
 -            if (ret < 0) {
 -                error_report("colo_send_primary_packet failed");
 -            }
 -            trace_colo_compare_main("packet same and release packet");
 +            colo_release_primary_pkt(s, pkt);
              g_queue_remove(&conn->secondary_list, result->data);
 -            packet_destroy(pkt, NULL);
          } else {
              /*
               * If one packet arrive late, the secondary_list or
@@ -XXX,XX +XXX,XX @@ static void colo_compare_connection(void *opaque, void *user_data)
      }
  }
 +/*
 + * Called from the compare thread on the primary
 + * for compare packet with secondary list of the
 + * specified connection when a new packet was
 + * queued to it.
 + */
 +static void colo_compare_connection(void *opaque, void *user_data)
 +{
 +    CompareState *s = user_data;
 +    Connection *conn = opaque;
 +
 +    switch (conn->ip_proto) {
 +    case IPPROTO_TCP:
 +        colo_compare_tcp(s, conn);
 +        break;
 +    case IPPROTO_UDP:
 +        colo_compare_packet(s, conn, colo_packet_compare_udp);
 +        break;
 +    case IPPROTO_ICMP:
 +        colo_compare_packet(s, conn, colo_packet_compare_icmp);
 +        break;
 +    default:
 +        colo_compare_packet(s, conn, colo_packet_compare_other);
 +        break;
 +    }
 +}
 +
  static int compare_chr_send(CompareState *s,
                              const uint8_t *buf,
                              uint32_t size,
 diff --git a/net/colo.c b/net/colo.c
 index XXXXXXX..XXXXXXX 100644
 --- a/net/colo.c
 +++ b/net/colo.c
@@ -XXX,XX +XXX,XX @@ Connection *connection_new(ConnectionKey *key)
      conn->processing = false;
      conn->offset = 0;
      conn->syn_flag = 0;
 +    conn->pack = 0;
 +    conn->sack = 0;
      g_queue_init(&conn->primary_list);
      g_queue_init(&conn->secondary_list);
@@ -XXX,XX +XXX,XX @@ Packet *packet_new(const void *data, int size, int vnet_hdr_len)
      pkt->size = size;
      pkt->creation_ms = qemu_clock_get_ms(QEMU_CLOCK_HOST);
      pkt->vnet_hdr_len = vnet_hdr_len;
 +    pkt->tcp_seq = 0;
 +    pkt->tcp_ack = 0;
 +    pkt->seq_end = 0;
 +    pkt->header_size = 0;
 +    pkt->payload_size = 0;
 +    pkt->offset = 0;
 +    pkt->flags = 0;
      return pkt;
  }
 diff --git a/net/colo.h b/net/colo.h
 index XXXXXXX..XXXXXXX 100644
 --- a/net/colo.h
 +++ b/net/colo.h
@@ -XXX,XX +XXX,XX @@ typedef struct Packet {
      int64_t creation_ms;
      /* Get vnet_hdr_len from filter */
      uint32_t vnet_hdr_len;
 +    uint32_t tcp_seq; /* sequence number */
 +    uint32_t tcp_ack; /* acknowledgement number */
 +    /* the sequence number of the last byte of the packet */
 +    uint32_t seq_end;
 +    uint8_t header_size;  /* the header length */
 +    uint16_t payload_size; /* the payload length */
 +    /* record the payload offset(the length that has been compared) */
 +    uint16_t offset;
 +    uint8_t flags; /* Flags(aka Control bits) */
  } Packet;
  typedef struct ConnectionKey {
@@ -XXX,XX +XXX,XX @@ typedef struct Connection {
      /* flag to enqueue unprocessed_connections */
      bool processing;
      uint8_t ip_proto;
 +    /* record the sequence number that has been compared */
 +    uint32_t compare_seq;
 +    /* the maximum of acknowledgement number in primary_list queue */
 +    uint32_t pack;
 +    /* the maximum of acknowledgement number in secondary_list queue */
 +    uint32_t sack;
      /* offset = secondary_seq - primary_seq */
      tcp_seq  offset;
      /*
 diff --git a/net/trace-events b/net/trace-events
 index XXXXXXX..XXXXXXX 100644
 --- a/net/trace-events
 +++ b/net/trace-events
@@ -XXX,XX +XXX,XX @@ colo_compare_icmp_miscompare(const char *sta, int size) ": %s = %d"
  colo_compare_ip_info(int psize, const char *sta, const char *stb, int ssize, const char *stc, const char *std) "ppkt size = %d, ip_src = %s, ip_dst = %s, spkt size = %d, ip_src = %s, ip_dst = %s"
  colo_old_packet_check_found(int64_t old_time) "%" PRId64
  colo_compare_miscompare(void) ""
 -colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int res, uint32_t flag, int size) "side: %s seq/ack= %u/%u res= %d flags= 0x%x pkt_size: %d\n"
 +colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int hdlen, int pdlen, int offset, int flags) "%s: seq/ack= %u/%u hdlen= %d pdlen= %d offset= %d flags=%d\n"
  # net/filter-rewriter.c
  colo_filter_rewriter_debug(void) ""
 --
 .7.4

-[Qemu-devel] [PULL 1/6] colo: modified the payload compare function
+[PULL V2 20/25] vdpa: Add device migration blocker
-From: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
+From: Eugenio Pérez <eperezma@redhat.com>
-Modified the function colo_packet_compare_common to prepare for the
+Since the vhost-vdpa device is exposing _F_LOG, add a migration blocker if
-tcp packet comparison in the next patch.
+it uses CVQ.
-Cc: Zhang Chen <zhangckid@gmail.com>
+However, qemu is able to migrate simple devices with no CVQ as long as
-Cc: Li Zhijian <lizhijian@cn.fujitsu.com>
+they use SVQ. To allow it, add a placeholder error to vhost_vdpa, and
-Cc: Jason Wang <jasowang@redhat.com>
+only add it to vhost_dev when used. The vhost_dev machinery places the migration
 blocker if needed.
-Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
-Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
 Signed-off-by: Zhang Chen <zhangckid@gmail.com>
 Reviewed-by: Zhang Chen <zhangckid@gmail.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- net/colo-compare.c | 88 +++++++++++++++++++++++++++---------------------------
+ hw/virtio/vhost-vdpa.c         | 15 +++++++++++++++
-file changed, 44 insertions(+), 44 deletions(-)
+ include/hw/virtio/vhost-vdpa.h |  1 +
 files changed, 16 insertions(+)
-diff --git a/net/colo-compare.c b/net/colo-compare.c
+diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
---- a/net/colo-compare.c
+--- a/hw/virtio/vhost-vdpa.c
-+++ b/net/colo-compare.c
++++ b/hw/virtio/vhost-vdpa.c
-@@ -XXX,XX +XXX,XX @@ static int packet_enqueue(CompareState *s, int mode, Connection **con)
+@@ -XXX,XX +XXX,XX @@
-  * return:    0  means packet same
+ #include "hw/virtio/vhost-shadow-virtqueue.h"
-  *            > 0 || < 0 means packet different
+ #include "hw/virtio/vhost-vdpa.h"
-  */
+ #include "exec/address-spaces.h"
--static int colo_packet_compare_common(Packet *ppkt,
++#include "migration/blocker.h"
--                                      Packet *spkt,
+ #include "qemu/cutils.h"
--                                      int poffset,
+ #include "qemu/main-loop.h"
--                                      int soffset)
+ #include "cpu.h"
-+static int colo_compare_packet_payload(Packet *ppkt,
+@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
-+                                       Packet *spkt,
+         return true;
-+                                       uint16_t poffset,
+     }
-+                                       uint16_t soffset,
-+                                       uint16_t len)
++    if (v->migration_blocker) {
 +        int r = migrate_add_blocker(v->migration_blocker, &err);
 +        if (unlikely(r < 0)) {
 +            return false;
 +        }
 +    }
 +
- {
+     for (i = 0; i < v->shadow_vqs->len; ++i) {
-     if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) {
+         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
-         char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20];
+         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
-@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_common(Packet *ppkt,
+@@ -XXX,XX +XXX,XX @@ err:
-                                    sec_ip_src, sec_ip_dst);
+         vhost_svq_stop(svq);
      }
--    poffset = ppkt->vnet_hdr_len + poffset;
++    if (v->migration_blocker) {
--    soffset = ppkt->vnet_hdr_len + soffset;
++        migrate_del_blocker(v->migration_blocker);
--
++    }
--    if (ppkt->size - poffset == spkt->size - soffset) {
++
--        return memcmp(ppkt->data + poffset,
+     return false;
 -                      spkt->data + soffset,
 -                      spkt->size - soffset);
 -    } else {
 -        trace_colo_compare_main("Net packet size are not the same");
 -        return -1;
 -    }
 +    return memcmp(ppkt->data + poffset, spkt->data + soffset, len);
  }
- /*
+@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
-@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
+         }
       * the secondary guest's timestamp. COLO just focus on payload,
       * so we just need skip this field.
       */
 -    if (ptcp->th_off > 5) {
 -        ptrdiff_t ptcp_offset, stcp_offset;
 -        ptcp_offset = ppkt->transport_header - (uint8_t *)ppkt->data
 -                      + (ptcp->th_off * 4) - ppkt->vnet_hdr_len;
 -        stcp_offset = spkt->transport_header - (uint8_t *)spkt->data
 -                      + (stcp->th_off * 4) - spkt->vnet_hdr_len;
 +    ptrdiff_t ptcp_offset, stcp_offset;
 -        /*
 -         * When network is busy, some tcp options(like sack) will unpredictable
 -         * occur in primary side or secondary side. it will make packet size
 -         * not same, but the two packet's payload is identical. colo just
 -         * care about packet payload, so we skip the option field.
 -         */
 -        res = colo_packet_compare_common(ppkt, spkt, ptcp_offset, stcp_offset);
 -    } else if (ptcp->th_sum == stcp->th_sum) {
 -        res = colo_packet_compare_common(ppkt, spkt, ETH_HLEN, ETH_HLEN);
 +    ptcp_offset = ppkt->transport_header - (uint8_t *)ppkt->data
 +                  + (ptcp->th_off << 2) - ppkt->vnet_hdr_len;
 +    stcp_offset = spkt->transport_header - (uint8_t *)spkt->data
 +                  + (stcp->th_off << 2) - spkt->vnet_hdr_len;
 +    if (ppkt->size - ptcp_offset == spkt->size - stcp_offset) {
 +        res = colo_compare_packet_payload(ppkt, spkt,
 +                                          ptcp_offset, stcp_offset,
 +                                          ppkt->size - ptcp_offset);
      } else {
 +        trace_colo_compare_main("TCP: payload size of packets are different");
          res = -1;
      }
-@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
++    if (v->migration_blocker) {
-  */
++        migrate_del_blocker(v->migration_blocker);
  static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
  {
 -    int ret;
 -    int network_header_length = ppkt->ip->ip_hl * 4;
 +    uint16_t network_header_length = ppkt->ip->ip_hl << 2;
 +    uint16_t offset = network_header_length + ETH_HLEN + ppkt->vnet_hdr_len;
      trace_colo_compare_main("compare udp");
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
       * other field like TOS,TTL,IP Checksum. we only need to compare
       * the ip payload here.
       */
 -    ret = colo_packet_compare_common(ppkt, spkt,
 -                                     network_header_length + ETH_HLEN,
 -                                     network_header_length + ETH_HLEN);
 -
 -    if (ret) {
 +    if (ppkt->size != spkt->size) {
 +        trace_colo_compare_main("UDP: payload size of packets are different");
 +        return -1;
 +    }
-+    if (colo_compare_packet_payload(ppkt, spkt, offset, offset,
+     return true;
 +                                    ppkt->size - offset)) {
          trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
          trace_colo_compare_udp_miscompare("Secondary pkt size", spkt->size);
          if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) {
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
              qemu_hexdump((char *)spkt->data, stderr, "colo-compare sec pkt",
                           spkt->size);
          }
 +        return -1;
 +    } else {
 +        return 0;
      }
 -
 -    return ret;
  }
- /*
+diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
-@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
+index XXXXXXX..XXXXXXX 100644
-  */
+--- a/include/hw/virtio/vhost-vdpa.h
- static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt)
++++ b/include/hw/virtio/vhost-vdpa.h
- {
+@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
--    int network_header_length = ppkt->ip->ip_hl * 4;
+     bool shadow_vqs_enabled;
-+    uint16_t network_header_length = ppkt->ip->ip_hl << 2;
+     /* IOVA mapping used by the Shadow Virtqueue */
-+    uint16_t offset = network_header_length + ETH_HLEN + ppkt->vnet_hdr_len;
+     VhostIOVATree *iova_tree;
++    Error *migration_blocker;
-     trace_colo_compare_main("compare icmp");
+     GPtrArray *shadow_vqs;
+     const VhostShadowVirtqueueOps *shadow_vq_ops;
-@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt)
+     void *shadow_vq_ops_opaque;
       * other field like TOS,TTL,IP Checksum. we only need to compare
       * the ip payload here.
       */
 -    if (colo_packet_compare_common(ppkt, spkt,
 -                                   network_header_length + ETH_HLEN,
 -                                   network_header_length + ETH_HLEN)) {
 +    if (ppkt->size != spkt->size) {
 +        trace_colo_compare_main("ICMP: payload size of packets are different");
 +        return -1;
 +    }
 +    if (colo_compare_packet_payload(ppkt, spkt, offset, offset,
 +                                    ppkt->size - offset)) {
          trace_colo_compare_icmp_miscompare("primary pkt size",
                                             ppkt->size);
          trace_colo_compare_icmp_miscompare("Secondary pkt size",
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt)
   */
  static int colo_packet_compare_other(Packet *spkt, Packet *ppkt)
  {
 +    uint16_t offset = ppkt->vnet_hdr_len;
 +
      trace_colo_compare_main("compare other");
      if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) {
          char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20];
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_other(Packet *spkt, Packet *ppkt)
                                     sec_ip_src, sec_ip_dst);
      }
 -    return colo_packet_compare_common(ppkt, spkt, 0, 0);
 +    if (ppkt->size != spkt->size) {
 +        trace_colo_compare_main("Other: payload size of packets are different");
 +        return -1;
 +    }
 +    return colo_compare_packet_payload(ppkt, spkt, offset, offset,
 +                                       ppkt->size - offset);
  }
  static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)
 --
 .7.4

-[Qemu-devel] [PULL 3/6] net: Allow hubports to connect to other netdevs
+[PULL V2 21/25] vdpa: Add x-svq to NetdevVhostVDPAOptions
-From: Thomas Huth <thuth@redhat.com>
+From: Eugenio Pérez <eperezma@redhat.com>
-QEMU can emulate hubs to connect NICs and netdevs. This is currently
+Finally offering the possibility to enable SVQ from the command line.
 primarily used for the mis-named 'vlan' feature of the networking
 subsystem. Now the 'vlan' feature has been marked as deprecated, since
 its name is rather confusing and the users often rather mis-configure
 their network when trying to use it. But while the 'vlan' parameter
 should be removed at one point in time, the basic idea of emulating
 a hub in QEMU is still good: It's useful for bundling up the output of
 multiple NICs into one single l2tp netdev for example.
-Now to be able to use the hubport feature without 'vlan's, there is one
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
-missing piece: The possibility to connect a hubport to a netdev, too.
+Acked-by: Markus Armbruster <armbru@redhat.com>
-This patch adds this possibility by introducing a new "netdev=..."
+Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
 parameter to the hubports.
 To bundle up the output of multiple NICs into one socket netdev, you can
 now run QEMU with these parameters for example:
 qemu-system-ppc64 ... -netdev socket,id=s1,connect=:11122 \
     -netdev hubport,hubid=1,id=h1,netdev=s1 \
     -netdev hubport,hubid=1,id=h2 -device e1000,netdev=h2 \
     -netdev hubport,hubid=1,id=h3 -device virtio-net-pci,netdev=h3
 For using the socket netdev, you have got to start another QEMU as the
 receiving side first, for example with network dumping enabled:
 qemu-system-x86_64 -M isapc -netdev socket,id=s0,listen=:11122 \
     -device ne2k_isa,netdev=s0 \
     -object filter-dump,id=f1,netdev=s0,file=/tmp/dump.dat
 After the ppc64 guest tried to boot from both NICs, you can see in the
 dump file (using Wireshark, for example), that the output of both NICs
 (the e1000 and the virtio-net-pci) has been successfully transferred
 via the socket netdev in this case.
 Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jason Wang <jasowang@redhat.com>
 ---
- net/hub.c       | 27 +++++++++++++++++++++------
+ net/vhost-vdpa.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
- net/hub.h       |  3 ++-
+ qapi/net.json    |  9 ++++++-
- net/net.c       |  2 +-
+files changed, 77 insertions(+), 4 deletions(-)
  qapi/net.json   |  4 +++-
  qemu-options.hx |  8 +++++---
 files changed, 32 insertions(+), 12 deletions(-)
-diff --git a/net/hub.c b/net/hub.c
+diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
 index XXXXXXX..XXXXXXX 100644
---- a/net/hub.c
+--- a/net/vhost-vdpa.c
-+++ b/net/hub.c
++++ b/net/vhost-vdpa.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ const int vdpa_feature_bits[] = {
-  */
+     VHOST_INVALID_FEATURE_BIT
  #include "qemu/osdep.h"
 +#include "qapi/error.h"
  #include "monitor/monitor.h"
  #include "net/net.h"
  #include "clients.h"
@@ -XXX,XX +XXX,XX @@ static NetClientInfo net_hub_port_info = {
      .cleanup = net_hub_port_cleanup,
  };
--static NetHubPort *net_hub_port_new(NetHub *hub, const char *name)
++/** Supported device specific feature bits with SVQ */
-+static NetHubPort *net_hub_port_new(NetHub *hub, const char *name,
++static const uint64_t vdpa_svq_device_features =
-+                                    NetClientState *hubpeer)
++    BIT_ULL(VIRTIO_NET_F_CSUM) |
 +    BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
 +    BIT_ULL(VIRTIO_NET_F_MTU) |
 +    BIT_ULL(VIRTIO_NET_F_MAC) |
 +    BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
 +    BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
 +    BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
 +    BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
 +    BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
 +    BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
 +    BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
 +    BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
 +    BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
 +    BIT_ULL(VIRTIO_NET_F_STATUS) |
 +    BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
 +    BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
 +    BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
 +    BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
 +    BIT_ULL(VIRTIO_NET_F_STANDBY);
 +
  VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
  {
+     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
+@@ -XXX,XX +XXX,XX @@ err_init:
+ static void vhost_vdpa_cleanup(NetClientState *nc)
+ {
+     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
++    struct vhost_dev *dev = &s->vhost_net->dev;
+     qemu_vfree(s->cvq_cmd_out_buffer);
+     qemu_vfree(s->cvq_cmd_in_buffer);
++    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
++        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
++    }
+     if (s->vhost_net) {
+         vhost_net_cleanup(s->vhost_net);
+         g_free(s->vhost_net);
+@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
+                                            int vdpa_device_fd,
+                                            int queue_pair_index,
+                                            int nvqs,
+-                                           bool is_datapath)
++                                           bool is_datapath,
++                                           bool svq,
++                                           VhostIOVATree *iova_tree)
+ {
+     NetClientState *nc = NULL;
+     VhostVDPAState *s;
+@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
+     s->vhost_vdpa.device_fd = vdpa_device_fd;
+     s->vhost_vdpa.index = queue_pair_index;
++    s->vhost_vdpa.shadow_vqs_enabled = svq;
++    s->vhost_vdpa.iova_tree = iova_tree;
+     if (!is_datapath) {
+         s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
+                                             vhost_vdpa_net_cvq_cmd_page_len());
+@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
+         s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
+         s->vhost_vdpa.shadow_vq_ops_opaque = s;
++        error_setg(&s->vhost_vdpa.migration_blocker,
++                   "Migration disabled: vhost-vdpa uses CVQ.");
+     }
+     ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
+     if (ret) {
+@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
+     return nc;
+ }
++static int vhost_vdpa_get_iova_range(int fd,
++                                     struct vhost_vdpa_iova_range *iova_range)
++{
++    int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);
++
++    return ret < 0 ? -errno : 0;
++}
++
+ static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
+ {
+     int ret = ioctl(fd, VHOST_GET_FEATURES, features);
+@@ -XXX,XX +XXX,XX @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
+     uint64_t features;
+     int vdpa_device_fd;
+     g_autofree NetClientState **ncs = NULL;
++    g_autoptr(VhostIOVATree) iova_tree = NULL;
      NetClientState *nc;
-     NetHubPort *port;
+     int queue_pairs, r, i, has_cvq = 0;
-@@ -XXX,XX +XXX,XX @@ static NetHubPort *net_hub_port_new(NetHub *hub, const char *name)
-         name = default_name;
+@@ -XXX,XX +XXX,XX @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
          return queue_pairs;
      }
--    nc = qemu_new_net_client(&net_hub_port_info, NULL, "hub", name);
++    if (opts->x_svq) {
-+    nc = qemu_new_net_client(&net_hub_port_info, hubpeer, "hub", name);
++        struct vhost_vdpa_iova_range iova_range;
-     port = DO_UPCAST(NetHubPort, nc, nc);
++
-     port->id = id;
++        uint64_t invalid_dev_features =
-     port->hub = hub;
++            features & ~vdpa_svq_device_features &
-@@ -XXX,XX +XXX,XX @@ static NetHubPort *net_hub_port_new(NetHub *hub, const char *name)
++            /* Transport are all accepted at this point */
++            ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
- /**
++                             VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);
-  * Create a port on a given hub
++
-+ * @hub_id: Number of the hub
++        if (invalid_dev_features) {
-  * @name: Net client name or NULL for default name.
++            error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
-+ * @hubpeer: Peer to use (if "netdev=id" has been specified)
++                       invalid_dev_features);
-  *
++            goto err_svq;
-  * If there is no existing hub with the given id then a new hub is created.
++        }
-  */
++
--NetClientState *net_hub_add_port(int hub_id, const char *name)
++        vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
-+NetClientState *net_hub_add_port(int hub_id, const char *name,
++        iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
-+                                 NetClientState *hubpeer)
++    }
- {
++
-     NetHub *hub;
+     ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
-     NetHubPort *port;
-@@ -XXX,XX +XXX,XX @@ NetClientState *net_hub_add_port(int hub_id, const char *name)
+     for (i = 0; i < queue_pairs; i++) {
-         hub = net_hub_new(hub_id);
+         ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
 -                                     vdpa_device_fd, i, 2, true);
 +                                     vdpa_device_fd, i, 2, true, opts->x_svq,
 +                                     iova_tree);
          if (!ncs[i])
              goto err;
      }
--    port = net_hub_port_new(hub, name);
+     if (has_cvq) {
-+    port = net_hub_port_new(hub, name, hubpeer);
+         nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
-     return &port->nc;
+-                                 vdpa_device_fd, i, 1, false);
- }
++                                 vdpa_device_fd, i, 1, false,
++                                 opts->x_svq, iova_tree);
-@@ -XXX,XX +XXX,XX @@ NetClientState *net_hub_port_find(int hub_id)
+         if (!nc)
              goto err;
      }
 +    /* iova_tree ownership belongs to last NetClientState */
 +    g_steal_pointer(&iova_tree);
      return 0;
  err:
@@ -XXX,XX +XXX,XX @@ err:
              qemu_del_net_client(ncs[i]);
          }
      }
--    nc = net_hub_add_port(hub_id, NULL);
-+    nc = net_hub_add_port(hub_id, NULL, NULL);
-     return nc;
- }
-@@ -XXX,XX +XXX,XX @@ int net_init_hubport(const Netdev *netdev, const char *name,
-                      NetClientState *peer, Error **errp)
- {
-     const NetdevHubPortOptions *hubport;
-+    NetClientState *hubpeer = NULL;
-     assert(netdev->type == NET_CLIENT_DRIVER_HUBPORT);
-     assert(!peer);
-     hubport = &netdev->u.hubport;
--    net_hub_add_port(hubport->hubid, name);
-+    if (hubport->has_netdev) {
-+        hubpeer = qemu_find_netdev(hubport->netdev);
-+        if (!hubpeer) {
-+            error_setg(errp, "netdev '%s' not found", hubport->netdev);
-+            return -1;
-+        }
-+    }
 +
-+    net_hub_add_port(hubport->hubid, name, hubpeer);
++err_svq:
-+
+     qemu_close(vdpa_device_fd);
-     return 0;
- }
+     return -1;
 diff --git a/net/hub.h b/net/hub.h
 index XXXXXXX..XXXXXXX 100644
 --- a/net/hub.h
 +++ b/net/hub.h
@@ -XXX,XX +XXX,XX @@
  #include "qemu-common.h"
 -NetClientState *net_hub_add_port(int hub_id, const char *name);
 +NetClientState *net_hub_add_port(int hub_id, const char *name,
 +                                 NetClientState *hubpeer);
  NetClientState *net_hub_find_client_by_name(int hub_id, const char *name);
  void net_hub_info(Monitor *mon);
  void net_hub_check_clients(void);
 diff --git a/net/net.c b/net/net.c
 index XXXXXXX..XXXXXXX 100644
 --- a/net/net.c
 +++ b/net/net.c
@@ -XXX,XX +XXX,XX @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp)
          /* Do not add to a vlan if it's a nic with a netdev= parameter. */
          if (netdev->type != NET_CLIENT_DRIVER_NIC ||
              !opts->u.nic.has_netdev) {
 -            peer = net_hub_add_port(net->has_vlan ? net->vlan : 0, NULL);
 +            peer = net_hub_add_port(net->has_vlan ? net->vlan : 0, NULL, NULL);
          }
          if (net->has_vlan && !vlan_warned) {
 diff --git a/qapi/net.json b/qapi/net.json
 index XXXXXXX..XXXXXXX 100644
 --- a/qapi/net.json
 +++ b/qapi/net.json
 @@ -XXX,XX +XXX,XX @@
- # Connect two or more net clients through a software hub.
+ # @queues: number of queues to be created for multiqueue vhost-vdpa
  #          (default: 1)
  #
- # @hubid: hub identifier number
++# @x-svq: Start device with (experimental) shadow virtqueue. (Since 7.1)
-+# @netdev: used to connect hub to a netdev instead of a device (since 2.12)
++#         (default: false)
- #
++#
- # Since: 1.2
++# Features:
 +# @unstable: Member @x-svq is experimental.
 +#
  # Since: 5.1
  ##
- { 'struct': 'NetdevHubPortOptions',
+ { 'struct': 'NetdevVhostVDPAOptions',
    'data': {
--    'hubid':     'int32' } }
+     '*vhostdev':     'str',
-+    'hubid':     'int32',
+-    '*queues':       'int' } }
-+    '*netdev':    'str' } }
++    '*queues':       'int',
 +    '*x-svq':        {'type': 'bool', 'features' : [ 'unstable'] } } }
  ##
- # @NetdevNetmapOptions:
+ # @NetdevVmnetHostOptions:
 diff --git a/qemu-options.hx b/qemu-options.hx
 index XXXXXXX..XXXXXXX 100644
 --- a/qemu-options.hx
 +++ b/qemu-options.hx
@@ -XXX,XX +XXX,XX @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
  #endif
      "-netdev vhost-user,id=str,chardev=dev[,vhostforce=on|off]\n"
      "                configure a vhost-user network, backed by a chardev 'dev'\n"
 -    "-netdev hubport,id=str,hubid=n\n"
 +    "-netdev hubport,id=str,hubid=n[,netdev=nd]\n"
      "                configure a hub port on QEMU VLAN 'n'\n", QEMU_ARCH_ALL)
  DEF("net", HAS_ARG, QEMU_OPTION_net,
      "-net nic[,vlan=n][,netdev=nd][,macaddr=mac][,model=type][,name=str][,addr=str][,vectors=v]\n"
@@ -XXX,XX +XXX,XX @@ vde_switch -F -sock /tmp/myswitch
  qemu-system-i386 linux.img -net nic -net vde,sock=/tmp/myswitch
  @end example
 -@item -netdev hubport,id=@var{id},hubid=@var{hubid}
 +@item -netdev hubport,id=@var{id},hubid=@var{hubid}[,netdev=@var{nd}]
  Create a hub port on QEMU "vlan" @var{hubid}.
  The hubport netdev lets you connect a NIC to a QEMU "vlan" instead of a single
  netdev.  @code{-net} and @code{-device} with parameter @option{vlan} create the
 -required hub automatically.
 +required hub automatically. Alternatively, you can also connect the hubport
 +to another netdev with ID @var{nd} by using the @option{netdev=@var{nd}}
 +option.
  @item -netdev vhost-user,chardev=@var{id}[,vhostforce=on|off][,queues=n]
 --
 .7.4

-New patch
+[PULL V2 22/25] softmmu/runstate.c: add RunStateTransition support from COLO to PRELAUNCH
+From: Zhang Chen <chen.zhang@intel.com>
+If the checkpoint occurs when the guest finishes restarting
+but has not started running, the runstate_set() may reject
+the transition from COLO to PRELAUNCH with the crash log:
+{"timestamp": {"seconds": 1593484591, "microseconds": 26605},\
+"event": "RESET", "data": {"guest": true, "reason": "guest-reset"}}
+qemu-system-x86_64: invalid runstate transition: 'colo' -> 'prelaunch'
+Long-term testing says that it's pretty safe.
+Signed-off-by: Like Xu <like.xu@linux.intel.com>
+Signed-off-by: Zhang Chen <chen.zhang@intel.com>
+Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ softmmu/runstate.c | 1 +
+file changed, 1 insertion(+)
+diff --git a/softmmu/runstate.c b/softmmu/runstate.c
+index XXXXXXX..XXXXXXX 100644
+--- a/softmmu/runstate.c
++++ b/softmmu/runstate.c
+@@ -XXX,XX +XXX,XX @@ static const RunStateTransition runstate_transitions_def[] = {
+     { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
+     { RUN_STATE_COLO, RUN_STATE_RUNNING },
++    { RUN_STATE_COLO, RUN_STATE_PRELAUNCH },
+     { RUN_STATE_COLO, RUN_STATE_SHUTDOWN},
+     { RUN_STATE_RUNNING, RUN_STATE_DEBUG },
+--
+.7.4

-New patch
+[PULL V2 23/25] net/colo: Fix a "double free" crash to clear the conn_list
+From: Zhang Chen <chen.zhang@intel.com>
+We noticed that QEMU may crash when the guest has too many
+incoming network connections with the following log:
+@1593578622.668573:colo_proxy_main : colo proxy connection hashtable full, clear it
+free(): invalid pointer
+[1]    15195 abort (core dumped)  qemu-system-x86_64 ....
+This is because we create the s->connection_track_table with
+g_hash_table_new_full() which is defined as:
+GHashTable * g_hash_table_new_full (GHashFunc hash_func,
+                       GEqualFunc key_equal_func,
+                       GDestroyNotify key_destroy_func,
+                       GDestroyNotify value_destroy_func);
+The fourth parameter connection_destroy() will be called to free the
+memory allocated for all 'Connection' values in the hashtable when
+we call g_hash_table_remove_all() in the connection_hashtable_reset().
+But both connection_track_table and conn_list refer to the same
+conn instance, so clearing conn_list triggers a double free. This
+patch removes the free action on the hash table side to avoid freeing
+the conn twice.
+Signed-off-by: Like Xu <like.xu@linux.intel.com>
+Signed-off-by: Zhang Chen <chen.zhang@intel.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ net/colo-compare.c    | 2 +-
+ net/filter-rewriter.c | 2 +-
+files changed, 2 insertions(+), 2 deletions(-)
+diff --git a/net/colo-compare.c b/net/colo-compare.c
+index XXXXXXX..XXXXXXX 100644
+--- a/net/colo-compare.c
++++ b/net/colo-compare.c
+@@ -XXX,XX +XXX,XX @@ static void colo_compare_complete(UserCreatable *uc, Error **errp)
+     s->connection_track_table = g_hash_table_new_full(connection_key_hash,
+                                                       connection_key_equal,
+                                                       g_free,
+-                                                      connection_destroy);
++                                                      NULL);
+     colo_compare_iothread(s);
+diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c
+index XXXXXXX..XXXXXXX 100644
+--- a/net/filter-rewriter.c
++++ b/net/filter-rewriter.c
+@@ -XXX,XX +XXX,XX @@ static void colo_rewriter_setup(NetFilterState *nf, Error **errp)
+     s->connection_track_table = g_hash_table_new_full(connection_key_hash,
+                                                       connection_key_equal,
+                                                       g_free,
+-                                                      connection_destroy);
++                                                      NULL);
+     s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
+ }
+--
+.7.4

-New patch
+[PULL V2 24/25] net/colo.c: No need to track conn_list for filter-rewriter
+From: Zhang Chen <chen.zhang@intel.com>
+Filter-rewriter does not need to track connections in conn_list.
+This patch fixes the glib g_queue_is_empty assertion hit when a COLO
+guest keeps a lot of network connections.
+Signed-off-by: Zhang Chen <chen.zhang@intel.com>
+Reviewed-by: Li Zhijian <lizhijian@fujitsu.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ net/colo.c | 2 +-
+file changed, 1 insertion(+), 1 deletion(-)
+diff --git a/net/colo.c b/net/colo.c
+index XXXXXXX..XXXXXXX 100644
+--- a/net/colo.c
++++ b/net/colo.c
+@@ -XXX,XX +XXX,XX @@ Connection *connection_get(GHashTable *connection_track_table,
+             /*
+              * clear the conn_list
+              */
+-            while (!g_queue_is_empty(conn_list)) {
++            while (conn_list && !g_queue_is_empty(conn_list)) {
+                 connection_destroy(g_queue_pop_head(conn_list));
+             }
+         }
+--
+.7.4

-New patch
+[PULL V2 25/25] net/colo.c: fix segmentation fault when packet is not parsed correctly
+From: Zhang Chen <chen.zhang@intel.com>
+When COLO uses only one vnet_hdr_support parameter between
+filter-redirector and filter-mirror (or colo-compare), COLO will crash
+with a segmentation fault. Backtrace as follows:
+Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault.
+0x0000555555cb200b in eth_get_l2_hdr_length (p=0x0)
+    at /home/tao/project/COLO/colo-qemu/include/net/eth.h:296
+uint16_t proto = be16_to_cpu(PKT_GET_ETH_HDR(p)->h_proto);
+(gdb) bt
+0x0000555555cb200b in eth_get_l2_hdr_length (p=0x0)
+    at /home/tao/project/COLO/colo-qemu/include/net/eth.h:296
+0x0000555555cb22b4 in parse_packet_early (pkt=0x555556a44840) at
+net/colo.c:49
+0x0000555555cb2b91 in is_tcp_packet (pkt=0x555556a44840) at
+net/filter-rewriter.c:63
+So a wrong vnet_hdr_len will cause pkt->data to become NULL. Add a check to
+raise an error and add trace-events to track vnet_hdr_len.
+Signed-off-by: Tao Xu <tao3.xu@intel.com>
+Signed-off-by: Zhang Chen <chen.zhang@intel.com>
+Reviewed-by: Li Zhijian <lizhijian@fujitsu.com>
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+---
+ net/colo.c       | 9 ++++++++-
+ net/trace-events | 1 +
+files changed, 9 insertions(+), 1 deletion(-)
+diff --git a/net/colo.c b/net/colo.c
+index XXXXXXX..XXXXXXX 100644
+--- a/net/colo.c
++++ b/net/colo.c
+@@ -XXX,XX +XXX,XX @@ int parse_packet_early(Packet *pkt)
+     static const uint8_t vlan[] = {0x81, 0x00};
+     uint8_t *data = pkt->data + pkt->vnet_hdr_len;
+     uint16_t l3_proto;
+-    ssize_t l2hdr_len = eth_get_l2_hdr_length(data);
++    ssize_t l2hdr_len;
++
++    if (data == NULL) {
++        trace_colo_proxy_main_vnet_info("This packet is not parsed correctly, "
++                                        "pkt->vnet_hdr_len", pkt->vnet_hdr_len);
++        return 1;
++    }
++    l2hdr_len = eth_get_l2_hdr_length(data);
+     if (pkt->size < ETH_HLEN + pkt->vnet_hdr_len) {
+         trace_colo_proxy_main("pkt->size < ETH_HLEN");
+diff --git a/net/trace-events b/net/trace-events
+index XXXXXXX..XXXXXXX 100644
+--- a/net/trace-events
++++ b/net/trace-events
+@@ -XXX,XX +XXX,XX @@ vhost_user_event(const char *chr, int event) "chr: %s got event: %d"
+ # colo.c
+ colo_proxy_main(const char *chr) ": %s"
++colo_proxy_main_vnet_info(const char *sta, int size) ": %s = %d"
+ # colo-compare.c
+ colo_compare_main(const char *chr) ": %s"
+--
+.7.4

The following changes since commit e607bbee553cfe73072870cef458cfa4e78133e2:

Merge remote-tracking branch 'remotes/edgar/tags/edgar/xilinx-next-2018-01-26.for-upstream' into staging (2018-01-26 14:24:25 +0000)

are available in the git repository at:

https://github.com/jasowang/qemu.git tags/net-pull-request

for you to fetch changes up to bf4835a4d5338bb7424827715df22570a8adc67c:

MAINTAINERS: update Dmitry Fleytman email (2018-01-29 16:05:38 +0800)

----------------------------------------------------------------

----------------------------------------------------------------
Mao Zhongyi (2):
      colo: modified the payload compare function
      colo: compare the packet based on the tcp sequence number

Philippe Mathieu-Daudé (1):
      MAINTAINERS: update Dmitry Fleytman email

Thomas Huth (3):
      net: Allow hubports to connect to other netdevs
      net: Allow netdevs to be used with 'hostfwd_add' and 'hostfwd_remove'
      qemu-doc: Get rid of "vlan=X" example in the documentation

From: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>

Modified the function colo_packet_compare_common to prepare for the
tcp packet comparison in the next patch.

Cc: Zhang Chen <zhangckid@gmail.com>
Cc: Li Zhijian <lizhijian@cn.fujitsu.com>
Cc: Jason Wang <jasowang@redhat.com>

Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
Reviewed-by: Zhang Chen <zhangckid@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 net/colo-compare.c | 88 +++++++++++++++++++++++++++---------------------------
 1 file changed, 44 insertions(+), 44 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index XXXXXXX..XXXXXXX 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -XXX,XX +XXX,XX @@ static int packet_enqueue(CompareState *s, int mode, Connection **con)
  * return:    0  means packet same
  *            > 0 || < 0 means packet different
  */
-static int colo_packet_compare_common(Packet *ppkt,
-                                      Packet *spkt,
-                                      int poffset,
-                                      int soffset)
+static int colo_compare_packet_payload(Packet *ppkt,
+                                       Packet *spkt,
+                                       uint16_t poffset,
+                                       uint16_t soffset,
+                                       uint16_t len)
+
 {
     if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) {
         char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20];
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_common(Packet *ppkt,
                                    sec_ip_src, sec_ip_dst);
     }
 
-    poffset = ppkt->vnet_hdr_len + poffset;
-    soffset = ppkt->vnet_hdr_len + soffset;
-
-    if (ppkt->size - poffset == spkt->size - soffset) {
-        return memcmp(ppkt->data + poffset,
-                      spkt->data + soffset,
-                      spkt->size - soffset);
-    } else {
-        trace_colo_compare_main("Net packet size are not the same");
-        return -1;
-    }
+    return memcmp(ppkt->data + poffset, spkt->data + soffset, len);
 }
 
 /*
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
      * the secondary guest's timestamp. COLO just focus on payload,
      * so we just need skip this field.
      */
-    if (ptcp->th_off > 5) {
-        ptrdiff_t ptcp_offset, stcp_offset;
 
-        ptcp_offset = ppkt->transport_header - (uint8_t *)ppkt->data
-                      + (ptcp->th_off * 4) - ppkt->vnet_hdr_len;
-        stcp_offset = spkt->transport_header - (uint8_t *)spkt->data
-                      + (stcp->th_off * 4) - spkt->vnet_hdr_len;
+    ptrdiff_t ptcp_offset, stcp_offset;
 
-        /*
-         * When network is busy, some tcp options(like sack) will unpredictable
-         * occur in primary side or secondary side. it will make packet size
-         * not same, but the two packet's payload is identical. colo just
-         * care about packet payload, so we skip the option field.
-         */
-        res = colo_packet_compare_common(ppkt, spkt, ptcp_offset, stcp_offset);
-    } else if (ptcp->th_sum == stcp->th_sum) {
-        res = colo_packet_compare_common(ppkt, spkt, ETH_HLEN, ETH_HLEN);
+    ptcp_offset = ppkt->transport_header - (uint8_t *)ppkt->data
+                  + (ptcp->th_off << 2) - ppkt->vnet_hdr_len;
+    stcp_offset = spkt->transport_header - (uint8_t *)spkt->data
+                  + (stcp->th_off << 2) - spkt->vnet_hdr_len;
+    if (ppkt->size - ptcp_offset == spkt->size - stcp_offset) {
+        res = colo_compare_packet_payload(ppkt, spkt,
+                                          ptcp_offset, stcp_offset,
+                                          ppkt->size - ptcp_offset);
     } else {
+        trace_colo_compare_main("TCP: payload size of packets are different");
         res = -1;
     }
 
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
  */
 static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
 {
-    int ret;
-    int network_header_length = ppkt->ip->ip_hl * 4;
+    uint16_t network_header_length = ppkt->ip->ip_hl << 2;
+    uint16_t offset = network_header_length + ETH_HLEN + ppkt->vnet_hdr_len;
 
     trace_colo_compare_main("compare udp");
 
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
      * other field like TOS,TTL,IP Checksum. we only need to compare
      * the ip payload here.
      */
-    ret = colo_packet_compare_common(ppkt, spkt,
-                                     network_header_length + ETH_HLEN,
-                                     network_header_length + ETH_HLEN);
-
-    if (ret) {
+    if (ppkt->size != spkt->size) {
+        trace_colo_compare_main("UDP: payload size of packets are different");
+        return -1;
+    }
+    if (colo_compare_packet_payload(ppkt, spkt, offset, offset,
+                                    ppkt->size - offset)) {
         trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
         trace_colo_compare_udp_miscompare("Secondary pkt size", spkt->size);
         if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) {
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
             qemu_hexdump((char *)spkt->data, stderr, "colo-compare sec pkt",
                          spkt->size);
         }
+        return -1;
+    } else {
+        return 0;
     }
-
-    return ret;
 }
 
 /*
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
  */
 static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt)
 {
-    int network_header_length = ppkt->ip->ip_hl * 4;
+    uint16_t network_header_length = ppkt->ip->ip_hl << 2;
+    uint16_t offset = network_header_length + ETH_HLEN + ppkt->vnet_hdr_len;
 
     trace_colo_compare_main("compare icmp");
 
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt)
      * other field like TOS,TTL,IP Checksum. we only need to compare
      * the ip payload here.
      */
-    if (colo_packet_compare_common(ppkt, spkt,
-                                   network_header_length + ETH_HLEN,
-                                   network_header_length + ETH_HLEN)) {
+    if (ppkt->size != spkt->size) {
+        trace_colo_compare_main("ICMP: payload size of packets are different");
+        return -1;
+    }
+    if (colo_compare_packet_payload(ppkt, spkt, offset, offset,
+                                    ppkt->size - offset)) {
         trace_colo_compare_icmp_miscompare("primary pkt size",
                                            ppkt->size);
         trace_colo_compare_icmp_miscompare("Secondary pkt size",
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt)
  */
 static int colo_packet_compare_other(Packet *spkt, Packet *ppkt)
 {
+    uint16_t offset = ppkt->vnet_hdr_len;
+
     trace_colo_compare_main("compare other");
     if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) {
         char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20];
@@ -XXX,XX +XXX,XX @@ static int colo_packet_compare_other(Packet *spkt, Packet *ppkt)
                                    sec_ip_src, sec_ip_dst);
     }
 
-    return colo_packet_compare_common(ppkt, spkt, 0, 0);
+    if (ppkt->size != spkt->size) {
+        trace_colo_compare_main("Other: payload size of packets are different");
+        return -1;
+    }
+    return colo_compare_packet_payload(ppkt, spkt, offset, offset,
+                                       ppkt->size - offset);
 }
 
 static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)
-- 
2.7.4

From: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>

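Packet sizes sometimes differ between the primary and the secondary,
especially when the network is busy. The existing comparison requires
the payload sizes to be equal, but TCP does not guarantee that both
sides split the same data into packets in the same way,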

like that:
We send this payload:
------------------------------
| header |1|2|3|4|5|6|7|8|9|0|
------------------------------

primary:
ppkt1:
----------------
| header |1|2|3|
----------------
ppkt2:
------------------------
| header |4|5|6|7|8|9|0|
------------------------

secondary:
spkt1:
------------------------------
| header |1|2|3|4|5|6|7|8|9|0|
------------------------------

In the original method, ppkt1 and ppkt2 each differ in size from
spkt1, so they cannot be compared and a checkpoint is triggered.

I tested fetching 200M and 1G files over FTP many times and found
that the performance was less than 1% of native.

This patch reworks the comparison of TCP packets so that it is based
on the TCP sequence number. First, ppkt1 and spkt1 have the same
starting sequence number, so they can be compared even though their
lengths differ. The payload length of ppkt1, the smaller of the two,
is used as the comparison length; if the payloads match, ppkt1 is
sent out and the offset (the length of ppkt1's payload) is recorded
in spkt1. In the next comparison, ppkt2 is compared with spkt1
starting from the position recorded in spkt1.

like that:
----------------
| header |1|2|3| ppkt1
---------|-----|
         |     |
---------v-----v--------------
| header |1|2|3|4|5|6|7|8|9|0| spkt1
---------------|\------------|
               | \offset     |
      ---------v-------------v
      | header |4|5|6|7|8|9|0| ppkt2
      ------------------------

In this way, the performance can reach 20% of native in my repeated
tests.

Cc: Zhang Chen <zhangckid@gmail.com>
Cc: Li Zhijian <lizhijian@cn.fujitsu.com>
Cc: Jason Wang <jasowang@redhat.com>

Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
Reviewed-by: Zhang Chen <zhangckid@gmail.com>
Tested-by: Zhang Chen <zhangckid@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 net/colo-compare.c | 343 +++++++++++++++++++++++++++++++++++------------------
 net/colo.c         |   9 ++
 net/colo.h         |  15 +++
 net/trace-events   |   2 +-
 4 files changed, 250 insertions(+), 119 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index XXXXXXX..XXXXXXX 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -XXX,XX +XXX,XX @@
 #define COMPARE_READ_LEN_MAX NET_BUFSIZE
 #define MAX_QUEUE_SIZE 1024
 
+#define COLO_COMPARE_FREE_PRIMARY     0x01
+#define COLO_COMPARE_FREE_SECONDARY   0x02
+
 /* TODO: Should be configurable */
 #define REGULAR_PACKET_CHECK_MS 3000
 
@@ -XXX,XX +XXX,XX @@ static gint seq_sorter(Packet *a, Packet *b, gpointer data)
     return ntohl(atcp->th_seq) - ntohl(btcp->th_seq);
 }
 
+static void fill_pkt_tcp_info(void *data, uint32_t *max_ack)
+{
+    Packet *pkt = data;
+    struct tcphdr *tcphd;
+
+    tcphd = (struct tcphdr *)pkt->transport_header;
+
+    pkt->tcp_seq = ntohl(tcphd->th_seq);
+    pkt->tcp_ack = ntohl(tcphd->th_ack);
+    *max_ack = *max_ack > pkt->tcp_ack ? *max_ack : pkt->tcp_ack;
+    pkt->header_size = pkt->transport_header - (uint8_t *)pkt->data
+                       + (tcphd->th_off << 2) - pkt->vnet_hdr_len;
+    pkt->payload_size = pkt->size - pkt->header_size;
+    pkt->seq_end = pkt->tcp_seq + pkt->payload_size;
+    pkt->flags = tcphd->th_flags;
+}
+
 /*
  * Return 1 on success, if return 0 means the
  * packet will be dropped
  */
-static int colo_insert_packet(GQueue *queue, Packet *pkt)
+static int colo_insert_packet(GQueue *queue, Packet *pkt, uint32_t *max_ack)
 {
     if (g_queue_get_length(queue) <= MAX_QUEUE_SIZE) {
         if (pkt->ip->ip_p == IPPROTO_TCP) {
+            fill_pkt_tcp_info(pkt, max_ack);
             g_queue_insert_sorted(queue,
                                   pkt,
                                   (GCompareDataFunc)seq_sorter,
@@ -XXX,XX +XXX,XX @@ static int packet_enqueue(CompareState *s, int mode, Connection **con)
     }
 
     if (mode == PRIMARY_IN) {
-        if (!colo_insert_packet(&conn->primary_list, pkt)) {
+        if (!colo_insert_packet(&conn->primary_list, pkt, &conn->pack)) {
             error_report("colo compare primary queue size too big,"
                          "drop packet");
         }
     } else {
-        if (!colo_insert_packet(&conn->secondary_list, pkt)) {
+        if (!colo_insert_packet(&conn->secondary_list, pkt, &conn->sack)) {
             error_report("colo compare secondary queue size too big,"
                          "drop packet");
         }
@@ -XXX,XX +XXX,XX @@ static int packet_enqueue(CompareState *s, int mode, Connection **con)
     return 0;
 }
 
+static inline bool after(uint32_t seq1, uint32_t seq2)
+{
+        return (int32_t)(seq1 - seq2) > 0;
+}
+
+static void colo_release_primary_pkt(CompareState *s, Packet *pkt)
+{
+    int ret;
+    ret = compare_chr_send(s,
+                           pkt->data,
+                           pkt->size,
+                           pkt->vnet_hdr_len);
+    if (ret < 0) {
+        error_report("colo send primary packet failed");
+    }
+    trace_colo_compare_main("packet same and release packet");
+    packet_destroy(pkt, NULL);
+}
+
 /*
  * The IP packets sent by primary and secondary
  * will be compared in here
@@ -XXX,XX +XXX,XX @@ static int colo_compare_packet_payload(Packet *ppkt,
 }
 
 /*
- * Called from the compare thread on the primary
- * for compare tcp packet
- * compare_tcp copied from Dr. David Alan Gilbert's branch
- */
-static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
+ * return true means that the payload is consist and
+ * need to make the next comparison, false means do
+ * the checkpoint
+*/
+static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt,
+                              int8_t *mark, uint32_t max_ack)
 {
-    struct tcphdr *ptcp, *stcp;
-    int res;
+    *mark = 0;
+
+    if (ppkt->tcp_seq == spkt->tcp_seq && ppkt->seq_end == spkt->seq_end) {
+        if (colo_compare_packet_payload(ppkt, spkt,
+                                        ppkt->header_size, spkt->header_size,
+                                        ppkt->payload_size)) {
+            *mark = COLO_COMPARE_FREE_SECONDARY | COLO_COMPARE_FREE_PRIMARY;
+            return true;
+        }
+    }
+    if (ppkt->tcp_seq == spkt->tcp_seq && ppkt->seq_end == spkt->seq_end) {
+        if (colo_compare_packet_payload(ppkt, spkt,
+                                        ppkt->header_size, spkt->header_size,
+                                        ppkt->payload_size)) {
+            *mark = COLO_COMPARE_FREE_SECONDARY | COLO_COMPARE_FREE_PRIMARY;
+            return true;
+        }
+    }
+
+    /* one part of secondary packet payload still need to be compared */
+    if (!after(ppkt->seq_end, spkt->seq_end)) {
+        if (colo_compare_packet_payload(ppkt, spkt,
+                                        ppkt->header_size + ppkt->offset,
+                                        spkt->header_size + spkt->offset,
+                                        ppkt->payload_size - ppkt->offset)) {
+            if (!after(ppkt->tcp_ack, max_ack)) {
+                *mark = COLO_COMPARE_FREE_PRIMARY;
+                spkt->offset += ppkt->payload_size - ppkt->offset;
+                return true;
+            } else {
+                /* secondary guest hasn't ack the data, don't send
+                 * out this packet
+                 */
+                return false;
+            }
+        }
+    } else {
+        /* primary packet is longer than secondary packet, compare
+         * the same part and mark the primary packet offset
+         */
+        if (colo_compare_packet_payload(ppkt, spkt,
+                                        ppkt->header_size + ppkt->offset,
+                                        spkt->header_size + spkt->offset,
+                                        spkt->payload_size - spkt->offset)) {
+            *mark = COLO_COMPARE_FREE_SECONDARY;
+            ppkt->offset += spkt->payload_size - spkt->offset;
+            return true;
+        }
+    }
 
-    trace_colo_compare_main("compare tcp");
+    return false;
+}
 
-    ptcp = (struct tcphdr *)ppkt->transport_header;
-    stcp = (struct tcphdr *)spkt->transport_header;
+static void colo_compare_tcp(CompareState *s, Connection *conn)
+{
+    Packet *ppkt = NULL, *spkt = NULL;
+    int8_t mark;
 
     /*
-     * The 'identification' field in the IP header is *very* random
-     * it almost never matches.  Fudge this by ignoring differences in
-     * unfragmented packets; they'll normally sort themselves out if different
-     * anyway, and it should recover at the TCP level.
-     * An alternative would be to get both the primary and secondary to rewrite
-     * somehow; but that would need some sync traffic to sync the state
-     */
-    if (ntohs(ppkt->ip->ip_off) & IP_DF) {
-        spkt->ip->ip_id = ppkt->ip->ip_id;
-        /* and the sum will be different if the IDs were different */
-        spkt->ip->ip_sum = ppkt->ip->ip_sum;
+     * If ppkt and spkt have the same payload, but ppkt's ACK
+     * is greater than spkt's ACK, in this case we can not
+     * send the ppkt because it will cause the secondary guest
+     * to miss sending some data in the next. Therefore, we
+     * record the maximum ACK in the current queue at both
+     * primary side and secondary side. Only when the ack is
+     * less than the smaller of the two maximum ack, then we
+     * can ensure that the packet's payload is acknowledged by
+     * primary and secondary.
+    */
+    uint32_t min_ack = conn->pack > conn->sack ? conn->sack : conn->pack;
+
+pri:
+    if (g_queue_is_empty(&conn->primary_list)) {
+        return;
     }
+    ppkt = g_queue_pop_head(&conn->primary_list);
+sec:
+    if (g_queue_is_empty(&conn->secondary_list)) {
+        g_queue_push_head(&conn->primary_list, ppkt);
+        return;
+    }
+    spkt = g_queue_pop_head(&conn->secondary_list);
 
-    /*
-     * Check tcp header length for tcp option field.
-     * th_off > 5 means this tcp packet have options field.
-     * The tcp options maybe always different.
-     * for example:
-     * From RFC 7323.
-     * TCP Timestamps option (TSopt):
-     * Kind: 8
-     *
-     * Length: 10 bytes
-     *
-     *    +-------+-------+---------------------+---------------------+
-     *    |Kind=8 |  10   |   TS Value (TSval)  |TS Echo Reply (TSecr)|
-     *    +-------+-------+---------------------+---------------------+
-     *       1       1              4                     4
-     *
-     * In this case the primary guest's timestamp always different with
-     * the secondary guest's timestamp. COLO just focus on payload,
-     * so we just need skip this field.
-     */
+    if (ppkt->tcp_seq == ppkt->seq_end) {
+        colo_release_primary_pkt(s, ppkt);
+        ppkt = NULL;
+    }
 
-    ptrdiff_t ptcp_offset, stcp_offset;
+    if (ppkt && conn->compare_seq && !after(ppkt->seq_end, conn->compare_seq)) {
+        trace_colo_compare_main("pri: this packet has compared");
+        colo_release_primary_pkt(s, ppkt);
+        ppkt = NULL;
+    }
 
-    ptcp_offset = ppkt->transport_header - (uint8_t *)ppkt->data
-                  + (ptcp->th_off << 2) - ppkt->vnet_hdr_len;
-    stcp_offset = spkt->transport_header - (uint8_t *)spkt->data
-                  + (stcp->th_off << 2) - spkt->vnet_hdr_len;
-    if (ppkt->size - ptcp_offset == spkt->size - stcp_offset) {
-        res = colo_compare_packet_payload(ppkt, spkt,
-                                          ptcp_offset, stcp_offset,
-                                          ppkt->size - ptcp_offset);
+    if (spkt->tcp_seq == spkt->seq_end) {
+        packet_destroy(spkt, NULL);
+        if (!ppkt) {
+            goto pri;
+        } else {
+            goto sec;
+        }
     } else {
-        trace_colo_compare_main("TCP: payload size of packets are different");
-        res = -1;
+        if (conn->compare_seq && !after(spkt->seq_end, conn->compare_seq)) {
+            trace_colo_compare_main("sec: this packet has compared");
+            packet_destroy(spkt, NULL);
+            if (!ppkt) {
+                goto pri;
+            } else {
+                goto sec;
+            }
+        }
+        if (!ppkt) {
+            g_queue_push_head(&conn->secondary_list, spkt);
+            goto pri;
+        }
     }
 
-    if (res != 0 &&
-        trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) {
-        char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20];
-
-        strcpy(pri_ip_src, inet_ntoa(ppkt->ip->ip_src));
-        strcpy(pri_ip_dst, inet_ntoa(ppkt->ip->ip_dst));
-        strcpy(sec_ip_src, inet_ntoa(spkt->ip->ip_src));
-        strcpy(sec_ip_dst, inet_ntoa(spkt->ip->ip_dst));
-
-        trace_colo_compare_ip_info(ppkt->size, pri_ip_src,
-                                   pri_ip_dst, spkt->size,
-                                   sec_ip_src, sec_ip_dst);
-
-        trace_colo_compare_tcp_info("pri tcp packet",
-                                    ntohl(ptcp->th_seq),
-                                    ntohl(ptcp->th_ack),
-                                    res, ptcp->th_flags,
-                                    ppkt->size);
-
-        trace_colo_compare_tcp_info("sec tcp packet",
-                                    ntohl(stcp->th_seq),
-                                    ntohl(stcp->th_ack),
-                                    res, stcp->th_flags,
-                                    spkt->size);
+    if (colo_mark_tcp_pkt(ppkt, spkt, &mark, min_ack)) {
+        trace_colo_compare_tcp_info("pri",
+                                    ppkt->tcp_seq, ppkt->tcp_ack,
+                                    ppkt->header_size, ppkt->payload_size,
+                                    ppkt->offset, ppkt->flags);
+
+        trace_colo_compare_tcp_info("sec",
+                                    spkt->tcp_seq, spkt->tcp_ack,
+                                    spkt->header_size, spkt->payload_size,
+                                    spkt->offset, spkt->flags);
+
+        if (mark == COLO_COMPARE_FREE_PRIMARY) {
+            conn->compare_seq = ppkt->seq_end;
+            colo_release_primary_pkt(s, ppkt);
+            g_queue_push_head(&conn->secondary_list, spkt);
+            goto pri;
+        }
+        if (mark == COLO_COMPARE_FREE_SECONDARY) {
+            conn->compare_seq = spkt->seq_end;
+            packet_destroy(spkt, NULL);
+            goto sec;
+        }
+        if (mark == (COLO_COMPARE_FREE_PRIMARY | COLO_COMPARE_FREE_SECONDARY)) {
+            conn->compare_seq = ppkt->seq_end;
+            colo_release_primary_pkt(s, ppkt);
+            packet_destroy(spkt, NULL);
+            goto pri;
+        }
+    } else {
+        g_queue_push_head(&conn->primary_list, ppkt);
+        g_queue_push_head(&conn->secondary_list, spkt);
 
         qemu_hexdump((char *)ppkt->data, stderr,
                      "colo-compare ppkt", ppkt->size);
         qemu_hexdump((char *)spkt->data, stderr,
                      "colo-compare spkt", spkt->size);
-    }
 
-    return res;
+        /*
+         * colo_compare_inconsistent_notify();
+         * TODO: notice to checkpoint();
+         */
+    }
 }
 
+
 /*
  * Called from the compare thread on the primary
  * for compare udp packet
@@ -XXX,XX +XXX,XX @@ static void colo_old_packet_check(void *opaque)
                         (GCompareFunc)colo_old_packet_check_one_conn);
 }
 
-/*
- * Called from the compare thread on the primary
- * for compare packet with secondary list of the
- * specified connection when a new packet was
- * queued to it.
- */
-static void colo_compare_connection(void *opaque, void *user_data)
+static void colo_compare_packet(CompareState *s, Connection *conn,
+                                int (*HandlePacket)(Packet *spkt,
+                                Packet *ppkt))
 {
-    CompareState *s = user_data;
-    Connection *conn = opaque;
     Packet *pkt = NULL;
     GList *result = NULL;
-    int ret;
 
     while (!g_queue_is_empty(&conn->primary_list) &&
            !g_queue_is_empty(&conn->secondary_list)) {
         pkt = g_queue_pop_head(&conn->primary_list);
-        switch (conn->ip_proto) {
-        case IPPROTO_TCP:
-            result = g_queue_find_custom(&conn->secondary_list,
-                     pkt, (GCompareFunc)colo_packet_compare_tcp);
-            break;
-        case IPPROTO_UDP:
-            result = g_queue_find_custom(&conn->secondary_list,
-                     pkt, (GCompareFunc)colo_packet_compare_udp);
-            break;
-        case IPPROTO_ICMP:
-            result = g_queue_find_custom(&conn->secondary_list,
-                     pkt, (GCompareFunc)colo_packet_compare_icmp);
-            break;
-        default:
-            result = g_queue_find_custom(&conn->secondary_list,
-                     pkt, (GCompareFunc)colo_packet_compare_other);
-            break;
-        }
+        result = g_queue_find_custom(&conn->secondary_list,
+                 pkt, (GCompareFunc)HandlePacket);
 
         if (result) {
-            ret = compare_chr_send(s,
-                                   pkt->data,
-                                   pkt->size,
-                                   pkt->vnet_hdr_len);
-            if (ret < 0) {
-                error_report("colo_send_primary_packet failed");
-            }
-            trace_colo_compare_main("packet same and release packet");
+            colo_release_primary_pkt(s, pkt);
             g_queue_remove(&conn->secondary_list, result->data);
-            packet_destroy(pkt, NULL);
         } else {
             /*
              * If one packet arrive late, the secondary_list or
@@ -XXX,XX +XXX,XX @@ static void colo_compare_connection(void *opaque, void *user_data)
     }
 }
 
+/*
+ * Called from the compare thread on the primary
+ * for compare packet with secondary list of the
+ * specified connection when a new packet was
+ * queued to it.
+ */
+static void colo_compare_connection(void *opaque, void *user_data)
+{
+    CompareState *s = user_data;
+    Connection *conn = opaque;
+
+    switch (conn->ip_proto) {
+    case IPPROTO_TCP:
+        colo_compare_tcp(s, conn);
+        break;
+    case IPPROTO_UDP:
+        colo_compare_packet(s, conn, colo_packet_compare_udp);
+        break;
+    case IPPROTO_ICMP:
+        colo_compare_packet(s, conn, colo_packet_compare_icmp);
+        break;
+    default:
+        colo_compare_packet(s, conn, colo_packet_compare_other);
+        break;
+    }
+}
+
 static int compare_chr_send(CompareState *s,
                             const uint8_t *buf,
                             uint32_t size,
diff --git a/net/colo.c b/net/colo.c
index XXXXXXX..XXXXXXX 100644
--- a/net/colo.c
+++ b/net/colo.c
@@ -XXX,XX +XXX,XX @@ Connection *connection_new(ConnectionKey *key)
     conn->processing = false;
     conn->offset = 0;
     conn->syn_flag = 0;
+    conn->pack = 0;
+    conn->sack = 0;
     g_queue_init(&conn->primary_list);
     g_queue_init(&conn->secondary_list);
 
@@ -XXX,XX +XXX,XX @@ Packet *packet_new(const void *data, int size, int vnet_hdr_len)
     pkt->size = size;
     pkt->creation_ms = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     pkt->vnet_hdr_len = vnet_hdr_len;
+    pkt->tcp_seq = 0;
+    pkt->tcp_ack = 0;
+    pkt->seq_end = 0;
+    pkt->header_size = 0;
+    pkt->payload_size = 0;
+    pkt->offset = 0;
+    pkt->flags = 0;
 
     return pkt;
 }
diff --git a/net/colo.h b/net/colo.h
index XXXXXXX..XXXXXXX 100644
--- a/net/colo.h
+++ b/net/colo.h
@@ -XXX,XX +XXX,XX @@ typedef struct Packet {
     int64_t creation_ms;
     /* Get vnet_hdr_len from filter */
     uint32_t vnet_hdr_len;
+    uint32_t tcp_seq; /* sequence number */
+    uint32_t tcp_ack; /* acknowledgement number */
+    /* the sequence number of the last byte of the packet */
+    uint32_t seq_end;
+    uint8_t header_size;  /* the header length */
+    uint16_t payload_size; /* the payload length */
+    /* record the payload offset(the length that has been compared) */
+    uint16_t offset;
+    uint8_t flags; /* Flags(aka Control bits) */
 } Packet;
 
 typedef struct ConnectionKey {
@@ -XXX,XX +XXX,XX @@ typedef struct Connection {
     /* flag to enqueue unprocessed_connections */
     bool processing;
     uint8_t ip_proto;
+    /* record the sequence number that has been compared */
+    uint32_t compare_seq;
+    /* the maximum of acknowledgement number in primary_list queue */
+    uint32_t pack;
+    /* the maximum of acknowledgement number in secondary_list queue */
+    uint32_t sack;
     /* offset = secondary_seq - primary_seq */
     tcp_seq  offset;
     /*
diff --git a/net/trace-events b/net/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/net/trace-events
+++ b/net/trace-events
@@ -XXX,XX +XXX,XX @@ colo_compare_icmp_miscompare(const char *sta, int size) ": %s = %d"
 colo_compare_ip_info(int psize, const char *sta, const char *stb, int ssize, const char *stc, const char *std) "ppkt size = %d, ip_src = %s, ip_dst = %s, spkt size = %d, ip_src = %s, ip_dst = %s"
 colo_old_packet_check_found(int64_t old_time) "%" PRId64
 colo_compare_miscompare(void) ""
-colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int res, uint32_t flag, int size) "side: %s seq/ack= %u/%u res= %d flags= 0x%x pkt_size: %d\n"
+colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int hdlen, int pdlen, int offset, int flags) "%s: seq/ack= %u/%u hdlen= %d pdlen= %d offset= %d flags=%d\n"
 
 # net/filter-rewriter.c
 colo_filter_rewriter_debug(void) ""
-- 
2.7.4

From: Thomas Huth <thuth@redhat.com>

QEMU can emulate hubs to connect NICs and netdevs. This is currently
primarily used for the mis-named 'vlan' feature of the networking
subsystem. Now the 'vlan' feature has been marked as deprecated, since
its name is rather confusing and users often misconfigure their
network when trying to use it. But while the 'vlan' parameter
should be removed at one point in time, the basic idea of emulating
a hub in QEMU is still good: It's useful for bundling up the output of
multiple NICs into one single l2tp netdev for example.

Now to be able to use the hubport feature without 'vlan's, there is one
missing piece: The possibility to connect a hubport to a netdev, too.
This patch adds this possibility by introducing a new "netdev=..."
parameter to the hubports.

To bundle up the output of multiple NICs into one socket netdev, you can
now run QEMU with these parameters for example:

qemu-system-ppc64 ... -netdev socket,id=s1,connect=:11122 \
    -netdev hubport,hubid=1,id=h1,netdev=s1 \
    -netdev hubport,hubid=1,id=h2 -device e1000,netdev=h2 \
    -netdev hubport,hubid=1,id=h3 -device virtio-net-pci,netdev=h3

For using the socket netdev, you have got to start another QEMU as the
receiving side first, for example with network dumping enabled:

qemu-system-x86_64 -M isapc -netdev socket,id=s0,listen=:11122 \
    -device ne2k_isa,netdev=s0 \
    -object filter-dump,id=f1,netdev=s0,file=/tmp/dump.dat

After the ppc64 guest tried to boot from both NICs, you can see in the
dump file (using Wireshark, for example), that the output of both NICs
(the e1000 and the virtio-net-pci) has been successfully transferred
via the socket netdev in this case.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 net/hub.c       | 27 +++++++++++++++++++++------
 net/hub.h       |  3 ++-
 net/net.c       |  2 +-
 qapi/net.json   |  4 +++-
 qemu-options.hx |  8 +++++---
 5 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/net/hub.c b/net/hub.c
index XXXXXXX..XXXXXXX 100644
--- a/net/hub.c
+++ b/net/hub.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
+#include "qapi/error.h"
 #include "monitor/monitor.h"
 #include "net/net.h"
 #include "clients.h"
@@ -XXX,XX +XXX,XX @@ static NetClientInfo net_hub_port_info = {
     .cleanup = net_hub_port_cleanup,
 };
 
-static NetHubPort *net_hub_port_new(NetHub *hub, const char *name)
+static NetHubPort *net_hub_port_new(NetHub *hub, const char *name,
+                                    NetClientState *hubpeer)
 {
     NetClientState *nc;
     NetHubPort *port;
@@ -XXX,XX +XXX,XX @@ static NetHubPort *net_hub_port_new(NetHub *hub, const char *name)
         name = default_name;
     }
 
-    nc = qemu_new_net_client(&net_hub_port_info, NULL, "hub", name);
+    nc = qemu_new_net_client(&net_hub_port_info, hubpeer, "hub", name);
     port = DO_UPCAST(NetHubPort, nc, nc);
     port->id = id;
     port->hub = hub;
@@ -XXX,XX +XXX,XX @@ static NetHubPort *net_hub_port_new(NetHub *hub, const char *name)
 
 /**
  * Create a port on a given hub
+ * @hub_id: Number of the hub
  * @name: Net client name or NULL for default name.
+ * @hubpeer: Peer to use (if "netdev=id" has been specified)
  *
  * If there is no existing hub with the given id then a new hub is created.
  */
-NetClientState *net_hub_add_port(int hub_id, const char *name)
+NetClientState *net_hub_add_port(int hub_id, const char *name,
+                                 NetClientState *hubpeer)
 {
     NetHub *hub;
     NetHubPort *port;
@@ -XXX,XX +XXX,XX @@ NetClientState *net_hub_add_port(int hub_id, const char *name)
         hub = net_hub_new(hub_id);
     }
 
-    port = net_hub_port_new(hub, name);
+    port = net_hub_port_new(hub, name, hubpeer);
     return &port->nc;
 }
 
@@ -XXX,XX +XXX,XX @@ NetClientState *net_hub_port_find(int hub_id)
         }
     }
 
-    nc = net_hub_add_port(hub_id, NULL);
+    nc = net_hub_add_port(hub_id, NULL, NULL);
     return nc;
 }
 
@@ -XXX,XX +XXX,XX @@ int net_init_hubport(const Netdev *netdev, const char *name,
                      NetClientState *peer, Error **errp)
 {
     const NetdevHubPortOptions *hubport;
+    NetClientState *hubpeer = NULL;
 
     assert(netdev->type == NET_CLIENT_DRIVER_HUBPORT);
     assert(!peer);
     hubport = &netdev->u.hubport;
 
-    net_hub_add_port(hubport->hubid, name);
+    if (hubport->has_netdev) {
+        hubpeer = qemu_find_netdev(hubport->netdev);
+        if (!hubpeer) {
+            error_setg(errp, "netdev '%s' not found", hubport->netdev);
+            return -1;
+        }
+    }
+
+    net_hub_add_port(hubport->hubid, name, hubpeer);
+
     return 0;
 }
 
diff --git a/net/hub.h b/net/hub.h
index XXXXXXX..XXXXXXX 100644
--- a/net/hub.h
+++ b/net/hub.h
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu-common.h"
 
-NetClientState *net_hub_add_port(int hub_id, const char *name);
+NetClientState *net_hub_add_port(int hub_id, const char *name,
+                                 NetClientState *hubpeer);
 NetClientState *net_hub_find_client_by_name(int hub_id, const char *name);
 void net_hub_info(Monitor *mon);
 void net_hub_check_clients(void);
diff --git a/net/net.c b/net/net.c
index XXXXXXX..XXXXXXX 100644
--- a/net/net.c
+++ b/net/net.c
@@ -XXX,XX +XXX,XX @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp)
         /* Do not add to a vlan if it's a nic with a netdev= parameter. */
         if (netdev->type != NET_CLIENT_DRIVER_NIC ||
             !opts->u.nic.has_netdev) {
-            peer = net_hub_add_port(net->has_vlan ? net->vlan : 0, NULL);
+            peer = net_hub_add_port(net->has_vlan ? net->vlan : 0, NULL, NULL);
         }
 
         if (net->has_vlan && !vlan_warned) {
diff --git a/qapi/net.json b/qapi/net.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -XXX,XX +XXX,XX @@
 # Connect two or more net clients through a software hub.
 #
 # @hubid: hub identifier number
+# @netdev: used to connect hub to a netdev instead of a device (since 2.12)
 #
 # Since: 1.2
 ##
 { 'struct': 'NetdevHubPortOptions',
   'data': {
-    'hubid':     'int32' } }
+    'hubid':     'int32',
+    '*netdev':    'str' } }
 
 ##
 # @NetdevNetmapOptions:
diff --git a/qemu-options.hx b/qemu-options.hx
index XXXXXXX..XXXXXXX 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -XXX,XX +XXX,XX @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
 #endif
     "-netdev vhost-user,id=str,chardev=dev[,vhostforce=on|off]\n"
     "                configure a vhost-user network, backed by a chardev 'dev'\n"
-    "-netdev hubport,id=str,hubid=n\n"
+    "-netdev hubport,id=str,hubid=n[,netdev=nd]\n"
     "                configure a hub port on QEMU VLAN 'n'\n", QEMU_ARCH_ALL)
 DEF("net", HAS_ARG, QEMU_OPTION_net,
     "-net nic[,vlan=n][,netdev=nd][,macaddr=mac][,model=type][,name=str][,addr=str][,vectors=v]\n"
@@ -XXX,XX +XXX,XX @@ vde_switch -F -sock /tmp/myswitch
 qemu-system-i386 linux.img -net nic -net vde,sock=/tmp/myswitch
 @end example
 
-@item -netdev hubport,id=@var{id},hubid=@var{hubid}
+@item -netdev hubport,id=@var{id},hubid=@var{hubid}[,netdev=@var{nd}]
 
 Create a hub port on QEMU "vlan" @var{hubid}.
 
 The hubport netdev lets you connect a NIC to a QEMU "vlan" instead of a single
 netdev.  @code{-net} and @code{-device} with parameter @option{vlan} create the
-required hub automatically.
+required hub automatically. Alternatively, you can also connect the hubport
+to another netdev with ID @var{nd} by using the @option{netdev=@var{nd}}
+option.
 
 @item -netdev vhost-user,chardev=@var{id}[,vhostforce=on|off][,queues=n]
 
-- 
2.7.4

From: Thomas Huth <thuth@redhat.com>

It does not make much sense to limit these commands to the legacy 'vlan'
concept only, they should work with the modern netdevs, too. So now
it is possible to use this command with one, two or three parameters.

With one parameter, the command installs a hostfwd rule on the default
"user" network:
    hostfwd_add tcp:...

With two parameters, the command installs a hostfwd rule on a netdev
(that's the new way of using this command):
    hostfwd_add netdev_id tcp:...

With three parameters, the command installs a rule on a 'vlan' (aka hub):
    hostfwd_add hub_id name tcp:...

Same applies to the hostfwd_remove command now.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hmp-commands.hx |  4 ++--
 net/slirp.c     | 33 +++++++++++++++++++++++----------
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index XXXXXXX..XXXXXXX 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -XXX,XX +XXX,XX @@ ETEXI
     {
         .name       = "hostfwd_add",
         .args_type  = "arg1:s,arg2:s?,arg3:s?",
-        .params     = "[vlan_id name] [tcp|udp]:[hostaddr]:hostport-[guestaddr]:guestport",
+        .params     = "[hub_id name]|[netdev_id] [tcp|udp]:[hostaddr]:hostport-[guestaddr]:guestport",
         .help       = "redirect TCP or UDP connections from host to guest (requires -net user)",
         .cmd        = hmp_hostfwd_add,
     },
@@ -XXX,XX +XXX,XX @@ ETEXI
     {
         .name       = "hostfwd_remove",
         .args_type  = "arg1:s,arg2:s?,arg3:s?",
-        .params     = "[vlan_id name] [tcp|udp]:[hostaddr]:hostport",
+        .params     = "[hub_id name]|[netdev_id] [tcp|udp]:[hostaddr]:hostport",
         .help       = "remove host-to-guest TCP or UDP redirection",
         .cmd        = hmp_hostfwd_remove,
     },
diff --git a/net/slirp.c b/net/slirp.c
index XXXXXXX..XXXXXXX 100644
--- a/net/slirp.c
+++ b/net/slirp.c
@@ -XXX,XX +XXX,XX @@ error:
     return -1;
 }
 
-static SlirpState *slirp_lookup(Monitor *mon, const char *vlan,
-                                const char *stack)
+static SlirpState *slirp_lookup(Monitor *mon, const char *hub_id,
+                                const char *name)
 {
-
-    if (vlan) {
+    if (name) {
         NetClientState *nc;
-        nc = net_hub_find_client_by_name(strtol(vlan, NULL, 0), stack);
-        if (!nc) {
-            monitor_printf(mon, "unrecognized (vlan-id, stackname) pair\n");
-            return NULL;
+        if (hub_id) {
+            nc = net_hub_find_client_by_name(strtol(hub_id, NULL, 0), name);
+            if (!nc) {
+                monitor_printf(mon, "unrecognized (vlan-id, stackname) pair\n");
+                return NULL;
+            }
+        } else {
+            nc = qemu_find_netdev(name);
+            if (!nc) {
+                monitor_printf(mon, "unrecognized netdev id '%s'\n", name);
+                return NULL;
+            }
         }
         if (strcmp(nc->model, "user")) {
             monitor_printf(mon, "invalid device specified\n");
@@ -XXX,XX +XXX,XX @@ void hmp_hostfwd_remove(Monitor *mon, const QDict *qdict)
     const char *arg2 = qdict_get_try_str(qdict, "arg2");
     const char *arg3 = qdict_get_try_str(qdict, "arg3");
 
-    if (arg2) {
+    if (arg3) {
         s = slirp_lookup(mon, arg1, arg2);
         src_str = arg3;
+    } else if (arg2) {
+        s = slirp_lookup(mon, NULL, arg1);
+        src_str = arg2;
     } else {
         s = slirp_lookup(mon, NULL, NULL);
         src_str = arg1;
@@ -XXX,XX +XXX,XX @@ void hmp_hostfwd_add(Monitor *mon, const QDict *qdict)
     const char *arg2 = qdict_get_try_str(qdict, "arg2");
     const char *arg3 = qdict_get_try_str(qdict, "arg3");
 
-    if (arg2) {
+    if (arg3) {
         s = slirp_lookup(mon, arg1, arg2);
         redir_str = arg3;
+    } else if (arg2) {
+        s = slirp_lookup(mon, NULL, arg1);
+        redir_str = arg2;
     } else {
         s = slirp_lookup(mon, NULL, NULL);
         redir_str = arg1;
-- 
2.7.4

From: Philippe Mathieu-Daudé <f4bug@amsat.org>

gently asked by his automatic reply :)

Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 MAINTAINERS | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: hw/scsi/mfi.h
 F: tests/megasas-test.c
 
 Network packet abstractions
-M: Dmitry Fleytman <dmitry@daynix.com>
+M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
 S: Maintained
 F: include/net/eth.h
 F: net/eth.c
@@ -XXX,XX +XXX,XX @@ F: hw/net/net_rx_pkt*
 F: hw/net/net_tx_pkt*
 
 Vmware
-M: Dmitry Fleytman <dmitry@daynix.com>
+M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
 S: Maintained
 F: hw/net/vmxnet*
 F: hw/scsi/vmw_pvscsi*
@@ -XXX,XX +XXX,XX @@ F: hw/mem/nvdimm.c
 F: include/hw/mem/nvdimm.h
 
 e1000x
-M: Dmitry Fleytman <dmitry@daynix.com>
+M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
 S: Maintained
 F: hw/net/e1000x*
 
 e1000e
-M: Dmitry Fleytman <dmitry@daynix.com>
+M: Dmitry Fleytman <dmitry.fleytman@gmail.com>
 S: Maintained
 F: hw/net/e1000e*
 
-- 
2.7.4

The following changes since commit d48125de38f48a61d6423ef6a01156d6dff9ee2c:

Merge tag 'kraxel-20220719-pull-request' of https://gitlab.com/kraxel/qemu into staging (2022-07-19 17:40:36 +0100)

are available in the git repository at:

https://github.com/jasowang/qemu.git tags/net-pull-request

for you to fetch changes up to 8bdab83b34efb0b598be4e5b98e4f466ca5f2f80:

net/colo.c: fix segmentation fault when packet is not parsed correctly (2022-07-20 16:58:08 +0800)

----------------------------------------------------------------

Changes since V1:
- Fix build errors of vhost-vdpa when virtio-net is not set

----------------------------------------------------------------
Eugenio Pérez (21):
      vhost: move descriptor translation to vhost_svq_vring_write_descs
      virtio-net: Expose MAC_TABLE_ENTRIES
      virtio-net: Expose ctrl virtqueue logic
      vdpa: Avoid compiler to squash reads to used idx
      vhost: Reorder vhost_svq_kick
      vhost: Move vhost_svq_kick call to vhost_svq_add
      vhost: Check for queue full at vhost_svq_add
      vhost: Decouple vhost_svq_add from VirtQueueElement
      vhost: Add SVQDescState
      vhost: Track number of descs in SVQDescState
      vhost: add vhost_svq_push_elem
      vhost: Expose vhost_svq_add
      vhost: add vhost_svq_poll
      vhost: Add svq avail_handler callback
      vdpa: Export vhost_vdpa_dma_map and unmap calls
      vhost-net-vdpa: add stubs for when no virtio-net device is present
      vdpa: manual forward CVQ buffers
      vdpa: Buffer CVQ support on shadow virtqueue
      vdpa: Extract get features part from vhost_vdpa_get_max_queue_pairs
      vdpa: Add device migration blocker
      vdpa: Add x-svq to NetdevVhostVDPAOptions

Zhang Chen (4):
      softmmu/runstate.c: add RunStateTransition support form COLO to PRELAUNCH
      net/colo: Fix a "double free" crash to clear the conn_list
      net/colo.c: No need to track conn_list for filter-rewriter
      net/colo.c: fix segmentation fault when packet is not parsed correctly

From: Eugenio Pérez <eperezma@redhat.com>

It's done for both in and out descriptors so it's better placed here.

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

From: Eugenio Pérez <eperezma@redhat.com>

vhost-vdpa control virtqueue needs to know the maximum entries supported
by the virtio-net device, so we know if it is possible to apply the
filter.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/virtio-net.c            | 1 -
 include/hw/virtio/virtio-net.h | 3 +++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -XXX,XX +XXX,XX @@
 
 #define VIRTIO_NET_VM_VERSION    11
 
-#define MAC_TABLE_ENTRIES    64
 #define MAX_VLAN    (1 << 12)   /* Per 802.1Q definition */
 
 /* previously fixed value */
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -XXX,XX +XXX,XX @@ OBJECT_DECLARE_SIMPLE_TYPE(VirtIONet, VIRTIO_NET)
  * and latency. */
 #define TX_BURST 256
 
+/* Maximum VIRTIO_NET_CTRL_MAC_TABLE_SET unicast + multicast entries. */
+#define MAC_TABLE_ENTRIES    64
+
 typedef struct virtio_net_conf
 {
     uint32_t txtimer;
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

This allows external vhost-net devices to modify the state of the
VirtIO device model once the vhost-vdpa device has acknowledged the
control commands.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/virtio-net.c            | 84 ++++++++++++++++++++++++------------------
 include/hw/virtio/virtio-net.h |  4 ++
 2 files changed, 53 insertions(+), 35 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -XXX,XX +XXX,XX @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd,
     return VIRTIO_NET_OK;
 }
 
-static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
+size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
+                                  const struct iovec *in_sg, unsigned in_num,
+                                  const struct iovec *out_sg,
+                                  unsigned out_num)
 {
     VirtIONet *n = VIRTIO_NET(vdev);
     struct virtio_net_ctrl_hdr ctrl;
     virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
-    VirtQueueElement *elem;
     size_t s;
     struct iovec *iov, *iov2;
-    unsigned int iov_cnt;
+
+    if (iov_size(in_sg, in_num) < sizeof(status) ||
+        iov_size(out_sg, out_num) < sizeof(ctrl)) {
+        virtio_error(vdev, "virtio-net ctrl missing headers");
+        return 0;
+    }
+
+    iov2 = iov = g_memdup2(out_sg, sizeof(struct iovec) * out_num);
+    s = iov_to_buf(iov, out_num, 0, &ctrl, sizeof(ctrl));
+    iov_discard_front(&iov, &out_num, sizeof(ctrl));
+    if (s != sizeof(ctrl)) {
+        status = VIRTIO_NET_ERR;
+    } else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
+        status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, out_num);
+    } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
+        status = virtio_net_handle_mac(n, ctrl.cmd, iov, out_num);
+    } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
+        status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, out_num);
+    } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
+        status = virtio_net_handle_announce(n, ctrl.cmd, iov, out_num);
+    } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
+        status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
+    } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
+        status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
+    }
+
+    s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
+    assert(s == sizeof(status));
+
+    g_free(iov2);
+    return sizeof(status);
+}
+
+static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtQueueElement *elem;
 
     for (;;) {
+        size_t written;
         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
         if (!elem) {
             break;
         }
-        if (iov_size(elem->in_sg, elem->in_num) < sizeof(status) ||
-            iov_size(elem->out_sg, elem->out_num) < sizeof(ctrl)) {
-            virtio_error(vdev, "virtio-net ctrl missing headers");
+
+        written = virtio_net_handle_ctrl_iov(vdev, elem->in_sg, elem->in_num,
+                                             elem->out_sg, elem->out_num);
+        if (written > 0) {
+            virtqueue_push(vq, elem, written);
+            virtio_notify(vdev, vq);
+            g_free(elem);
+        } else {
             virtqueue_detach_element(vq, elem, 0);
             g_free(elem);
             break;
         }
-
-        iov_cnt = elem->out_num;
-        iov2 = iov = g_memdup2(elem->out_sg,
-                               sizeof(struct iovec) * elem->out_num);
-        s = iov_to_buf(iov, iov_cnt, 0, &ctrl, sizeof(ctrl));
-        iov_discard_front(&iov, &iov_cnt, sizeof(ctrl));
-        if (s != sizeof(ctrl)) {
-            status = VIRTIO_NET_ERR;
-        } else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
-            status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, iov_cnt);
-        } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
-            status = virtio_net_handle_mac(n, ctrl.cmd, iov, iov_cnt);
-        } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
-            status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, iov_cnt);
-        } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
-            status = virtio_net_handle_announce(n, ctrl.cmd, iov, iov_cnt);
-        } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
-            status = virtio_net_handle_mq(n, ctrl.cmd, iov, iov_cnt);
-        } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
-            status = virtio_net_handle_offloads(n, ctrl.cmd, iov, iov_cnt);
-        }
-
-        s = iov_from_buf(elem->in_sg, elem->in_num, 0, &status, sizeof(status));
-        assert(s == sizeof(status));
-
-        virtqueue_push(vq, elem, sizeof(status));
-        virtio_notify(vdev, vq);
-        g_free(iov2);
-        g_free(elem);
     }
 }
 
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -XXX,XX +XXX,XX @@ struct VirtIONet {
     struct EBPFRSSContext ebpf_rss;
 };
 
+size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
+                                  const struct iovec *in_sg, unsigned in_num,
+                                  const struct iovec *out_sg,
+                                  unsigned out_num);
 void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
                                    const char *type);
 
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

In the next patch we will allow busypolling of this value. The compiler
has a running path where shadow_used_idx, last_used_idx, and vring used
idx are not modified within the same thread that is busypolling.

This was not an issue before since we always cleared device event
notifier before checking it, and that could act as memory barrier.
However, the busypoll needs something similar to kernel READ_ONCE.

Let's add it here, separated from the polling.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

From: Eugenio Pérez <eperezma@redhat.com>

Future code needs to call it from vhost_svq_add.

No functional change intended.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

From: Eugenio Pérez <eperezma@redhat.com>

The series needs to expose vhost_svq_add with full functionality,
including kick

From: Eugenio Pérez <eperezma@redhat.com>

The series needs to expose vhost_svq_add with full functionality,
including checking for full queue.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 59 +++++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 26 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
  * Add an element to a SVQ.
  *
  * The caller must check that there is enough slots for the new element. It
- * takes ownership of the element: In case of failure, it is free and the SVQ
- * is considered broken.
+ * takes ownership of the element: In case of failure not ENOSPC, it is free.
+ *
+ * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full
  */
-static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+static int vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
 {
     unsigned qemu_head;
-    bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
+    unsigned ndescs = elem->in_num + elem->out_num;
+    bool ok;
+
+    if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
+        return -ENOSPC;
+    }
+
+    ok = vhost_svq_add_split(svq, elem, &qemu_head);
     if (unlikely(!ok)) {
         g_free(elem);
-        return false;
+        return -EINVAL;
     }
 
     svq->ring_id_maps[qemu_head] = elem;
     vhost_svq_kick(svq);
-    return true;
+    return 0;
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
 
         while (true) {
             VirtQueueElement *elem;
-            bool ok;
+            int r;
 
             if (svq->next_guest_avail_elem) {
                 elem = g_steal_pointer(&svq->next_guest_avail_elem);
@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
                 break;
             }
 
-            if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) {
-                /*
-                 * This condition is possible since a contiguous buffer in GPA
-                 * does not imply a contiguous buffer in qemu's VA
-                 * scatter-gather segments. If that happens, the buffer exposed
-                 * to the device needs to be a chain of descriptors at this
-                 * moment.
-                 *
-                 * SVQ cannot hold more available buffers if we are here:
-                 * queue the current guest descriptor and ignore further kicks
-                 * until some elements are used.
-                 */
-                svq->next_guest_avail_elem = elem;
-                return;
-            }
-
-            ok = vhost_svq_add(svq, elem);
-            if (unlikely(!ok)) {
-                /* VQ is broken, just return and ignore any other kicks */
+            r = vhost_svq_add(svq, elem);
+            if (unlikely(r != 0)) {
+                if (r == -ENOSPC) {
+                    /*
+                     * This condition is possible since a contiguous buffer in
+                     * GPA does not imply a contiguous buffer in qemu's VA
+                     * scatter-gather segments. If that happens, the buffer
+                     * exposed to the device needs to be a chain of descriptors
+                     * at this moment.
+                     *
+                     * SVQ cannot hold more available buffers if we are here:
+                     * queue the current guest descriptor and ignore kicks
+                     * until some elements are used.
+                     */
+                    svq->next_guest_avail_elem = elem;
+                }
+
+                /* VQ is full or broken, just return and ignore kicks */
                 return;
             }
         }
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

VirtQueueElement comes from the guest, but we're heading SVQ to be able
to modify the element presented to the device without the guest's
knowledge.

To do so, make SVQ accept sg buffers directly, instead of using
VirtQueueElement.

Add vhost_svq_add_element to maintain element convenience.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
 }
 
 static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
-                                VirtQueueElement *elem, unsigned *head)
+                                const struct iovec *out_sg, size_t out_num,
+                                const struct iovec *in_sg, size_t in_num,
+                                unsigned *head)
 {
     unsigned avail_idx;
     vring_avail_t *avail = svq->vring.avail;
     bool ok;
-    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num));
+    g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));
 
     *head = svq->free_head;
 
     /* We need some descriptors here */
-    if (unlikely(!elem->out_num && !elem->in_num)) {
+    if (unlikely(!out_num && !in_num)) {
         qemu_log_mask(LOG_GUEST_ERROR,
                       "Guest provided element with no descriptors");
         return false;
     }
 
-    ok = vhost_svq_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
-                                     elem->in_num > 0, false);
+    ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0,
+                                     false);
     if (unlikely(!ok)) {
         return false;
     }
 
-    ok = vhost_svq_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false,
-                                     true);
+    ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true);
     if (unlikely(!ok)) {
         return false;
     }
@@ -XXX,XX +XXX,XX @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
  *
  * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full
  */
-static int vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
+static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
+                          size_t out_num, const struct iovec *in_sg,
+                          size_t in_num, VirtQueueElement *elem)
 {
     unsigned qemu_head;
-    unsigned ndescs = elem->in_num + elem->out_num;
+    unsigned ndescs = in_num + out_num;
     bool ok;
 
     if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
         return -ENOSPC;
     }
 
-    ok = vhost_svq_add_split(svq, elem, &qemu_head);
+    ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head);
     if (unlikely(!ok)) {
         g_free(elem);
         return -EINVAL;
@@ -XXX,XX +XXX,XX @@ static int vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
     return 0;
 }
 
+/* Convenience wrapper to add a guest's element to SVQ */
+static int vhost_svq_add_element(VhostShadowVirtqueue *svq,
+                                 VirtQueueElement *elem)
+{
+    return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg,
+                         elem->in_num, elem);
+}
+
 /**
  * Forward available buffers.
  *
@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
                 break;
             }
 
-            r = vhost_svq_add(svq, elem);
+            r = vhost_svq_add_element(svq, elem);
             if (unlikely(r != 0)) {
                 if (r == -ENOSPC) {
                     /*
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

This will allow SVQ to add context to the different queue elements.

This patch only stores the actual element, no functional change intended.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 16 ++++++++--------
 hw/virtio/vhost-shadow-virtqueue.h |  8 ++++++--
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ static int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
         return -EINVAL;
     }
 
-    svq->ring_id_maps[qemu_head] = elem;
+    svq->desc_state[qemu_head].elem = elem;
     vhost_svq_kick(svq);
     return 0;
 }
@@ -XXX,XX +XXX,XX @@ static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
         return NULL;
     }
 
-    if (unlikely(!svq->ring_id_maps[used_elem.id])) {
+    if (unlikely(!svq->desc_state[used_elem.id].elem)) {
         qemu_log_mask(LOG_GUEST_ERROR,
             "Device %s says index %u is used, but it was not available",
             svq->vdev->name, used_elem.id);
         return NULL;
     }
 
-    num = svq->ring_id_maps[used_elem.id]->in_num +
-          svq->ring_id_maps[used_elem.id]->out_num;
+    num = svq->desc_state[used_elem.id].elem->in_num +
+          svq->desc_state[used_elem.id].elem->out_num;
     last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
     svq->desc_next[last_used_chain] = svq->free_head;
     svq->free_head = used_elem.id;
 
     *len = used_elem.len;
-    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
+    return g_steal_pointer(&svq->desc_state[used_elem.id].elem);
 }
 
 static void vhost_svq_flush(VhostShadowVirtqueue *svq,
@@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
     memset(svq->vring.desc, 0, driver_size);
     svq->vring.used = qemu_memalign(qemu_real_host_page_size(), device_size);
     memset(svq->vring.used, 0, device_size);
-    svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
+    svq->desc_state = g_new0(SVQDescState, svq->vring.num);
     svq->desc_next = g_new0(uint16_t, svq->vring.num);
     for (unsigned i = 0; i < svq->vring.num - 1; i++) {
         svq->desc_next[i] = cpu_to_le16(i + 1);
@@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
 
     for (unsigned i = 0; i < svq->vring.num; ++i) {
         g_autofree VirtQueueElement *elem = NULL;
-        elem = g_steal_pointer(&svq->ring_id_maps[i]);
+        elem = g_steal_pointer(&svq->desc_state[i].elem);
         if (elem) {
             virtqueue_detach_element(svq->vq, elem, 0);
         }
@@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
     }
     svq->vq = NULL;
     g_free(svq->desc_next);
-    g_free(svq->ring_id_maps);
+    g_free(svq->desc_state);
     qemu_vfree(svq->vring.desc);
     qemu_vfree(svq->vring.used);
 }
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
 #include "standard-headers/linux/vhost_types.h"
 #include "hw/virtio/vhost-iova-tree.h"
 
+typedef struct SVQDescState {
+    VirtQueueElement *elem;
+} SVQDescState;
+
 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
     /* Shadow vring */
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
     /* IOVA mapping */
     VhostIOVATree *iova_tree;
 
-    /* Map for use the guest's descriptors */
-    VirtQueueElement **ring_id_maps;
+    /* SVQ vring descriptors state */
+    SVQDescState *desc_state;
 
     /* Next VirtQueue element that guest made available */
     VirtQueueElement *next_guest_avail_elem;
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

A guest's buffer contiguous in GPA may need multiple descriptors in
qemu's VA, so SVQ should track its length separately.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 4 ++--
 hw/virtio/vhost-shadow-virtqueue.h | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

From: Eugenio Pérez <eperezma@redhat.com>

This function allows external SVQ users to return guest's available
buffers.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 16 ++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  3 +++
 2 files changed, 19 insertions(+)

From: Eugenio Pérez <eperezma@redhat.com>

This allows external parts of SVQ to forward custom buffers to the
device.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 6 +++---
 hw/virtio/vhost-shadow-virtqueue.h | 3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

From: Eugenio Pérez <eperezma@redhat.com>

It allows the Shadow Control VirtQueue to wait for the device to use the
available buffers.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 27 +++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  1 +
 2 files changed, 28 insertions(+)

From: Eugenio Pérez <eperezma@redhat.com>

This allows external handlers to be aware of new buffers that the guest
places in the virtqueue.

When this callback is defined the ownership of the guest's virtqueue
element is transferred to the callback. This means that if the user
wants to forward the descriptor it needs to manually inject it. The
callback is also free to process the command by itself and use the
element with svq_push.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 14 ++++++++++++--
 hw/virtio/vhost-shadow-virtqueue.h | 31 ++++++++++++++++++++++++++++++-
 hw/virtio/vhost-vdpa.c             |  3 ++-
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
                 break;
             }
 
-            r = vhost_svq_add_element(svq, elem);
+            if (svq->ops) {
+                r = svq->ops->avail_handler(svq, elem, svq->ops_opaque);
+            } else {
+                r = vhost_svq_add_element(svq, elem);
+            }
             if (unlikely(r != 0)) {
                 if (r == -ENOSPC) {
                     /*
@@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
  * shadow methods and file descriptors.
  *
  * @iova_tree: Tree to perform descriptors translations
+ * @ops: SVQ owner callbacks
+ * @ops_opaque: ops opaque pointer
  *
  * Returns the new virtqueue or NULL.
  *
  * In case of error, reason is reported through error_report.
  */
-VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
+                                    const VhostShadowVirtqueueOps *ops,
+                                    void *ops_opaque)
 {
     g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
     int r;
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
     event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
     event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
     svq->iova_tree = iova_tree;
+    svq->ops = ops;
+    svq->ops_opaque = ops_opaque;
     return g_steal_pointer(&svq);
 
 err_init_hdev_call:
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct SVQDescState {
     unsigned int ndescs;
 } SVQDescState;
 
+typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
+
+/**
+ * Callback to handle an avail buffer.
+ *
+ * @svq:  Shadow virtqueue
+ * @elem:  Element placed in the queue by the guest
+ * @vq_callback_opaque:  Opaque
+ *
+ * Returns 0 if the vq is running as expected.
+ *
+ * Note that ownership of elem is transferred to the callback.
+ */
+typedef int (*VirtQueueAvailCallback)(VhostShadowVirtqueue *svq,
+                                      VirtQueueElement *elem,
+                                      void *vq_callback_opaque);
+
+typedef struct VhostShadowVirtqueueOps {
+    VirtQueueAvailCallback avail_handler;
+} VhostShadowVirtqueueOps;
+
 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
     /* Shadow vring */
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
      */
     uint16_t *desc_next;
 
+    /* Caller callbacks */
+    const VhostShadowVirtqueueOps *ops;
+
+    /* Caller callbacks opaque */
+    void *ops_opaque;
+
     /* Next head to expose to the device */
     uint16_t shadow_avail_idx;
 
@@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                      VirtQueue *vq);
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
-VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
+                                    const VhostShadowVirtqueueOps *ops,
+                                    void *ops_opaque);
 
 void vhost_svq_free(gpointer vq);
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
 
     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
     for (unsigned n = 0; n < hdev->nvqs; ++n) {
-        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
+        g_autoptr(VhostShadowVirtqueue) svq;
 
+        svq = vhost_svq_new(v->iova_tree, NULL, NULL);
         if (unlikely(!svq)) {
             error_setg(errp, "Cannot create svq %u", n);
             return -1;
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

Shadow CVQ will copy buffers on qemu VA, so we avoid TOCTOU attacks from
the guest that could set a different state in qemu device model and vdpa
device.

To do so, it needs to be able to map these new buffers to the device.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-vdpa.c         | 7 +++----
 include/hw/virtio/vhost-vdpa.h | 4 ++++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
     return false;
 }
 
-static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
-                              void *vaddr, bool readonly)
+int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
+                       void *vaddr, bool readonly)
 {
     struct vhost_msg_v2 msg = {};
     int fd = v->device_fd;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
     return ret;
 }
 
-static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,
-                                hwaddr size)
+int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
 {
     struct vhost_msg_v2 msg = {};
     int fd = v->device_fd;
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
     VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
 } VhostVDPA;
 
+int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
+                       void *vaddr, bool readonly);
+int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size);
+
 #endif
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

net/vhost-vdpa.c will need functions that are declared in
vhost-shadow-virtqueue.c, that needs functions of virtio-net.c.

Copy the vhost-vdpa-stub.c code so
only the constructor net_init_vhost_vdpa needs to be defined.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 net/meson.build       |  3 ++-
 net/vhost-vdpa-stub.c | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 net/vhost-vdpa-stub.c

diff --git a/net/meson.build b/net/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/net/meson.build
+++ b/net/meson.build
@@ -XXX,XX +XXX,XX @@ endif
 softmmu_ss.add(when: 'CONFIG_POSIX', if_true: files(tap_posix))
 softmmu_ss.add(when: 'CONFIG_WIN32', if_true: files('tap-win32.c'))
 if have_vhost_net_vdpa
-  softmmu_ss.add(files('vhost-vdpa.c'))
+  softmmu_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-vdpa.c'), if_false: files('vhost-vdpa-stub.c'))
+  softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-vdpa-stub.c'))
 endif
 
 vmnet_files = files(
diff --git a/net/vhost-vdpa-stub.c b/net/vhost-vdpa-stub.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/net/vhost-vdpa-stub.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * vhost-vdpa-stub.c
+ *
+ * Copyright (c) 2022 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "clients.h"
+#include "net/vhost-vdpa.h"
+#include "qapi/error.h"
+
+int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
+                        NetClientState *peer, Error **errp)
+{
+    error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
+    return -1;
+}
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

Do a simple forwarding of CVQ buffers, the same work SVQ could do but
through callbacks. No functional change intended.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-vdpa.c         |  3 ++-
 include/hw/virtio/vhost-vdpa.h |  3 +++
 net/vhost-vdpa.c               | 58 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
     for (unsigned n = 0; n < hdev->nvqs; ++n) {
         g_autoptr(VhostShadowVirtqueue) svq;
 
-        svq = vhost_svq_new(v->iova_tree, NULL, NULL);
+        svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops,
+                            v->shadow_vq_ops_opaque);
         if (unlikely(!svq)) {
             error_setg(errp, "Cannot create svq %u", n);
             return -1;
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@
 #include <gmodule.h>
 
 #include "hw/virtio/vhost-iova-tree.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"
 
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
     /* IOVA mapping used by the Shadow Virtqueue */
     VhostIOVATree *iova_tree;
     GPtrArray *shadow_vqs;
+    const VhostShadowVirtqueueOps *shadow_vq_ops;
+    void *shadow_vq_ops_opaque;
     struct vhost_dev *dev;
     VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
 } VhostVDPA;
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "clients.h"
+#include "hw/virtio/virtio-net.h"
 #include "net/vhost_net.h"
 #include "net/vhost-vdpa.h"
 #include "hw/virtio/vhost-vdpa.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
+#include "qemu/log.h"
+#include "qemu/memalign.h"
 #include "qemu/option.h"
 #include "qapi/error.h"
 #include <linux/vhost.h>
@@ -XXX,XX +XXX,XX @@ static NetClientInfo net_vhost_vdpa_info = {
         .check_peer_type = vhost_vdpa_check_peer_type,
 };
 
+/**
+ * Forward buffer for the moment.
+ */
+static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
+                                            VirtQueueElement *elem,
+                                            void *opaque)
+{
+    unsigned int n = elem->out_num + elem->in_num;
+    g_autofree struct iovec *dev_buffers = g_new(struct iovec, n);
+    size_t in_len, dev_written;
+    virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
+    int r;
+
+    memcpy(dev_buffers, elem->out_sg, elem->out_num);
+    memcpy(dev_buffers + elem->out_num, elem->in_sg, elem->in_num);
+
+    r = vhost_svq_add(svq, &dev_buffers[0], elem->out_num, &dev_buffers[1],
+                      elem->in_num, elem);
+    if (unlikely(r != 0)) {
+        if (unlikely(r == -ENOSPC)) {
+            qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
+                          __func__);
+        }
+        goto out;
+    }
+
+    /*
+     * We can poll here since we've had BQL from the time we sent the
+     * descriptor. Also, we need to take the answer before SVQ pulls by itself,
+     * when BQL is released
+     */
+    dev_written = vhost_svq_poll(svq);
+    if (unlikely(dev_written < sizeof(status))) {
+        error_report("Insufficient written data (%zu)", dev_written);
+    }
+
+out:
+    in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
+                          sizeof(status));
+    if (unlikely(in_len < sizeof(status))) {
+        error_report("Bad device CVQ written length");
+    }
+    vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
+    g_free(elem);
+    return r;
+}
+
+static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
+    .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
+};
+
 static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                            const char *device,
                                            const char *name,
@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
 
     s->vhost_vdpa.device_fd = vdpa_device_fd;
     s->vhost_vdpa.index = queue_pair_index;
+    if (!is_datapath) {
+        s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
+        s->vhost_vdpa.shadow_vq_ops_opaque = s;
+    }
     ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
     if (ret) {
         qemu_del_net_client(nc);
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

Introduce the control virtqueue support for vDPA shadow virtqueue. This
is needed for advanced networking features like rx filtering.

Virtio-net control VQ copies the descriptors to qemu's VA, so we avoid
TOCTOU with the guest's or device's memory every time there is a device
model change.  Otherwise, the guest could change the memory content
between the time qemu reads it and the time the device reads it.

To demonstrate command handling, VIRTIO_NET_F_CTRL_MAC_ADDR is
implemented.  If the virtio-net driver changes the MAC, the virtio-net
device model will be updated with the new one, and an rx filtering change
event will be raised.

More cvq commands could be added here straightforwardly but they have
not been tested.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 net/vhost-vdpa.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 205 insertions(+), 8 deletions(-)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ typedef struct VhostVDPAState {
     NetClientState nc;
     struct vhost_vdpa vhost_vdpa;
     VHostNetState *vhost_net;
+
+    /* Control commands shadow buffers */
+    void *cvq_cmd_out_buffer, *cvq_cmd_in_buffer;
     bool started;
 } VhostVDPAState;
 
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_cleanup(NetClientState *nc)
 {
     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
 
+    qemu_vfree(s->cvq_cmd_out_buffer);
+    qemu_vfree(s->cvq_cmd_in_buffer);
     if (s->vhost_net) {
         vhost_net_cleanup(s->vhost_net);
         g_free(s->vhost_net);
@@ -XXX,XX +XXX,XX @@ static NetClientInfo net_vhost_vdpa_info = {
         .check_peer_type = vhost_vdpa_check_peer_type,
 };
 
+static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
+{
+    VhostIOVATree *tree = v->iova_tree;
+    DMAMap needle = {
+        /*
+         * No need to specify size or to look for more translations since
+         * this contiguous chunk was allocated by us.
+         */
+        .translated_addr = (hwaddr)(uintptr_t)addr,
+    };
+    const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
+    int r;
+
+    if (unlikely(!map)) {
+        error_report("Cannot locate expected map");
+        return;
+    }
+
+    r = vhost_vdpa_dma_unmap(v, map->iova, map->size + 1);
+    if (unlikely(r != 0)) {
+        error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
+    }
+
+    vhost_iova_tree_remove(tree, map);
+}
+
+static size_t vhost_vdpa_net_cvq_cmd_len(void)
+{
+    /*
+     * Upper bound, in bytes, used to size the control command copy.
+     *
+     * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
+     * In buffer is always 1 byte, so it should fit here
+     */
+    return sizeof(struct virtio_net_ctrl_hdr) +
+           2 * sizeof(struct virtio_net_ctrl_mac) +
+           MAC_TABLE_ENTRIES * ETH_ALEN;
+}
+
+static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
+{
+    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
+}
+
+/**
+ * Copy a guest buffer into @buf and map @buf to the device.
+ *
+ * @v: vhost-vdpa instance whose IOVA tree and device receive the mapping
+ * @out_data: Scatter-gather list to copy from
+ * @out_num: Number of entries in @out_data
+ * @data_len: Maximum number of bytes to copy into @buf; 0 is rejected
+ * @buf: Destination buffer, also the address that gets mapped
+ * @written: Set to the number of bytes iov_to_buf() copied into @buf.
+ *           Left untouched when @data_len is 0.
+ * @write: true maps @buf as IOMMU_RW, false as IOMMU_RO
+ *
+ * Returns true on success. If the device mapping fails, the IOVA tree entry
+ * allocated here is removed again before returning false.
+ */
+static bool vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v,
+                                   const struct iovec *out_data,
+                                   size_t out_num, size_t data_len, void *buf,
+                                   size_t *written, bool write)
+{
+    DMAMap map = {};
+    int r;
+
+    if (unlikely(!data_len)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid length of %s buffer\n",
+                      __func__, write ? "in" : "out");
+        return false;
+    }
+
+    *written = iov_to_buf(out_data, out_num, 0, buf, data_len);
+    map.translated_addr = (hwaddr)(uintptr_t)buf;
+    /*
+     * The tree entry stores the length minus one, while the device map
+     * below is given the full page-rounded length.
+     */
+    map.size = vhost_vdpa_net_cvq_cmd_page_len() - 1;
+    map.perm = write ? IOMMU_RW : IOMMU_RO;
+    r = vhost_iova_tree_map_alloc(v->iova_tree, &map);
+    if (unlikely(r != IOVA_OK)) {
+        error_report("Cannot map injected element");
+        return false;
+    }
+
+    r = vhost_vdpa_dma_map(v, map.iova, vhost_vdpa_net_cvq_cmd_page_len(), buf,
+                           !write);
+    if (unlikely(r < 0)) {
+        goto dma_map_err;
+    }
+
+    return true;
+
+dma_map_err:
+    /* Roll back the IOVA allocation so the tree does not keep a stale entry */
+    vhost_iova_tree_remove(v->iova_tree, &map);
+    return false;
+}
+
 /**
- * Forward buffer for the moment.
+ * Copy the guest element into a dedicated buffer suitable to be sent to NIC
+ *
+ * @iov: [0] is the out buffer, [1] is the in one
+ */
+static bool vhost_vdpa_net_cvq_map_elem(VhostVDPAState *s,
+                                        VirtQueueElement *elem,
+                                        struct iovec *iov)
+{
+    size_t in_copied;
+    bool ok;
+
+    iov[0].iov_base = s->cvq_cmd_out_buffer;
+    ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, elem->out_sg, elem->out_num,
+                                vhost_vdpa_net_cvq_cmd_len(), iov[0].iov_base,
+                                &iov[0].iov_len, false);
+    if (unlikely(!ok)) {
+        return false;
+    }
+
+    iov[1].iov_base = s->cvq_cmd_in_buffer;
+    ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, NULL, 0,
+                                sizeof(virtio_net_ctrl_ack), iov[1].iov_base,
+                                &in_copied, true);
+    if (unlikely(!ok)) {
+        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
+        return false;
+    }
+
+    iov[1].iov_len = sizeof(virtio_net_ctrl_ack);
+    return true;
+}
+
+/**
+ * Do not forward commands not supported by SVQ. Otherwise, the device could
+ * accept it and qemu would not know how to update the device model.
+ *
+ * @out: Scatter-gather list holding the command; a
+ *       struct virtio_net_ctrl_hdr is read from its start
+ * @out_num: Number of entries in @out
+ *
+ * Returns true only for class VIRTIO_NET_CTRL_MAC with command
+ * VIRTIO_NET_CTRL_MAC_ADDR_SET. A truncated header, or any other class or
+ * command, is logged as a guest error and rejected.
+ */
+static bool vhost_vdpa_net_cvq_validate_cmd(const struct iovec *out,
+                                            size_t out_num)
+{
+    struct virtio_net_ctrl_hdr ctrl;
+    size_t n;
+
+    n = iov_to_buf(out, out_num, 0, &ctrl, sizeof(ctrl));
+    if (unlikely(n < sizeof(ctrl))) {
+        qemu_log_mask(LOG_GUEST_ERROR,
+                      "%s: invalid length of out buffer %zu\n", __func__, n);
+        return false;
+    }
+
+    switch (ctrl.class) {
+    case VIRTIO_NET_CTRL_MAC:
+        switch (ctrl.cmd) {
+        case VIRTIO_NET_CTRL_MAC_ADDR_SET:
+            return true;
+        default:
+            qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid mac cmd %u\n",
+                          __func__, ctrl.cmd);
+            break;
+        }
+        break;
+    default:
+        qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid control class %u\n",
+                      __func__, ctrl.class);
+        break;
+    }
+
+    return false;
+}
+
+/**
+ * Validate and copy control virtqueue commands.
+ *
+ * Following QEMU guidelines, we offer a copy of the buffers to the device to
+ * prevent TOCTOU bugs.
  */
 static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
                                             VirtQueueElement *elem,
                                             void *opaque)
 {
-    unsigned int n = elem->out_num + elem->in_num;
-    g_autofree struct iovec *dev_buffers = g_new(struct iovec, n);
+    VhostVDPAState *s = opaque;
     size_t in_len, dev_written;
     virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
-    int r;
+    /* out and in buffers sent to the device */
+    struct iovec dev_buffers[2] = {
+        { .iov_base = s->cvq_cmd_out_buffer },
+        { .iov_base = s->cvq_cmd_in_buffer },
+    };
+    /* in buffer used for device model */
+    const struct iovec in = {
+        .iov_base = &status,
+        .iov_len = sizeof(status),
+    };
+    int r = -EINVAL;
+    bool ok;
+
+    ok = vhost_vdpa_net_cvq_map_elem(s, elem, dev_buffers);
+    if (unlikely(!ok)) {
+        goto out;
+    }
 
-    memcpy(dev_buffers, elem->out_sg, elem->out_num);
-    memcpy(dev_buffers + elem->out_num, elem->in_sg, elem->in_num);
+    ok = vhost_vdpa_net_cvq_validate_cmd(&dev_buffers[0], 1);
+    if (unlikely(!ok)) {
+        goto out;
+    }
 
-    r = vhost_svq_add(svq, &dev_buffers[0], elem->out_num, &dev_buffers[1],
-                      elem->in_num, elem);
+    r = vhost_svq_add(svq, &dev_buffers[0], 1, &dev_buffers[1], 1, elem);
     if (unlikely(r != 0)) {
         if (unlikely(r == -ENOSPC)) {
             qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
     dev_written = vhost_svq_poll(svq);
     if (unlikely(dev_written < sizeof(status))) {
         error_report("Insufficient written data (%zu)", dev_written);
+        goto out;
+    }
+
+    memcpy(&status, dev_buffers[1].iov_base, sizeof(status));
+    if (status != VIRTIO_NET_OK) {
+        goto out;
+    }
+
+    status = VIRTIO_NET_ERR;
+    virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, dev_buffers, 1);
+    if (status != VIRTIO_NET_OK) {
+        error_report("Bad CVQ processing in model");
     }
 
 out:
@@ -XXX,XX +XXX,XX @@ out:
     }
     vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
     g_free(elem);
+    if (dev_buffers[0].iov_base) {
+        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[0].iov_base);
+    }
+    if (dev_buffers[1].iov_base) {
+        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[1].iov_base);
+    }
     return r;
 }
 
@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
     s->vhost_vdpa.device_fd = vdpa_device_fd;
     s->vhost_vdpa.index = queue_pair_index;
     if (!is_datapath) {
+        s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
+                                            vhost_vdpa_net_cvq_cmd_page_len());
+        memset(s->cvq_cmd_out_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
+        s->cvq_cmd_in_buffer = qemu_memalign(qemu_real_host_page_size(),
+                                            vhost_vdpa_net_cvq_cmd_page_len());
+        memset(s->cvq_cmd_in_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
+
         s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
         s->vhost_vdpa.shadow_vq_ops_opaque = s;
     }
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

Knowing the device features is needed for CVQ SVQ, so SVQ knows whether
it can handle all commands or not. Extract the feature query from
vhost_vdpa_get_max_queue_pairs so we can reuse it.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 net/vhost-vdpa.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
     return nc;
 }
 
-static int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp)
+static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
+{
+    int ret = ioctl(fd, VHOST_GET_FEATURES, features);
+    if (unlikely(ret < 0)) {
+        error_setg_errno(errp, errno,
+                         "Fail to query features from vhost-vDPA device");
+    }
+    return ret;
+}
+
+static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
+                                          int *has_cvq, Error **errp)
 {
     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
     g_autofree struct vhost_vdpa_config *config = NULL;
     __virtio16 *max_queue_pairs;
-    uint64_t features;
     int ret;
 
-    ret = ioctl(fd, VHOST_GET_FEATURES, &features);
-    if (ret) {
-        error_setg(errp, "Fail to query features from vhost-vDPA device");
-        return ret;
-    }
-
     if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
         *has_cvq = 1;
     } else {
@@ -XXX,XX +XXX,XX @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
                         NetClientState *peer, Error **errp)
 {
     const NetdevVhostVDPAOptions *opts;
+    uint64_t features;
     int vdpa_device_fd;
     g_autofree NetClientState **ncs = NULL;
     NetClientState *nc;
-    int queue_pairs, i, has_cvq = 0;
+    int queue_pairs, r, i, has_cvq = 0;
 
     assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
     opts = &netdev->u.vhost_vdpa;
@@ -XXX,XX +XXX,XX @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
         return -errno;
     }
 
-    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd,
+    r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
+    if (unlikely(r < 0)) {
+        return r;
+    }
+
+    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
                                                  &has_cvq, errp);
     if (queue_pairs < 0) {
         qemu_close(vdpa_device_fd);
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

Since the vhost-vdpa device is exposing _F_LOG, add a migration blocker if
it uses CVQ.

However, qemu is able to migrate simple devices with no CVQ as long as
they use SVQ. To allow it, add a placeholder error to vhost_vdpa, and
only add it to vhost_dev when used. The vhost_dev machinery places the
migration blocker if needed.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-vdpa.c         | 15 +++++++++++++++
 include/hw/virtio/vhost-vdpa.h |  1 +
 2 files changed, 16 insertions(+)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/virtio/vhost-shadow-virtqueue.h"
 #include "hw/virtio/vhost-vdpa.h"
 #include "exec/address-spaces.h"
+#include "migration/blocker.h"
 #include "qemu/cutils.h"
 #include "qemu/main-loop.h"
 #include "cpu.h"
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
         return true;
     }
 
+    if (v->migration_blocker) {
+        int r = migrate_add_blocker(v->migration_blocker, &err);
+        if (unlikely(r < 0)) {
+            return false;
+        }
+    }
+
     for (i = 0; i < v->shadow_vqs->len; ++i) {
         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
@@ -XXX,XX +XXX,XX @@ err:
         vhost_svq_stop(svq);
     }
 
+    if (v->migration_blocker) {
+        migrate_del_blocker(v->migration_blocker);
+    }
+
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
         }
     }
 
+    if (v->migration_blocker) {
+        migrate_del_blocker(v->migration_blocker);
+    }
     return true;
 }
 
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
     bool shadow_vqs_enabled;
     /* IOVA mapping used by the Shadow Virtqueue */
     VhostIOVATree *iova_tree;
+    Error *migration_blocker;
     GPtrArray *shadow_vqs;
     const VhostShadowVirtqueueOps *shadow_vq_ops;
     void *shadow_vq_ops_opaque;
-- 
2.7.4

From: Eugenio Pérez <eperezma@redhat.com>

Finally, offer the possibility to enable SVQ from the command line.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 net/vhost-vdpa.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 qapi/net.json    |  9 ++++++-
 2 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ const int vdpa_feature_bits[] = {
     VHOST_INVALID_FEATURE_BIT
 };
 
+/** Supported device specific feature bits with SVQ */
+static const uint64_t vdpa_svq_device_features =
+    BIT_ULL(VIRTIO_NET_F_CSUM) |
+    BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
+    BIT_ULL(VIRTIO_NET_F_MTU) |
+    BIT_ULL(VIRTIO_NET_F_MAC) |
+    BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
+    BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
+    BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
+    BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
+    BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
+    BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
+    BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
+    BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
+    BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
+    BIT_ULL(VIRTIO_NET_F_STATUS) |
+    BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
+    BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
+    BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
+    BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
+    BIT_ULL(VIRTIO_NET_F_STANDBY);
+
 VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
 {
     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
@@ -XXX,XX +XXX,XX @@ err_init:
 static void vhost_vdpa_cleanup(NetClientState *nc)
 {
     VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
+    struct vhost_dev *dev = &s->vhost_net->dev;
 
     qemu_vfree(s->cvq_cmd_out_buffer);
     qemu_vfree(s->cvq_cmd_in_buffer);
+    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
+        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
+    }
     if (s->vhost_net) {
         vhost_net_cleanup(s->vhost_net);
         g_free(s->vhost_net);
@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                            int vdpa_device_fd,
                                            int queue_pair_index,
                                            int nvqs,
-                                           bool is_datapath)
+                                           bool is_datapath,
+                                           bool svq,
+                                           VhostIOVATree *iova_tree)
 {
     NetClientState *nc = NULL;
     VhostVDPAState *s;
@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
 
     s->vhost_vdpa.device_fd = vdpa_device_fd;
     s->vhost_vdpa.index = queue_pair_index;
+    s->vhost_vdpa.shadow_vqs_enabled = svq;
+    s->vhost_vdpa.iova_tree = iova_tree;
     if (!is_datapath) {
         s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
                                             vhost_vdpa_net_cvq_cmd_page_len());
@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
 
         s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
         s->vhost_vdpa.shadow_vq_ops_opaque = s;
+        error_setg(&s->vhost_vdpa.migration_blocker,
+                   "Migration disabled: vhost-vdpa uses CVQ.");
     }
     ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
     if (ret) {
@@ -XXX,XX +XXX,XX @@ static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
     return nc;
 }
 
+static int vhost_vdpa_get_iova_range(int fd,
+                                     struct vhost_vdpa_iova_range *iova_range)
+{
+    int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);
+
+    return ret < 0 ? -errno : 0;
+}
+
 static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
 {
     int ret = ioctl(fd, VHOST_GET_FEATURES, features);
@@ -XXX,XX +XXX,XX @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
     uint64_t features;
     int vdpa_device_fd;
     g_autofree NetClientState **ncs = NULL;
+    g_autoptr(VhostIOVATree) iova_tree = NULL;
     NetClientState *nc;
     int queue_pairs, r, i, has_cvq = 0;
 
@@ -XXX,XX +XXX,XX @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
         return queue_pairs;
     }
 
+    if (opts->x_svq) {
+        struct vhost_vdpa_iova_range iova_range;
+
+        uint64_t invalid_dev_features =
+            features & ~vdpa_svq_device_features &
+            /* Transport are all accepted at this point */
+            ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
+                             VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);
+
+        if (invalid_dev_features) {
+            error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
+                       invalid_dev_features);
+            goto err_svq;
+        }
+
+        vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
+        iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
+    }
+
     ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
 
     for (i = 0; i < queue_pairs; i++) {
         ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
-                                     vdpa_device_fd, i, 2, true);
+                                     vdpa_device_fd, i, 2, true, opts->x_svq,
+                                     iova_tree);
         if (!ncs[i])
             goto err;
     }
 
     if (has_cvq) {
         nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
-                                 vdpa_device_fd, i, 1, false);
+                                 vdpa_device_fd, i, 1, false,
+                                 opts->x_svq, iova_tree);
         if (!nc)
             goto err;
     }
 
+    /* iova_tree ownership belongs to last NetClientState */
+    g_steal_pointer(&iova_tree);
     return 0;
 
 err:
@@ -XXX,XX +XXX,XX @@ err:
             qemu_del_net_client(ncs[i]);
         }
     }
+
+err_svq:
     qemu_close(vdpa_device_fd);
 
     return -1;
diff --git a/qapi/net.json b/qapi/net.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -XXX,XX +XXX,XX @@
 # @queues: number of queues to be created for multiqueue vhost-vdpa
 #          (default: 1)
 #
+# @x-svq: Start device with (experimental) shadow virtqueue. (Since 7.1)
+#         (default: false)
+#
+# Features:
+# @unstable: Member @x-svq is experimental.
+#
 # Since: 5.1
 ##
 { 'struct': 'NetdevVhostVDPAOptions',
   'data': {
     '*vhostdev':     'str',
-    '*queues':       'int' } }
+    '*queues':       'int',
+    '*x-svq':        {'type': 'bool', 'features' : [ 'unstable'] } } }
 
 ##
 # @NetdevVmnetHostOptions:
-- 
2.7.4

From: Zhang Chen <chen.zhang@intel.com>

If the checkpoint occurs when the guest finishes restarting
but has not started running, the runstate_set() may reject
the transition from COLO to PRELAUNCH with the crash log:

{"timestamp": {"seconds": 1593484591, "microseconds": 26605},\
"event": "RESET", "data": {"guest": true, "reason": "guest-reset"}}
qemu-system-x86_64: invalid runstate transition: 'colo' -> 'prelaunch'

Long-term testing shows that allowing this transition is safe.

Signed-off-by: Like Xu <like.xu@linux.intel.com>
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 softmmu/runstate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index XXXXXXX..XXXXXXX 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -XXX,XX +XXX,XX @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
 
     { RUN_STATE_COLO, RUN_STATE_RUNNING },
+    { RUN_STATE_COLO, RUN_STATE_PRELAUNCH },
     { RUN_STATE_COLO, RUN_STATE_SHUTDOWN},
 
     { RUN_STATE_RUNNING, RUN_STATE_DEBUG },
-- 
2.7.4

From: Zhang Chen <chen.zhang@intel.com>

We noticed that QEMU may crash when the guest has too many
incoming network connections, with the following log:

15197@1593578622.668573:colo_proxy_main : colo proxy connection hashtable full, clear it
free(): invalid pointer
[1]    15195 abort (core dumped)  qemu-system-x86_64 ....

This is because we create the s->connection_track_table with
g_hash_table_new_full() which is defined as:

GHashTable * g_hash_table_new_full (GHashFunc hash_func,
                       GEqualFunc key_equal_func,
                       GDestroyNotify key_destroy_func,
                       GDestroyNotify value_destroy_func);

The fourth parameter connection_destroy() will be called to free the
memory allocated for all 'Connection' values in the hashtable when
we call g_hash_table_remove_all() in the connection_hashtable_reset().

But both connection_track_table and conn_list refer to the same
conn instance, so the conn is freed a second time when conn_list is
cleared. This patch removes the free action on the hash table side to
avoid freeing the conn twice.

Signed-off-by: Like Xu <like.xu@linux.intel.com>
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 net/colo-compare.c    | 2 +-
 net/filter-rewriter.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index XXXXXXX..XXXXXXX 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -XXX,XX +XXX,XX @@ static void colo_compare_complete(UserCreatable *uc, Error **errp)
     s->connection_track_table = g_hash_table_new_full(connection_key_hash,
                                                       connection_key_equal,
                                                       g_free,
-                                                      connection_destroy);
+                                                      NULL);
 
     colo_compare_iothread(s);
 
diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c
index XXXXXXX..XXXXXXX 100644
--- a/net/filter-rewriter.c
+++ b/net/filter-rewriter.c
@@ -XXX,XX +XXX,XX @@ static void colo_rewriter_setup(NetFilterState *nf, Error **errp)
     s->connection_track_table = g_hash_table_new_full(connection_key_hash,
                                                       connection_key_equal,
                                                       g_free,
-                                                      connection_destroy);
+                                                      NULL);
     s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
 }
 
-- 
2.7.4

From: Zhang Chen <chen.zhang@intel.com>

When COLO uses only one vnet_hdr_support parameter between
filter-redirector and filter-mirror (or colo-compare), COLO will crash
with a segmentation fault. The backtrace is as follows:

Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault.
0x0000555555cb200b in eth_get_l2_hdr_length (p=0x0)
    at /home/tao/project/COLO/colo-qemu/include/net/eth.h:296
296         uint16_t proto = be16_to_cpu(PKT_GET_ETH_HDR(p)->h_proto);
(gdb) bt
0  0x0000555555cb200b in eth_get_l2_hdr_length (p=0x0)
    at /home/tao/project/COLO/colo-qemu/include/net/eth.h:296
1  0x0000555555cb22b4 in parse_packet_early (pkt=0x555556a44840) at
net/colo.c:49
2  0x0000555555cb2b91 in is_tcp_packet (pkt=0x555556a44840) at
net/filter-rewriter.c:63

So a wrong vnet_hdr_len will cause pkt->data to become NULL. Add a check
to raise an error and add a trace event to track vnet_hdr_len.

Signed-off-by: Tao Xu <tao3.xu@intel.com>
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
Reviewed-by: Li Zhijian <lizhijian@fujitsu.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 net/colo.c       | 9 ++++++++-
 net/trace-events | 1 +
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/colo.c b/net/colo.c
index XXXXXXX..XXXXXXX 100644
--- a/net/colo.c
+++ b/net/colo.c
@@ -XXX,XX +XXX,XX @@ int parse_packet_early(Packet *pkt)
     static const uint8_t vlan[] = {0x81, 0x00};
     uint8_t *data = pkt->data + pkt->vnet_hdr_len;
     uint16_t l3_proto;
-    ssize_t l2hdr_len = eth_get_l2_hdr_length(data);
+    ssize_t l2hdr_len;
+
+    if (data == NULL) {
+        trace_colo_proxy_main_vnet_info("This packet is not parsed correctly, "
+                                        "pkt->vnet_hdr_len", pkt->vnet_hdr_len);
+        return 1;
+    }
+    l2hdr_len = eth_get_l2_hdr_length(data);
 
     if (pkt->size < ETH_HLEN + pkt->vnet_hdr_len) {
         trace_colo_proxy_main("pkt->size < ETH_HLEN");
diff --git a/net/trace-events b/net/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/net/trace-events
+++ b/net/trace-events
@@ -XXX,XX +XXX,XX @@ vhost_user_event(const char *chr, int event) "chr: %s got event: %d"
 
 # colo.c
 colo_proxy_main(const char *chr) ": %s"
+colo_proxy_main_vnet_info(const char *sta, int size) ": %s = %d"
 
 # colo-compare.c
 colo_compare_main(const char *chr) ": %s"
-- 
2.7.4