1 | The following changes since commit a73549f99612f758dec0fdea6ae1c30b6c709a0b: | 1 | The following changes since commit 352998df1c53b366413690d95b35f76d0721ebed: |
---|---|---|---|
2 | 2 | ||
3 | Merge remote-tracking branch 'remotes/kraxel/tags/ui-20181012-pull-request' into staging (2018-10-12 16:45:51 +0100) | 3 | Merge tag 'i2c-20220314' of https://github.com/philmd/qemu into staging (2022-03-14 14:39:33 +0000) |
4 | 4 | ||
5 | are available in the git repository at: | 5 | are available in the git repository at: |
6 | 6 | ||
7 | https://github.com/jasowang/qemu.git tags/net-pull-request | 7 | https://github.com/jasowang/qemu.git tags/net-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to a7ec0077c2db445d6bae421963188367d2695bd6: | 9 | for you to fetch changes up to 12a195fa343aae2ead1301ce04727bd0ae25eb15: |
10 | 10 | ||
11 | qemu-options: Fix bad "macaddr" property in the documentation (2018-10-15 16:14:15 +0800) | 11 | vdpa: Expose VHOST_F_LOG_ALL on SVQ (2022-03-15 13:57:44 +0800) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | 14 | ||
15 | Changes since V2: | ||
16 | - fix 32-bit build errors | ||
17 | |||
15 | ---------------------------------------------------------------- | 18 | ---------------------------------------------------------------- |
16 | Jason Wang (4): | 19 | Eugenio Pérez (14): |
17 | ne2000: fix possible out of bound access in ne2000_receive | 20 | vhost: Add VhostShadowVirtqueue |
18 | rtl8139: fix possible out of bound access | 21 | vhost: Add Shadow VirtQueue kick forwarding capabilities |
19 | pcnet: fix possible buffer overflow | 22 | vhost: Add Shadow VirtQueue call forwarding capabilities |
20 | net: ignore packet size greater than INT_MAX | 23 | vhost: Add vhost_svq_valid_features to shadow vq |
24 | virtio: Add vhost_svq_get_vring_addr | ||
25 | vdpa: adapt vhost_ops callbacks to svq | ||
26 | vhost: Shadow virtqueue buffers forwarding | ||
27 | util: Add iova_tree_alloc_map | ||
28 | util: add iova_tree_find_iova | ||
29 | vhost: Add VhostIOVATree | ||
30 | vdpa: Add custom IOTLB translations to SVQ | ||
31 | vdpa: Adapt vhost_vdpa_get_vring_base to SVQ | ||
32 | vdpa: Never set log_base addr if SVQ is enabled | ||
33 | vdpa: Expose VHOST_F_LOG_ALL on SVQ | ||
21 | 34 | ||
22 | Martin Wilck (1): | 35 | Jason Wang (1): |
23 | e1000: indicate dropped packets in HW counters | 36 | virtio-net: fix map leaking on error during receive |
24 | 37 | ||
25 | Thomas Huth (1): | 38 | hw/net/virtio-net.c | 1 + |
26 | qemu-options: Fix bad "macaddr" property in the documentation | 39 | hw/virtio/meson.build | 2 +- |
27 | 40 | hw/virtio/vhost-iova-tree.c | 110 +++++++ | |
28 | Zhang Chen (15): | 41 | hw/virtio/vhost-iova-tree.h | 27 ++ |
29 | filter-rewriter: Add TCP state machine and fix memory leak in connection_track_table | 42 | hw/virtio/vhost-shadow-virtqueue.c | 636 +++++++++++++++++++++++++++++++++++++ |
30 | colo-compare: implement the process of checkpoint | 43 | hw/virtio/vhost-shadow-virtqueue.h | 87 +++++ |
31 | colo-compare: use notifier to notify packets comparing result | 44 | hw/virtio/vhost-vdpa.c | 522 +++++++++++++++++++++++++++++- |
32 | COLO: integrate colo compare with colo frame | 45 | include/hw/virtio/vhost-vdpa.h | 8 + |
33 | COLO: Add block replication into colo process | 46 | include/qemu/iova-tree.h | 38 ++- |
34 | COLO: Remove colo_state migration struct | 47 | util/iova-tree.c | 170 ++++++++++ |
35 | COLO: Load dirty pages into SVM's RAM cache firstly | 48 | 10 files changed, 1584 insertions(+), 17 deletions(-) |
36 | ram/COLO: Record the dirty pages that SVM received | 49 | create mode 100644 hw/virtio/vhost-iova-tree.c |
37 | COLO: Flush memory data from ram cache | 50 | create mode 100644 hw/virtio/vhost-iova-tree.h |
38 | qapi/migration.json: Rename COLO unknown mode to none mode. | 51 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.c |
39 | qapi: Add new command to query colo status | 52 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.h |
40 | savevm: split the process of different stages for loadvm/savevm | ||
41 | filter: Add handle_event method for NetFilterClass | ||
42 | filter-rewriter: handle checkpoint and failover event | ||
43 | docs: Add COLO status diagram to COLO-FT.txt | ||
44 | |||
45 | liujunjie (1): | ||
46 | clean up callback when del virtqueue | ||
47 | |||
48 | zhanghailiang (4): | ||
49 | qmp event: Add COLO_EXIT event to notify users while exited COLO | ||
50 | COLO: flush host dirty ram from cache | ||
51 | COLO: notify net filters about checkpoint/failover event | ||
52 | COLO: quick failover process by kick COLO thread | ||
53 | |||
54 | docs/COLO-FT.txt | 34 ++++++++ | ||
55 | hw/net/e1000.c | 16 +++- | ||
56 | hw/net/ne2000.c | 4 +- | ||
57 | hw/net/pcnet.c | 4 +- | ||
58 | hw/net/rtl8139.c | 8 +- | ||
59 | hw/net/trace-events | 3 + | ||
60 | hw/virtio/virtio.c | 2 + | ||
61 | include/exec/ram_addr.h | 1 + | ||
62 | include/migration/colo.h | 11 ++- | ||
63 | include/net/filter.h | 5 ++ | ||
64 | migration/Makefile.objs | 2 +- | ||
65 | migration/colo-comm.c | 76 ----------------- | ||
66 | migration/colo-failover.c | 2 +- | ||
67 | migration/colo.c | 212 +++++++++++++++++++++++++++++++++++++++++++--- | ||
68 | migration/migration.c | 46 ++++++++-- | ||
69 | migration/ram.c | 166 +++++++++++++++++++++++++++++++++++- | ||
70 | migration/ram.h | 4 + | ||
71 | migration/savevm.c | 53 ++++++++++-- | ||
72 | migration/savevm.h | 5 ++ | ||
73 | migration/trace-events | 3 + | ||
74 | net/colo-compare.c | 115 ++++++++++++++++++++++--- | ||
75 | net/colo-compare.h | 24 ++++++ | ||
76 | net/colo.c | 10 ++- | ||
77 | net/colo.h | 11 +-- | ||
78 | net/filter-rewriter.c | 166 +++++++++++++++++++++++++++++++++--- | ||
79 | net/filter.c | 17 ++++ | ||
80 | net/net.c | 26 +++++- | ||
81 | qapi/migration.json | 80 +++++++++++++++-- | ||
82 | qemu-options.hx | 2 +- | ||
83 | vl.c | 2 - | ||
84 | 30 files changed, 958 insertions(+), 152 deletions(-) | ||
85 | delete mode 100644 migration/colo-comm.c | ||
86 | create mode 100644 net/colo-compare.h | ||
87 | 53 | ||
88 | 54 | ||
89 | 55 | diff view generated by jsdifflib |
1 | From: Thomas Huth <thuth@redhat.com> | 1 | Commit bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg") |
---|---|---|---|
2 | tries to fix the use-after-free of the sg by caching the virtqueue | ||
3 | elements in an array and unmapping them at once after receiving the | ||
4 | packets. But it forgot to unmap the cached elements on error, which | ||
5 | will lead to leaked mappings and other unexpected results. | ||
2 | 6 | ||
3 | When using the "-device" option, the property is called "mac". | 7 | Fix this by detaching the cached elements on error. This addresses |
4 | "macaddr" is only used for the legacy "-net nic" option. | 8 | CVE-2022-26353. |
5 | 9 | ||
6 | Reported-by: Harald Hoyer <harald@redhat.com> | 10 | Reported-by: Victor Tom <vv474172261@gmail.com> |
7 | Reviewed-by: Markus Armbruster <armbru@redhat.com> | 11 | Cc: qemu-stable@nongnu.org |
8 | Signed-off-by: Thomas Huth <thuth@redhat.com> | 12 | Fixes: CVE-2022-26353 |
13 | Fixes: bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg") | ||
14 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
10 | --- | 16 | --- |
11 | qemu-options.hx | 2 +- | 17 | hw/net/virtio-net.c | 1 + |
12 | 1 file changed, 1 insertion(+), 1 deletion(-) | 18 | 1 file changed, 1 insertion(+) |
13 | 19 | ||
14 | diff --git a/qemu-options.hx b/qemu-options.hx | 20 | diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c |
15 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/qemu-options.hx | 22 | --- a/hw/net/virtio-net.c |
17 | +++ b/qemu-options.hx | 23 | +++ b/hw/net/virtio-net.c |
18 | @@ -XXX,XX +XXX,XX @@ qemu-system-i386 linux.img \ | 24 | @@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, |
19 | -netdev socket,id=n2,mcast=230.0.0.1:1234 | 25 | |
20 | # launch yet another QEMU instance on same "bus" | 26 | err: |
21 | qemu-system-i386 linux.img \ | 27 | for (j = 0; j < i; j++) { |
22 | - -device e1000,netdev=n3,macaddr=52:54:00:12:34:58 \ | 28 | + virtqueue_detach_element(q->rx_vq, elems[j], lens[j]); |
23 | + -device e1000,netdev=n3,mac=52:54:00:12:34:58 \ | 29 | g_free(elems[j]); |
24 | -netdev socket,id=n3,mcast=230.0.0.1:1234 | 30 | } |
25 | @end example | ||
26 | 31 | ||
27 | -- | 32 | -- |
28 | 2.5.0 | 33 | 2.7.4 |
29 | |||
30 | diff view generated by jsdifflib |
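
For context on the virtio-net fix above: virtqueue_pop() maps the popped element's guest buffers, and virtqueue_detach_element() is the call that releases those mappings without marking the element as used. The sketch below shows the shape of the receive path after the fix; it is a simplified illustration only (not the real virtio_net_receive_rcu()), reusing the elems[]/lens[] names from the patch, with the copying logic elided.

    /* Simplified sketch, assuming QEMU's virtio internals (hw/net/virtio-net.c). */
    static ssize_t receive_error_path_sketch(VirtIONetQueue *q,
                                             const uint8_t *buf, size_t size)
    {
        VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
        uint32_t lens[VIRTQUEUE_MAX_SIZE];
        size_t offset = 0;
        int i, j;

        for (i = 0; offset < size; i++) {
            /* virtqueue_pop() maps the element's in_sg guest buffers. */
            elems[i] = virtqueue_pop(q->rx_vq, sizeof(VirtQueueElement));
            if (!elems[i]) {
                goto err;                 /* out of rx buffers mid-packet */
            }
            /* ... copy the next chunk of buf into elems[i]->in_sg,
             *     record lens[i] and advance offset ... */
        }

        /* ... success: virtqueue_fill()/virtqueue_flush() the cached elements ... */
        return size;

    err:
        for (j = 0; j < i; j++) {
            /* The detach (added by this fix) releases the element's mappings;
             * calling g_free() alone would leak them. */
            virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
            g_free(elems[j]);
        }
        return -1;
    }
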
1 | From: Zhang Chen <zhangckid@gmail.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | While doing a checkpoint, we need to flush all the unhandled packets. | 3 | Vhost shadow virtqueue (SVQ) is an intermediate jump for virtqueue |
4 | By using the filter notifier mechanism, we can easily notify | 4 | notifications and buffers, allowing qemu to track them. While qemu is |
5 | every compare object to do this process, which runs inside | 5 | forwarding the buffers and virtqueue changes, it is able to commit the |
6 | the compare threads as a coroutine. | 6 | memory that is being dirtied, the same way regular qemu VirtIO devices |
7 | do. | ||
7 | 8 | ||
8 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | 9 | This commit only exposes basic SVQ allocation and free. Next patches of |
9 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | 10 | the series add functionality like notifications and buffers forwarding. |
10 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | 11 | |
12 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
11 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 14 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
12 | --- | 15 | --- |
13 | include/migration/colo.h | 6 ++++ | 16 | hw/virtio/meson.build | 2 +- |
14 | net/colo-compare.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++ | 17 | hw/virtio/vhost-shadow-virtqueue.c | 62 ++++++++++++++++++++++++++++++++++++++ |
15 | net/colo-compare.h | 22 ++++++++++++++ | 18 | hw/virtio/vhost-shadow-virtqueue.h | 28 +++++++++++++++++ |
16 | 3 files changed, 106 insertions(+) | 19 | 3 files changed, 91 insertions(+), 1 deletion(-) |
17 | create mode 100644 net/colo-compare.h | 20 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.c |
21 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.h | ||
18 | 22 | ||
19 | diff --git a/include/migration/colo.h b/include/migration/colo.h | 23 | diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build |
20 | index XXXXXXX..XXXXXXX 100644 | 24 | index XXXXXXX..XXXXXXX 100644 |
21 | --- a/include/migration/colo.h | 25 | --- a/hw/virtio/meson.build |
22 | +++ b/include/migration/colo.h | 26 | +++ b/hw/virtio/meson.build |
23 | @@ -XXX,XX +XXX,XX @@ | 27 | @@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) |
24 | #include "qemu-common.h" | 28 | |
25 | #include "qapi/qapi-types-migration.h" | 29 | virtio_ss = ss.source_set() |
26 | 30 | virtio_ss.add(files('virtio.c')) | |
27 | +enum colo_event { | 31 | -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c')) |
28 | + COLO_EVENT_NONE, | 32 | +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c')) |
29 | + COLO_EVENT_CHECKPOINT, | 33 | virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) |
30 | + COLO_EVENT_FAILOVER, | 34 | virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) |
31 | +}; | 35 | virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) |
32 | + | 36 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
33 | void colo_info_init(void); | ||
34 | |||
35 | void migrate_start_colo_process(MigrationState *s); | ||
36 | diff --git a/net/colo-compare.c b/net/colo-compare.c | ||
37 | index XXXXXXX..XXXXXXX 100644 | ||
38 | --- a/net/colo-compare.c | ||
39 | +++ b/net/colo-compare.c | ||
40 | @@ -XXX,XX +XXX,XX @@ | ||
41 | #include "qemu/sockets.h" | ||
42 | #include "colo.h" | ||
43 | #include "sysemu/iothread.h" | ||
44 | +#include "net/colo-compare.h" | ||
45 | +#include "migration/colo.h" | ||
46 | |||
47 | #define TYPE_COLO_COMPARE "colo-compare" | ||
48 | #define COLO_COMPARE(obj) \ | ||
49 | OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE) | ||
50 | |||
51 | +static QTAILQ_HEAD(, CompareState) net_compares = | ||
52 | + QTAILQ_HEAD_INITIALIZER(net_compares); | ||
53 | + | ||
54 | #define COMPARE_READ_LEN_MAX NET_BUFSIZE | ||
55 | #define MAX_QUEUE_SIZE 1024 | ||
56 | |||
57 | @@ -XXX,XX +XXX,XX @@ | ||
58 | /* TODO: Should be configurable */ | ||
59 | #define REGULAR_PACKET_CHECK_MS 3000 | ||
60 | |||
61 | +static QemuMutex event_mtx; | ||
62 | +static QemuCond event_complete_cond; | ||
63 | +static int event_unhandled_count; | ||
64 | + | ||
65 | /* | ||
66 | * + CompareState ++ | ||
67 | * | | | ||
68 | @@ -XXX,XX +XXX,XX @@ typedef struct CompareState { | ||
69 | IOThread *iothread; | ||
70 | GMainContext *worker_context; | ||
71 | QEMUTimer *packet_check_timer; | ||
72 | + | ||
73 | + QEMUBH *event_bh; | ||
74 | + enum colo_event event; | ||
75 | + | ||
76 | + QTAILQ_ENTRY(CompareState) next; | ||
77 | } CompareState; | ||
78 | |||
79 | typedef struct CompareClass { | ||
80 | @@ -XXX,XX +XXX,XX @@ static void check_old_packet_regular(void *opaque) | ||
81 | REGULAR_PACKET_CHECK_MS); | ||
82 | } | ||
83 | |||
84 | +/* Public API, Used for COLO frame to notify compare event */ | ||
85 | +void colo_notify_compares_event(void *opaque, int event, Error **errp) | ||
86 | +{ | ||
87 | + CompareState *s; | ||
88 | + | ||
89 | + qemu_mutex_lock(&event_mtx); | ||
90 | + QTAILQ_FOREACH(s, &net_compares, next) { | ||
91 | + s->event = event; | ||
92 | + qemu_bh_schedule(s->event_bh); | ||
93 | + event_unhandled_count++; | ||
94 | + } | ||
95 | + /* Wait all compare threads to finish handling this event */ | ||
96 | + while (event_unhandled_count > 0) { | ||
97 | + qemu_cond_wait(&event_complete_cond, &event_mtx); | ||
98 | + } | ||
99 | + | ||
100 | + qemu_mutex_unlock(&event_mtx); | ||
101 | +} | ||
102 | + | ||
103 | static void colo_compare_timer_init(CompareState *s) | ||
104 | { | ||
105 | AioContext *ctx = iothread_get_aio_context(s->iothread); | ||
106 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_timer_del(CompareState *s) | ||
107 | } | ||
108 | } | ||
109 | |||
110 | +static void colo_flush_packets(void *opaque, void *user_data); | ||
111 | + | ||
112 | +static void colo_compare_handle_event(void *opaque) | ||
113 | +{ | ||
114 | + CompareState *s = opaque; | ||
115 | + | ||
116 | + switch (s->event) { | ||
117 | + case COLO_EVENT_CHECKPOINT: | ||
118 | + g_queue_foreach(&s->conn_list, colo_flush_packets, s); | ||
119 | + break; | ||
120 | + case COLO_EVENT_FAILOVER: | ||
121 | + break; | ||
122 | + default: | ||
123 | + break; | ||
124 | + } | ||
125 | + | ||
126 | + assert(event_unhandled_count > 0); | ||
127 | + | ||
128 | + qemu_mutex_lock(&event_mtx); | ||
129 | + event_unhandled_count--; | ||
130 | + qemu_cond_broadcast(&event_complete_cond); | ||
131 | + qemu_mutex_unlock(&event_mtx); | ||
132 | +} | ||
133 | + | ||
134 | static void colo_compare_iothread(CompareState *s) | ||
135 | { | ||
136 | object_ref(OBJECT(s->iothread)); | ||
137 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_iothread(CompareState *s) | ||
138 | s, s->worker_context, true); | ||
139 | |||
140 | colo_compare_timer_init(s); | ||
141 | + s->event_bh = qemu_bh_new(colo_compare_handle_event, s); | ||
142 | } | ||
143 | |||
144 | static char *compare_get_pri_indev(Object *obj, Error **errp) | ||
145 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) | ||
146 | net_socket_rs_init(&s->pri_rs, compare_pri_rs_finalize, s->vnet_hdr); | ||
147 | net_socket_rs_init(&s->sec_rs, compare_sec_rs_finalize, s->vnet_hdr); | ||
148 | |||
149 | + QTAILQ_INSERT_TAIL(&net_compares, s, next); | ||
150 | + | ||
151 | g_queue_init(&s->conn_list); | ||
152 | |||
153 | + qemu_mutex_init(&event_mtx); | ||
154 | + qemu_cond_init(&event_complete_cond); | ||
155 | + | ||
156 | s->connection_track_table = g_hash_table_new_full(connection_key_hash, | ||
157 | connection_key_equal, | ||
158 | g_free, | ||
159 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_init(Object *obj) | ||
160 | static void colo_compare_finalize(Object *obj) | ||
161 | { | ||
162 | CompareState *s = COLO_COMPARE(obj); | ||
163 | + CompareState *tmp = NULL; | ||
164 | |||
165 | qemu_chr_fe_deinit(&s->chr_pri_in, false); | ||
166 | qemu_chr_fe_deinit(&s->chr_sec_in, false); | ||
167 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_finalize(Object *obj) | ||
168 | if (s->iothread) { | ||
169 | colo_compare_timer_del(s); | ||
170 | } | ||
171 | + | ||
172 | + qemu_bh_delete(s->event_bh); | ||
173 | + | ||
174 | + QTAILQ_FOREACH(tmp, &net_compares, next) { | ||
175 | + if (tmp == s) { | ||
176 | + QTAILQ_REMOVE(&net_compares, s, next); | ||
177 | + break; | ||
178 | + } | ||
179 | + } | ||
180 | + | ||
181 | /* Release all unhandled packets after compare thread exited */ | ||
182 | g_queue_foreach(&s->conn_list, colo_flush_packets, s); | ||
183 | |||
184 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_finalize(Object *obj) | ||
185 | if (s->iothread) { | ||
186 | object_unref(OBJECT(s->iothread)); | ||
187 | } | ||
188 | + | ||
189 | + qemu_mutex_destroy(&event_mtx); | ||
190 | + qemu_cond_destroy(&event_complete_cond); | ||
191 | + | ||
192 | g_free(s->pri_indev); | ||
193 | g_free(s->sec_indev); | ||
194 | g_free(s->outdev); | ||
195 | diff --git a/net/colo-compare.h b/net/colo-compare.h | ||
196 | new file mode 100644 | 37 | new file mode 100644 |
197 | index XXXXXXX..XXXXXXX | 38 | index XXXXXXX..XXXXXXX |
198 | --- /dev/null | 39 | --- /dev/null |
199 | +++ b/net/colo-compare.h | 40 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
200 | @@ -XXX,XX +XXX,XX @@ | 41 | @@ -XXX,XX +XXX,XX @@ |
201 | +/* | 42 | +/* |
202 | + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) | 43 | + * vhost shadow virtqueue |
203 | + * (a.k.a. Fault Tolerance or Continuous Replication) | ||
204 | + * | 44 | + * |
205 | + * Copyright (c) 2017 HUAWEI TECHNOLOGIES CO., LTD. | 45 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 |
206 | + * Copyright (c) 2017 FUJITSU LIMITED | 46 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> |
207 | + * Copyright (c) 2017 Intel Corporation | ||
208 | + * | 47 | + * |
209 | + * Authors: | 48 | + * SPDX-License-Identifier: GPL-2.0-or-later |
210 | + * zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
211 | + * Zhang Chen <zhangckid@gmail.com> | ||
212 | + * | ||
213 | + * This work is licensed under the terms of the GNU GPL, version 2 or | ||
214 | + * later. See the COPYING file in the top-level directory. | ||
215 | + */ | 49 | + */ |
216 | + | 50 | + |
217 | +#ifndef QEMU_COLO_COMPARE_H | 51 | +#include "qemu/osdep.h" |
218 | +#define QEMU_COLO_COMPARE_H | 52 | +#include "hw/virtio/vhost-shadow-virtqueue.h" |
219 | + | 53 | + |
220 | +void colo_notify_compares_event(void *opaque, int event, Error **errp); | 54 | +#include "qemu/error-report.h" |
221 | + | 55 | + |
222 | +#endif /* QEMU_COLO_COMPARE_H */ | 56 | +/** |
57 | + * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
58 | + * shadow methods and file descriptors. | ||
59 | + * | ||
60 | + * Returns the new virtqueue or NULL. | ||
61 | + * | ||
62 | + * In case of error, reason is reported through error_report. | ||
63 | + */ | ||
64 | +VhostShadowVirtqueue *vhost_svq_new(void) | ||
65 | +{ | ||
66 | + g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); | ||
67 | + int r; | ||
68 | + | ||
69 | + r = event_notifier_init(&svq->hdev_kick, 0); | ||
70 | + if (r != 0) { | ||
71 | + error_report("Couldn't create kick event notifier: %s (%d)", | ||
72 | + g_strerror(errno), errno); | ||
73 | + goto err_init_hdev_kick; | ||
74 | + } | ||
75 | + | ||
76 | + r = event_notifier_init(&svq->hdev_call, 0); | ||
77 | + if (r != 0) { | ||
78 | + error_report("Couldn't create call event notifier: %s (%d)", | ||
79 | + g_strerror(errno), errno); | ||
80 | + goto err_init_hdev_call; | ||
81 | + } | ||
82 | + | ||
83 | + return g_steal_pointer(&svq); | ||
84 | + | ||
85 | +err_init_hdev_call: | ||
86 | + event_notifier_cleanup(&svq->hdev_kick); | ||
87 | + | ||
88 | +err_init_hdev_kick: | ||
89 | + return NULL; | ||
90 | +} | ||
91 | + | ||
92 | +/** | ||
93 | + * Free the resources of the shadow virtqueue. | ||
94 | + * | ||
95 | + * @pvq: gpointer to SVQ so it can be used by autofree functions. | ||
96 | + */ | ||
97 | +void vhost_svq_free(gpointer pvq) | ||
98 | +{ | ||
99 | + VhostShadowVirtqueue *vq = pvq; | ||
100 | + event_notifier_cleanup(&vq->hdev_kick); | ||
101 | + event_notifier_cleanup(&vq->hdev_call); | ||
102 | + g_free(vq); | ||
103 | +} | ||
104 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
105 | new file mode 100644 | ||
106 | index XXXXXXX..XXXXXXX | ||
107 | --- /dev/null | ||
108 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
109 | @@ -XXX,XX +XXX,XX @@ | ||
110 | +/* | ||
111 | + * vhost shadow virtqueue | ||
112 | + * | ||
113 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 | ||
114 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
115 | + * | ||
116 | + * SPDX-License-Identifier: GPL-2.0-or-later | ||
117 | + */ | ||
118 | + | ||
119 | +#ifndef VHOST_SHADOW_VIRTQUEUE_H | ||
120 | +#define VHOST_SHADOW_VIRTQUEUE_H | ||
121 | + | ||
122 | +#include "qemu/event_notifier.h" | ||
123 | + | ||
124 | +/* Shadow virtqueue to relay notifications */ | ||
125 | +typedef struct VhostShadowVirtqueue { | ||
126 | + /* Shadow kick notifier, sent to vhost */ | ||
127 | + EventNotifier hdev_kick; | ||
128 | + /* Shadow call notifier, sent to vhost */ | ||
129 | + EventNotifier hdev_call; | ||
130 | +} VhostShadowVirtqueue; | ||
131 | + | ||
132 | +VhostShadowVirtqueue *vhost_svq_new(void); | ||
133 | + | ||
134 | +void vhost_svq_free(gpointer vq); | ||
135 | +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); | ||
136 | + | ||
137 | +#endif | ||
223 | -- | 138 | -- |
224 | 2.5.0 | 139 | 2.7.4 |
225 | 140 | ||
226 | 141 | diff view generated by jsdifflib |
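
The vhost-shadow-virtqueue.h header above ends by registering vhost_svq_free() as the g_autoptr() cleanup function for VhostShadowVirtqueue (via G_DEFINE_AUTOPTR_CLEANUP_FUNC), so a caller can allocate an SVQ, bail out on any error path without explicit frees, and only steal the pointer once setup has succeeded. A minimal usage sketch under that assumption; the wrapper function below is hypothetical and not part of the patch:

    #include "hw/virtio/vhost-shadow-virtqueue.h"

    static VhostShadowVirtqueue *create_one_svq_sketch(void)
    {
        /* Freed by vhost_svq_free() automatically on any early return. */
        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();

        if (!svq) {
            /* vhost_svq_new() already printed the reason via error_report(). */
            return NULL;
        }

        /* ... hand svq->hdev_kick / svq->hdev_call over to the vhost device ... */

        /* Success: take the SVQ out of auto-cleanup and give it to the caller. */
        return g_steal_pointer(&svq);
    }

This is the same pattern the vhost-vdpa side uses later in the series, where one SVQ per virtqueue is created and stored in a GPtrArray whose element free function is vhost_svq_free().
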
1 | From: Zhang Chen <zhangckid@gmail.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | For COLO FT, both the PVM and SVM run at the same time, | 3 | In this mode no buffer forwarding is performed by SVQ: Qemu |
4 | and only sync the state when needed. | 4 | will just forward the guest's kicks to the device. |
5 | 5 | ||
6 | So here, let the SVM run while not doing a checkpoint, and change | 6 | Host memory notifier regions are left out for simplicity, and they will |
7 | DEFAULT_MIGRATE_X_CHECKPOINT_DELAY to 200*100. | 7 | not be addressed in this series. |
8 | 8 | ||
9 | Besides, we forgot to release colo_checkpoint_sem and | 9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
10 | colo_delay_timer; fix them here. | 10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
11 | |||
12 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
13 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | ||
14 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
15 | Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> | ||
16 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
17 | --- | 12 | --- |
18 | migration/colo.c | 42 ++++++++++++++++++++++++++++++++++++++++-- | 13 | hw/virtio/vhost-shadow-virtqueue.c | 55 ++++++++++++++ |
19 | migration/migration.c | 6 ++---- | 14 | hw/virtio/vhost-shadow-virtqueue.h | 14 ++++ |
20 | 2 files changed, 42 insertions(+), 6 deletions(-) | 15 | hw/virtio/vhost-vdpa.c | 144 ++++++++++++++++++++++++++++++++++++- |
21 | 16 | include/hw/virtio/vhost-vdpa.h | 4 ++ | |
22 | diff --git a/migration/colo.c b/migration/colo.c | 17 | 4 files changed, 215 insertions(+), 2 deletions(-) |
18 | |||
19 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
23 | index XXXXXXX..XXXXXXX 100644 | 20 | index XXXXXXX..XXXXXXX 100644 |
24 | --- a/migration/colo.c | 21 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
25 | +++ b/migration/colo.c | 22 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
26 | @@ -XXX,XX +XXX,XX @@ | 23 | @@ -XXX,XX +XXX,XX @@ |
24 | #include "hw/virtio/vhost-shadow-virtqueue.h" | ||
25 | |||
27 | #include "qemu/error-report.h" | 26 | #include "qemu/error-report.h" |
28 | #include "migration/failover.h" | 27 | +#include "qemu/main-loop.h" |
29 | #include "replication.h" | 28 | +#include "linux-headers/linux/vhost.h" |
30 | +#include "net/colo-compare.h" | 29 | + |
31 | +#include "net/colo.h" | 30 | +/** |
32 | 31 | + * Forward guest notifications. | |
33 | static bool vmstate_loading; | 32 | + * |
34 | +static Notifier packets_compare_notifier; | 33 | + * @n: guest kick event notifier, the one that guest set to notify svq. |
35 | 34 | + */ | |
36 | #define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024) | 35 | +static void vhost_handle_guest_kick(EventNotifier *n) |
37 | 36 | +{ | |
38 | @@ -XXX,XX +XXX,XX @@ static int colo_do_checkpoint_transaction(MigrationState *s, | 37 | + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick); |
39 | goto out; | 38 | + event_notifier_test_and_clear(n); |
39 | + event_notifier_set(&svq->hdev_kick); | ||
40 | +} | ||
41 | + | ||
42 | +/** | ||
43 | + * Set a new file descriptor for the guest to kick the SVQ and notify for avail | ||
44 | + * | ||
45 | + * @svq: The svq | ||
46 | + * @svq_kick_fd: The svq kick fd | ||
47 | + * | ||
48 | + * Note that the SVQ will never close the old file descriptor. | ||
49 | + */ | ||
50 | +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
51 | +{ | ||
52 | + EventNotifier *svq_kick = &svq->svq_kick; | ||
53 | + bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick); | ||
54 | + bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND; | ||
55 | + | ||
56 | + if (poll_stop) { | ||
57 | + event_notifier_set_handler(svq_kick, NULL); | ||
58 | + } | ||
59 | + | ||
60 | + /* | ||
61 | + * event_notifier_set_handler already checks for guest's notifications if | ||
62 | + * they arrive at the new file descriptor in the switch, so there is no | ||
63 | + * need to explicitly check for them. | ||
64 | + */ | ||
65 | + if (poll_start) { | ||
66 | + event_notifier_init_fd(svq_kick, svq_kick_fd); | ||
67 | + event_notifier_set(svq_kick); | ||
68 | + event_notifier_set_handler(svq_kick, vhost_handle_guest_kick); | ||
69 | + } | ||
70 | +} | ||
71 | + | ||
72 | +/** | ||
73 | + * Stop the shadow virtqueue operation. | ||
74 | + * @svq: Shadow Virtqueue | ||
75 | + */ | ||
76 | +void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
77 | +{ | ||
78 | + event_notifier_set_handler(&svq->svq_kick, NULL); | ||
79 | +} | ||
80 | |||
81 | /** | ||
82 | * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
83 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
84 | goto err_init_hdev_call; | ||
40 | } | 85 | } |
41 | 86 | ||
42 | + colo_notify_compares_event(NULL, COLO_EVENT_CHECKPOINT, &local_err); | 87 | + event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); |
43 | + if (local_err) { | 88 | return g_steal_pointer(&svq); |
44 | + goto out; | 89 | |
45 | + } | 90 | err_init_hdev_call: |
46 | + | 91 | @@ -XXX,XX +XXX,XX @@ err_init_hdev_kick: |
47 | /* Disable block migration */ | 92 | void vhost_svq_free(gpointer pvq) |
48 | migrate_set_block_enabled(false, &local_err); | 93 | { |
49 | qemu_savevm_state_header(fb); | 94 | VhostShadowVirtqueue *vq = pvq; |
50 | @@ -XXX,XX +XXX,XX @@ out: | 95 | + vhost_svq_stop(vq); |
96 | event_notifier_cleanup(&vq->hdev_kick); | ||
97 | event_notifier_cleanup(&vq->hdev_call); | ||
98 | g_free(vq); | ||
99 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
100 | index XXXXXXX..XXXXXXX 100644 | ||
101 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
102 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
103 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
104 | EventNotifier hdev_kick; | ||
105 | /* Shadow call notifier, sent to vhost */ | ||
106 | EventNotifier hdev_call; | ||
107 | + | ||
108 | + /* | ||
109 | + * Borrowed virtqueue's guest to host notifier. To borrow it in this event | ||
110 | + * notifier allows to recover the VhostShadowVirtqueue from the event loop | ||
111 | + * easily. If we use the VirtQueue's one, we don't have an easy way to | ||
112 | + * retrieve VhostShadowVirtqueue. | ||
113 | + * | ||
114 | + * So shadow virtqueue must not clean it, or we would lose VirtQueue one. | ||
115 | + */ | ||
116 | + EventNotifier svq_kick; | ||
117 | } VhostShadowVirtqueue; | ||
118 | |||
119 | +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
120 | + | ||
121 | +void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
122 | + | ||
123 | VhostShadowVirtqueue *vhost_svq_new(void); | ||
124 | |||
125 | void vhost_svq_free(gpointer vq); | ||
126 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
127 | index XXXXXXX..XXXXXXX 100644 | ||
128 | --- a/hw/virtio/vhost-vdpa.c | ||
129 | +++ b/hw/virtio/vhost-vdpa.c | ||
130 | @@ -XXX,XX +XXX,XX @@ | ||
131 | #include "hw/virtio/vhost.h" | ||
132 | #include "hw/virtio/vhost-backend.h" | ||
133 | #include "hw/virtio/virtio-net.h" | ||
134 | +#include "hw/virtio/vhost-shadow-virtqueue.h" | ||
135 | #include "hw/virtio/vhost-vdpa.h" | ||
136 | #include "exec/address-spaces.h" | ||
137 | #include "qemu/main-loop.h" | ||
138 | #include "cpu.h" | ||
139 | #include "trace.h" | ||
140 | #include "qemu-common.h" | ||
141 | +#include "qapi/error.h" | ||
142 | |||
143 | /* | ||
144 | * Return one past the end of the end of section. Be careful with uint64_t | ||
145 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) | ||
146 | return v->index != 0; | ||
147 | } | ||
148 | |||
149 | +static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
150 | + Error **errp) | ||
151 | +{ | ||
152 | + g_autoptr(GPtrArray) shadow_vqs = NULL; | ||
153 | + | ||
154 | + if (!v->shadow_vqs_enabled) { | ||
155 | + return 0; | ||
156 | + } | ||
157 | + | ||
158 | + shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
159 | + for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
160 | + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
161 | + | ||
162 | + if (unlikely(!svq)) { | ||
163 | + error_setg(errp, "Cannot create svq %u", n); | ||
164 | + return -1; | ||
165 | + } | ||
166 | + g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq)); | ||
167 | + } | ||
168 | + | ||
169 | + v->shadow_vqs = g_steal_pointer(&shadow_vqs); | ||
170 | + return 0; | ||
171 | +} | ||
172 | + | ||
173 | static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
174 | { | ||
175 | struct vhost_vdpa *v; | ||
176 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
177 | dev->opaque = opaque ; | ||
178 | v->listener = vhost_vdpa_memory_listener; | ||
179 | v->msg_type = VHOST_IOTLB_MSG_V2; | ||
180 | + ret = vhost_vdpa_init_svq(dev, v, errp); | ||
181 | + if (ret) { | ||
182 | + goto err; | ||
183 | + } | ||
184 | |||
185 | vhost_vdpa_get_iova_range(v); | ||
186 | |||
187 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
188 | VIRTIO_CONFIG_S_DRIVER); | ||
189 | |||
190 | return 0; | ||
191 | + | ||
192 | +err: | ||
193 | + ram_block_discard_disable(false); | ||
194 | + return ret; | ||
195 | } | ||
196 | |||
197 | static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev, | ||
198 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n) | ||
199 | |||
200 | static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev) | ||
201 | { | ||
202 | + struct vhost_vdpa *v = dev->opaque; | ||
203 | int i; | ||
204 | |||
205 | + if (v->shadow_vqs_enabled) { | ||
206 | + /* FIXME SVQ is not compatible with host notifiers mr */ | ||
207 | + return; | ||
208 | + } | ||
209 | + | ||
210 | for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) { | ||
211 | if (vhost_vdpa_host_notifier_init(dev, i)) { | ||
212 | goto err; | ||
213 | @@ -XXX,XX +XXX,XX @@ err: | ||
214 | return; | ||
215 | } | ||
216 | |||
217 | +static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev) | ||
218 | +{ | ||
219 | + struct vhost_vdpa *v = dev->opaque; | ||
220 | + size_t idx; | ||
221 | + | ||
222 | + if (!v->shadow_vqs) { | ||
223 | + return; | ||
224 | + } | ||
225 | + | ||
226 | + for (idx = 0; idx < v->shadow_vqs->len; ++idx) { | ||
227 | + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx)); | ||
228 | + } | ||
229 | + g_ptr_array_free(v->shadow_vqs, true); | ||
230 | +} | ||
231 | + | ||
232 | static int vhost_vdpa_cleanup(struct vhost_dev *dev) | ||
233 | { | ||
234 | struct vhost_vdpa *v; | ||
235 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_cleanup(struct vhost_dev *dev) | ||
236 | trace_vhost_vdpa_cleanup(dev, v); | ||
237 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | ||
238 | memory_listener_unregister(&v->listener); | ||
239 | + vhost_vdpa_svq_cleanup(dev); | ||
240 | |||
241 | dev->opaque = NULL; | ||
242 | ram_block_discard_disable(false); | ||
243 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev, | ||
51 | return ret; | 244 | return ret; |
52 | } | 245 | } |
53 | 246 | ||
54 | +static void colo_compare_notify_checkpoint(Notifier *notifier, void *data) | 247 | +static void vhost_vdpa_reset_svq(struct vhost_vdpa *v) |
55 | +{ | 248 | +{ |
56 | + colo_checkpoint_notify(data); | 249 | + if (!v->shadow_vqs_enabled) { |
57 | +} | 250 | + return; |
58 | + | 251 | + } |
59 | static void colo_process_checkpoint(MigrationState *s) | 252 | + |
60 | { | 253 | + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { |
61 | QIOChannelBuffer *bioc; | 254 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); |
62 | @@ -XXX,XX +XXX,XX @@ static void colo_process_checkpoint(MigrationState *s) | 255 | + vhost_svq_stop(svq); |
63 | goto out; | 256 | + } |
64 | } | 257 | +} |
65 | 258 | + | |
66 | + packets_compare_notifier.notify = colo_compare_notify_checkpoint; | 259 | static int vhost_vdpa_reset_device(struct vhost_dev *dev) |
67 | + colo_compare_register_notifier(&packets_compare_notifier); | 260 | { |
68 | + | 261 | + struct vhost_vdpa *v = dev->opaque; |
69 | /* | 262 | int ret; |
70 | * Wait for Secondary finish loading VM states and enter COLO | 263 | uint8_t status = 0; |
71 | * restore. | 264 | |
72 | @@ -XXX,XX +XXX,XX @@ out: | 265 | + vhost_vdpa_reset_svq(v); |
73 | qemu_fclose(fb); | 266 | + |
74 | } | 267 | ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); |
75 | 268 | trace_vhost_vdpa_reset_device(dev, status); | |
76 | - timer_del(s->colo_delay_timer); | 269 | return ret; |
77 | - | 270 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, |
78 | /* Hope this not to be too long to wait here */ | 271 | return ret; |
79 | qemu_sem_wait(&s->colo_exit_sem); | 272 | } |
80 | qemu_sem_destroy(&s->colo_exit_sem); | 273 | |
81 | + | 274 | +static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, |
82 | + /* | 275 | + struct vhost_vring_file *file) |
83 | + * It is safe to unregister notifier after failover finished. | 276 | +{ |
84 | + * Besides, colo_delay_timer and colo_checkpoint_sem can't be | 277 | + trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd); |
86 | + * released before unregistering the notifier, or there will be a use-after-free | 286 | + * @errp: Error |
86 | + * error. | 279 | +} |
87 | + */ | 280 | + |
88 | + colo_compare_unregister_notifier(&packets_compare_notifier); | 281 | +/** |
89 | + timer_del(s->colo_delay_timer); | 282 | + * Set the shadow virtqueue descriptors to the device |
90 | + timer_free(s->colo_delay_timer); | 283 | + * |
91 | + qemu_sem_destroy(&s->colo_checkpoint_sem); | 284 | + * @dev: The vhost device model |
92 | + | 285 | + * @svq: The shadow virtqueue |
93 | /* | 286 | + * @idx: The index of the virtqueue in the vhost device |
94 | * Must be called after failover BH is completed, | 287 | + * @errp: Error |
95 | * Or the failover BH may shutdown the wrong fd that | 288 | + */ |
96 | @@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque) | 289 | +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, |
97 | fb = qemu_fopen_channel_input(QIO_CHANNEL(bioc)); | 290 | + VhostShadowVirtqueue *svq, unsigned idx, |
98 | object_unref(OBJECT(bioc)); | 291 | + Error **errp) |
99 | 292 | +{ | |
100 | + qemu_mutex_lock_iothread(); | 293 | + struct vhost_vring_file file = { |
101 | + vm_start(); | 294 | + .index = dev->vq_index + idx, |
102 | + trace_colo_vm_state_change("stop", "run"); | 295 | + }; |
103 | + qemu_mutex_unlock_iothread(); | 296 | + const EventNotifier *event_notifier = &svq->hdev_kick; |
104 | + | 297 | + int r; |
105 | colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_READY, | 298 | + |
106 | &local_err); | 299 | + file.fd = event_notifier_get_fd(event_notifier); |
107 | if (local_err) { | 300 | + r = vhost_vdpa_set_vring_dev_kick(dev, &file); |
108 | @@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque) | 301 | + if (unlikely(r != 0)) { |
109 | goto out; | 302 | + error_setg_errno(errp, -r, "Can't set device kick fd"); |
110 | } | 303 | + } |
111 | 304 | + | |
112 | + qemu_mutex_lock_iothread(); | 305 | + return r == 0; |
113 | + vm_stop_force_state(RUN_STATE_COLO); | 306 | +} |
114 | + trace_colo_vm_state_change("run", "stop"); | 307 | + |
115 | + qemu_mutex_unlock_iothread(); | 308 | +static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) |
116 | + | 309 | +{ |
117 | /* FIXME: This is unnecessary for periodic checkpoint mode */ | 310 | + struct vhost_vdpa *v = dev->opaque; |
118 | colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_REPLY, | 311 | + Error *err = NULL; |
119 | &local_err); | 312 | + unsigned i; |
120 | @@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque) | 313 | + |
121 | } | 314 | + if (!v->shadow_vqs) { |
122 | 315 | + return true; | |
123 | vmstate_loading = false; | 316 | + } |
124 | + vm_start(); | 317 | + |
125 | + trace_colo_vm_state_change("stop", "run"); | 318 | + for (i = 0; i < v->shadow_vqs->len; ++i) { |
126 | qemu_mutex_unlock_iothread(); | 319 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); |
127 | 320 | + bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); | |
128 | if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) { | 321 | + if (unlikely(!ok)) { |
129 | diff --git a/migration/migration.c b/migration/migration.c | 322 | + error_reportf_err(err, "Cannot setup SVQ %u: ", i); |
323 | + return false; | ||
324 | + } | ||
325 | + } | ||
326 | + | ||
327 | + return true; | ||
328 | +} | ||
329 | + | ||
330 | static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) | ||
331 | { | ||
332 | struct vhost_vdpa *v = dev->opaque; | ||
333 | + bool ok; | ||
334 | trace_vhost_vdpa_dev_start(dev, started); | ||
335 | |||
336 | if (started) { | ||
337 | vhost_vdpa_host_notifiers_init(dev); | ||
338 | + ok = vhost_vdpa_svqs_start(dev); | ||
339 | + if (unlikely(!ok)) { | ||
340 | + return -1; | ||
341 | + } | ||
342 | vhost_vdpa_set_vring_ready(dev); | ||
343 | } else { | ||
344 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | ||
345 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, | ||
346 | static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, | ||
347 | struct vhost_vring_file *file) | ||
348 | { | ||
349 | - trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd); | ||
350 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
351 | + struct vhost_vdpa *v = dev->opaque; | ||
352 | + int vdpa_idx = file->index - dev->vq_index; | ||
353 | + | ||
354 | + if (v->shadow_vqs_enabled) { | ||
355 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); | ||
356 | + vhost_svq_set_svq_kick_fd(svq, file->fd); | ||
357 | + return 0; | ||
358 | + } else { | ||
359 | + return vhost_vdpa_set_vring_dev_kick(dev, file); | ||
360 | + } | ||
361 | } | ||
362 | |||
363 | static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | ||
364 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | ||
130 | index XXXXXXX..XXXXXXX 100644 | 365 | index XXXXXXX..XXXXXXX 100644 |
131 | --- a/migration/migration.c | 366 | --- a/include/hw/virtio/vhost-vdpa.h |
132 | +++ b/migration/migration.c | 367 | +++ b/include/hw/virtio/vhost-vdpa.h |
133 | @@ -XXX,XX +XXX,XX @@ | 368 | @@ -XXX,XX +XXX,XX @@ |
134 | /* Migration XBZRLE default cache size */ | 369 | #ifndef HW_VIRTIO_VHOST_VDPA_H |
135 | #define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024) | 370 | #define HW_VIRTIO_VHOST_VDPA_H |
136 | 371 | ||
137 | -/* The delay time (in ms) between two COLO checkpoints | 372 | +#include <gmodule.h> |
138 | - * Note: Please change this default value to 10000 when we support hybrid mode. | 373 | + |
139 | - */ | 374 | #include "hw/virtio/virtio.h" |
140 | -#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY 200 | 375 | #include "standard-headers/linux/vhost_types.h" |
141 | +/* The delay time (in ms) between two COLO checkpoints */ | 376 | |
142 | +#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100) | 377 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { |
143 | #define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2 | 378 | bool iotlb_batch_begin_sent; |
144 | #define DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT 16 | 379 | MemoryListener listener; |
145 | 380 | struct vhost_vdpa_iova_range iova_range; | |
381 | + bool shadow_vqs_enabled; | ||
382 | + GPtrArray *shadow_vqs; | ||
383 | struct vhost_dev *dev; | ||
384 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; | ||
385 | } VhostVDPA; | ||
146 | -- | 386 | -- |
147 | 2.5.0 | 387 | 2.7.4 |
148 | 388 | ||
149 | 389 | diff view generated by jsdifflib |
1 | From: Zhang Chen <zhangckid@gmail.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | We need to know if migration is going into COLO state on the | 3 | This will make qemu aware of the device's used buffers, allowing it to |
4 | incoming side before starting normal migration. | 4 | write the guest memory with their contents if needed. |
5 | 5 | ||
6 | Instead of using the VMStateDescription to send colo_state | 6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
7 | from the source side to the destination side, we use MIG_CMD_ENABLE_COLO | 7 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
8 | to indicate whether COLO is enabled or not. | ||
9 | |||
10 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
11 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | ||
12 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
13 | Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> | ||
14 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 8 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
15 | --- | 9 | --- |
16 | include/migration/colo.h | 5 ++-- | 10 | hw/virtio/vhost-shadow-virtqueue.c | 38 ++++++++++++++++++++++++++++++++++++++ |
17 | migration/Makefile.objs | 2 +- | 11 | hw/virtio/vhost-shadow-virtqueue.h | 4 ++++ |
18 | migration/colo-comm.c | 76 ------------------------------------------------ | 12 | hw/virtio/vhost-vdpa.c | 31 +++++++++++++++++++++++++++++-- |
19 | migration/colo.c | 13 ++++++++- | 13 | 3 files changed, 71 insertions(+), 2 deletions(-) |
20 | migration/migration.c | 23 ++++++++++++++- | ||
21 | migration/savevm.c | 17 +++++++++++ | ||
22 | migration/savevm.h | 1 + | ||
23 | migration/trace-events | 1 + | ||
24 | vl.c | 2 -- | ||
25 | 9 files changed, 57 insertions(+), 83 deletions(-) | ||
26 | delete mode 100644 migration/colo-comm.c | ||
27 | 14 | ||
28 | diff --git a/include/migration/colo.h b/include/migration/colo.h | 15 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
29 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
30 | --- a/include/migration/colo.h | 17 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
31 | +++ b/include/migration/colo.h | 18 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
32 | @@ -XXX,XX +XXX,XX @@ void migrate_start_colo_process(MigrationState *s); | 19 | @@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(EventNotifier *n) |
33 | bool migration_in_colo_state(void); | ||
34 | |||
35 | /* loadvm */ | ||
36 | -bool migration_incoming_enable_colo(void); | ||
37 | -void migration_incoming_exit_colo(void); | ||
38 | +void migration_incoming_enable_colo(void); | ||
39 | +void migration_incoming_disable_colo(void); | ||
40 | +bool migration_incoming_colo_enabled(void); | ||
41 | void *colo_process_incoming_thread(void *opaque); | ||
42 | bool migration_incoming_in_colo_state(void); | ||
43 | |||
44 | diff --git a/migration/Makefile.objs b/migration/Makefile.objs | ||
45 | index XXXXXXX..XXXXXXX 100644 | ||
46 | --- a/migration/Makefile.objs | ||
47 | +++ b/migration/Makefile.objs | ||
48 | @@ -XXX,XX +XXX,XX @@ | ||
49 | common-obj-y += migration.o socket.o fd.o exec.o | ||
50 | common-obj-y += tls.o channel.o savevm.o | ||
51 | -common-obj-y += colo-comm.o colo.o colo-failover.o | ||
52 | +common-obj-y += colo.o colo-failover.o | ||
53 | common-obj-y += vmstate.o vmstate-types.o page_cache.o | ||
54 | common-obj-y += qemu-file.o global_state.o | ||
55 | common-obj-y += qemu-file-channel.o | ||
56 | diff --git a/migration/colo-comm.c b/migration/colo-comm.c | ||
57 | deleted file mode 100644 | ||
58 | index XXXXXXX..XXXXXXX | ||
59 | --- a/migration/colo-comm.c | ||
60 | +++ /dev/null | ||
61 | @@ -XXX,XX +XXX,XX @@ | ||
62 | -/* | ||
63 | - * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) | ||
64 | - * (a.k.a. Fault Tolerance or Continuous Replication) | ||
65 | - * | ||
66 | - * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. | ||
67 | - * Copyright (c) 2016 FUJITSU LIMITED | ||
68 | - * Copyright (c) 2016 Intel Corporation | ||
69 | - * | ||
70 | - * This work is licensed under the terms of the GNU GPL, version 2 or | ||
71 | - * later. See the COPYING file in the top-level directory. | ||
72 | - * | ||
73 | - */ | ||
74 | - | ||
75 | -#include "qemu/osdep.h" | ||
76 | -#include "migration.h" | ||
77 | -#include "migration/colo.h" | ||
78 | -#include "migration/vmstate.h" | ||
79 | -#include "trace.h" | ||
80 | - | ||
81 | -typedef struct { | ||
82 | - bool colo_requested; | ||
83 | -} COLOInfo; | ||
84 | - | ||
85 | -static COLOInfo colo_info; | ||
86 | - | ||
87 | -COLOMode get_colo_mode(void) | ||
88 | -{ | ||
89 | - if (migration_in_colo_state()) { | ||
90 | - return COLO_MODE_PRIMARY; | ||
91 | - } else if (migration_incoming_in_colo_state()) { | ||
92 | - return COLO_MODE_SECONDARY; | ||
93 | - } else { | ||
94 | - return COLO_MODE_UNKNOWN; | ||
95 | - } | ||
96 | -} | ||
97 | - | ||
98 | -static int colo_info_pre_save(void *opaque) | ||
99 | -{ | ||
100 | - COLOInfo *s = opaque; | ||
101 | - | ||
102 | - s->colo_requested = migrate_colo_enabled(); | ||
103 | - | ||
104 | - return 0; | ||
105 | -} | ||
106 | - | ||
107 | -static bool colo_info_need(void *opaque) | ||
108 | -{ | ||
109 | - return migrate_colo_enabled(); | ||
110 | -} | ||
111 | - | ||
112 | -static const VMStateDescription colo_state = { | ||
113 | - .name = "COLOState", | ||
114 | - .version_id = 1, | ||
115 | - .minimum_version_id = 1, | ||
116 | - .pre_save = colo_info_pre_save, | ||
117 | - .needed = colo_info_need, | ||
118 | - .fields = (VMStateField[]) { | ||
119 | - VMSTATE_BOOL(colo_requested, COLOInfo), | ||
120 | - VMSTATE_END_OF_LIST() | ||
121 | - }, | ||
122 | -}; | ||
123 | - | ||
124 | -void colo_info_init(void) | ||
125 | -{ | ||
126 | - vmstate_register(NULL, 0, &colo_state, &colo_info); | ||
127 | -} | ||
128 | - | ||
129 | -bool migration_incoming_enable_colo(void) | ||
130 | -{ | ||
131 | - return colo_info.colo_requested; | ||
132 | -} | ||
133 | - | ||
134 | -void migration_incoming_exit_colo(void) | ||
135 | -{ | ||
136 | - colo_info.colo_requested = false; | ||
137 | -} | ||
138 | diff --git a/migration/colo.c b/migration/colo.c | ||
139 | index XXXXXXX..XXXXXXX 100644 | ||
140 | --- a/migration/colo.c | ||
141 | +++ b/migration/colo.c | ||
142 | @@ -XXX,XX +XXX,XX @@ static void primary_vm_do_failover(void) | ||
143 | qemu_sem_post(&s->colo_exit_sem); | ||
144 | } | 20 | } |
145 | 21 | ||
146 | +COLOMode get_colo_mode(void) | 22 | /** |
23 | + * Forward vhost notifications | ||
24 | + * | ||
25 | + * @n: hdev call event notifier, the one that device set to notify svq. | ||
26 | + */ | ||
27 | +static void vhost_svq_handle_call(EventNotifier *n) | ||
147 | +{ | 28 | +{ |
148 | + if (migration_in_colo_state()) { | 29 | + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, |
149 | + return COLO_MODE_PRIMARY; | 30 | + hdev_call); |
150 | + } else if (migration_incoming_in_colo_state()) { | 31 | + event_notifier_test_and_clear(n); |
151 | + return COLO_MODE_SECONDARY; | 32 | + event_notifier_set(&svq->svq_call); |
33 | +} | ||
34 | + | ||
35 | +/** | ||
36 | + * Set the call notifier for the SVQ to call the guest | ||
37 | + * | ||
38 | + * @svq: Shadow virtqueue | ||
39 | + * @call_fd: call notifier | ||
40 | + * | ||
41 | + * Called on BQL context. | ||
42 | + */ | ||
43 | +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) | ||
44 | +{ | ||
45 | + if (call_fd == VHOST_FILE_UNBIND) { | ||
46 | + /* | ||
47 | + * Fail event_notifier_set if called handling device call. | ||
48 | + * | ||
49 | + * SVQ still needs device notifications, since it needs to keep | ||
50 | + * forwarding used buffers even with the unbind. | ||
51 | + */ | ||
52 | + memset(&svq->svq_call, 0, sizeof(svq->svq_call)); | ||
152 | + } else { | 53 | + } else { |
153 | + return COLO_MODE_UNKNOWN; | 54 | + event_notifier_init_fd(&svq->svq_call, call_fd); |
154 | + } | 55 | + } |
155 | +} | 56 | +} |
156 | + | 57 | + |
157 | void colo_do_failover(MigrationState *s) | 58 | +/** |
158 | { | 59 | * Set a new file descriptor for the guest to kick the SVQ and notify for avail |
159 | /* Make sure VM stopped while failover happened. */ | 60 | * |
160 | @@ -XXX,XX +XXX,XX @@ out: | 61 | * @svq: The svq |
161 | if (mis->to_src_file) { | 62 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) |
162 | qemu_fclose(mis->to_src_file); | ||
163 | } | 63 | } |
164 | - migration_incoming_exit_colo(); | 64 | |
165 | + migration_incoming_disable_colo(); | 65 | event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); |
166 | 66 | + event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); | |
167 | rcu_unregister_thread(); | 67 | return g_steal_pointer(&svq); |
168 | return NULL; | 68 | |
169 | diff --git a/migration/migration.c b/migration/migration.c | 69 | err_init_hdev_call: |
70 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_free(gpointer pvq) | ||
71 | VhostShadowVirtqueue *vq = pvq; | ||
72 | vhost_svq_stop(vq); | ||
73 | event_notifier_cleanup(&vq->hdev_kick); | ||
74 | + event_notifier_set_handler(&vq->hdev_call, NULL); | ||
75 | event_notifier_cleanup(&vq->hdev_call); | ||
76 | g_free(vq); | ||
77 | } | ||
78 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
170 | index XXXXXXX..XXXXXXX 100644 | 79 | index XXXXXXX..XXXXXXX 100644 |
171 | --- a/migration/migration.c | 80 | --- a/hw/virtio/vhost-shadow-virtqueue.h |
172 | +++ b/migration/migration.c | 81 | +++ b/hw/virtio/vhost-shadow-virtqueue.h |
173 | @@ -XXX,XX +XXX,XX @@ int migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname, | 82 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { |
174 | return migrate_send_rp_message(mis, msg_type, msglen, bufc); | 83 | * So shadow virtqueue must not clean it, or we would lose VirtQueue one. |
84 | */ | ||
85 | EventNotifier svq_kick; | ||
86 | + | ||
87 | + /* Guest's call notifier, where the SVQ calls guest. */ | ||
88 | + EventNotifier svq_call; | ||
89 | } VhostShadowVirtqueue; | ||
90 | |||
91 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
92 | +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
93 | |||
94 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
95 | |||
96 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
97 | index XXXXXXX..XXXXXXX 100644 | ||
98 | --- a/hw/virtio/vhost-vdpa.c | ||
99 | +++ b/hw/virtio/vhost-vdpa.c | ||
100 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
101 | return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
175 | } | 102 | } |
176 | 103 | ||
177 | +static bool migration_colo_enabled; | 104 | +static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, |
178 | +bool migration_incoming_colo_enabled(void) | 105 | + struct vhost_vring_file *file) |
179 | +{ | 106 | +{ |
180 | + return migration_colo_enabled; | 107 | + trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd); |
108 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | ||
181 | +} | 109 | +} |
182 | + | 110 | + |
183 | +void migration_incoming_disable_colo(void) | 111 | /** |
184 | +{ | 112 | * Set the shadow virtqueue descriptors to the device |
185 | + migration_colo_enabled = false; | 113 | * |
186 | +} | 114 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, |
187 | + | 115 | * @svq: The shadow virtqueue |
188 | +void migration_incoming_enable_colo(void) | 116 | * @idx: The index of the virtqueue in the vhost device |
189 | +{ | 117 | * @errp: Error |
190 | + migration_colo_enabled = true; | 118 | + * |
191 | +} | 119 | + * Note that this function does not rewind kick file descriptor if cannot set |
192 | + | 120 | + * call one. |
193 | void qemu_start_incoming_migration(const char *uri, Error **errp) | 121 | */ |
194 | { | 122 | static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, |
195 | const char *p; | 123 | VhostShadowVirtqueue *svq, unsigned idx, |
196 | @@ -XXX,XX +XXX,XX @@ static void process_incoming_migration_co(void *opaque) | 124 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, |
197 | } | 125 | r = vhost_vdpa_set_vring_dev_kick(dev, &file); |
198 | 126 | if (unlikely(r != 0)) { | |
199 | /* we get COLO info, and know if we are in COLO mode */ | 127 | error_setg_errno(errp, -r, "Can't set device kick fd"); |
200 | - if (!ret && migration_incoming_enable_colo()) { | 128 | + return false; |
201 | + if (!ret && migration_incoming_colo_enabled()) { | ||
202 | /* Make sure all file formats flush their mutable metadata */ | ||
203 | bdrv_invalidate_cache_all(&local_err); | ||
204 | if (local_err) { | ||
205 | @@ -XXX,XX +XXX,XX @@ static void *migration_thread(void *opaque) | ||
206 | qemu_savevm_send_postcopy_advise(s->to_dst_file); | ||
207 | } | ||
208 | |||
209 | + if (migrate_colo_enabled()) { | ||
210 | + /* Notify migration destination that we enable COLO */ | ||
211 | + qemu_savevm_send_colo_enable(s->to_dst_file); | ||
212 | + } | 129 | + } |
213 | + | 130 | + |
214 | qemu_savevm_state_setup(s->to_dst_file); | 131 | + event_notifier = &svq->hdev_call; |
215 | 132 | + file.fd = event_notifier_get_fd(event_notifier); | |
216 | s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; | 133 | + r = vhost_vdpa_set_vring_dev_call(dev, &file); |
217 | diff --git a/migration/savevm.c b/migration/savevm.c | 134 | + if (unlikely(r != 0)) { |
218 | index XXXXXXX..XXXXXXX 100644 | 135 | + error_setg_errno(errp, -r, "Can't set device call fd"); |
219 | --- a/migration/savevm.c | 136 | } |
220 | +++ b/migration/savevm.c | 137 | |
221 | @@ -XXX,XX +XXX,XX @@ | 138 | return r == 0; |
222 | #include "io/channel-file.h" | 139 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, |
223 | #include "sysemu/replay.h" | 140 | static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, |
224 | #include "qjson.h" | 141 | struct vhost_vring_file *file) |
225 | +#include "migration/colo.h" | 142 | { |
226 | 143 | - trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd); | |
227 | #ifndef ETH_P_RARP | 144 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); |
228 | #define ETH_P_RARP 0x8035 | 145 | + struct vhost_vdpa *v = dev->opaque; |
229 | @@ -XXX,XX +XXX,XX @@ enum qemu_vm_cmd { | 146 | + |
230 | were previously sent during | 147 | + if (v->shadow_vqs_enabled) { |
231 | precopy but are dirty. */ | 148 | + int vdpa_idx = file->index - dev->vq_index; |
232 | MIG_CMD_PACKAGED, /* Send a wrapped stream within this stream */ | 149 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); |
233 | + MIG_CMD_ENABLE_COLO, /* Enable COLO */ | 150 | + |
234 | MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */ | 151 | + vhost_svq_set_svq_call_fd(svq, file->fd); |
235 | MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */ | 152 | + return 0; |
236 | MIG_CMD_MAX | 153 | + } else { |
237 | @@ -XXX,XX +XXX,XX @@ static void qemu_savevm_command_send(QEMUFile *f, | 154 | + return vhost_vdpa_set_vring_dev_call(dev, file); |
238 | qemu_fflush(f); | 155 | + } |
239 | } | 156 | } |
240 | 157 | ||
241 | +void qemu_savevm_send_colo_enable(QEMUFile *f) | 158 | static int vhost_vdpa_get_features(struct vhost_dev *dev, |
242 | +{ | ||
243 | + trace_savevm_send_colo_enable(); | ||
244 | + qemu_savevm_command_send(f, MIG_CMD_ENABLE_COLO, 0, NULL); | ||
245 | +} | ||
246 | + | ||
247 | void qemu_savevm_send_ping(QEMUFile *f, uint32_t value) | ||
248 | { | ||
249 | uint32_t buf; | ||
250 | @@ -XXX,XX +XXX,XX @@ static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis, | ||
251 | return 0; | ||
252 | } | ||
253 | |||
254 | +static int loadvm_process_enable_colo(MigrationIncomingState *mis) | ||
255 | +{ | ||
256 | + migration_incoming_enable_colo(); | ||
257 | + return 0; | ||
258 | +} | ||
259 | + | ||
260 | /* | ||
261 | * Process an incoming 'QEMU_VM_COMMAND' | ||
262 | * 0 just a normal return | ||
263 | @@ -XXX,XX +XXX,XX @@ static int loadvm_process_command(QEMUFile *f) | ||
264 | |||
265 | case MIG_CMD_RECV_BITMAP: | ||
266 | return loadvm_handle_recv_bitmap(mis, len); | ||
267 | + | ||
268 | + case MIG_CMD_ENABLE_COLO: | ||
269 | + return loadvm_process_enable_colo(mis); | ||
270 | } | ||
271 | |||
272 | return 0; | ||
273 | diff --git a/migration/savevm.h b/migration/savevm.h | ||
274 | index XXXXXXX..XXXXXXX 100644 | ||
275 | --- a/migration/savevm.h | ||
276 | +++ b/migration/savevm.h | ||
277 | @@ -XXX,XX +XXX,XX @@ void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name, | ||
278 | uint16_t len, | ||
279 | uint64_t *start_list, | ||
280 | uint64_t *length_list); | ||
281 | +void qemu_savevm_send_colo_enable(QEMUFile *f); | ||
282 | |||
283 | int qemu_loadvm_state(QEMUFile *f); | ||
284 | void qemu_loadvm_state_cleanup(void); | ||
285 | diff --git a/migration/trace-events b/migration/trace-events | ||
286 | index XXXXXXX..XXXXXXX 100644 | ||
287 | --- a/migration/trace-events | ||
288 | +++ b/migration/trace-events | ||
289 | @@ -XXX,XX +XXX,XX @@ savevm_send_ping(uint32_t val) "0x%x" | ||
290 | savevm_send_postcopy_listen(void) "" | ||
291 | savevm_send_postcopy_run(void) "" | ||
292 | savevm_send_postcopy_resume(void) "" | ||
293 | +savevm_send_colo_enable(void) "" | ||
294 | savevm_send_recv_bitmap(char *name) "%s" | ||
295 | savevm_state_setup(void) "" | ||
296 | savevm_state_resume_prepare(void) "" | ||
297 | diff --git a/vl.c b/vl.c | ||
298 | index XXXXXXX..XXXXXXX 100644 | ||
299 | --- a/vl.c | ||
300 | +++ b/vl.c | ||
301 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp) | ||
302 | #endif | ||
303 | } | ||
304 | |||
305 | - colo_info_init(); | ||
306 | - | ||
307 | if (net_init_clients(&err) < 0) { | ||
308 | error_report_err(err); | ||
309 | exit(1); | ||
310 | -- | 159 | -- |
311 | 2.5.0 | 160 | 2.7.4 |
312 | 161 | ||
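The vhost-vdpa hunks above split the call path in two once shadow virtqueues are enabled: the device keeps signalling an SVQ-owned notifier (vhost_vdpa_set_vring_dev_call), while the guest's call fd is only recorded in the SVQ with vhost_svq_set_svq_call_fd. Below is a minimal standalone sketch of that routing using plain eventfds; the struct and helper names are invented for illustration and are not QEMU APIs.

    /*
     * Toy model, not QEMU code: vq_set_guest_call_fd() mirrors the logic of
     * vhost_vdpa_set_vring_call() above. With shadowing enabled the device
     * keeps signalling the SVQ-owned eventfd and never touches the guest fd.
     */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    struct toy_vq {
        bool shadow_enabled;
        int guest_call_fd;    /* fd the guest waits on for interrupts   */
        int device_call_fd;   /* fd the device writes when buffers used */
    };

    static void vq_set_guest_call_fd(struct toy_vq *vq, int guest_fd)
    {
        vq->guest_call_fd = guest_fd;
        if (!vq->shadow_enabled) {
            /* No SVQ: hand the guest's fd straight to the device. */
            vq->device_call_fd = guest_fd;
        }
        /* With SVQ, device_call_fd keeps pointing at the SVQ notifier. */
    }

    int main(void)
    {
        struct toy_vq vq = { .shadow_enabled = true,
                             .device_call_fd = eventfd(0, 0) };
        uint64_t one = 1, val;

        vq_set_guest_call_fd(&vq, eventfd(0, 0));

        write(vq.device_call_fd, &one, sizeof(one)); /* "device" call      */
        read(vq.device_call_fd, &val, sizeof(val));  /* SVQ wakes up...    */
        write(vq.guest_call_fd, &one, sizeof(one));  /* ...and calls guest */
        read(vq.guest_call_fd, &val, sizeof(val));
        printf("interrupt reached the guest via the SVQ notifier\n");
        return 0;
    }

The point of the split is that once SVQ owns the used ring, only SVQ decides when the guest is interrupted; the device's notifications terminate at the SVQ notifier.
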
1 | From: Zhang Chen <zhangckid@gmail.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | We add an almost complete TCP state machine in filter-rewriter, except | 3 | This allows SVQ to negotiate features with the guest and the device. For |
4 | TCPS_LISTEN and some simplification of the VM active close FIN states. | 4 | the device, SVQ is a driver. While this function bypasses all |
5 | The reason for this simplification is that the guest kernel will track | 5 | non-transport features, it needs to disable the features that SVQ does |
6 | the TCP status and wait the 2MSL time too; if the client resends the FIN packet, | 6 | not support when forwarding buffers. This includes packed vq layout, |
7 | the guest will resend the last ACK, so we needn't wait the 2MSL time in filter-rewriter. | 7 | indirect descriptors or event idx. |
8 | 8 | ||
9 | After a net connection is closed, we don't clear its related resources | 9 | Future changes can add support to offer more features to the guest, |
10 | in connection_track_table, which leads to a memory leak. | 10 | since the use of VirtQueue gives this for free. This is left out at the |
11 | moment for simplicity. | ||
11 | 12 | ||
12 | Let's track the state of net connection, if it is closed, its related | 13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
13 | resources will be cleared up. | 14 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
14 | |||
15 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
16 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | ||
17 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
18 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
19 | --- | 16 | --- |
20 | net/colo.c | 2 +- | 17 | hw/virtio/vhost-shadow-virtqueue.c | 44 ++++++++++++++++++++++++++++++++++++++ |
21 | net/colo.h | 9 ++--- | 18 | hw/virtio/vhost-shadow-virtqueue.h | 2 ++ |
22 | net/filter-rewriter.c | 109 +++++++++++++++++++++++++++++++++++++++++++++----- | 19 | hw/virtio/vhost-vdpa.c | 15 +++++++++++++ |
23 | 3 files changed, 104 insertions(+), 16 deletions(-) | 20 | 3 files changed, 61 insertions(+) |
24 | 21 | ||
25 | diff --git a/net/colo.c b/net/colo.c | 22 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
26 | index XXXXXXX..XXXXXXX 100644 | 23 | index XXXXXXX..XXXXXXX 100644 |
27 | --- a/net/colo.c | 24 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
28 | +++ b/net/colo.c | 25 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
29 | @@ -XXX,XX +XXX,XX @@ Connection *connection_new(ConnectionKey *key) | ||
30 | conn->ip_proto = key->ip_proto; | ||
31 | conn->processing = false; | ||
32 | conn->offset = 0; | ||
33 | - conn->syn_flag = 0; | ||
34 | + conn->tcp_state = TCPS_CLOSED; | ||
35 | conn->pack = 0; | ||
36 | conn->sack = 0; | ||
37 | g_queue_init(&conn->primary_list); | ||
38 | diff --git a/net/colo.h b/net/colo.h | ||
39 | index XXXXXXX..XXXXXXX 100644 | ||
40 | --- a/net/colo.h | ||
41 | +++ b/net/colo.h | ||
42 | @@ -XXX,XX +XXX,XX @@ | 26 | @@ -XXX,XX +XXX,XX @@ |
43 | #include "slirp/slirp.h" | 27 | #include "hw/virtio/vhost-shadow-virtqueue.h" |
44 | #include "qemu/jhash.h" | 28 | |
45 | #include "qemu/timer.h" | 29 | #include "qemu/error-report.h" |
46 | +#include "slirp/tcp.h" | 30 | +#include "qapi/error.h" |
47 | 31 | #include "qemu/main-loop.h" | |
48 | #define HASHTABLE_MAX_SIZE 16384 | 32 | #include "linux-headers/linux/vhost.h" |
49 | 33 | ||
50 | @@ -XXX,XX +XXX,XX @@ typedef struct Connection { | 34 | /** |
51 | uint32_t sack; | 35 | + * Validate the transport device features that both guests can use with the SVQ |
52 | /* offset = secondary_seq - primary_seq */ | 36 | + * and SVQs can use with the device. |
53 | tcp_seq offset; | 37 | + * |
54 | - /* | 38 | + * @dev_features: The features |
55 | - * we use this flag update offset func | 39 | + * @errp: Error pointer |
56 | - * run once in independent tcp connection | 40 | + */ |
57 | - */ | 41 | +bool vhost_svq_valid_features(uint64_t features, Error **errp) |
58 | - int syn_flag; | 42 | +{ |
43 | + bool ok = true; | ||
44 | + uint64_t svq_features = features; | ||
59 | + | 45 | + |
60 | + int tcp_state; /* TCP FSM state */ | 46 | + for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END; |
61 | + tcp_seq fin_ack_seq; /* the seq of 'fin=1,ack=1' */ | 47 | + ++b) { |
62 | } Connection; | 48 | + switch (b) { |
63 | 49 | + case VIRTIO_F_ANY_LAYOUT: | |
64 | uint32_t connection_key_hash(const void *opaque); | 50 | + continue; |
65 | diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c | ||
66 | index XXXXXXX..XXXXXXX 100644 | ||
67 | --- a/net/filter-rewriter.c | ||
68 | +++ b/net/filter-rewriter.c | ||
69 | @@ -XXX,XX +XXX,XX @@ static int is_tcp_packet(Packet *pkt) | ||
70 | } | ||
71 | |||
72 | /* handle tcp packet from primary guest */ | ||
73 | -static int handle_primary_tcp_pkt(NetFilterState *nf, | ||
74 | +static int handle_primary_tcp_pkt(RewriterState *rf, | ||
75 | Connection *conn, | ||
76 | - Packet *pkt) | ||
77 | + Packet *pkt, ConnectionKey *key) | ||
78 | { | ||
79 | struct tcphdr *tcp_pkt; | ||
80 | |||
81 | @@ -XXX,XX +XXX,XX @@ static int handle_primary_tcp_pkt(NetFilterState *nf, | ||
82 | trace_colo_filter_rewriter_conn_offset(conn->offset); | ||
83 | } | ||
84 | |||
85 | + if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN)) && | ||
86 | + conn->tcp_state == TCPS_SYN_SENT) { | ||
87 | + conn->tcp_state = TCPS_ESTABLISHED; | ||
88 | + } | ||
89 | + | 51 | + |
90 | if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) { | 52 | + case VIRTIO_F_ACCESS_PLATFORM: |
91 | /* | 53 | + /* SVQ trust in the host's IOMMU to translate addresses */ |
92 | * we use this flag update offset func | 54 | + case VIRTIO_F_VERSION_1: |
93 | * run once in independent tcp connection | 55 | + /* SVQ trust that the guest vring is little endian */ |
94 | */ | 56 | + if (!(svq_features & BIT_ULL(b))) { |
95 | - conn->syn_flag = 1; | 57 | + svq_features |= BIT_ULL(b); |
96 | + conn->tcp_state = TCPS_SYN_RECEIVED; | 58 | + ok = false; |
97 | } | 59 | + } |
98 | 60 | + continue; | |
99 | if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK)) { | ||
100 | - if (conn->syn_flag) { | ||
101 | + if (conn->tcp_state == TCPS_SYN_RECEIVED) { | ||
102 | /* | ||
103 | * offset = secondary_seq - primary seq | ||
104 | * ack packet sent by guest from primary node, | ||
105 | * so we use th_ack - 1 get primary_seq | ||
106 | */ | ||
107 | conn->offset -= (ntohl(tcp_pkt->th_ack) - 1); | ||
108 | - conn->syn_flag = 0; | ||
109 | + conn->tcp_state = TCPS_ESTABLISHED; | ||
110 | } | ||
111 | if (conn->offset) { | ||
112 | /* handle packets to the secondary from the primary */ | ||
113 | @@ -XXX,XX +XXX,XX @@ static int handle_primary_tcp_pkt(NetFilterState *nf, | ||
114 | net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len, | ||
115 | pkt->size - pkt->vnet_hdr_len); | ||
116 | } | ||
117 | + | 61 | + |
118 | + /* | 62 | + default: |
119 | + * Passive close step 3 | 63 | + if (svq_features & BIT_ULL(b)) { |
120 | + */ | 64 | + svq_features &= ~BIT_ULL(b); |
121 | + if ((conn->tcp_state == TCPS_LAST_ACK) && | 65 | + ok = false; |
122 | + (ntohl(tcp_pkt->th_ack) == (conn->fin_ack_seq + 1))) { | 66 | + } |
123 | + conn->tcp_state = TCPS_CLOSED; | ||
124 | + g_hash_table_remove(rf->connection_track_table, key); | ||
125 | + } | 67 | + } |
126 | + } | 68 | + } |
127 | + | 69 | + |
128 | + if ((tcp_pkt->th_flags & TH_FIN) == TH_FIN) { | 70 | + if (!ok) { |
129 | + /* | 71 | + error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64 |
130 | + * Passive close. | 72 | + ", ok: 0x%"PRIx64, features, svq_features); |
131 | + * Step 1: | 73 | + } |
132 | + * The *server* side of this connect is VM, *client* tries to close | 74 | + return ok; |
133 | + * the connection. We will into CLOSE_WAIT status. | 75 | +} |
134 | + * | ||
135 | + * Step 2: | ||
136 | + * In this step we will into LAST_ACK status. | ||
137 | + * | ||
138 | + * We got 'fin=1, ack=1' packet from server side, we need to | ||
139 | + * record the seq of 'fin=1, ack=1' packet. | ||
140 | + * | ||
141 | + * Step 3: | ||
142 | + * We got 'ack=1' packets from client side, it acks 'fin=1, ack=1' | ||
143 | + * packet from server side. From this point, we can ensure that there | ||
144 | + * will be no packets in the connection, except that, some errors | ||
145 | + * happen between the path of 'filter object' and vNIC, if this rare | ||
146 | + * case really happen, we can still create a new connection, | ||
147 | + * So it is safe to remove the connection from connection_track_table. | ||
148 | + * | ||
149 | + */ | ||
150 | + if (conn->tcp_state == TCPS_ESTABLISHED) { | ||
151 | + conn->tcp_state = TCPS_CLOSE_WAIT; | ||
152 | + } | ||
153 | + | 76 | + |
154 | + /* | 77 | +/** |
155 | + * Active close step 2. | 78 | * Forward guest notifications. |
156 | + */ | 79 | * |
157 | + if (conn->tcp_state == TCPS_FIN_WAIT_1) { | 80 | * @n: guest kick event notifier, the one that guest set to notify svq. |
158 | + conn->tcp_state = TCPS_TIME_WAIT; | 81 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h |
159 | + /* | 82 | index XXXXXXX..XXXXXXX 100644 |
160 | + * For simplify implementation, we needn't wait 2MSL time | 83 | --- a/hw/virtio/vhost-shadow-virtqueue.h |
161 | + * in filter rewriter. Because guest kernel will track the | 84 | +++ b/hw/virtio/vhost-shadow-virtqueue.h |
162 | + * TCP status and wait 2MSL time, if client resend the FIN | 85 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { |
163 | + * packet, guest will apply the last ACK too. | 86 | EventNotifier svq_call; |
164 | + */ | 87 | } VhostShadowVirtqueue; |
165 | + conn->tcp_state = TCPS_CLOSED; | 88 | |
166 | + g_hash_table_remove(rf->connection_track_table, key); | 89 | +bool vhost_svq_valid_features(uint64_t features, Error **errp); |
167 | + } | 90 | + |
91 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
92 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
93 | |||
94 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
95 | index XXXXXXX..XXXXXXX 100644 | ||
96 | --- a/hw/virtio/vhost-vdpa.c | ||
97 | +++ b/hw/virtio/vhost-vdpa.c | ||
98 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
99 | Error **errp) | ||
100 | { | ||
101 | g_autoptr(GPtrArray) shadow_vqs = NULL; | ||
102 | + uint64_t dev_features, svq_features; | ||
103 | + int r; | ||
104 | + bool ok; | ||
105 | |||
106 | if (!v->shadow_vqs_enabled) { | ||
107 | return 0; | ||
168 | } | 108 | } |
169 | 109 | ||
170 | return 0; | 110 | + r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features); |
171 | } | 111 | + if (r != 0) { |
172 | 112 | + error_setg_errno(errp, -r, "Can't get vdpa device features"); | |
173 | /* handle tcp packet from secondary guest */ | 113 | + return r; |
174 | -static int handle_secondary_tcp_pkt(NetFilterState *nf, | ||
175 | +static int handle_secondary_tcp_pkt(RewriterState *rf, | ||
176 | Connection *conn, | ||
177 | - Packet *pkt) | ||
178 | + Packet *pkt, ConnectionKey *key) | ||
179 | { | ||
180 | struct tcphdr *tcp_pkt; | ||
181 | |||
182 | @@ -XXX,XX +XXX,XX @@ static int handle_secondary_tcp_pkt(NetFilterState *nf, | ||
183 | trace_colo_filter_rewriter_conn_offset(conn->offset); | ||
184 | } | ||
185 | |||
186 | - if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) { | ||
187 | + if (conn->tcp_state == TCPS_SYN_RECEIVED && | ||
188 | + ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) { | ||
189 | /* | ||
190 | * save offset = secondary_seq and then | ||
191 | * in handle_primary_tcp_pkt make offset | ||
192 | @@ -XXX,XX +XXX,XX @@ static int handle_secondary_tcp_pkt(NetFilterState *nf, | ||
193 | conn->offset = ntohl(tcp_pkt->th_seq); | ||
194 | } | ||
195 | |||
196 | + /* VM active connect */ | ||
197 | + if (conn->tcp_state == TCPS_CLOSED && | ||
198 | + ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) { | ||
199 | + conn->tcp_state = TCPS_SYN_SENT; | ||
200 | + } | 114 | + } |
201 | + | 115 | + |
202 | if ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK) { | 116 | + svq_features = dev_features; |
203 | /* Only need to adjust seq while offset is Non-zero */ | 117 | + ok = vhost_svq_valid_features(svq_features, errp); |
204 | if (conn->offset) { | 118 | + if (unlikely(!ok)) { |
205 | @@ -XXX,XX +XXX,XX @@ static int handle_secondary_tcp_pkt(NetFilterState *nf, | 119 | + return -1; |
206 | } | ||
207 | } | ||
208 | |||
209 | + /* | ||
210 | + * Passive close step 2: | ||
211 | + */ | ||
212 | + if (conn->tcp_state == TCPS_CLOSE_WAIT && | ||
213 | + (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) { | ||
214 | + conn->fin_ack_seq = ntohl(tcp_pkt->th_seq); | ||
215 | + conn->tcp_state = TCPS_LAST_ACK; | ||
216 | + } | 120 | + } |
217 | + | 121 | + |
218 | + /* | 122 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); |
219 | + * Active close | 123 | for (unsigned n = 0; n < hdev->nvqs; ++n) { |
220 | + * | 124 | g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); |
221 | + * Step 1: | ||
222 | + * The *server* side of this connect is VM, *server* tries to close | ||
223 | + * the connection. | ||
224 | + * | ||
225 | + * Step 2: | ||
226 | + * We will into CLOSE_WAIT status. | ||
227 | + * We simplify the TCPS_FIN_WAIT_2, TCPS_TIME_WAIT and | ||
228 | + * CLOSING status. | ||
229 | + */ | ||
230 | + if (conn->tcp_state == TCPS_ESTABLISHED && | ||
231 | + (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == TH_FIN) { | ||
232 | + conn->tcp_state = TCPS_FIN_WAIT_1; | ||
233 | + } | ||
234 | + | ||
235 | return 0; | ||
236 | } | ||
237 | |||
238 | @@ -XXX,XX +XXX,XX @@ static ssize_t colo_rewriter_receive_iov(NetFilterState *nf, | ||
239 | |||
240 | if (sender == nf->netdev) { | ||
241 | /* NET_FILTER_DIRECTION_TX */ | ||
242 | - if (!handle_primary_tcp_pkt(nf, conn, pkt)) { | ||
243 | + if (!handle_primary_tcp_pkt(s, conn, pkt, &key)) { | ||
244 | qemu_net_queue_send(s->incoming_queue, sender, 0, | ||
245 | (const uint8_t *)pkt->data, pkt->size, NULL); | ||
246 | packet_destroy(pkt, NULL); | ||
247 | @@ -XXX,XX +XXX,XX @@ static ssize_t colo_rewriter_receive_iov(NetFilterState *nf, | ||
248 | } | ||
249 | } else { | ||
250 | /* NET_FILTER_DIRECTION_RX */ | ||
251 | - if (!handle_secondary_tcp_pkt(nf, conn, pkt)) { | ||
252 | + if (!handle_secondary_tcp_pkt(s, conn, pkt, &key)) { | ||
253 | qemu_net_queue_send(s->incoming_queue, sender, 0, | ||
254 | (const uint8_t *)pkt->data, pkt->size, NULL); | ||
255 | packet_destroy(pkt, NULL); | ||
256 | -- | 125 | -- |
257 | 2.5.0 | 126 | 2.7.4 |
258 | 127 | ||
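The filter-rewriter change above replaces the old syn_flag with per-connection TCP state so that finished connections can finally be dropped from connection_track_table. The following standalone sketch models only the passive-close path that the new comments in handle_primary_tcp_pkt()/handle_secondary_tcp_pkt() describe; the state names and helpers are local to the example and it is not QEMU code.

    #include <stdbool.h>
    #include <stdio.h>

    enum state { ST_ESTABLISHED, ST_CLOSE_WAIT, ST_LAST_ACK, ST_CLOSED };

    struct conn {
        enum state st;
        unsigned fin_ack_seq;  /* seq of the guest's FIN/ACK segment   */
        bool tracked;          /* still present in the tracking table  */
    };

    /* Direction handled by handle_primary_tcp_pkt() in the patch. */
    static void primary_pkt(struct conn *c, bool fin, bool ack, unsigned ack_seq)
    {
        if (fin && c->st == ST_ESTABLISHED) {
            c->st = ST_CLOSE_WAIT;               /* step 1: peer sent FIN      */
        }
        if (ack && c->st == ST_LAST_ACK && ack_seq == c->fin_ack_seq + 1) {
            c->st = ST_CLOSED;                   /* step 3: final ACK observed */
            c->tracked = false;                  /* drop from the table        */
        }
    }

    /* Direction handled by handle_secondary_tcp_pkt() in the patch. */
    static void secondary_pkt(struct conn *c, bool fin, bool ack, unsigned seq)
    {
        if (fin && ack && c->st == ST_CLOSE_WAIT) {
            c->fin_ack_seq = seq;                /* step 2: guest's FIN/ACK    */
            c->st = ST_LAST_ACK;
        }
    }

    int main(void)
    {
        struct conn c = { ST_ESTABLISHED, 0, true };

        primary_pkt(&c, true, false, 0);         /* peer closes                */
        secondary_pkt(&c, true, true, 1000);     /* guest answers FIN/ACK      */
        primary_pkt(&c, false, true, 1001);      /* peer acks, seq 1000 + 1    */

        printf("closed=%d still tracked=%d\n", c.st == ST_CLOSED, c.tracked);
        return 0;
    }

Active close and the simplified 2MSL handling mentioned in the message follow the same pattern, using the FIN_WAIT states added in the patch.
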
1 | From: Zhang Chen <zhangckid@gmail.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The filter needs to process checkpoint/failover events or any | 3 | It reports the shadow virtqueue address from qemu virtual address space. |
4 | other event passed by COLO frame. | ||
5 | 4 | ||
6 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | 5 | Since this will be different from the guest's vaddr, but the device can |
7 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | 6 | access it, SVQ takes special care about its alignment & lack of garbage |
8 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | 7 | data. It assumes that IOMMU will work in host_page_size ranges for that. |
8 | |||
9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
10 | --- | 12 | --- |
11 | include/net/filter.h | 5 +++++ | 13 | hw/virtio/vhost-shadow-virtqueue.c | 29 +++++++++++++++++++++++++++++ |
12 | net/filter.c | 17 +++++++++++++++++ | 14 | hw/virtio/vhost-shadow-virtqueue.h | 9 +++++++++ |
13 | net/net.c | 19 +++++++++++++++++++ | 15 | 2 files changed, 38 insertions(+) |
14 | 3 files changed, 41 insertions(+) | ||
15 | 16 | ||
16 | diff --git a/include/net/filter.h b/include/net/filter.h | 17 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
17 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
18 | --- a/include/net/filter.h | 19 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
19 | +++ b/include/net/filter.h | 20 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
20 | @@ -XXX,XX +XXX,XX @@ typedef ssize_t (FilterReceiveIOV)(NetFilterState *nc, | 21 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) |
21 | |||
22 | typedef void (FilterStatusChanged) (NetFilterState *nf, Error **errp); | ||
23 | |||
24 | +typedef void (FilterHandleEvent) (NetFilterState *nf, int event, Error **errp); | ||
25 | + | ||
26 | typedef struct NetFilterClass { | ||
27 | ObjectClass parent_class; | ||
28 | |||
29 | @@ -XXX,XX +XXX,XX @@ typedef struct NetFilterClass { | ||
30 | FilterSetup *setup; | ||
31 | FilterCleanup *cleanup; | ||
32 | FilterStatusChanged *status_changed; | ||
33 | + FilterHandleEvent *handle_event; | ||
34 | /* mandatory */ | ||
35 | FilterReceiveIOV *receive_iov; | ||
36 | } NetFilterClass; | ||
37 | @@ -XXX,XX +XXX,XX @@ ssize_t qemu_netfilter_pass_to_next(NetClientState *sender, | ||
38 | int iovcnt, | ||
39 | void *opaque); | ||
40 | |||
41 | +void colo_notify_filters_event(int event, Error **errp); | ||
42 | + | ||
43 | #endif /* QEMU_NET_FILTER_H */ | ||
44 | diff --git a/net/filter.c b/net/filter.c | ||
45 | index XXXXXXX..XXXXXXX 100644 | ||
46 | --- a/net/filter.c | ||
47 | +++ b/net/filter.c | ||
48 | @@ -XXX,XX +XXX,XX @@ | ||
49 | #include "net/vhost_net.h" | ||
50 | #include "qom/object_interfaces.h" | ||
51 | #include "qemu/iov.h" | ||
52 | +#include "net/colo.h" | ||
53 | +#include "migration/colo.h" | ||
54 | |||
55 | static inline bool qemu_can_skip_netfilter(NetFilterState *nf) | ||
56 | { | ||
57 | @@ -XXX,XX +XXX,XX @@ static void netfilter_finalize(Object *obj) | ||
58 | g_free(nf->netdev_id); | ||
59 | } | 22 | } |
60 | 23 | ||
61 | +static void default_handle_event(NetFilterState *nf, int event, Error **errp) | 24 | /** |
25 | + * Get the shadow vq vring address. | ||
26 | + * @svq: Shadow virtqueue | ||
27 | + * @addr: Destination to store address | ||
28 | + */ | ||
29 | +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
30 | + struct vhost_vring_addr *addr) | ||
62 | +{ | 31 | +{ |
63 | + switch (event) { | 32 | + addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc; |
64 | + case COLO_EVENT_CHECKPOINT: | 33 | + addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail; |
65 | + break; | 34 | + addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used; |
66 | + case COLO_EVENT_FAILOVER: | ||
67 | + object_property_set_str(OBJECT(nf), "off", "status", errp); | ||
68 | + break; | ||
69 | + default: | ||
70 | + break; | ||
71 | + } | ||
72 | +} | 35 | +} |
73 | + | 36 | + |
74 | static void netfilter_class_init(ObjectClass *oc, void *data) | 37 | +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq) |
75 | { | ||
76 | UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); | ||
77 | + NetFilterClass *nfc = NETFILTER_CLASS(oc); | ||
78 | |||
79 | ucc->complete = netfilter_complete; | ||
80 | + nfc->handle_event = default_handle_event; | ||
81 | } | ||
82 | |||
83 | static const TypeInfo netfilter_info = { | ||
84 | diff --git a/net/net.c b/net/net.c | ||
85 | index XXXXXXX..XXXXXXX 100644 | ||
86 | --- a/net/net.c | ||
87 | +++ b/net/net.c | ||
88 | @@ -XXX,XX +XXX,XX @@ void hmp_info_network(Monitor *mon, const QDict *qdict) | ||
89 | } | ||
90 | } | ||
91 | |||
92 | +void colo_notify_filters_event(int event, Error **errp) | ||
93 | +{ | 38 | +{ |
94 | + NetClientState *nc; | 39 | + size_t desc_size = sizeof(vring_desc_t) * svq->vring.num; |
95 | + NetFilterState *nf; | 40 | + size_t avail_size = offsetof(vring_avail_t, ring) + |
96 | + NetFilterClass *nfc = NULL; | 41 | + sizeof(uint16_t) * svq->vring.num; |
97 | + Error *local_err = NULL; | ||
98 | + | 42 | + |
99 | + QTAILQ_FOREACH(nc, &net_clients, next) { | 43 | + return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size); |
100 | + QTAILQ_FOREACH(nf, &nc->filters, next) { | ||
101 | + nfc = NETFILTER_GET_CLASS(OBJECT(nf)); | ||
102 | + nfc->handle_event(nf, event, &local_err); | ||
103 | + if (local_err) { | ||
104 | + error_propagate(errp, local_err); | ||
105 | + return; | ||
106 | + } | ||
107 | + } | ||
108 | + } | ||
109 | +} | 44 | +} |
110 | + | 45 | + |
111 | void qmp_set_link(const char *name, bool up, Error **errp) | 46 | +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq) |
112 | { | 47 | +{ |
113 | NetClientState *ncs[MAX_QUEUE_NUM]; | 48 | + size_t used_size = offsetof(vring_used_t, ring) + |
49 | + sizeof(vring_used_elem_t) * svq->vring.num; | ||
50 | + return ROUND_UP(used_size, qemu_real_host_page_size); | ||
51 | +} | ||
52 | + | ||
53 | +/** | ||
54 | * Set a new file descriptor for the guest to kick the SVQ and notify for avail | ||
55 | * | ||
56 | * @svq: The svq | ||
57 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
58 | index XXXXXXX..XXXXXXX 100644 | ||
59 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
60 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
61 | @@ -XXX,XX +XXX,XX @@ | ||
62 | #define VHOST_SHADOW_VIRTQUEUE_H | ||
63 | |||
64 | #include "qemu/event_notifier.h" | ||
65 | +#include "hw/virtio/virtio.h" | ||
66 | +#include "standard-headers/linux/vhost_types.h" | ||
67 | |||
68 | /* Shadow virtqueue to relay notifications */ | ||
69 | typedef struct VhostShadowVirtqueue { | ||
70 | + /* Shadow vring */ | ||
71 | + struct vring vring; | ||
72 | + | ||
73 | /* Shadow kick notifier, sent to vhost */ | ||
74 | EventNotifier hdev_kick; | ||
75 | /* Shadow call notifier, sent to vhost */ | ||
76 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
77 | |||
78 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
79 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
80 | +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
81 | + struct vhost_vring_addr *addr); | ||
82 | +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); | ||
83 | +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); | ||
84 | |||
85 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
86 | |||
114 | -- | 87 | -- |
115 | 2.5.0 | 88 | 2.7.4 |
116 | 89 | ||
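vhost_svq_driver_area_size() and vhost_svq_device_area_size() above round the descriptor table plus avail ring, and the used ring, up to whole host pages, matching the commit message's note that the IOMMU is assumed to work in host_page_size ranges. The same arithmetic is shown below as a standalone sketch; the struct layouts are the usual virtio split-ring ones, simplified here, and the 4 KiB page size is only an assumption for the example.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Simplified virtio 1.0 split-ring layouts (flexible array members). */
    struct vring_desc  { uint64_t addr; uint32_t len; uint16_t flags, next; };
    struct vring_avail { uint16_t flags, idx, ring[]; };
    struct vring_used_elem { uint32_t id, len; };
    struct vring_used  { uint16_t flags, idx; struct vring_used_elem ring[]; };

    #define PAGE_SIZE 4096u   /* assumed host page size for the example */

    static size_t round_up(size_t n, size_t align)
    {
        return (n + align - 1) / align * align;
    }

    static size_t driver_area_size(unsigned num)
    {
        size_t desc  = sizeof(struct vring_desc) * num;
        size_t avail = offsetof(struct vring_avail, ring) + sizeof(uint16_t) * num;
        return round_up(desc + avail, PAGE_SIZE);
    }

    static size_t device_area_size(unsigned num)
    {
        size_t used = offsetof(struct vring_used, ring) +
                      sizeof(struct vring_used_elem) * num;
        return round_up(used, PAGE_SIZE);
    }

    int main(void)
    {
        for (unsigned num = 64; num <= 1024; num *= 4) {
            printf("num=%4u driver=%zu device=%zu\n",
                   num, driver_area_size(num), device_area_size(num));
        }
        return 0;
    }

For a 256-entry ring this puts the driver area at two pages (a 4096-byte descriptor table plus a 516-byte avail ring) and the used area at one page.
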
1 | From: Zhang Chen <zhangckid@gmail.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | We should not load PVM's state directly into SVM, because there may be | 3 | First half of the buffers forwarding part, preparing vhost-vdpa |
4 | errors happening while SVM is receiving data, which would break SVM. | 4 | callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so |
5 | this is effectively dead code at the moment, but it helps to reduce | ||
6 | patch size. | ||
5 | 7 | ||
6 | We need to ensure receving all data before load the state into SVM. We use | 8 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
7 | an extra memory to cache these data (PVM's ram). The ram cache in secondary side | 9 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
8 | is initially the same as SVM/PVM's memory. And in the process of checkpoint, | ||
9 | we cache the dirty pages of PVM into this ram cache firstly, so this ram cache | ||
10 | always the same as PVM's memory at every checkpoint, then we flush this cached ram | ||
11 | to SVM after we receive all PVM's state. | ||
12 | |||
13 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
14 | Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com> | ||
15 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | ||
16 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
17 | Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> | ||
18 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 10 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
19 | --- | 11 | --- |
20 | include/exec/ram_addr.h | 1 + | 12 | hw/virtio/vhost-vdpa.c | 48 +++++++++++++++++++++++++++++++++++++++++------- |
21 | migration/migration.c | 7 +++++ | 13 | 1 file changed, 41 insertions(+), 7 deletions(-) |
22 | migration/ram.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++-- | ||
23 | migration/ram.h | 4 +++ | ||
24 | migration/savevm.c | 2 +- | ||
25 | 5 files changed, 94 insertions(+), 3 deletions(-) | ||
26 | 14 | ||
27 | diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h | 15 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
28 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
29 | --- a/include/exec/ram_addr.h | 17 | --- a/hw/virtio/vhost-vdpa.c |
30 | +++ b/include/exec/ram_addr.h | 18 | +++ b/hw/virtio/vhost-vdpa.c |
31 | @@ -XXX,XX +XXX,XX @@ struct RAMBlock { | 19 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, |
32 | struct rcu_head rcu; | 20 | return ret; |
33 | struct MemoryRegion *mr; | 21 | } |
34 | uint8_t *host; | 22 | |
35 | + uint8_t *colo_cache; /* For colo, VM's ram cache */ | 23 | +static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev, |
36 | ram_addr_t offset; | 24 | + struct vhost_vring_state *ring) |
37 | ram_addr_t used_length; | 25 | +{ |
38 | ram_addr_t max_length; | 26 | + trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); |
39 | diff --git a/migration/migration.c b/migration/migration.c | 27 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); |
40 | index XXXXXXX..XXXXXXX 100644 | 28 | +} |
41 | --- a/migration/migration.c | ||
42 | +++ b/migration/migration.c | ||
43 | @@ -XXX,XX +XXX,XX @@ static void process_incoming_migration_co(void *opaque) | ||
44 | exit(EXIT_FAILURE); | ||
45 | } | ||
46 | |||
47 | + if (colo_init_ram_cache() < 0) { | ||
48 | + error_report("Init ram cache failed"); | ||
49 | + exit(EXIT_FAILURE); | ||
50 | + } | ||
51 | + | 29 | + |
52 | qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming", | 30 | static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, |
53 | colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE); | 31 | struct vhost_vring_file *file) |
54 | mis->have_colo_incoming_thread = true; | 32 | { |
55 | @@ -XXX,XX +XXX,XX @@ static void process_incoming_migration_co(void *opaque) | 33 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, |
56 | 34 | return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | |
57 | /* Wait checkpoint incoming thread exit before free resource */ | ||
58 | qemu_thread_join(&mis->colo_incoming_thread); | ||
59 | + /* We hold the global iothread lock, so it is safe here */ | ||
60 | + colo_release_ram_cache(); | ||
61 | } | ||
62 | |||
63 | if (ret < 0) { | ||
64 | diff --git a/migration/ram.c b/migration/ram.c | ||
65 | index XXXXXXX..XXXXXXX 100644 | ||
66 | --- a/migration/ram.c | ||
67 | +++ b/migration/ram.c | ||
68 | @@ -XXX,XX +XXX,XX @@ static inline void *host_from_ram_block_offset(RAMBlock *block, | ||
69 | return block->host + offset; | ||
70 | } | 35 | } |
71 | 36 | ||
72 | +static inline void *colo_cache_from_block_offset(RAMBlock *block, | 37 | +static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, |
73 | + ram_addr_t offset) | 38 | + struct vhost_vring_addr *addr) |
74 | +{ | 39 | +{ |
75 | + if (!offset_in_ramblock(block, offset)) { | 40 | + trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, |
76 | + return NULL; | 41 | + addr->desc_user_addr, addr->used_user_addr, |
77 | + } | 42 | + addr->avail_user_addr, |
78 | + if (!block->colo_cache) { | 43 | + addr->log_guest_addr); |
79 | + error_report("%s: colo_cache is NULL in block :%s", | 44 | + |
80 | + __func__, block->idstr); | 45 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); |
81 | + return NULL; | 46 | + |
82 | + } | ||
83 | + return block->colo_cache + offset; | ||
84 | +} | 47 | +} |
85 | + | 48 | + |
86 | /** | 49 | /** |
87 | * ram_handle_compressed: handle the zero page case | 50 | * Set the shadow virtqueue descriptors to the device |
88 | * | 51 | * |
89 | @@ -XXX,XX +XXX,XX @@ static void decompress_data_with_multi_threads(QEMUFile *f, | 52 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, |
90 | qemu_mutex_unlock(&decomp_done_lock); | 53 | static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev, |
91 | } | 54 | struct vhost_vring_addr *addr) |
92 | 55 | { | |
93 | +/* | 56 | - trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, |
94 | + * colo cache: this is for secondary VM, we cache the whole | 57 | - addr->desc_user_addr, addr->used_user_addr, |
95 | + * memory of the secondary VM, it is need to hold the global lock | 58 | - addr->avail_user_addr, |
96 | + * to call this helper. | 59 | - addr->log_guest_addr); |
97 | + */ | 60 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); |
98 | +int colo_init_ram_cache(void) | 61 | + struct vhost_vdpa *v = dev->opaque; |
99 | +{ | ||
100 | + RAMBlock *block; | ||
101 | + | 62 | + |
102 | + rcu_read_lock(); | 63 | + if (v->shadow_vqs_enabled) { |
103 | + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { | 64 | + /* |
104 | + block->colo_cache = qemu_anon_ram_alloc(block->used_length, | 65 | + * Device vring addr was set at device start. SVQ base is handled by |
105 | + NULL, | 66 | + * VirtQueue code. |
106 | + false); | 67 | + */ |
107 | + if (!block->colo_cache) { | 68 | + return 0; |
108 | + error_report("%s: Can't alloc memory for COLO cache of block %s," | ||
109 | + "size 0x" RAM_ADDR_FMT, __func__, block->idstr, | ||
110 | + block->used_length); | ||
111 | + goto out_locked; | ||
112 | + } | ||
113 | + memcpy(block->colo_cache, block->host, block->used_length); | ||
114 | + } | ||
115 | + rcu_read_unlock(); | ||
116 | + return 0; | ||
117 | + | ||
118 | +out_locked: | ||
119 | + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { | ||
120 | + if (block->colo_cache) { | ||
121 | + qemu_anon_ram_free(block->colo_cache, block->used_length); | ||
122 | + block->colo_cache = NULL; | ||
123 | + } | ||
124 | + } | 69 | + } |
125 | + | 70 | + |
126 | + rcu_read_unlock(); | 71 | + return vhost_vdpa_set_vring_dev_addr(dev, addr); |
127 | + return -errno; | 72 | } |
128 | +} | 73 | |
74 | static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, | ||
75 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, | ||
76 | static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, | ||
77 | struct vhost_vring_state *ring) | ||
78 | { | ||
79 | - trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); | ||
80 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); | ||
81 | + struct vhost_vdpa *v = dev->opaque; | ||
129 | + | 82 | + |
130 | +/* It is need to hold the global lock to call this helper */ | 83 | + if (v->shadow_vqs_enabled) { |
131 | +void colo_release_ram_cache(void) | 84 | + /* |
132 | +{ | 85 | + * Device vring base was set at device start. SVQ base is handled by |
133 | + RAMBlock *block; | 86 | + * VirtQueue code. |
87 | + */ | ||
88 | + return 0; | ||
89 | + } | ||
134 | + | 90 | + |
135 | + rcu_read_lock(); | 91 | + return vhost_vdpa_set_dev_vring_base(dev, ring); |
136 | + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { | ||
137 | + if (block->colo_cache) { | ||
138 | + qemu_anon_ram_free(block->colo_cache, block->used_length); | ||
139 | + block->colo_cache = NULL; | ||
140 | + } | ||
141 | + } | ||
142 | + rcu_read_unlock(); | ||
143 | +} | ||
144 | + | ||
145 | /** | ||
146 | * ram_load_setup: Setup RAM for migration incoming side | ||
147 | * | ||
148 | @@ -XXX,XX +XXX,XX @@ static int ram_load_setup(QEMUFile *f, void *opaque) | ||
149 | |||
150 | xbzrle_load_setup(); | ||
151 | ramblock_recv_map_init(); | ||
152 | + | ||
153 | return 0; | ||
154 | } | 92 | } |
155 | 93 | ||
156 | @@ -XXX,XX +XXX,XX @@ static int ram_load_cleanup(void *opaque) | 94 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, |
157 | g_free(rb->receivedmap); | ||
158 | rb->receivedmap = NULL; | ||
159 | } | ||
160 | + | ||
161 | return 0; | ||
162 | } | ||
163 | |||
164 | @@ -XXX,XX +XXX,XX @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) | ||
165 | RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { | ||
166 | RAMBlock *block = ram_block_from_stream(f, flags); | ||
167 | |||
168 | - host = host_from_ram_block_offset(block, addr); | ||
169 | + /* | ||
170 | + * After going into COLO, we should load the Page into colo_cache. | ||
171 | + */ | ||
172 | + if (migration_incoming_in_colo_state()) { | ||
173 | + host = colo_cache_from_block_offset(block, addr); | ||
174 | + } else { | ||
175 | + host = host_from_ram_block_offset(block, addr); | ||
176 | + } | ||
177 | if (!host) { | ||
178 | error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); | ||
179 | ret = -EINVAL; | ||
180 | break; | ||
181 | } | ||
182 | - ramblock_recv_bitmap_set(block, host); | ||
183 | + | ||
184 | + if (!migration_incoming_in_colo_state()) { | ||
185 | + ramblock_recv_bitmap_set(block, host); | ||
186 | + } | ||
187 | + | ||
188 | trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); | ||
189 | } | ||
190 | |||
191 | diff --git a/migration/ram.h b/migration/ram.h | ||
192 | index XXXXXXX..XXXXXXX 100644 | ||
193 | --- a/migration/ram.h | ||
194 | +++ b/migration/ram.h | ||
195 | @@ -XXX,XX +XXX,XX @@ int64_t ramblock_recv_bitmap_send(QEMUFile *file, | ||
196 | const char *block_name); | ||
197 | int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb); | ||
198 | |||
199 | +/* ram cache */ | ||
200 | +int colo_init_ram_cache(void); | ||
201 | +void colo_release_ram_cache(void); | ||
202 | + | ||
203 | #endif | ||
204 | diff --git a/migration/savevm.c b/migration/savevm.c | ||
205 | index XXXXXXX..XXXXXXX 100644 | ||
206 | --- a/migration/savevm.c | ||
207 | +++ b/migration/savevm.c | ||
208 | @@ -XXX,XX +XXX,XX @@ static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis, | ||
209 | static int loadvm_process_enable_colo(MigrationIncomingState *mis) | ||
210 | { | ||
211 | migration_incoming_enable_colo(); | ||
212 | - return 0; | ||
213 | + return colo_init_ram_cache(); | ||
214 | } | ||
215 | |||
216 | /* | ||
217 | -- | 95 | -- |
218 | 2.5.0 | 96 | 2.7.4 |
219 | 97 | ||
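The ram cache patch above makes the secondary stage every incoming page into a per-RAMBlock colo_cache instead of writing it straight into live guest memory; per the commit message the cached ram is only flushed to the SVM once all of PVM's state has been received. Below is a minimal standalone model of that stage-then-flush idea, with a plain byte buffer standing in for a RAMBlock and invented helper names; it is not QEMU code.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define BLOCK_SIZE 8

    struct ram_block {
        char host[BLOCK_SIZE];   /* live guest memory on the secondary (SVM) */
        char *colo_cache;        /* staging copy for the incoming checkpoint */
    };

    static int cache_init(struct ram_block *b)
    {
        b->colo_cache = malloc(BLOCK_SIZE);
        if (!b->colo_cache) {
            return -1;
        }
        /* Start out identical to live memory, as colo_init_ram_cache() does. */
        memcpy(b->colo_cache, b->host, BLOCK_SIZE);
        return 0;
    }

    /* Incoming checkpoint pages land in the cache, never directly in host. */
    static void load_page(struct ram_block *b, size_t off, char val)
    {
        b->colo_cache[off] = val;
    }

    /* Apply the cache only once the complete checkpoint has arrived. */
    static void flush_cache(struct ram_block *b)
    {
        memcpy(b->host, b->colo_cache, BLOCK_SIZE);
    }

    int main(void)
    {
        struct ram_block b = { "AAAAAAA", NULL };

        if (cache_init(&b) < 0) {
            return 1;
        }
        load_page(&b, 2, 'X');                 /* partial checkpoint        */
        printf("before flush: %s\n", b.host);  /* host still consistent     */
        flush_cache(&b);                       /* checkpoint fully received */
        printf("after flush:  %s\n", b.host);
        free(b.colo_cache);
        return 0;
    }

If an error occurs while the checkpoint is still being received, the live memory has never been touched, which is exactly the failure mode the commit message is guarding against.
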
1 | From: Zhang Chen <zhangckid@gmail.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | It's a good idea to use a notifier to notify the COLO frame of | 3 | Initial version of shadow virtqueue that actually forwards buffers. There |
4 | inconsistent packet comparison results. | 4 | is no IOMMU support at the moment, and that will be addressed in future |
5 | 5 | patches of this series. Since all vhost-vdpa devices use forced IOMMU, | |
6 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | 6 | this means that SVQ is not usable at this point of the series on any |
7 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | 7 | device. |
8 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | 8 | |
9 | For simplicity it only supports modern devices, that expects vring | ||
10 | in little endian, with split ring and no event idx or indirect | ||
11 | descriptors. Support for them will not be added in this series. | ||
12 | |||
13 | It reuses the VirtQueue code for the device part. The driver part is | ||
14 | based on Linux's virtio_ring driver, but with stripped functionality | ||
15 | and optimizations so it's easier to review. | ||
16 | |||
17 | However, forwarding buffers have some particular pieces: One of the most | ||
18 | unexpected ones is that a guest's buffer can expand through more than | ||
19 | one descriptor in SVQ. While this is handled gracefully by qemu's | ||
20 | emulated virtio devices, it may cause unexpected SVQ queue full. This | ||
21 | patch also solves it by checking for this condition at both guest's | ||
22 | kicks and device's calls. The code may be more elegant in the future if | ||
23 | SVQ code runs in its own iocontext. | ||
24 | |||
25 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
26 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 27 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
10 | --- | 28 | --- |
11 | net/colo-compare.c | 37 ++++++++++++++++++++++++++----------- | 29 | hw/virtio/vhost-shadow-virtqueue.c | 352 ++++++++++++++++++++++++++++++++++++- |
12 | net/colo-compare.h | 2 ++ | 30 | hw/virtio/vhost-shadow-virtqueue.h | 26 +++ |
13 | 2 files changed, 28 insertions(+), 11 deletions(-) | 31 | hw/virtio/vhost-vdpa.c | 155 +++++++++++++++- |
14 | 32 | 3 files changed, 522 insertions(+), 11 deletions(-) | |
15 | diff --git a/net/colo-compare.c b/net/colo-compare.c | 33 | |
34 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
16 | index XXXXXXX..XXXXXXX 100644 | 35 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/net/colo-compare.c | 36 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
18 | +++ b/net/colo-compare.c | 37 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
19 | @@ -XXX,XX +XXX,XX @@ | 38 | @@ -XXX,XX +XXX,XX @@ |
20 | #include "sysemu/iothread.h" | 39 | #include "qemu/error-report.h" |
21 | #include "net/colo-compare.h" | 40 | #include "qapi/error.h" |
22 | #include "migration/colo.h" | 41 | #include "qemu/main-loop.h" |
23 | +#include "migration/migration.h" | 42 | +#include "qemu/log.h" |
24 | 43 | +#include "qemu/memalign.h" | |
25 | #define TYPE_COLO_COMPARE "colo-compare" | 44 | #include "linux-headers/linux/vhost.h" |
26 | #define COLO_COMPARE(obj) \ | 45 | |
27 | @@ -XXX,XX +XXX,XX @@ | 46 | /** |
28 | static QTAILQ_HEAD(, CompareState) net_compares = | 47 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp) |
29 | QTAILQ_HEAD_INITIALIZER(net_compares); | ||
30 | |||
31 | +static NotifierList colo_compare_notifiers = | ||
32 | + NOTIFIER_LIST_INITIALIZER(colo_compare_notifiers); | ||
33 | + | ||
34 | #define COMPARE_READ_LEN_MAX NET_BUFSIZE | ||
35 | #define MAX_QUEUE_SIZE 1024 | ||
36 | |||
37 | @@ -XXX,XX +XXX,XX @@ static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt, | ||
38 | return false; | ||
39 | } | 48 | } |
40 | 49 | ||
41 | +static void colo_compare_inconsistency_notify(void) | 50 | /** |
42 | +{ | 51 | - * Forward guest notifications. |
43 | + notifier_list_notify(&colo_compare_notifiers, | 52 | + * Number of descriptors that the SVQ can make available from the guest. |
44 | + migrate_get_current()); | 53 | + * |
45 | +} | 54 | + * @svq: The svq |
46 | + | 55 | + */ |
47 | static void colo_compare_tcp(CompareState *s, Connection *conn) | 56 | +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) |
57 | +{ | ||
58 | + return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); | ||
59 | +} | ||
60 | + | ||
61 | +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, | ||
62 | + const struct iovec *iovec, size_t num, | ||
63 | + bool more_descs, bool write) | ||
64 | +{ | ||
65 | + uint16_t i = svq->free_head, last = svq->free_head; | ||
66 | + unsigned n; | ||
67 | + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; | ||
68 | + vring_desc_t *descs = svq->vring.desc; | ||
69 | + | ||
70 | + if (num == 0) { | ||
71 | + return; | ||
72 | + } | ||
73 | + | ||
74 | + for (n = 0; n < num; n++) { | ||
75 | + if (more_descs || (n + 1 < num)) { | ||
76 | + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); | ||
77 | + } else { | ||
78 | + descs[i].flags = flags; | ||
79 | + } | ||
80 | + descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base); | ||
81 | + descs[i].len = cpu_to_le32(iovec[n].iov_len); | ||
82 | + | ||
83 | + last = i; | ||
84 | + i = cpu_to_le16(descs[i].next); | ||
85 | + } | ||
86 | + | ||
87 | + svq->free_head = le16_to_cpu(descs[last].next); | ||
88 | +} | ||
89 | + | ||
90 | +static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
91 | + VirtQueueElement *elem, unsigned *head) | ||
92 | +{ | ||
93 | + unsigned avail_idx; | ||
94 | + vring_avail_t *avail = svq->vring.avail; | ||
95 | + | ||
96 | + *head = svq->free_head; | ||
97 | + | ||
98 | + /* We need some descriptors here */ | ||
99 | + if (unlikely(!elem->out_num && !elem->in_num)) { | ||
100 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
101 | + "Guest provided element with no descriptors"); | ||
102 | + return false; | ||
103 | + } | ||
104 | + | ||
105 | + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0, | ||
106 | + false); | ||
107 | + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); | ||
108 | + | ||
109 | + /* | ||
110 | + * Put the entry in the available array (but don't update avail->idx until | ||
111 | + * they do sync). | ||
112 | + */ | ||
113 | + avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1); | ||
114 | + avail->ring[avail_idx] = cpu_to_le16(*head); | ||
115 | + svq->shadow_avail_idx++; | ||
116 | + | ||
117 | + /* Update the avail index after write the descriptor */ | ||
118 | + smp_wmb(); | ||
119 | + avail->idx = cpu_to_le16(svq->shadow_avail_idx); | ||
120 | + | ||
121 | + return true; | ||
122 | +} | ||
123 | + | ||
124 | +static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) | ||
125 | +{ | ||
126 | + unsigned qemu_head; | ||
127 | + bool ok = vhost_svq_add_split(svq, elem, &qemu_head); | ||
128 | + if (unlikely(!ok)) { | ||
129 | + return false; | ||
130 | + } | ||
131 | + | ||
132 | + svq->ring_id_maps[qemu_head] = elem; | ||
133 | + return true; | ||
134 | +} | ||
135 | + | ||
136 | +static void vhost_svq_kick(VhostShadowVirtqueue *svq) | ||
137 | +{ | ||
138 | + /* | ||
139 | + * We need to expose the available array entries before checking the used | ||
140 | + * flags | ||
141 | + */ | ||
142 | + smp_mb(); | ||
143 | + if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) { | ||
144 | + return; | ||
145 | + } | ||
146 | + | ||
147 | + event_notifier_set(&svq->hdev_kick); | ||
148 | +} | ||
149 | + | ||
150 | +/** | ||
151 | + * Forward available buffers. | ||
152 | + * | ||
153 | + * @svq: Shadow VirtQueue | ||
154 | + * | ||
155 | + * Note that this function does not guarantee that all guest's available | ||
156 | + * buffers are available to the device in SVQ avail ring. The guest may have | ||
157 | + * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in | ||
158 | + * qemu vaddr. | ||
159 | + * | ||
160 | + * If that happens, guest's kick notifications will be disabled until the | ||
161 | + * device uses some buffers. | ||
162 | + */ | ||
163 | +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) | ||
164 | +{ | ||
165 | + /* Clear event notifier */ | ||
166 | + event_notifier_test_and_clear(&svq->svq_kick); | ||
167 | + | ||
168 | + /* Forward to the device as many available buffers as possible */ | ||
169 | + do { | ||
170 | + virtio_queue_set_notification(svq->vq, false); | ||
171 | + | ||
172 | + while (true) { | ||
173 | + VirtQueueElement *elem; | ||
174 | + bool ok; | ||
175 | + | ||
176 | + if (svq->next_guest_avail_elem) { | ||
177 | + elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
178 | + } else { | ||
179 | + elem = virtqueue_pop(svq->vq, sizeof(*elem)); | ||
180 | + } | ||
181 | + | ||
182 | + if (!elem) { | ||
183 | + break; | ||
184 | + } | ||
185 | + | ||
186 | + if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) { | ||
187 | + /* | ||
188 | + * This condition is possible since a contiguous buffer in GPA | ||
189 | + * does not imply a contiguous buffer in qemu's VA | ||
190 | + * scatter-gather segments. If that happens, the buffer exposed | ||
191 | + * to the device needs to be a chain of descriptors at this | ||
192 | + * moment. | ||
193 | + * | ||
194 | + * SVQ cannot hold more available buffers if we are here: | ||
195 | + * queue the current guest descriptor and ignore further kicks | ||
196 | + * until some elements are used. | ||
197 | + */ | ||
198 | + svq->next_guest_avail_elem = elem; | ||
199 | + return; | ||
200 | + } | ||
201 | + | ||
202 | + ok = vhost_svq_add(svq, elem); | ||
203 | + if (unlikely(!ok)) { | ||
204 | + /* VQ is broken, just return and ignore any other kicks */ | ||
205 | + return; | ||
206 | + } | ||
207 | + vhost_svq_kick(svq); | ||
208 | + } | ||
209 | + | ||
210 | + virtio_queue_set_notification(svq->vq, true); | ||
211 | + } while (!virtio_queue_empty(svq->vq)); | ||
212 | +} | ||
213 | + | ||
214 | +/** | ||
215 | + * Handle guest's kick. | ||
216 | * | ||
217 | * @n: guest kick event notifier, the one that guest set to notify svq. | ||
218 | */ | ||
219 | -static void vhost_handle_guest_kick(EventNotifier *n) | ||
220 | +static void vhost_handle_guest_kick_notifier(EventNotifier *n) | ||
48 | { | 221 | { |
49 | Packet *ppkt = NULL, *spkt = NULL; | 222 | VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick); |
50 | @@ -XXX,XX +XXX,XX @@ sec: | 223 | event_notifier_test_and_clear(n); |
51 | qemu_hexdump((char *)spkt->data, stderr, | 224 | - event_notifier_set(&svq->hdev_kick); |
52 | "colo-compare spkt", spkt->size); | 225 | + vhost_handle_guest_kick(svq); |
53 | 226 | +} | |
54 | - /* | 227 | + |
55 | - * colo_compare_inconsistent_notify(); | 228 | +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq) |
56 | - * TODO: notice to checkpoint(); | 229 | +{ |
57 | - */ | 230 | + if (svq->last_used_idx != svq->shadow_used_idx) { |
58 | + colo_compare_inconsistency_notify(); | 231 | + return true; |
232 | + } | ||
233 | + | ||
234 | + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); | ||
235 | + | ||
236 | + return svq->last_used_idx != svq->shadow_used_idx; | ||
237 | } | ||
238 | |||
239 | /** | ||
240 | - * Forward vhost notifications | ||
241 | + * Enable vhost device calls after disable them. | ||
242 | + * | ||
243 | + * @svq: The svq | ||
244 | + * | ||
245 | + * It returns false if there are pending used buffers from the vhost device, | ||
246 | + * avoiding the possible races between SVQ checking for more work and enabling | ||
247 | + * callbacks. True if SVQ used vring has no more pending buffers. | ||
248 | + */ | ||
249 | +static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq) | ||
250 | +{ | ||
251 | + svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); | ||
252 | + /* Make sure the flag is written before the read of used_idx */ | ||
253 | + smp_mb(); | ||
254 | + return !vhost_svq_more_used(svq); | ||
255 | +} | ||
256 | + | ||
257 | +static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq) | ||
258 | +{ | ||
259 | + svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); | ||
260 | +} | ||
261 | + | ||
262 | +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, | ||
263 | + uint32_t *len) | ||
264 | +{ | ||
265 | + vring_desc_t *descs = svq->vring.desc; | ||
266 | + const vring_used_t *used = svq->vring.used; | ||
267 | + vring_used_elem_t used_elem; | ||
268 | + uint16_t last_used; | ||
269 | + | ||
270 | + if (!vhost_svq_more_used(svq)) { | ||
271 | + return NULL; | ||
272 | + } | ||
273 | + | ||
274 | + /* Only get used array entries after they have been exposed by dev */ | ||
275 | + smp_rmb(); | ||
276 | + last_used = svq->last_used_idx & (svq->vring.num - 1); | ||
277 | + used_elem.id = le32_to_cpu(used->ring[last_used].id); | ||
278 | + used_elem.len = le32_to_cpu(used->ring[last_used].len); | ||
279 | + | ||
280 | + svq->last_used_idx++; | ||
281 | + if (unlikely(used_elem.id >= svq->vring.num)) { | ||
282 | + qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used", | ||
283 | + svq->vdev->name, used_elem.id); | ||
284 | + return NULL; | ||
285 | + } | ||
286 | + | ||
287 | + if (unlikely(!svq->ring_id_maps[used_elem.id])) { | ||
288 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
289 | + "Device %s says index %u is used, but it was not available", | ||
290 | + svq->vdev->name, used_elem.id); | ||
291 | + return NULL; | ||
292 | + } | ||
293 | + | ||
294 | + descs[used_elem.id].next = svq->free_head; | ||
295 | + svq->free_head = used_elem.id; | ||
296 | + | ||
297 | + *len = used_elem.len; | ||
298 | + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); | ||
299 | +} | ||
300 | + | ||
301 | +static void vhost_svq_flush(VhostShadowVirtqueue *svq, | ||
302 | + bool check_for_avail_queue) | ||
303 | +{ | ||
304 | + VirtQueue *vq = svq->vq; | ||
305 | + | ||
306 | + /* Forward as many used buffers as possible. */ | ||
307 | + do { | ||
308 | + unsigned i = 0; | ||
309 | + | ||
310 | + vhost_svq_disable_notification(svq); | ||
311 | + while (true) { | ||
312 | + uint32_t len; | ||
313 | + g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len); | ||
314 | + if (!elem) { | ||
315 | + break; | ||
316 | + } | ||
317 | + | ||
318 | + if (unlikely(i >= svq->vring.num)) { | ||
319 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
320 | + "More than %u used buffers obtained in a %u size SVQ", | ||
321 | + i, svq->vring.num); | ||
322 | + virtqueue_fill(vq, elem, len, i); | ||
323 | + virtqueue_flush(vq, i); | ||
324 | + return; | ||
325 | + } | ||
326 | + virtqueue_fill(vq, elem, len, i++); | ||
327 | + } | ||
328 | + | ||
329 | + virtqueue_flush(vq, i); | ||
330 | + event_notifier_set(&svq->svq_call); | ||
331 | + | ||
332 | + if (check_for_avail_queue && svq->next_guest_avail_elem) { | ||
333 | + /* | ||
334 | + * Avail ring was full when vhost_svq_flush was called, so it's a | ||
335 | + * good moment to make more descriptors available if possible. | ||
336 | + */ | ||
337 | + vhost_handle_guest_kick(svq); | ||
338 | + } | ||
339 | + } while (!vhost_svq_enable_notification(svq)); | ||
340 | +} | ||
341 | + | ||
342 | +/** | ||
343 | + * Forward used buffers. | ||
344 | * | ||
345 | * @n: hdev call event notifier, the one that device set to notify svq. | ||
346 | + * | ||
347 | + * Note that we are not making any buffers available in the loop, there is no | ||
348 | + * way that it runs more than virtqueue size times. | ||
349 | */ | ||
350 | static void vhost_svq_handle_call(EventNotifier *n) | ||
351 | { | ||
352 | VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, | ||
353 | hdev_call); | ||
354 | event_notifier_test_and_clear(n); | ||
355 | - event_notifier_set(&svq->svq_call); | ||
356 | + vhost_svq_flush(svq, true); | ||
357 | } | ||
358 | |||
359 | /** | ||
360 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
361 | if (poll_start) { | ||
362 | event_notifier_init_fd(svq_kick, svq_kick_fd); | ||
363 | event_notifier_set(svq_kick); | ||
364 | - event_notifier_set_handler(svq_kick, vhost_handle_guest_kick); | ||
365 | + event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier); | ||
366 | + } | ||
367 | +} | ||
368 | + | ||
369 | +/** | ||
370 | + * Start the shadow virtqueue operation. | ||
371 | + * | ||
372 | + * @svq: Shadow Virtqueue | ||
373 | + * @vdev: VirtIO device | ||
374 | + * @vq: Virtqueue to shadow | ||
375 | + */ | ||
376 | +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
377 | + VirtQueue *vq) | ||
378 | +{ | ||
379 | + size_t desc_size, driver_size, device_size; | ||
380 | + | ||
381 | + svq->next_guest_avail_elem = NULL; | ||
382 | + svq->shadow_avail_idx = 0; | ||
383 | + svq->shadow_used_idx = 0; | ||
384 | + svq->last_used_idx = 0; | ||
385 | + svq->vdev = vdev; | ||
386 | + svq->vq = vq; | ||
387 | + | ||
388 | + svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq)); | ||
389 | + driver_size = vhost_svq_driver_area_size(svq); | ||
390 | + device_size = vhost_svq_device_area_size(svq); | ||
391 | + svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size); | ||
392 | + desc_size = sizeof(vring_desc_t) * svq->vring.num; | ||
393 | + svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size); | ||
394 | + memset(svq->vring.desc, 0, driver_size); | ||
395 | + svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size); | ||
396 | + memset(svq->vring.used, 0, device_size); | ||
397 | + svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num); | ||
398 | + for (unsigned i = 0; i < svq->vring.num - 1; i++) { | ||
399 | + svq->vring.desc[i].next = cpu_to_le16(i + 1); | ||
59 | } | 400 | } |
60 | } | 401 | } |
61 | 402 | ||
62 | @@ -XXX,XX +XXX,XX @@ static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time) | 403 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) |
404 | void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
405 | { | ||
406 | event_notifier_set_handler(&svq->svq_kick, NULL); | ||
407 | + g_autofree VirtQueueElement *next_avail_elem = NULL; | ||
408 | + | ||
409 | + if (!svq->vq) { | ||
410 | + return; | ||
411 | + } | ||
412 | + | ||
413 | + /* Send all pending used descriptors to guest */ | ||
414 | + vhost_svq_flush(svq, false); | ||
415 | + | ||
416 | + for (unsigned i = 0; i < svq->vring.num; ++i) { | ||
417 | + g_autofree VirtQueueElement *elem = NULL; | ||
418 | + elem = g_steal_pointer(&svq->ring_id_maps[i]); | ||
419 | + if (elem) { | ||
420 | + virtqueue_detach_element(svq->vq, elem, 0); | ||
421 | + } | ||
422 | + } | ||
423 | + | ||
424 | + next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
425 | + if (next_avail_elem) { | ||
426 | + virtqueue_detach_element(svq->vq, next_avail_elem, 0); | ||
427 | + } | ||
428 | + svq->vq = NULL; | ||
429 | + g_free(svq->ring_id_maps); | ||
430 | + qemu_vfree(svq->vring.desc); | ||
431 | + qemu_vfree(svq->vring.used); | ||
432 | } | ||
433 | |||
434 | /** | ||
435 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
436 | index XXXXXXX..XXXXXXX 100644 | ||
437 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
438 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
439 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
440 | |||
441 | /* Guest's call notifier, where the SVQ calls guest. */ | ||
442 | EventNotifier svq_call; | ||
443 | + | ||
444 | + /* Virtio queue shadowing */ | ||
445 | + VirtQueue *vq; | ||
446 | + | ||
447 | + /* Virtio device */ | ||
448 | + VirtIODevice *vdev; | ||
449 | + | ||
450 | + /* Map to track the guest's descriptors in use */ | ||
451 | + VirtQueueElement **ring_id_maps; | ||
452 | + | ||
453 | + /* Next VirtQueue element that guest made available */ | ||
454 | + VirtQueueElement *next_guest_avail_elem; | ||
455 | + | ||
456 | + /* Next head to expose to the device */ | ||
457 | + uint16_t shadow_avail_idx; | ||
458 | + | ||
459 | + /* Next free descriptor */ | ||
460 | + uint16_t free_head; | ||
461 | + | ||
462 | + /* Last seen used idx */ | ||
463 | + uint16_t shadow_used_idx; | ||
464 | + | ||
465 | + /* Next head to consume from the device */ | ||
466 | + uint16_t last_used_idx; | ||
467 | } VhostShadowVirtqueue; | ||
468 | |||
469 | bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
470 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
471 | size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); | ||
472 | size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); | ||
473 | |||
474 | +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
475 | + VirtQueue *vq); | ||
476 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
477 | |||
478 | VhostShadowVirtqueue *vhost_svq_new(void); | ||
479 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
480 | index XXXXXXX..XXXXXXX 100644 | ||
481 | --- a/hw/virtio/vhost-vdpa.c | ||
482 | +++ b/hw/virtio/vhost-vdpa.c | ||
483 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, | ||
484 | * Note that this function does not rewind kick file descriptor if cannot set | ||
485 | * call one. | ||
486 | */ | ||
487 | -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
488 | - VhostShadowVirtqueue *svq, unsigned idx, | ||
489 | - Error **errp) | ||
490 | +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, | ||
491 | + VhostShadowVirtqueue *svq, unsigned idx, | ||
492 | + Error **errp) | ||
493 | { | ||
494 | struct vhost_vring_file file = { | ||
495 | .index = dev->vq_index + idx, | ||
496 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
497 | r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
498 | if (unlikely(r != 0)) { | ||
499 | error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
500 | - return false; | ||
501 | + return r; | ||
63 | } | 502 | } |
503 | |||
504 | event_notifier = &svq->hdev_call; | ||
505 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
506 | error_setg_errno(errp, -r, "Can't set device call fd"); | ||
507 | } | ||
508 | |||
509 | + return r; | ||
510 | +} | ||
511 | + | ||
512 | +/** | ||
513 | + * Unmap a SVQ area in the device | ||
514 | + */ | ||
515 | +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, | ||
516 | + hwaddr size) | ||
517 | +{ | ||
518 | + int r; | ||
519 | + | ||
520 | + size = ROUND_UP(size, qemu_real_host_page_size); | ||
521 | + r = vhost_vdpa_dma_unmap(v, iova, size); | ||
522 | + return r == 0; | ||
523 | +} | ||
524 | + | ||
525 | +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, | ||
526 | + const VhostShadowVirtqueue *svq) | ||
527 | +{ | ||
528 | + struct vhost_vdpa *v = dev->opaque; | ||
529 | + struct vhost_vring_addr svq_addr; | ||
530 | + size_t device_size = vhost_svq_device_area_size(svq); | ||
531 | + size_t driver_size = vhost_svq_driver_area_size(svq); | ||
532 | + bool ok; | ||
533 | + | ||
534 | + vhost_svq_get_vring_addr(svq, &svq_addr); | ||
535 | + | ||
536 | + ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size); | ||
537 | + if (unlikely(!ok)) { | ||
538 | + return false; | ||
539 | + } | ||
540 | + | ||
541 | + return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size); | ||
542 | +} | ||
543 | + | ||
544 | +/** | ||
545 | + * Map the shadow virtqueue rings in the device | ||
546 | + * | ||
547 | + * @dev: The vhost device | ||
548 | + * @svq: The shadow virtqueue | ||
549 | + * @addr: Assigned IOVA addresses | ||
550 | + * @errp: Error pointer | ||
551 | + */ | ||
552 | +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, | ||
553 | + const VhostShadowVirtqueue *svq, | ||
554 | + struct vhost_vring_addr *addr, | ||
555 | + Error **errp) | ||
556 | +{ | ||
557 | + struct vhost_vdpa *v = dev->opaque; | ||
558 | + size_t device_size = vhost_svq_device_area_size(svq); | ||
559 | + size_t driver_size = vhost_svq_driver_area_size(svq); | ||
560 | + int r; | ||
561 | + | ||
562 | + ERRP_GUARD(); | ||
563 | + vhost_svq_get_vring_addr(svq, addr); | ||
564 | + | ||
565 | + r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, | ||
566 | + (void *)(uintptr_t)addr->desc_user_addr, true); | ||
567 | + if (unlikely(r != 0)) { | ||
568 | + error_setg_errno(errp, -r, "Cannot create vq driver region: "); | ||
569 | + return false; | ||
570 | + } | ||
571 | + | ||
572 | + r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, | ||
573 | + (void *)(intptr_t)addr->used_user_addr, false); | ||
574 | + if (unlikely(r != 0)) { | ||
575 | + error_setg_errno(errp, -r, "Cannot create vq device region: "); | ||
576 | + } | ||
577 | + | ||
578 | + return r == 0; | ||
579 | +} | ||
580 | + | ||
581 | +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
582 | + VhostShadowVirtqueue *svq, unsigned idx, | ||
583 | + Error **errp) | ||
584 | +{ | ||
585 | + uint16_t vq_index = dev->vq_index + idx; | ||
586 | + struct vhost_vring_state s = { | ||
587 | + .index = vq_index, | ||
588 | + }; | ||
589 | + int r; | ||
590 | + | ||
591 | + r = vhost_vdpa_set_dev_vring_base(dev, &s); | ||
592 | + if (unlikely(r)) { | ||
593 | + error_setg_errno(errp, -r, "Cannot set vring base"); | ||
594 | + return false; | ||
595 | + } | ||
596 | + | ||
597 | + r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp); | ||
598 | return r == 0; | ||
64 | } | 599 | } |
65 | 600 | ||
66 | +void colo_compare_register_notifier(Notifier *notify) | 601 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) |
67 | +{ | ||
68 | + notifier_list_add(&colo_compare_notifiers, notify); | ||
69 | +} | ||
70 | + | ||
71 | +void colo_compare_unregister_notifier(Notifier *notify) | ||
72 | +{ | ||
73 | + notifier_remove(notify); | ||
74 | +} | ||
75 | + | ||
76 | static int colo_old_packet_check_one_conn(Connection *conn, | ||
77 | - void *user_data) | ||
78 | + void *user_data) | ||
79 | { | ||
80 | GList *result = NULL; | ||
81 | int64_t check_time = REGULAR_PACKET_CHECK_MS; | ||
82 | @@ -XXX,XX +XXX,XX @@ static int colo_old_packet_check_one_conn(Connection *conn, | ||
83 | |||
84 | if (result) { | ||
85 | /* Do checkpoint will flush old packet */ | ||
86 | - /* | ||
87 | - * TODO: Notify colo frame to do checkpoint. | ||
88 | - * colo_compare_inconsistent_notify(); | ||
89 | - */ | ||
90 | + colo_compare_inconsistency_notify(); | ||
91 | return 0; | ||
92 | } | 602 | } |
93 | 603 | ||
94 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_packet(CompareState *s, Connection *conn, | 604 | for (i = 0; i < v->shadow_vqs->len; ++i) { |
95 | /* | 605 | + VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i); |
96 | * If one packet arrive late, the secondary_list or | 606 | VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); |
97 | * primary_list will be empty, so we can't compare it | 607 | + struct vhost_vring_addr addr = { |
98 | - * until next comparison. | 608 | + .index = i, |
99 | + * until next comparison. If the packets in the list are | 609 | + }; |
100 | + * timeout, it will trigger a checkpoint request. | 610 | + int r; |
101 | */ | 611 | bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); |
102 | trace_colo_compare_main("packet different"); | 612 | if (unlikely(!ok)) { |
103 | g_queue_push_head(&conn->primary_list, pkt); | 613 | - error_reportf_err(err, "Cannot setup SVQ %u: ", i); |
104 | - /* TODO: colo_notify_checkpoint();*/ | 614 | + goto err; |
105 | + colo_compare_inconsistency_notify(); | 615 | + } |
106 | break; | 616 | + |
617 | + vhost_svq_start(svq, dev->vdev, vq); | ||
618 | + ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err); | ||
619 | + if (unlikely(!ok)) { | ||
620 | + goto err_map; | ||
621 | + } | ||
622 | + | ||
623 | + /* Override vring GPA set by vhost subsystem */ | ||
624 | + r = vhost_vdpa_set_vring_dev_addr(dev, &addr); | ||
625 | + if (unlikely(r != 0)) { | ||
626 | + error_setg_errno(&err, -r, "Cannot set device address"); | ||
627 | + goto err_set_addr; | ||
628 | + } | ||
629 | + } | ||
630 | + | ||
631 | + return true; | ||
632 | + | ||
633 | +err_set_addr: | ||
634 | + vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i)); | ||
635 | + | ||
636 | +err_map: | ||
637 | + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i)); | ||
638 | + | ||
639 | +err: | ||
640 | + error_reportf_err(err, "Cannot setup SVQ %u: ", i); | ||
641 | + for (unsigned j = 0; j < i; ++j) { | ||
642 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j); | ||
643 | + vhost_vdpa_svq_unmap_rings(dev, svq); | ||
644 | + vhost_svq_stop(svq); | ||
645 | + } | ||
646 | + | ||
647 | + return false; | ||
648 | +} | ||
649 | + | ||
650 | +static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev) | ||
651 | +{ | ||
652 | + struct vhost_vdpa *v = dev->opaque; | ||
653 | + | ||
654 | + if (!v->shadow_vqs) { | ||
655 | + return true; | ||
656 | + } | ||
657 | + | ||
658 | + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { | ||
659 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
660 | + bool ok = vhost_vdpa_svq_unmap_rings(dev, svq); | ||
661 | + if (unlikely(!ok)) { | ||
662 | return false; | ||
107 | } | 663 | } |
108 | } | 664 | } |
109 | diff --git a/net/colo-compare.h b/net/colo-compare.h | 665 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) |
110 | index XXXXXXX..XXXXXXX 100644 | 666 | } |
111 | --- a/net/colo-compare.h | 667 | vhost_vdpa_set_vring_ready(dev); |
112 | +++ b/net/colo-compare.h | 668 | } else { |
113 | @@ -XXX,XX +XXX,XX @@ | 669 | + ok = vhost_vdpa_svqs_stop(dev); |
114 | #define QEMU_COLO_COMPARE_H | 670 | + if (unlikely(!ok)) { |
115 | 671 | + return -1; | |
116 | void colo_notify_compares_event(void *opaque, int event, Error **errp); | 672 | + } |
117 | +void colo_compare_register_notifier(Notifier *notify); | 673 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); |
118 | +void colo_compare_unregister_notifier(Notifier *notify); | 674 | } |
119 | 675 | ||
120 | #endif /* QEMU_COLO_COMPARE_H */ | ||
121 | -- | 676 | -- |
122 | 2.5.0 | 677 | 2.7.4 |
123 | 678 | ||
124 | 679 | diff view generated by jsdifflib |
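The vhost_svq_start() hunk above links every descriptor to the next one (desc[i].next = i + 1) so the shadow virtqueue can later hand out and reclaim descriptor heads from a free chain in constant time. A rough stand-alone sketch of that free-list idea follows (plain C with made-up names, not the QEMU types, and without the exhaustion checks the real code needs):

#include <stdint.h>
#include <stdio.h>

#define RING_NUM 8

/* next_idx[i] chains free descriptors together, like vring.desc[i].next;
 * free_head is the first free slot, like svq->free_head. */
static uint16_t next_idx[RING_NUM];
static uint16_t free_head;

static void ring_init(void)
{
    for (uint16_t i = 0; i < RING_NUM - 1; i++) {
        next_idx[i] = i + 1;
    }
    free_head = 0;
}

/* Pop one descriptor head from the free chain. */
static uint16_t ring_get(void)
{
    uint16_t head = free_head;
    free_head = next_idx[head];
    return head;
}

/* Push a descriptor head back onto the free chain once it has been used. */
static void ring_put(uint16_t head)
{
    next_idx[head] = free_head;
    free_head = head;
}

int main(void)
{
    ring_init();
    uint16_t a = ring_get();    /* 0 */
    uint16_t b = ring_get();    /* 1 */
    ring_put(a);
    printf("got %d and %d, next free is %d\n", a, b, ring_get());
    return 0;
}

The real code additionally tracks how many slots are in flight (vhost_svq_available_slots()) so it never pops from an empty chain.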
Deleted patch | |||
---|---|---|---|
1 | From: Zhang Chen <zhangckid@gmail.com> | ||
2 | 1 | ||
3 | Make sure the master starts block replication after the slave's block | ||
4 | replication has started. | ||
5 | |||
6 | Besides, we need to activate the VM's blocks before it goes into | ||
7 | COLO state. | ||
8 | |||
9 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
10 | Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com> | ||
11 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | ||
12 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
13 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
14 | --- | ||
15 | migration/colo.c | 43 +++++++++++++++++++++++++++++++++++++++++++ | ||
16 | migration/migration.c | 10 ++++++++++ | ||
17 | 2 files changed, 53 insertions(+) | ||
18 | |||
19 | diff --git a/migration/colo.c b/migration/colo.c | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/migration/colo.c | ||
22 | +++ b/migration/colo.c | ||
23 | @@ -XXX,XX +XXX,XX @@ | ||
24 | #include "replication.h" | ||
25 | #include "net/colo-compare.h" | ||
26 | #include "net/colo.h" | ||
27 | +#include "block/block.h" | ||
28 | |||
29 | static bool vmstate_loading; | ||
30 | static Notifier packets_compare_notifier; | ||
31 | @@ -XXX,XX +XXX,XX @@ static void secondary_vm_do_failover(void) | ||
32 | { | ||
33 | int old_state; | ||
34 | MigrationIncomingState *mis = migration_incoming_get_current(); | ||
35 | + Error *local_err = NULL; | ||
36 | |||
37 | /* Can not do failover during the process of VM's loading VMstate, Or | ||
38 | * it will break the secondary VM. | ||
39 | @@ -XXX,XX +XXX,XX @@ static void secondary_vm_do_failover(void) | ||
40 | migrate_set_state(&mis->state, MIGRATION_STATUS_COLO, | ||
41 | MIGRATION_STATUS_COMPLETED); | ||
42 | |||
43 | + replication_stop_all(true, &local_err); | ||
44 | + if (local_err) { | ||
45 | + error_report_err(local_err); | ||
46 | + } | ||
47 | + | ||
48 | if (!autostart) { | ||
49 | error_report("\"-S\" qemu option will be ignored in secondary side"); | ||
50 | /* recover runstate to normal migration finish state */ | ||
51 | @@ -XXX,XX +XXX,XX @@ static void primary_vm_do_failover(void) | ||
52 | { | ||
53 | MigrationState *s = migrate_get_current(); | ||
54 | int old_state; | ||
55 | + Error *local_err = NULL; | ||
56 | |||
57 | migrate_set_state(&s->state, MIGRATION_STATUS_COLO, | ||
58 | MIGRATION_STATUS_COMPLETED); | ||
59 | @@ -XXX,XX +XXX,XX @@ static void primary_vm_do_failover(void) | ||
60 | FailoverStatus_str(old_state)); | ||
61 | return; | ||
62 | } | ||
63 | + | ||
64 | + replication_stop_all(true, &local_err); | ||
65 | + if (local_err) { | ||
66 | + error_report_err(local_err); | ||
67 | + local_err = NULL; | ||
68 | + } | ||
69 | + | ||
70 | /* Notify COLO thread that failover work is finished */ | ||
71 | qemu_sem_post(&s->colo_exit_sem); | ||
72 | } | ||
73 | @@ -XXX,XX +XXX,XX @@ static int colo_do_checkpoint_transaction(MigrationState *s, | ||
74 | qemu_savevm_state_header(fb); | ||
75 | qemu_savevm_state_setup(fb); | ||
76 | qemu_mutex_lock_iothread(); | ||
77 | + replication_do_checkpoint_all(&local_err); | ||
78 | + if (local_err) { | ||
79 | + qemu_mutex_unlock_iothread(); | ||
80 | + goto out; | ||
81 | + } | ||
82 | qemu_savevm_state_complete_precopy(fb, false, false); | ||
83 | qemu_mutex_unlock_iothread(); | ||
84 | |||
85 | @@ -XXX,XX +XXX,XX @@ static void colo_process_checkpoint(MigrationState *s) | ||
86 | object_unref(OBJECT(bioc)); | ||
87 | |||
88 | qemu_mutex_lock_iothread(); | ||
89 | + replication_start_all(REPLICATION_MODE_PRIMARY, &local_err); | ||
90 | + if (local_err) { | ||
91 | + qemu_mutex_unlock_iothread(); | ||
92 | + goto out; | ||
93 | + } | ||
94 | + | ||
95 | vm_start(); | ||
96 | qemu_mutex_unlock_iothread(); | ||
97 | trace_colo_vm_state_change("stop", "run"); | ||
98 | @@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque) | ||
99 | object_unref(OBJECT(bioc)); | ||
100 | |||
101 | qemu_mutex_lock_iothread(); | ||
102 | + replication_start_all(REPLICATION_MODE_SECONDARY, &local_err); | ||
103 | + if (local_err) { | ||
104 | + qemu_mutex_unlock_iothread(); | ||
105 | + goto out; | ||
106 | + } | ||
107 | vm_start(); | ||
108 | trace_colo_vm_state_change("stop", "run"); | ||
109 | qemu_mutex_unlock_iothread(); | ||
110 | @@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque) | ||
111 | goto out; | ||
112 | } | ||
113 | |||
114 | + replication_get_error_all(&local_err); | ||
115 | + if (local_err) { | ||
116 | + qemu_mutex_unlock_iothread(); | ||
117 | + goto out; | ||
118 | + } | ||
119 | + /* discard colo disk buffer */ | ||
120 | + replication_do_checkpoint_all(&local_err); | ||
121 | + if (local_err) { | ||
122 | + qemu_mutex_unlock_iothread(); | ||
123 | + goto out; | ||
124 | + } | ||
125 | + | ||
126 | vmstate_loading = false; | ||
127 | vm_start(); | ||
128 | trace_colo_vm_state_change("stop", "run"); | ||
129 | diff --git a/migration/migration.c b/migration/migration.c | ||
130 | index XXXXXXX..XXXXXXX 100644 | ||
131 | --- a/migration/migration.c | ||
132 | +++ b/migration/migration.c | ||
133 | @@ -XXX,XX +XXX,XX @@ static void process_incoming_migration_co(void *opaque) | ||
134 | MigrationIncomingState *mis = migration_incoming_get_current(); | ||
135 | PostcopyState ps; | ||
136 | int ret; | ||
137 | + Error *local_err = NULL; | ||
138 | |||
139 | assert(mis->from_src_file); | ||
140 | mis->migration_incoming_co = qemu_coroutine_self(); | ||
141 | @@ -XXX,XX +XXX,XX @@ static void process_incoming_migration_co(void *opaque) | ||
142 | |||
143 | /* we get COLO info, and know if we are in COLO mode */ | ||
144 | if (!ret && migration_incoming_enable_colo()) { | ||
145 | + /* Make sure all file formats flush their mutable metadata */ | ||
146 | + bdrv_invalidate_cache_all(&local_err); | ||
147 | + if (local_err) { | ||
148 | + migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, | ||
149 | + MIGRATION_STATUS_FAILED); | ||
150 | + error_report_err(local_err); | ||
151 | + exit(EXIT_FAILURE); | ||
152 | + } | ||
153 | + | ||
154 | qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming", | ||
155 | colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE); | ||
156 | mis->have_colo_incoming_thread = true; | ||
157 | -- | ||
158 | 2.5.0 | ||
159 | |||
160 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Zhang Chen <zhangckid@gmail.com> | ||
2 | 1 | ||
3 | We record the addresses of the dirty pages that are received; | ||
4 | this will help us flush the pages that were cached into the SVM. | ||
5 | |||
6 | Here we use a trick: we record the dirty pages by re-using the migration | ||
7 | dirty bitmap. In a later patch, we will start the dirty log for the SVM, | ||
8 | just like migration does. In this way, we can record the dirty pages caused | ||
9 | by both the PVM and the SVM, and we only flush those dirty pages from the | ||
10 | RAM cache while doing a checkpoint. | ||
11 | |||
12 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
13 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | ||
14 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
15 | Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> | ||
16 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
17 | --- | ||
18 | migration/ram.c | 43 ++++++++++++++++++++++++++++++++++++++++--- | ||
19 | 1 file changed, 40 insertions(+), 3 deletions(-) | ||
20 | |||
21 | diff --git a/migration/ram.c b/migration/ram.c | ||
22 | index XXXXXXX..XXXXXXX 100644 | ||
23 | --- a/migration/ram.c | ||
24 | +++ b/migration/ram.c | ||
25 | @@ -XXX,XX +XXX,XX @@ static inline void *colo_cache_from_block_offset(RAMBlock *block, | ||
26 | __func__, block->idstr); | ||
27 | return NULL; | ||
28 | } | ||
29 | + | ||
30 | + /* | ||
31 | + * During a colo checkpoint, we need the bitmap of these migrated pages. | ||
32 | + * It helps us to decide which pages in the ram cache should be flushed | ||
33 | + * into VM's RAM later. | ||
34 | + */ | ||
35 | + if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) { | ||
36 | + ram_state->migration_dirty_pages++; | ||
37 | + } | ||
38 | return block->colo_cache + offset; | ||
39 | } | ||
40 | |||
41 | @@ -XXX,XX +XXX,XX @@ int colo_init_ram_cache(void) | ||
42 | RAMBlock *block; | ||
43 | |||
44 | rcu_read_lock(); | ||
45 | - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { | ||
46 | + RAMBLOCK_FOREACH_MIGRATABLE(block) { | ||
47 | block->colo_cache = qemu_anon_ram_alloc(block->used_length, | ||
48 | NULL, | ||
49 | false); | ||
50 | @@ -XXX,XX +XXX,XX @@ int colo_init_ram_cache(void) | ||
51 | memcpy(block->colo_cache, block->host, block->used_length); | ||
52 | } | ||
53 | rcu_read_unlock(); | ||
54 | + /* | ||
55 | + * Record the dirty pages that were sent by the PVM; we use this dirty bitmap | ||
56 | + * to decide which pages in the cache should be flushed into the SVM's RAM. Here | ||
57 | + * we use the same name 'ram_bitmap' as for migration. | ||
58 | + */ | ||
59 | + if (ram_bytes_total()) { | ||
60 | + RAMBlock *block; | ||
61 | + | ||
62 | + RAMBLOCK_FOREACH_MIGRATABLE(block) { | ||
63 | + unsigned long pages = block->max_length >> TARGET_PAGE_BITS; | ||
64 | + | ||
65 | + block->bmap = bitmap_new(pages); | ||
66 | + bitmap_set(block->bmap, 0, pages); | ||
67 | + } | ||
68 | + } | ||
69 | + ram_state = g_new0(RAMState, 1); | ||
70 | + ram_state->migration_dirty_pages = 0; | ||
71 | + | ||
72 | return 0; | ||
73 | |||
74 | out_locked: | ||
75 | - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { | ||
76 | + | ||
77 | + RAMBLOCK_FOREACH_MIGRATABLE(block) { | ||
78 | if (block->colo_cache) { | ||
79 | qemu_anon_ram_free(block->colo_cache, block->used_length); | ||
80 | block->colo_cache = NULL; | ||
81 | @@ -XXX,XX +XXX,XX @@ void colo_release_ram_cache(void) | ||
82 | { | ||
83 | RAMBlock *block; | ||
84 | |||
85 | + RAMBLOCK_FOREACH_MIGRATABLE(block) { | ||
86 | + g_free(block->bmap); | ||
87 | + block->bmap = NULL; | ||
88 | + } | ||
89 | + | ||
90 | rcu_read_lock(); | ||
91 | - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { | ||
92 | + | ||
93 | + RAMBLOCK_FOREACH_MIGRATABLE(block) { | ||
94 | if (block->colo_cache) { | ||
95 | qemu_anon_ram_free(block->colo_cache, block->used_length); | ||
96 | block->colo_cache = NULL; | ||
97 | } | ||
98 | } | ||
99 | + | ||
100 | rcu_read_unlock(); | ||
101 | + g_free(ram_state); | ||
102 | + ram_state = NULL; | ||
103 | } | ||
104 | |||
105 | /** | ||
106 | -- | ||
107 | 2.5.0 | ||
108 | |||
109 | diff view generated by jsdifflib |
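The scheme described in this patch — mark each page the PVM sends in a per-block bitmap, then at checkpoint time copy only the marked pages from the colo_cache into the running RAM — boils down to a mark-and-flush loop. A toy sketch of that idea (plain C, illustrative names, not the QEMU RAMBlock API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096
#define NUM_PAGES 8

static bool dirty[NUM_PAGES];               /* stand-in for block->bmap       */
static uint8_t cache[NUM_PAGES][PAGE_SIZE]; /* stand-in for block->colo_cache */
static uint8_t ram[NUM_PAGES][PAGE_SIZE];   /* stand-in for the SVM's RAM     */

/* Called when a page arrives from the PVM: store it and mark it dirty. */
static void cache_page(unsigned page, const uint8_t *data)
{
    memcpy(cache[page], data, PAGE_SIZE);
    dirty[page] = true;
}

/* Called at checkpoint time: flush only the dirty pages and clear the bits. */
static unsigned flush_dirty_pages(void)
{
    unsigned flushed = 0;

    for (unsigned i = 0; i < NUM_PAGES; i++) {
        if (dirty[i]) {
            memcpy(ram[i], cache[i], PAGE_SIZE);
            dirty[i] = false;
            flushed++;
        }
    }
    return flushed;
}

int main(void)
{
    uint8_t buf[PAGE_SIZE] = { 0xab };

    cache_page(3, buf);
    cache_page(5, buf);
    printf("flushed %u pages\n", flush_dirty_pages());
    return 0;
}

The actual flush loop lands in a follow-up patch; this one only makes sure every received page sets its bit (the test_and_set_bit() call above) so there is something to walk at checkpoint time.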
1 | From: Martin Wilck <mwilck@suse.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The e1000 emulation silently discards RX packets if there's | 3 | This iova tree function allows it to look for a hole in allocated |
4 | insufficient space in the ring buffer. This leads to errors | 4 | regions and return a totally new translation for a given translated |
5 | on higher-level protocols in the guest, with no indication | 5 | address. |
6 | about the error cause. | 6 | |
7 | 7 | Its usage is mainly to allow devices to access qemu address space,
8 | This patch increments the "Missed Packets Count" (MPC) and | 8 | remapping the guest's address space into a new iova space where qemu can add chunks of
9 | "Receive No Buffers Count" (RNBC) HW counters in this case. | 9 | addresses. |
10 | As the emulation has no FIFO for buffering packets that can't | 10 | |
11 | immediately be pushed to the guest, these two registers are | 11 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
12 | practically equivalent (see 10.2.7.4, 10.2.7.33 in | 12 | Reviewed-by: Peter Xu <peterx@redhat.com> |
13 | https://www.intel.com/content/www/us/en/embedded/products/networking/82574l-gbe-controller-datasheet.html). | 13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
14 | |||
15 | On a Linux guest, the register content will be reflected in | ||
16 | the "rx_missed_errors" and "rx_no_buffer_count" stats from | ||
17 | "ethtool -S", and in the "missed" stat from "ip -s -s link show", | ||
18 | giving at least some hint about the error cause inside the guest. | ||
19 | |||
20 | If the cause is known, problems like this can often be avoided | ||
21 | easily, by increasing the number of RX descriptors in the guest | ||
22 | e1000 driver (e.g under Linux, "e1000.RxDescriptors=1024"). | ||
23 | |||
24 | The patch also adds a qemu trace message for this condition. | ||
25 | |||
26 | Signed-off-by: Martin Wilck <mwilck@suse.com> | ||
27 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 14 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
28 | --- | 15 | --- |
29 | hw/net/e1000.c | 16 +++++++++++++--- | 16 | include/qemu/iova-tree.h | 18 +++++++ |
30 | hw/net/trace-events | 3 +++ | 17 | util/iova-tree.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++ |
31 | 2 files changed, 16 insertions(+), 3 deletions(-) | 18 | 2 files changed, 154 insertions(+) |
32 | 19 | ||
33 | diff --git a/hw/net/e1000.c b/hw/net/e1000.c | 20 | diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h |
34 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
35 | --- a/hw/net/e1000.c | 22 | --- a/include/qemu/iova-tree.h |
36 | +++ b/hw/net/e1000.c | 23 | +++ b/include/qemu/iova-tree.h |
37 | @@ -XXX,XX +XXX,XX @@ | 24 | @@ -XXX,XX +XXX,XX @@ |
38 | #include "qemu/range.h" | 25 | #define IOVA_OK (0) |
39 | 26 | #define IOVA_ERR_INVALID (-1) /* Invalid parameters */ | |
40 | #include "e1000x_common.h" | 27 | #define IOVA_ERR_OVERLAP (-2) /* IOVA range overlapped */ |
41 | +#include "trace.h" | 28 | +#define IOVA_ERR_NOMEM (-3) /* Cannot allocate */ |
42 | 29 | ||
43 | static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; | 30 | typedef struct IOVATree IOVATree; |
44 | 31 | typedef struct DMAMap { | |
45 | @@ -XXX,XX +XXX,XX @@ static uint64_t rx_desc_base(E1000State *s) | 32 | @@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova); |
46 | return (bah << 32) + bal; | 33 | void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator); |
34 | |||
35 | /** | ||
36 | + * iova_tree_alloc_map: | ||
37 | + * | ||
38 | + * @tree: the iova tree to allocate from | ||
39 | + * @map: the new map (as translated addr & size) to allocate in the iova region | ||
40 | + * @iova_begin: the minimum address of the allocation | ||
41 | + * @iova_end: the maximum addressable direction of the allocation | ||
42 | + * | ||
43 | + * Allocates a new region of a given size, between iova_min and iova_max. | ||
44 | + * | ||
45 | + * Return: Same as iova_tree_insert, but cannot overlap and can return error if | ||
46 | + * iova tree is out of free contiguous range. The caller gets the assigned iova | ||
47 | + * in map->iova. | ||
48 | + */ | ||
49 | +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin, | ||
50 | + hwaddr iova_end); | ||
51 | + | ||
52 | +/** | ||
53 | * iova_tree_destroy: | ||
54 | * | ||
55 | * @tree: the iova tree to destroy | ||
56 | diff --git a/util/iova-tree.c b/util/iova-tree.c | ||
57 | index XXXXXXX..XXXXXXX 100644 | ||
58 | --- a/util/iova-tree.c | ||
59 | +++ b/util/iova-tree.c | ||
60 | @@ -XXX,XX +XXX,XX @@ struct IOVATree { | ||
61 | GTree *tree; | ||
62 | }; | ||
63 | |||
64 | +/* Args to pass to iova_tree_alloc foreach function. */ | ||
65 | +struct IOVATreeAllocArgs { | ||
66 | + /* Size of the desired allocation */ | ||
67 | + size_t new_size; | ||
68 | + | ||
69 | + /* The minimum address allowed in the allocation */ | ||
70 | + hwaddr iova_begin; | ||
71 | + | ||
72 | + /* Map at the left of the hole, can be NULL if "this" is first one */ | ||
73 | + const DMAMap *prev; | ||
74 | + | ||
75 | + /* Map at the right of the hole, can be NULL if "prev" is the last one */ | ||
76 | + const DMAMap *this; | ||
77 | + | ||
78 | + /* If found, we fill in the IOVA here */ | ||
79 | + hwaddr iova_result; | ||
80 | + | ||
81 | + /* Whether have we found a valid IOVA */ | ||
82 | + bool iova_found; | ||
83 | +}; | ||
84 | + | ||
85 | +/** | ||
86 | + * Iterate args to the next hole | ||
87 | + * | ||
88 | + * @args: The alloc arguments | ||
89 | + * @next: The next mapping in the tree. Can be NULL to signal the last one | ||
90 | + */ | ||
91 | +static void iova_tree_alloc_args_iterate(struct IOVATreeAllocArgs *args, | ||
92 | + const DMAMap *next) | ||
93 | +{ | ||
94 | + args->prev = args->this; | ||
95 | + args->this = next; | ||
96 | +} | ||
97 | + | ||
98 | static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data) | ||
99 | { | ||
100 | const DMAMap *m1 = a, *m2 = b; | ||
101 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map) | ||
102 | return IOVA_OK; | ||
47 | } | 103 | } |
48 | 104 | ||
49 | +static void | 105 | +/** |
50 | +e1000_receiver_overrun(E1000State *s, size_t size) | 106 | + * Try to find an unallocated IOVA range between prev and this elements. |
51 | +{ | 107 | + * |
52 | + trace_e1000_receiver_overrun(size, s->mac_reg[RDH], s->mac_reg[RDT]); | 108 | + * @args: Arguments to allocation |
53 | + e1000x_inc_reg_if_not_full(s->mac_reg, RNBC); | 109 | + * |
54 | + e1000x_inc_reg_if_not_full(s->mac_reg, MPC); | 110 | + * Cases: |
55 | + set_ics(s, 0, E1000_ICS_RXO); | 111 | + * |
56 | +} | 112 | + * (1) !prev, !this: No entries allocated, always succeed |
57 | + | 113 | + * |
58 | static ssize_t | 114 | + * (2) !prev, this: We're iterating at the 1st element. |
59 | e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) | 115 | + * |
116 | + * (3) prev, !this: We're iterating at the last element. | ||
117 | + * | ||
118 | + * (4) prev, this: this is the most common case, we'll try to find a hole | ||
119 | + * between "prev" and "this" mapping. | ||
120 | + * | ||
121 | + * Note that this function assumes the last valid iova is HWADDR_MAX, but it | ||
122 | + * searches linearly so it's easy to discard the result if it's not the case. | ||
123 | + */ | ||
124 | +static void iova_tree_alloc_map_in_hole(struct IOVATreeAllocArgs *args) | ||
125 | +{ | ||
126 | + const DMAMap *prev = args->prev, *this = args->this; | ||
127 | + uint64_t hole_start, hole_last; | ||
128 | + | ||
129 | + if (this && this->iova + this->size < args->iova_begin) { | ||
130 | + return; | ||
131 | + } | ||
132 | + | ||
133 | + hole_start = MAX(prev ? prev->iova + prev->size + 1 : 0, args->iova_begin); | ||
134 | + hole_last = this ? this->iova : HWADDR_MAX; | ||
135 | + | ||
136 | + if (hole_last - hole_start > args->new_size) { | ||
137 | + args->iova_result = hole_start; | ||
138 | + args->iova_found = true; | ||
139 | + } | ||
140 | +} | ||
141 | + | ||
142 | +/** | ||
143 | + * Foreach dma node in the tree, compare if there is a hole with its previous | ||
144 | + * node (or minimum iova address allowed) and the node. | ||
145 | + * | ||
146 | + * @key: Node iterating | ||
147 | + * @value: Node iterating | ||
148 | + * @pargs: Struct to communicate with the outside world | ||
149 | + * | ||
150 | + * Return: false to keep iterating, true if needs break. | ||
151 | + */ | ||
152 | +static gboolean iova_tree_alloc_traverse(gpointer key, gpointer value, | ||
153 | + gpointer pargs) | ||
154 | +{ | ||
155 | + struct IOVATreeAllocArgs *args = pargs; | ||
156 | + DMAMap *node = value; | ||
157 | + | ||
158 | + assert(key == value); | ||
159 | + | ||
160 | + iova_tree_alloc_args_iterate(args, node); | ||
161 | + iova_tree_alloc_map_in_hole(args); | ||
162 | + return args->iova_found; | ||
163 | +} | ||
164 | + | ||
165 | +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin, | ||
166 | + hwaddr iova_last) | ||
167 | +{ | ||
168 | + struct IOVATreeAllocArgs args = { | ||
169 | + .new_size = map->size, | ||
170 | + .iova_begin = iova_begin, | ||
171 | + }; | ||
172 | + | ||
173 | + if (unlikely(iova_last < iova_begin)) { | ||
174 | + return IOVA_ERR_INVALID; | ||
175 | + } | ||
176 | + | ||
177 | + /* | ||
178 | + * Find a valid hole for the mapping | ||
179 | + * | ||
180 | + * Assuming low iova_begin, so no need to do a binary search to | ||
181 | + * locate the first node. | ||
182 | + * | ||
183 | + * TODO: Replace all this with g_tree_node_first/next/last when available | ||
184 | + * (from glib since 2.68). To do it with g_tree_foreach complicates the | ||
185 | + * code a lot. | ||
186 | + * | ||
187 | + */ | ||
188 | + g_tree_foreach(tree->tree, iova_tree_alloc_traverse, &args); | ||
189 | + if (!args.iova_found) { | ||
190 | + /* | ||
191 | + * Either tree is empty or the last hole is still not checked. | ||
192 | + * g_tree_foreach does not compare (last, iova_last] range, so we check | ||
193 | + * it here. | ||
194 | + */ | ||
195 | + iova_tree_alloc_args_iterate(&args, NULL); | ||
196 | + iova_tree_alloc_map_in_hole(&args); | ||
197 | + } | ||
198 | + | ||
199 | + if (!args.iova_found || args.iova_result + map->size > iova_last) { | ||
200 | + return IOVA_ERR_NOMEM; | ||
201 | + } | ||
202 | + | ||
203 | + map->iova = args.iova_result; | ||
204 | + return iova_tree_insert(tree, map); | ||
205 | +} | ||
206 | + | ||
207 | void iova_tree_destroy(IOVATree *tree) | ||
60 | { | 208 | { |
61 | @@ -XXX,XX +XXX,XX @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) | 209 | g_tree_destroy(tree->tree); |
62 | desc_offset = 0; | ||
63 | total_size = size + e1000x_fcs_len(s->mac_reg); | ||
64 | if (!e1000_has_rxbufs(s, total_size)) { | ||
65 | - set_ics(s, 0, E1000_ICS_RXO); | ||
66 | - return -1; | ||
67 | + e1000_receiver_overrun(s, total_size); | ||
68 | + return -1; | ||
69 | } | ||
70 | do { | ||
71 | desc_size = total_size - desc_offset; | ||
72 | @@ -XXX,XX +XXX,XX @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) | ||
73 | rdh_start >= s->mac_reg[RDLEN] / sizeof(desc)) { | ||
74 | DBGOUT(RXERR, "RDH wraparound @%x, RDT %x, RDLEN %x\n", | ||
75 | rdh_start, s->mac_reg[RDT], s->mac_reg[RDLEN]); | ||
76 | - set_ics(s, 0, E1000_ICS_RXO); | ||
77 | + e1000_receiver_overrun(s, total_size); | ||
78 | return -1; | ||
79 | } | ||
80 | } while (desc_offset < total_size); | ||
81 | diff --git a/hw/net/trace-events b/hw/net/trace-events | ||
82 | index XXXXXXX..XXXXXXX 100644 | ||
83 | --- a/hw/net/trace-events | ||
84 | +++ b/hw/net/trace-events | ||
85 | @@ -XXX,XX +XXX,XX @@ net_rx_pkt_rss_ip6_ex(void) "Calculating IPv6/EX RSS hash" | ||
86 | net_rx_pkt_rss_hash(size_t rss_length, uint32_t rss_hash) "RSS hash for %zu bytes: 0x%X" | ||
87 | net_rx_pkt_rss_add_chunk(void* ptr, size_t size, size_t input_offset) "Add RSS chunk %p, %zu bytes, RSS input offset %zu bytes" | ||
88 | |||
89 | +# hw/net/e1000.c | ||
90 | +e1000_receiver_overrun(size_t s, uint32_t rdh, uint32_t rdt) "Receiver overrun: dropped packet of %lu bytes, RDH=%u, RDT=%u" | ||
91 | + | ||
92 | # hw/net/e1000x_common.c | ||
93 | e1000x_rx_can_recv_disabled(bool link_up, bool rx_enabled, bool pci_master) "link_up: %d, rx_enabled %d, pci_master %d" | ||
94 | e1000x_vlan_is_vlan_pkt(bool is_vlan_pkt, uint16_t eth_proto, uint16_t vet) "Is VLAN packet: %d, ETH proto: 0x%X, VET: 0x%X" | ||
95 | -- | 210 | -- |
96 | 2.5.0 | 211 | 2.7.4 |
97 | 212 | ||
98 | 213 | diff view generated by jsdifflib |
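iova_tree_alloc_map() above walks the allocated ranges in order and takes the first gap between two neighbours that is large enough. A simplified linear version of the same hole search over a sorted array (plain C, illustrative only — the real code walks a GTree and also honours iova_begin/iova_last exactly):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint64_t start;
    uint64_t size;
} Range;

/* Return the start of the first hole of at least `size` bytes between the
 * sorted, non-overlapping ranges, searching in [lo, hi); UINT64_MAX if none. */
static uint64_t find_hole(const Range *r, int n, uint64_t size,
                          uint64_t lo, uint64_t hi)
{
    uint64_t cursor = lo;

    for (int i = 0; i < n; i++) {
        uint64_t r_end = r[i].start + r[i].size;

        if (r_end <= cursor) {
            continue;                     /* range is below the window */
        }
        if (r[i].start >= cursor && r[i].start - cursor >= size) {
            return cursor;                /* hole before this range fits */
        }
        cursor = r_end;                   /* skip past this range */
    }
    return (hi > cursor && hi - cursor >= size) ? cursor : UINT64_MAX;
}

int main(void)
{
    Range used[] = { { 0x1000, 0x1000 }, { 0x3000, 0x2000 } };
    uint64_t iova = find_hole(used, 2, 0x1000, 0x1000, 0x100000);

    printf("hole found at 0x%" PRIx64 "\n", iova);   /* prints 0x2000 */
    return 0;
}

The first-fit behaviour is the same as the GTree traversal in the patch; the tree just avoids keeping a separate sorted copy of the ranges.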
1 | From: Zhang Chen <zhangckid@gmail.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Libvirt or other high-level software can use this command to query COLO status. | 3 | This function does the reverse operation of iova_tree_find: to look for
4 | You can test this command like this: | 4 | a mapping that matches a translated address so we can do the reverse.
5 | {'execute':'query-colo-status'} | ||
6 | 5 | ||
7 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | 6 | This has linear complexity instead of logarithmic, but it supports
8 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | 7 | overlapping HVA. Future developments could reduce it. |
8 | |||
9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
10 | --- | 12 | --- |
11 | migration/colo.c | 21 +++++++++++++++++++++ | 13 | include/qemu/iova-tree.h | 20 +++++++++++++++++++- |
12 | qapi/migration.json | 32 ++++++++++++++++++++++++++++++++ | 14 | util/iova-tree.c | 34 ++++++++++++++++++++++++++++++++++ |
13 | 2 files changed, 53 insertions(+) | 15 | 2 files changed, 53 insertions(+), 1 deletion(-) |
14 | 16 | ||
15 | diff --git a/migration/colo.c b/migration/colo.c | 17 | diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h |
16 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/migration/colo.c | 19 | --- a/include/qemu/iova-tree.h |
18 | +++ b/migration/colo.c | 20 | +++ b/include/qemu/iova-tree.h |
19 | @@ -XXX,XX +XXX,XX @@ | 21 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map); |
20 | #include "net/colo.h" | 22 | * @tree: the iova tree to search from |
21 | #include "block/block.h" | 23 | * @map: the mapping to search |
22 | #include "qapi/qapi-events-migration.h" | 24 | * |
23 | +#include "qapi/qmp/qerror.h" | 25 | - * Search for a mapping in the iova tree that overlaps with the |
24 | 26 | + * Search for a mapping in the iova tree that iova overlaps with the | |
25 | static bool vmstate_loading; | 27 | * mapping range specified. Only the first found mapping will be |
26 | static Notifier packets_compare_notifier; | 28 | * returned. |
27 | @@ -XXX,XX +XXX,XX @@ void qmp_xen_colo_do_checkpoint(Error **errp) | 29 | * |
28 | #endif | 30 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map); |
31 | const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map); | ||
32 | |||
33 | /** | ||
34 | + * iova_tree_find_iova: | ||
35 | + * | ||
36 | + * @tree: the iova tree to search from | ||
37 | + * @map: the mapping to search | ||
38 | + * | ||
39 | + * Search for a mapping in the iova tree that translated_addr overlaps with the | ||
40 | + * mapping range specified. Only the first found mapping will be | ||
41 | + * returned. | ||
42 | + * | ||
43 | + * Return: DMAMap pointer if found, or NULL if not found. Note that | ||
44 | + * the returned DMAMap pointer is maintained internally. User should | ||
45 | + * only read the content but never modify or free the content. Also, | ||
46 | + * user is responsible to make sure the pointer is valid (say, no | ||
47 | + * concurrent deletion in progress). | ||
48 | + */ | ||
49 | +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map); | ||
50 | + | ||
51 | +/** | ||
52 | * iova_tree_find_address: | ||
53 | * | ||
54 | * @tree: the iova tree to search from | ||
55 | diff --git a/util/iova-tree.c b/util/iova-tree.c | ||
56 | index XXXXXXX..XXXXXXX 100644 | ||
57 | --- a/util/iova-tree.c | ||
58 | +++ b/util/iova-tree.c | ||
59 | @@ -XXX,XX +XXX,XX @@ struct IOVATreeAllocArgs { | ||
60 | bool iova_found; | ||
61 | }; | ||
62 | |||
63 | +typedef struct IOVATreeFindIOVAArgs { | ||
64 | + const DMAMap *needle; | ||
65 | + const DMAMap *result; | ||
66 | +} IOVATreeFindIOVAArgs; | ||
67 | + | ||
68 | /** | ||
69 | * Iterate args to the next hole | ||
70 | * | ||
71 | @@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map) | ||
72 | return g_tree_lookup(tree->tree, map); | ||
29 | } | 73 | } |
30 | 74 | ||
31 | +COLOStatus *qmp_query_colo_status(Error **errp) | 75 | +static gboolean iova_tree_find_address_iterator(gpointer key, gpointer value, |
76 | + gpointer data) | ||
32 | +{ | 77 | +{ |
33 | + COLOStatus *s = g_new0(COLOStatus, 1); | 78 | + const DMAMap *map = key; |
79 | + IOVATreeFindIOVAArgs *args = data; | ||
80 | + const DMAMap *needle; | ||
34 | + | 81 | + |
35 | + s->mode = get_colo_mode(); | 82 | + g_assert(key == value); |
36 | + | 83 | + |
37 | + switch (failover_get_state()) { | 84 | + needle = args->needle; |
38 | + case FAILOVER_STATUS_NONE: | 85 | + if (map->translated_addr + map->size < needle->translated_addr || |
39 | + s->reason = COLO_EXIT_REASON_NONE; | 86 | + needle->translated_addr + needle->size < map->translated_addr) { |
40 | + break; | 87 | + return false; |
41 | + case FAILOVER_STATUS_REQUIRE: | ||
42 | + s->reason = COLO_EXIT_REASON_REQUEST; | ||
43 | + break; | ||
44 | + default: | ||
45 | + s->reason = COLO_EXIT_REASON_ERROR; | ||
46 | + } | 88 | + } |
47 | + | 89 | + |
48 | + return s; | 90 | + args->result = map; |
91 | + return true; | ||
49 | +} | 92 | +} |
50 | + | 93 | + |
51 | static void colo_send_message(QEMUFile *f, COLOMessage msg, | 94 | +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map) |
52 | Error **errp) | 95 | +{ |
96 | + IOVATreeFindIOVAArgs args = { | ||
97 | + .needle = map, | ||
98 | + }; | ||
99 | + | ||
100 | + g_tree_foreach(tree->tree, iova_tree_find_address_iterator, &args); | ||
101 | + return args.result; | ||
102 | +} | ||
103 | + | ||
104 | const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova) | ||
53 | { | 105 | { |
54 | diff --git a/qapi/migration.json b/qapi/migration.json | 106 | const DMAMap map = { .iova = iova, .size = 0 }; |
55 | index XXXXXXX..XXXXXXX 100644 | ||
56 | --- a/qapi/migration.json | ||
57 | +++ b/qapi/migration.json | ||
58 | @@ -XXX,XX +XXX,XX @@ | ||
59 | { 'command': 'xen-colo-do-checkpoint' } | ||
60 | |||
61 | ## | ||
62 | +# @COLOStatus: | ||
63 | +# | ||
64 | +# The result format for 'query-colo-status'. | ||
65 | +# | ||
66 | +# @mode: COLO running mode. If COLO is running, this field will return | ||
67 | +# 'primary' or 'secondary'. | ||
68 | +# | ||
69 | +# @reason: describes the reason for the COLO exit. | ||
70 | +# | ||
71 | +# Since: 3.0 | ||
72 | +## | ||
73 | +{ 'struct': 'COLOStatus', | ||
74 | + 'data': { 'mode': 'COLOMode', 'reason': 'COLOExitReason' } } | ||
75 | + | ||
76 | +## | ||
77 | +# @query-colo-status: | ||
78 | +# | ||
79 | +# Query COLO status while the vm is running. | ||
80 | +# | ||
81 | +# Returns: A @COLOStatus object showing the status. | ||
82 | +# | ||
83 | +# Example: | ||
84 | +# | ||
85 | +# -> { "execute": "query-colo-status" } | ||
86 | +# <- { "return": { "mode": "primary", "active": true, "reason": "request" } } | ||
87 | +# | ||
88 | +# Since: 3.0 | ||
89 | +## | ||
90 | +{ 'command': 'query-colo-status', | ||
91 | + 'returns': 'COLOStatus' } | ||
92 | + | ||
93 | +## | ||
94 | # @migrate-recover: | ||
95 | # | ||
96 | # Provide a recovery migration stream URI. | ||
97 | -- | 107 | -- |
98 | 2.5.0 | 108 | 2.7.4 |
99 | 109 | ||
100 | 110 | diff view generated by jsdifflib |
1 | From: Zhang Chen <zhangckid@gmail.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | After one round of checkpoint, the states between PVM and SVM | 3 | This tree is able to look for a translated address from an IOVA address. |
4 | become consistent, so it is unnecessary to adjust the sequence | ||
5 | of net packets for old connections, besides, while failover | ||
6 | happens, filter-rewriter will into failover mode that needn't | ||
7 | handle the new TCP connection. | ||
8 | 4 | ||
9 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | 5 | At first glance it is similar to util/iova-tree. However, SVQ working on |
10 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | 6 | devices with limited IOVA space need more capabilities, like allocating |
11 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | 7 | IOVA chunks or performing reverse translations (qemu addresses to iova). |
8 | |||
9 | The allocation capability, as "assign a free IOVA address to this chunk | ||
10 | of memory in qemu's address space" allows shadow virtqueue to create a | ||
11 | new address space that is not restricted by guest's addressable one, so | ||
12 | we can allocate shadow vqs vrings outside of it. | ||
13 | |||
14 | It duplicates the tree so it can search efficiently in both directions, | ||
15 | and it will signal overlap if iova or the translated address is present | ||
16 | in any tree. | ||
17 | |||
18 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
19 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
12 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 20 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
13 | --- | 21 | --- |
14 | net/colo-compare.c | 12 +++++------ | 22 | hw/virtio/meson.build | 2 +- |
15 | net/colo.c | 8 ++++++++ | 23 | hw/virtio/vhost-iova-tree.c | 110 ++++++++++++++++++++++++++++++++++++++++++++ |
16 | net/colo.h | 2 ++ | 24 | hw/virtio/vhost-iova-tree.h | 27 +++++++++++ |
17 | net/filter-rewriter.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++ | 25 | 3 files changed, 138 insertions(+), 1 deletion(-) |
18 | 4 files changed, 73 insertions(+), 6 deletions(-) | 26 | create mode 100644 hw/virtio/vhost-iova-tree.c |
27 | create mode 100644 hw/virtio/vhost-iova-tree.h | ||
19 | 28 | ||
20 | diff --git a/net/colo-compare.c b/net/colo-compare.c | 29 | diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build |
21 | index XXXXXXX..XXXXXXX 100644 | 30 | index XXXXXXX..XXXXXXX 100644 |
22 | --- a/net/colo-compare.c | 31 | --- a/hw/virtio/meson.build |
23 | +++ b/net/colo-compare.c | 32 | +++ b/hw/virtio/meson.build |
24 | @@ -XXX,XX +XXX,XX @@ enum { | 33 | @@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) |
25 | SECONDARY_IN, | 34 | |
26 | }; | 35 | virtio_ss = ss.source_set() |
27 | 36 | virtio_ss.add(files('virtio.c')) | |
28 | +static void colo_compare_inconsistency_notify(void) | 37 | -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c')) |
38 | +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c')) | ||
39 | virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) | ||
40 | virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) | ||
41 | virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) | ||
42 | diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c | ||
43 | new file mode 100644 | ||
44 | index XXXXXXX..XXXXXXX | ||
45 | --- /dev/null | ||
46 | +++ b/hw/virtio/vhost-iova-tree.c | ||
47 | @@ -XXX,XX +XXX,XX @@ | ||
48 | +/* | ||
49 | + * vhost software live migration iova tree | ||
50 | + * | ||
51 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 | ||
52 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
53 | + * | ||
54 | + * SPDX-License-Identifier: GPL-2.0-or-later | ||
55 | + */ | ||
56 | + | ||
57 | +#include "qemu/osdep.h" | ||
58 | +#include "qemu/iova-tree.h" | ||
59 | +#include "vhost-iova-tree.h" | ||
60 | + | ||
61 | +#define iova_min_addr qemu_real_host_page_size | ||
62 | + | ||
63 | +/** | ||
64 | + * VhostIOVATree, able to: | ||
65 | + * - Translate iova address | ||
66 | + * - Reverse translate iova address (from translated to iova) | ||
67 | + * - Allocate IOVA regions for translated range (linear operation) | ||
68 | + */ | ||
69 | +struct VhostIOVATree { | ||
70 | + /* First addressable iova address in the device */ | ||
71 | + uint64_t iova_first; | ||
72 | + | ||
73 | + /* Last addressable iova address in the device */ | ||
74 | + uint64_t iova_last; | ||
75 | + | ||
76 | + /* IOVA address to qemu memory maps. */ | ||
77 | + IOVATree *iova_taddr_map; | ||
78 | +}; | ||
79 | + | ||
80 | +/** | ||
81 | + * Create a new IOVA tree | ||
82 | + * | ||
83 | + * Returns the new IOVA tree | ||
84 | + */ | ||
85 | +VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last) | ||
29 | +{ | 86 | +{ |
30 | + notifier_list_notify(&colo_compare_notifiers, | 87 | + VhostIOVATree *tree = g_new(VhostIOVATree, 1); |
31 | + migrate_get_current()); | 88 | + |
89 | + /* Some devices do not like 0 addresses */ | ||
90 | + tree->iova_first = MAX(iova_first, iova_min_addr); | ||
91 | + tree->iova_last = iova_last; | ||
92 | + | ||
93 | + tree->iova_taddr_map = iova_tree_new(); | ||
94 | + return tree; | ||
32 | +} | 95 | +} |
33 | + | 96 | + |
34 | static int compare_chr_send(CompareState *s, | 97 | +/** |
35 | const uint8_t *buf, | 98 | + * Delete an iova tree |
36 | uint32_t size, | 99 | + */ |
37 | @@ -XXX,XX +XXX,XX @@ static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt, | 100 | +void vhost_iova_tree_delete(VhostIOVATree *iova_tree) |
38 | return false; | ||
39 | } | ||
40 | |||
41 | -static void colo_compare_inconsistency_notify(void) | ||
42 | -{ | ||
43 | - notifier_list_notify(&colo_compare_notifiers, | ||
44 | - migrate_get_current()); | ||
45 | -} | ||
46 | - | ||
47 | static void colo_compare_tcp(CompareState *s, Connection *conn) | ||
48 | { | ||
49 | Packet *ppkt = NULL, *spkt = NULL; | ||
50 | diff --git a/net/colo.c b/net/colo.c | ||
51 | index XXXXXXX..XXXXXXX 100644 | ||
52 | --- a/net/colo.c | ||
53 | +++ b/net/colo.c | ||
54 | @@ -XXX,XX +XXX,XX @@ Connection *connection_get(GHashTable *connection_track_table, | ||
55 | |||
56 | return conn; | ||
57 | } | ||
58 | + | ||
59 | +bool connection_has_tracked(GHashTable *connection_track_table, | ||
60 | + ConnectionKey *key) | ||
61 | +{ | 101 | +{ |
62 | + Connection *conn = g_hash_table_lookup(connection_track_table, key); | 102 | + iova_tree_destroy(iova_tree->iova_taddr_map); |
63 | + | 103 | + g_free(iova_tree); |
64 | + return conn ? true : false; | ||
65 | +} | ||
66 | diff --git a/net/colo.h b/net/colo.h | ||
67 | index XXXXXXX..XXXXXXX 100644 | ||
68 | --- a/net/colo.h | ||
69 | +++ b/net/colo.h | ||
70 | @@ -XXX,XX +XXX,XX @@ void connection_destroy(void *opaque); | ||
71 | Connection *connection_get(GHashTable *connection_track_table, | ||
72 | ConnectionKey *key, | ||
73 | GQueue *conn_list); | ||
74 | +bool connection_has_tracked(GHashTable *connection_track_table, | ||
75 | + ConnectionKey *key); | ||
76 | void connection_hashtable_reset(GHashTable *connection_track_table); | ||
77 | Packet *packet_new(const void *data, int size, int vnet_hdr_len); | ||
78 | void packet_destroy(void *opaque, void *user_data); | ||
79 | diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c | ||
80 | index XXXXXXX..XXXXXXX 100644 | ||
81 | --- a/net/filter-rewriter.c | ||
82 | +++ b/net/filter-rewriter.c | ||
83 | @@ -XXX,XX +XXX,XX @@ | ||
84 | #include "qemu/main-loop.h" | ||
85 | #include "qemu/iov.h" | ||
86 | #include "net/checksum.h" | ||
87 | +#include "net/colo.h" | ||
88 | +#include "migration/colo.h" | ||
89 | |||
90 | #define FILTER_COLO_REWRITER(obj) \ | ||
91 | OBJECT_CHECK(RewriterState, (obj), TYPE_FILTER_REWRITER) | ||
92 | |||
93 | #define TYPE_FILTER_REWRITER "filter-rewriter" | ||
94 | +#define FAILOVER_MODE_ON true | ||
95 | +#define FAILOVER_MODE_OFF false | ||
96 | |||
97 | typedef struct RewriterState { | ||
98 | NetFilterState parent_obj; | ||
99 | @@ -XXX,XX +XXX,XX @@ typedef struct RewriterState { | ||
100 | /* hashtable to save connection */ | ||
101 | GHashTable *connection_track_table; | ||
102 | bool vnet_hdr; | ||
103 | + bool failover_mode; | ||
104 | } RewriterState; | ||
105 | |||
106 | +static void filter_rewriter_failover_mode(RewriterState *s) | ||
107 | +{ | ||
108 | + s->failover_mode = FAILOVER_MODE_ON; | ||
109 | +} | 104 | +} |
110 | + | 105 | + |
111 | static void filter_rewriter_flush(NetFilterState *nf) | 106 | +/** |
112 | { | 107 | + * Find the IOVA address stored from a memory address |
113 | RewriterState *s = FILTER_COLO_REWRITER(nf); | 108 | + * |
114 | @@ -XXX,XX +XXX,XX @@ static ssize_t colo_rewriter_receive_iov(NetFilterState *nf, | 109 | + * @tree: The iova tree |
115 | */ | 110 | + * @map: The map with the memory address |
116 | reverse_connection_key(&key); | 111 | + * |
117 | } | 112 | + * Return the stored mapping, or NULL if not found. |
118 | + | 113 | + */ |
119 | + /* After failover we needn't change new TCP packet */ | 114 | +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree, |
120 | + if (s->failover_mode && | 115 | + const DMAMap *map) |
121 | + !connection_has_tracked(s->connection_track_table, &key)) { | ||
122 | + goto out; | ||
123 | + } | ||
124 | + | ||
125 | conn = connection_get(s->connection_track_table, | ||
126 | &key, | ||
127 | NULL); | ||
128 | @@ -XXX,XX +XXX,XX @@ static ssize_t colo_rewriter_receive_iov(NetFilterState *nf, | ||
129 | } | ||
130 | } | ||
131 | |||
132 | +out: | ||
133 | packet_destroy(pkt, NULL); | ||
134 | pkt = NULL; | ||
135 | return 0; | ||
136 | } | ||
137 | |||
138 | +static void reset_seq_offset(gpointer key, gpointer value, gpointer user_data) | ||
139 | +{ | 116 | +{ |
140 | + Connection *conn = (Connection *)value; | 117 | + return iova_tree_find_iova(tree->iova_taddr_map, map); |
141 | + | ||
142 | + conn->offset = 0; | ||
143 | +} | 118 | +} |
144 | + | 119 | + |
145 | +static gboolean offset_is_nonzero(gpointer key, | 120 | +/** |
146 | + gpointer value, | 121 | + * Allocate a new mapping |
147 | + gpointer user_data) | 122 | + * |
123 | + * @tree: The iova tree | ||
124 | + * @map: The iova map | ||
125 | + * | ||
126 | + * Returns: | ||
127 | + * - IOVA_OK if the map fits in the container | ||
128 | + * - IOVA_ERR_INVALID if the map does not make sense (like size overflow) | ||
129 | + * - IOVA_ERR_NOMEM if tree cannot allocate more space. | ||
130 | + * | ||
131 | + * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK. | ||
132 | + */ | ||
133 | +int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map) | ||
148 | +{ | 134 | +{ |
149 | + Connection *conn = (Connection *)value; | 135 | + /* Some vhost devices do not like addr 0. Skip first page */ |
136 | + hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size; | ||
150 | + | 137 | + |
151 | + return conn->offset ? true : false; | 138 | + if (map->translated_addr + map->size < map->translated_addr || |
139 | + map->perm == IOMMU_NONE) { | ||
140 | + return IOVA_ERR_INVALID; | ||
141 | + } | ||
142 | + | ||
143 | + /* Allocate a node in IOVA address */ | ||
144 | + return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first, | ||
145 | + tree->iova_last); | ||
152 | +} | 146 | +} |
153 | + | 147 | + |
154 | +static void colo_rewriter_handle_event(NetFilterState *nf, int event, | 148 | +/** |
155 | + Error **errp) | 149 | + * Remove existing mappings from iova tree |
150 | + * | ||
151 | + * @iova_tree: The vhost iova tree | ||
152 | + * @map: The map to remove | ||
153 | + */ | ||
154 | +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map) | ||
156 | +{ | 155 | +{ |
157 | + RewriterState *rs = FILTER_COLO_REWRITER(nf); | 156 | + iova_tree_remove(iova_tree->iova_taddr_map, map); |
157 | +} | ||
158 | diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h | ||
159 | new file mode 100644 | ||
160 | index XXXXXXX..XXXXXXX | ||
161 | --- /dev/null | ||
162 | +++ b/hw/virtio/vhost-iova-tree.h | ||
163 | @@ -XXX,XX +XXX,XX @@ | ||
164 | +/* | ||
165 | + * vhost software live migration iova tree | ||
166 | + * | ||
167 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 | ||
168 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
169 | + * | ||
170 | + * SPDX-License-Identifier: GPL-2.0-or-later | ||
171 | + */ | ||
158 | + | 172 | + |
159 | + switch (event) { | 173 | +#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H |
160 | + case COLO_EVENT_CHECKPOINT: | 174 | +#define HW_VIRTIO_VHOST_IOVA_TREE_H |
161 | + g_hash_table_foreach(rs->connection_track_table, | ||
162 | + reset_seq_offset, NULL); | ||
163 | + break; | ||
164 | + case COLO_EVENT_FAILOVER: | ||
165 | + if (!g_hash_table_find(rs->connection_track_table, | ||
166 | + offset_is_nonzero, NULL)) { | ||
167 | + filter_rewriter_failover_mode(rs); | ||
168 | + } | ||
169 | + break; | ||
170 | + default: | ||
171 | + break; | ||
172 | + } | ||
173 | +} | ||
174 | + | 175 | + |
175 | static void colo_rewriter_cleanup(NetFilterState *nf) | 176 | +#include "qemu/iova-tree.h" |
176 | { | 177 | +#include "exec/memory.h" |
177 | RewriterState *s = FILTER_COLO_REWRITER(nf); | 178 | + |
178 | @@ -XXX,XX +XXX,XX @@ static void filter_rewriter_init(Object *obj) | 179 | +typedef struct VhostIOVATree VhostIOVATree; |
179 | RewriterState *s = FILTER_COLO_REWRITER(obj); | 180 | + |
180 | 181 | +VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last); | |
181 | s->vnet_hdr = false; | 182 | +void vhost_iova_tree_delete(VhostIOVATree *iova_tree); |
182 | + s->failover_mode = FAILOVER_MODE_OFF; | 183 | +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete); |
183 | object_property_add_bool(obj, "vnet_hdr_support", | 184 | + |
184 | filter_rewriter_get_vnet_hdr, | 185 | +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree, |
185 | filter_rewriter_set_vnet_hdr, NULL); | 186 | + const DMAMap *map); |
186 | @@ -XXX,XX +XXX,XX @@ static void colo_rewriter_class_init(ObjectClass *oc, void *data) | 187 | +int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map); |
187 | nfc->setup = colo_rewriter_setup; | 188 | +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map); |
188 | nfc->cleanup = colo_rewriter_cleanup; | 189 | + |
189 | nfc->receive_iov = colo_rewriter_receive_iov; | 190 | +#endif |
190 | + nfc->handle_event = colo_rewriter_handle_event; | ||
191 | } | ||
192 | |||
193 | static const TypeInfo colo_rewriter_info = { | ||
194 | -- | 191 | -- |
195 | 2.5.0 | 192 | 2.7.4 |
196 | 193 | ||
197 | 194 | diff view generated by jsdifflib |
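The vhost_iova_tree_map_alloc() contract above (IOVA_OK / IOVA_ERR_INVALID / IOVA_ERR_NOMEM, with the result reported in map->iova) is easiest to see on a toy allocator. The standalone sketch below only illustrates the first-fit idea over a sorted array; the real code delegates the search to iova_tree_alloc_map() over a GTree and additionally rejects IOMMU_NONE and overflowing mappings, and the constants here are invented for the example.

    /* Toy first-fit IOVA allocator: find a hole of m->size + 1 bytes in
     * [first, last], given already-allocated maps sorted by iova. */
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t iova, size; } Map;   /* size is length - 1, as in DMAMap */

    enum { IOVA_OK = 0, IOVA_ERR_INVALID = -1, IOVA_ERR_NOMEM = -2 };

    static int map_alloc(const Map *maps, int n, Map *m,
                         uint64_t first, uint64_t last)
    {
        uint64_t hole = first;

        if (m->size > last - first) {
            return IOVA_ERR_INVALID;               /* cannot possibly fit */
        }
        for (int i = 0; i <= n; i++) {
            /* End of the current hole: the next mapping, or one past
             * iova_last (the sketch assumes last < UINT64_MAX). */
            uint64_t next = (i == n) ? last + 1 : maps[i].iova;

            if (next - hole > m->size) {
                m->iova = hole;                    /* result reported in m->iova */
                return IOVA_OK;
            }
            if (i < n) {
                hole = maps[i].iova + maps[i].size + 1;
            }
        }
        return IOVA_ERR_NOMEM;
    }

    int main(void)
    {
        Map used[] = { { 0x1000, 0xfff } };        /* one 4 KiB page already mapped */
        Map m = { .iova = 0, .size = 0xfff };      /* ask for another 4 KiB page    */

        if (map_alloc(used, 1, &m, 0x1000, 0xffffff) == IOVA_OK) {
            printf("allocated iova 0x%" PRIx64 "\n", m.iova);   /* prints 0x2000 */
        }
        return 0;
    }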
1 | From: Zhang Chen <zhangckid@gmail.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | While the VM is running, PVM may dirty some pages; we will transfer | 3 | Use translations added in VhostIOVATree in SVQ. |
4 | PVM's dirty pages to SVM and store them into SVM's RAM cache at next checkpoint | 4 | |
5 | time. So, the content of SVM's RAM cache will always be the same as PVM's memory | 5 | Only introduce usage here, not allocation and deallocation. As with |
6 | after checkpoint. | 6 | previous patches, we use the dead code paths of shadow_vqs_enabled to |
7 | 7 | avoid committing too many changes at once. These are impossible to take | |
8 | Instead of flushing all content of PVM's RAM cache into SVM's memory, | 8 | at the moment. |
9 | we do this in a more efficient way: | 9 | |
10 | Only flush any page dirtied by PVM since the last checkpoint. | 10 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
11 | In this way, we can ensure SVM's memory is the same as PVM's. | 11 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
12 | |||
13 | Besides, we must ensure the RAM cache is flushed before loading device state. | ||
14 | |||
15 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
16 | Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com> | ||
17 | Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> | ||
18 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 12 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
19 | --- | 13 | --- |
20 | migration/ram.c | 37 +++++++++++++++++++++++++++++++++++++ | 14 | hw/virtio/vhost-shadow-virtqueue.c | 86 +++++++++++++++++++++++--- |
21 | migration/trace-events | 2 ++ | 15 | hw/virtio/vhost-shadow-virtqueue.h | 6 +- |
22 | 2 files changed, 39 insertions(+) | 16 | hw/virtio/vhost-vdpa.c | 122 +++++++++++++++++++++++++++++++------ |
23 | 17 | include/hw/virtio/vhost-vdpa.h | 3 + | |
24 | diff --git a/migration/ram.c b/migration/ram.c | 18 | 4 files changed, 187 insertions(+), 30 deletions(-) |
19 | |||
20 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
25 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/migration/ram.c | 22 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
27 | +++ b/migration/ram.c | 23 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
28 | @@ -XXX,XX +XXX,XX @@ static bool postcopy_is_running(void) | 24 | @@ -XXX,XX +XXX,XX @@ static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) |
29 | return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; | 25 | return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); |
30 | } | 26 | } |
31 | 27 | ||
32 | +/* | 28 | -static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, |
33 | + * Flush content of RAM cache into SVM's memory. | 29 | +/** |
34 | + * Only flush the pages that be dirtied by PVM or SVM or both. | 30 | + * Translate addresses between the qemu's virtual address and the SVQ IOVA |
31 | + * | ||
32 | + * @svq: Shadow VirtQueue | ||
33 | + * @vaddr: Translated IOVA addresses | ||
34 | + * @iovec: Source qemu's VA addresses | ||
35 | + * @num: Length of iovec and minimum length of vaddr | ||
35 | + */ | 36 | + */ |
36 | +static void colo_flush_ram_cache(void) | 37 | +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, |
38 | + hwaddr *addrs, const struct iovec *iovec, | ||
39 | + size_t num) | ||
37 | +{ | 40 | +{ |
38 | + RAMBlock *block = NULL; | 41 | + if (num == 0) { |
39 | + void *dst_host; | 42 | + return true; |
40 | + void *src_host; | 43 | + } |
41 | + unsigned long offset = 0; | 44 | + |
42 | + | 45 | + for (size_t i = 0; i < num; ++i) { |
43 | + trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); | 46 | + DMAMap needle = { |
44 | + rcu_read_lock(); | 47 | + .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base, |
45 | + block = QLIST_FIRST_RCU(&ram_list.blocks); | 48 | + .size = iovec[i].iov_len, |
46 | + | 49 | + }; |
47 | + while (block) { | 50 | + Int128 needle_last, map_last; |
48 | + offset = migration_bitmap_find_dirty(ram_state, block, offset); | 51 | + size_t off; |
49 | + | 52 | + |
50 | + if (offset << TARGET_PAGE_BITS >= block->used_length) { | 53 | + const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle); |
51 | + offset = 0; | 54 | + /* |
52 | + block = QLIST_NEXT_RCU(block, next); | 55 | + * Map cannot be NULL since iova map contains all guest space and |
53 | + } else { | 56 | + * qemu already has a physical address mapped |
54 | + migration_bitmap_clear_dirty(ram_state, block, offset); | 57 | + */ |
55 | + dst_host = block->host + (offset << TARGET_PAGE_BITS); | 58 | + if (unlikely(!map)) { |
56 | + src_host = block->colo_cache + (offset << TARGET_PAGE_BITS); | 59 | + qemu_log_mask(LOG_GUEST_ERROR, |
57 | + memcpy(dst_host, src_host, TARGET_PAGE_SIZE); | 60 | + "Invalid address 0x%"HWADDR_PRIx" given by guest", |
61 | + needle.translated_addr); | ||
62 | + return false; | ||
58 | + } | 63 | + } |
59 | + } | 64 | + |
60 | + | 65 | + off = needle.translated_addr - map->translated_addr; |
61 | + rcu_read_unlock(); | 66 | + addrs[i] = map->iova + off; |
62 | + trace_colo_flush_ram_cache_end(); | 67 | + |
68 | + needle_last = int128_add(int128_make64(needle.translated_addr), | ||
69 | + int128_make64(iovec[i].iov_len)); | ||
70 | + map_last = int128_make64(map->translated_addr + map->size); | ||
71 | + if (unlikely(int128_gt(needle_last, map_last))) { | ||
72 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
73 | + "Guest buffer expands over iova range"); | ||
74 | + return false; | ||
75 | + } | ||
76 | + } | ||
77 | + | ||
78 | + return true; | ||
63 | +} | 79 | +} |
64 | + | 80 | + |
65 | static int ram_load(QEMUFile *f, void *opaque, int version_id) | 81 | +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, |
66 | { | 82 | const struct iovec *iovec, size_t num, |
67 | int flags = 0, ret = 0, invalid_flags = 0; | 83 | bool more_descs, bool write) |
68 | @@ -XXX,XX +XXX,XX @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) | 84 | { |
69 | ret |= wait_for_decompress_done(); | 85 | @@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, |
70 | rcu_read_unlock(); | 86 | } else { |
71 | trace_ram_load_complete(ret, seq_iter); | 87 | descs[i].flags = flags; |
72 | + | 88 | } |
73 | + if (!ret && migration_incoming_in_colo_state()) { | 89 | - descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base); |
74 | + colo_flush_ram_cache(); | 90 | + descs[i].addr = cpu_to_le64(sg[n]); |
75 | + } | 91 | descs[i].len = cpu_to_le32(iovec[n].iov_len); |
76 | return ret; | 92 | |
93 | last = i; | ||
94 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
95 | { | ||
96 | unsigned avail_idx; | ||
97 | vring_avail_t *avail = svq->vring.avail; | ||
98 | + bool ok; | ||
99 | + g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num)); | ||
100 | |||
101 | *head = svq->free_head; | ||
102 | |||
103 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
104 | return false; | ||
105 | } | ||
106 | |||
107 | - vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0, | ||
108 | - false); | ||
109 | - vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); | ||
110 | + ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num); | ||
111 | + if (unlikely(!ok)) { | ||
112 | + return false; | ||
113 | + } | ||
114 | + vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, | ||
115 | + elem->in_num > 0, false); | ||
116 | + | ||
117 | + | ||
118 | + ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num); | ||
119 | + if (unlikely(!ok)) { | ||
120 | + return false; | ||
121 | + } | ||
122 | + | ||
123 | + vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true); | ||
124 | |||
125 | /* | ||
126 | * Put the entry in the available array (but don't update avail->idx until | ||
127 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) | ||
128 | void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
129 | struct vhost_vring_addr *addr) | ||
130 | { | ||
131 | - addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc; | ||
132 | - addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail; | ||
133 | - addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used; | ||
134 | + addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc; | ||
135 | + addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail; | ||
136 | + addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used; | ||
77 | } | 137 | } |
78 | 138 | ||
79 | diff --git a/migration/trace-events b/migration/trace-events | 139 | size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq) |
140 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
141 | * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
142 | * shadow methods and file descriptors. | ||
143 | * | ||
144 | + * @iova_tree: Tree to perform descriptors translations | ||
145 | + * | ||
146 | * Returns the new virtqueue or NULL. | ||
147 | * | ||
148 | * In case of error, reason is reported through error_report. | ||
149 | */ | ||
150 | -VhostShadowVirtqueue *vhost_svq_new(void) | ||
151 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree) | ||
152 | { | ||
153 | g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); | ||
154 | int r; | ||
155 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
156 | |||
157 | event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); | ||
158 | event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); | ||
159 | + svq->iova_tree = iova_tree; | ||
160 | return g_steal_pointer(&svq); | ||
161 | |||
162 | err_init_hdev_call: | ||
163 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
80 | index XXXXXXX..XXXXXXX 100644 | 164 | index XXXXXXX..XXXXXXX 100644 |
81 | --- a/migration/trace-events | 165 | --- a/hw/virtio/vhost-shadow-virtqueue.h |
82 | +++ b/migration/trace-events | 166 | +++ b/hw/virtio/vhost-shadow-virtqueue.h |
83 | @@ -XXX,XX +XXX,XX @@ ram_dirty_bitmap_sync_start(void) "" | 167 | @@ -XXX,XX +XXX,XX @@ |
84 | ram_dirty_bitmap_sync_wait(void) "" | 168 | #include "qemu/event_notifier.h" |
85 | ram_dirty_bitmap_sync_complete(void) "" | 169 | #include "hw/virtio/virtio.h" |
86 | ram_state_resume_prepare(uint64_t v) "%" PRId64 | 170 | #include "standard-headers/linux/vhost_types.h" |
87 | +colo_flush_ram_cache_begin(uint64_t dirty_pages) "dirty_pages %" PRIu64 | 171 | +#include "hw/virtio/vhost-iova-tree.h" |
88 | +colo_flush_ram_cache_end(void) "" | 172 | |
89 | 173 | /* Shadow virtqueue to relay notifications */ | |
90 | # migration/migration.c | 174 | typedef struct VhostShadowVirtqueue { |
91 | await_return_path_close_on_source_close(void) "" | 175 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { |
176 | /* Virtio device */ | ||
177 | VirtIODevice *vdev; | ||
178 | |||
179 | + /* IOVA mapping */ | ||
180 | + VhostIOVATree *iova_tree; | ||
181 | + | ||
182 | /* Map for use the guest's descriptors */ | ||
183 | VirtQueueElement **ring_id_maps; | ||
184 | |||
185 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
186 | VirtQueue *vq); | ||
187 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
188 | |||
189 | -VhostShadowVirtqueue *vhost_svq_new(void); | ||
190 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree); | ||
191 | |||
192 | void vhost_svq_free(gpointer vq); | ||
193 | G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); | ||
194 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
195 | index XXXXXXX..XXXXXXX 100644 | ||
196 | --- a/hw/virtio/vhost-vdpa.c | ||
197 | +++ b/hw/virtio/vhost-vdpa.c | ||
198 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, | ||
199 | vaddr, section->readonly); | ||
200 | |||
201 | llsize = int128_sub(llend, int128_make64(iova)); | ||
202 | + if (v->shadow_vqs_enabled) { | ||
203 | + DMAMap mem_region = { | ||
204 | + .translated_addr = (hwaddr)(uintptr_t)vaddr, | ||
205 | + .size = int128_get64(llsize) - 1, | ||
206 | + .perm = IOMMU_ACCESS_FLAG(true, section->readonly), | ||
207 | + }; | ||
208 | + | ||
209 | + int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region); | ||
210 | + if (unlikely(r != IOVA_OK)) { | ||
211 | + error_report("Can't allocate a mapping (%d)", r); | ||
212 | + goto fail; | ||
213 | + } | ||
214 | + | ||
215 | + iova = mem_region.iova; | ||
216 | + } | ||
217 | |||
218 | vhost_vdpa_iotlb_batch_begin_once(v); | ||
219 | ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize), | ||
220 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener, | ||
221 | |||
222 | llsize = int128_sub(llend, int128_make64(iova)); | ||
223 | |||
224 | + if (v->shadow_vqs_enabled) { | ||
225 | + const DMAMap *result; | ||
226 | + const void *vaddr = memory_region_get_ram_ptr(section->mr) + | ||
227 | + section->offset_within_region + | ||
228 | + (iova - section->offset_within_address_space); | ||
229 | + DMAMap mem_region = { | ||
230 | + .translated_addr = (hwaddr)(uintptr_t)vaddr, | ||
231 | + .size = int128_get64(llsize) - 1, | ||
232 | + }; | ||
233 | + | ||
234 | + result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region); | ||
235 | + iova = result->iova; | ||
236 | + vhost_iova_tree_remove(v->iova_tree, &mem_region); | ||
237 | + } | ||
238 | vhost_vdpa_iotlb_batch_begin_once(v); | ||
239 | ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize)); | ||
240 | if (ret) { | ||
241 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
242 | |||
243 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
244 | for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
245 | - g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
246 | + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree); | ||
247 | |||
248 | if (unlikely(!svq)) { | ||
249 | error_setg(errp, "Cannot create svq %u", n); | ||
250 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, | ||
251 | /** | ||
252 | * Unmap a SVQ area in the device | ||
253 | */ | ||
254 | -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, | ||
255 | - hwaddr size) | ||
256 | +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, | ||
257 | + const DMAMap *needle) | ||
258 | { | ||
259 | + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle); | ||
260 | + hwaddr size; | ||
261 | int r; | ||
262 | |||
263 | - size = ROUND_UP(size, qemu_real_host_page_size); | ||
264 | - r = vhost_vdpa_dma_unmap(v, iova, size); | ||
265 | + if (unlikely(!result)) { | ||
266 | + error_report("Unable to find SVQ address to unmap"); | ||
267 | + return false; | ||
268 | + } | ||
269 | + | ||
270 | + size = ROUND_UP(result->size, qemu_real_host_page_size); | ||
271 | + r = vhost_vdpa_dma_unmap(v, result->iova, size); | ||
272 | return r == 0; | ||
273 | } | ||
274 | |||
275 | static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, | ||
276 | const VhostShadowVirtqueue *svq) | ||
277 | { | ||
278 | + DMAMap needle = {}; | ||
279 | struct vhost_vdpa *v = dev->opaque; | ||
280 | struct vhost_vring_addr svq_addr; | ||
281 | - size_t device_size = vhost_svq_device_area_size(svq); | ||
282 | - size_t driver_size = vhost_svq_driver_area_size(svq); | ||
283 | bool ok; | ||
284 | |||
285 | vhost_svq_get_vring_addr(svq, &svq_addr); | ||
286 | |||
287 | - ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size); | ||
288 | + needle.translated_addr = svq_addr.desc_user_addr; | ||
289 | + ok = vhost_vdpa_svq_unmap_ring(v, &needle); | ||
290 | if (unlikely(!ok)) { | ||
291 | return false; | ||
292 | } | ||
293 | |||
294 | - return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size); | ||
295 | + needle.translated_addr = svq_addr.used_user_addr; | ||
296 | + return vhost_vdpa_svq_unmap_ring(v, &needle); | ||
297 | +} | ||
298 | + | ||
299 | +/** | ||
300 | + * Map the SVQ area in the device | ||
301 | + * | ||
302 | + * @v: Vhost-vdpa device | ||
303 | + * @needle: The area to search iova | ||
304 | + * @errorp: Error pointer | ||
305 | + */ | ||
306 | +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle, | ||
307 | + Error **errp) | ||
308 | +{ | ||
309 | + int r; | ||
310 | + | ||
311 | + r = vhost_iova_tree_map_alloc(v->iova_tree, needle); | ||
312 | + if (unlikely(r != IOVA_OK)) { | ||
313 | + error_setg(errp, "Cannot allocate iova (%d)", r); | ||
314 | + return false; | ||
315 | + } | ||
316 | + | ||
317 | + r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1, | ||
318 | + (void *)(uintptr_t)needle->translated_addr, | ||
319 | + needle->perm == IOMMU_RO); | ||
320 | + if (unlikely(r != 0)) { | ||
321 | + error_setg_errno(errp, -r, "Cannot map region to device"); | ||
322 | + vhost_iova_tree_remove(v->iova_tree, needle); | ||
323 | + } | ||
324 | + | ||
325 | + return r == 0; | ||
326 | } | ||
327 | |||
328 | /** | ||
329 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, | ||
330 | struct vhost_vring_addr *addr, | ||
331 | Error **errp) | ||
332 | { | ||
333 | + DMAMap device_region, driver_region; | ||
334 | + struct vhost_vring_addr svq_addr; | ||
335 | struct vhost_vdpa *v = dev->opaque; | ||
336 | size_t device_size = vhost_svq_device_area_size(svq); | ||
337 | size_t driver_size = vhost_svq_driver_area_size(svq); | ||
338 | - int r; | ||
339 | + size_t avail_offset; | ||
340 | + bool ok; | ||
341 | |||
342 | ERRP_GUARD(); | ||
343 | - vhost_svq_get_vring_addr(svq, addr); | ||
344 | + vhost_svq_get_vring_addr(svq, &svq_addr); | ||
345 | |||
346 | - r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, | ||
347 | - (void *)(uintptr_t)addr->desc_user_addr, true); | ||
348 | - if (unlikely(r != 0)) { | ||
349 | - error_setg_errno(errp, -r, "Cannot create vq driver region: "); | ||
350 | + driver_region = (DMAMap) { | ||
351 | + .translated_addr = svq_addr.desc_user_addr, | ||
352 | + .size = driver_size - 1, | ||
353 | + .perm = IOMMU_RO, | ||
354 | + }; | ||
355 | + ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp); | ||
356 | + if (unlikely(!ok)) { | ||
357 | + error_prepend(errp, "Cannot create vq driver region: "); | ||
358 | return false; | ||
359 | } | ||
360 | + addr->desc_user_addr = driver_region.iova; | ||
361 | + avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr; | ||
362 | + addr->avail_user_addr = driver_region.iova + avail_offset; | ||
363 | |||
364 | - r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, | ||
365 | - (void *)(intptr_t)addr->used_user_addr, false); | ||
366 | - if (unlikely(r != 0)) { | ||
367 | - error_setg_errno(errp, -r, "Cannot create vq device region: "); | ||
368 | + device_region = (DMAMap) { | ||
369 | + .translated_addr = svq_addr.used_user_addr, | ||
370 | + .size = device_size - 1, | ||
371 | + .perm = IOMMU_RW, | ||
372 | + }; | ||
373 | + ok = vhost_vdpa_svq_map_ring(v, &device_region, errp); | ||
374 | + if (unlikely(!ok)) { | ||
375 | + error_prepend(errp, "Cannot create vq device region: "); | ||
376 | + vhost_vdpa_svq_unmap_ring(v, &driver_region); | ||
377 | } | ||
378 | + addr->used_user_addr = device_region.iova; | ||
379 | |||
380 | - return r == 0; | ||
381 | + return ok; | ||
382 | } | ||
383 | |||
384 | static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
385 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | ||
386 | index XXXXXXX..XXXXXXX 100644 | ||
387 | --- a/include/hw/virtio/vhost-vdpa.h | ||
388 | +++ b/include/hw/virtio/vhost-vdpa.h | ||
389 | @@ -XXX,XX +XXX,XX @@ | ||
390 | |||
391 | #include <gmodule.h> | ||
392 | |||
393 | +#include "hw/virtio/vhost-iova-tree.h" | ||
394 | #include "hw/virtio/virtio.h" | ||
395 | #include "standard-headers/linux/vhost_types.h" | ||
396 | |||
397 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | ||
398 | MemoryListener listener; | ||
399 | struct vhost_vdpa_iova_range iova_range; | ||
400 | bool shadow_vqs_enabled; | ||
401 | + /* IOVA mapping used by the Shadow Virtqueue */ | ||
402 | + VhostIOVATree *iova_tree; | ||
403 | GPtrArray *shadow_vqs; | ||
404 | struct vhost_dev *dev; | ||
405 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; | ||
92 | -- | 406 | -- |
93 | 2.5.0 | 407 | 2.7.4 |
94 | 408 | ||
95 | 409 | diff view generated by jsdifflib |
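The key step in the descriptor-forwarding change above is vhost_svq_translate_addr(): each guest buffer address is looked up in the IOVA tree, the offset inside the mapping is added to the mapping's IOVA, and buffers that run past the end of the mapping are rejected. A rough standalone sketch of that logic follows; it is not the QEMU code (which uses vhost_iova_tree_find_iova() and Int128 arithmetic for the end-of-range check), and the addresses below are invented.

    #include <inttypes.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t translated_addr, size, iova; } MapEntry;  /* size = length - 1 */

    /* Translate one buffer of len bytes (len > 0) starting at vaddr.
     * Fails if vaddr is unmapped or the buffer spills past the mapping. */
    static bool translate(const MapEntry *map, size_t n,
                          uint64_t vaddr, uint64_t len, uint64_t *iova)
    {
        for (size_t i = 0; i < n; i++) {
            const MapEntry *m = &map[i];

            if (vaddr < m->translated_addr || vaddr > m->translated_addr + m->size) {
                continue;                        /* not inside this mapping */
            }
            if (vaddr - m->translated_addr + (len - 1) > m->size) {
                return false;                    /* buffer expands over the mapping */
            }
            *iova = m->iova + (vaddr - m->translated_addr);
            return true;
        }
        return false;                            /* no mapping for this address */
    }

    int main(void)
    {
        MapEntry table[] = { { 0x7f0000000000, 0xffffff, 0x100000 } };
        uint64_t iova;

        if (translate(table, 1, 0x7f0000001000, 512, &iova)) {
            printf("iova = 0x%" PRIx64 "\n", iova);   /* prints 0x101000 */
        }
        return 0;
    }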
Deleted patch | |||
---|---|---|---|
1 | From: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
2 | 1 | ||
3 | If some errors happen during VM's COLO FT stage, it's important to | ||
4 | notify the users of this event. Together with 'x-colo-lost-heartbeat', | ||
5 | users can intervene in COLO's failover work immediately. | ||
6 | If users don't want to get involved in COLO's failover verdict, | ||
7 | it is still necessary to notify users that we exited COLO mode. | ||
8 | |||
9 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
10 | Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com> | ||
11 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | ||
12 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
13 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
14 | --- | ||
15 | migration/colo.c | 31 +++++++++++++++++++++++++++++++ | ||
16 | qapi/migration.json | 38 ++++++++++++++++++++++++++++++++++++++ | ||
17 | 2 files changed, 69 insertions(+) | ||
18 | |||
19 | diff --git a/migration/colo.c b/migration/colo.c | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/migration/colo.c | ||
22 | +++ b/migration/colo.c | ||
23 | @@ -XXX,XX +XXX,XX @@ | ||
24 | #include "net/colo-compare.h" | ||
25 | #include "net/colo.h" | ||
26 | #include "block/block.h" | ||
27 | +#include "qapi/qapi-events-migration.h" | ||
28 | |||
29 | static bool vmstate_loading; | ||
30 | static Notifier packets_compare_notifier; | ||
31 | @@ -XXX,XX +XXX,XX @@ out: | ||
32 | qemu_fclose(fb); | ||
33 | } | ||
34 | |||
35 | + /* | ||
36 | + * There are only two reasons we can get here, some error happened | ||
37 | + * or the user triggered failover. | ||
38 | + */ | ||
39 | + switch (failover_get_state()) { | ||
40 | + case FAILOVER_STATUS_NONE: | ||
41 | + qapi_event_send_colo_exit(COLO_MODE_PRIMARY, | ||
42 | + COLO_EXIT_REASON_ERROR); | ||
43 | + break; | ||
44 | + case FAILOVER_STATUS_REQUIRE: | ||
45 | + qapi_event_send_colo_exit(COLO_MODE_PRIMARY, | ||
46 | + COLO_EXIT_REASON_REQUEST); | ||
47 | + break; | ||
48 | + default: | ||
49 | + abort(); | ||
50 | + } | ||
51 | + | ||
52 | /* Hope this not to be too long to wait here */ | ||
53 | qemu_sem_wait(&s->colo_exit_sem); | ||
54 | qemu_sem_destroy(&s->colo_exit_sem); | ||
55 | @@ -XXX,XX +XXX,XX @@ out: | ||
56 | error_report_err(local_err); | ||
57 | } | ||
58 | |||
59 | + switch (failover_get_state()) { | ||
60 | + case FAILOVER_STATUS_NONE: | ||
61 | + qapi_event_send_colo_exit(COLO_MODE_SECONDARY, | ||
62 | + COLO_EXIT_REASON_ERROR); | ||
63 | + break; | ||
64 | + case FAILOVER_STATUS_REQUIRE: | ||
65 | + qapi_event_send_colo_exit(COLO_MODE_SECONDARY, | ||
66 | + COLO_EXIT_REASON_REQUEST); | ||
67 | + break; | ||
68 | + default: | ||
69 | + abort(); | ||
70 | + } | ||
71 | + | ||
72 | if (fb) { | ||
73 | qemu_fclose(fb); | ||
74 | } | ||
75 | diff --git a/qapi/migration.json b/qapi/migration.json | ||
76 | index XXXXXXX..XXXXXXX 100644 | ||
77 | --- a/qapi/migration.json | ||
78 | +++ b/qapi/migration.json | ||
79 | @@ -XXX,XX +XXX,XX @@ | ||
80 | 'data': [ 'none', 'require', 'active', 'completed', 'relaunch' ] } | ||
81 | |||
82 | ## | ||
83 | +# @COLO_EXIT: | ||
84 | +# | ||
85 | +# Emitted when VM finishes COLO mode due to some errors happening or | ||
86 | +# at the request of users. | ||
87 | +# | ||
88 | +# @mode: report COLO mode when COLO exited. | ||
89 | +# | ||
90 | +# @reason: describes the reason for the COLO exit. | ||
91 | +# | ||
92 | +# Since: 3.1 | ||
93 | +# | ||
94 | +# Example: | ||
95 | +# | ||
96 | +# <- { "timestamp": {"seconds": 2032141960, "microseconds": 417172}, | ||
97 | +# "event": "COLO_EXIT", "data": {"mode": "primary", "reason": "request" } } | ||
98 | +# | ||
99 | +## | ||
100 | +{ 'event': 'COLO_EXIT', | ||
101 | + 'data': {'mode': 'COLOMode', 'reason': 'COLOExitReason' } } | ||
102 | + | ||
103 | +## | ||
104 | +# @COLOExitReason: | ||
105 | +# | ||
106 | +# The reason for a COLO exit | ||
107 | +# | ||
108 | +# @none: no failover has ever happened. This can't occur in the | ||
109 | +# COLO_EXIT event, only in the result of query-colo-status. | ||
110 | +# | ||
111 | +# @request: COLO exit is due to an external request | ||
112 | +# | ||
113 | +# @error: COLO exit is due to an internal error | ||
114 | +# | ||
115 | +# Since: 3.1 | ||
116 | +## | ||
117 | +{ 'enum': 'COLOExitReason', | ||
118 | + 'data': [ 'none', 'request', 'error' ] } | ||
119 | + | ||
120 | +## | ||
121 | # @x-colo-lost-heartbeat: | ||
122 | # | ||
123 | # Tell qemu that heartbeat is lost, request it to do takeover procedures. | ||
124 | -- | ||
125 | 2.5.0 | ||
126 | |||
127 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Zhang Chen <chen.zhang@intel.com> | ||
2 | 1 | ||
3 | As suggested by Markus Armbruster, rename COLO "unknown" mode to "none" mode. | ||
4 | |||
5 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | ||
6 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
7 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
8 | Reviewed-by: Markus Armbruster <armbru@redhat.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
10 | --- | ||
11 | migration/colo-failover.c | 2 +- | ||
12 | migration/colo.c | 2 +- | ||
13 | qapi/migration.json | 10 +++++----- | ||
14 | 3 files changed, 7 insertions(+), 7 deletions(-) | ||
15 | |||
16 | diff --git a/migration/colo-failover.c b/migration/colo-failover.c | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/migration/colo-failover.c | ||
19 | +++ b/migration/colo-failover.c | ||
20 | @@ -XXX,XX +XXX,XX @@ FailoverStatus failover_get_state(void) | ||
21 | |||
22 | void qmp_x_colo_lost_heartbeat(Error **errp) | ||
23 | { | ||
24 | - if (get_colo_mode() == COLO_MODE_UNKNOWN) { | ||
25 | + if (get_colo_mode() == COLO_MODE_NONE) { | ||
26 | error_setg(errp, QERR_FEATURE_DISABLED, "colo"); | ||
27 | return; | ||
28 | } | ||
29 | diff --git a/migration/colo.c b/migration/colo.c | ||
30 | index XXXXXXX..XXXXXXX 100644 | ||
31 | --- a/migration/colo.c | ||
32 | +++ b/migration/colo.c | ||
33 | @@ -XXX,XX +XXX,XX @@ COLOMode get_colo_mode(void) | ||
34 | } else if (migration_incoming_in_colo_state()) { | ||
35 | return COLO_MODE_SECONDARY; | ||
36 | } else { | ||
37 | - return COLO_MODE_UNKNOWN; | ||
38 | + return COLO_MODE_NONE; | ||
39 | } | ||
40 | } | ||
41 | |||
42 | diff --git a/qapi/migration.json b/qapi/migration.json | ||
43 | index XXXXXXX..XXXXXXX 100644 | ||
44 | --- a/qapi/migration.json | ||
45 | +++ b/qapi/migration.json | ||
46 | @@ -XXX,XX +XXX,XX @@ | ||
47 | ## | ||
48 | # @COLOMode: | ||
49 | # | ||
50 | -# The colo mode | ||
51 | +# The COLO current mode. | ||
52 | # | ||
53 | -# @unknown: unknown mode | ||
54 | +# @none: COLO is disabled. | ||
55 | # | ||
56 | -# @primary: master side | ||
57 | +# @primary: COLO node in primary side. | ||
58 | # | ||
59 | -# @secondary: slave side | ||
60 | +# @secondary: COLO node in slave side. | ||
61 | # | ||
62 | # Since: 2.8 | ||
63 | ## | ||
64 | { 'enum': 'COLOMode', | ||
65 | - 'data': [ 'unknown', 'primary', 'secondary'] } | ||
66 | + 'data': [ 'none', 'primary', 'secondary'] } | ||
67 | |||
68 | ## | ||
69 | # @FailoverStatus: | ||
70 | -- | ||
71 | 2.5.0 | ||
72 | |||
73 | diff view generated by jsdifflib |
1 | There should not be a reason for passing a packet size greater than | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | INT_MAX. It's usually a hint of a bug somewhere, so ignore packet sizes | ||
3 | greater than INT_MAX in qemu_deliver_packet_iov() | ||
4 | 2 | ||
5 | CC: qemu-stable@nongnu.org | 3 | This is needed to achieve migration, so the destination can restore its |
6 | Reported-by: Daniel Shapira <daniel@twistlock.com> | 4 | index. |
7 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | 5 | |
6 | Setting base as last used idx, so destination will see as available all | ||
7 | the entries that the device did not use, including the in-flight | ||
8 | processing ones. | ||
9 | |||
10 | This is ok for networking, but other kinds of devices might have | ||
11 | problems with these retransmissions. | ||
12 | |||
13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
14 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
9 | --- | 16 | --- |
10 | net/net.c | 7 ++++++- | 17 | hw/virtio/vhost-vdpa.c | 17 +++++++++++++++++ |
11 | 1 file changed, 6 insertions(+), 1 deletion(-) | 18 | 1 file changed, 17 insertions(+) |
12 | 19 | ||
13 | diff --git a/net/net.c b/net/net.c | 20 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
14 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/net/net.c | 22 | --- a/hw/virtio/vhost-vdpa.c |
16 | +++ b/net/net.c | 23 | +++ b/hw/virtio/vhost-vdpa.c |
17 | @@ -XXX,XX +XXX,XX @@ ssize_t qemu_deliver_packet_iov(NetClientState *sender, | 24 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, |
18 | void *opaque) | 25 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, |
26 | struct vhost_vring_state *ring) | ||
19 | { | 27 | { |
20 | NetClientState *nc = opaque; | 28 | + struct vhost_vdpa *v = dev->opaque; |
21 | + size_t size = iov_size(iov, iovcnt); | ||
22 | int ret; | 29 | int ret; |
23 | 30 | ||
24 | + if (size > INT_MAX) { | 31 | + if (v->shadow_vqs_enabled) { |
25 | + return size; | 32 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, |
33 | + ring->index); | ||
34 | + | ||
35 | + /* | ||
36 | + * Setting base as last used idx, so destination will see as available | ||
37 | + * all the entries that the device did not use, including the in-flight | ||
38 | + * processing ones. | ||
39 | + * | ||
40 | + * TODO: This is ok for networking, but other kinds of devices might | ||
41 | + * have problems with these retransmissions. | ||
42 | + */ | ||
43 | + ring->num = svq->last_used_idx; | ||
44 | + return 0; | ||
26 | + } | 45 | + } |
27 | + | 46 | + |
28 | if (nc->link_down) { | 47 | ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); |
29 | - return iov_size(iov, iovcnt); | 48 | trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); |
30 | + return size; | 49 | return ret; |
31 | } | ||
32 | |||
33 | if (nc->receive_disabled) { | ||
34 | -- | 50 | -- |
35 | 2.5.0 | 51 | 2.7.4 |
36 | 52 | ||
37 | 53 | diff view generated by jsdifflib |
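To see why reporting svq->last_used_idx as the vring base re-exposes in-flight buffers, recall that split-ring indices are free-running 16-bit counters: everything in [used_idx, avail_idx) has been made available by the driver but not yet consumed by the device. A tiny standalone illustration (values are arbitrary):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t avail_idx = 7;   /* driver has posted 7 buffers    */
        uint16_t used_idx  = 5;   /* device has completed 5 of them */

        /* Wrap-safe because both indices are free-running uint16_t. */
        uint16_t in_flight = (uint16_t)(avail_idx - used_idx);
        printf("in flight: %u\n", in_flight);     /* prints 2 */

        /* Restarting the destination ring with base = used_idx means those
         * two entries are seen as still available and will be processed
         * (possibly again) by the new device - fine for networking, as the
         * commit message notes, but not necessarily for every device type. */
        return 0;
    }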
1 | From: zhanghailiang <zhang.zhanghailiang@huawei.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Notify all net filters about the checkpoint and failover event. | 3 | Setting the log address would make the device start reporting invalid |
4 | dirty memory because the SVQ vrings are located in qemu's memory. | ||
4 | 5 | ||
5 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | 6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
6 | Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> | 7 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
7 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 8 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
8 | --- | 9 | --- |
9 | migration/colo.c | 15 +++++++++++++++ | 10 | hw/virtio/vhost-vdpa.c | 3 ++- |
10 | 1 file changed, 15 insertions(+) | 11 | 1 file changed, 2 insertions(+), 1 deletion(-) |
11 | 12 | ||
12 | diff --git a/migration/colo.c b/migration/colo.c | 13 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
13 | index XXXXXXX..XXXXXXX 100644 | 14 | index XXXXXXX..XXXXXXX 100644 |
14 | --- a/migration/colo.c | 15 | --- a/hw/virtio/vhost-vdpa.c |
15 | +++ b/migration/colo.c | 16 | +++ b/hw/virtio/vhost-vdpa.c |
16 | @@ -XXX,XX +XXX,XX @@ | 17 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) |
17 | #include "qapi/qapi-events-migration.h" | 18 | static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, |
18 | #include "qapi/qmp/qerror.h" | 19 | struct vhost_log *log) |
19 | #include "sysemu/cpus.h" | 20 | { |
20 | +#include "net/filter.h" | 21 | - if (vhost_vdpa_one_time_request(dev)) { |
21 | 22 | + struct vhost_vdpa *v = dev->opaque; | |
22 | static bool vmstate_loading; | 23 | + if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) { |
23 | static Notifier packets_compare_notifier; | 24 | return 0; |
24 | @@ -XXX,XX +XXX,XX @@ static void secondary_vm_do_failover(void) | ||
25 | error_report_err(local_err); | ||
26 | } | 25 | } |
27 | 26 | ||
28 | + /* Notify all filters of all NIC to do checkpoint */ | ||
29 | + colo_notify_filters_event(COLO_EVENT_FAILOVER, &local_err); | ||
30 | + if (local_err) { | ||
31 | + error_report_err(local_err); | ||
32 | + } | ||
33 | + | ||
34 | if (!autostart) { | ||
35 | error_report("\"-S\" qemu option will be ignored in secondary side"); | ||
36 | /* recover runstate to normal migration finish state */ | ||
37 | @@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque) | ||
38 | goto out; | ||
39 | } | ||
40 | |||
41 | + /* Notify all filters of all NIC to do checkpoint */ | ||
42 | + colo_notify_filters_event(COLO_EVENT_CHECKPOINT, &local_err); | ||
43 | + | ||
44 | + if (local_err) { | ||
45 | + qemu_mutex_unlock_iothread(); | ||
46 | + goto out; | ||
47 | + } | ||
48 | + | ||
49 | vmstate_loading = false; | ||
50 | vm_start(); | ||
51 | trace_colo_vm_state_change("stop", "run"); | ||
52 | -- | 27 | -- |
53 | 2.5.0 | 28 | 2.7.4 |
54 | 29 | ||
55 | 30 | diff view generated by jsdifflib |
1 | From: Zhang Chen <zhangckid@gmail.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | There are several stages during the loadvm/savevm process. In different stages, | 3 | SVQ is able to log the dirty bits by itself, so let's use it to not |
4 | the incoming migration processes different types of sections. | 4 | block migration. |
5 | We want to control these stages more accurately; it will benefit COLO | 5 | |
6 | performance, since we don't have to save QEMU_VM_SECTION_START type | 6 | Also, ignore set and clear of VHOST_F_LOG_ALL on set_features if SVQ is |
7 | sections every time we do a checkpoint. Besides, we want to separate | 7 | enabled. Even if the device supports it, the reports would be nonsense |
8 | the process of saving/loading memory and device state. | 8 | because SVQ memory is in the qemu region. |
9 | 5 | ||
10 | So we add two new helper functions, qemu_load_device_state() and | 10 | The log region is still allocated. Future changes might skip that, but |
11 | qemu_savevm_live_state(), to handle the different processes during migration. | 11 | this series is already long enough. |
8 | because SVQ memory is in the qemu region. | ||
12 | 9 | ||
13 | Besides, we make qemu_loadvm_state_main() and qemu_save_device_state() | 10 | The log region is still allocated. Future changes might skip that, but |
14 | public, and simplify the codes of qemu_save_device_state() by calling the | 11 | this series is already long enough. |
15 | wrapper qemu_savevm_state_header(). | ||
16 | 12 | ||
17 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | 13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
18 | Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com> | 14 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
19 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | ||
20 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
21 | Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> | ||
22 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
23 | --- | 16 | --- |
24 | migration/colo.c | 41 ++++++++++++++++++++++++++++++++--------- | 17 | hw/virtio/vhost-vdpa.c | 39 +++++++++++++++++++++++++++++++++++---- |
25 | migration/savevm.c | 36 +++++++++++++++++++++++++++++------- | 18 | include/hw/virtio/vhost-vdpa.h | 1 + |
26 | migration/savevm.h | 4 ++++ | 19 | 2 files changed, 36 insertions(+), 4 deletions(-) |
27 | 3 files changed, 65 insertions(+), 16 deletions(-) | ||
28 | 20 | ||
29 | diff --git a/migration/colo.c b/migration/colo.c | 21 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
30 | index XXXXXXX..XXXXXXX 100644 | 22 | index XXXXXXX..XXXXXXX 100644 |
31 | --- a/migration/colo.c | 23 | --- a/hw/virtio/vhost-vdpa.c |
32 | +++ b/migration/colo.c | 24 | +++ b/hw/virtio/vhost-vdpa.c |
33 | @@ -XXX,XX +XXX,XX @@ | 25 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) |
34 | #include "block/block.h" | 26 | return v->index != 0; |
35 | #include "qapi/qapi-events-migration.h" | 27 | } |
36 | #include "qapi/qmp/qerror.h" | 28 | |
37 | +#include "sysemu/cpus.h" | 29 | +static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, |
38 | 30 | + uint64_t *features) | |
39 | static bool vmstate_loading; | 31 | +{ |
40 | static Notifier packets_compare_notifier; | 32 | + int ret; |
41 | @@ -XXX,XX +XXX,XX @@ static int colo_do_checkpoint_transaction(MigrationState *s, | 33 | + |
42 | 34 | + ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); | |
43 | /* Disable block migration */ | 35 | + trace_vhost_vdpa_get_features(dev, *features); |
44 | migrate_set_block_enabled(false, &local_err); | 36 | + return ret; |
45 | - qemu_savevm_state_header(fb); | 37 | +} |
46 | - qemu_savevm_state_setup(fb); | 38 | + |
47 | qemu_mutex_lock_iothread(); | 39 | static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, |
48 | replication_do_checkpoint_all(&local_err); | 40 | Error **errp) |
49 | if (local_err) { | 41 | { |
50 | qemu_mutex_unlock_iothread(); | 42 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, |
51 | goto out; | 43 | return 0; |
52 | } | 44 | } |
53 | - qemu_savevm_state_complete_precopy(fb, false, false); | 45 | |
54 | - qemu_mutex_unlock_iothread(); | 46 | - r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features); |
55 | - | 47 | + r = vhost_vdpa_get_dev_features(hdev, &dev_features); |
56 | - qemu_fflush(fb); | 48 | if (r != 0) { |
57 | 49 | error_setg_errno(errp, -r, "Can't get vdpa device features"); | |
58 | colo_send_message(s->to_dst_file, COLO_MESSAGE_VMSTATE_SEND, &local_err); | 50 | return r; |
59 | if (local_err) { | 51 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_mem_table(struct vhost_dev *dev, |
60 | + qemu_mutex_unlock_iothread(); | 52 | static int vhost_vdpa_set_features(struct vhost_dev *dev, |
61 | + goto out; | 53 | uint64_t features) |
62 | + } | 54 | { |
63 | + /* Note: device state is saved into buffer */ | 55 | + struct vhost_vdpa *v = dev->opaque; |
64 | + ret = qemu_save_device_state(fb); | 56 | int ret; |
65 | + | 57 | |
66 | + qemu_mutex_unlock_iothread(); | 58 | if (vhost_vdpa_one_time_request(dev)) { |
67 | + if (ret < 0) { | 59 | return 0; |
68 | goto out; | ||
69 | } | 60 | } |
70 | /* | 61 | |
71 | + * Only save VM's live state, which not including device state. | 62 | + if (v->shadow_vqs_enabled) { |
72 | + * TODO: We may need a timeout mechanism to prevent COLO process | 63 | + if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) { |
73 | + * to be blocked here. | 64 | + /* |
74 | + */ | 65 | + * QEMU is just trying to enable or disable logging. SVQ handles |
75 | + qemu_savevm_live_state(s->to_dst_file); | 66 | + * this sepparately, so no need to forward this. |
76 | + | 67 | + */ |
77 | + qemu_fflush(fb); | 68 | + v->acked_features = features; |
78 | + | 69 | + return 0; |
79 | + /* | ||
80 | * We need the size of the VMstate data in Secondary side, | ||
81 | * With which we can decide how much data should be read. | ||
82 | */ | ||
83 | @@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque) | ||
84 | uint64_t total_size; | ||
85 | uint64_t value; | ||
86 | Error *local_err = NULL; | ||
87 | + int ret; | ||
88 | |||
89 | rcu_register_thread(); | ||
90 | qemu_sem_init(&mis->colo_incoming_sem, 0); | ||
91 | @@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque) | ||
92 | goto out; | ||
93 | } | ||
94 | |||
95 | + qemu_mutex_lock_iothread(); | ||
96 | + cpu_synchronize_all_pre_loadvm(); | ||
97 | + ret = qemu_loadvm_state_main(mis->from_src_file, mis); | ||
98 | + qemu_mutex_unlock_iothread(); | ||
99 | + | ||
100 | + if (ret < 0) { | ||
101 | + error_report("Load VM's live state (ram) error"); | ||
102 | + goto out; | ||
103 | + } | 70 | + } |
104 | + | 71 | + |
105 | value = colo_receive_message_value(mis->from_src_file, | 72 | + v->acked_features = features; |
106 | COLO_MESSAGE_VMSTATE_SIZE, &local_err); | 73 | + |
107 | if (local_err) { | 74 | + /* We must not ack _F_LOG if SVQ is enabled */ |
108 | @@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque) | 75 | + features &= ~BIT_ULL(VHOST_F_LOG_ALL); |
109 | } | 76 | + } |
110 | 77 | + | |
111 | qemu_mutex_lock_iothread(); | 78 | trace_vhost_vdpa_set_features(dev, features); |
112 | - qemu_system_reset(SHUTDOWN_CAUSE_NONE); | 79 | ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features); |
113 | vmstate_loading = true; | 80 | if (ret) { |
114 | - if (qemu_loadvm_state(fb) < 0) { | 81 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, |
115 | - error_report("COLO: loadvm failed"); | 82 | static int vhost_vdpa_get_features(struct vhost_dev *dev, |
116 | + ret = qemu_load_device_state(fb); | 83 | uint64_t *features) |
117 | + if (ret < 0) { | 84 | { |
118 | + error_report("COLO: load device state failed"); | 85 | - int ret; |
119 | qemu_mutex_unlock_iothread(); | 86 | + struct vhost_vdpa *v = dev->opaque; |
120 | goto out; | 87 | + int ret = vhost_vdpa_get_dev_features(dev, features); |
121 | } | 88 | + |
122 | diff --git a/migration/savevm.c b/migration/savevm.c | 89 | + if (ret == 0 && v->shadow_vqs_enabled) { |
123 | index XXXXXXX..XXXXXXX 100644 | 90 | + /* Add SVQ logging capabilities */ |
124 | --- a/migration/savevm.c | 91 | + *features |= BIT_ULL(VHOST_F_LOG_ALL); |
125 | +++ b/migration/savevm.c | 92 | + } |
126 | @@ -XXX,XX +XXX,XX @@ done: | 93 | |
94 | - ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); | ||
95 | - trace_vhost_vdpa_get_features(dev, *features); | ||
127 | return ret; | 96 | return ret; |
128 | } | 97 | } |
129 | 98 | ||
130 | -static int qemu_save_device_state(QEMUFile *f) | 99 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h |
131 | +void qemu_savevm_live_state(QEMUFile *f) | ||
132 | { | ||
133 | - SaveStateEntry *se; | ||
134 | + /* save QEMU_VM_SECTION_END section */ | ||
135 | + qemu_savevm_state_complete_precopy(f, true, false); | ||
136 | + qemu_put_byte(f, QEMU_VM_EOF); | ||
137 | +} | ||
138 | |||
139 | - qemu_put_be32(f, QEMU_VM_FILE_MAGIC); | ||
140 | - qemu_put_be32(f, QEMU_VM_FILE_VERSION); | ||
141 | +int qemu_save_device_state(QEMUFile *f) | ||
142 | +{ | ||
143 | + SaveStateEntry *se; | ||
144 | |||
145 | + if (!migration_in_colo_state()) { | ||
146 | + qemu_put_be32(f, QEMU_VM_FILE_MAGIC); | ||
147 | + qemu_put_be32(f, QEMU_VM_FILE_VERSION); | ||
148 | + } | ||
149 | cpu_synchronize_all_states(); | ||
150 | |||
151 | QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { | ||
152 | @@ -XXX,XX +XXX,XX @@ enum LoadVMExitCodes { | ||
153 | LOADVM_QUIT = 1, | ||
154 | }; | ||
155 | |||
156 | -static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); | ||
157 | - | ||
158 | /* ------ incoming postcopy messages ------ */ | ||
159 | /* 'advise' arrives before any transfers just to tell us that a postcopy | ||
160 | * *might* happen - it might be skipped if precopy transferred everything | ||
161 | @@ -XXX,XX +XXX,XX @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis) | ||
162 | return true; | ||
163 | } | ||
164 | |||
165 | -static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis) | ||
166 | +int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis) | ||
167 | { | ||
168 | uint8_t section_type; | ||
169 | int ret = 0; | ||
170 | @@ -XXX,XX +XXX,XX @@ int qemu_loadvm_state(QEMUFile *f) | ||
171 | return ret; | ||
172 | } | ||
173 | |||
174 | +int qemu_load_device_state(QEMUFile *f) | ||
175 | +{ | ||
176 | + MigrationIncomingState *mis = migration_incoming_get_current(); | ||
177 | + int ret; | ||
178 | + | ||
179 | + /* Load QEMU_VM_SECTION_FULL section */ | ||
180 | + ret = qemu_loadvm_state_main(f, mis); | ||
181 | + if (ret < 0) { | ||
182 | + error_report("Failed to load device state: %d", ret); | ||
183 | + return ret; | ||
184 | + } | ||
185 | + | ||
186 | + cpu_synchronize_all_post_init(); | ||
187 | + return 0; | ||
188 | +} | ||
189 | + | ||
190 | int save_snapshot(const char *name, Error **errp) | ||
191 | { | ||
192 | BlockDriverState *bs, *bs1; | ||
193 | diff --git a/migration/savevm.h b/migration/savevm.h | ||
194 | index XXXXXXX..XXXXXXX 100644 | 100 | index XXXXXXX..XXXXXXX 100644 |
195 | --- a/migration/savevm.h | 101 | --- a/include/hw/virtio/vhost-vdpa.h |
196 | +++ b/migration/savevm.h | 102 | +++ b/include/hw/virtio/vhost-vdpa.h |
197 | @@ -XXX,XX +XXX,XX @@ void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name, | 103 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { |
198 | uint64_t *start_list, | 104 | bool iotlb_batch_begin_sent; |
199 | uint64_t *length_list); | 105 | MemoryListener listener; |
200 | void qemu_savevm_send_colo_enable(QEMUFile *f); | 106 | struct vhost_vdpa_iova_range iova_range; |
201 | +void qemu_savevm_live_state(QEMUFile *f); | 107 | + uint64_t acked_features; |
202 | +int qemu_save_device_state(QEMUFile *f); | 108 | bool shadow_vqs_enabled; |
203 | 109 | /* IOVA mapping used by the Shadow Virtqueue */ | |
204 | int qemu_loadvm_state(QEMUFile *f); | 110 | VhostIOVATree *iova_tree; |
205 | void qemu_loadvm_state_cleanup(void); | ||
206 | +int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); | ||
207 | +int qemu_load_device_state(QEMUFile *f); | ||
208 | |||
209 | #endif | ||
210 | -- | 111 | -- |
211 | 2.5.0 | 112 | 2.7.4 |
212 | 113 | ||
213 | 114 | diff view generated by jsdifflib |
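The feature handling above relies on two small bit tricks: XOR against the previously acked features detects that the only bit being toggled is VHOST_F_LOG_ALL (QEMU merely switching dirty logging on or off), and the mask strips that bit before anything reaches the device. A standalone sketch, with the bit positions other than the one under test chosen arbitrarily:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BIT_ULL(n)       (1ULL << (n))
    #define VHOST_F_LOG_ALL  26          /* assumed value for the sketch */

    int main(void)
    {
        uint64_t acked    = BIT_ULL(32) | BIT_ULL(27);          /* previously acked  */
        uint64_t features = acked | BIT_ULL(VHOST_F_LOG_ALL);   /* now with logging  */

        /* XOR leaves only the bits that differ; if that is exactly the log
         * bit, nothing needs to be forwarded to the device. */
        if ((acked ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
            printf("only logging toggled, nothing to forward\n");
        }

        /* Never ack _F_LOG on the device when SVQ does the logging itself. */
        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
        printf("features sent to device: 0x%" PRIx64 "\n", features);
        return 0;
    }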
Deleted patch | |||
---|---|---|---|
1 | From: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
2 | 1 | ||
3 | There is no need to flush all of the VM's RAM from the cache; only | ||
4 | flush the pages dirtied since the last checkpoint. | ||
5 | |||
6 | Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com> | ||
7 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | ||
8 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
9 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
10 | Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> | ||
11 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
12 | --- | ||
13 | migration/ram.c | 9 +++++++++ | ||
14 | 1 file changed, 9 insertions(+) | ||
15 | |||
16 | diff --git a/migration/ram.c b/migration/ram.c | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/migration/ram.c | ||
19 | +++ b/migration/ram.c | ||
20 | @@ -XXX,XX +XXX,XX @@ int colo_init_ram_cache(void) | ||
21 | } | ||
22 | ram_state = g_new0(RAMState, 1); | ||
23 | ram_state->migration_dirty_pages = 0; | ||
24 | + memory_global_dirty_log_start(); | ||
25 | |||
26 | return 0; | ||
27 | |||
28 | @@ -XXX,XX +XXX,XX @@ void colo_release_ram_cache(void) | ||
29 | { | ||
30 | RAMBlock *block; | ||
31 | |||
32 | + memory_global_dirty_log_stop(); | ||
33 | RAMBLOCK_FOREACH_MIGRATABLE(block) { | ||
34 | g_free(block->bmap); | ||
35 | block->bmap = NULL; | ||
36 | @@ -XXX,XX +XXX,XX @@ static void colo_flush_ram_cache(void) | ||
37 | void *src_host; | ||
38 | unsigned long offset = 0; | ||
39 | |||
40 | + memory_global_dirty_log_sync(); | ||
41 | + rcu_read_lock(); | ||
42 | + RAMBLOCK_FOREACH_MIGRATABLE(block) { | ||
43 | + migration_bitmap_sync_range(ram_state, block, 0, block->used_length); | ||
44 | + } | ||
45 | + rcu_read_unlock(); | ||
46 | + | ||
47 | trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); | ||
48 | rcu_read_lock(); | ||
49 | block = QLIST_FIRST_RCU(&ram_list.blocks); | ||
50 | -- | ||
51 | 2.5.0 | ||
52 | |||
53 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
2 | 1 | ||
3 | The COLO thread may sleep at qemu_sem_wait(&s->colo_checkpoint_sem) | ||
4 | while failover begins. It's better to wake it up to speed up | ||
5 | the process. | ||
6 | |||
7 | Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com> | ||
8 | Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
10 | --- | ||
11 | migration/colo.c | 8 ++++++++ | ||
12 | 1 file changed, 8 insertions(+) | ||
13 | |||
14 | diff --git a/migration/colo.c b/migration/colo.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/migration/colo.c | ||
17 | +++ b/migration/colo.c | ||
18 | @@ -XXX,XX +XXX,XX @@ static void primary_vm_do_failover(void) | ||
19 | |||
20 | migrate_set_state(&s->state, MIGRATION_STATUS_COLO, | ||
21 | MIGRATION_STATUS_COMPLETED); | ||
22 | + /* | ||
23 | + * kick COLO thread which might wait at | ||
24 | + * qemu_sem_wait(&s->colo_checkpoint_sem). | ||
25 | + */ | ||
26 | + colo_checkpoint_notify(migrate_get_current()); | ||
27 | |||
28 | /* | ||
29 | * Wake up COLO thread which may blocked in recv() or send(), | ||
30 | @@ -XXX,XX +XXX,XX @@ static void colo_process_checkpoint(MigrationState *s) | ||
31 | |||
32 | qemu_sem_wait(&s->colo_checkpoint_sem); | ||
33 | |||
34 | + if (s->state != MIGRATION_STATUS_COLO) { | ||
35 | + goto out; | ||
36 | + } | ||
37 | ret = colo_do_checkpoint_transaction(s, bioc, fb); | ||
38 | if (ret < 0) { | ||
39 | goto out; | ||
40 | -- | ||
41 | 2.5.0 | ||
42 | |||
43 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Zhang Chen <chen.zhang@intel.com> | ||
2 | 1 | ||
3 | This diagram helps users better understand COLO. | ||
4 | Suggested by Markus Armbruster. | ||
5 | |||
6 | Signed-off-by: Zhang Chen <zhangckid@gmail.com> | ||
7 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
9 | --- | ||
10 | docs/COLO-FT.txt | 34 ++++++++++++++++++++++++++++++++++ | ||
11 | 1 file changed, 34 insertions(+) | ||
12 | |||
13 | diff --git a/docs/COLO-FT.txt b/docs/COLO-FT.txt | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/docs/COLO-FT.txt | ||
16 | +++ b/docs/COLO-FT.txt | ||
17 | @@ -XXX,XX +XXX,XX @@ Note: | ||
18 | HeartBeat has not been implemented yet, so you need to trigger failover process | ||
19 | by using 'x-colo-lost-heartbeat' command. | ||
20 | |||
21 | +== COLO operation status == | ||
22 | + | ||
23 | ++-----------------+ | ||
24 | +| | | ||
25 | +| Start COLO | | ||
26 | +| | | ||
27 | ++--------+--------+ | ||
28 | + | | ||
29 | + | Main qmp command: | ||
30 | + | migrate-set-capabilities with x-colo | ||
31 | + | migrate | ||
32 | + | | ||
33 | + v | ||
34 | ++--------+--------+ | ||
35 | +| | | ||
36 | +| COLO running | | ||
37 | +| | | ||
38 | ++--------+--------+ | ||
39 | + | | ||
40 | + | Main qmp command: | ||
41 | + | x-colo-lost-heartbeat | ||
42 | + | or | ||
43 | + | some error happened | ||
44 | + v | ||
45 | ++--------+--------+ | ||
46 | +| | send qmp event: | ||
47 | +| COLO failover | COLO_EXIT | ||
48 | +| | | ||
49 | ++-----------------+ | ||
50 | + | ||
51 | +COLO uses QMP commands to switch and report operation status. | ||
52 | +The diagram just shows the main QMP commands; you can find the details | ||
53 | +in the test procedure. | ||
54 | + | ||
55 | == Test procedure == | ||
56 | 1. Startup qemu | ||
57 | Primary: | ||
58 | -- | ||
59 | 2.5.0 | ||
60 | |||
61 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: liujunjie <liujunjie23@huawei.com> | ||
2 | 1 | ||
3 | Before, we did not clear callbacks like handle_output when deleting | ||
4 | the virtqueue, which may result in a segmentation fault. | ||
5 | The scenario is as follows: | ||
6 | 1. Start a vm with multiqueue vhost-net, | ||
7 | 2. then we write VIRTIO_PCI_GUEST_FEATURES in PCI configuration to | ||
8 | trigger multiqueue disable in this vm, which will delete the virtqueue. | ||
9 | In this step, the tx_bh is deleted but the callback virtio_net_handle_tx_bh | ||
10 | still exists. | ||
11 | 3. Finally, we write VIRTIO_PCI_QUEUE_NOTIFY in PCI configuration to | ||
12 | notify the deleted virtqueue. In this way, virtio_net_handle_tx_bh | ||
13 | will be called and qemu will crash. | ||
14 | |||
15 | Although the way described above is uncommon, we had better reinforce it. | ||
16 | |||
17 | CC: qemu-stable@nongnu.org | ||
18 | Signed-off-by: liujunjie <liujunjie23@huawei.com> | ||
19 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
20 | --- | ||
21 | hw/virtio/virtio.c | 2 ++ | ||
22 | 1 file changed, 2 insertions(+) | ||
23 | |||
24 | diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c | ||
25 | index XXXXXXX..XXXXXXX 100644 | ||
26 | --- a/hw/virtio/virtio.c | ||
27 | +++ b/hw/virtio/virtio.c | ||
28 | @@ -XXX,XX +XXX,XX @@ void virtio_del_queue(VirtIODevice *vdev, int n) | ||
29 | |||
30 | vdev->vq[n].vring.num = 0; | ||
31 | vdev->vq[n].vring.num_default = 0; | ||
32 | + vdev->vq[n].handle_output = NULL; | ||
33 | + vdev->vq[n].handle_aio_output = NULL; | ||
34 | } | ||
35 | |||
36 | static void virtio_set_isr(VirtIODevice *vdev, int value) | ||
37 | -- | ||
38 | 2.5.0 | ||
39 | |||
40 | diff view generated by jsdifflib |
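A minimal standalone sketch of the failure mode and the fix: if deleting a queue leaves its handler pointer behind, a later bogus notify still jumps through it, while clearing the pointer lets a NULL check in the notify path turn the stale notify into a no-op. The structure and names below are simplified stand-ins, not the virtio code.

    #include <stddef.h>
    #include <stdio.h>

    typedef struct { int num; void (*handle_output)(int); } Queue;

    static void tx_handler(int n) { printf("handling queue %d\n", n); }

    static void del_queue(Queue *q)
    {
        q->num = 0;
        q->handle_output = NULL;     /* the fix: drop the stale callback */
    }

    static void notify(Queue *q, int n)
    {
        if (q->handle_output) {      /* deleted queue: nothing to do */
            q->handle_output(n);
        }
    }

    int main(void)
    {
        Queue q = { .num = 256, .handle_output = tx_handler };

        notify(&q, 0);               /* prints "handling queue 0"      */
        del_queue(&q);
        notify(&q, 0);               /* safely ignored after deletion  */
        return 0;
    }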
Deleted patch | |||
---|---|---|---|
1 | In ne2000_receive(), we try to assign size_ to size which converts | ||
2 | from size_t to integer. This will cause trouble when size_ is greater than | ||
3 | INT_MAX, as this will lead to a negative value in size, and it can then pass | ||
4 | the check of size < MIN_BUF_SIZE, which may lead to out-of-bounds access | ||
5 | for both buf and buf1. | ||
6 | 1 | ||
7 | Fixing by converting the type of size to size_t. | ||
8 | |||
9 | CC: qemu-stable@nongnu.org | ||
10 | Reported-by: Daniel Shapira <daniel@twistlock.com> | ||
11 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
12 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
13 | --- | ||
14 | hw/net/ne2000.c | 4 ++-- | ||
15 | 1 file changed, 2 insertions(+), 2 deletions(-) | ||
16 | |||
17 | diff --git a/hw/net/ne2000.c b/hw/net/ne2000.c | ||
18 | index XXXXXXX..XXXXXXX 100644 | ||
19 | --- a/hw/net/ne2000.c | ||
20 | +++ b/hw/net/ne2000.c | ||
21 | @@ -XXX,XX +XXX,XX @@ static int ne2000_buffer_full(NE2000State *s) | ||
22 | ssize_t ne2000_receive(NetClientState *nc, const uint8_t *buf, size_t size_) | ||
23 | { | ||
24 | NE2000State *s = qemu_get_nic_opaque(nc); | ||
25 | - int size = size_; | ||
26 | + size_t size = size_; | ||
27 | uint8_t *p; | ||
28 | unsigned int total_len, next, avail, len, index, mcast_idx; | ||
29 | uint8_t buf1[60]; | ||
30 | @@ -XXX,XX +XXX,XX @@ ssize_t ne2000_receive(NetClientState *nc, const uint8_t *buf, size_t size_) | ||
31 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; | ||
32 | |||
33 | #if defined(DEBUG_NE2000) | ||
34 | - printf("NE2000: received len=%d\n", size); | ||
35 | + printf("NE2000: received len=%zu\n", size); | ||
36 | #endif | ||
37 | |||
38 | if (s->cmd & E8390_STOP || ne2000_buffer_full(s)) | ||
39 | -- | ||
40 | 2.5.0 | ||
41 | |||
42 | diff view generated by jsdifflib |
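All of the receive-path fixes in this series address the same pattern, illustrated in the standalone program below (assuming an LP64 host; converting an out-of-range value to int is implementation-defined but typically yields a negative number): once the length is stored in an int, a value above INT_MAX looks smaller than MIN_BUF_SIZE, takes the short-frame padding path, and the copy that follows is performed with the huge original length.

    #include <limits.h>
    #include <stdio.h>

    #define MIN_BUF_SIZE 60

    int main(void)
    {
        size_t size_ = (size_t)INT_MAX + 100;   /* attacker-influenced length */

        int    as_int  = (int)size_;            /* typically wraps negative   */
        size_t as_size = size_;

        printf("int:    %d -> padding path taken? %s\n",
               as_int, as_int < MIN_BUF_SIZE ? "yes" : "no");
        printf("size_t: %zu -> padding path taken? %s\n",
               as_size, as_size < MIN_BUF_SIZE ? "yes" : "no");

        /* With int, the bogus length compares below MIN_BUF_SIZE, so the
         * device tries to pad the "short" frame; the memcpy then converts
         * the negative length back to a huge size_t and overruns the
         * 60-byte stack buffer.  Keeping size as size_t preserves the check. */
        return 0;
    }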
Deleted patch | |||
---|---|---|---|
1 | In rtl8139_do_receive(), we assign size_ to size, which converts it | ||
2 | from size_t to int. This causes trouble when size_ is greater than | ||
3 | INT_MAX: size becomes negative and can then pass the | ||
4 | size < MIN_BUF_SIZE check, which may lead to out-of-bounds access | ||
5 | for both buf and buf1. | ||
6 | 1 | ||
7 | Fix this by converting the type of size to size_t. | ||
8 | |||
9 | CC: qemu-stable@nongnu.org | ||
10 | Reported-by: Daniel Shapira <daniel@twistlock.com> | ||
11 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
12 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
13 | --- | ||
14 | hw/net/rtl8139.c | 8 ++++---- | ||
15 | 1 file changed, 4 insertions(+), 4 deletions(-) | ||
16 | |||
17 | diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c | ||
18 | index XXXXXXX..XXXXXXX 100644 | ||
19 | --- a/hw/net/rtl8139.c | ||
20 | +++ b/hw/net/rtl8139.c | ||
21 | @@ -XXX,XX +XXX,XX @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t | ||
22 | RTL8139State *s = qemu_get_nic_opaque(nc); | ||
23 | PCIDevice *d = PCI_DEVICE(s); | ||
24 | /* size is the length of the buffer passed to the driver */ | ||
25 | - int size = size_; | ||
26 | + size_t size = size_; | ||
27 | const uint8_t *dot1q_buf = NULL; | ||
28 | |||
29 | uint32_t packet_header = 0; | ||
30 | @@ -XXX,XX +XXX,XX @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t | ||
31 | static const uint8_t broadcast_macaddr[6] = | ||
32 | { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; | ||
33 | |||
34 | - DPRINTF(">>> received len=%d\n", size); | ||
35 | + DPRINTF(">>> received len=%zu\n", size); | ||
36 | |||
37 | /* test if board clock is stopped */ | ||
38 | if (!s->clock_enabled) | ||
39 | @@ -XXX,XX +XXX,XX @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t | ||
40 | |||
41 | if (size+4 > rx_space) | ||
42 | { | ||
43 | - DPRINTF("C+ Rx mode : descriptor %d size %d received %d + 4\n", | ||
44 | + DPRINTF("C+ Rx mode : descriptor %d size %d received %zu + 4\n", | ||
45 | descriptor, rx_space, size); | ||
46 | |||
47 | s->IntrStatus |= RxOverflow; | ||
48 | @@ -XXX,XX +XXX,XX @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t | ||
49 | if (avail != 0 && RX_ALIGN(size + 8) >= avail) | ||
50 | { | ||
51 | DPRINTF("rx overflow: rx buffer length %d head 0x%04x " | ||
52 | - "read 0x%04x === available 0x%04x need 0x%04x\n", | ||
53 | + "read 0x%04x === available 0x%04x need 0x%04zx\n", | ||
54 | s->RxBufferSize, s->RxBufAddr, s->RxBufPtr, avail, size + 8); | ||
55 | |||
56 | s->IntrStatus |= RxOverflow; | ||
57 | -- | ||
58 | 2.5.0 | ||
59 | |||
Deleted patch | |||
---|---|---|---|
1 | In pcnet_receive(), we assign size_ to size, which converts it | ||
2 | from size_t to int. This causes trouble when size_ is greater than | ||
3 | INT_MAX: size becomes negative and can then pass the | ||
4 | size < MIN_BUF_SIZE check, which may lead to out-of-bounds access | ||
5 | for both buf and buf1. | ||
6 | 1 | ||
7 | Fix this by converting the type of size to size_t. | ||
8 | |||
9 | CC: qemu-stable@nongnu.org | ||
10 | Reported-by: Daniel Shapira <daniel@twistlock.com> | ||
11 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
12 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
13 | --- | ||
14 | hw/net/pcnet.c | 4 ++-- | ||
15 | 1 file changed, 2 insertions(+), 2 deletions(-) | ||
16 | |||
17 | diff --git a/hw/net/pcnet.c b/hw/net/pcnet.c | ||
18 | index XXXXXXX..XXXXXXX 100644 | ||
19 | --- a/hw/net/pcnet.c | ||
20 | +++ b/hw/net/pcnet.c | ||
21 | @@ -XXX,XX +XXX,XX @@ ssize_t pcnet_receive(NetClientState *nc, const uint8_t *buf, size_t size_) | ||
22 | uint8_t buf1[60]; | ||
23 | int remaining; | ||
24 | int crc_err = 0; | ||
25 | - int size = size_; | ||
26 | + size_t size = size_; | ||
27 | |||
28 | if (CSR_DRX(s) || CSR_STOP(s) || CSR_SPND(s) || !size || | ||
29 | (CSR_LOOP(s) && !s->looptest)) { | ||
30 | return -1; | ||
31 | } | ||
32 | #ifdef PCNET_DEBUG | ||
33 | - printf("pcnet_receive size=%d\n", size); | ||
34 | + printf("pcnet_receive size=%zu\n", size); | ||
35 | #endif | ||
36 | |||
37 | /* if too small buffer, then expand it */ | ||
38 | -- | ||
39 | 2.5.0 | ||
40 | |||
41 | diff view generated by jsdifflib |