1
The following changes since commit a73549f99612f758dec0fdea6ae1c30b6c709a0b:
1
The following changes since commit 352998df1c53b366413690d95b35f76d0721ebed:
2
2
3
Merge remote-tracking branch 'remotes/kraxel/tags/ui-20181012-pull-request' into staging (2018-10-12 16:45:51 +0100)
3
Merge tag 'i2c-20220314' of https://github.com/philmd/qemu into staging (2022-03-14 14:39:33 +0000)
4
4
5
are available in the git repository at:
5
are available in the git repository at:
6
6
7
https://github.com/jasowang/qemu.git tags/net-pull-request
7
https://github.com/jasowang/qemu.git tags/net-pull-request
8
8
9
for you to fetch changes up to a7ec0077c2db445d6bae421963188367d2695bd6:
9
for you to fetch changes up to 12a195fa343aae2ead1301ce04727bd0ae25eb15:
10
10
11
qemu-options: Fix bad "macaddr" property in the documentation (2018-10-15 16:14:15 +0800)
11
vdpa: Expose VHOST_F_LOG_ALL on SVQ (2022-03-15 13:57:44 +0800)
12
12
13
----------------------------------------------------------------
13
----------------------------------------------------------------
14
14
15
Changes since V2:
16
- fix 32-bit build errors
17
15
----------------------------------------------------------------
18
----------------------------------------------------------------
16
Jason Wang (4):
19
Eugenio Pérez (14):
17
ne2000: fix possible out of bound access in ne2000_receive
20
vhost: Add VhostShadowVirtqueue
18
rtl8139: fix possible out of bound access
21
vhost: Add Shadow VirtQueue kick forwarding capabilities
19
pcnet: fix possible buffer overflow
22
vhost: Add Shadow VirtQueue call forwarding capabilities
20
net: ignore packet size greater than INT_MAX
23
vhost: Add vhost_svq_valid_features to shadow vq
24
virtio: Add vhost_svq_get_vring_addr
25
vdpa: adapt vhost_ops callbacks to svq
26
vhost: Shadow virtqueue buffers forwarding
27
util: Add iova_tree_alloc_map
28
util: add iova_tree_find_iova
29
vhost: Add VhostIOVATree
30
vdpa: Add custom IOTLB translations to SVQ
31
vdpa: Adapt vhost_vdpa_get_vring_base to SVQ
32
vdpa: Never set log_base addr if SVQ is enabled
33
vdpa: Expose VHOST_F_LOG_ALL on SVQ
21
34
22
Martin Wilck (1):
35
Jason Wang (1):
23
e1000: indicate dropped packets in HW counters
36
virtio-net: fix map leaking on error during receive
24
37
25
Thomas Huth (1):
38
hw/net/virtio-net.c | 1 +
26
qemu-options: Fix bad "macaddr" property in the documentation
39
hw/virtio/meson.build | 2 +-
27
40
hw/virtio/vhost-iova-tree.c | 110 +++++++
28
Zhang Chen (15):
41
hw/virtio/vhost-iova-tree.h | 27 ++
29
filter-rewriter: Add TCP state machine and fix memory leak in connection_track_table
42
hw/virtio/vhost-shadow-virtqueue.c | 636 +++++++++++++++++++++++++++++++++++++
30
colo-compare: implement the process of checkpoint
43
hw/virtio/vhost-shadow-virtqueue.h | 87 +++++
31
colo-compare: use notifier to notify packets comparing result
44
hw/virtio/vhost-vdpa.c | 522 +++++++++++++++++++++++++++++-
32
COLO: integrate colo compare with colo frame
45
include/hw/virtio/vhost-vdpa.h | 8 +
33
COLO: Add block replication into colo process
46
include/qemu/iova-tree.h | 38 ++-
34
COLO: Remove colo_state migration struct
47
util/iova-tree.c | 170 ++++++++++
35
COLO: Load dirty pages into SVM's RAM cache firstly
48
10 files changed, 1584 insertions(+), 17 deletions(-)
36
ram/COLO: Record the dirty pages that SVM received
49
create mode 100644 hw/virtio/vhost-iova-tree.c
37
COLO: Flush memory data from ram cache
50
create mode 100644 hw/virtio/vhost-iova-tree.h
38
qapi/migration.json: Rename COLO unknown mode to none mode.
51
create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
39
qapi: Add new command to query colo status
52
create mode 100644 hw/virtio/vhost-shadow-virtqueue.h
40
savevm: split the process of different stages for loadvm/savevm
41
filter: Add handle_event method for NetFilterClass
42
filter-rewriter: handle checkpoint and failover event
43
docs: Add COLO status diagram to COLO-FT.txt
44
45
liujunjie (1):
46
clean up callback when del virtqueue
47
48
zhanghailiang (4):
49
qmp event: Add COLO_EXIT event to notify users while exited COLO
50
COLO: flush host dirty ram from cache
51
COLO: notify net filters about checkpoint/failover event
52
COLO: quick failover process by kick COLO thread
53
54
docs/COLO-FT.txt | 34 ++++++++
55
hw/net/e1000.c | 16 +++-
56
hw/net/ne2000.c | 4 +-
57
hw/net/pcnet.c | 4 +-
58
hw/net/rtl8139.c | 8 +-
59
hw/net/trace-events | 3 +
60
hw/virtio/virtio.c | 2 +
61
include/exec/ram_addr.h | 1 +
62
include/migration/colo.h | 11 ++-
63
include/net/filter.h | 5 ++
64
migration/Makefile.objs | 2 +-
65
migration/colo-comm.c | 76 -----------------
66
migration/colo-failover.c | 2 +-
67
migration/colo.c | 212 +++++++++++++++++++++++++++++++++++++++++++---
68
migration/migration.c | 46 ++++++++--
69
migration/ram.c | 166 +++++++++++++++++++++++++++++++++++-
70
migration/ram.h | 4 +
71
migration/savevm.c | 53 ++++++++++--
72
migration/savevm.h | 5 ++
73
migration/trace-events | 3 +
74
net/colo-compare.c | 115 ++++++++++++++++++++++---
75
net/colo-compare.h | 24 ++++++
76
net/colo.c | 10 ++-
77
net/colo.h | 11 +--
78
net/filter-rewriter.c | 166 +++++++++++++++++++++++++++++++++---
79
net/filter.c | 17 ++++
80
net/net.c | 26 +++++-
81
qapi/migration.json | 80 +++++++++++++++--
82
qemu-options.hx | 2 +-
83
vl.c | 2 -
84
30 files changed, 958 insertions(+), 152 deletions(-)
85
delete mode 100644 migration/colo-comm.c
86
create mode 100644 net/colo-compare.h
87
53
88
54
89
55
1
From: Thomas Huth <thuth@redhat.com>
1
Commit bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
2
tries to fix the use after free of the sg by caching the virtqueue
3
elements in an array and unmap them at once after receiving the
4
packets, but it forgot to unmap the cached elements on error, which
5
will leak the mappings and cause other unexpected results.
2
6
3
When using the "-device" option, the property is called "mac".
7
Fix this by detaching the cached elements on error. This addresses
4
"macaddr" is only used for the legacy "-net nic" option.
8
CVE-2022-26353.
5
9
6
Reported-by: Harald Hoyer <harald@redhat.com>
10
Reported-by: Victor Tom <vv474172261@gmail.com>
7
Reviewed-by: Markus Armbruster <armbru@redhat.com>
11
Cc: qemu-stable@nongnu.org
8
Signed-off-by: Thomas Huth <thuth@redhat.com>
12
Fixes: CVE-2022-26353
13
Fixes: bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
14
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
9
Signed-off-by: Jason Wang <jasowang@redhat.com>
15
Signed-off-by: Jason Wang <jasowang@redhat.com>
10
---
16
---
11
qemu-options.hx | 2 +-
17
hw/net/virtio-net.c | 1 +
12
1 file changed, 1 insertion(+), 1 deletion(-)
18
1 file changed, 1 insertion(+)
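
For illustration only (not part of either patch), the unwind-on-error pattern the fix follows, in a stand-alone form with malloc()/free() standing in for the element map/unmap calls:

#include <stdlib.h>

/* Sketch: everything acquired before the failure point must be released
 * again, otherwise it leaks -- in the real patch the release step is
 * virtqueue_detach_element() on the cached elements. */
static int process_batch(void *elems[], size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        elems[i] = malloc(64);         /* stands in for mapping one element */
        if (!elems[i]) {
            goto err;
        }
    }
    return 0;                          /* caller consumes and frees later */

err:
    while (i--) {                      /* unwind only what was acquired */
        free(elems[i]);
    }
    return -1;
}
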
13
19
14
diff --git a/qemu-options.hx b/qemu-options.hx
20
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
15
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
16
--- a/qemu-options.hx
22
--- a/hw/net/virtio-net.c
17
+++ b/qemu-options.hx
23
+++ b/hw/net/virtio-net.c
18
@@ -XXX,XX +XXX,XX @@ qemu-system-i386 linux.img \
24
@@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
19
-netdev socket,id=n2,mcast=230.0.0.1:1234
25
20
# launch yet another QEMU instance on same "bus"
26
err:
21
qemu-system-i386 linux.img \
27
for (j = 0; j < i; j++) {
22
- -device e1000,netdev=n3,macaddr=52:54:00:12:34:58 \
28
+ virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
23
+ -device e1000,netdev=n3,mac=52:54:00:12:34:58 \
29
g_free(elems[j]);
24
-netdev socket,id=n3,mcast=230.0.0.1:1234
30
}
25
@end example
26
31
27
--
32
--
28
2.5.0
33
2.7.4
29
30
1
From: Zhang Chen <zhangckid@gmail.com>
1
From: Eugenio Pérez <eperezma@redhat.com>
2
2
3
While doing a checkpoint, we need to flush all the unhandled packets.
3
Vhost shadow virtqueue (SVQ) is an intermediate jump for virtqueue
4
By using the filter notifier mechanism, we can easily notify
4
notifications and buffers, allowing qemu to track them. While qemu is
5
every compare object to do this process, which runs inside
5
forwarding the buffers and virtqueue changes, it is able to commit the
6
the compare threads as a coroutine.
6
memory that is being dirtied, the same way regular qemu VirtIO devices
7
do.
7
8
8
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
9
This commit only exposes basic SVQ allocation and free. Next patches of
9
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
10
the series add functionality like notification and buffer forwarding.
10
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
11
12
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
13
Acked-by: Michael S. Tsirkin <mst@redhat.com>
11
Signed-off-by: Jason Wang <jasowang@redhat.com>
14
Signed-off-by: Jason Wang <jasowang@redhat.com>
12
---
15
---
13
include/migration/colo.h | 6 ++++
16
hw/virtio/meson.build | 2 +-
14
net/colo-compare.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++
17
hw/virtio/vhost-shadow-virtqueue.c | 62 ++++++++++++++++++++++++++++++++++++++
15
net/colo-compare.h | 22 ++++++++++++++
18
hw/virtio/vhost-shadow-virtqueue.h | 28 +++++++++++++++++
16
3 files changed, 106 insertions(+)
19
3 files changed, 91 insertions(+), 1 deletion(-)
17
create mode 100644 net/colo-compare.h
20
create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
21
create mode 100644 hw/virtio/vhost-shadow-virtqueue.h
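
As a hedged usage illustration (modelled on the vhost-vdpa patch later in this series rather than on any new API), one shadow virtqueue per vhost queue could be created and released like this; the helper name svqs_new() is invented for the example:

#include "qemu/osdep.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"

/* Allocate one SVQ per queue; on failure the g_autoptr cleanups release
 * whatever was already created through vhost_svq_free(). */
static GPtrArray *svqs_new(unsigned nvqs)
{
    g_autoptr(GPtrArray) svqs = g_ptr_array_new_full(nvqs, vhost_svq_free);

    for (unsigned n = 0; n < nvqs; ++n) {
        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();

        if (!svq) {
            return NULL;               /* queues added so far are freed */
        }
        g_ptr_array_add(svqs, g_steal_pointer(&svq));
    }
    return g_steal_pointer(&svqs);
}
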
18
22
19
diff --git a/include/migration/colo.h b/include/migration/colo.h
23
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
20
index XXXXXXX..XXXXXXX 100644
24
index XXXXXXX..XXXXXXX 100644
21
--- a/include/migration/colo.h
25
--- a/hw/virtio/meson.build
22
+++ b/include/migration/colo.h
26
+++ b/hw/virtio/meson.build
23
@@ -XXX,XX +XXX,XX @@
27
@@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
24
#include "qemu-common.h"
28
25
#include "qapi/qapi-types-migration.h"
29
virtio_ss = ss.source_set()
26
30
virtio_ss.add(files('virtio.c'))
27
+enum colo_event {
31
-virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c'))
28
+ COLO_EVENT_NONE,
32
+virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
29
+ COLO_EVENT_CHECKPOINT,
33
virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
30
+ COLO_EVENT_FAILOVER,
34
virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
31
+};
35
virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
32
+
36
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
33
void colo_info_init(void);
34
35
void migrate_start_colo_process(MigrationState *s);
36
diff --git a/net/colo-compare.c b/net/colo-compare.c
37
index XXXXXXX..XXXXXXX 100644
38
--- a/net/colo-compare.c
39
+++ b/net/colo-compare.c
40
@@ -XXX,XX +XXX,XX @@
41
#include "qemu/sockets.h"
42
#include "colo.h"
43
#include "sysemu/iothread.h"
44
+#include "net/colo-compare.h"
45
+#include "migration/colo.h"
46
47
#define TYPE_COLO_COMPARE "colo-compare"
48
#define COLO_COMPARE(obj) \
49
OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE)
50
51
+static QTAILQ_HEAD(, CompareState) net_compares =
52
+ QTAILQ_HEAD_INITIALIZER(net_compares);
53
+
54
#define COMPARE_READ_LEN_MAX NET_BUFSIZE
55
#define MAX_QUEUE_SIZE 1024
56
57
@@ -XXX,XX +XXX,XX @@
58
/* TODO: Should be configurable */
59
#define REGULAR_PACKET_CHECK_MS 3000
60
61
+static QemuMutex event_mtx;
62
+static QemuCond event_complete_cond;
63
+static int event_unhandled_count;
64
+
65
/*
66
* + CompareState ++
67
* | |
68
@@ -XXX,XX +XXX,XX @@ typedef struct CompareState {
69
IOThread *iothread;
70
GMainContext *worker_context;
71
QEMUTimer *packet_check_timer;
72
+
73
+ QEMUBH *event_bh;
74
+ enum colo_event event;
75
+
76
+ QTAILQ_ENTRY(CompareState) next;
77
} CompareState;
78
79
typedef struct CompareClass {
80
@@ -XXX,XX +XXX,XX @@ static void check_old_packet_regular(void *opaque)
81
REGULAR_PACKET_CHECK_MS);
82
}
83
84
+/* Public API, Used for COLO frame to notify compare event */
85
+void colo_notify_compares_event(void *opaque, int event, Error **errp)
86
+{
87
+ CompareState *s;
88
+
89
+ qemu_mutex_lock(&event_mtx);
90
+ QTAILQ_FOREACH(s, &net_compares, next) {
91
+ s->event = event;
92
+ qemu_bh_schedule(s->event_bh);
93
+ event_unhandled_count++;
94
+ }
95
+ /* Wait all compare threads to finish handling this event */
96
+ while (event_unhandled_count > 0) {
97
+ qemu_cond_wait(&event_complete_cond, &event_mtx);
98
+ }
99
+
100
+ qemu_mutex_unlock(&event_mtx);
101
+}
102
+
103
static void colo_compare_timer_init(CompareState *s)
104
{
105
AioContext *ctx = iothread_get_aio_context(s->iothread);
106
@@ -XXX,XX +XXX,XX @@ static void colo_compare_timer_del(CompareState *s)
107
}
108
}
109
110
+static void colo_flush_packets(void *opaque, void *user_data);
111
+
112
+static void colo_compare_handle_event(void *opaque)
113
+{
114
+ CompareState *s = opaque;
115
+
116
+ switch (s->event) {
117
+ case COLO_EVENT_CHECKPOINT:
118
+ g_queue_foreach(&s->conn_list, colo_flush_packets, s);
119
+ break;
120
+ case COLO_EVENT_FAILOVER:
121
+ break;
122
+ default:
123
+ break;
124
+ }
125
+
126
+ assert(event_unhandled_count > 0);
127
+
128
+ qemu_mutex_lock(&event_mtx);
129
+ event_unhandled_count--;
130
+ qemu_cond_broadcast(&event_complete_cond);
131
+ qemu_mutex_unlock(&event_mtx);
132
+}
133
+
134
static void colo_compare_iothread(CompareState *s)
135
{
136
object_ref(OBJECT(s->iothread));
137
@@ -XXX,XX +XXX,XX @@ static void colo_compare_iothread(CompareState *s)
138
s, s->worker_context, true);
139
140
colo_compare_timer_init(s);
141
+ s->event_bh = qemu_bh_new(colo_compare_handle_event, s);
142
}
143
144
static char *compare_get_pri_indev(Object *obj, Error **errp)
145
@@ -XXX,XX +XXX,XX @@ static void colo_compare_complete(UserCreatable *uc, Error **errp)
146
net_socket_rs_init(&s->pri_rs, compare_pri_rs_finalize, s->vnet_hdr);
147
net_socket_rs_init(&s->sec_rs, compare_sec_rs_finalize, s->vnet_hdr);
148
149
+ QTAILQ_INSERT_TAIL(&net_compares, s, next);
150
+
151
g_queue_init(&s->conn_list);
152
153
+ qemu_mutex_init(&event_mtx);
154
+ qemu_cond_init(&event_complete_cond);
155
+
156
s->connection_track_table = g_hash_table_new_full(connection_key_hash,
157
connection_key_equal,
158
g_free,
159
@@ -XXX,XX +XXX,XX @@ static void colo_compare_init(Object *obj)
160
static void colo_compare_finalize(Object *obj)
161
{
162
CompareState *s = COLO_COMPARE(obj);
163
+ CompareState *tmp = NULL;
164
165
qemu_chr_fe_deinit(&s->chr_pri_in, false);
166
qemu_chr_fe_deinit(&s->chr_sec_in, false);
167
@@ -XXX,XX +XXX,XX @@ static void colo_compare_finalize(Object *obj)
168
if (s->iothread) {
169
colo_compare_timer_del(s);
170
}
171
+
172
+ qemu_bh_delete(s->event_bh);
173
+
174
+ QTAILQ_FOREACH(tmp, &net_compares, next) {
175
+ if (tmp == s) {
176
+ QTAILQ_REMOVE(&net_compares, s, next);
177
+ break;
178
+ }
179
+ }
180
+
181
/* Release all unhandled packets after compare thead exited */
182
g_queue_foreach(&s->conn_list, colo_flush_packets, s);
183
184
@@ -XXX,XX +XXX,XX @@ static void colo_compare_finalize(Object *obj)
185
if (s->iothread) {
186
object_unref(OBJECT(s->iothread));
187
}
188
+
189
+ qemu_mutex_destroy(&event_mtx);
190
+ qemu_cond_destroy(&event_complete_cond);
191
+
192
g_free(s->pri_indev);
193
g_free(s->sec_indev);
194
g_free(s->outdev);
195
diff --git a/net/colo-compare.h b/net/colo-compare.h
196
new file mode 100644
37
new file mode 100644
197
index XXXXXXX..XXXXXXX
38
index XXXXXXX..XXXXXXX
198
--- /dev/null
39
--- /dev/null
199
+++ b/net/colo-compare.h
40
+++ b/hw/virtio/vhost-shadow-virtqueue.c
200
@@ -XXX,XX +XXX,XX @@
41
@@ -XXX,XX +XXX,XX @@
201
+/*
42
+/*
202
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
43
+ * vhost shadow virtqueue
203
+ * (a.k.a. Fault Tolerance or Continuous Replication)
204
+ *
44
+ *
205
+ * Copyright (c) 2017 HUAWEI TECHNOLOGIES CO., LTD.
45
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
206
+ * Copyright (c) 2017 FUJITSU LIMITED
46
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
207
+ * Copyright (c) 2017 Intel Corporation
208
+ *
47
+ *
209
+ * Authors:
48
+ * SPDX-License-Identifier: GPL-2.0-or-later
210
+ * zhanghailiang <zhang.zhanghailiang@huawei.com>
211
+ * Zhang Chen <zhangckid@gmail.com>
212
+ *
213
+ * This work is licensed under the terms of the GNU GPL, version 2 or
214
+ * later. See the COPYING file in the top-level directory.
215
+ */
49
+ */
216
+
50
+
217
+#ifndef QEMU_COLO_COMPARE_H
51
+#include "qemu/osdep.h"
218
+#define QEMU_COLO_COMPARE_H
52
+#include "hw/virtio/vhost-shadow-virtqueue.h"
219
+
53
+
220
+void colo_notify_compares_event(void *opaque, int event, Error **errp);
54
+#include "qemu/error-report.h"
221
+
55
+
222
+#endif /* QEMU_COLO_COMPARE_H */
56
+/**
57
+ * Creates vhost shadow virtqueue, and instructs the vhost device to use the
58
+ * shadow methods and file descriptors.
59
+ *
60
+ * Returns the new virtqueue or NULL.
61
+ *
62
+ * In case of error, reason is reported through error_report.
63
+ */
64
+VhostShadowVirtqueue *vhost_svq_new(void)
65
+{
66
+ g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
67
+ int r;
68
+
69
+ r = event_notifier_init(&svq->hdev_kick, 0);
70
+ if (r != 0) {
71
+ error_report("Couldn't create kick event notifier: %s (%d)",
72
+ g_strerror(errno), errno);
73
+ goto err_init_hdev_kick;
74
+ }
75
+
76
+ r = event_notifier_init(&svq->hdev_call, 0);
77
+ if (r != 0) {
78
+ error_report("Couldn't create call event notifier: %s (%d)",
79
+ g_strerror(errno), errno);
80
+ goto err_init_hdev_call;
81
+ }
82
+
83
+ return g_steal_pointer(&svq);
84
+
85
+err_init_hdev_call:
86
+ event_notifier_cleanup(&svq->hdev_kick);
87
+
88
+err_init_hdev_kick:
89
+ return NULL;
90
+}
91
+
92
+/**
93
+ * Free the resources of the shadow virtqueue.
94
+ *
95
+ * @pvq: gpointer to SVQ so it can be used by autofree functions.
96
+ */
97
+void vhost_svq_free(gpointer pvq)
98
+{
99
+ VhostShadowVirtqueue *vq = pvq;
100
+ event_notifier_cleanup(&vq->hdev_kick);
101
+ event_notifier_cleanup(&vq->hdev_call);
102
+ g_free(vq);
103
+}
104
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
105
new file mode 100644
106
index XXXXXXX..XXXXXXX
107
--- /dev/null
108
+++ b/hw/virtio/vhost-shadow-virtqueue.h
109
@@ -XXX,XX +XXX,XX @@
110
+/*
111
+ * vhost shadow virtqueue
112
+ *
113
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
114
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
115
+ *
116
+ * SPDX-License-Identifier: GPL-2.0-or-later
117
+ */
118
+
119
+#ifndef VHOST_SHADOW_VIRTQUEUE_H
120
+#define VHOST_SHADOW_VIRTQUEUE_H
121
+
122
+#include "qemu/event_notifier.h"
123
+
124
+/* Shadow virtqueue to relay notifications */
125
+typedef struct VhostShadowVirtqueue {
126
+ /* Shadow kick notifier, sent to vhost */
127
+ EventNotifier hdev_kick;
128
+ /* Shadow call notifier, sent to vhost */
129
+ EventNotifier hdev_call;
130
+} VhostShadowVirtqueue;
131
+
132
+VhostShadowVirtqueue *vhost_svq_new(void);
133
+
134
+void vhost_svq_free(gpointer vq);
135
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
136
+
137
+#endif
223
--
138
--
224
2.5.0
139
2.7.4
225
140
226
141
1
From: Zhang Chen <zhangckid@gmail.com>
1
From: Eugenio Pérez <eperezma@redhat.com>
2
2
3
For COLO FT, both the PVM and SVM run at the same time,
3
In this mode, no buffer forwarding is performed by SVQ: Qemu
4
and only sync the state when needed.
4
will just forward the guest's kicks to the device.
5
5
6
So here, let the SVM run while not doing a checkpoint, and change
6
Host memory notifier regions are left out for simplicity, and they will
7
DEFAULT_MIGRATE_X_CHECKPOINT_DELAY to 200*100.
7
not be addressed in this series.
8
8
9
Besides, we forgot to release colo_checkpoint_sem and
9
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
10
colo_delay_timer, fix them here.
10
Acked-by: Michael S. Tsirkin <mst@redhat.com>
11
12
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
13
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
14
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
15
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
16
Signed-off-by: Jason Wang <jasowang@redhat.com>
11
Signed-off-by: Jason Wang <jasowang@redhat.com>
17
---
12
---
18
migration/colo.c | 42 ++++++++++++++++++++++++++++++++++++++++--
13
hw/virtio/vhost-shadow-virtqueue.c | 55 ++++++++++++++
19
migration/migration.c | 6 ++----
14
hw/virtio/vhost-shadow-virtqueue.h | 14 ++++
20
2 files changed, 42 insertions(+), 6 deletions(-)
15
hw/virtio/vhost-vdpa.c | 144 ++++++++++++++++++++++++++++++++++++-
21
16
include/hw/virtio/vhost-vdpa.h | 4 ++
22
diff --git a/migration/colo.c b/migration/colo.c
17
4 files changed, 215 insertions(+), 2 deletions(-)
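
For illustration only, a stand-alone sketch of the kick relay idea using plain Linux eventfds instead of QEMU's EventNotifier wrappers; the function and descriptor names are invented for the example:

#include <stdint.h>
#include <sys/eventfd.h>
#include <unistd.h>

/* Consume the guest's kick and re-signal it on the descriptor the vhost
 * device listens to (both assumed to be eventfds). */
static void relay_kick(int guest_kick_fd, int device_kick_fd)
{
    uint64_t cnt;

    if (read(guest_kick_fd, &cnt, sizeof(cnt)) == sizeof(cnt)) {
        cnt = 1;
        (void)write(device_kick_fd, &cnt, sizeof(cnt));
    }
}
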
18
19
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
23
index XXXXXXX..XXXXXXX 100644
20
index XXXXXXX..XXXXXXX 100644
24
--- a/migration/colo.c
21
--- a/hw/virtio/vhost-shadow-virtqueue.c
25
+++ b/migration/colo.c
22
+++ b/hw/virtio/vhost-shadow-virtqueue.c
26
@@ -XXX,XX +XXX,XX @@
23
@@ -XXX,XX +XXX,XX @@
24
#include "hw/virtio/vhost-shadow-virtqueue.h"
25
27
#include "qemu/error-report.h"
26
#include "qemu/error-report.h"
28
#include "migration/failover.h"
27
+#include "qemu/main-loop.h"
29
#include "replication.h"
28
+#include "linux-headers/linux/vhost.h"
30
+#include "net/colo-compare.h"
29
+
31
+#include "net/colo.h"
30
+/**
32
31
+ * Forward guest notifications.
33
static bool vmstate_loading;
32
+ *
34
+static Notifier packets_compare_notifier;
33
+ * @n: guest kick event notifier, the one that guest set to notify svq.
35
34
+ */
36
#define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024)
35
+static void vhost_handle_guest_kick(EventNotifier *n)
37
36
+{
38
@@ -XXX,XX +XXX,XX @@ static int colo_do_checkpoint_transaction(MigrationState *s,
37
+ VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
39
goto out;
38
+ event_notifier_test_and_clear(n);
39
+ event_notifier_set(&svq->hdev_kick);
40
+}
41
+
42
+/**
43
+ * Set a new file descriptor for the guest to kick the SVQ and notify for avail
44
+ *
45
+ * @svq: The svq
46
+ * @svq_kick_fd: The svq kick fd
47
+ *
48
+ * Note that the SVQ will never close the old file descriptor.
49
+ */
50
+void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
51
+{
52
+ EventNotifier *svq_kick = &svq->svq_kick;
53
+ bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
54
+ bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;
55
+
56
+ if (poll_stop) {
57
+ event_notifier_set_handler(svq_kick, NULL);
58
+ }
59
+
60
+ /*
61
+ * event_notifier_set_handler already checks for guest's notifications if
62
+ * they arrive at the new file descriptor in the switch, so there is no
63
+ * need to explicitly check for them.
64
+ */
65
+ if (poll_start) {
66
+ event_notifier_init_fd(svq_kick, svq_kick_fd);
67
+ event_notifier_set(svq_kick);
68
+ event_notifier_set_handler(svq_kick, vhost_handle_guest_kick);
69
+ }
70
+}
71
+
72
+/**
73
+ * Stop the shadow virtqueue operation.
74
+ * @svq: Shadow Virtqueue
75
+ */
76
+void vhost_svq_stop(VhostShadowVirtqueue *svq)
77
+{
78
+ event_notifier_set_handler(&svq->svq_kick, NULL);
79
+}
80
81
/**
82
* Creates vhost shadow virtqueue, and instructs the vhost device to use the
83
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
84
goto err_init_hdev_call;
40
}
85
}
41
86
42
+ colo_notify_compares_event(NULL, COLO_EVENT_CHECKPOINT, &local_err);
87
+ event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
43
+ if (local_err) {
88
return g_steal_pointer(&svq);
44
+ goto out;
89
45
+ }
90
err_init_hdev_call:
46
+
91
@@ -XXX,XX +XXX,XX @@ err_init_hdev_kick:
47
/* Disable block migration */
92
void vhost_svq_free(gpointer pvq)
48
migrate_set_block_enabled(false, &local_err);
93
{
49
qemu_savevm_state_header(fb);
94
VhostShadowVirtqueue *vq = pvq;
50
@@ -XXX,XX +XXX,XX @@ out:
95
+ vhost_svq_stop(vq);
96
event_notifier_cleanup(&vq->hdev_kick);
97
event_notifier_cleanup(&vq->hdev_call);
98
g_free(vq);
99
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
100
index XXXXXXX..XXXXXXX 100644
101
--- a/hw/virtio/vhost-shadow-virtqueue.h
102
+++ b/hw/virtio/vhost-shadow-virtqueue.h
103
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
104
EventNotifier hdev_kick;
105
/* Shadow call notifier, sent to vhost */
106
EventNotifier hdev_call;
107
+
108
+ /*
109
+ * Borrowed virtqueue's guest to host notifier. To borrow it in this event
110
+ * notifier allows to recover the VhostShadowVirtqueue from the event loop
111
+ * easily. If we use the VirtQueue's one, we don't have an easy way to
112
+ * retrieve VhostShadowVirtqueue.
113
+ *
114
+ * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
115
+ */
116
+ EventNotifier svq_kick;
117
} VhostShadowVirtqueue;
118
119
+void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
120
+
121
+void vhost_svq_stop(VhostShadowVirtqueue *svq);
122
+
123
VhostShadowVirtqueue *vhost_svq_new(void);
124
125
void vhost_svq_free(gpointer vq);
126
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
127
index XXXXXXX..XXXXXXX 100644
128
--- a/hw/virtio/vhost-vdpa.c
129
+++ b/hw/virtio/vhost-vdpa.c
130
@@ -XXX,XX +XXX,XX @@
131
#include "hw/virtio/vhost.h"
132
#include "hw/virtio/vhost-backend.h"
133
#include "hw/virtio/virtio-net.h"
134
+#include "hw/virtio/vhost-shadow-virtqueue.h"
135
#include "hw/virtio/vhost-vdpa.h"
136
#include "exec/address-spaces.h"
137
#include "qemu/main-loop.h"
138
#include "cpu.h"
139
#include "trace.h"
140
#include "qemu-common.h"
141
+#include "qapi/error.h"
142
143
/*
144
* Return one past the end of the end of section. Be careful with uint64_t
145
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
146
return v->index != 0;
147
}
148
149
+static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
150
+ Error **errp)
151
+{
152
+ g_autoptr(GPtrArray) shadow_vqs = NULL;
153
+
154
+ if (!v->shadow_vqs_enabled) {
155
+ return 0;
156
+ }
157
+
158
+ shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
159
+ for (unsigned n = 0; n < hdev->nvqs; ++n) {
160
+ g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
161
+
162
+ if (unlikely(!svq)) {
163
+ error_setg(errp, "Cannot create svq %u", n);
164
+ return -1;
165
+ }
166
+ g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
167
+ }
168
+
169
+ v->shadow_vqs = g_steal_pointer(&shadow_vqs);
170
+ return 0;
171
+}
172
+
173
static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
174
{
175
struct vhost_vdpa *v;
176
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
177
dev->opaque = opaque ;
178
v->listener = vhost_vdpa_memory_listener;
179
v->msg_type = VHOST_IOTLB_MSG_V2;
180
+ ret = vhost_vdpa_init_svq(dev, v, errp);
181
+ if (ret) {
182
+ goto err;
183
+ }
184
185
vhost_vdpa_get_iova_range(v);
186
187
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
188
VIRTIO_CONFIG_S_DRIVER);
189
190
return 0;
191
+
192
+err:
193
+ ram_block_discard_disable(false);
194
+ return ret;
195
}
196
197
static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
198
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
199
200
static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
201
{
202
+ struct vhost_vdpa *v = dev->opaque;
203
int i;
204
205
+ if (v->shadow_vqs_enabled) {
206
+ /* FIXME SVQ is not compatible with host notifiers mr */
207
+ return;
208
+ }
209
+
210
for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
211
if (vhost_vdpa_host_notifier_init(dev, i)) {
212
goto err;
213
@@ -XXX,XX +XXX,XX @@ err:
214
return;
215
}
216
217
+static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
218
+{
219
+ struct vhost_vdpa *v = dev->opaque;
220
+ size_t idx;
221
+
222
+ if (!v->shadow_vqs) {
223
+ return;
224
+ }
225
+
226
+ for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
227
+ vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
228
+ }
229
+ g_ptr_array_free(v->shadow_vqs, true);
230
+}
231
+
232
static int vhost_vdpa_cleanup(struct vhost_dev *dev)
233
{
234
struct vhost_vdpa *v;
235
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_cleanup(struct vhost_dev *dev)
236
trace_vhost_vdpa_cleanup(dev, v);
237
vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
238
memory_listener_unregister(&v->listener);
239
+ vhost_vdpa_svq_cleanup(dev);
240
241
dev->opaque = NULL;
242
ram_block_discard_disable(false);
243
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
51
return ret;
244
return ret;
52
}
245
}
53
246
54
+static void colo_compare_notify_checkpoint(Notifier *notifier, void *data)
247
+static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
55
+{
248
+{
56
+ colo_checkpoint_notify(data);
249
+ if (!v->shadow_vqs_enabled) {
57
+}
250
+ return;
58
+
251
+ }
59
static void colo_process_checkpoint(MigrationState *s)
252
+
60
{
253
+ for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
61
QIOChannelBuffer *bioc;
254
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
62
@@ -XXX,XX +XXX,XX @@ static void colo_process_checkpoint(MigrationState *s)
255
+ vhost_svq_stop(svq);
63
goto out;
256
+ }
64
}
257
+}
65
258
+
66
+ packets_compare_notifier.notify = colo_compare_notify_checkpoint;
259
static int vhost_vdpa_reset_device(struct vhost_dev *dev)
67
+ colo_compare_register_notifier(&packets_compare_notifier);
260
{
68
+
261
+ struct vhost_vdpa *v = dev->opaque;
69
/*
262
int ret;
70
* Wait for Secondary finish loading VM states and enter COLO
263
uint8_t status = 0;
71
* restore.
264
72
@@ -XXX,XX +XXX,XX @@ out:
265
+ vhost_vdpa_reset_svq(v);
73
qemu_fclose(fb);
266
+
74
}
267
ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
75
268
trace_vhost_vdpa_reset_device(dev, status);
76
- timer_del(s->colo_delay_timer);
269
return ret;
77
-
270
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
78
/* Hope this not to be too long to wait here */
271
return ret;
79
qemu_sem_wait(&s->colo_exit_sem);
272
}
80
qemu_sem_destroy(&s->colo_exit_sem);
273
81
+
274
+static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
82
+ /*
275
+ struct vhost_vring_file *file)
83
+ * It is safe to unregister notifier after failover finished.
276
+{
84
+ * Besides, colo_delay_timer and colo_checkpoint_sem can't be
277
+ trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
85
+ * released befor unregister notifier, or there will be use-after-free
278
+ return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
86
+ * error.
279
+}
87
+ */
280
+
88
+ colo_compare_unregister_notifier(&packets_compare_notifier);
281
+/**
89
+ timer_del(s->colo_delay_timer);
282
+ * Set the shadow virtqueue descriptors to the device
90
+ timer_free(s->colo_delay_timer);
283
+ *
91
+ qemu_sem_destroy(&s->colo_checkpoint_sem);
284
+ * @dev: The vhost device model
92
+
285
+ * @svq: The shadow virtqueue
93
/*
286
+ * @idx: The index of the virtqueue in the vhost device
94
* Must be called after failover BH is completed,
287
+ * @errp: Error
95
* Or the failover BH may shutdown the wrong fd that
288
+ */
96
@@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque)
289
+static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
97
fb = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
290
+ VhostShadowVirtqueue *svq, unsigned idx,
98
object_unref(OBJECT(bioc));
291
+ Error **errp)
99
292
+{
100
+ qemu_mutex_lock_iothread();
293
+ struct vhost_vring_file file = {
101
+ vm_start();
294
+ .index = dev->vq_index + idx,
102
+ trace_colo_vm_state_change("stop", "run");
295
+ };
103
+ qemu_mutex_unlock_iothread();
296
+ const EventNotifier *event_notifier = &svq->hdev_kick;
104
+
297
+ int r;
105
colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_READY,
298
+
106
&local_err);
299
+ file.fd = event_notifier_get_fd(event_notifier);
107
if (local_err) {
300
+ r = vhost_vdpa_set_vring_dev_kick(dev, &file);
108
@@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque)
301
+ if (unlikely(r != 0)) {
109
goto out;
302
+ error_setg_errno(errp, -r, "Can't set device kick fd");
110
}
303
+ }
111
304
+
112
+ qemu_mutex_lock_iothread();
305
+ return r == 0;
113
+ vm_stop_force_state(RUN_STATE_COLO);
306
+}
114
+ trace_colo_vm_state_change("run", "stop");
307
+
115
+ qemu_mutex_unlock_iothread();
308
+static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
116
+
309
+{
117
/* FIXME: This is unnecessary for periodic checkpoint mode */
310
+ struct vhost_vdpa *v = dev->opaque;
118
colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_REPLY,
311
+ Error *err = NULL;
119
&local_err);
312
+ unsigned i;
120
@@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque)
313
+
121
}
314
+ if (!v->shadow_vqs) {
122
315
+ return true;
123
vmstate_loading = false;
316
+ }
124
+ vm_start();
317
+
125
+ trace_colo_vm_state_change("stop", "run");
318
+ for (i = 0; i < v->shadow_vqs->len; ++i) {
126
qemu_mutex_unlock_iothread();
319
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
127
320
+ bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
128
if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) {
321
+ if (unlikely(!ok)) {
129
diff --git a/migration/migration.c b/migration/migration.c
322
+ error_reportf_err(err, "Cannot setup SVQ %u: ", i);
323
+ return false;
324
+ }
325
+ }
326
+
327
+ return true;
328
+}
329
+
330
static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
331
{
332
struct vhost_vdpa *v = dev->opaque;
333
+ bool ok;
334
trace_vhost_vdpa_dev_start(dev, started);
335
336
if (started) {
337
vhost_vdpa_host_notifiers_init(dev);
338
+ ok = vhost_vdpa_svqs_start(dev);
339
+ if (unlikely(!ok)) {
340
+ return -1;
341
+ }
342
vhost_vdpa_set_vring_ready(dev);
343
} else {
344
vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
345
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
346
static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
347
struct vhost_vring_file *file)
348
{
349
- trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
350
- return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
351
+ struct vhost_vdpa *v = dev->opaque;
352
+ int vdpa_idx = file->index - dev->vq_index;
353
+
354
+ if (v->shadow_vqs_enabled) {
355
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
356
+ vhost_svq_set_svq_kick_fd(svq, file->fd);
357
+ return 0;
358
+ } else {
359
+ return vhost_vdpa_set_vring_dev_kick(dev, file);
360
+ }
361
}
362
363
static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
364
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
130
index XXXXXXX..XXXXXXX 100644
365
index XXXXXXX..XXXXXXX 100644
131
--- a/migration/migration.c
366
--- a/include/hw/virtio/vhost-vdpa.h
132
+++ b/migration/migration.c
367
+++ b/include/hw/virtio/vhost-vdpa.h
133
@@ -XXX,XX +XXX,XX @@
368
@@ -XXX,XX +XXX,XX @@
134
/* Migration XBZRLE default cache size */
369
#ifndef HW_VIRTIO_VHOST_VDPA_H
135
#define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024)
370
#define HW_VIRTIO_VHOST_VDPA_H
136
371
137
-/* The delay time (in ms) between two COLO checkpoints
372
+#include <gmodule.h>
138
- * Note: Please change this default value to 10000 when we support hybrid mode.
373
+
139
- */
374
#include "hw/virtio/virtio.h"
140
-#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY 200
375
#include "standard-headers/linux/vhost_types.h"
141
+/* The delay time (in ms) between two COLO checkpoints */
376
142
+#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100)
377
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
143
#define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
378
bool iotlb_batch_begin_sent;
144
#define DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT 16
379
MemoryListener listener;
145
380
struct vhost_vdpa_iova_range iova_range;
381
+ bool shadow_vqs_enabled;
382
+ GPtrArray *shadow_vqs;
383
struct vhost_dev *dev;
384
VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
385
} VhostVDPA;
146
--
386
--
147
2.5.0
387
2.7.4
148
388
149
389
1
From: Zhang Chen <zhangckid@gmail.com>
1
From: Eugenio Pérez <eperezma@redhat.com>
2
2
3
We need to know if migration is going into COLO state for
3
This will make qemu aware of the device's used buffers, allowing it to
4
the incoming side before starting normal migration.
4
write the guest memory with its contents if needed.
5
5
6
Instead of using the VMStateDescription to send colo_state
6
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
7
from source side to destination side, we use MIG_CMD_ENABLE_COLO
7
Acked-by: Michael S. Tsirkin <mst@redhat.com>
8
to indicate whether COLO is enabled or not.
9
10
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
11
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
12
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
13
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
14
Signed-off-by: Jason Wang <jasowang@redhat.com>
8
Signed-off-by: Jason Wang <jasowang@redhat.com>
15
---
9
---
16
include/migration/colo.h | 5 ++--
10
hw/virtio/vhost-shadow-virtqueue.c | 38 ++++++++++++++++++++++++++++++++++++++
17
migration/Makefile.objs | 2 +-
11
hw/virtio/vhost-shadow-virtqueue.h | 4 ++++
18
migration/colo-comm.c | 76 ------------------------------------------------
12
hw/virtio/vhost-vdpa.c | 31 +++++++++++++++++++++++++++++--
19
migration/colo.c | 13 ++++++++-
13
3 files changed, 71 insertions(+), 2 deletions(-)
20
migration/migration.c | 23 ++++++++++++++-
21
migration/savevm.c | 17 +++++++++++
22
migration/savevm.h | 1 +
23
migration/trace-events | 1 +
24
vl.c | 2 --
25
9 files changed, 57 insertions(+), 83 deletions(-)
26
delete mode 100644 migration/colo-comm.c
27
14
28
diff --git a/include/migration/colo.h b/include/migration/colo.h
15
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
29
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
30
--- a/include/migration/colo.h
17
--- a/hw/virtio/vhost-shadow-virtqueue.c
31
+++ b/include/migration/colo.h
18
+++ b/hw/virtio/vhost-shadow-virtqueue.c
32
@@ -XXX,XX +XXX,XX @@ void migrate_start_colo_process(MigrationState *s);
19
@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(EventNotifier *n)
33
bool migration_in_colo_state(void);
34
35
/* loadvm */
36
-bool migration_incoming_enable_colo(void);
37
-void migration_incoming_exit_colo(void);
38
+void migration_incoming_enable_colo(void);
39
+void migration_incoming_disable_colo(void);
40
+bool migration_incoming_colo_enabled(void);
41
void *colo_process_incoming_thread(void *opaque);
42
bool migration_incoming_in_colo_state(void);
43
44
diff --git a/migration/Makefile.objs b/migration/Makefile.objs
45
index XXXXXXX..XXXXXXX 100644
46
--- a/migration/Makefile.objs
47
+++ b/migration/Makefile.objs
48
@@ -XXX,XX +XXX,XX @@
49
common-obj-y += migration.o socket.o fd.o exec.o
50
common-obj-y += tls.o channel.o savevm.o
51
-common-obj-y += colo-comm.o colo.o colo-failover.o
52
+common-obj-y += colo.o colo-failover.o
53
common-obj-y += vmstate.o vmstate-types.o page_cache.o
54
common-obj-y += qemu-file.o global_state.o
55
common-obj-y += qemu-file-channel.o
56
diff --git a/migration/colo-comm.c b/migration/colo-comm.c
57
deleted file mode 100644
58
index XXXXXXX..XXXXXXX
59
--- a/migration/colo-comm.c
60
+++ /dev/null
61
@@ -XXX,XX +XXX,XX @@
62
-/*
63
- * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
64
- * (a.k.a. Fault Tolerance or Continuous Replication)
65
- *
66
- * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
67
- * Copyright (c) 2016 FUJITSU LIMITED
68
- * Copyright (c) 2016 Intel Corporation
69
- *
70
- * This work is licensed under the terms of the GNU GPL, version 2 or
71
- * later. See the COPYING file in the top-level directory.
72
- *
73
- */
74
-
75
-#include "qemu/osdep.h"
76
-#include "migration.h"
77
-#include "migration/colo.h"
78
-#include "migration/vmstate.h"
79
-#include "trace.h"
80
-
81
-typedef struct {
82
- bool colo_requested;
83
-} COLOInfo;
84
-
85
-static COLOInfo colo_info;
86
-
87
-COLOMode get_colo_mode(void)
88
-{
89
- if (migration_in_colo_state()) {
90
- return COLO_MODE_PRIMARY;
91
- } else if (migration_incoming_in_colo_state()) {
92
- return COLO_MODE_SECONDARY;
93
- } else {
94
- return COLO_MODE_UNKNOWN;
95
- }
96
-}
97
-
98
-static int colo_info_pre_save(void *opaque)
99
-{
100
- COLOInfo *s = opaque;
101
-
102
- s->colo_requested = migrate_colo_enabled();
103
-
104
- return 0;
105
-}
106
-
107
-static bool colo_info_need(void *opaque)
108
-{
109
- return migrate_colo_enabled();
110
-}
111
-
112
-static const VMStateDescription colo_state = {
113
- .name = "COLOState",
114
- .version_id = 1,
115
- .minimum_version_id = 1,
116
- .pre_save = colo_info_pre_save,
117
- .needed = colo_info_need,
118
- .fields = (VMStateField[]) {
119
- VMSTATE_BOOL(colo_requested, COLOInfo),
120
- VMSTATE_END_OF_LIST()
121
- },
122
-};
123
-
124
-void colo_info_init(void)
125
-{
126
- vmstate_register(NULL, 0, &colo_state, &colo_info);
127
-}
128
-
129
-bool migration_incoming_enable_colo(void)
130
-{
131
- return colo_info.colo_requested;
132
-}
133
-
134
-void migration_incoming_exit_colo(void)
135
-{
136
- colo_info.colo_requested = false;
137
-}
138
diff --git a/migration/colo.c b/migration/colo.c
139
index XXXXXXX..XXXXXXX 100644
140
--- a/migration/colo.c
141
+++ b/migration/colo.c
142
@@ -XXX,XX +XXX,XX @@ static void primary_vm_do_failover(void)
143
qemu_sem_post(&s->colo_exit_sem);
144
}
20
}
145
21
146
+COLOMode get_colo_mode(void)
22
/**
23
+ * Forward vhost notifications
24
+ *
25
+ * @n: hdev call event notifier, the one that device set to notify svq.
26
+ */
27
+static void vhost_svq_handle_call(EventNotifier *n)
147
+{
28
+{
148
+ if (migration_in_colo_state()) {
29
+ VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
149
+ return COLO_MODE_PRIMARY;
30
+ hdev_call);
150
+ } else if (migration_incoming_in_colo_state()) {
31
+ event_notifier_test_and_clear(n);
151
+ return COLO_MODE_SECONDARY;
32
+ event_notifier_set(&svq->svq_call);
33
+}
34
+
35
+/**
36
+ * Set the call notifier for the SVQ to call the guest
37
+ *
38
+ * @svq: Shadow virtqueue
39
+ * @call_fd: call notifier
40
+ *
41
+ * Called on BQL context.
42
+ */
43
+void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
44
+{
45
+ if (call_fd == VHOST_FILE_UNBIND) {
46
+ /*
47
+ * Fail event_notifier_set if called handling device call.
48
+ *
49
+ * SVQ still needs device notifications, since it needs to keep
50
+ * forwarding used buffers even with the unbind.
51
+ */
52
+ memset(&svq->svq_call, 0, sizeof(svq->svq_call));
152
+ } else {
53
+ } else {
153
+ return COLO_MODE_UNKNOWN;
54
+ event_notifier_init_fd(&svq->svq_call, call_fd);
154
+ }
55
+ }
155
+}
56
+}
156
+
57
+
157
void colo_do_failover(MigrationState *s)
58
+/**
158
{
59
* Set a new file descriptor for the guest to kick the SVQ and notify for avail
159
/* Make sure VM stopped while failover happened. */
60
*
160
@@ -XXX,XX +XXX,XX @@ out:
61
* @svq: The svq
161
if (mis->to_src_file) {
62
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
162
qemu_fclose(mis->to_src_file);
163
}
63
}
164
- migration_incoming_exit_colo();
64
165
+ migration_incoming_disable_colo();
65
event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
166
66
+ event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
167
rcu_unregister_thread();
67
return g_steal_pointer(&svq);
168
return NULL;
68
169
diff --git a/migration/migration.c b/migration/migration.c
69
err_init_hdev_call:
70
@@ -XXX,XX +XXX,XX @@ void vhost_svq_free(gpointer pvq)
71
VhostShadowVirtqueue *vq = pvq;
72
vhost_svq_stop(vq);
73
event_notifier_cleanup(&vq->hdev_kick);
74
+ event_notifier_set_handler(&vq->hdev_call, NULL);
75
event_notifier_cleanup(&vq->hdev_call);
76
g_free(vq);
77
}
78
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
170
index XXXXXXX..XXXXXXX 100644
79
index XXXXXXX..XXXXXXX 100644
171
--- a/migration/migration.c
80
--- a/hw/virtio/vhost-shadow-virtqueue.h
172
+++ b/migration/migration.c
81
+++ b/hw/virtio/vhost-shadow-virtqueue.h
173
@@ -XXX,XX +XXX,XX @@ int migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname,
82
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
174
return migrate_send_rp_message(mis, msg_type, msglen, bufc);
83
* So shadow virtqueue must not clean it, or we would lose VirtQueue one.
84
*/
85
EventNotifier svq_kick;
86
+
87
+ /* Guest's call notifier, where the SVQ calls guest. */
88
+ EventNotifier svq_call;
89
} VhostShadowVirtqueue;
90
91
void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
92
+void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
93
94
void vhost_svq_stop(VhostShadowVirtqueue *svq);
95
96
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
97
index XXXXXXX..XXXXXXX 100644
98
--- a/hw/virtio/vhost-vdpa.c
99
+++ b/hw/virtio/vhost-vdpa.c
100
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
101
return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
175
}
102
}
176
103
177
+static bool migration_colo_enabled;
104
+static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
178
+bool migration_incoming_colo_enabled(void)
105
+ struct vhost_vring_file *file)
179
+{
106
+{
180
+ return migration_colo_enabled;
107
+ trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
108
+ return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
181
+}
109
+}
182
+
110
+
183
+void migration_incoming_disable_colo(void)
111
/**
184
+{
112
* Set the shadow virtqueue descriptors to the device
185
+ migration_colo_enabled = false;
113
*
186
+}
114
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
187
+
115
* @svq: The shadow virtqueue
188
+void migration_incoming_enable_colo(void)
116
* @idx: The index of the virtqueue in the vhost device
189
+{
117
* @errp: Error
190
+ migration_colo_enabled = true;
118
+ *
191
+}
119
+ * Note that this function does not rewind kick file descriptor if cannot set
192
+
120
+ * call one.
193
void qemu_start_incoming_migration(const char *uri, Error **errp)
121
*/
194
{
122
static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
195
const char *p;
123
VhostShadowVirtqueue *svq, unsigned idx,
196
@@ -XXX,XX +XXX,XX @@ static void process_incoming_migration_co(void *opaque)
124
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
197
}
125
r = vhost_vdpa_set_vring_dev_kick(dev, &file);
198
126
if (unlikely(r != 0)) {
199
/* we get COLO info, and know if we are in COLO mode */
127
error_setg_errno(errp, -r, "Can't set device kick fd");
200
- if (!ret && migration_incoming_enable_colo()) {
128
+ return false;
201
+ if (!ret && migration_incoming_colo_enabled()) {
202
/* Make sure all file formats flush their mutable metadata */
203
bdrv_invalidate_cache_all(&local_err);
204
if (local_err) {
205
@@ -XXX,XX +XXX,XX @@ static void *migration_thread(void *opaque)
206
qemu_savevm_send_postcopy_advise(s->to_dst_file);
207
}
208
209
+ if (migrate_colo_enabled()) {
210
+ /* Notify migration destination that we enable COLO */
211
+ qemu_savevm_send_colo_enable(s->to_dst_file);
212
+ }
129
+ }
213
+
130
+
214
qemu_savevm_state_setup(s->to_dst_file);
131
+ event_notifier = &svq->hdev_call;
215
132
+ file.fd = event_notifier_get_fd(event_notifier);
216
s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
133
+ r = vhost_vdpa_set_vring_dev_call(dev, &file);
217
diff --git a/migration/savevm.c b/migration/savevm.c
134
+ if (unlikely(r != 0)) {
218
index XXXXXXX..XXXXXXX 100644
135
+ error_setg_errno(errp, -r, "Can't set device call fd");
219
--- a/migration/savevm.c
136
}
220
+++ b/migration/savevm.c
137
221
@@ -XXX,XX +XXX,XX @@
138
return r == 0;
222
#include "io/channel-file.h"
139
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
223
#include "sysemu/replay.h"
140
static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
224
#include "qjson.h"
141
struct vhost_vring_file *file)
225
+#include "migration/colo.h"
142
{
226
143
- trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
227
#ifndef ETH_P_RARP
144
- return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
228
#define ETH_P_RARP 0x8035
145
+ struct vhost_vdpa *v = dev->opaque;
229
@@ -XXX,XX +XXX,XX @@ enum qemu_vm_cmd {
146
+
230
were previously sent during
147
+ if (v->shadow_vqs_enabled) {
231
precopy but are dirty. */
148
+ int vdpa_idx = file->index - dev->vq_index;
232
MIG_CMD_PACKAGED, /* Send a wrapped stream within this stream */
149
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
233
+ MIG_CMD_ENABLE_COLO, /* Enable COLO */
150
+
234
MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */
151
+ vhost_svq_set_svq_call_fd(svq, file->fd);
235
MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */
152
+ return 0;
236
MIG_CMD_MAX
153
+ } else {
237
@@ -XXX,XX +XXX,XX @@ static void qemu_savevm_command_send(QEMUFile *f,
154
+ return vhost_vdpa_set_vring_dev_call(dev, file);
238
qemu_fflush(f);
155
+ }
239
}
156
}
240
157
241
+void qemu_savevm_send_colo_enable(QEMUFile *f)
158
static int vhost_vdpa_get_features(struct vhost_dev *dev,
242
+{
243
+ trace_savevm_send_colo_enable();
244
+ qemu_savevm_command_send(f, MIG_CMD_ENABLE_COLO, 0, NULL);
245
+}
246
+
247
void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
248
{
249
uint32_t buf;
250
@@ -XXX,XX +XXX,XX @@ static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
251
return 0;
252
}
253
254
+static int loadvm_process_enable_colo(MigrationIncomingState *mis)
255
+{
256
+ migration_incoming_enable_colo();
257
+ return 0;
258
+}
259
+
260
/*
261
* Process an incoming 'QEMU_VM_COMMAND'
262
* 0 just a normal return
263
@@ -XXX,XX +XXX,XX @@ static int loadvm_process_command(QEMUFile *f)
264
265
case MIG_CMD_RECV_BITMAP:
266
return loadvm_handle_recv_bitmap(mis, len);
267
+
268
+ case MIG_CMD_ENABLE_COLO:
269
+ return loadvm_process_enable_colo(mis);
270
}
271
272
return 0;
273
diff --git a/migration/savevm.h b/migration/savevm.h
274
index XXXXXXX..XXXXXXX 100644
275
--- a/migration/savevm.h
276
+++ b/migration/savevm.h
277
@@ -XXX,XX +XXX,XX @@ void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
278
uint16_t len,
279
uint64_t *start_list,
280
uint64_t *length_list);
281
+void qemu_savevm_send_colo_enable(QEMUFile *f);
282
283
int qemu_loadvm_state(QEMUFile *f);
284
void qemu_loadvm_state_cleanup(void);
285
diff --git a/migration/trace-events b/migration/trace-events
286
index XXXXXXX..XXXXXXX 100644
287
--- a/migration/trace-events
288
+++ b/migration/trace-events
289
@@ -XXX,XX +XXX,XX @@ savevm_send_ping(uint32_t val) "0x%x"
290
savevm_send_postcopy_listen(void) ""
291
savevm_send_postcopy_run(void) ""
292
savevm_send_postcopy_resume(void) ""
293
+savevm_send_colo_enable(void) ""
294
savevm_send_recv_bitmap(char *name) "%s"
295
savevm_state_setup(void) ""
296
savevm_state_resume_prepare(void) ""
297
diff --git a/vl.c b/vl.c
298
index XXXXXXX..XXXXXXX 100644
299
--- a/vl.c
300
+++ b/vl.c
301
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
302
#endif
303
}
304
305
- colo_info_init();
306
-
307
if (net_init_clients(&err) < 0) {
308
error_report_err(err);
309
exit(1);
310
--
159
--
311
2.5.0
160
2.7.4
312
161
313
162
1
From: Zhang Chen <zhangckid@gmail.com>
1
From: Eugenio Pérez <eperezma@redhat.com>
2
2
3
We add an almost-complete TCP state machine in filter-rewriter, except for
3
This allows SVQ to negotiate features with the guest and the device. For
4
TCPS_LISTEN and some simplification of the VM active-close FIN states.
4
the device, SVQ is a driver. While this function bypasses all
5
The reason for this simplification is that the guest kernel will track
5
non-transport features, it needs to disable the features that SVQ does
6
the TCP status and wait the 2MSL time too; if the client resends the FIN packet,
6
not support when forwarding buffers. This includes packed vq layout,
7
the guest will resend the last ACK, so we needn't wait the 2MSL time in filter-rewriter.
7
indirect descriptors or event idx.
8
8
9
After a net connection is closed, we don't clear its related resources
9
Future changes can add support to offer more features to the guest,
10
in connection_track_table, which leads to a memory leak.
10
since the use of VirtQueue gives this for free. This is left out at the
11
moment for simplicity.
11
12
12
Let's track the state of the net connection; if it is closed, its related
13
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
13
resources will be cleared up.
14
Acked-by: Michael S. Tsirkin <mst@redhat.com>
14
15
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
16
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
17
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
18
Signed-off-by: Jason Wang <jasowang@redhat.com>
15
Signed-off-by: Jason Wang <jasowang@redhat.com>
19
---
16
---
20
net/colo.c | 2 +-
17
hw/virtio/vhost-shadow-virtqueue.c | 44 ++++++++++++++++++++++++++++++++++++++
21
net/colo.h | 9 ++---
18
hw/virtio/vhost-shadow-virtqueue.h | 2 ++
22
net/filter-rewriter.c | 109 +++++++++++++++++++++++++++++++++++++++++++++-----
19
hw/virtio/vhost-vdpa.c | 15 +++++++++++++
23
3 files changed, 104 insertions(+), 16 deletions(-)
20
3 files changed, 61 insertions(+)
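
For illustration only, a stand-alone sketch of the passive-close bookkeeping described in the filter-rewriter message above; the state and field names are invented for the example and are not the real filter-rewriter ones:

#include <stdbool.h>
#include <stdint.h>

enum tcp_track_state { EST, CLOSE_WAIT, LAST_ACK, CLOSED };

struct tracked_conn {
    enum tcp_track_state state;
    uint32_t fin_ack_seq;      /* seq of the server's fin=1,ack=1 segment */
};

/* Segment seen from the server (VM) side: its fin=1,ack=1 moves the entry
 * to LAST_ACK (passive close step 2) and records the sequence to match. */
static void server_segment(struct tracked_conn *c, bool fin, bool ack,
                           uint32_t seq)
{
    if (fin && ack && c->state == CLOSE_WAIT) {
        c->state = LAST_ACK;
        c->fin_ack_seq = seq;
    }
}

/* Segment seen from the client side; returns true when the entry can be
 * dropped from the connection-tracking table (passive close step 3). */
static bool client_segment(struct tracked_conn *c, bool fin, uint32_t ack)
{
    if (fin && c->state == EST) {
        c->state = CLOSE_WAIT;                 /* step 1: client sends FIN */
    }
    if (c->state == LAST_ACK && ack == c->fin_ack_seq + 1) {
        c->state = CLOSED;                     /* step 3: final ACK seen */
        return true;
    }
    return false;
}
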
24
21
25
diff --git a/net/colo.c b/net/colo.c
22
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
26
index XXXXXXX..XXXXXXX 100644
23
index XXXXXXX..XXXXXXX 100644
27
--- a/net/colo.c
24
--- a/hw/virtio/vhost-shadow-virtqueue.c
28
+++ b/net/colo.c
25
+++ b/hw/virtio/vhost-shadow-virtqueue.c
29
@@ -XXX,XX +XXX,XX @@ Connection *connection_new(ConnectionKey *key)
30
conn->ip_proto = key->ip_proto;
31
conn->processing = false;
32
conn->offset = 0;
33
- conn->syn_flag = 0;
34
+ conn->tcp_state = TCPS_CLOSED;
35
conn->pack = 0;
36
conn->sack = 0;
37
g_queue_init(&conn->primary_list);
38
diff --git a/net/colo.h b/net/colo.h
39
index XXXXXXX..XXXXXXX 100644
40
--- a/net/colo.h
41
+++ b/net/colo.h
42
@@ -XXX,XX +XXX,XX @@
26
@@ -XXX,XX +XXX,XX @@
43
#include "slirp/slirp.h"
27
#include "hw/virtio/vhost-shadow-virtqueue.h"
44
#include "qemu/jhash.h"
28
45
#include "qemu/timer.h"
29
#include "qemu/error-report.h"
46
+#include "slirp/tcp.h"
30
+#include "qapi/error.h"
47
31
#include "qemu/main-loop.h"
48
#define HASHTABLE_MAX_SIZE 16384
32
#include "linux-headers/linux/vhost.h"
49
33
50
@@ -XXX,XX +XXX,XX @@ typedef struct Connection {
34
/**
51
uint32_t sack;
35
+ * Validate the transport device features that both guests can use with the SVQ
52
/* offset = secondary_seq - primary_seq */
36
+ * and SVQs can use with the device.
53
tcp_seq offset;
37
+ *
54
- /*
38
+ * @dev_features: The features
55
- * we use this flag update offset func
39
+ * @errp: Error pointer
56
- * run once in independent tcp connection
40
+ */
57
- */
41
+bool vhost_svq_valid_features(uint64_t features, Error **errp)
58
- int syn_flag;
42
+{
43
+ bool ok = true;
44
+ uint64_t svq_features = features;
59
+
45
+
60
+ int tcp_state; /* TCP FSM state */
46
+ for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END;
61
+ tcp_seq fin_ack_seq; /* the seq of 'fin=1,ack=1' */
47
+ ++b) {
62
} Connection;
48
+ switch (b) {
63
49
+ case VIRTIO_F_ANY_LAYOUT:
64
uint32_t connection_key_hash(const void *opaque);
50
+ continue;
65
diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c
66
index XXXXXXX..XXXXXXX 100644
67
--- a/net/filter-rewriter.c
68
+++ b/net/filter-rewriter.c
69
@@ -XXX,XX +XXX,XX @@ static int is_tcp_packet(Packet *pkt)
70
}
71
72
/* handle tcp packet from primary guest */
73
-static int handle_primary_tcp_pkt(NetFilterState *nf,
74
+static int handle_primary_tcp_pkt(RewriterState *rf,
75
Connection *conn,
76
- Packet *pkt)
77
+ Packet *pkt, ConnectionKey *key)
78
{
79
struct tcphdr *tcp_pkt;
80
81
@@ -XXX,XX +XXX,XX @@ static int handle_primary_tcp_pkt(NetFilterState *nf,
82
trace_colo_filter_rewriter_conn_offset(conn->offset);
83
}
84
85
+ if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN)) &&
86
+ conn->tcp_state == TCPS_SYN_SENT) {
87
+ conn->tcp_state = TCPS_ESTABLISHED;
88
+ }
89
+
51
+
90
if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
52
+ case VIRTIO_F_ACCESS_PLATFORM:
91
/*
53
+ /* SVQ trust in the host's IOMMU to translate addresses */
92
* we use this flag update offset func
54
+ case VIRTIO_F_VERSION_1:
93
* run once in independent tcp connection
55
+ /* SVQ trust that the guest vring is little endian */
94
*/
56
+ if (!(svq_features & BIT_ULL(b))) {
95
- conn->syn_flag = 1;
57
+ svq_features |= BIT_ULL(b);
96
+ conn->tcp_state = TCPS_SYN_RECEIVED;
58
+ ok = false;
97
}
59
+ }
98
60
+ continue;
99
if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK)) {
100
- if (conn->syn_flag) {
101
+ if (conn->tcp_state == TCPS_SYN_RECEIVED) {
102
/*
103
* offset = secondary_seq - primary seq
104
* ack packet sent by guest from primary node,
105
* so we use th_ack - 1 get primary_seq
106
*/
107
conn->offset -= (ntohl(tcp_pkt->th_ack) - 1);
108
- conn->syn_flag = 0;
109
+ conn->tcp_state = TCPS_ESTABLISHED;
110
}
111
if (conn->offset) {
112
/* handle packets to the secondary from the primary */
113
@@ -XXX,XX +XXX,XX @@ static int handle_primary_tcp_pkt(NetFilterState *nf,
114
net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len,
115
pkt->size - pkt->vnet_hdr_len);
116
}
117
+
61
+
118
+ /*
62
+ default:
119
+ * Passive close step 3
63
+ if (svq_features & BIT_ULL(b)) {
120
+ */
64
+ svq_features &= ~BIT_ULL(b);
121
+ if ((conn->tcp_state == TCPS_LAST_ACK) &&
65
+ ok = false;
122
+ (ntohl(tcp_pkt->th_ack) == (conn->fin_ack_seq + 1))) {
66
+ }
123
+ conn->tcp_state = TCPS_CLOSED;
124
+ g_hash_table_remove(rf->connection_track_table, key);
125
+ }
67
+ }
126
+ }
68
+ }
127
+
69
+
128
+ if ((tcp_pkt->th_flags & TH_FIN) == TH_FIN) {
70
+ if (!ok) {
129
+ /*
71
+ error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64
130
+ * Passive close.
72
+ ", ok: 0x%"PRIx64, features, svq_features);
131
+ * Step 1:
73
+ }
132
+ * The *server* side of this connect is VM, *client* tries to close
74
+ return ok;
133
+ * the connection. We will into CLOSE_WAIT status.
75
+}
134
+ *
135
+ * Step 2:
136
+ * In this step we will into LAST_ACK status.
137
+ *
138
+ * We got 'fin=1, ack=1' packet from server side, we need to
139
+ * record the seq of 'fin=1, ack=1' packet.
140
+ *
141
+ * Step 3:
142
+ * We got 'ack=1' packets from client side, it acks 'fin=1, ack=1'
143
+ * packet from server side. From this point, we can ensure that there
144
+ * will be no packets in the connection, except that, some errors
145
+ * happen between the path of 'filter object' and vNIC, if this rare
146
+ * case really happen, we can still create a new connection,
147
+ * So it is safe to remove the connection from connection_track_table.
148
+ *
149
+ */
150
+ if (conn->tcp_state == TCPS_ESTABLISHED) {
151
+ conn->tcp_state = TCPS_CLOSE_WAIT;
152
+ }
153
+
76
+
154
+ /*
77
+/**
155
+ * Active close step 2.
78
* Forward guest notifications.
156
+ */
79
*
157
+ if (conn->tcp_state == TCPS_FIN_WAIT_1) {
80
* @n: guest kick event notifier, the one that guest set to notify svq.
158
+ conn->tcp_state = TCPS_TIME_WAIT;
81
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
159
+ /*
82
index XXXXXXX..XXXXXXX 100644
160
+ * For simplify implementation, we needn't wait 2MSL time
83
--- a/hw/virtio/vhost-shadow-virtqueue.h
161
+ * in filter rewriter. Because guest kernel will track the
84
+++ b/hw/virtio/vhost-shadow-virtqueue.h
162
+ * TCP status and wait 2MSL time, if client resend the FIN
85
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
163
+ * packet, guest will apply the last ACK too.
86
EventNotifier svq_call;
164
+ */
87
} VhostShadowVirtqueue;
165
+ conn->tcp_state = TCPS_CLOSED;
88
166
+ g_hash_table_remove(rf->connection_track_table, key);
89
+bool vhost_svq_valid_features(uint64_t features, Error **errp);
167
+ }
90
+
91
void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
92
void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
93
94
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
95
index XXXXXXX..XXXXXXX 100644
96
--- a/hw/virtio/vhost-vdpa.c
97
+++ b/hw/virtio/vhost-vdpa.c
98
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
99
Error **errp)
100
{
101
g_autoptr(GPtrArray) shadow_vqs = NULL;
102
+ uint64_t dev_features, svq_features;
103
+ int r;
104
+ bool ok;
105
106
if (!v->shadow_vqs_enabled) {
107
return 0;
168
}
108
}
169
109
170
return 0;
110
+ r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features);
171
}
111
+ if (r != 0) {
172
112
+ error_setg_errno(errp, -r, "Can't get vdpa device features");
173
/* handle tcp packet from secondary guest */
113
+ return r;
174
-static int handle_secondary_tcp_pkt(NetFilterState *nf,
175
+static int handle_secondary_tcp_pkt(RewriterState *rf,
176
Connection *conn,
177
- Packet *pkt)
178
+ Packet *pkt, ConnectionKey *key)
179
{
180
struct tcphdr *tcp_pkt;
181
182
@@ -XXX,XX +XXX,XX @@ static int handle_secondary_tcp_pkt(NetFilterState *nf,
183
trace_colo_filter_rewriter_conn_offset(conn->offset);
184
}
185
186
- if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
187
+ if (conn->tcp_state == TCPS_SYN_RECEIVED &&
188
+ ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
189
/*
190
* save offset = secondary_seq and then
191
* in handle_primary_tcp_pkt make offset
192
@@ -XXX,XX +XXX,XX @@ static int handle_secondary_tcp_pkt(NetFilterState *nf,
193
conn->offset = ntohl(tcp_pkt->th_seq);
194
}
195
196
+ /* VM active connect */
197
+ if (conn->tcp_state == TCPS_CLOSED &&
198
+ ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
199
+ conn->tcp_state = TCPS_SYN_SENT;
200
+ }
114
+ }
201
+
115
+
202
if ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK) {
116
+ svq_features = dev_features;
203
/* Only need to adjust seq while offset is Non-zero */
117
+ ok = vhost_svq_valid_features(svq_features, errp);
204
if (conn->offset) {
118
+ if (unlikely(!ok)) {
205
@@ -XXX,XX +XXX,XX @@ static int handle_secondary_tcp_pkt(NetFilterState *nf,
119
+ return -1;
206
}
207
}
208
209
+ /*
210
+ * Passive close step 2:
211
+ */
212
+ if (conn->tcp_state == TCPS_CLOSE_WAIT &&
213
+ (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) {
214
+ conn->fin_ack_seq = ntohl(tcp_pkt->th_seq);
215
+ conn->tcp_state = TCPS_LAST_ACK;
216
+ }
120
+ }
217
+
121
+
218
+ /*
122
shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
219
+ * Active close
123
for (unsigned n = 0; n < hdev->nvqs; ++n) {
220
+ *
124
g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
221
+ * Step 1:
222
+ * The *server* side of this connect is VM, *server* tries to close
223
+ * the connection.
224
+ *
225
+ * Step 2:
226
+ * We will into CLOSE_WAIT status.
227
+ * We simplify the TCPS_FIN_WAIT_2, TCPS_TIME_WAIT and
228
+ * CLOSING status.
229
+ */
230
+ if (conn->tcp_state == TCPS_ESTABLISHED &&
231
+ (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == TH_FIN) {
232
+ conn->tcp_state = TCPS_FIN_WAIT_1;
233
+ }
234
+
235
return 0;
236
}
237
238
@@ -XXX,XX +XXX,XX @@ static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
239
240
if (sender == nf->netdev) {
241
/* NET_FILTER_DIRECTION_TX */
242
- if (!handle_primary_tcp_pkt(nf, conn, pkt)) {
243
+ if (!handle_primary_tcp_pkt(s, conn, pkt, &key)) {
244
qemu_net_queue_send(s->incoming_queue, sender, 0,
245
(const uint8_t *)pkt->data, pkt->size, NULL);
246
packet_destroy(pkt, NULL);
247
@@ -XXX,XX +XXX,XX @@ static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
248
}
249
} else {
250
/* NET_FILTER_DIRECTION_RX */
251
- if (!handle_secondary_tcp_pkt(nf, conn, pkt)) {
252
+ if (!handle_secondary_tcp_pkt(s, conn, pkt, &key)) {
253
qemu_net_queue_send(s->incoming_queue, sender, 0,
254
(const uint8_t *)pkt->data, pkt->size, NULL);
255
packet_destroy(pkt, NULL);
256
--
2.5.0

--
2.7.4
diff view generated by jsdifflib
1
From: Zhang Chen <zhangckid@gmail.com>

Filter needs to process the event of checkpoint/failover or
other events passed by the COLO frame.

Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/net/filter.h | 5 +++++
 net/filter.c | 17 +++++++++++++++++
 net/net.c | 19 +++++++++++++++++++
 3 files changed, 41 insertions(+)

From: Eugenio Pérez <eperezma@redhat.com>

It reports the shadow virtqueue address from qemu virtual address space.

Since this will be different from the guest's vaddr, but the device can
access it, SVQ takes special care about its alignment & lack of garbage
data. It assumes that IOMMU will work in host_page_size ranges for that.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 29 +++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h | 9 +++++++++
 2 files changed, 38 insertions(+)
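As a reading aid for the alignment note above, the sizing rule for the shadow
vring areas in the hunks below boils down to padding each region to the host
page size so it can be mapped on its own; a condensed sketch, not the patch
itself:

    /* Condensed from vhost_svq_driver_area_size() / vhost_svq_device_area_size()
     * in the hunk below: descriptor table + avail ring form the driver area,
     * the used ring forms the device area, each padded to the host page size. */
    size_t desc_size   = sizeof(vring_desc_t) * svq->vring.num;
    size_t avail_size  = offsetof(vring_avail_t, ring) +
                         sizeof(uint16_t) * svq->vring.num;
    size_t driver_size = ROUND_UP(desc_size + avail_size, qemu_real_host_page_size);
    size_t used_size   = offsetof(vring_used_t, ring) +
                         sizeof(vring_used_elem_t) * svq->vring.num;
    size_t device_size = ROUND_UP(used_size, qemu_real_host_page_size);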
15
16
16
diff --git a/include/net/filter.h b/include/net/filter.h
17
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
17
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
18
--- a/include/net/filter.h
19
--- a/hw/virtio/vhost-shadow-virtqueue.c
19
+++ b/include/net/filter.h
20
+++ b/hw/virtio/vhost-shadow-virtqueue.c
20
@@ -XXX,XX +XXX,XX @@ typedef ssize_t (FilterReceiveIOV)(NetFilterState *nc,
21
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
21
22
typedef void (FilterStatusChanged) (NetFilterState *nf, Error **errp);
23
24
+typedef void (FilterHandleEvent) (NetFilterState *nf, int event, Error **errp);
25
+
26
typedef struct NetFilterClass {
27
ObjectClass parent_class;
28
29
@@ -XXX,XX +XXX,XX @@ typedef struct NetFilterClass {
30
FilterSetup *setup;
31
FilterCleanup *cleanup;
32
FilterStatusChanged *status_changed;
33
+ FilterHandleEvent *handle_event;
34
/* mandatory */
35
FilterReceiveIOV *receive_iov;
36
} NetFilterClass;
37
@@ -XXX,XX +XXX,XX @@ ssize_t qemu_netfilter_pass_to_next(NetClientState *sender,
38
int iovcnt,
39
void *opaque);
40
41
+void colo_notify_filters_event(int event, Error **errp);
42
+
43
#endif /* QEMU_NET_FILTER_H */
44
diff --git a/net/filter.c b/net/filter.c
45
index XXXXXXX..XXXXXXX 100644
46
--- a/net/filter.c
47
+++ b/net/filter.c
48
@@ -XXX,XX +XXX,XX @@
49
#include "net/vhost_net.h"
50
#include "qom/object_interfaces.h"
51
#include "qemu/iov.h"
52
+#include "net/colo.h"
53
+#include "migration/colo.h"
54
55
static inline bool qemu_can_skip_netfilter(NetFilterState *nf)
56
{
57
@@ -XXX,XX +XXX,XX @@ static void netfilter_finalize(Object *obj)
58
g_free(nf->netdev_id);
59
}
22
}
60
23
61
+static void default_handle_event(NetFilterState *nf, int event, Error **errp)
24
/**
25
+ * Get the shadow vq vring address.
26
+ * @svq: Shadow virtqueue
27
+ * @addr: Destination to store address
28
+ */
29
+void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
30
+ struct vhost_vring_addr *addr)
62
+{
31
+{
63
+ switch (event) {
32
+ addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc;
64
+ case COLO_EVENT_CHECKPOINT:
33
+ addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail;
65
+ break;
34
+ addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used;
66
+ case COLO_EVENT_FAILOVER:
67
+ object_property_set_str(OBJECT(nf), "off", "status", errp);
68
+ break;
69
+ default:
70
+ break;
71
+ }
72
+}
35
+}
73
+
36
+
74
static void netfilter_class_init(ObjectClass *oc, void *data)
37
+size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
75
{
76
UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
77
+ NetFilterClass *nfc = NETFILTER_CLASS(oc);
78
79
ucc->complete = netfilter_complete;
80
+ nfc->handle_event = default_handle_event;
81
}
82
83
static const TypeInfo netfilter_info = {
84
diff --git a/net/net.c b/net/net.c
85
index XXXXXXX..XXXXXXX 100644
86
--- a/net/net.c
87
+++ b/net/net.c
88
@@ -XXX,XX +XXX,XX @@ void hmp_info_network(Monitor *mon, const QDict *qdict)
89
}
90
}
91
92
+void colo_notify_filters_event(int event, Error **errp)
93
+{
38
+{
94
+ NetClientState *nc;
39
+ size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
95
+ NetFilterState *nf;
40
+ size_t avail_size = offsetof(vring_avail_t, ring) +
96
+ NetFilterClass *nfc = NULL;
41
+ sizeof(uint16_t) * svq->vring.num;
97
+ Error *local_err = NULL;
98
+
42
+
99
+ QTAILQ_FOREACH(nc, &net_clients, next) {
43
+ return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size);
100
+ QTAILQ_FOREACH(nf, &nc->filters, next) {
101
+ nfc = NETFILTER_GET_CLASS(OBJECT(nf));
102
+ nfc->handle_event(nf, event, &local_err);
103
+ if (local_err) {
104
+ error_propagate(errp, local_err);
105
+ return;
106
+ }
107
+ }
108
+ }
109
+}
44
+}
110
+
45
+
111
void qmp_set_link(const char *name, bool up, Error **errp)
46
+size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
112
{
47
+{
113
NetClientState *ncs[MAX_QUEUE_NUM];
48
+ size_t used_size = offsetof(vring_used_t, ring) +
49
+ sizeof(vring_used_elem_t) * svq->vring.num;
50
+ return ROUND_UP(used_size, qemu_real_host_page_size);
51
+}
52
+
53
+/**
54
* Set a new file descriptor for the guest to kick the SVQ and notify for avail
55
*
56
* @svq: The svq
57
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
58
index XXXXXXX..XXXXXXX 100644
59
--- a/hw/virtio/vhost-shadow-virtqueue.h
60
+++ b/hw/virtio/vhost-shadow-virtqueue.h
61
@@ -XXX,XX +XXX,XX @@
62
#define VHOST_SHADOW_VIRTQUEUE_H
63
64
#include "qemu/event_notifier.h"
65
+#include "hw/virtio/virtio.h"
66
+#include "standard-headers/linux/vhost_types.h"
67
68
/* Shadow virtqueue to relay notifications */
69
typedef struct VhostShadowVirtqueue {
70
+ /* Shadow vring */
71
+ struct vring vring;
72
+
73
/* Shadow kick notifier, sent to vhost */
74
EventNotifier hdev_kick;
75
/* Shadow call notifier, sent to vhost */
76
@@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp);
77
78
void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
79
void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
80
+void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
81
+ struct vhost_vring_addr *addr);
82
+size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
83
+size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
84
85
void vhost_svq_stop(VhostShadowVirtqueue *svq);
86
114
--
87
--
115
2.5.0
88
2.7.4
116
89
117
90
diff view generated by jsdifflib
1
From: Zhang Chen <zhangckid@gmail.com>
1
From: Eugenio Pérez <eperezma@redhat.com>
2
2
3
We should not load PVM's state directly into SVM, because there maybe some
3
First half of the buffers forwarding part, preparing vhost-vdpa
4
errors happen when SVM is receving data, which will break SVM.
4
callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so
5
this is effectively dead code at the moment, but it helps to reduce
6
patch size.
5
7
6
We need to ensure receving all data before load the state into SVM. We use
8
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
7
an extra memory to cache these data (PVM's ram). The ram cache in secondary side
9
Acked-by: Michael S. Tsirkin <mst@redhat.com>
8
is initially the same as SVM/PVM's memory. And in the process of checkpoint,
9
we cache the dirty pages of PVM into this ram cache firstly, so this ram cache
10
always the same as PVM's memory at every checkpoint, then we flush this cached ram
11
to SVM after we receive all PVM's state.
12
13
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
14
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
15
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
16
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
17
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
18
Signed-off-by: Jason Wang <jasowang@redhat.com>
10
Signed-off-by: Jason Wang <jasowang@redhat.com>
19
---
11
---
20
include/exec/ram_addr.h | 1 +
12
hw/virtio/vhost-vdpa.c | 48 +++++++++++++++++++++++++++++++++++++++++-------
21
migration/migration.c | 7 +++++
13
1 file changed, 41 insertions(+), 7 deletions(-)
22
migration/ram.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++--
23
migration/ram.h | 4 +++
24
migration/savevm.c | 2 +-
25
5 files changed, 94 insertions(+), 3 deletions(-)
26
14
27
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
15
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
28
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
29
--- a/include/exec/ram_addr.h
17
--- a/hw/virtio/vhost-vdpa.c
30
+++ b/include/exec/ram_addr.h
18
+++ b/hw/virtio/vhost-vdpa.c
31
@@ -XXX,XX +XXX,XX @@ struct RAMBlock {
19
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
32
struct rcu_head rcu;
20
return ret;
33
struct MemoryRegion *mr;
21
}
34
uint8_t *host;
22
35
+ uint8_t *colo_cache; /* For colo, VM's ram cache */
23
+static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
36
ram_addr_t offset;
24
+ struct vhost_vring_state *ring)
37
ram_addr_t used_length;
25
+{
38
ram_addr_t max_length;
26
+ trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
39
diff --git a/migration/migration.c b/migration/migration.c
27
+ return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
40
index XXXXXXX..XXXXXXX 100644
28
+}
41
--- a/migration/migration.c
42
+++ b/migration/migration.c
43
@@ -XXX,XX +XXX,XX @@ static void process_incoming_migration_co(void *opaque)
44
exit(EXIT_FAILURE);
45
}
46
47
+ if (colo_init_ram_cache() < 0) {
48
+ error_report("Init ram cache failed");
49
+ exit(EXIT_FAILURE);
50
+ }
51
+
29
+
52
qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
30
static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
53
colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
31
struct vhost_vring_file *file)
54
mis->have_colo_incoming_thread = true;
32
{
55
@@ -XXX,XX +XXX,XX @@ static void process_incoming_migration_co(void *opaque)
33
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
56
34
return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
57
/* Wait checkpoint incoming thread exit before free resource */
58
qemu_thread_join(&mis->colo_incoming_thread);
59
+ /* We hold the global iothread lock, so it is safe here */
60
+ colo_release_ram_cache();
61
}
62
63
if (ret < 0) {
64
diff --git a/migration/ram.c b/migration/ram.c
65
index XXXXXXX..XXXXXXX 100644
66
--- a/migration/ram.c
67
+++ b/migration/ram.c
68
@@ -XXX,XX +XXX,XX @@ static inline void *host_from_ram_block_offset(RAMBlock *block,
69
return block->host + offset;
70
}
35
}
71
36
72
+static inline void *colo_cache_from_block_offset(RAMBlock *block,
37
+static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
73
+ ram_addr_t offset)
38
+ struct vhost_vring_addr *addr)
74
+{
39
+{
75
+ if (!offset_in_ramblock(block, offset)) {
40
+ trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
76
+ return NULL;
41
+ addr->desc_user_addr, addr->used_user_addr,
77
+ }
42
+ addr->avail_user_addr,
78
+ if (!block->colo_cache) {
43
+ addr->log_guest_addr);
79
+ error_report("%s: colo_cache is NULL in block :%s",
44
+
80
+ __func__, block->idstr);
45
+ return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
81
+ return NULL;
46
+
82
+ }
83
+ return block->colo_cache + offset;
84
+}
47
+}
85
+
48
+
86
/**
49
/**
87
* ram_handle_compressed: handle the zero page case
50
* Set the shadow virtqueue descriptors to the device
88
*
51
*
89
@@ -XXX,XX +XXX,XX @@ static void decompress_data_with_multi_threads(QEMUFile *f,
52
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
90
qemu_mutex_unlock(&decomp_done_lock);
53
static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
91
}
54
struct vhost_vring_addr *addr)
92
55
{
93
+/*
56
- trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
94
+ * colo cache: this is for secondary VM, we cache the whole
57
- addr->desc_user_addr, addr->used_user_addr,
95
+ * memory of the secondary VM, it is need to hold the global lock
58
- addr->avail_user_addr,
96
+ * to call this helper.
59
- addr->log_guest_addr);
97
+ */
60
- return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
98
+int colo_init_ram_cache(void)
61
+ struct vhost_vdpa *v = dev->opaque;
99
+{
100
+ RAMBlock *block;
101
+
62
+
102
+ rcu_read_lock();
63
+ if (v->shadow_vqs_enabled) {
103
+ QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
64
+ /*
104
+ block->colo_cache = qemu_anon_ram_alloc(block->used_length,
65
+ * Device vring addr was set at device start. SVQ base is handled by
105
+ NULL,
66
+ * VirtQueue code.
106
+ false);
67
+ */
107
+ if (!block->colo_cache) {
68
+ return 0;
108
+ error_report("%s: Can't alloc memory for COLO cache of block %s,"
109
+ "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
110
+ block->used_length);
111
+ goto out_locked;
112
+ }
113
+ memcpy(block->colo_cache, block->host, block->used_length);
114
+ }
115
+ rcu_read_unlock();
116
+ return 0;
117
+
118
+out_locked:
119
+ QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
120
+ if (block->colo_cache) {
121
+ qemu_anon_ram_free(block->colo_cache, block->used_length);
122
+ block->colo_cache = NULL;
123
+ }
124
+ }
69
+ }
125
+
70
+
126
+ rcu_read_unlock();
71
+ return vhost_vdpa_set_vring_dev_addr(dev, addr);
127
+ return -errno;
72
}
128
+}
73
74
static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
75
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
76
static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
77
struct vhost_vring_state *ring)
78
{
79
- trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
80
- return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
81
+ struct vhost_vdpa *v = dev->opaque;
129
+
82
+
130
+/* It is need to hold the global lock to call this helper */
83
+ if (v->shadow_vqs_enabled) {
131
+void colo_release_ram_cache(void)
84
+ /*
132
+{
85
+ * Device vring base was set at device start. SVQ base is handled by
133
+ RAMBlock *block;
86
+ * VirtQueue code.
87
+ */
88
+ return 0;
89
+ }
134
+
90
+
135
+ rcu_read_lock();
91
+ return vhost_vdpa_set_dev_vring_base(dev, ring);
136
+ QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
137
+ if (block->colo_cache) {
138
+ qemu_anon_ram_free(block->colo_cache, block->used_length);
139
+ block->colo_cache = NULL;
140
+ }
141
+ }
142
+ rcu_read_unlock();
143
+}
144
+
145
/**
146
* ram_load_setup: Setup RAM for migration incoming side
147
*
148
@@ -XXX,XX +XXX,XX @@ static int ram_load_setup(QEMUFile *f, void *opaque)
149
150
xbzrle_load_setup();
151
ramblock_recv_map_init();
152
+
153
return 0;
154
}
92
}
155
93
156
@@ -XXX,XX +XXX,XX @@ static int ram_load_cleanup(void *opaque)
94
static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
157
g_free(rb->receivedmap);
158
rb->receivedmap = NULL;
159
}
160
+
161
return 0;
162
}
163
164
@@ -XXX,XX +XXX,XX @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
165
RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
166
RAMBlock *block = ram_block_from_stream(f, flags);
167
168
- host = host_from_ram_block_offset(block, addr);
169
+ /*
170
+ * After going into COLO, we should load the Page into colo_cache.
171
+ */
172
+ if (migration_incoming_in_colo_state()) {
173
+ host = colo_cache_from_block_offset(block, addr);
174
+ } else {
175
+ host = host_from_ram_block_offset(block, addr);
176
+ }
177
if (!host) {
178
error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
179
ret = -EINVAL;
180
break;
181
}
182
- ramblock_recv_bitmap_set(block, host);
183
+
184
+ if (!migration_incoming_in_colo_state()) {
185
+ ramblock_recv_bitmap_set(block, host);
186
+ }
187
+
188
trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
189
}
190
191
diff --git a/migration/ram.h b/migration/ram.h
192
index XXXXXXX..XXXXXXX 100644
193
--- a/migration/ram.h
194
+++ b/migration/ram.h
195
@@ -XXX,XX +XXX,XX @@ int64_t ramblock_recv_bitmap_send(QEMUFile *file,
196
const char *block_name);
197
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb);
198
199
+/* ram cache */
200
+int colo_init_ram_cache(void);
201
+void colo_release_ram_cache(void);
202
+
203
#endif
204
diff --git a/migration/savevm.c b/migration/savevm.c
205
index XXXXXXX..XXXXXXX 100644
206
--- a/migration/savevm.c
207
+++ b/migration/savevm.c
208
@@ -XXX,XX +XXX,XX @@ static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
209
static int loadvm_process_enable_colo(MigrationIncomingState *mis)
210
{
211
migration_incoming_enable_colo();
212
- return 0;
213
+ return colo_init_ram_cache();
214
}
215
216
/*
217
--
95
--
218
2.5.0
96
2.7.4
219
97
220
98
diff view generated by jsdifflib
1
From: Zhang Chen <zhangckid@gmail.com>

It's a good idea to use a notifier to notify the COLO frame of
inconsistent packet comparison results.

Signed-off-by: Zhang Chen <zhangckid@gmail.com>
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 net/colo-compare.c | 37 ++++++++++++++++++++++++++-----------
 net/colo-compare.h | 2 ++
 2 files changed, 28 insertions(+), 11 deletions(-)

From: Eugenio Pérez <eperezma@redhat.com>

Initial version of the shadow virtqueue that actually forwards buffers.
There is no IOMMU support at the moment, and that will be addressed in
future patches of this series. Since all vhost-vdpa devices use forced
IOMMU, this means that SVQ is not usable at this point of the series on
any device.

For simplicity it only supports modern devices, that expect the vring
in little endian, with split ring and no event idx or indirect
descriptors. Support for them will not be added in this series.

It reuses the VirtQueue code for the device part. The driver part is
based on Linux's virtio_ring driver, but with stripped functionality
and optimizations so it's easier to review.

However, forwarding buffers has some particular pieces: one of the most
unexpected ones is that a guest's buffer can expand through more than
one descriptor in SVQ. While this is handled gracefully by qemu's
emulated virtio devices, it may cause unexpected SVQ queue full. This
patch also solves it by checking for this condition at both guest's
kicks and device's calls. The code may be more elegant in the future if
SVQ code runs in its own iocontext.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 352 ++++++++++++++++++++++++++++++++++++-
 hw/virtio/vhost-shadow-virtqueue.h | 26 +++
 hw/virtio/vhost-vdpa.c | 155 +++++++++++++++-
 3 files changed, 522 insertions(+), 11 deletions(-)
diff --git a/net/colo-compare.c b/net/colo-compare.c
33
34
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
16
index XXXXXXX..XXXXXXX 100644
35
index XXXXXXX..XXXXXXX 100644
17
--- a/net/colo-compare.c
36
--- a/hw/virtio/vhost-shadow-virtqueue.c
18
+++ b/net/colo-compare.c
37
+++ b/hw/virtio/vhost-shadow-virtqueue.c
19
@@ -XXX,XX +XXX,XX @@
38
@@ -XXX,XX +XXX,XX @@
20
#include "sysemu/iothread.h"
39
#include "qemu/error-report.h"
21
#include "net/colo-compare.h"
40
#include "qapi/error.h"
22
#include "migration/colo.h"
41
#include "qemu/main-loop.h"
23
+#include "migration/migration.h"
42
+#include "qemu/log.h"
24
43
+#include "qemu/memalign.h"
25
#define TYPE_COLO_COMPARE "colo-compare"
44
#include "linux-headers/linux/vhost.h"
26
#define COLO_COMPARE(obj) \
45
27
@@ -XXX,XX +XXX,XX @@
46
/**
28
static QTAILQ_HEAD(, CompareState) net_compares =
47
@@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp)
29
QTAILQ_HEAD_INITIALIZER(net_compares);
30
31
+static NotifierList colo_compare_notifiers =
32
+ NOTIFIER_LIST_INITIALIZER(colo_compare_notifiers);
33
+
34
#define COMPARE_READ_LEN_MAX NET_BUFSIZE
35
#define MAX_QUEUE_SIZE 1024
36
37
@@ -XXX,XX +XXX,XX @@ static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt,
38
return false;
39
}
48
}
40
49
41
+static void colo_compare_inconsistency_notify(void)
50
/**
42
+{
51
- * Forward guest notifications.
43
+ notifier_list_notify(&colo_compare_notifiers,
52
+ * Number of descriptors that the SVQ can make available from the guest.
44
+ migrate_get_current());
53
+ *
45
+}
54
+ * @svq: The svq
46
+
55
+ */
47
static void colo_compare_tcp(CompareState *s, Connection *conn)
56
+static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
57
+{
58
+ return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
59
+}
60
+
61
+static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
62
+ const struct iovec *iovec, size_t num,
63
+ bool more_descs, bool write)
64
+{
65
+ uint16_t i = svq->free_head, last = svq->free_head;
66
+ unsigned n;
67
+ uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
68
+ vring_desc_t *descs = svq->vring.desc;
69
+
70
+ if (num == 0) {
71
+ return;
72
+ }
73
+
74
+ for (n = 0; n < num; n++) {
75
+ if (more_descs || (n + 1 < num)) {
76
+ descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
77
+ } else {
78
+ descs[i].flags = flags;
79
+ }
80
+ descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base);
81
+ descs[i].len = cpu_to_le32(iovec[n].iov_len);
82
+
83
+ last = i;
84
+ i = cpu_to_le16(descs[i].next);
85
+ }
86
+
87
+ svq->free_head = le16_to_cpu(descs[last].next);
88
+}
89
+
90
+static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
91
+ VirtQueueElement *elem, unsigned *head)
92
+{
93
+ unsigned avail_idx;
94
+ vring_avail_t *avail = svq->vring.avail;
95
+
96
+ *head = svq->free_head;
97
+
98
+ /* We need some descriptors here */
99
+ if (unlikely(!elem->out_num && !elem->in_num)) {
100
+ qemu_log_mask(LOG_GUEST_ERROR,
101
+ "Guest provided element with no descriptors");
102
+ return false;
103
+ }
104
+
105
+ vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0,
106
+ false);
107
+ vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
108
+
109
+ /*
110
+ * Put the entry in the available array (but don't update avail->idx until
111
+ * they do sync).
112
+ */
113
+ avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
114
+ avail->ring[avail_idx] = cpu_to_le16(*head);
115
+ svq->shadow_avail_idx++;
116
+
117
+ /* Update the avail index after write the descriptor */
118
+ smp_wmb();
119
+ avail->idx = cpu_to_le16(svq->shadow_avail_idx);
120
+
121
+ return true;
122
+}
123
+
124
+static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
125
+{
126
+ unsigned qemu_head;
127
+ bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
128
+ if (unlikely(!ok)) {
129
+ return false;
130
+ }
131
+
132
+ svq->ring_id_maps[qemu_head] = elem;
133
+ return true;
134
+}
135
+
136
+static void vhost_svq_kick(VhostShadowVirtqueue *svq)
137
+{
138
+ /*
139
+ * We need to expose the available array entries before checking the used
140
+ * flags
141
+ */
142
+ smp_mb();
143
+ if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
144
+ return;
145
+ }
146
+
147
+ event_notifier_set(&svq->hdev_kick);
148
+}
149
+
150
+/**
151
+ * Forward available buffers.
152
+ *
153
+ * @svq: Shadow VirtQueue
154
+ *
155
+ * Note that this function does not guarantee that all guest's available
156
+ * buffers are available to the device in SVQ avail ring. The guest may have
157
+ * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in
158
+ * qemu vaddr.
159
+ *
160
+ * If that happens, guest's kick notifications will be disabled until the
161
+ * device uses some buffers.
162
+ */
163
+static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
164
+{
165
+ /* Clear event notifier */
166
+ event_notifier_test_and_clear(&svq->svq_kick);
167
+
168
+ /* Forward to the device as many available buffers as possible */
169
+ do {
170
+ virtio_queue_set_notification(svq->vq, false);
171
+
172
+ while (true) {
173
+ VirtQueueElement *elem;
174
+ bool ok;
175
+
176
+ if (svq->next_guest_avail_elem) {
177
+ elem = g_steal_pointer(&svq->next_guest_avail_elem);
178
+ } else {
179
+ elem = virtqueue_pop(svq->vq, sizeof(*elem));
180
+ }
181
+
182
+ if (!elem) {
183
+ break;
184
+ }
185
+
186
+ if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) {
187
+ /*
188
+ * This condition is possible since a contiguous buffer in GPA
189
+ * does not imply a contiguous buffer in qemu's VA
190
+ * scatter-gather segments. If that happens, the buffer exposed
191
+ * to the device needs to be a chain of descriptors at this
192
+ * moment.
193
+ *
194
+ * SVQ cannot hold more available buffers if we are here:
195
+ * queue the current guest descriptor and ignore further kicks
196
+ * until some elements are used.
197
+ */
198
+ svq->next_guest_avail_elem = elem;
199
+ return;
200
+ }
201
+
202
+ ok = vhost_svq_add(svq, elem);
203
+ if (unlikely(!ok)) {
204
+ /* VQ is broken, just return and ignore any other kicks */
205
+ return;
206
+ }
207
+ vhost_svq_kick(svq);
208
+ }
209
+
210
+ virtio_queue_set_notification(svq->vq, true);
211
+ } while (!virtio_queue_empty(svq->vq));
212
+}
213
+
214
+/**
215
+ * Handle guest's kick.
216
*
217
* @n: guest kick event notifier, the one that guest set to notify svq.
218
*/
219
-static void vhost_handle_guest_kick(EventNotifier *n)
220
+static void vhost_handle_guest_kick_notifier(EventNotifier *n)
48
{
221
{
49
Packet *ppkt = NULL, *spkt = NULL;
222
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
50
@@ -XXX,XX +XXX,XX @@ sec:
223
event_notifier_test_and_clear(n);
51
qemu_hexdump((char *)spkt->data, stderr,
224
- event_notifier_set(&svq->hdev_kick);
52
"colo-compare spkt", spkt->size);
225
+ vhost_handle_guest_kick(svq);
53
226
+}
54
- /*
227
+
55
- * colo_compare_inconsistent_notify();
228
+static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
56
- * TODO: notice to checkpoint();
229
+{
57
- */
230
+ if (svq->last_used_idx != svq->shadow_used_idx) {
58
+ colo_compare_inconsistency_notify();
231
+ return true;
232
+ }
233
+
234
+ svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
235
+
236
+ return svq->last_used_idx != svq->shadow_used_idx;
237
}
238
239
/**
240
- * Forward vhost notifications
241
+ * Enable vhost device calls after disable them.
242
+ *
243
+ * @svq: The svq
244
+ *
245
+ * It returns false if there are pending used buffers from the vhost device,
246
+ * avoiding the possible races between SVQ checking for more work and enabling
247
+ * callbacks. True if SVQ used vring has no more pending buffers.
248
+ */
249
+static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
250
+{
251
+ svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
252
+ /* Make sure the flag is written before the read of used_idx */
253
+ smp_mb();
254
+ return !vhost_svq_more_used(svq);
255
+}
256
+
257
+static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
258
+{
259
+ svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
260
+}
261
+
262
+static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
263
+ uint32_t *len)
264
+{
265
+ vring_desc_t *descs = svq->vring.desc;
266
+ const vring_used_t *used = svq->vring.used;
267
+ vring_used_elem_t used_elem;
268
+ uint16_t last_used;
269
+
270
+ if (!vhost_svq_more_used(svq)) {
271
+ return NULL;
272
+ }
273
+
274
+ /* Only get used array entries after they have been exposed by dev */
275
+ smp_rmb();
276
+ last_used = svq->last_used_idx & (svq->vring.num - 1);
277
+ used_elem.id = le32_to_cpu(used->ring[last_used].id);
278
+ used_elem.len = le32_to_cpu(used->ring[last_used].len);
279
+
280
+ svq->last_used_idx++;
281
+ if (unlikely(used_elem.id >= svq->vring.num)) {
282
+ qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
283
+ svq->vdev->name, used_elem.id);
284
+ return NULL;
285
+ }
286
+
287
+ if (unlikely(!svq->ring_id_maps[used_elem.id])) {
288
+ qemu_log_mask(LOG_GUEST_ERROR,
289
+ "Device %s says index %u is used, but it was not available",
290
+ svq->vdev->name, used_elem.id);
291
+ return NULL;
292
+ }
293
+
294
+ descs[used_elem.id].next = svq->free_head;
295
+ svq->free_head = used_elem.id;
296
+
297
+ *len = used_elem.len;
298
+ return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
299
+}
300
+
301
+static void vhost_svq_flush(VhostShadowVirtqueue *svq,
302
+ bool check_for_avail_queue)
303
+{
304
+ VirtQueue *vq = svq->vq;
305
+
306
+ /* Forward as many used buffers as possible. */
307
+ do {
308
+ unsigned i = 0;
309
+
310
+ vhost_svq_disable_notification(svq);
311
+ while (true) {
312
+ uint32_t len;
313
+ g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
314
+ if (!elem) {
315
+ break;
316
+ }
317
+
318
+ if (unlikely(i >= svq->vring.num)) {
319
+ qemu_log_mask(LOG_GUEST_ERROR,
320
+ "More than %u used buffers obtained in a %u size SVQ",
321
+ i, svq->vring.num);
322
+ virtqueue_fill(vq, elem, len, i);
323
+ virtqueue_flush(vq, i);
324
+ return;
325
+ }
326
+ virtqueue_fill(vq, elem, len, i++);
327
+ }
328
+
329
+ virtqueue_flush(vq, i);
330
+ event_notifier_set(&svq->svq_call);
331
+
332
+ if (check_for_avail_queue && svq->next_guest_avail_elem) {
333
+ /*
334
+ * Avail ring was full when vhost_svq_flush was called, so it's a
335
+ * good moment to make more descriptors available if possible.
336
+ */
337
+ vhost_handle_guest_kick(svq);
338
+ }
339
+ } while (!vhost_svq_enable_notification(svq));
340
+}
341
+
342
+/**
343
+ * Forward used buffers.
344
*
345
* @n: hdev call event notifier, the one that device set to notify svq.
346
+ *
347
+ * Note that we are not making any buffers available in the loop, there is no
348
+ * way that it runs more than virtqueue size times.
349
*/
350
static void vhost_svq_handle_call(EventNotifier *n)
351
{
352
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
353
hdev_call);
354
event_notifier_test_and_clear(n);
355
- event_notifier_set(&svq->svq_call);
356
+ vhost_svq_flush(svq, true);
357
}
358
359
/**
360
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
361
if (poll_start) {
362
event_notifier_init_fd(svq_kick, svq_kick_fd);
363
event_notifier_set(svq_kick);
364
- event_notifier_set_handler(svq_kick, vhost_handle_guest_kick);
365
+ event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
366
+ }
367
+}
368
+
369
+/**
370
+ * Start the shadow virtqueue operation.
371
+ *
372
+ * @svq: Shadow Virtqueue
373
+ * @vdev: VirtIO device
374
+ * @vq: Virtqueue to shadow
375
+ */
376
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
377
+ VirtQueue *vq)
378
+{
379
+ size_t desc_size, driver_size, device_size;
380
+
381
+ svq->next_guest_avail_elem = NULL;
382
+ svq->shadow_avail_idx = 0;
383
+ svq->shadow_used_idx = 0;
384
+ svq->last_used_idx = 0;
385
+ svq->vdev = vdev;
386
+ svq->vq = vq;
387
+
388
+ svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
389
+ driver_size = vhost_svq_driver_area_size(svq);
390
+ device_size = vhost_svq_device_area_size(svq);
391
+ svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
392
+ desc_size = sizeof(vring_desc_t) * svq->vring.num;
393
+ svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
394
+ memset(svq->vring.desc, 0, driver_size);
395
+ svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
396
+ memset(svq->vring.used, 0, device_size);
397
+ svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
398
+ for (unsigned i = 0; i < svq->vring.num - 1; i++) {
399
+ svq->vring.desc[i].next = cpu_to_le16(i + 1);
59
}
400
}
60
}
401
}
61
402
62
@@ -XXX,XX +XXX,XX @@ static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)
403
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
404
void vhost_svq_stop(VhostShadowVirtqueue *svq)
405
{
406
event_notifier_set_handler(&svq->svq_kick, NULL);
407
+ g_autofree VirtQueueElement *next_avail_elem = NULL;
408
+
409
+ if (!svq->vq) {
410
+ return;
411
+ }
412
+
413
+ /* Send all pending used descriptors to guest */
414
+ vhost_svq_flush(svq, false);
415
+
416
+ for (unsigned i = 0; i < svq->vring.num; ++i) {
417
+ g_autofree VirtQueueElement *elem = NULL;
418
+ elem = g_steal_pointer(&svq->ring_id_maps[i]);
419
+ if (elem) {
420
+ virtqueue_detach_element(svq->vq, elem, 0);
421
+ }
422
+ }
423
+
424
+ next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
425
+ if (next_avail_elem) {
426
+ virtqueue_detach_element(svq->vq, next_avail_elem, 0);
427
+ }
428
+ svq->vq = NULL;
429
+ g_free(svq->ring_id_maps);
430
+ qemu_vfree(svq->vring.desc);
431
+ qemu_vfree(svq->vring.used);
432
}
433
434
/**
435
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
436
index XXXXXXX..XXXXXXX 100644
437
--- a/hw/virtio/vhost-shadow-virtqueue.h
438
+++ b/hw/virtio/vhost-shadow-virtqueue.h
439
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
440
441
/* Guest's call notifier, where the SVQ calls guest. */
442
EventNotifier svq_call;
443
+
444
+ /* Virtio queue shadowing */
445
+ VirtQueue *vq;
446
+
447
+ /* Virtio device */
448
+ VirtIODevice *vdev;
449
+
450
+ /* Map for use the guest's descriptors */
451
+ VirtQueueElement **ring_id_maps;
452
+
453
+ /* Next VirtQueue element that guest made available */
454
+ VirtQueueElement *next_guest_avail_elem;
455
+
456
+ /* Next head to expose to the device */
457
+ uint16_t shadow_avail_idx;
458
+
459
+ /* Next free descriptor */
460
+ uint16_t free_head;
461
+
462
+ /* Last seen used idx */
463
+ uint16_t shadow_used_idx;
464
+
465
+ /* Next head to consume from the device */
466
+ uint16_t last_used_idx;
467
} VhostShadowVirtqueue;
468
469
bool vhost_svq_valid_features(uint64_t features, Error **errp);
470
@@ -XXX,XX +XXX,XX @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
471
size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
472
size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
473
474
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
475
+ VirtQueue *vq);
476
void vhost_svq_stop(VhostShadowVirtqueue *svq);
477
478
VhostShadowVirtqueue *vhost_svq_new(void);
479
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
480
index XXXXXXX..XXXXXXX 100644
481
--- a/hw/virtio/vhost-vdpa.c
482
+++ b/hw/virtio/vhost-vdpa.c
483
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
484
* Note that this function does not rewind kick file descriptor if cannot set
485
* call one.
486
*/
487
-static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
488
- VhostShadowVirtqueue *svq, unsigned idx,
489
- Error **errp)
490
+static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
491
+ VhostShadowVirtqueue *svq, unsigned idx,
492
+ Error **errp)
493
{
494
struct vhost_vring_file file = {
495
.index = dev->vq_index + idx,
496
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
497
r = vhost_vdpa_set_vring_dev_kick(dev, &file);
498
if (unlikely(r != 0)) {
499
error_setg_errno(errp, -r, "Can't set device kick fd");
500
- return false;
501
+ return r;
63
}
502
}
503
504
event_notifier = &svq->hdev_call;
505
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
506
error_setg_errno(errp, -r, "Can't set device call fd");
507
}
508
509
+ return r;
510
+}
511
+
512
+/**
513
+ * Unmap a SVQ area in the device
514
+ */
515
+static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
516
+ hwaddr size)
517
+{
518
+ int r;
519
+
520
+ size = ROUND_UP(size, qemu_real_host_page_size);
521
+ r = vhost_vdpa_dma_unmap(v, iova, size);
522
+ return r == 0;
523
+}
524
+
525
+static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
526
+ const VhostShadowVirtqueue *svq)
527
+{
528
+ struct vhost_vdpa *v = dev->opaque;
529
+ struct vhost_vring_addr svq_addr;
530
+ size_t device_size = vhost_svq_device_area_size(svq);
531
+ size_t driver_size = vhost_svq_driver_area_size(svq);
532
+ bool ok;
533
+
534
+ vhost_svq_get_vring_addr(svq, &svq_addr);
535
+
536
+ ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
537
+ if (unlikely(!ok)) {
538
+ return false;
539
+ }
540
+
541
+ return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
542
+}
543
+
544
+/**
545
+ * Map the shadow virtqueue rings in the device
546
+ *
547
+ * @dev: The vhost device
548
+ * @svq: The shadow virtqueue
549
+ * @addr: Assigned IOVA addresses
550
+ * @errp: Error pointer
551
+ */
552
+static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
553
+ const VhostShadowVirtqueue *svq,
554
+ struct vhost_vring_addr *addr,
555
+ Error **errp)
556
+{
557
+ struct vhost_vdpa *v = dev->opaque;
558
+ size_t device_size = vhost_svq_device_area_size(svq);
559
+ size_t driver_size = vhost_svq_driver_area_size(svq);
560
+ int r;
561
+
562
+ ERRP_GUARD();
563
+ vhost_svq_get_vring_addr(svq, addr);
564
+
565
+ r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
566
+ (void *)(uintptr_t)addr->desc_user_addr, true);
567
+ if (unlikely(r != 0)) {
568
+ error_setg_errno(errp, -r, "Cannot create vq driver region: ");
569
+ return false;
570
+ }
571
+
572
+ r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
573
+ (void *)(intptr_t)addr->used_user_addr, false);
574
+ if (unlikely(r != 0)) {
575
+ error_setg_errno(errp, -r, "Cannot create vq device region: ");
576
+ }
577
+
578
+ return r == 0;
579
+}
580
+
581
+static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
582
+ VhostShadowVirtqueue *svq, unsigned idx,
583
+ Error **errp)
584
+{
585
+ uint16_t vq_index = dev->vq_index + idx;
586
+ struct vhost_vring_state s = {
587
+ .index = vq_index,
588
+ };
589
+ int r;
590
+
591
+ r = vhost_vdpa_set_dev_vring_base(dev, &s);
592
+ if (unlikely(r)) {
593
+ error_setg_errno(errp, -r, "Cannot set vring base");
594
+ return false;
595
+ }
596
+
597
+ r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
598
return r == 0;
64
}
599
}
65
600
66
+void colo_compare_register_notifier(Notifier *notify)
601
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
67
+{
68
+ notifier_list_add(&colo_compare_notifiers, notify);
69
+}
70
+
71
+void colo_compare_unregister_notifier(Notifier *notify)
72
+{
73
+ notifier_remove(notify);
74
+}
75
+
76
static int colo_old_packet_check_one_conn(Connection *conn,
77
- void *user_data)
78
+ void *user_data)
79
{
80
GList *result = NULL;
81
int64_t check_time = REGULAR_PACKET_CHECK_MS;
82
@@ -XXX,XX +XXX,XX @@ static int colo_old_packet_check_one_conn(Connection *conn,
83
84
if (result) {
85
/* Do checkpoint will flush old packet */
86
- /*
87
- * TODO: Notify colo frame to do checkpoint.
88
- * colo_compare_inconsistent_notify();
89
- */
90
+ colo_compare_inconsistency_notify();
91
return 0;
92
}
602
}
93
603
94
@@ -XXX,XX +XXX,XX @@ static void colo_compare_packet(CompareState *s, Connection *conn,
604
for (i = 0; i < v->shadow_vqs->len; ++i) {
95
/*
605
+ VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
96
* If one packet arrive late, the secondary_list or
606
VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
97
* primary_list will be empty, so we can't compare it
607
+ struct vhost_vring_addr addr = {
98
- * until next comparison.
608
+ .index = i,
99
+ * until next comparison. If the packets in the list are
609
+ };
100
+ * timeout, it will trigger a checkpoint request.
610
+ int r;
101
*/
611
bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
102
trace_colo_compare_main("packet different");
612
if (unlikely(!ok)) {
103
g_queue_push_head(&conn->primary_list, pkt);
613
- error_reportf_err(err, "Cannot setup SVQ %u: ", i);
104
- /* TODO: colo_notify_checkpoint();*/
614
+ goto err;
105
+ colo_compare_inconsistency_notify();
615
+ }
106
break;
616
+
617
+ vhost_svq_start(svq, dev->vdev, vq);
618
+ ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
619
+ if (unlikely(!ok)) {
620
+ goto err_map;
621
+ }
622
+
623
+ /* Override vring GPA set by vhost subsystem */
624
+ r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
625
+ if (unlikely(r != 0)) {
626
+ error_setg_errno(&err, -r, "Cannot set device address");
627
+ goto err_set_addr;
628
+ }
629
+ }
630
+
631
+ return true;
632
+
633
+err_set_addr:
634
+ vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
635
+
636
+err_map:
637
+ vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
638
+
639
+err:
640
+ error_reportf_err(err, "Cannot setup SVQ %u: ", i);
641
+ for (unsigned j = 0; j < i; ++j) {
642
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
643
+ vhost_vdpa_svq_unmap_rings(dev, svq);
644
+ vhost_svq_stop(svq);
645
+ }
646
+
647
+ return false;
648
+}
649
+
650
+static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
651
+{
652
+ struct vhost_vdpa *v = dev->opaque;
653
+
654
+ if (!v->shadow_vqs) {
655
+ return true;
656
+ }
657
+
658
+ for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
659
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
660
+ bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
661
+ if (unlikely(!ok)) {
662
return false;
107
}
663
}
108
}
664
}
109
diff --git a/net/colo-compare.h b/net/colo-compare.h
665
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
110
index XXXXXXX..XXXXXXX 100644
666
}
111
--- a/net/colo-compare.h
667
vhost_vdpa_set_vring_ready(dev);
112
+++ b/net/colo-compare.h
668
} else {
113
@@ -XXX,XX +XXX,XX @@
669
+ ok = vhost_vdpa_svqs_stop(dev);
114
#define QEMU_COLO_COMPARE_H
670
+ if (unlikely(!ok)) {
115
671
+ return -1;
116
void colo_notify_compares_event(void *opaque, int event, Error **errp);
672
+ }
117
+void colo_compare_register_notifier(Notifier *notify);
673
vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
118
+void colo_compare_unregister_notifier(Notifier *notify);
674
}
119
675
120
#endif /* QEMU_COLO_COMPARE_H */
121
--
676
--
122
2.5.0
677
2.7.4
123
678
124
679
diff view generated by jsdifflib
Deleted patch
1
From: Zhang Chen <zhangckid@gmail.com>

Make sure the master starts block replication only after the slave's
block replication has started.

Besides, we need to activate the VM's blocks before going into
COLO state.

Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 migration/colo.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 migration/migration.c | 10 ++++++++++
 2 files changed, 53 insertions(+)
18
19
diff --git a/migration/colo.c b/migration/colo.c
20
index XXXXXXX..XXXXXXX 100644
21
--- a/migration/colo.c
22
+++ b/migration/colo.c
23
@@ -XXX,XX +XXX,XX @@
24
#include "replication.h"
25
#include "net/colo-compare.h"
26
#include "net/colo.h"
27
+#include "block/block.h"
28
29
static bool vmstate_loading;
30
static Notifier packets_compare_notifier;
31
@@ -XXX,XX +XXX,XX @@ static void secondary_vm_do_failover(void)
32
{
33
int old_state;
34
MigrationIncomingState *mis = migration_incoming_get_current();
35
+ Error *local_err = NULL;
36
37
/* Can not do failover during the process of VM's loading VMstate, Or
38
* it will break the secondary VM.
39
@@ -XXX,XX +XXX,XX @@ static void secondary_vm_do_failover(void)
40
migrate_set_state(&mis->state, MIGRATION_STATUS_COLO,
41
MIGRATION_STATUS_COMPLETED);
42
43
+ replication_stop_all(true, &local_err);
44
+ if (local_err) {
45
+ error_report_err(local_err);
46
+ }
47
+
48
if (!autostart) {
49
error_report("\"-S\" qemu option will be ignored in secondary side");
50
/* recover runstate to normal migration finish state */
51
@@ -XXX,XX +XXX,XX @@ static void primary_vm_do_failover(void)
52
{
53
MigrationState *s = migrate_get_current();
54
int old_state;
55
+ Error *local_err = NULL;
56
57
migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
58
MIGRATION_STATUS_COMPLETED);
59
@@ -XXX,XX +XXX,XX @@ static void primary_vm_do_failover(void)
60
FailoverStatus_str(old_state));
61
return;
62
}
63
+
64
+ replication_stop_all(true, &local_err);
65
+ if (local_err) {
66
+ error_report_err(local_err);
67
+ local_err = NULL;
68
+ }
69
+
70
/* Notify COLO thread that failover work is finished */
71
qemu_sem_post(&s->colo_exit_sem);
72
}
73
@@ -XXX,XX +XXX,XX @@ static int colo_do_checkpoint_transaction(MigrationState *s,
74
qemu_savevm_state_header(fb);
75
qemu_savevm_state_setup(fb);
76
qemu_mutex_lock_iothread();
77
+ replication_do_checkpoint_all(&local_err);
78
+ if (local_err) {
79
+ qemu_mutex_unlock_iothread();
80
+ goto out;
81
+ }
82
qemu_savevm_state_complete_precopy(fb, false, false);
83
qemu_mutex_unlock_iothread();
84
85
@@ -XXX,XX +XXX,XX @@ static void colo_process_checkpoint(MigrationState *s)
86
object_unref(OBJECT(bioc));
87
88
qemu_mutex_lock_iothread();
89
+ replication_start_all(REPLICATION_MODE_PRIMARY, &local_err);
90
+ if (local_err) {
91
+ qemu_mutex_unlock_iothread();
92
+ goto out;
93
+ }
94
+
95
vm_start();
96
qemu_mutex_unlock_iothread();
97
trace_colo_vm_state_change("stop", "run");
98
@@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque)
99
object_unref(OBJECT(bioc));
100
101
qemu_mutex_lock_iothread();
102
+ replication_start_all(REPLICATION_MODE_SECONDARY, &local_err);
103
+ if (local_err) {
104
+ qemu_mutex_unlock_iothread();
105
+ goto out;
106
+ }
107
vm_start();
108
trace_colo_vm_state_change("stop", "run");
109
qemu_mutex_unlock_iothread();
110
@@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque)
111
goto out;
112
}
113
114
+ replication_get_error_all(&local_err);
115
+ if (local_err) {
116
+ qemu_mutex_unlock_iothread();
117
+ goto out;
118
+ }
119
+ /* discard colo disk buffer */
120
+ replication_do_checkpoint_all(&local_err);
121
+ if (local_err) {
122
+ qemu_mutex_unlock_iothread();
123
+ goto out;
124
+ }
125
+
126
vmstate_loading = false;
127
vm_start();
128
trace_colo_vm_state_change("stop", "run");
129
diff --git a/migration/migration.c b/migration/migration.c
130
index XXXXXXX..XXXXXXX 100644
131
--- a/migration/migration.c
132
+++ b/migration/migration.c
133
@@ -XXX,XX +XXX,XX @@ static void process_incoming_migration_co(void *opaque)
134
MigrationIncomingState *mis = migration_incoming_get_current();
135
PostcopyState ps;
136
int ret;
137
+ Error *local_err = NULL;
138
139
assert(mis->from_src_file);
140
mis->migration_incoming_co = qemu_coroutine_self();
141
@@ -XXX,XX +XXX,XX @@ static void process_incoming_migration_co(void *opaque)
142
143
/* we get COLO info, and know if we are in COLO mode */
144
if (!ret && migration_incoming_enable_colo()) {
145
+ /* Make sure all file formats flush their mutable metadata */
146
+ bdrv_invalidate_cache_all(&local_err);
147
+ if (local_err) {
148
+ migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
149
+ MIGRATION_STATUS_FAILED);
150
+ error_report_err(local_err);
151
+ exit(EXIT_FAILURE);
152
+ }
153
+
154
qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
155
colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
156
mis->have_colo_incoming_thread = true;
157
--
158
2.5.0
159
160
diff view generated by jsdifflib
Deleted patch
1
From: Zhang Chen <zhangckid@gmail.com>

We record the address of the dirty pages that were received;
it will help flush the pages that were cached into SVM.

Here, it is a trick: we record dirty pages by re-using the migration
dirty bitmap. In a later patch, we will start the dirty log for SVM,
just like migration. In this way, we can record both the dirty pages
caused by PVM and SVM; we only flush those dirty pages from the RAM
cache while doing a checkpoint.

Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 migration/ram.c | 43 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)
20
21
diff --git a/migration/ram.c b/migration/ram.c
22
index XXXXXXX..XXXXXXX 100644
23
--- a/migration/ram.c
24
+++ b/migration/ram.c
25
@@ -XXX,XX +XXX,XX @@ static inline void *colo_cache_from_block_offset(RAMBlock *block,
26
__func__, block->idstr);
27
return NULL;
28
}
29
+
30
+ /*
31
+ * During colo checkpoint, we need bitmap of these migrated pages.
32
+ * It help us to decide which pages in ram cache should be flushed
33
+ * into VM's RAM later.
34
+ */
35
+ if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
36
+ ram_state->migration_dirty_pages++;
37
+ }
38
return block->colo_cache + offset;
39
}
40
41
@@ -XXX,XX +XXX,XX @@ int colo_init_ram_cache(void)
42
RAMBlock *block;
43
44
rcu_read_lock();
45
- QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
46
+ RAMBLOCK_FOREACH_MIGRATABLE(block) {
47
block->colo_cache = qemu_anon_ram_alloc(block->used_length,
48
NULL,
49
false);
50
@@ -XXX,XX +XXX,XX @@ int colo_init_ram_cache(void)
51
memcpy(block->colo_cache, block->host, block->used_length);
52
}
53
rcu_read_unlock();
54
+ /*
55
+ * Record the dirty pages that sent by PVM, we use this dirty bitmap together
56
+ * with to decide which page in cache should be flushed into SVM's RAM. Here
57
+ * we use the same name 'ram_bitmap' as for migration.
58
+ */
59
+ if (ram_bytes_total()) {
60
+ RAMBlock *block;
61
+
62
+ RAMBLOCK_FOREACH_MIGRATABLE(block) {
63
+ unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
64
+
65
+ block->bmap = bitmap_new(pages);
66
+ bitmap_set(block->bmap, 0, pages);
67
+ }
68
+ }
69
+ ram_state = g_new0(RAMState, 1);
70
+ ram_state->migration_dirty_pages = 0;
71
+
72
return 0;
73
74
out_locked:
75
- QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
76
+
77
+ RAMBLOCK_FOREACH_MIGRATABLE(block) {
78
if (block->colo_cache) {
79
qemu_anon_ram_free(block->colo_cache, block->used_length);
80
block->colo_cache = NULL;
81
@@ -XXX,XX +XXX,XX @@ void colo_release_ram_cache(void)
82
{
83
RAMBlock *block;
84
85
+ RAMBLOCK_FOREACH_MIGRATABLE(block) {
86
+ g_free(block->bmap);
87
+ block->bmap = NULL;
88
+ }
89
+
90
rcu_read_lock();
91
- QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
92
+
93
+ RAMBLOCK_FOREACH_MIGRATABLE(block) {
94
if (block->colo_cache) {
95
qemu_anon_ram_free(block->colo_cache, block->used_length);
96
block->colo_cache = NULL;
97
}
98
}
99
+
100
rcu_read_unlock();
101
+ g_free(ram_state);
102
+ ram_state = NULL;
103
}
104
105
/**
106
--
107
2.5.0
108
109
diff view generated by jsdifflib
1
From: Martin Wilck <mwilck@suse.com>
1
From: Eugenio Pérez <eperezma@redhat.com>
2
2
3
The e1000 emulation silently discards RX packets if there's
3
This iova tree function looks for a hole in the allocated
4
insufficient space in the ring buffer. This leads to errors
4
regions and returns a totally new translation for a given translated
5
on higher-level protocols in the guest, with no indication
5
address.
6
about the error cause.
6
7
7
Its main usage is to allow devices to access qemu's address space,
8
This patch increments the "Missed Packets Count" (MPC) and
8
remapping the guest's address space into a new iova space to which qemu can add chunks of
9
"Receive No Buffers Count" (RNBC) HW counters in this case.
9
addresses.
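
For illustration only (not part of the patch): a minimal caller of the new
allocation API might look like the sketch below. The buffer, the iova window
and the "length - 1" DMAMap size convention are assumptions taken from how the
rest of this series fills DMAMap.

    #include "qemu/osdep.h"
    #include "qemu/iova-tree.h"

    /* Illustrative only: allocate an iova for a 4 KiB chunk of qemu memory. */
    static char example_buf[4096];

    static void alloc_example(void)
    {
        IOVATree *tree = iova_tree_new();
        DMAMap map = {
            .translated_addr = (hwaddr)(uintptr_t)example_buf, /* qemu VA */
            .size = sizeof(example_buf) - 1,  /* sizes are "length - 1" here */
            .perm = IOMMU_RW,
        };

        /* Ask for any free range inside an arbitrary [0x1000, 2^36) window */
        if (iova_tree_alloc_map(tree, &map, 0x1000, (1ULL << 36) - 1) == IOVA_OK) {
            /* map.iova now holds the iova assigned to this chunk */
        }

        iova_tree_destroy(tree);
    }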
10
As the emulation has no FIFO for buffering packets that can't
10
11
immediately be pushed to the guest, these two registers are
11
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
12
practically equivalent (see 10.2.7.4, 10.2.7.33 in
12
Reviewed-by: Peter Xu <peterx@redhat.com>
13
https://www.intel.com/content/www/us/en/embedded/products/networking/82574l-gbe-controller-datasheet.html).
13
Acked-by: Michael S. Tsirkin <mst@redhat.com>
14
15
On a Linux guest, the register content will be reflected in
16
the "rx_missed_errors" and "rx_no_buffer_count" stats from
17
"ethtool -S", and in the "missed" stat from "ip -s -s link show",
18
giving at least some hint about the error cause inside the guest.
19
20
If the cause is known, problems like this can often be avoided
21
easily, by increasing the number of RX descriptors in the guest
22
e1000 driver (e.g under Linux, "e1000.RxDescriptors=1024").
23
24
The patch also adds a qemu trace message for this condition.
25
26
Signed-off-by: Martin Wilck <mwilck@suse.com>
27
Signed-off-by: Jason Wang <jasowang@redhat.com>
14
Signed-off-by: Jason Wang <jasowang@redhat.com>
28
---
15
---
29
hw/net/e1000.c | 16 +++++++++++++---
16
include/qemu/iova-tree.h | 18 +++++++
30
hw/net/trace-events | 3 +++
17
util/iova-tree.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++
31
2 files changed, 16 insertions(+), 3 deletions(-)
18
2 files changed, 154 insertions(+)
32
19
33
diff --git a/hw/net/e1000.c b/hw/net/e1000.c
20
diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
34
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
35
--- a/hw/net/e1000.c
22
--- a/include/qemu/iova-tree.h
36
+++ b/hw/net/e1000.c
23
+++ b/include/qemu/iova-tree.h
37
@@ -XXX,XX +XXX,XX @@
24
@@ -XXX,XX +XXX,XX @@
38
#include "qemu/range.h"
25
#define IOVA_OK (0)
39
26
#define IOVA_ERR_INVALID (-1) /* Invalid parameters */
40
#include "e1000x_common.h"
27
#define IOVA_ERR_OVERLAP (-2) /* IOVA range overlapped */
41
+#include "trace.h"
28
+#define IOVA_ERR_NOMEM (-3) /* Cannot allocate */
42
29
43
static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
30
typedef struct IOVATree IOVATree;
44
31
typedef struct DMAMap {
45
@@ -XXX,XX +XXX,XX @@ static uint64_t rx_desc_base(E1000State *s)
32
@@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova);
46
return (bah << 32) + bal;
33
void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator);
34
35
/**
36
+ * iova_tree_alloc_map:
37
+ *
38
+ * @tree: the iova tree to allocate from
39
+ * @map: the new map (as translated addr & size) to allocate in the iova region
40
+ * @iova_begin: the minimum address of the allocation
41
+ * @iova_end: the maximum addressable direction of the allocation
42
+ *
43
+ * Allocates a new region of a given size, between iova_min and iova_max.
44
+ *
45
+ * Return: Same as iova_tree_insert, but cannot overlap and can return error if
46
+ * iova tree is out of free contiguous range. The caller gets the assigned iova
47
+ * in map->iova.
48
+ */
49
+int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin,
50
+ hwaddr iova_end);
51
+
52
+/**
53
* iova_tree_destroy:
54
*
55
* @tree: the iova tree to destroy
56
diff --git a/util/iova-tree.c b/util/iova-tree.c
57
index XXXXXXX..XXXXXXX 100644
58
--- a/util/iova-tree.c
59
+++ b/util/iova-tree.c
60
@@ -XXX,XX +XXX,XX @@ struct IOVATree {
61
GTree *tree;
62
};
63
64
+/* Args to pass to iova_tree_alloc foreach function. */
65
+struct IOVATreeAllocArgs {
66
+ /* Size of the desired allocation */
67
+ size_t new_size;
68
+
69
+ /* The minimum address allowed in the allocation */
70
+ hwaddr iova_begin;
71
+
72
+ /* Map at the left of the hole, can be NULL if "this" is first one */
73
+ const DMAMap *prev;
74
+
75
+ /* Map at the right of the hole, can be NULL if "prev" is the last one */
76
+ const DMAMap *this;
77
+
78
+ /* If found, we fill in the IOVA here */
79
+ hwaddr iova_result;
80
+
81
+ /* Whether have we found a valid IOVA */
82
+ bool iova_found;
83
+};
84
+
85
+/**
86
+ * Iterate args to the next hole
87
+ *
88
+ * @args: The alloc arguments
89
+ * @next: The next mapping in the tree. Can be NULL to signal the last one
90
+ */
91
+static void iova_tree_alloc_args_iterate(struct IOVATreeAllocArgs *args,
92
+ const DMAMap *next)
93
+{
94
+ args->prev = args->this;
95
+ args->this = next;
96
+}
97
+
98
static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
99
{
100
const DMAMap *m1 = a, *m2 = b;
101
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map)
102
return IOVA_OK;
47
}
103
}
48
104
49
+static void
105
+/**
50
+e1000_receiver_overrun(E1000State *s, size_t size)
106
+ * Try to find an unallocated IOVA range between prev and this elements.
51
+{
107
+ *
52
+ trace_e1000_receiver_overrun(size, s->mac_reg[RDH], s->mac_reg[RDT]);
108
+ * @args: Arguments to allocation
53
+ e1000x_inc_reg_if_not_full(s->mac_reg, RNBC);
109
+ *
54
+ e1000x_inc_reg_if_not_full(s->mac_reg, MPC);
110
+ * Cases:
55
+ set_ics(s, 0, E1000_ICS_RXO);
111
+ *
56
+}
112
+ * (1) !prev, !this: No entries allocated, always succeed
57
+
113
+ *
58
static ssize_t
114
+ * (2) !prev, this: We're iterating at the 1st element.
59
e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
115
+ *
116
+ * (3) prev, !this: We're iterating at the last element.
117
+ *
118
+ * (4) prev, this: this is the most common case, we'll try to find a hole
119
+ * between "prev" and "this" mapping.
120
+ *
121
+ * Note that this function assumes the last valid iova is HWADDR_MAX, but it
122
+ * searches linearly so it's easy to discard the result if it's not the case.
123
+ */
124
+static void iova_tree_alloc_map_in_hole(struct IOVATreeAllocArgs *args)
125
+{
126
+ const DMAMap *prev = args->prev, *this = args->this;
127
+ uint64_t hole_start, hole_last;
128
+
129
+ if (this && this->iova + this->size < args->iova_begin) {
130
+ return;
131
+ }
132
+
133
+ hole_start = MAX(prev ? prev->iova + prev->size + 1 : 0, args->iova_begin);
134
+ hole_last = this ? this->iova : HWADDR_MAX;
135
+
136
+ if (hole_last - hole_start > args->new_size) {
137
+ args->iova_result = hole_start;
138
+ args->iova_found = true;
139
+ }
140
+}
141
+
142
+/**
143
+ * Foreach dma node in the tree, compare if there is a hole with its previous
144
+ * node (or minimum iova address allowed) and the node.
145
+ *
146
+ * @key: Node iterating
147
+ * @value: Node iterating
148
+ * @pargs: Struct to communicate with the outside world
149
+ *
150
+ * Return: false to keep iterating, true if needs break.
151
+ */
152
+static gboolean iova_tree_alloc_traverse(gpointer key, gpointer value,
153
+ gpointer pargs)
154
+{
155
+ struct IOVATreeAllocArgs *args = pargs;
156
+ DMAMap *node = value;
157
+
158
+ assert(key == value);
159
+
160
+ iova_tree_alloc_args_iterate(args, node);
161
+ iova_tree_alloc_map_in_hole(args);
162
+ return args->iova_found;
163
+}
164
+
165
+int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin,
166
+ hwaddr iova_last)
167
+{
168
+ struct IOVATreeAllocArgs args = {
169
+ .new_size = map->size,
170
+ .iova_begin = iova_begin,
171
+ };
172
+
173
+ if (unlikely(iova_last < iova_begin)) {
174
+ return IOVA_ERR_INVALID;
175
+ }
176
+
177
+ /*
178
+ * Find a valid hole for the mapping
179
+ *
180
+ * Assuming low iova_begin, so no need to do a binary search to
181
+ * locate the first node.
182
+ *
183
+ * TODO: Replace all this with g_tree_node_first/next/last when available
184
+ * (from glib since 2.68). To do it with g_tree_foreach complicates the
185
+ * code a lot.
186
+ *
187
+ */
188
+ g_tree_foreach(tree->tree, iova_tree_alloc_traverse, &args);
189
+ if (!args.iova_found) {
190
+ /*
191
+ * Either tree is empty or the last hole is still not checked.
192
+ * g_tree_foreach does not compare (last, iova_last] range, so we check
193
+ * it here.
194
+ */
195
+ iova_tree_alloc_args_iterate(&args, NULL);
196
+ iova_tree_alloc_map_in_hole(&args);
197
+ }
198
+
199
+ if (!args.iova_found || args.iova_result + map->size > iova_last) {
200
+ return IOVA_ERR_NOMEM;
201
+ }
202
+
203
+ map->iova = args.iova_result;
204
+ return iova_tree_insert(tree, map);
205
+}
206
+
207
void iova_tree_destroy(IOVATree *tree)
60
{
208
{
61
@@ -XXX,XX +XXX,XX @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
209
g_tree_destroy(tree->tree);
62
desc_offset = 0;
63
total_size = size + e1000x_fcs_len(s->mac_reg);
64
if (!e1000_has_rxbufs(s, total_size)) {
65
- set_ics(s, 0, E1000_ICS_RXO);
66
- return -1;
67
+ e1000_receiver_overrun(s, total_size);
68
+ return -1;
69
}
70
do {
71
desc_size = total_size - desc_offset;
72
@@ -XXX,XX +XXX,XX @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
73
rdh_start >= s->mac_reg[RDLEN] / sizeof(desc)) {
74
DBGOUT(RXERR, "RDH wraparound @%x, RDT %x, RDLEN %x\n",
75
rdh_start, s->mac_reg[RDT], s->mac_reg[RDLEN]);
76
- set_ics(s, 0, E1000_ICS_RXO);
77
+ e1000_receiver_overrun(s, total_size);
78
return -1;
79
}
80
} while (desc_offset < total_size);
81
diff --git a/hw/net/trace-events b/hw/net/trace-events
82
index XXXXXXX..XXXXXXX 100644
83
--- a/hw/net/trace-events
84
+++ b/hw/net/trace-events
85
@@ -XXX,XX +XXX,XX @@ net_rx_pkt_rss_ip6_ex(void) "Calculating IPv6/EX RSS hash"
86
net_rx_pkt_rss_hash(size_t rss_length, uint32_t rss_hash) "RSS hash for %zu bytes: 0x%X"
87
net_rx_pkt_rss_add_chunk(void* ptr, size_t size, size_t input_offset) "Add RSS chunk %p, %zu bytes, RSS input offset %zu bytes"
88
89
+# hw/net/e1000.c
90
+e1000_receiver_overrun(size_t s, uint32_t rdh, uint32_t rdt) "Receiver overrun: dropped packet of %lu bytes, RDH=%u, RDT=%u"
91
+
92
# hw/net/e1000x_common.c
93
e1000x_rx_can_recv_disabled(bool link_up, bool rx_enabled, bool pci_master) "link_up: %d, rx_enabled %d, pci_master %d"
94
e1000x_vlan_is_vlan_pkt(bool is_vlan_pkt, uint16_t eth_proto, uint16_t vet) "Is VLAN packet: %d, ETH proto: 0x%X, VET: 0x%X"
95
--
210
--
96
2.5.0
211
2.7.4
97
212
98
213
diff view generated by jsdifflib
1
From: Zhang Chen <zhangckid@gmail.com>
1
From: Eugenio Pérez <eperezma@redhat.com>
2
2
3
Libvirt or other high-level software can use this command to query COLO status.
3
This function does the reverse operation of iova_tree_find: it looks for
4
You can test this command like this:
4
a mapping that matches a translated address, so we can do the reverse lookup.
5
{'execute':'query-colo-status'}
6
5
7
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
6
This has linear complexity instead of logarithmic, but it supports
8
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
7
overlapping HVAs. Future developments could reduce this cost.
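
A minimal sketch of the reverse lookup (illustrative only; the helper name and
its arguments are made up, mirroring how a later patch in this series uses the
new function):

    /* Illustrative only: translate a qemu VA back to the iova stored in the tree. */
    static bool va_to_iova(const IOVATree *tree, void *vaddr, size_t len,
                           hwaddr *iova)
    {
        DMAMap needle = {
            .translated_addr = (hwaddr)(uintptr_t)vaddr,
            .size = len - 1,
        };
        const DMAMap *map = iova_tree_find_iova(tree, &needle);

        if (!map) {
            return false;
        }

        /* The stored map may cover more than the needle; keep the offset */
        *iova = map->iova + (needle.translated_addr - map->translated_addr);
        return true;
    }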
8
9
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
10
Acked-by: Michael S. Tsirkin <mst@redhat.com>
9
Signed-off-by: Jason Wang <jasowang@redhat.com>
11
Signed-off-by: Jason Wang <jasowang@redhat.com>
10
---
12
---
11
migration/colo.c | 21 +++++++++++++++++++++
13
include/qemu/iova-tree.h | 20 +++++++++++++++++++-
12
qapi/migration.json | 32 ++++++++++++++++++++++++++++++++
14
util/iova-tree.c | 34 ++++++++++++++++++++++++++++++++++
13
2 files changed, 53 insertions(+)
15
2 files changed, 53 insertions(+), 1 deletion(-)
14
16
15
diff --git a/migration/colo.c b/migration/colo.c
17
diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
16
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
17
--- a/migration/colo.c
19
--- a/include/qemu/iova-tree.h
18
+++ b/migration/colo.c
20
+++ b/include/qemu/iova-tree.h
19
@@ -XXX,XX +XXX,XX @@
21
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map);
20
#include "net/colo.h"
22
* @tree: the iova tree to search from
21
#include "block/block.h"
23
* @map: the mapping to search
22
#include "qapi/qapi-events-migration.h"
24
*
23
+#include "qapi/qmp/qerror.h"
25
- * Search for a mapping in the iova tree that overlaps with the
24
26
+ * Search for a mapping in the iova tree that iova overlaps with the
25
static bool vmstate_loading;
27
* mapping range specified. Only the first found mapping will be
26
static Notifier packets_compare_notifier;
28
* returned.
27
@@ -XXX,XX +XXX,XX @@ void qmp_xen_colo_do_checkpoint(Error **errp)
29
*
28
#endif
30
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map);
31
const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map);
32
33
/**
34
+ * iova_tree_find_iova:
35
+ *
36
+ * @tree: the iova tree to search from
37
+ * @map: the mapping to search
38
+ *
39
+ * Search for a mapping in the iova tree that translated_addr overlaps with the
40
+ * mapping range specified. Only the first found mapping will be
41
+ * returned.
42
+ *
43
+ * Return: DMAMap pointer if found, or NULL if not found. Note that
44
+ * the returned DMAMap pointer is maintained internally. User should
45
+ * only read the content but never modify or free the content. Also,
46
+ * user is responsible to make sure the pointer is valid (say, no
47
+ * concurrent deletion in progress).
48
+ */
49
+const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map);
50
+
51
+/**
52
* iova_tree_find_address:
53
*
54
* @tree: the iova tree to search from
55
diff --git a/util/iova-tree.c b/util/iova-tree.c
56
index XXXXXXX..XXXXXXX 100644
57
--- a/util/iova-tree.c
58
+++ b/util/iova-tree.c
59
@@ -XXX,XX +XXX,XX @@ struct IOVATreeAllocArgs {
60
bool iova_found;
61
};
62
63
+typedef struct IOVATreeFindIOVAArgs {
64
+ const DMAMap *needle;
65
+ const DMAMap *result;
66
+} IOVATreeFindIOVAArgs;
67
+
68
/**
69
* Iterate args to the next hole
70
*
71
@@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map)
72
return g_tree_lookup(tree->tree, map);
29
}
73
}
30
74
31
+COLOStatus *qmp_query_colo_status(Error **errp)
75
+static gboolean iova_tree_find_address_iterator(gpointer key, gpointer value,
76
+ gpointer data)
32
+{
77
+{
33
+ COLOStatus *s = g_new0(COLOStatus, 1);
78
+ const DMAMap *map = key;
79
+ IOVATreeFindIOVAArgs *args = data;
80
+ const DMAMap *needle;
34
+
81
+
35
+ s->mode = get_colo_mode();
82
+ g_assert(key == value);
36
+
83
+
37
+ switch (failover_get_state()) {
84
+ needle = args->needle;
38
+ case FAILOVER_STATUS_NONE:
85
+ if (map->translated_addr + map->size < needle->translated_addr ||
39
+ s->reason = COLO_EXIT_REASON_NONE;
86
+ needle->translated_addr + needle->size < map->translated_addr) {
40
+ break;
87
+ return false;
41
+ case FAILOVER_STATUS_REQUIRE:
42
+ s->reason = COLO_EXIT_REASON_REQUEST;
43
+ break;
44
+ default:
45
+ s->reason = COLO_EXIT_REASON_ERROR;
46
+ }
88
+ }
47
+
89
+
48
+ return s;
90
+ args->result = map;
91
+ return true;
49
+}
92
+}
50
+
93
+
51
static void colo_send_message(QEMUFile *f, COLOMessage msg,
94
+const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map)
52
Error **errp)
95
+{
96
+ IOVATreeFindIOVAArgs args = {
97
+ .needle = map,
98
+ };
99
+
100
+ g_tree_foreach(tree->tree, iova_tree_find_address_iterator, &args);
101
+ return args.result;
102
+}
103
+
104
const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova)
53
{
105
{
54
diff --git a/qapi/migration.json b/qapi/migration.json
106
const DMAMap map = { .iova = iova, .size = 0 };
55
index XXXXXXX..XXXXXXX 100644
56
--- a/qapi/migration.json
57
+++ b/qapi/migration.json
58
@@ -XXX,XX +XXX,XX @@
59
{ 'command': 'xen-colo-do-checkpoint' }
60
61
##
62
+# @COLOStatus:
63
+#
64
+# The result format for 'query-colo-status'.
65
+#
66
+# @mode: COLO running mode. If COLO is running, this field will return
67
+# 'primary' or 'secondary'.
68
+#
69
+# @reason: describes the reason for the COLO exit.
70
+#
71
+# Since: 3.0
72
+##
73
+{ 'struct': 'COLOStatus',
74
+ 'data': { 'mode': 'COLOMode', 'reason': 'COLOExitReason' } }
75
+
76
+##
77
+# @query-colo-status:
78
+#
79
+# Query COLO status while the vm is running.
80
+#
81
+# Returns: A @COLOStatus object showing the status.
82
+#
83
+# Example:
84
+#
85
+# -> { "execute": "query-colo-status" }
86
+# <- { "return": { "mode": "primary", "active": true, "reason": "request" } }
87
+#
88
+# Since: 3.0
89
+##
90
+{ 'command': 'query-colo-status',
91
+ 'returns': 'COLOStatus' }
92
+
93
+##
94
# @migrate-recover:
95
#
96
# Provide a recovery migration stream URI.
97
--
107
--
98
2.5.0
108
2.7.4
99
109
100
110
diff view generated by jsdifflib
1
From: Zhang Chen <zhangckid@gmail.com>
1
From: Eugenio Pérez <eperezma@redhat.com>
2
2
3
After one round of checkpoint, the states of PVM and SVM
3
This tree is able to look for a translated address from an IOVA address.
4
become consistent, so it is unnecessary to adjust the sequence
5
of net packets for old connections. Besides, when failover
6
happens, filter-rewriter will enter failover mode and needn't
7
handle new TCP connections.
8
4
9
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
5
At first glance it is similar to util/iova-tree. However, SVQ working on
10
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
6
devices with limited IOVA space needs more capabilities, like allocating
11
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
7
IOVA chunks or performing reverse translations (qemu addresses to iova).
8
9
The allocation capability, as "assign a free IOVA address to this chunk
10
of memory in qemu's address space", allows the shadow virtqueue to create a
11
new address space that is not restricted by the guest's addressable one, so
12
we can allocate shadow vqs vrings outside of it.
13
14
It duplicates the tree so it can search efficiently in both directions,
15
and it will signal overlap if iova or the translated address is present
16
in any tree.
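
Not part of the patch: a minimal sketch of how a user of this header is
expected to drive the tree, assuming placeholder values for the iova window
and the mapped buffer, and using only the functions declared in
vhost-iova-tree.h below.

    #include "qemu/osdep.h"
    #include "hw/virtio/vhost-iova-tree.h"

    /*
     * Illustrative only: map one qemu buffer into a device-visible iova,
     * then tear it down.  iova_first/iova_last and buf/size are placeholders.
     */
    static void iova_tree_example(hwaddr iova_first, hwaddr iova_last,
                                  void *buf, size_t size)
    {
        VhostIOVATree *t = vhost_iova_tree_new(iova_first, iova_last);
        DMAMap map = {
            .translated_addr = (hwaddr)(uintptr_t)buf,
            .size = size - 1,                 /* sizes are "length - 1" here */
            .perm = IOMMU_RW,
        };

        /* Pick a free iova for the buffer and remember the translation */
        if (vhost_iova_tree_map_alloc(t, &map) == IOVA_OK) {
            /* ... program the device with map.iova ... */
        }

        /* Later: reverse-translate from the qemu address and drop the entry */
        const DMAMap *found = vhost_iova_tree_find_iova(t, &map);
        if (found) {
            vhost_iova_tree_remove(t, found);
        }

        vhost_iova_tree_delete(t);
    }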
17
18
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
19
Acked-by: Michael S. Tsirkin <mst@redhat.com>
12
Signed-off-by: Jason Wang <jasowang@redhat.com>
20
Signed-off-by: Jason Wang <jasowang@redhat.com>
13
---
21
---
14
net/colo-compare.c | 12 +++++------
22
hw/virtio/meson.build | 2 +-
15
net/colo.c | 8 ++++++++
23
hw/virtio/vhost-iova-tree.c | 110 ++++++++++++++++++++++++++++++++++++++++++++
16
net/colo.h | 2 ++
24
hw/virtio/vhost-iova-tree.h | 27 +++++++++++
17
net/filter-rewriter.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++
25
3 files changed, 138 insertions(+), 1 deletion(-)
18
4 files changed, 73 insertions(+), 6 deletions(-)
26
create mode 100644 hw/virtio/vhost-iova-tree.c
27
create mode 100644 hw/virtio/vhost-iova-tree.h
19
28
20
diff --git a/net/colo-compare.c b/net/colo-compare.c
29
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
21
index XXXXXXX..XXXXXXX 100644
30
index XXXXXXX..XXXXXXX 100644
22
--- a/net/colo-compare.c
31
--- a/hw/virtio/meson.build
23
+++ b/net/colo-compare.c
32
+++ b/hw/virtio/meson.build
24
@@ -XXX,XX +XXX,XX @@ enum {
33
@@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
25
SECONDARY_IN,
34
26
};
35
virtio_ss = ss.source_set()
27
36
virtio_ss.add(files('virtio.c'))
28
+static void colo_compare_inconsistency_notify(void)
37
-virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
38
+virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c'))
39
virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
40
virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
41
virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
42
diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
43
new file mode 100644
44
index XXXXXXX..XXXXXXX
45
--- /dev/null
46
+++ b/hw/virtio/vhost-iova-tree.c
47
@@ -XXX,XX +XXX,XX @@
48
+/*
49
+ * vhost software live migration iova tree
50
+ *
51
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
52
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
53
+ *
54
+ * SPDX-License-Identifier: GPL-2.0-or-later
55
+ */
56
+
57
+#include "qemu/osdep.h"
58
+#include "qemu/iova-tree.h"
59
+#include "vhost-iova-tree.h"
60
+
61
+#define iova_min_addr qemu_real_host_page_size
62
+
63
+/**
64
+ * VhostIOVATree, able to:
65
+ * - Translate iova address
66
+ * - Reverse translate iova address (from translated to iova)
67
+ * - Allocate IOVA regions for translated range (linear operation)
68
+ */
69
+struct VhostIOVATree {
70
+ /* First addressable iova address in the device */
71
+ uint64_t iova_first;
72
+
73
+ /* Last addressable iova address in the device */
74
+ uint64_t iova_last;
75
+
76
+ /* IOVA address to qemu memory maps. */
77
+ IOVATree *iova_taddr_map;
78
+};
79
+
80
+/**
81
+ * Create a new IOVA tree
82
+ *
83
+ * Returns the new IOVA tree
84
+ */
85
+VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last)
29
+{
86
+{
30
+ notifier_list_notify(&colo_compare_notifiers,
87
+ VhostIOVATree *tree = g_new(VhostIOVATree, 1);
31
+ migrate_get_current());
88
+
89
+ /* Some devices do not like 0 addresses */
90
+ tree->iova_first = MAX(iova_first, iova_min_addr);
91
+ tree->iova_last = iova_last;
92
+
93
+ tree->iova_taddr_map = iova_tree_new();
94
+ return tree;
32
+}
95
+}
33
+
96
+
34
static int compare_chr_send(CompareState *s,
97
+/**
35
const uint8_t *buf,
98
+ * Delete an iova tree
36
uint32_t size,
99
+ */
37
@@ -XXX,XX +XXX,XX @@ static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt,
100
+void vhost_iova_tree_delete(VhostIOVATree *iova_tree)
38
return false;
39
}
40
41
-static void colo_compare_inconsistency_notify(void)
42
-{
43
- notifier_list_notify(&colo_compare_notifiers,
44
- migrate_get_current());
45
-}
46
-
47
static void colo_compare_tcp(CompareState *s, Connection *conn)
48
{
49
Packet *ppkt = NULL, *spkt = NULL;
50
diff --git a/net/colo.c b/net/colo.c
51
index XXXXXXX..XXXXXXX 100644
52
--- a/net/colo.c
53
+++ b/net/colo.c
54
@@ -XXX,XX +XXX,XX @@ Connection *connection_get(GHashTable *connection_track_table,
55
56
return conn;
57
}
58
+
59
+bool connection_has_tracked(GHashTable *connection_track_table,
60
+ ConnectionKey *key)
61
+{
101
+{
62
+ Connection *conn = g_hash_table_lookup(connection_track_table, key);
102
+ iova_tree_destroy(iova_tree->iova_taddr_map);
63
+
103
+ g_free(iova_tree);
64
+ return conn ? true : false;
65
+}
66
diff --git a/net/colo.h b/net/colo.h
67
index XXXXXXX..XXXXXXX 100644
68
--- a/net/colo.h
69
+++ b/net/colo.h
70
@@ -XXX,XX +XXX,XX @@ void connection_destroy(void *opaque);
71
Connection *connection_get(GHashTable *connection_track_table,
72
ConnectionKey *key,
73
GQueue *conn_list);
74
+bool connection_has_tracked(GHashTable *connection_track_table,
75
+ ConnectionKey *key);
76
void connection_hashtable_reset(GHashTable *connection_track_table);
77
Packet *packet_new(const void *data, int size, int vnet_hdr_len);
78
void packet_destroy(void *opaque, void *user_data);
79
diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c
80
index XXXXXXX..XXXXXXX 100644
81
--- a/net/filter-rewriter.c
82
+++ b/net/filter-rewriter.c
83
@@ -XXX,XX +XXX,XX @@
84
#include "qemu/main-loop.h"
85
#include "qemu/iov.h"
86
#include "net/checksum.h"
87
+#include "net/colo.h"
88
+#include "migration/colo.h"
89
90
#define FILTER_COLO_REWRITER(obj) \
91
OBJECT_CHECK(RewriterState, (obj), TYPE_FILTER_REWRITER)
92
93
#define TYPE_FILTER_REWRITER "filter-rewriter"
94
+#define FAILOVER_MODE_ON true
95
+#define FAILOVER_MODE_OFF false
96
97
typedef struct RewriterState {
98
NetFilterState parent_obj;
99
@@ -XXX,XX +XXX,XX @@ typedef struct RewriterState {
100
/* hashtable to save connection */
101
GHashTable *connection_track_table;
102
bool vnet_hdr;
103
+ bool failover_mode;
104
} RewriterState;
105
106
+static void filter_rewriter_failover_mode(RewriterState *s)
107
+{
108
+ s->failover_mode = FAILOVER_MODE_ON;
109
+}
104
+}
110
+
105
+
111
static void filter_rewriter_flush(NetFilterState *nf)
106
+/**
112
{
107
+ * Find the IOVA address stored from a memory address
113
RewriterState *s = FILTER_COLO_REWRITER(nf);
108
+ *
114
@@ -XXX,XX +XXX,XX @@ static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
109
+ * @tree: The iova tree
115
*/
110
+ * @map: The map with the memory address
116
reverse_connection_key(&key);
111
+ *
117
}
112
+ * Return the stored mapping, or NULL if not found.
118
+
113
+ */
119
+ /* After failover we needn't change new TCP packet */
114
+const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree,
120
+ if (s->failover_mode &&
115
+ const DMAMap *map)
121
+ !connection_has_tracked(s->connection_track_table, &key)) {
122
+ goto out;
123
+ }
124
+
125
conn = connection_get(s->connection_track_table,
126
&key,
127
NULL);
128
@@ -XXX,XX +XXX,XX @@ static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
129
}
130
}
131
132
+out:
133
packet_destroy(pkt, NULL);
134
pkt = NULL;
135
return 0;
136
}
137
138
+static void reset_seq_offset(gpointer key, gpointer value, gpointer user_data)
139
+{
116
+{
140
+ Connection *conn = (Connection *)value;
117
+ return iova_tree_find_iova(tree->iova_taddr_map, map);
141
+
142
+ conn->offset = 0;
143
+}
118
+}
144
+
119
+
145
+static gboolean offset_is_nonzero(gpointer key,
120
+/**
146
+ gpointer value,
121
+ * Allocate a new mapping
147
+ gpointer user_data)
122
+ *
123
+ * @tree: The iova tree
124
+ * @map: The iova map
125
+ *
126
+ * Returns:
127
+ * - IOVA_OK if the map fits in the container
128
+ * - IOVA_ERR_INVALID if the map does not make sense (like size overflow)
129
+ * - IOVA_ERR_NOMEM if tree cannot allocate more space.
130
+ *
131
+ * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK.
132
+ */
133
+int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map)
148
+{
134
+{
149
+ Connection *conn = (Connection *)value;
135
+ /* Some vhost devices do not like addr 0. Skip first page */
136
+ hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size;
150
+
137
+
151
+ return conn->offset ? true : false;
138
+ if (map->translated_addr + map->size < map->translated_addr ||
139
+ map->perm == IOMMU_NONE) {
140
+ return IOVA_ERR_INVALID;
141
+ }
142
+
143
+ /* Allocate a node in IOVA address */
144
+ return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first,
145
+ tree->iova_last);
152
+}
146
+}
153
+
147
+
154
+static void colo_rewriter_handle_event(NetFilterState *nf, int event,
148
+/**
155
+ Error **errp)
149
+ * Remove existing mappings from iova tree
150
+ *
151
+ * @iova_tree: The vhost iova tree
152
+ * @map: The map to remove
153
+ */
154
+void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map)
156
+{
155
+{
157
+ RewriterState *rs = FILTER_COLO_REWRITER(nf);
156
+ iova_tree_remove(iova_tree->iova_taddr_map, map);
157
+}
158
diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
159
new file mode 100644
160
index XXXXXXX..XXXXXXX
161
--- /dev/null
162
+++ b/hw/virtio/vhost-iova-tree.h
163
@@ -XXX,XX +XXX,XX @@
164
+/*
165
+ * vhost software live migration iova tree
166
+ *
167
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
168
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
169
+ *
170
+ * SPDX-License-Identifier: GPL-2.0-or-later
171
+ */
158
+
172
+
159
+ switch (event) {
173
+#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H
160
+ case COLO_EVENT_CHECKPOINT:
174
+#define HW_VIRTIO_VHOST_IOVA_TREE_H
161
+ g_hash_table_foreach(rs->connection_track_table,
162
+ reset_seq_offset, NULL);
163
+ break;
164
+ case COLO_EVENT_FAILOVER:
165
+ if (!g_hash_table_find(rs->connection_track_table,
166
+ offset_is_nonzero, NULL)) {
167
+ filter_rewriter_failover_mode(rs);
168
+ }
169
+ break;
170
+ default:
171
+ break;
172
+ }
173
+}
174
+
175
+
175
static void colo_rewriter_cleanup(NetFilterState *nf)
176
+#include "qemu/iova-tree.h"
176
{
177
+#include "exec/memory.h"
177
RewriterState *s = FILTER_COLO_REWRITER(nf);
178
+
178
@@ -XXX,XX +XXX,XX @@ static void filter_rewriter_init(Object *obj)
179
+typedef struct VhostIOVATree VhostIOVATree;
179
RewriterState *s = FILTER_COLO_REWRITER(obj);
180
+
180
181
+VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last);
181
s->vnet_hdr = false;
182
+void vhost_iova_tree_delete(VhostIOVATree *iova_tree);
182
+ s->failover_mode = FAILOVER_MODE_OFF;
183
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete);
183
object_property_add_bool(obj, "vnet_hdr_support",
184
+
184
filter_rewriter_get_vnet_hdr,
185
+const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree,
185
filter_rewriter_set_vnet_hdr, NULL);
186
+ const DMAMap *map);
186
@@ -XXX,XX +XXX,XX @@ static void colo_rewriter_class_init(ObjectClass *oc, void *data)
187
+int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map);
187
nfc->setup = colo_rewriter_setup;
188
+void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map);
188
nfc->cleanup = colo_rewriter_cleanup;
189
+
189
nfc->receive_iov = colo_rewriter_receive_iov;
190
+#endif
190
+ nfc->handle_event = colo_rewriter_handle_event;
191
}
192
193
static const TypeInfo colo_rewriter_info = {
194
--
191
--
195
2.5.0
192
2.7.4
196
193
197
194
diff view generated by jsdifflib
1
From: Zhang Chen <zhangckid@gmail.com>
1
From: Eugenio Pérez <eperezma@redhat.com>
2
2
3
While the VM is running, PVM may dirty some pages; we will transfer
3
Use translations added in VhostIOVATree in SVQ.
4
PVM's dirty pages to SVM and store them into SVM's RAM cache at the next checkpoint
4
5
time. So, the content of SVM's RAM cache will always be the same as PVM's memory
5
Only introduce usage here, not allocation and deallocation. As with
6
after checkpoint.
6
previous patches, we use the dead code paths of shadow_vqs_enabled to
7
7
avoid committing too many changes at once. These are impossible to take
8
Instead of flushing all the content of PVM's RAM cache into SVM's memory,
8
at the moment.
9
we do this in a more efficient way:
9
10
Only flush any page that dirtied by PVM since last checkpoint.
10
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
11
In this way, we can ensure SVM's memory same with PVM's.
11
Acked-by: Michael S. Tsirkin <mst@redhat.com>
12
13
Besides, we must ensure flush RAM cache before load device state.
14
15
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
16
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
17
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
18
Signed-off-by: Jason Wang <jasowang@redhat.com>
12
Signed-off-by: Jason Wang <jasowang@redhat.com>
19
---
13
---
20
migration/ram.c | 37 +++++++++++++++++++++++++++++++++++++
14
hw/virtio/vhost-shadow-virtqueue.c | 86 +++++++++++++++++++++++---
21
migration/trace-events | 2 ++
15
hw/virtio/vhost-shadow-virtqueue.h | 6 +-
22
2 files changed, 39 insertions(+)
16
hw/virtio/vhost-vdpa.c | 122 +++++++++++++++++++++++++++++++------
23
17
include/hw/virtio/vhost-vdpa.h | 3 +
24
diff --git a/migration/ram.c b/migration/ram.c
18
4 files changed, 187 insertions(+), 30 deletions(-)
19
20
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
25
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
26
--- a/migration/ram.c
22
--- a/hw/virtio/vhost-shadow-virtqueue.c
27
+++ b/migration/ram.c
23
+++ b/hw/virtio/vhost-shadow-virtqueue.c
28
@@ -XXX,XX +XXX,XX @@ static bool postcopy_is_running(void)
24
@@ -XXX,XX +XXX,XX @@ static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
29
return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
25
return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
30
}
26
}
31
27
32
+/*
28
-static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
33
+ * Flush content of RAM cache into SVM's memory.
29
+/**
34
+ * Only flush the pages that be dirtied by PVM or SVM or both.
30
+ * Translate addresses between the qemu's virtual address and the SVQ IOVA
31
+ *
32
+ * @svq: Shadow VirtQueue
33
+ * @vaddr: Translated IOVA addresses
34
+ * @iovec: Source qemu's VA addresses
35
+ * @num: Length of iovec and minimum length of vaddr
35
+ */
36
+ */
36
+static void colo_flush_ram_cache(void)
37
+static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
38
+ hwaddr *addrs, const struct iovec *iovec,
39
+ size_t num)
37
+{
40
+{
38
+ RAMBlock *block = NULL;
41
+ if (num == 0) {
39
+ void *dst_host;
42
+ return true;
40
+ void *src_host;
43
+ }
41
+ unsigned long offset = 0;
44
+
42
+
45
+ for (size_t i = 0; i < num; ++i) {
43
+ trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
46
+ DMAMap needle = {
44
+ rcu_read_lock();
47
+ .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
45
+ block = QLIST_FIRST_RCU(&ram_list.blocks);
48
+ .size = iovec[i].iov_len,
46
+
49
+ };
47
+ while (block) {
50
+ Int128 needle_last, map_last;
48
+ offset = migration_bitmap_find_dirty(ram_state, block, offset);
51
+ size_t off;
49
+
52
+
50
+ if (offset << TARGET_PAGE_BITS >= block->used_length) {
53
+ const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
51
+ offset = 0;
54
+ /*
52
+ block = QLIST_NEXT_RCU(block, next);
55
+ * Map cannot be NULL since iova map contains all guest space and
53
+ } else {
56
+ * qemu already has a physical address mapped
54
+ migration_bitmap_clear_dirty(ram_state, block, offset);
57
+ */
55
+ dst_host = block->host + (offset << TARGET_PAGE_BITS);
58
+ if (unlikely(!map)) {
56
+ src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
59
+ qemu_log_mask(LOG_GUEST_ERROR,
57
+ memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
60
+ "Invalid address 0x%"HWADDR_PRIx" given by guest",
61
+ needle.translated_addr);
62
+ return false;
58
+ }
63
+ }
59
+ }
64
+
60
+
65
+ off = needle.translated_addr - map->translated_addr;
61
+ rcu_read_unlock();
66
+ addrs[i] = map->iova + off;
62
+ trace_colo_flush_ram_cache_end();
67
+
68
+ needle_last = int128_add(int128_make64(needle.translated_addr),
69
+ int128_make64(iovec[i].iov_len));
70
+ map_last = int128_make64(map->translated_addr + map->size);
71
+ if (unlikely(int128_gt(needle_last, map_last))) {
72
+ qemu_log_mask(LOG_GUEST_ERROR,
73
+ "Guest buffer expands over iova range");
74
+ return false;
75
+ }
76
+ }
77
+
78
+ return true;
63
+}
79
+}
64
+
80
+
65
static int ram_load(QEMUFile *f, void *opaque, int version_id)
81
+static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
66
{
82
const struct iovec *iovec, size_t num,
67
int flags = 0, ret = 0, invalid_flags = 0;
83
bool more_descs, bool write)
68
@@ -XXX,XX +XXX,XX @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
84
{
69
ret |= wait_for_decompress_done();
85
@@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
70
rcu_read_unlock();
86
} else {
71
trace_ram_load_complete(ret, seq_iter);
87
descs[i].flags = flags;
72
+
88
}
73
+ if (!ret && migration_incoming_in_colo_state()) {
89
- descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base);
74
+ colo_flush_ram_cache();
90
+ descs[i].addr = cpu_to_le64(sg[n]);
75
+ }
91
descs[i].len = cpu_to_le32(iovec[n].iov_len);
76
return ret;
92
93
last = i;
94
@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
95
{
96
unsigned avail_idx;
97
vring_avail_t *avail = svq->vring.avail;
98
+ bool ok;
99
+ g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num));
100
101
*head = svq->free_head;
102
103
@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
104
return false;
105
}
106
107
- vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0,
108
- false);
109
- vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
110
+ ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
111
+ if (unlikely(!ok)) {
112
+ return false;
113
+ }
114
+ vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
115
+ elem->in_num > 0, false);
116
+
117
+
118
+ ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
119
+ if (unlikely(!ok)) {
120
+ return false;
121
+ }
122
+
123
+ vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
124
125
/*
126
* Put the entry in the available array (but don't update avail->idx until
127
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
128
void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
129
struct vhost_vring_addr *addr)
130
{
131
- addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc;
132
- addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail;
133
- addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used;
134
+ addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc;
135
+ addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail;
136
+ addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used;
77
}
137
}
78
138
79
diff --git a/migration/trace-events b/migration/trace-events
139
size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
140
@@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
141
* Creates vhost shadow virtqueue, and instructs the vhost device to use the
142
* shadow methods and file descriptors.
143
*
144
+ * @iova_tree: Tree to perform descriptors translations
145
+ *
146
* Returns the new virtqueue or NULL.
147
*
148
* In case of error, reason is reported through error_report.
149
*/
150
-VhostShadowVirtqueue *vhost_svq_new(void)
151
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
152
{
153
g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
154
int r;
155
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
156
157
event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
158
event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
159
+ svq->iova_tree = iova_tree;
160
return g_steal_pointer(&svq);
161
162
err_init_hdev_call:
163
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
80
index XXXXXXX..XXXXXXX 100644
164
index XXXXXXX..XXXXXXX 100644
81
--- a/migration/trace-events
165
--- a/hw/virtio/vhost-shadow-virtqueue.h
82
+++ b/migration/trace-events
166
+++ b/hw/virtio/vhost-shadow-virtqueue.h
83
@@ -XXX,XX +XXX,XX @@ ram_dirty_bitmap_sync_start(void) ""
167
@@ -XXX,XX +XXX,XX @@
84
ram_dirty_bitmap_sync_wait(void) ""
168
#include "qemu/event_notifier.h"
85
ram_dirty_bitmap_sync_complete(void) ""
169
#include "hw/virtio/virtio.h"
86
ram_state_resume_prepare(uint64_t v) "%" PRId64
170
#include "standard-headers/linux/vhost_types.h"
87
+colo_flush_ram_cache_begin(uint64_t dirty_pages) "dirty_pages %" PRIu64
171
+#include "hw/virtio/vhost-iova-tree.h"
88
+colo_flush_ram_cache_end(void) ""
172
89
173
/* Shadow virtqueue to relay notifications */
90
# migration/migration.c
174
typedef struct VhostShadowVirtqueue {
91
await_return_path_close_on_source_close(void) ""
175
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
176
/* Virtio device */
177
VirtIODevice *vdev;
178
179
+ /* IOVA mapping */
180
+ VhostIOVATree *iova_tree;
181
+
182
/* Map for use the guest's descriptors */
183
VirtQueueElement **ring_id_maps;
184
185
@@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
186
VirtQueue *vq);
187
void vhost_svq_stop(VhostShadowVirtqueue *svq);
188
189
-VhostShadowVirtqueue *vhost_svq_new(void);
190
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);
191
192
void vhost_svq_free(gpointer vq);
193
G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
194
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
195
index XXXXXXX..XXXXXXX 100644
196
--- a/hw/virtio/vhost-vdpa.c
197
+++ b/hw/virtio/vhost-vdpa.c
198
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
199
vaddr, section->readonly);
200
201
llsize = int128_sub(llend, int128_make64(iova));
202
+ if (v->shadow_vqs_enabled) {
203
+ DMAMap mem_region = {
204
+ .translated_addr = (hwaddr)(uintptr_t)vaddr,
205
+ .size = int128_get64(llsize) - 1,
206
+ .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
207
+ };
208
+
209
+ int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
210
+ if (unlikely(r != IOVA_OK)) {
211
+ error_report("Can't allocate a mapping (%d)", r);
212
+ goto fail;
213
+ }
214
+
215
+ iova = mem_region.iova;
216
+ }
217
218
vhost_vdpa_iotlb_batch_begin_once(v);
219
ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
220
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
221
222
llsize = int128_sub(llend, int128_make64(iova));
223
224
+ if (v->shadow_vqs_enabled) {
225
+ const DMAMap *result;
226
+ const void *vaddr = memory_region_get_ram_ptr(section->mr) +
227
+ section->offset_within_region +
228
+ (iova - section->offset_within_address_space);
229
+ DMAMap mem_region = {
230
+ .translated_addr = (hwaddr)(uintptr_t)vaddr,
231
+ .size = int128_get64(llsize) - 1,
232
+ };
233
+
234
+ result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
235
+ iova = result->iova;
236
+ vhost_iova_tree_remove(v->iova_tree, &mem_region);
237
+ }
238
vhost_vdpa_iotlb_batch_begin_once(v);
239
ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
240
if (ret) {
241
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
242
243
shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
244
for (unsigned n = 0; n < hdev->nvqs; ++n) {
245
- g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
246
+ g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
247
248
if (unlikely(!svq)) {
249
error_setg(errp, "Cannot create svq %u", n);
250
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
251
/**
252
* Unmap a SVQ area in the device
253
*/
254
-static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
255
- hwaddr size)
256
+static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
257
+ const DMAMap *needle)
258
{
259
+ const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
260
+ hwaddr size;
261
int r;
262
263
- size = ROUND_UP(size, qemu_real_host_page_size);
264
- r = vhost_vdpa_dma_unmap(v, iova, size);
265
+ if (unlikely(!result)) {
266
+ error_report("Unable to find SVQ address to unmap");
267
+ return false;
268
+ }
269
+
270
+ size = ROUND_UP(result->size, qemu_real_host_page_size);
271
+ r = vhost_vdpa_dma_unmap(v, result->iova, size);
272
return r == 0;
273
}
274
275
static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
276
const VhostShadowVirtqueue *svq)
277
{
278
+ DMAMap needle = {};
279
struct vhost_vdpa *v = dev->opaque;
280
struct vhost_vring_addr svq_addr;
281
- size_t device_size = vhost_svq_device_area_size(svq);
282
- size_t driver_size = vhost_svq_driver_area_size(svq);
283
bool ok;
284
285
vhost_svq_get_vring_addr(svq, &svq_addr);
286
287
- ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
288
+ needle.translated_addr = svq_addr.desc_user_addr;
289
+ ok = vhost_vdpa_svq_unmap_ring(v, &needle);
290
if (unlikely(!ok)) {
291
return false;
292
}
293
294
- return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
295
+ needle.translated_addr = svq_addr.used_user_addr;
296
+ return vhost_vdpa_svq_unmap_ring(v, &needle);
297
+}
298
+
299
+/**
300
+ * Map the SVQ area in the device
301
+ *
302
+ * @v: Vhost-vdpa device
303
+ * @needle: The area to search iova
304
+ * @errorp: Error pointer
305
+ */
306
+static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
307
+ Error **errp)
308
+{
309
+ int r;
310
+
311
+ r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
312
+ if (unlikely(r != IOVA_OK)) {
313
+ error_setg(errp, "Cannot allocate iova (%d)", r);
314
+ return false;
315
+ }
316
+
317
+ r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
318
+ (void *)(uintptr_t)needle->translated_addr,
319
+ needle->perm == IOMMU_RO);
320
+ if (unlikely(r != 0)) {
321
+ error_setg_errno(errp, -r, "Cannot map region to device");
322
+ vhost_iova_tree_remove(v->iova_tree, needle);
323
+ }
324
+
325
+ return r == 0;
326
}
327
328
/**
329
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
330
struct vhost_vring_addr *addr,
331
Error **errp)
332
{
333
+ DMAMap device_region, driver_region;
334
+ struct vhost_vring_addr svq_addr;
335
struct vhost_vdpa *v = dev->opaque;
336
size_t device_size = vhost_svq_device_area_size(svq);
337
size_t driver_size = vhost_svq_driver_area_size(svq);
338
- int r;
339
+ size_t avail_offset;
340
+ bool ok;
341
342
ERRP_GUARD();
343
- vhost_svq_get_vring_addr(svq, addr);
344
+ vhost_svq_get_vring_addr(svq, &svq_addr);
345
346
- r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
347
- (void *)(uintptr_t)addr->desc_user_addr, true);
348
- if (unlikely(r != 0)) {
349
- error_setg_errno(errp, -r, "Cannot create vq driver region: ");
350
+ driver_region = (DMAMap) {
351
+ .translated_addr = svq_addr.desc_user_addr,
352
+ .size = driver_size - 1,
353
+ .perm = IOMMU_RO,
354
+ };
355
+ ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
356
+ if (unlikely(!ok)) {
357
+ error_prepend(errp, "Cannot create vq driver region: ");
358
return false;
359
}
360
+ addr->desc_user_addr = driver_region.iova;
361
+ avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
362
+ addr->avail_user_addr = driver_region.iova + avail_offset;
363
364
- r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
365
- (void *)(intptr_t)addr->used_user_addr, false);
366
- if (unlikely(r != 0)) {
367
- error_setg_errno(errp, -r, "Cannot create vq device region: ");
368
+ device_region = (DMAMap) {
369
+ .translated_addr = svq_addr.used_user_addr,
370
+ .size = device_size - 1,
371
+ .perm = IOMMU_RW,
372
+ };
373
+ ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
374
+ if (unlikely(!ok)) {
375
+ error_prepend(errp, "Cannot create vq device region: ");
376
+ vhost_vdpa_svq_unmap_ring(v, &driver_region);
377
}
378
+ addr->used_user_addr = device_region.iova;
379
380
- return r == 0;
381
+ return ok;
382
}
383
384
static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
385
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
386
index XXXXXXX..XXXXXXX 100644
387
--- a/include/hw/virtio/vhost-vdpa.h
388
+++ b/include/hw/virtio/vhost-vdpa.h
389
@@ -XXX,XX +XXX,XX @@
390
391
#include <gmodule.h>
392
393
+#include "hw/virtio/vhost-iova-tree.h"
394
#include "hw/virtio/virtio.h"
395
#include "standard-headers/linux/vhost_types.h"
396
397
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
398
MemoryListener listener;
399
struct vhost_vdpa_iova_range iova_range;
400
bool shadow_vqs_enabled;
401
+ /* IOVA mapping used by the Shadow Virtqueue */
402
+ VhostIOVATree *iova_tree;
403
GPtrArray *shadow_vqs;
404
struct vhost_dev *dev;
405
VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
92
--
406
--
93
2.5.0
407
2.7.4
94
408
95
409
diff view generated by jsdifflib
Deleted patch
1
From: zhanghailiang <zhang.zhanghailiang@huawei.com>
2
1
3
If some errors happen during VM's COLO FT stage, it's important to
4
notify the users of this event. Together with 'x-colo-lost-heartbeat',
5
users can intervene in COLO's failover work immediately.
6
If users don't want to get involved in COLO's failover verdict,
7
it is still necessary to notify users that we exited COLO mode.
8
9
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
10
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
11
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
12
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
13
Signed-off-by: Jason Wang <jasowang@redhat.com>
14
---
15
migration/colo.c | 31 +++++++++++++++++++++++++++++++
16
qapi/migration.json | 38 ++++++++++++++++++++++++++++++++++++++
17
2 files changed, 69 insertions(+)
18
19
diff --git a/migration/colo.c b/migration/colo.c
20
index XXXXXXX..XXXXXXX 100644
21
--- a/migration/colo.c
22
+++ b/migration/colo.c
23
@@ -XXX,XX +XXX,XX @@
24
#include "net/colo-compare.h"
25
#include "net/colo.h"
26
#include "block/block.h"
27
+#include "qapi/qapi-events-migration.h"
28
29
static bool vmstate_loading;
30
static Notifier packets_compare_notifier;
31
@@ -XXX,XX +XXX,XX @@ out:
32
qemu_fclose(fb);
33
}
34
35
+ /*
36
+ * There are only two reasons we can get here, some error happened
37
+ * or the user triggered failover.
38
+ */
39
+ switch (failover_get_state()) {
40
+ case FAILOVER_STATUS_NONE:
41
+ qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
42
+ COLO_EXIT_REASON_ERROR);
43
+ break;
44
+ case FAILOVER_STATUS_REQUIRE:
45
+ qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
46
+ COLO_EXIT_REASON_REQUEST);
47
+ break;
48
+ default:
49
+ abort();
50
+ }
51
+
52
/* Hope this not to be too long to wait here */
53
qemu_sem_wait(&s->colo_exit_sem);
54
qemu_sem_destroy(&s->colo_exit_sem);
55
@@ -XXX,XX +XXX,XX @@ out:
56
error_report_err(local_err);
57
}
58
59
+ switch (failover_get_state()) {
60
+ case FAILOVER_STATUS_NONE:
61
+ qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
62
+ COLO_EXIT_REASON_ERROR);
63
+ break;
64
+ case FAILOVER_STATUS_REQUIRE:
65
+ qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
66
+ COLO_EXIT_REASON_REQUEST);
67
+ break;
68
+ default:
69
+ abort();
70
+ }
71
+
72
if (fb) {
73
qemu_fclose(fb);
74
}
75
diff --git a/qapi/migration.json b/qapi/migration.json
76
index XXXXXXX..XXXXXXX 100644
77
--- a/qapi/migration.json
78
+++ b/qapi/migration.json
79
@@ -XXX,XX +XXX,XX @@
80
'data': [ 'none', 'require', 'active', 'completed', 'relaunch' ] }
81
82
##
83
+# @COLO_EXIT:
84
+#
85
+# Emitted when VM finishes COLO mode due to some errors happening or
86
+# at the request of users.
87
+#
88
+# @mode: report COLO mode when COLO exited.
89
+#
90
+# @reason: describes the reason for the COLO exit.
91
+#
92
+# Since: 3.1
93
+#
94
+# Example:
95
+#
96
+# <- { "timestamp": {"seconds": 2032141960, "microseconds": 417172},
97
+# "event": "COLO_EXIT", "data": {"mode": "primary", "reason": "request" } }
98
+#
99
+##
100
+{ 'event': 'COLO_EXIT',
101
+ 'data': {'mode': 'COLOMode', 'reason': 'COLOExitReason' } }
102
+
103
+##
104
+# @COLOExitReason:
105
+#
106
+# The reason for a COLO exit
107
+#
108
+# @none: no failover has ever happened. This can't occur in the
109
+# COLO_EXIT event, only in the result of query-colo-status.
110
+#
111
+# @request: COLO exit is due to an external request
112
+#
113
+# @error: COLO exit is due to an internal error
114
+#
115
+# Since: 3.1
116
+##
117
+{ 'enum': 'COLOExitReason',
118
+ 'data': [ 'none', 'request', 'error' ] }
119
+
120
+##
121
# @x-colo-lost-heartbeat:
122
#
123
# Tell qemu that heartbeat is lost, request it to do takeover procedures.
124
--
125
2.5.0
126
127
diff view generated by jsdifflib
Deleted patch
1
From: Zhang Chen <chen.zhang@intel.com>
2
1
3
As suggested by Markus Armbruster, rename COLO unknown mode to none mode.
4
5
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
6
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
7
Reviewed-by: Eric Blake <eblake@redhat.com>
8
Reviewed-by: Markus Armbruster <armbru@redhat.com>
9
Signed-off-by: Jason Wang <jasowang@redhat.com>
10
---
11
migration/colo-failover.c | 2 +-
12
migration/colo.c | 2 +-
13
qapi/migration.json | 10 +++++-----
14
3 files changed, 7 insertions(+), 7 deletions(-)
15
16
diff --git a/migration/colo-failover.c b/migration/colo-failover.c
17
index XXXXXXX..XXXXXXX 100644
18
--- a/migration/colo-failover.c
19
+++ b/migration/colo-failover.c
20
@@ -XXX,XX +XXX,XX @@ FailoverStatus failover_get_state(void)
21
22
void qmp_x_colo_lost_heartbeat(Error **errp)
23
{
24
- if (get_colo_mode() == COLO_MODE_UNKNOWN) {
25
+ if (get_colo_mode() == COLO_MODE_NONE) {
26
error_setg(errp, QERR_FEATURE_DISABLED, "colo");
27
return;
28
}
29
diff --git a/migration/colo.c b/migration/colo.c
30
index XXXXXXX..XXXXXXX 100644
31
--- a/migration/colo.c
32
+++ b/migration/colo.c
33
@@ -XXX,XX +XXX,XX @@ COLOMode get_colo_mode(void)
34
} else if (migration_incoming_in_colo_state()) {
35
return COLO_MODE_SECONDARY;
36
} else {
37
- return COLO_MODE_UNKNOWN;
38
+ return COLO_MODE_NONE;
39
}
40
}
41
42
diff --git a/qapi/migration.json b/qapi/migration.json
43
index XXXXXXX..XXXXXXX 100644
44
--- a/qapi/migration.json
45
+++ b/qapi/migration.json
46
@@ -XXX,XX +XXX,XX @@
47
##
48
# @COLOMode:
49
#
50
-# The colo mode
51
+# The COLO current mode.
52
#
53
-# @unknown: unknown mode
54
+# @none: COLO is disabled.
55
#
56
-# @primary: master side
57
+# @primary: COLO node in primary side.
58
#
59
-# @secondary: slave side
60
+# @secondary: COLO node in slave side.
61
#
62
# Since: 2.8
63
##
64
{ 'enum': 'COLOMode',
65
- 'data': [ 'unknown', 'primary', 'secondary'] }
66
+ 'data': [ 'none', 'primary', 'secondary'] }
67
68
##
69
# @FailoverStatus:
70
--
71
2.5.0
72
73
diff view generated by jsdifflib
1
There should not be a reason for passing a packet size greater than
1
From: Eugenio Pérez <eperezma@redhat.com>
2
INT_MAX. It's usually a hint of a bug somewhere, so ignore packet sizes
3
greater than INT_MAX in qemu_deliver_packet_iov().
4
2
5
CC: qemu-stable@nongnu.org
3
This is needed to achieve migration, so the destination can restore its
6
Reported-by: Daniel Shapira <daniel@twistlock.com>
4
index.
7
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
5
6
Setting base as last used idx, so destination will see as available all
7
the entries that the device did not use, including the in-flight
8
processing ones.
9
10
This is ok for networking, but other kinds of devices might have
11
problems with these retransmissions.
12
13
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
14
Acked-by: Michael S. Tsirkin <mst@redhat.com>
8
Signed-off-by: Jason Wang <jasowang@redhat.com>
15
Signed-off-by: Jason Wang <jasowang@redhat.com>
9
---
16
---
10
net/net.c | 7 ++++++-
17
hw/virtio/vhost-vdpa.c | 17 +++++++++++++++++
11
1 file changed, 6 insertions(+), 1 deletion(-)
18
1 file changed, 17 insertions(+)
12
19
13
diff --git a/net/net.c b/net/net.c
20
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
14
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
15
--- a/net/net.c
22
--- a/hw/virtio/vhost-vdpa.c
16
+++ b/net/net.c
23
+++ b/hw/virtio/vhost-vdpa.c
17
@@ -XXX,XX +XXX,XX @@ ssize_t qemu_deliver_packet_iov(NetClientState *sender,
24
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
18
void *opaque)
25
static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
26
struct vhost_vring_state *ring)
19
{
27
{
20
NetClientState *nc = opaque;
28
+ struct vhost_vdpa *v = dev->opaque;
21
+ size_t size = iov_size(iov, iovcnt);
22
int ret;
29
int ret;
23
30
24
+ if (size > INT_MAX) {
31
+ if (v->shadow_vqs_enabled) {
25
+ return size;
32
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
33
+ ring->index);
34
+
35
+ /*
36
+ * Setting base as last used idx, so destination will see as available
37
+ * all the entries that the device did not use, including the in-flight
38
+ * processing ones.
39
+ *
40
+ * TODO: This is ok for networking, but other kinds of devices might
41
+ * have problems with these retransmissions.
42
+ */
43
+ ring->num = svq->last_used_idx;
44
+ return 0;
26
+ }
45
+ }
27
+
46
+
28
if (nc->link_down) {
47
ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
29
- return iov_size(iov, iovcnt);
48
trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
30
+ return size;
49
return ret;
31
}
32
33
if (nc->receive_disabled) {
34
--
50
--
35
2.5.0
51
2.7.4
36
52
37
53
diff view generated by jsdifflib
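
In plain C, the guard added to qemu_deliver_packet_iov() above is just a bounds
check on the total iovec length before any delivery happens. A minimal sketch of
that key hunk (not the full function; iov_size() is QEMU's helper returning the
total byte count of an iovec array):

    size_t size = iov_size(iov, iovcnt);    /* total bytes across the iovec */

    if (size > INT_MAX) {
        /* Almost certainly a bug in the sender: report the packet as
         * consumed so it is not re-queued, but do not deliver it. */
        return size;
    }
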
1
From: zhanghailiang <zhang.zhanghailiang@huawei.com>
1
From: Eugenio Pérez <eperezma@redhat.com>
2
2
3
Notify all net filters about the checkpoint and failover events.
3
Setting the log address would make the device start reporting invalid
4
dirty memory because the SVQ vrings are located in qemu's memory.
4
5
5
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
6
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
6
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
7
Acked-by: Michael S. Tsirkin <mst@redhat.com>
7
Signed-off-by: Jason Wang <jasowang@redhat.com>
8
Signed-off-by: Jason Wang <jasowang@redhat.com>
8
---
9
---
9
migration/colo.c | 15 +++++++++++++++
10
hw/virtio/vhost-vdpa.c | 3 ++-
10
1 file changed, 15 insertions(+)
11
1 file changed, 2 insertions(+), 1 deletion(-)
11
12
12
diff --git a/migration/colo.c b/migration/colo.c
13
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
13
index XXXXXXX..XXXXXXX 100644
14
index XXXXXXX..XXXXXXX 100644
14
--- a/migration/colo.c
15
--- a/hw/virtio/vhost-vdpa.c
15
+++ b/migration/colo.c
16
+++ b/hw/virtio/vhost-vdpa.c
16
@@ -XXX,XX +XXX,XX @@
17
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
17
#include "qapi/qapi-events-migration.h"
18
static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
18
#include "qapi/qmp/qerror.h"
19
struct vhost_log *log)
19
#include "sysemu/cpus.h"
20
{
20
+#include "net/filter.h"
21
- if (vhost_vdpa_one_time_request(dev)) {
21
22
+ struct vhost_vdpa *v = dev->opaque;
22
static bool vmstate_loading;
23
+ if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) {
23
static Notifier packets_compare_notifier;
24
return 0;
24
@@ -XXX,XX +XXX,XX @@ static void secondary_vm_do_failover(void)
25
error_report_err(local_err);
26
}
25
}
27
26
28
+ /* Notify all filters of all NIC to do checkpoint */
29
+ colo_notify_filters_event(COLO_EVENT_FAILOVER, &local_err);
30
+ if (local_err) {
31
+ error_report_err(local_err);
32
+ }
33
+
34
if (!autostart) {
35
error_report("\"-S\" qemu option will be ignored in secondary side");
36
/* recover runstate to normal migration finish state */
37
@@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque)
38
goto out;
39
}
40
41
+ /* Notify all filters of all NIC to do checkpoint */
42
+ colo_notify_filters_event(COLO_EVENT_CHECKPOINT, &local_err);
43
+
44
+ if (local_err) {
45
+ qemu_mutex_unlock_iothread();
46
+ goto out;
47
+ }
48
+
49
vmstate_loading = false;
50
vm_start();
51
trace_colo_vm_state_change("stop", "run");
52
--
27
--
53
2.5.0
28
2.7.4
54
29
55
30
diff view generated by jsdifflib
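
For readers skimming the vdpa hunk above: with shadow virtqueues enabled,
set_log_base becomes a no-op, because the vrings the device sees are
QEMU-allocated shadow copies and any dirty log it produced would point at
meaningless addresses. A simplified sketch of the resulting function (the
surrounding QEMU plumbing is elided, this is not the verbatim code):

    static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                       struct vhost_log *log)
    {
        struct vhost_vdpa *v = dev->opaque;

        if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) {
            return 0;    /* SVQ tracks dirty pages itself; skip the device log */
        }

        return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
    }
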
1
From: Zhang Chen <zhangckid@gmail.com>
1
From: Eugenio Pérez <eperezma@redhat.com>
2
2
3
There are several stages during the loadvm/savevm process. In different stages,
3
SVQ is able to log the dirty bits by itself, so let's use it to not
4
the incoming migration processes different types of sections.
4
block migration.
5
We want to control these stages more accurately; it will benefit COLO
6
performance, since we don't have to save QEMU_VM_SECTION_START type
7
sections every time we do a checkpoint. Besides, we want to separate
8
the process of saving/loading memory and device state.
9
5
10
So we add two new helper functions: qemu_load_device_state() and
6
Also, ignore set and clear of VHOST_F_LOG_ALL on set_features if SVQ is
11
qemu_savevm_live_state() to handle these different processes during migration.
7
enabled. Even if the device supports it, the reports would be nonsense
8
because SVQ memory is in the qemu region.
12
9
13
Besides, we make qemu_loadvm_state_main() and qemu_save_device_state()
10
The log region is still allocated. Future changes might skip that, but
14
public, and simplify the code of qemu_save_device_state() by calling the
11
this series is already long enough.
15
wrapper qemu_savevm_state_header().
16
12
17
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
13
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
18
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
14
Acked-by: Michael S. Tsirkin <mst@redhat.com>
19
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
20
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
21
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
22
Signed-off-by: Jason Wang <jasowang@redhat.com>
15
Signed-off-by: Jason Wang <jasowang@redhat.com>
23
---
16
---
24
migration/colo.c | 41 ++++++++++++++++++++++++++++++++---------
17
hw/virtio/vhost-vdpa.c | 39 +++++++++++++++++++++++++++++++++++----
25
migration/savevm.c | 36 +++++++++++++++++++++++++++++-------
18
include/hw/virtio/vhost-vdpa.h | 1 +
26
migration/savevm.h | 4 ++++
19
2 files changed, 36 insertions(+), 4 deletions(-)
27
3 files changed, 65 insertions(+), 16 deletions(-)
28
20
29
diff --git a/migration/colo.c b/migration/colo.c
21
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
30
index XXXXXXX..XXXXXXX 100644
22
index XXXXXXX..XXXXXXX 100644
31
--- a/migration/colo.c
23
--- a/hw/virtio/vhost-vdpa.c
32
+++ b/migration/colo.c
24
+++ b/hw/virtio/vhost-vdpa.c
33
@@ -XXX,XX +XXX,XX @@
25
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
34
#include "block/block.h"
26
return v->index != 0;
35
#include "qapi/qapi-events-migration.h"
27
}
36
#include "qapi/qmp/qerror.h"
28
37
+#include "sysemu/cpus.h"
29
+static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
38
30
+ uint64_t *features)
39
static bool vmstate_loading;
31
+{
40
static Notifier packets_compare_notifier;
32
+ int ret;
41
@@ -XXX,XX +XXX,XX @@ static int colo_do_checkpoint_transaction(MigrationState *s,
33
+
42
34
+ ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
43
/* Disable block migration */
35
+ trace_vhost_vdpa_get_features(dev, *features);
44
migrate_set_block_enabled(false, &local_err);
36
+ return ret;
45
- qemu_savevm_state_header(fb);
37
+}
46
- qemu_savevm_state_setup(fb);
38
+
47
qemu_mutex_lock_iothread();
39
static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
48
replication_do_checkpoint_all(&local_err);
40
Error **errp)
49
if (local_err) {
41
{
50
qemu_mutex_unlock_iothread();
42
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
51
goto out;
43
return 0;
52
}
44
}
53
- qemu_savevm_state_complete_precopy(fb, false, false);
45
54
- qemu_mutex_unlock_iothread();
46
- r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features);
55
-
47
+ r = vhost_vdpa_get_dev_features(hdev, &dev_features);
56
- qemu_fflush(fb);
48
if (r != 0) {
57
49
error_setg_errno(errp, -r, "Can't get vdpa device features");
58
colo_send_message(s->to_dst_file, COLO_MESSAGE_VMSTATE_SEND, &local_err);
50
return r;
59
if (local_err) {
51
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
60
+ qemu_mutex_unlock_iothread();
52
static int vhost_vdpa_set_features(struct vhost_dev *dev,
61
+ goto out;
53
uint64_t features)
62
+ }
54
{
63
+ /* Note: device state is saved into buffer */
55
+ struct vhost_vdpa *v = dev->opaque;
64
+ ret = qemu_save_device_state(fb);
56
int ret;
65
+
57
66
+ qemu_mutex_unlock_iothread();
58
if (vhost_vdpa_one_time_request(dev)) {
67
+ if (ret < 0) {
59
return 0;
68
goto out;
69
}
60
}
70
/*
61
71
+ * Only save VM's live state, which not including device state.
62
+ if (v->shadow_vqs_enabled) {
72
+ * TODO: We may need a timeout mechanism to prevent COLO process
63
+ if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
73
+ * to be blocked here.
64
+ /*
74
+ */
65
+ * QEMU is just trying to enable or disable logging. SVQ handles
75
+ qemu_savevm_live_state(s->to_dst_file);
66
+ * this separately, so no need to forward this.
76
+
67
+ */
77
+ qemu_fflush(fb);
68
+ v->acked_features = features;
78
+
69
+ return 0;
79
+ /*
80
* We need the size of the VMstate data in Secondary side,
81
* With which we can decide how much data should be read.
82
*/
83
@@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque)
84
uint64_t total_size;
85
uint64_t value;
86
Error *local_err = NULL;
87
+ int ret;
88
89
rcu_register_thread();
90
qemu_sem_init(&mis->colo_incoming_sem, 0);
91
@@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque)
92
goto out;
93
}
94
95
+ qemu_mutex_lock_iothread();
96
+ cpu_synchronize_all_pre_loadvm();
97
+ ret = qemu_loadvm_state_main(mis->from_src_file, mis);
98
+ qemu_mutex_unlock_iothread();
99
+
100
+ if (ret < 0) {
101
+ error_report("Load VM's live state (ram) error");
102
+ goto out;
103
+ }
70
+ }
104
+
71
+
105
value = colo_receive_message_value(mis->from_src_file,
72
+ v->acked_features = features;
106
COLO_MESSAGE_VMSTATE_SIZE, &local_err);
73
+
107
if (local_err) {
74
+ /* We must not ack _F_LOG if SVQ is enabled */
108
@@ -XXX,XX +XXX,XX @@ void *colo_process_incoming_thread(void *opaque)
75
+ features &= ~BIT_ULL(VHOST_F_LOG_ALL);
109
}
76
+ }
110
77
+
111
qemu_mutex_lock_iothread();
78
trace_vhost_vdpa_set_features(dev, features);
112
- qemu_system_reset(SHUTDOWN_CAUSE_NONE);
79
ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
113
vmstate_loading = true;
80
if (ret) {
114
- if (qemu_loadvm_state(fb) < 0) {
81
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
115
- error_report("COLO: loadvm failed");
82
static int vhost_vdpa_get_features(struct vhost_dev *dev,
116
+ ret = qemu_load_device_state(fb);
83
uint64_t *features)
117
+ if (ret < 0) {
84
{
118
+ error_report("COLO: load device state failed");
85
- int ret;
119
qemu_mutex_unlock_iothread();
86
+ struct vhost_vdpa *v = dev->opaque;
120
goto out;
87
+ int ret = vhost_vdpa_get_dev_features(dev, features);
121
}
88
+
122
diff --git a/migration/savevm.c b/migration/savevm.c
89
+ if (ret == 0 && v->shadow_vqs_enabled) {
123
index XXXXXXX..XXXXXXX 100644
90
+ /* Add SVQ logging capabilities */
124
--- a/migration/savevm.c
91
+ *features |= BIT_ULL(VHOST_F_LOG_ALL);
125
+++ b/migration/savevm.c
92
+ }
126
@@ -XXX,XX +XXX,XX @@ done:
93
94
- ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
95
- trace_vhost_vdpa_get_features(dev, *features);
127
return ret;
96
return ret;
128
}
97
}
129
98
130
-static int qemu_save_device_state(QEMUFile *f)
99
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
131
+void qemu_savevm_live_state(QEMUFile *f)
132
{
133
- SaveStateEntry *se;
134
+ /* save QEMU_VM_SECTION_END section */
135
+ qemu_savevm_state_complete_precopy(f, true, false);
136
+ qemu_put_byte(f, QEMU_VM_EOF);
137
+}
138
139
- qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
140
- qemu_put_be32(f, QEMU_VM_FILE_VERSION);
141
+int qemu_save_device_state(QEMUFile *f)
142
+{
143
+ SaveStateEntry *se;
144
145
+ if (!migration_in_colo_state()) {
146
+ qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
147
+ qemu_put_be32(f, QEMU_VM_FILE_VERSION);
148
+ }
149
cpu_synchronize_all_states();
150
151
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
152
@@ -XXX,XX +XXX,XX @@ enum LoadVMExitCodes {
153
LOADVM_QUIT = 1,
154
};
155
156
-static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
157
-
158
/* ------ incoming postcopy messages ------ */
159
/* 'advise' arrives before any transfers just to tell us that a postcopy
160
* *might* happen - it might be skipped if precopy transferred everything
161
@@ -XXX,XX +XXX,XX @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis)
162
return true;
163
}
164
165
-static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
166
+int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
167
{
168
uint8_t section_type;
169
int ret = 0;
170
@@ -XXX,XX +XXX,XX @@ int qemu_loadvm_state(QEMUFile *f)
171
return ret;
172
}
173
174
+int qemu_load_device_state(QEMUFile *f)
175
+{
176
+ MigrationIncomingState *mis = migration_incoming_get_current();
177
+ int ret;
178
+
179
+ /* Load QEMU_VM_SECTION_FULL section */
180
+ ret = qemu_loadvm_state_main(f, mis);
181
+ if (ret < 0) {
182
+ error_report("Failed to load device state: %d", ret);
183
+ return ret;
184
+ }
185
+
186
+ cpu_synchronize_all_post_init();
187
+ return 0;
188
+}
189
+
190
int save_snapshot(const char *name, Error **errp)
191
{
192
BlockDriverState *bs, *bs1;
193
diff --git a/migration/savevm.h b/migration/savevm.h
194
index XXXXXXX..XXXXXXX 100644
100
index XXXXXXX..XXXXXXX 100644
195
--- a/migration/savevm.h
101
--- a/include/hw/virtio/vhost-vdpa.h
196
+++ b/migration/savevm.h
102
+++ b/include/hw/virtio/vhost-vdpa.h
197
@@ -XXX,XX +XXX,XX @@ void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
103
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
198
uint64_t *start_list,
104
bool iotlb_batch_begin_sent;
199
uint64_t *length_list);
105
MemoryListener listener;
200
void qemu_savevm_send_colo_enable(QEMUFile *f);
106
struct vhost_vdpa_iova_range iova_range;
201
+void qemu_savevm_live_state(QEMUFile *f);
107
+ uint64_t acked_features;
202
+int qemu_save_device_state(QEMUFile *f);
108
bool shadow_vqs_enabled;
203
109
/* IOVA mapping used by the Shadow Virtqueue */
204
int qemu_loadvm_state(QEMUFile *f);
110
VhostIOVATree *iova_tree;
205
void qemu_loadvm_state_cleanup(void);
206
+int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
207
+int qemu_load_device_state(QEMUFile *f);
208
209
#endif
210
--
111
--
211
2.5.0
112
2.7.4
212
113
213
114
diff view generated by jsdifflib
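
The VHOST_F_LOG_ALL handling above reduces to plain feature-bit arithmetic. The
sketch below uses illustrative names (svq_enabled, dev_features, requested)
rather than the actual vhost-vdpa fields, to show the two directions of the
change:

    /* get_features path: SVQ can log dirty pages in software, so the bit is
     * advertised to the vhost core even if the device lacks it. */
    uint64_t advertised = dev_features;
    if (svq_enabled) {
        advertised |= BIT_ULL(VHOST_F_LOG_ALL);
    }

    /* set_features path: never ack _F_LOG to the device itself; if the only
     * change is toggling logging, nothing needs to be forwarded at all. */
    uint64_t to_device = requested;
    if (svq_enabled) {
        to_device &= ~BIT_ULL(VHOST_F_LOG_ALL);
    }
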
Deleted patch
1
From: zhanghailiang <zhang.zhanghailiang@huawei.com>
2
1
3
We don't need to flush all of the VM's RAM from the cache, only
4
flush the pages dirtied since the last checkpoint.
5
6
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
7
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
8
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
9
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
10
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
11
Signed-off-by: Jason Wang <jasowang@redhat.com>
12
---
13
migration/ram.c | 9 +++++++++
14
1 file changed, 9 insertions(+)
15
16
diff --git a/migration/ram.c b/migration/ram.c
17
index XXXXXXX..XXXXXXX 100644
18
--- a/migration/ram.c
19
+++ b/migration/ram.c
20
@@ -XXX,XX +XXX,XX @@ int colo_init_ram_cache(void)
21
}
22
ram_state = g_new0(RAMState, 1);
23
ram_state->migration_dirty_pages = 0;
24
+ memory_global_dirty_log_start();
25
26
return 0;
27
28
@@ -XXX,XX +XXX,XX @@ void colo_release_ram_cache(void)
29
{
30
RAMBlock *block;
31
32
+ memory_global_dirty_log_stop();
33
RAMBLOCK_FOREACH_MIGRATABLE(block) {
34
g_free(block->bmap);
35
block->bmap = NULL;
36
@@ -XXX,XX +XXX,XX @@ static void colo_flush_ram_cache(void)
37
void *src_host;
38
unsigned long offset = 0;
39
40
+ memory_global_dirty_log_sync();
41
+ rcu_read_lock();
42
+ RAMBLOCK_FOREACH_MIGRATABLE(block) {
43
+ migration_bitmap_sync_range(ram_state, block, 0, block->used_length);
44
+ }
45
+ rcu_read_unlock();
46
+
47
trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
48
rcu_read_lock();
49
block = QLIST_FIRST_RCU(&ram_list.blocks);
50
--
51
2.5.0
52
53
diff view generated by jsdifflib
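
The flush step above only touches pages marked dirty since the previous
checkpoint: conceptually a bitmap walk plus a per-page copy from the COLO cache
into the running guest's RAM. The sketch below is illustrative only;
find_next_dirty() stands in for the real bitmap helper in migration/ram.c:

    unsigned long page = 0;
    unsigned long nr_pages = block->used_length >> TARGET_PAGE_BITS;

    while ((page = find_next_dirty(block->bmap, nr_pages, page)) < nr_pages) {
        void *dst = block->host       + (page << TARGET_PAGE_BITS);
        void *src = block->colo_cache + (page << TARGET_PAGE_BITS);

        memcpy(dst, src, TARGET_PAGE_SIZE);   /* apply the received page */
        clear_bit(page, block->bmap);         /* page is clean again */
        page++;
    }
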
Deleted patch
1
From: zhanghailiang <zhang.zhanghailiang@huawei.com>
2
1
3
The COLO thread may sleep at qemu_sem_wait(&s->colo_checkpoint_sem)
4
while failover begins. It's better to wake it up to speed up
5
the process.
6
7
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
8
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
9
Signed-off-by: Jason Wang <jasowang@redhat.com>
10
---
11
migration/colo.c | 8 ++++++++
12
1 file changed, 8 insertions(+)
13
14
diff --git a/migration/colo.c b/migration/colo.c
15
index XXXXXXX..XXXXXXX 100644
16
--- a/migration/colo.c
17
+++ b/migration/colo.c
18
@@ -XXX,XX +XXX,XX @@ static void primary_vm_do_failover(void)
19
20
migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
21
MIGRATION_STATUS_COMPLETED);
22
+ /*
23
+ * kick COLO thread which might wait at
24
+ * qemu_sem_wait(&s->colo_checkpoint_sem).
25
+ */
26
+ colo_checkpoint_notify(migrate_get_current());
27
28
/*
29
* Wake up COLO thread which may blocked in recv() or send(),
30
@@ -XXX,XX +XXX,XX @@ static void colo_process_checkpoint(MigrationState *s)
31
32
qemu_sem_wait(&s->colo_checkpoint_sem);
33
34
+ if (s->state != MIGRATION_STATUS_COLO) {
35
+ goto out;
36
+ }
37
ret = colo_do_checkpoint_transaction(s, bioc, fb);
38
if (ret < 0) {
39
goto out;
40
--
41
2.5.0
42
43
diff view generated by jsdifflib
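
Putting the two hunks together, the checkpoint loop now behaves roughly as
sketched below (details of the real colo_process_checkpoint() loop omitted):
failover posts the semaphore, and the loop re-checks the migration state as
soon as it wakes up instead of starting another checkpoint:

    while (s->state == MIGRATION_STATUS_COLO) {
        qemu_sem_wait(&s->colo_checkpoint_sem);

        if (s->state != MIGRATION_STATUS_COLO) {
            break;    /* failover happened while we were sleeping */
        }

        ret = colo_do_checkpoint_transaction(s, bioc, fb);
        if (ret < 0) {
            break;
        }
    }
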
Deleted patch
1
From: Zhang Chen <chen.zhang@intel.com>
2
1
3
This diagram helps users better understand COLO.
4
Suggested by Markus Armbruster.
5
6
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
7
Signed-off-by: Zhang Chen <chen.zhang@intel.com>
8
Signed-off-by: Jason Wang <jasowang@redhat.com>
9
---
10
docs/COLO-FT.txt | 34 ++++++++++++++++++++++++++++++++++
11
1 file changed, 34 insertions(+)
12
13
diff --git a/docs/COLO-FT.txt b/docs/COLO-FT.txt
14
index XXXXXXX..XXXXXXX 100644
15
--- a/docs/COLO-FT.txt
16
+++ b/docs/COLO-FT.txt
17
@@ -XXX,XX +XXX,XX @@ Note:
18
HeartBeat has not been implemented yet, so you need to trigger failover process
19
by using 'x-colo-lost-heartbeat' command.
20
21
+== COLO operation status ==
22
+
23
++-----------------+
24
+| |
25
+| Start COLO |
26
+| |
27
++--------+--------+
28
+ |
29
+ | Main qmp command:
30
+ | migrate-set-capabilities with x-colo
31
+ | migrate
32
+ |
33
+ v
34
++--------+--------+
35
+| |
36
+| COLO running |
37
+| |
38
++--------+--------+
39
+ |
40
+ | Main qmp command:
41
+ | x-colo-lost-heartbeat
42
+ | or
43
+ | some error happened
44
+ v
45
++--------+--------+
46
+| | send qmp event:
47
+| COLO failover | COLO_EXIT
48
+| |
49
++-----------------+
50
+
51
+COLO uses qmp commands to switch and report operation status.
52
+The diagram just shows the main qmp commands; you can get the details
53
+in the test procedure section.
54
+
55
== Test procedure ==
56
1. Startup qemu
57
Primary:
58
--
59
2.5.0
60
61
diff view generated by jsdifflib
Deleted patch
1
From: liujunjie <liujunjie23@huawei.com>
2
1
3
Before, we did not clear callbacks like handle_output when deleting
4
the virtqueue, which may result in a segmentation fault.
5
The scenario is as follows:
6
1. Start a VM with multiqueue vhost-net,
7
2. Then write VIRTIO_PCI_GUEST_FEATURES in the PCI configuration to
8
trigger multiqueue disable in this VM, which will delete the virtqueue.
9
In this step, the tx_bh is deleted but the callback virtio_net_handle_tx_bh
10
still exists.
11
3. Finally, write VIRTIO_PCI_QUEUE_NOTIFY in the PCI configuration to
12
notify the deleted virtqueue. In this way, virtio_net_handle_tx_bh
13
will be called and QEMU will crash.
14
15
Although the scenario described above is uncommon, we had better guard against it.
16
17
CC: qemu-stable@nongnu.org
18
Signed-off-by: liujunjie <liujunjie23@huawei.com>
19
Signed-off-by: Jason Wang <jasowang@redhat.com>
20
---
21
hw/virtio/virtio.c | 2 ++
22
1 file changed, 2 insertions(+)
23
24
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
25
index XXXXXXX..XXXXXXX 100644
26
--- a/hw/virtio/virtio.c
27
+++ b/hw/virtio/virtio.c
28
@@ -XXX,XX +XXX,XX @@ void virtio_del_queue(VirtIODevice *vdev, int n)
29
30
vdev->vq[n].vring.num = 0;
31
vdev->vq[n].vring.num_default = 0;
32
+ vdev->vq[n].handle_output = NULL;
33
+ vdev->vq[n].handle_aio_output = NULL;
34
}
35
36
static void virtio_set_isr(VirtIODevice *vdev, int value)
37
--
38
2.5.0
39
40
diff view generated by jsdifflib
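
The crash scenario described above is the classic stale-callback problem: the
queue is torn down, but its handler pointer still points at a handler whose
backing state (the tx bottom half) is gone. The general shape of the fix, with
illustrative names rather than the exact virtio.c functions, is to clear the
pointer on deletion and make the notify path tolerate that:

    void del_queue(VirtQueue *vq)
    {
        vq->vring.num = 0;
        vq->handle_output = NULL;       /* stale handler can no longer be called */
        vq->handle_aio_output = NULL;
    }

    void queue_notify(VirtQueue *vq)
    {
        if (vq->vring.num == 0 || !vq->handle_output) {
            return;                     /* deleted or disabled queue: ignore */
        }
        vq->handle_output(vq->vdev, vq);
    }
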
Deleted patch
1
In ne2000_receive(), we assign size_ to size, which converts
2
from size_t to int. This causes trouble when size_ is greater than
3
INT_MAX: it leads to a negative value in size, which can then pass
4
the check of size < MIN_BUF_SIZE and may lead to out-of-bounds access
5
for both buf and buf1.
6
1
7
Fix this by converting the type of size to size_t.
8
9
CC: qemu-stable@nongnu.org
10
Reported-by: Daniel Shapira <daniel@twistlock.com>
11
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
12
Signed-off-by: Jason Wang <jasowang@redhat.com>
13
---
14
hw/net/ne2000.c | 4 ++--
15
1 file changed, 2 insertions(+), 2 deletions(-)
16
17
diff --git a/hw/net/ne2000.c b/hw/net/ne2000.c
18
index XXXXXXX..XXXXXXX 100644
19
--- a/hw/net/ne2000.c
20
+++ b/hw/net/ne2000.c
21
@@ -XXX,XX +XXX,XX @@ static int ne2000_buffer_full(NE2000State *s)
22
ssize_t ne2000_receive(NetClientState *nc, const uint8_t *buf, size_t size_)
23
{
24
NE2000State *s = qemu_get_nic_opaque(nc);
25
- int size = size_;
26
+ size_t size = size_;
27
uint8_t *p;
28
unsigned int total_len, next, avail, len, index, mcast_idx;
29
uint8_t buf1[60];
30
@@ -XXX,XX +XXX,XX @@ ssize_t ne2000_receive(NetClientState *nc, const uint8_t *buf, size_t size_)
31
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
32
33
#if defined(DEBUG_NE2000)
34
- printf("NE2000: received len=%d\n", size);
35
+ printf("NE2000: received len=%zu\n", size);
36
#endif
37
38
if (s->cmd & E8390_STOP || ne2000_buffer_full(s))
39
--
40
2.5.0
41
42
diff view generated by jsdifflib
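
The same truncation issue recurs in the rtl8139 and pcnet patches below. A tiny
standalone program (an illustration only, assuming a 64-bit host where size_t is
wider than int) shows why the "short frame" check stops protecting the 60-byte
bounce buffer:

    #include <limits.h>
    #include <stddef.h>
    #include <stdio.h>

    #define MIN_BUF_SIZE 60

    int main(void)
    {
        size_t size_ = (size_t)INT_MAX + 100;  /* oversized frame length */
        int size = size_;                      /* truncates; typically negative */

        if (size < MIN_BUF_SIZE) {
            /* The "pad short frame" branch is taken, but a later
             * memcpy(buf1, buf, size) would see (size_t)size, which is huge. */
            printf("size=%d passes the check; memcpy length would be %zu\n",
                   size, (size_t)size);
        }
        return 0;
    }
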
Deleted patch
1
In rtl8139_do_receive(), we assign size_ to size, which converts
2
from size_t to int. This causes trouble when size_ is greater than
3
INT_MAX: it leads to a negative value in size, which can then pass
4
the check of size < MIN_BUF_SIZE and may lead to out-of-bounds access
5
for both buf and buf1.
6
1
7
Fix this by converting the type of size to size_t.
8
9
CC: qemu-stable@nongnu.org
10
Reported-by: Daniel Shapira <daniel@twistlock.com>
11
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
12
Signed-off-by: Jason Wang <jasowang@redhat.com>
13
---
14
hw/net/rtl8139.c | 8 ++++----
15
1 file changed, 4 insertions(+), 4 deletions(-)
16
17
diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c
18
index XXXXXXX..XXXXXXX 100644
19
--- a/hw/net/rtl8139.c
20
+++ b/hw/net/rtl8139.c
21
@@ -XXX,XX +XXX,XX @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t
22
RTL8139State *s = qemu_get_nic_opaque(nc);
23
PCIDevice *d = PCI_DEVICE(s);
24
/* size is the length of the buffer passed to the driver */
25
- int size = size_;
26
+ size_t size = size_;
27
const uint8_t *dot1q_buf = NULL;
28
29
uint32_t packet_header = 0;
30
@@ -XXX,XX +XXX,XX @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t
31
static const uint8_t broadcast_macaddr[6] =
32
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
33
34
- DPRINTF(">>> received len=%d\n", size);
35
+ DPRINTF(">>> received len=%zu\n", size);
36
37
/* test if board clock is stopped */
38
if (!s->clock_enabled)
39
@@ -XXX,XX +XXX,XX @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t
40
41
if (size+4 > rx_space)
42
{
43
- DPRINTF("C+ Rx mode : descriptor %d size %d received %d + 4\n",
44
+ DPRINTF("C+ Rx mode : descriptor %d size %d received %zu + 4\n",
45
descriptor, rx_space, size);
46
47
s->IntrStatus |= RxOverflow;
48
@@ -XXX,XX +XXX,XX @@ static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t
49
if (avail != 0 && RX_ALIGN(size + 8) >= avail)
50
{
51
DPRINTF("rx overflow: rx buffer length %d head 0x%04x "
52
- "read 0x%04x === available 0x%04x need 0x%04x\n",
53
+ "read 0x%04x === available 0x%04x need 0x%04zx\n",
54
s->RxBufferSize, s->RxBufAddr, s->RxBufPtr, avail, size + 8);
55
56
s->IntrStatus |= RxOverflow;
57
--
58
2.5.0
59
60
diff view generated by jsdifflib
Deleted patch
1
In pcnet_receive(), we assign size_ to size, which converts from
2
size_t to int. This causes trouble when size_ is greater than
3
INT_MAX: it leads to a negative value in size, which can then pass
4
the check of size < MIN_BUF_SIZE and may lead to out-of-bounds access
5
for both buf and buf1.
6
1
7
Fix this by converting the type of size to size_t.
8
9
CC: qemu-stable@nongnu.org
10
Reported-by: Daniel Shapira <daniel@twistlock.com>
11
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
12
Signed-off-by: Jason Wang <jasowang@redhat.com>
13
---
14
hw/net/pcnet.c | 4 ++--
15
1 file changed, 2 insertions(+), 2 deletions(-)
16
17
diff --git a/hw/net/pcnet.c b/hw/net/pcnet.c
18
index XXXXXXX..XXXXXXX 100644
19
--- a/hw/net/pcnet.c
20
+++ b/hw/net/pcnet.c
21
@@ -XXX,XX +XXX,XX @@ ssize_t pcnet_receive(NetClientState *nc, const uint8_t *buf, size_t size_)
22
uint8_t buf1[60];
23
int remaining;
24
int crc_err = 0;
25
- int size = size_;
26
+ size_t size = size_;
27
28
if (CSR_DRX(s) || CSR_STOP(s) || CSR_SPND(s) || !size ||
29
(CSR_LOOP(s) && !s->looptest)) {
30
return -1;
31
}
32
#ifdef PCNET_DEBUG
33
- printf("pcnet_receive size=%d\n", size);
34
+ printf("pcnet_receive size=%zu\n", size);
35
#endif
36
37
/* if too small buffer, then expand it */
38
--
39
2.5.0
40
41
diff view generated by jsdifflib