The following changes since commit f45cc81911adc7726e8a2801986b6998b91b816e:

  Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20220307' into staging (2022-03-08 09:06:57 +0000)

are available in the git repository at:

  https://github.com/jasowang/qemu.git tags/net-pull-request

for you to fetch changes up to a10dd1e279fc56cebc7e738925e0db85d0cea089:

  vdpa: Expose VHOST_F_LOG_ALL on SVQ (2022-03-08 21:18:41 +0800)

----------------------------------------------------------------

----------------------------------------------------------------
Eugenio Pérez (14):
      vhost: Add VhostShadowVirtqueue
      vhost: Add Shadow VirtQueue kick forwarding capabilities
      vhost: Add Shadow VirtQueue call forwarding capabilities
      vhost: Add vhost_svq_valid_features to shadow vq
      virtio: Add vhost_svq_get_vring_addr
      vdpa: adapt vhost_ops callbacks to svq
      vhost: Shadow virtqueue buffers forwarding
      util: Add iova_tree_alloc_map
      util: add iova_tree_find_iova
      vhost: Add VhostIOVATree
      vdpa: Add custom IOTLB translations to SVQ
      vdpa: Adapt vhost_vdpa_get_vring_base to SVQ
      vdpa: Never set log_base addr if SVQ is enabled
      vdpa: Expose VHOST_F_LOG_ALL on SVQ

Jason Wang (1):
      virtio-net: fix map leaking on error during receive

 hw/net/virtio-net.c                |   1 +
 hw/virtio/meson.build              |   2 +-
 hw/virtio/vhost-iova-tree.c        | 110 +++++++
 hw/virtio/vhost-iova-tree.h        |  27 ++
 hw/virtio/vhost-shadow-virtqueue.c | 637 +++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  87 +++++
 hw/virtio/vhost-vdpa.c             | 525 +++++++++++++++++++++++++++++-
 include/hw/virtio/vhost-vdpa.h     |   8 +
 include/qemu/iova-tree.h           |  38 ++-
 util/iova-tree.c                   | 169 ++++++++++
 10 files changed, 1587 insertions(+), 17 deletions(-)
 create mode 100644 hw/virtio/vhost-iova-tree.c
 create mode 100644 hw/virtio/vhost-iova-tree.h
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.h

Commit bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
tries to fix the use-after-free of the sg by caching the virtqueue
elements in an array and unmapping them at once after receiving the
packets, but it forgot to unmap the cached elements on error, which
leads to leaked mappings and other unexpected results.

Fix this by detaching the cached elements on error. This addresses
CVE-2022-26353.

Reported-by: Victor Tom <vv474172261@gmail.com>
Cc: qemu-stable@nongnu.org
Fixes: CVE-2022-26353
Fixes: bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/virtio-net.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,

 err:
     for (j = 0; j < i; j++) {
+        virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
         g_free(elems[j]);
     }

--
2.7.4
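
The bug class here is worth spelling out: when a loop caches resources and
bails out midway, the error path must release everything each cached entry
owns, not just the container holding it. A minimal, self-contained sketch of
that unwind idiom (all names are illustrative stand-ins, not QEMU APIs;
malloc/free stand in for the DMA map/unmap pair):

#include <stdio.h>
#include <stdlib.h>

struct mapping { void *buf; };

static struct mapping *map_one(size_t n)
{
    struct mapping *m = malloc(sizeof(*m));
    if (!m) {
        return NULL;
    }
    m->buf = malloc(n); /* stands in for the DMA mapping */
    if (!m->buf) {
        free(m);
        return NULL;
    }
    return m;
}

static void unmap_one(struct mapping *m)
{
    free(m->buf); /* stands in for virtqueue_detach_element() */
    free(m);      /* stands in for g_free() */
}

int main(void)
{
    struct mapping *cache[8];
    int i, j;

    for (i = 0; i < 8; i++) {
        cache[i] = map_one(64);
        if (!cache[i]) {
            goto err;
        }
    }
    for (i = 0; i < 8; i++) {
        unmap_one(cache[i]);
    }
    return 0;

err:
    /* The bug the patch fixes: releasing only the container (g_free)
     * without the mapping it owns leaks the mapping. */
    for (j = 0; j < i; j++) {
        unmap_one(cache[j]);
    }
    return 1;
}
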
From: Eugenio Pérez <eperezma@redhat.com>

Vhost shadow virtqueue (SVQ) is an intermediate jump for virtqueue
notifications and buffers, allowing qemu to track them. While qemu
forwards the buffers and virtqueue changes, it can track the memory
that is being dirtied, the same way regular qemu VirtIO devices do.

This commit only exposes basic SVQ allocation and freeing. Later patches
in the series add functionality such as notification and buffer
forwarding.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/meson.build              |  2 +-
 hw/virtio/vhost-shadow-virtqueue.c | 62 ++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h | 28 +++++++++++++++++
 3 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.h

diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))

 virtio_ss = ss.source_set()
 virtio_ss.add(files('virtio.c'))
-virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c'))
+virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
 virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * vhost shadow virtqueue
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
+
+#include "qemu/error-report.h"
+
+/**
+ * Creates vhost shadow virtqueue, and instructs the vhost device to use the
+ * shadow methods and file descriptors.
+ *
+ * Returns the new virtqueue or NULL.
+ *
+ * In case of error, reason is reported through error_report.
+ */
+VhostShadowVirtqueue *vhost_svq_new(void)
+{
+    g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
+    int r;
+
+    r = event_notifier_init(&svq->hdev_kick, 0);
+    if (r != 0) {
+        error_report("Couldn't create kick event notifier: %s (%d)",
+                     g_strerror(errno), errno);
+        goto err_init_hdev_kick;
+    }
+
+    r = event_notifier_init(&svq->hdev_call, 0);
+    if (r != 0) {
+        error_report("Couldn't create call event notifier: %s (%d)",
+                     g_strerror(errno), errno);
+        goto err_init_hdev_call;
+    }
+
+    return g_steal_pointer(&svq);
+
+err_init_hdev_call:
+    event_notifier_cleanup(&svq->hdev_kick);
+
+err_init_hdev_kick:
+    return NULL;
+}
+
+/**
+ * Free the resources of the shadow virtqueue.
+ *
+ * @pvq: gpointer to SVQ so it can be used by autofree functions.
+ */
+void vhost_svq_free(gpointer pvq)
+{
+    VhostShadowVirtqueue *vq = pvq;
+    event_notifier_cleanup(&vq->hdev_kick);
+    event_notifier_cleanup(&vq->hdev_call);
+    g_free(vq);
+}
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * vhost shadow virtqueue
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef VHOST_SHADOW_VIRTQUEUE_H
+#define VHOST_SHADOW_VIRTQUEUE_H
+
+#include "qemu/event_notifier.h"
+
+/* Shadow virtqueue to relay notifications */
+typedef struct VhostShadowVirtqueue {
+    /* Shadow kick notifier, sent to vhost */
+    EventNotifier hdev_kick;
+    /* Shadow call notifier, sent to vhost */
+    EventNotifier hdev_call;
+} VhostShadowVirtqueue;
+
+VhostShadowVirtqueue *vhost_svq_new(void);
+
+void vhost_svq_free(gpointer vq);
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
+
+#endif
--
2.7.4
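
For context, the API above is built around GLib's automatic cleanup:
G_DEFINE_AUTOPTR_CLEANUP_FUNC ties vhost_svq_free() to g_autoptr() scope
exit, so error paths need no explicit teardown. A hedged usage sketch, in the
spirit of how later patches in the series build the shadow-vq array (the
function make_one_svq() is hypothetical, not part of the series):

#include "qemu/osdep.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"

static VhostShadowVirtqueue *make_one_svq(void)
{
    g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();

    if (!svq) {
        /* vhost_svq_new() already reported the reason via error_report() */
        return NULL;
    }

    /* further setup could fail here; svq would then be freed automatically */

    return g_steal_pointer(&svq); /* transfer ownership to the caller */
}
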
From: Eugenio Pérez <eperezma@redhat.com>

In this mode no buffer forwarding is performed in SVQ mode: QEMU just
forwards the guest's kicks to the device.

Host memory notifier regions are left out for simplicity; they will
not be addressed in this series.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c |  56 ++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  14 ++++
 hw/virtio/vhost-vdpa.c             | 145 ++++++++++++++++++++++++++++++++++++-
 include/hw/virtio/vhost-vdpa.h     |   4 +
 4 files changed, 217 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/virtio/vhost-shadow-virtqueue.h"

 #include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "linux-headers/linux/vhost.h"
+
+/**
+ * Forward guest notifications.
+ *
+ * @n: guest kick event notifier, the one that guest set to notify svq.
+ */
+static void vhost_handle_guest_kick(EventNotifier *n)
+{
+    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
+                                             svq_kick);
+    event_notifier_test_and_clear(n);
+    event_notifier_set(&svq->hdev_kick);
+}
+
+/**
+ * Set a new file descriptor for the guest to kick the SVQ and notify for avail
+ *
+ * @svq: The svq
+ * @svq_kick_fd: The svq kick fd
+ *
+ * Note that the SVQ will never close the old file descriptor.
+ */
+void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
+{
+    EventNotifier *svq_kick = &svq->svq_kick;
+    bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
+    bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;
+
+    if (poll_stop) {
+        event_notifier_set_handler(svq_kick, NULL);
+    }
+
+    /*
+     * event_notifier_set_handler already checks for guest's notifications if
+     * they arrive at the new file descriptor in the switch, so there is no
+     * need to explicitly check for them.
+     */
+    if (poll_start) {
+        event_notifier_init_fd(svq_kick, svq_kick_fd);
+        event_notifier_set(svq_kick);
+        event_notifier_set_handler(svq_kick, vhost_handle_guest_kick);
+    }
+}
+
+/**
+ * Stop the shadow virtqueue operation.
+ * @svq: Shadow Virtqueue
+ */
+void vhost_svq_stop(VhostShadowVirtqueue *svq)
+{
+    event_notifier_set_handler(&svq->svq_kick, NULL);
+}

 /**
  * Creates vhost shadow virtqueue, and instructs the vhost device to use the
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
         goto err_init_hdev_call;
     }

+    event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
     return g_steal_pointer(&svq);

 err_init_hdev_call:
@@ -XXX,XX +XXX,XX @@ err_init_hdev_kick:
 void vhost_svq_free(gpointer pvq)
 {
     VhostShadowVirtqueue *vq = pvq;
+    vhost_svq_stop(vq);
     event_notifier_cleanup(&vq->hdev_kick);
     event_notifier_cleanup(&vq->hdev_call);
     g_free(vq);
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
     EventNotifier hdev_kick;
     /* Shadow call notifier, sent to vhost */
     EventNotifier hdev_call;
+
+    /*
+     * Borrowed virtqueue's guest to host notifier. To borrow it in this event
+     * notifier allows to recover the VhostShadowVirtqueue from the event loop
+     * easily. If we use the VirtQueue's one, we don't have an easy way to
+     * retrieve VhostShadowVirtqueue.
+     *
+     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
+     */
+    EventNotifier svq_kick;
 } VhostShadowVirtqueue;

+void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
+
+void vhost_svq_stop(VhostShadowVirtqueue *svq);
+
 VhostShadowVirtqueue *vhost_svq_new(void);

 void vhost_svq_free(gpointer vq);
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/virtio/vhost.h"
 #include "hw/virtio/vhost-backend.h"
 #include "hw/virtio/virtio-net.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
 #include "hw/virtio/vhost-vdpa.h"
 #include "exec/address-spaces.h"
 #include "qemu/main-loop.h"
 #include "cpu.h"
 #include "trace.h"
 #include "qemu-common.h"
+#include "qapi/error.h"

 /*
  * Return one past the end of the end of section. Be careful with uint64_t
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
     return v->index != 0;
 }

+static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
+                               Error **errp)
+{
+    g_autoptr(GPtrArray) shadow_vqs = NULL;
+
+    if (!v->shadow_vqs_enabled) {
+        return 0;
+    }
+
+    shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
+    for (unsigned n = 0; n < hdev->nvqs; ++n) {
+        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
+
+        if (unlikely(!svq)) {
+            error_setg(errp, "Cannot create svq %u", n);
+            return -1;
+        }
+        g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
+    }
+
+    v->shadow_vqs = g_steal_pointer(&shadow_vqs);
+    return 0;
+}
+
 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
 {
     struct vhost_vdpa *v;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
     dev->opaque =  opaque ;
     v->listener = vhost_vdpa_memory_listener;
     v->msg_type = VHOST_IOTLB_MSG_V2;
+    ret = vhost_vdpa_init_svq(dev, v, errp);
+    if (ret) {
+        goto err;
+    }

     vhost_vdpa_get_iova_range(v);

@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
                                VIRTIO_CONFIG_S_DRIVER);

     return 0;
+
+err:
+    ram_block_discard_disable(false);
+    return ret;
 }

 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
@@ -XXX,XX +XXX,XX @@ err:

 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int i;

+    if (v->shadow_vqs_enabled) {
+        /* FIXME SVQ is not compatible with host notifiers mr */
+        return;
+    }
+
     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
         if (vhost_vdpa_host_notifier_init(dev, i)) {
             goto err;
@@ -XXX,XX +XXX,XX @@ err:
     return;
 }

+static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    size_t idx;
+
+    if (!v->shadow_vqs) {
+        return;
+    }
+
+    for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
+        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
+    }
+    g_ptr_array_free(v->shadow_vqs, true);
+}
+
 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
 {
     struct vhost_vdpa *v;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_cleanup(struct vhost_dev *dev)
     trace_vhost_vdpa_cleanup(dev, v);
     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
     memory_listener_unregister(&v->listener);
+    vhost_vdpa_svq_cleanup(dev);

     dev->opaque = NULL;
     ram_block_discard_disable(false);
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
     return ret;
 }

+static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
+{
+    if (!v->shadow_vqs_enabled) {
+        return;
+    }
+
+    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+        vhost_svq_stop(svq);
+    }
+}
+
 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int ret;
     uint8_t status = 0;

+    vhost_vdpa_reset_svq(v);
+
     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
     trace_vhost_vdpa_reset_device(dev, status);
     return ret;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
     return ret;
 }

+static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
+                                         struct vhost_vring_file *file)
+{
+    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
+}
+
+/**
+ * Set the shadow virtqueue descriptors to the device
+ *
+ * @dev: The vhost device model
+ * @svq: The shadow virtqueue
+ * @idx: The index of the virtqueue in the vhost device
+ * @errp: Error
+ */
+static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
+                                 VhostShadowVirtqueue *svq,
+                                 unsigned idx,
+                                 Error **errp)
+{
+    struct vhost_vring_file file = {
+        .index = dev->vq_index + idx,
+    };
+    const EventNotifier *event_notifier = &svq->hdev_kick;
+    int r;
+
+    file.fd = event_notifier_get_fd(event_notifier);
+    r = vhost_vdpa_set_vring_dev_kick(dev, &file);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Can't set device kick fd");
+    }
+
+    return r == 0;
+}
+
+static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    Error *err = NULL;
+    unsigned i;
+
+    if (!v->shadow_vqs) {
+        return true;
+    }
+
+    for (i = 0; i < v->shadow_vqs->len; ++i) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+        bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
+        if (unlikely(!ok)) {
+            error_reportf_err(err, "Cannot setup SVQ %u: ", i);
+            return false;
+        }
+    }
+
+    return true;
+}
+
 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
 {
     struct vhost_vdpa *v = dev->opaque;
+    bool ok;
     trace_vhost_vdpa_dev_start(dev, started);

     if (started) {
         vhost_vdpa_host_notifiers_init(dev);
+        ok = vhost_vdpa_svqs_start(dev);
+        if (unlikely(!ok)) {
+            return -1;
+        }
         vhost_vdpa_set_vring_ready(dev);
     } else {
         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                      struct vhost_vring_file *file)
 {
-    trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
+    struct vhost_vdpa *v = dev->opaque;
+    int vdpa_idx = file->index - dev->vq_index;
+
+    if (v->shadow_vqs_enabled) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
+        vhost_svq_set_svq_kick_fd(svq, file->fd);
+        return 0;
+    } else {
+        return vhost_vdpa_set_vring_dev_kick(dev, file);
+    }
 }

 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@
 #ifndef HW_VIRTIO_VHOST_VDPA_H
 #define HW_VIRTIO_VHOST_VDPA_H

+#include <gmodule.h>
+
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"

@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
     bool iotlb_batch_begin_sent;
     MemoryListener listener;
     struct vhost_vdpa_iova_range iova_range;
+    bool shadow_vqs_enabled;
+    GPtrArray *shadow_vqs;
     struct vhost_dev *dev;
     VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
 } VhostVDPA;
--
2.7.4
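
The core of this patch is a plain eventfd relay: consume the guest's kick,
then signal the device's. A standalone sketch with Linux eventfds standing in
for QEMU's EventNotifier (illustrative only; error checking omitted):

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
    /* guest-facing and device-facing notifiers, as svq_kick / hdev_kick */
    int svq_kick = eventfd(0, 0);
    int hdev_kick = eventfd(0, 0);
    uint64_t cnt, one = 1;

    write(svq_kick, &one, sizeof(one));   /* the guest kicks */

    /* the relay, as in vhost_handle_guest_kick(): */
    read(svq_kick, &cnt, sizeof(cnt));    /* event_notifier_test_and_clear() */
    write(hdev_kick, &one, sizeof(one));  /* event_notifier_set(&svq->hdev_kick) */

    read(hdev_kick, &cnt, sizeof(cnt));   /* the device would consume this */
    printf("device received %llu kick(s)\n", (unsigned long long)cnt);

    close(svq_kick);
    close(hdev_kick);
    return 0;
}
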
From: Eugenio Pérez <eperezma@redhat.com>

This makes qemu aware of the buffers the device uses, allowing it to
write guest memory with their contents if needed.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 38 ++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  4 ++++
 hw/virtio/vhost-vdpa.c             | 31 +++++++++++++++++++++++++++++--
 3 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(EventNotifier *n)
 }

 /**
+ * Forward vhost notifications
+ *
+ * @n: hdev call event notifier, the one that device set to notify svq.
+ */
+static void vhost_svq_handle_call(EventNotifier *n)
+{
+    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
+                                             hdev_call);
+    event_notifier_test_and_clear(n);
+    event_notifier_set(&svq->svq_call);
+}
+
+/**
+ * Set the call notifier for the SVQ to call the guest
+ *
+ * @svq: Shadow virtqueue
+ * @call_fd: call notifier
+ *
+ * Called on BQL context.
+ */
+void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
+{
+    if (call_fd == VHOST_FILE_UNBIND) {
+        /*
+         * Fail event_notifier_set if called handling device call.
+         *
+         * SVQ still needs device notifications, since it needs to keep
+         * forwarding used buffers even with the unbind.
+         */
+        memset(&svq->svq_call, 0, sizeof(svq->svq_call));
+    } else {
+        event_notifier_init_fd(&svq->svq_call, call_fd);
+    }
+}
+
+/**
  * Set a new file descriptor for the guest to kick the SVQ and notify for avail
  *
  * @svq: The svq
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
     }

     event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
+    event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
     return g_steal_pointer(&svq);

 err_init_hdev_call:
@@ -XXX,XX +XXX,XX @@ void vhost_svq_free(gpointer pvq)
     VhostShadowVirtqueue *vq = pvq;
     vhost_svq_stop(vq);
     event_notifier_cleanup(&vq->hdev_kick);
+    event_notifier_set_handler(&vq->hdev_call, NULL);
     event_notifier_cleanup(&vq->hdev_call);
     g_free(vq);
 }
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
      * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
      */
     EventNotifier svq_kick;
+
+    /* Guest's call notifier, where the SVQ calls guest. */
+    EventNotifier svq_call;
 } VhostShadowVirtqueue;

 void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
+void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);

 void vhost_svq_stop(VhostShadowVirtqueue *svq);

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
 }

+static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
+                                         struct vhost_vring_file *file)
+{
+    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
+}
+
 /**
  * Set the shadow virtqueue descriptors to the device
  *
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
  * @svq: The shadow virtqueue
  * @idx: The index of the virtqueue in the vhost device
  * @errp: Error
+ *
+ * Note that this function does not rewind kick file descriptor if cannot set
+ * call one.
  */
 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq,
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
     if (unlikely(r != 0)) {
         error_setg_errno(errp, -r, "Can't set device kick fd");
+        return false;
+    }
+
+    event_notifier = &svq->hdev_call;
+    file.fd = event_notifier_get_fd(event_notifier);
+    r = vhost_vdpa_set_vring_dev_call(dev, &file);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Can't set device call fd");
     }

     return r == 0;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                      struct vhost_vring_file *file)
 {
-    trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (v->shadow_vqs_enabled) {
+        int vdpa_idx = file->index - dev->vq_index;
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
+
+        vhost_svq_set_svq_call_fd(svq, file->fd);
+        return 0;
+    } else {
+        return vhost_vdpa_set_vring_dev_call(dev, file);
+    }
 }

 static int vhost_vdpa_get_features(struct vhost_dev *dev,
--
2.7.4
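
The VHOST_FILE_UNBIND handling above relies on a zeroed EventNotifier
refusing event_notifier_set() while SVQ keeps consuming device calls. A
self-contained model of that guard (the field names mirror EventNotifier,
but this is an illustrative stand-in, not QEMU code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

struct notifier { int wfd; bool initialized; };

static int notifier_set(struct notifier *n)
{
    uint64_t one = 1;

    if (!n->initialized) {
        return -1;                   /* the "fail event_notifier_set" case */
    }
    return write(n->wfd, &one, sizeof(one)) == sizeof(one) ? 0 : -1;
}

int main(void)
{
    struct notifier svq_call;

    memset(&svq_call, 0, sizeof(svq_call));      /* the VHOST_FILE_UNBIND path */
    printf("call while unbound: %d\n", notifier_set(&svq_call)); /* -1, no-op */
    return 0;
}
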
From: Eugenio Pérez <eperezma@redhat.com>

This allows SVQ to negotiate features with the guest and the device. For
the device, SVQ is a driver. While this function bypasses all
non-transport features, it needs to disable the features that SVQ does
not support when forwarding buffers. This includes the packed vq layout,
indirect descriptors, and event idx.

Future changes can add support to offer more features to the guest,
since the use of VirtQueue gives this for free. This is left out at the
moment for simplicity.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 44 ++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  2 ++
 hw/virtio/vhost-vdpa.c             | 15 +++++++++++++
 3 files changed, 61 insertions(+)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/virtio/vhost-shadow-virtqueue.h"

 #include "qemu/error-report.h"
+#include "qapi/error.h"
 #include "qemu/main-loop.h"
 #include "linux-headers/linux/vhost.h"

 /**
+ * Validate the transport device features that both guests can use with the SVQ
+ * and SVQs can use with the device.
+ *
+ * @dev_features: The features
+ * @errp: Error pointer
+ */
+bool vhost_svq_valid_features(uint64_t features, Error **errp)
+{
+    bool ok = true;
+    uint64_t svq_features = features;
+
+    for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END;
+         ++b) {
+        switch (b) {
+        case VIRTIO_F_ANY_LAYOUT:
+            continue;
+
+        case VIRTIO_F_ACCESS_PLATFORM:
+            /* SVQ trust in the host's IOMMU to translate addresses */
+        case VIRTIO_F_VERSION_1:
+            /* SVQ trust that the guest vring is little endian */
+            if (!(svq_features & BIT_ULL(b))) {
+                set_bit(b, &svq_features);
+                ok = false;
+            }
+            continue;
+
+        default:
+            if (svq_features & BIT_ULL(b)) {
+                clear_bit(b, &svq_features);
+                ok = false;
+            }
+        }
+    }
+
+    if (!ok) {
+        error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64
+                   ", ok: 0x%"PRIx64, features, svq_features);
+    }
+    return ok;
+}
+
+/**
  * Forward guest notifications.
  *
  * @n: guest kick event notifier, the one that guest set to notify svq.
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
     EventNotifier svq_call;
 } VhostShadowVirtqueue;

+bool vhost_svq_valid_features(uint64_t features, Error **errp);
+
 void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
 void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                                Error **errp)
 {
     g_autoptr(GPtrArray) shadow_vqs = NULL;
+    uint64_t dev_features, svq_features;
+    int r;
+    bool ok;

     if (!v->shadow_vqs_enabled) {
         return 0;
     }

+    r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features);
+    if (r != 0) {
+        error_setg_errno(errp, -r, "Can't get vdpa device features");
+        return r;
+    }
+
+    svq_features = dev_features;
+    ok = vhost_svq_valid_features(svq_features, errp);
+    if (unlikely(!ok)) {
+        return -1;
+    }
+
     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
     for (unsigned n = 0; n < hdev->nvqs; ++n) {
         g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
--
2.7.4
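
A worked example of the filtering rule above: the required transport bits
(VIRTIO_F_VERSION_1, VIRTIO_F_ACCESS_PLATFORM) must be offered, and anything
else in the transport range (e.g. indirect descriptors, event idx, packed
ring) is stripped. Bit numbers follow the virtio spec; the helper below
mirrors the logic of vhost_svq_valid_features() but is a simplified sketch,
not the series' code:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VIRTIO_TRANSPORT_F_START 28   /* includes indirect desc (28), event idx (29) */
#define VIRTIO_TRANSPORT_F_END   38   /* range as of this series */
#define VIRTIO_F_VERSION_1       32
#define VIRTIO_F_ACCESS_PLATFORM 33
#define VIRTIO_F_RING_PACKED     34

static bool svq_valid_features(uint64_t features, uint64_t *fixed)
{
    bool ok = true;

    *fixed = features;
    for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END; ++b) {
        uint64_t bit = UINT64_C(1) << b;

        if (b == VIRTIO_F_VERSION_1 || b == VIRTIO_F_ACCESS_PLATFORM) {
            if (!(features & bit)) {        /* required, but not offered */
                *fixed |= bit;
                ok = false;
            }
        } else if (features & bit) {        /* offered, but unsupported by SVQ */
            *fixed &= ~bit;
            ok = false;
        }
    }
    return ok;
}

int main(void)
{
    uint64_t offered = (UINT64_C(1) << VIRTIO_F_VERSION_1) |
                       (UINT64_C(1) << VIRTIO_F_RING_PACKED);
    uint64_t fixed;

    /* invalid: packed ring stripped, ACCESS_PLATFORM flagged as required */
    printf("valid: %d, fixed: 0x%llx\n", svq_valid_features(offered, &fixed),
           (unsigned long long)fixed);
    return 0;
}
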
From: Eugenio Pérez <eperezma@redhat.com>

This reports the shadow virtqueue addresses in qemu's virtual address
space.

Since these differ from the guest's vaddr, but the device can access
them, SVQ takes special care with their alignment and with avoiding
garbage data. It assumes that the IOMMU works in host_page_size ranges
for that.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.c | 29 +++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  9 +++++++++
 2 files changed, 38 insertions(+)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
 }

 /**
+ * Get the shadow vq vring address.
+ * @svq: Shadow virtqueue
+ * @addr: Destination to store address
+ */
+void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
+                              struct vhost_vring_addr *addr)
+{
+    addr->desc_user_addr = (uint64_t)svq->vring.desc;
+    addr->avail_user_addr = (uint64_t)svq->vring.avail;
+    addr->used_user_addr = (uint64_t)svq->vring.used;
+}
+
+size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
+{
+    size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
+    size_t avail_size = offsetof(vring_avail_t, ring) +
+                                             sizeof(uint16_t) * svq->vring.num;
+
+    return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size);
+}
+
+size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
+{
+    size_t used_size = offsetof(vring_used_t, ring) +
+                                    sizeof(vring_used_elem_t) * svq->vring.num;
+    return ROUND_UP(used_size, qemu_real_host_page_size);
+}
+
+/**
  * Set a new file descriptor for the guest to kick the SVQ and notify for avail
  *
  * @svq: The svq
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
 #define VHOST_SHADOW_VIRTQUEUE_H

 #include "qemu/event_notifier.h"
+#include "hw/virtio/virtio.h"
+#include "standard-headers/linux/vhost_types.h"

 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
+    /* Shadow vring */
+    struct vring vring;
+
     /* Shadow kick notifier, sent to vhost */
     EventNotifier hdev_kick;
     /* Shadow call notifier, sent to vhost */
@@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp);

 void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
 void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
+void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
+                              struct vhost_vring_addr *addr);
+size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
+size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);

 void vhost_svq_stop(VhostShadowVirtqueue *svq);

--
2.7.4
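
As a worked example of the two size helpers above, take a 256-entry split
ring on a 4 KiB-page host (16-byte descriptors, 2-byte avail ring entries,
8-byte used ring entries, and 4-byte flags+idx headers, per the virtio spec):

#include <stdint.h>
#include <stdio.h>

#define PAGE 4096
#define ROUND_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
    unsigned num = 256;
    size_t desc_size = 16 * num;       /* descriptor table: 4096 bytes */
    size_t avail_size = 4 + 2 * num;   /* flags+idx header + ring: 516 bytes */
    size_t used_size = 4 + 8 * num;    /* flags+idx header + ring: 2052 bytes */

    /* driver area (desc + avail) rounds 4612 up to 8192 */
    printf("driver area: %zu\n", ROUND_UP(desc_size + avail_size, PAGE));
    /* device area (used) rounds 2052 up to 4096 */
    printf("device area: %zu\n", ROUND_UP(used_size, PAGE));
    return 0;
}

The page rounding is what lets SVQ hand the device page-aligned,
garbage-free regions that the host IOMMU can map in host_page_size ranges.
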
From: Eugenio Pérez <eperezma@redhat.com>

First half of the buffer forwarding part, preparing the vhost-vdpa
callbacks so SVQ can offer it. QEMU cannot enable it yet, so this is
effectively dead code for now, but it helps to reduce patch size.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-vdpa.c | 48 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
     return ret;
 }

+static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
+                                         struct vhost_vring_state *ring)
+{
+    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
+}
+
 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                          struct vhost_vring_file *file)
 {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
 }

+static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
+                                         struct vhost_vring_addr *addr)
+{
+    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
+                                    addr->desc_user_addr, addr->used_user_addr,
+                                    addr->avail_user_addr,
+                                    addr->log_guest_addr);
+
+    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
+
+}
+
 /**
  * Set the shadow virtqueue descriptors to the device
  *
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                        struct vhost_vring_addr *addr)
 {
-    trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
-                                    addr->desc_user_addr, addr->used_user_addr,
-                                    addr->avail_user_addr,
-                                    addr->log_guest_addr);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (v->shadow_vqs_enabled) {
+        /*
+         * Device vring addr was set at device start. SVQ base is handled by
+         * VirtQueue code.
+         */
+        return 0;
+    }
+
+    return vhost_vdpa_set_vring_dev_addr(dev, addr);
 }

 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                      struct vhost_vring_state *ring)
 {
-    trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
-    return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (v->shadow_vqs_enabled) {
+        /*
+         * Device vring base was set at device start. SVQ base is handled by
+         * VirtQueue code.
+         */
+        return 0;
+    }
+
+    return vhost_vdpa_set_dev_vring_base(dev, ring);
 }

 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
--
2.7.4
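
The shape of this patch, reduced to its essence: each vhost_ops callback
grows a front end that either swallows the request (because SVQ owns the
real device state, programmed at start) or falls through to the device.
A minimal sketch with hypothetical stand-in types (the real callbacks live
in hw/virtio/vhost-vdpa.c):

#include <stdbool.h>

struct dev { bool shadow_vqs_enabled; };

static int dev_set_vring_base(struct dev *d, unsigned idx, unsigned num)
{
    /* would talk to the real device, e.g. a VHOST_SET_VRING_BASE ioctl */
    return 0;
}

static int set_vring_base(struct dev *d, unsigned idx, unsigned num)
{
    if (d->shadow_vqs_enabled) {
        /* Device vring state was programmed at device start; the
         * guest-visible base is tracked by the VirtQueue code. */
        return 0;
    }
    return dev_set_vring_base(d, idx, num);
}
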
1
From: Thomas Huth <thuth@redhat.com>
1
From: Eugenio Pérez <eperezma@redhat.com>
2
2
3
QEMU can emulate hubs to connect NICs and netdevs. This is currently
3
Initial version of shadow virtqueue that actually forward buffers. There
4
primarily used for the mis-named 'vlan' feature of the networking
4
is no iommu support at the moment, and that will be addressed in future
5
subsystem. Now the 'vlan' feature has been marked as deprecated, since
5
patches of this series. Since all vhost-vdpa devices use forced IOMMU,
6
its name is rather confusing and the users often rather mis-configure
6
this means that SVQ is not usable at this point of the series on any
7
their network when trying to use it. But while the 'vlan' parameter
7
device.
8
should be removed at one point in time, the basic idea of emulating
8
9
a hub in QEMU is still good: It's useful for bundling up the output of
9
For simplicity it only supports modern devices, that expects vring
10
multiple NICs into one single l2tp netdev for example.
10
in little endian, with split ring and no event idx or indirect
11
11
descriptors. Support for them will not be added in this series.
12
Now to be able to use the hubport feature without 'vlan's, there is one
12
13
missing piece: The possibility to connect a hubport to a netdev, too.
13
It reuses the VirtQueue code for the device part. The driver part is
14
This patch adds this possibility by introducing a new "netdev=..."
14
based on Linux's virtio_ring driver, but with stripped functionality
15
parameter to the hubports.
15
and optimizations so it's easier to review.
16
16
17
To bundle up the output of multiple NICs into one socket netdev, you can
17
However, forwarding buffers have some particular pieces: One of the most
18
now run QEMU with these parameters for example:
18
unexpected ones is that a guest's buffer can expand through more than
19
19
one descriptor in SVQ. While this is handled gracefully by qemu's
20
qemu-system-ppc64 ... -netdev socket,id=s1,connect=:11122 \
20
emulated virtio devices, it may cause unexpected SVQ queue full. This
21
-netdev hubport,hubid=1,id=h1,netdev=s1 \
21
patch also solves it by checking for this condition at both guest's
22
-netdev hubport,hubid=1,id=h2 -device e1000,netdev=h2 \
22
kicks and device's calls. The code may be more elegant in the future if
23
-netdev hubport,hubid=1,id=h3 -device virtio-net-pci,netdev=h3
23
SVQ code runs in its own iocontext.
24
24
25
For using the socket netdev, you have got to start another QEMU as the
25
Acked-by: Michael S. Tsirkin <mst@redhat.com>
26
receiving side first, for example with network dumping enabled:
26
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
27
28
qemu-system-x86_64 -M isapc -netdev socket,id=s0,listen=:11122 \
29
-device ne2k_isa,netdev=s0 \
30
-object filter-dump,id=f1,netdev=s0,file=/tmp/dump.dat
31
32
After the ppc64 guest tried to boot from both NICs, you can see in the
33
dump file (using Wireshark, for example), that the output of both NICs
34
(the e1000 and the virtio-net-pci) has been successfully transfered
35
via the socket netdev in this case.
36
37
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
38
Signed-off-by: Thomas Huth <thuth@redhat.com>
39
Signed-off-by: Jason Wang <jasowang@redhat.com>
27
Signed-off-by: Jason Wang <jasowang@redhat.com>
40
---
28
---
41
net/hub.c | 27 +++++++++++++++++++++------
29
hw/virtio/vhost-shadow-virtqueue.c | 353 ++++++++++++++++++++++++++++++++++++-
42
net/hub.h | 3 ++-
30
hw/virtio/vhost-shadow-virtqueue.h | 26 +++
43
net/net.c | 2 +-
31
hw/virtio/vhost-vdpa.c | 159 ++++++++++++++++-
44
qapi/net.json | 4 +++-
32
3 files changed, 526 insertions(+), 12 deletions(-)
45
qemu-options.hx | 8 +++++---
33
46
5 files changed, 32 insertions(+), 12 deletions(-)
34
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
47
48
diff --git a/net/hub.c b/net/hub.c
49
index XXXXXXX..XXXXXXX 100644
35
index XXXXXXX..XXXXXXX 100644
50
--- a/net/hub.c
36
--- a/hw/virtio/vhost-shadow-virtqueue.c
51
+++ b/net/hub.c
37
+++ b/hw/virtio/vhost-shadow-virtqueue.c
52
@@ -XXX,XX +XXX,XX @@
38
@@ -XXX,XX +XXX,XX @@
39
#include "qemu/error-report.h"
40
#include "qapi/error.h"
41
#include "qemu/main-loop.h"
42
+#include "qemu/log.h"
43
#include "linux-headers/linux/vhost.h"
44
45
/**
46
@@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp)
47
}
48
49
/**
50
- * Forward guest notifications.
51
+ * Number of descriptors that the SVQ can make available from the guest.
52
+ *
53
+ * @svq: The svq
54
+ */
55
+static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
56
+{
57
+ return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
58
+}
59
+
60
+static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
61
+ const struct iovec *iovec,
62
+ size_t num, bool more_descs, bool write)
63
+{
64
+ uint16_t i = svq->free_head, last = svq->free_head;
65
+ unsigned n;
66
+ uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
67
+ vring_desc_t *descs = svq->vring.desc;
68
+
69
+ if (num == 0) {
70
+ return;
71
+ }
72
+
73
+ for (n = 0; n < num; n++) {
74
+ if (more_descs || (n + 1 < num)) {
75
+ descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
76
+ } else {
77
+ descs[i].flags = flags;
78
+ }
79
+ descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
80
+ descs[i].len = cpu_to_le32(iovec[n].iov_len);
81
+
82
+ last = i;
83
+ i = cpu_to_le16(descs[i].next);
84
+ }
85
+
86
+ svq->free_head = le16_to_cpu(descs[last].next);
87
+}
88
+
89
+static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
90
+ VirtQueueElement *elem,
91
+ unsigned *head)
92
+{
93
+ unsigned avail_idx;
94
+ vring_avail_t *avail = svq->vring.avail;
95
+
96
+ *head = svq->free_head;
97
+
98
+ /* We need some descriptors here */
99
+ if (unlikely(!elem->out_num && !elem->in_num)) {
100
+ qemu_log_mask(LOG_GUEST_ERROR,
101
+ "Guest provided element with no descriptors");
102
+ return false;
103
+ }
104
+
105
+ vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
106
+ elem->in_num > 0, false);
107
+ vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
108
+
109
+ /*
110
+ * Put the entry in the available array (but don't update avail->idx until
111
+ * they do sync).
112
+ */
113
+ avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
114
+ avail->ring[avail_idx] = cpu_to_le16(*head);
115
+ svq->shadow_avail_idx++;
116
+
117
+ /* Update the avail index after write the descriptor */
118
+ smp_wmb();
119
+ avail->idx = cpu_to_le16(svq->shadow_avail_idx);
120
+
121
+ return true;
122
+}
123
+
124
+static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
125
+{
126
+ unsigned qemu_head;
127
+ bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
128
+ if (unlikely(!ok)) {
129
+ return false;
130
+ }
131
+
132
+ svq->ring_id_maps[qemu_head] = elem;
133
+ return true;
134
+}
135
+
136
+static void vhost_svq_kick(VhostShadowVirtqueue *svq)
137
+{
138
+ /*
139
+ * We need to expose the available array entries before checking the used
140
+ * flags
141
+ */
142
+ smp_mb();
143
+ if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
144
+ return;
145
+ }
146
+
147
+ event_notifier_set(&svq->hdev_kick);
148
+}
149
+
150
+/**
151
+ * Forward available buffers.
152
+ *
153
+ * @svq: Shadow VirtQueue
154
+ *
155
+ * Note that this function does not guarantee that all guest's available
156
+ * buffers are available to the device in SVQ avail ring. The guest may have
157
+ * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in
158
+ * qemu vaddr.
159
+ *
160
+ * If that happens, guest's kick notifications will be disabled until the
161
+ * device uses some buffers.
162
+ */
163
+static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
164
+{
165
+ /* Clear event notifier */
166
+ event_notifier_test_and_clear(&svq->svq_kick);
167
+
168
+ /* Forward to the device as many available buffers as possible */
169
+ do {
170
+ virtio_queue_set_notification(svq->vq, false);
171
+
172
+ while (true) {
173
+ VirtQueueElement *elem;
174
+ bool ok;
175
+
176
+ if (svq->next_guest_avail_elem) {
177
+ elem = g_steal_pointer(&svq->next_guest_avail_elem);
178
+ } else {
179
+ elem = virtqueue_pop(svq->vq, sizeof(*elem));
180
+ }
181
+
182
+ if (!elem) {
183
+ break;
184
+ }
185
+
186
+ if (elem->out_num + elem->in_num >
187
+ vhost_svq_available_slots(svq)) {
188
+ /*
189
+ * This condition is possible since a contiguous buffer in GPA
190
+ * does not imply a contiguous buffer in qemu's VA
191
+ * scatter-gather segments. If that happens, the buffer exposed
192
+ * to the device needs to be a chain of descriptors at this
193
+ * moment.
194
+ *
195
+ * SVQ cannot hold more available buffers if we are here:
196
+ * queue the current guest descriptor and ignore further kicks
197
+ * until some elements are used.
198
+ */
199
+ svq->next_guest_avail_elem = elem;
200
+ return;
201
+ }
202
+
203
+ ok = vhost_svq_add(svq, elem);
204
+ if (unlikely(!ok)) {
205
+ /* VQ is broken, just return and ignore any other kicks */
206
+ return;
207
+ }
208
+ vhost_svq_kick(svq);
209
+ }
210
+
211
+ virtio_queue_set_notification(svq->vq, true);
212
+ } while (!virtio_queue_empty(svq->vq));
213
+}
214
+
215
+/**
216
+ * Handle guest's kick.
217
*
218
* @n: guest kick event notifier, the one that guest set to notify svq.
53
*/
219
*/
54
220
-static void vhost_handle_guest_kick(EventNotifier *n)
55
#include "qemu/osdep.h"
221
+static void vhost_handle_guest_kick_notifier(EventNotifier *n)
56
+#include "qapi/error.h"
57
#include "monitor/monitor.h"
58
#include "net/net.h"
59
#include "clients.h"
60
@@ -XXX,XX +XXX,XX @@ static NetClientInfo net_hub_port_info = {
61
.cleanup = net_hub_port_cleanup,
62
};
63
64
-static NetHubPort *net_hub_port_new(NetHub *hub, const char *name)
65
+static NetHubPort *net_hub_port_new(NetHub *hub, const char *name,
66
+ NetClientState *hubpeer)
67
{
222
{
68
NetClientState *nc;
223
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
69
NetHubPort *port;
224
svq_kick);
70
@@ -XXX,XX +XXX,XX @@ static NetHubPort *net_hub_port_new(NetHub *hub, const char *name)
225
event_notifier_test_and_clear(n);
71
name = default_name;
226
- event_notifier_set(&svq->hdev_kick);
227
+ vhost_handle_guest_kick(svq);
228
+}
229
+
230
+static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
231
+{
232
+ if (svq->last_used_idx != svq->shadow_used_idx) {
233
+ return true;
234
+ }
235
+
236
+ svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx);
237
+
238
+ return svq->last_used_idx != svq->shadow_used_idx;
239
}
240
241
/**
242
- * Forward vhost notifications
243
+ * Enable vhost device calls after disable them.
244
+ *
245
+ * @svq: The svq
246
+ *
247
+ * It returns false if there are pending used buffers from the vhost device,
248
+ * avoiding the possible races between SVQ checking for more work and enabling
249
+ * callbacks. True if SVQ used vring has no more pending buffers.
250
+ */
251
+static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
252
+{
253
+ svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
254
+ /* Make sure the flag is written before the read of used_idx */
255
+ smp_mb();
256
+ return !vhost_svq_more_used(svq);
257
+}
258
+
259
+static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
260
+{
261
+ svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
262
+}
263
+
264
+static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
265
+ uint32_t *len)
266
+{
267
+ vring_desc_t *descs = svq->vring.desc;
268
+ const vring_used_t *used = svq->vring.used;
269
+ vring_used_elem_t used_elem;
270
+ uint16_t last_used;
271
+
272
+ if (!vhost_svq_more_used(svq)) {
273
+ return NULL;
274
+ }
275
+
276
+ /* Only get used array entries after they have been exposed by dev */
277
+ smp_rmb();
278
+ last_used = svq->last_used_idx & (svq->vring.num - 1);
279
+ used_elem.id = le32_to_cpu(used->ring[last_used].id);
280
+ used_elem.len = le32_to_cpu(used->ring[last_used].len);
281
+
282
+ svq->last_used_idx++;
283
+ if (unlikely(used_elem.id >= svq->vring.num)) {
284
+ qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
285
+ svq->vdev->name, used_elem.id);
286
+ return NULL;
287
+ }
288
+
289
+ if (unlikely(!svq->ring_id_maps[used_elem.id])) {
290
+ qemu_log_mask(LOG_GUEST_ERROR,
291
+ "Device %s says index %u is used, but it was not available",
292
+ svq->vdev->name, used_elem.id);
293
+ return NULL;
294
+ }
295
+
296
+ descs[used_elem.id].next = svq->free_head;
297
+ svq->free_head = used_elem.id;
298
+
299
+ *len = used_elem.len;
300
+ return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
301
+}
302
+
303
+static void vhost_svq_flush(VhostShadowVirtqueue *svq,
304
+ bool check_for_avail_queue)
305
+{
306
+ VirtQueue *vq = svq->vq;
307
+
308
+ /* Forward as many used buffers as possible. */
309
+ do {
310
+ unsigned i = 0;
311
+
312
+ vhost_svq_disable_notification(svq);
313
+ while (true) {
314
+ uint32_t len;
315
+ g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
316
+ if (!elem) {
317
+ break;
318
+ }
319
+
320
+ if (unlikely(i >= svq->vring.num)) {
321
+ qemu_log_mask(LOG_GUEST_ERROR,
322
+ "More than %u used buffers obtained in a %u size SVQ",
323
+ i, svq->vring.num);
324
+ virtqueue_fill(vq, elem, len, i);
325
+ virtqueue_flush(vq, i);
326
+ return;
327
+ }
328
+ virtqueue_fill(vq, elem, len, i++);
329
+ }
330
+
331
+ virtqueue_flush(vq, i);
332
+ event_notifier_set(&svq->svq_call);
333
+
334
+ if (check_for_avail_queue && svq->next_guest_avail_elem) {
335
+ /*
336
+ * Avail ring was full when vhost_svq_flush was called, so it's a
337
+ * good moment to make more descriptors available if possible.
338
+ */
339
+ vhost_handle_guest_kick(svq);
340
+ }
341
+ } while (!vhost_svq_enable_notification(svq));
342
+}
343
+
344
+/**
345
+ * Forward used buffers.
346
*
347
* @n: hdev call event notifier, the one that device set to notify svq.
348
+ *
349
+ * Note that we are not making any buffers available in the loop, there is no
350
+ * way that it runs more than virtqueue size times.
351
*/
352
static void vhost_svq_handle_call(EventNotifier *n)
353
{
354
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
355
hdev_call);
356
event_notifier_test_and_clear(n);
357
- event_notifier_set(&svq->svq_call);
358
+ vhost_svq_flush(svq, true);
359
}
360
361
/**
362
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
363
if (poll_start) {
364
event_notifier_init_fd(svq_kick, svq_kick_fd);
365
event_notifier_set(svq_kick);
366
- event_notifier_set_handler(svq_kick, vhost_handle_guest_kick);
367
+ event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
368
+ }
369
+}
370
+
371
+/**
372
+ * Start the shadow virtqueue operation.
373
+ *
374
+ * @svq: Shadow Virtqueue
375
+ * @vdev: VirtIO device
376
+ * @vq: Virtqueue to shadow
377
+ */
378
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
379
+ VirtQueue *vq)
380
+{
381
+ size_t desc_size, driver_size, device_size;
382
+
383
+ svq->next_guest_avail_elem = NULL;
384
+ svq->shadow_avail_idx = 0;
385
+ svq->shadow_used_idx = 0;
386
+ svq->last_used_idx = 0;
387
+ svq->vdev = vdev;
388
+ svq->vq = vq;
389
+
390
+ svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
391
+ driver_size = vhost_svq_driver_area_size(svq);
392
+ device_size = vhost_svq_device_area_size(svq);
393
+ svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
394
+ desc_size = sizeof(vring_desc_t) * svq->vring.num;
395
+ svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
396
+ memset(svq->vring.desc, 0, driver_size);
397
+ svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
398
+ memset(svq->vring.used, 0, device_size);
399
+ svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);
400
+ for (unsigned i = 0; i < svq->vring.num - 1; i++) {
401
+ svq->vring.desc[i].next = cpu_to_le16(i + 1);
72
}
402
}
73
403
}
74
- nc = qemu_new_net_client(&net_hub_port_info, NULL, "hub", name);
404
75
+ nc = qemu_new_net_client(&net_hub_port_info, hubpeer, "hub", name);
405
@@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
76
port = DO_UPCAST(NetHubPort, nc, nc);
406
void vhost_svq_stop(VhostShadowVirtqueue *svq)
77
port->id = id;
407
{
78
port->hub = hub;
408
event_notifier_set_handler(&svq->svq_kick, NULL);
79
@@ -XXX,XX +XXX,XX @@ static NetHubPort *net_hub_port_new(NetHub *hub, const char *name)
409
+ g_autofree VirtQueueElement *next_avail_elem = NULL;
410
+
411
+ if (!svq->vq) {
412
+ return;
413
+ }
414
+
415
+ /* Send all pending used descriptors to guest */
416
+ vhost_svq_flush(svq, false);
417
+
418
+ for (unsigned i = 0; i < svq->vring.num; ++i) {
419
+ g_autofree VirtQueueElement *elem = NULL;
420
+ elem = g_steal_pointer(&svq->ring_id_maps[i]);
421
+ if (elem) {
422
+ virtqueue_detach_element(svq->vq, elem, 0);
423
+ }
424
+ }
425
+
426
+ next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
427
+ if (next_avail_elem) {
428
+ virtqueue_detach_element(svq->vq, next_avail_elem, 0);
429
+ }
430
+ svq->vq = NULL;
431
+ g_free(svq->ring_id_maps);
432
+ qemu_vfree(svq->vring.desc);
433
+ qemu_vfree(svq->vring.used);
434
}
80
435
81
/**
436
/**
82
* Create a port on a given hub
437
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
83
+ * @hub_id: Number of the hub
438
index XXXXXXX..XXXXXXX 100644
84
* @name: Net client name or NULL for default name.
439
--- a/hw/virtio/vhost-shadow-virtqueue.h
85
+ * @hubpeer: Peer to use (if "netdev=id" has been specified)
440
+++ b/hw/virtio/vhost-shadow-virtqueue.h
86
*
441
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
87
* If there is no existing hub with the given id then a new hub is created.
442
443
 /* Guest's call notifier, where the SVQ calls guest. */
 EventNotifier svq_call;
+
+    /* Virtio queue shadowing */
+    VirtQueue *vq;
+
+    /* Virtio device */
+    VirtIODevice *vdev;
+
+    /* Map for use the guest's descriptors */
+    VirtQueueElement **ring_id_maps;
+
+    /* Next VirtQueue element that guest made available */
+    VirtQueueElement *next_guest_avail_elem;
+
+    /* Next head to expose to the device */
+    uint16_t shadow_avail_idx;
+
+    /* Next free descriptor */
+    uint16_t free_head;
+
+    /* Last seen used idx */
+    uint16_t shadow_used_idx;
+
+    /* Next head to consume from the device */
+    uint16_t last_used_idx;
 } VhostShadowVirtqueue;
 
 bool vhost_svq_valid_features(uint64_t features, Error **errp);
 
@@ -XXX,XX +XXX,XX @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
 size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
 size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
 
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
+                     VirtQueue *vq);
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
 VhostShadowVirtqueue *vhost_svq_new(void);

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
 * Note that this function does not rewind kick file descriptor if cannot set
 * call one.
88
*/
488
*/
89
-NetClientState *net_hub_add_port(int hub_id, const char *name)
489
-static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
90
+NetClientState *net_hub_add_port(int hub_id, const char *name,
490
- VhostShadowVirtqueue *svq,
91
+ NetClientState *hubpeer)
491
- unsigned idx,
492
- Error **errp)
493
+static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
494
+ VhostShadowVirtqueue *svq,
495
+ unsigned idx,
496
+ Error **errp)
92
{
497
{
93
NetHub *hub;
498
struct vhost_vring_file file = {
94
NetHubPort *port;
499
.index = dev->vq_index + idx,
95
@@ -XXX,XX +XXX,XX @@ NetClientState *net_hub_add_port(int hub_id, const char *name)
500
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
96
hub = net_hub_new(hub_id);
501
r = vhost_vdpa_set_vring_dev_kick(dev, &file);
502
if (unlikely(r != 0)) {
503
error_setg_errno(errp, -r, "Can't set device kick fd");
504
- return false;
505
+ return r;
97
}
506
}
98
507
99
- port = net_hub_port_new(hub, name);
508
event_notifier = &svq->hdev_call;
100
+ port = net_hub_port_new(hub, name, hubpeer);
509
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
101
return &port->nc;
510
error_setg_errno(errp, -r, "Can't set device call fd");
511
}
512
513
+    return r;
+}
+
+/**
+ * Unmap a SVQ area in the device
+ */
+static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
+                                      hwaddr size)
+{
+    int r;
+
+    size = ROUND_UP(size, qemu_real_host_page_size);
+    r = vhost_vdpa_dma_unmap(v, iova, size);
+    return r == 0;
+}
+
+static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
+                                       const VhostShadowVirtqueue *svq)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    struct vhost_vring_addr svq_addr;
+    size_t device_size = vhost_svq_device_area_size(svq);
+    size_t driver_size = vhost_svq_driver_area_size(svq);
+    bool ok;
+
+    vhost_svq_get_vring_addr(svq, &svq_addr);
+
+    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
+    if (unlikely(!ok)) {
+        return false;
+    }
+
+    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
+}
+
+/**
+ * Map the shadow virtqueue rings in the device
+ *
+ * @dev: The vhost device
+ * @svq: The shadow virtqueue
+ * @addr: Assigned IOVA addresses
+ * @errp: Error pointer
+ */
+static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
+                                     const VhostShadowVirtqueue *svq,
+                                     struct vhost_vring_addr *addr,
+                                     Error **errp)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    size_t device_size = vhost_svq_device_area_size(svq);
+    size_t driver_size = vhost_svq_driver_area_size(svq);
+    int r;
+
+    ERRP_GUARD();
+    vhost_svq_get_vring_addr(svq, addr);
+
+    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
+                           (void *)addr->desc_user_addr, true);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Cannot create vq driver region: ");
+        return false;
+    }
+
+    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
+                           (void *)addr->used_user_addr, false);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Cannot create vq device region: ");
+    }
+
+    return r == 0;
+}
+
+static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
+                                 VhostShadowVirtqueue *svq,
+                                 unsigned idx,
+                                 Error **errp)
+{
+    uint16_t vq_index = dev->vq_index + idx;
+    struct vhost_vring_state s = {
+        .index = vq_index,
+    };
+    int r;
+
+    r = vhost_vdpa_set_dev_vring_base(dev, &s);
+    if (unlikely(r)) {
+        error_setg_errno(errp, -r, "Cannot set vring base");
+        return false;
+    }
+
+    r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
     return r == 0;
102
}
604
}
103
605
104
@@ -XXX,XX +XXX,XX @@ NetClientState *net_hub_port_find(int hub_id)
606
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
607
}
608
609
     for (i = 0; i < v->shadow_vqs->len; ++i) {
+        VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+        struct vhost_vring_addr addr = {
+            .index = i,
+        };
+        int r;
         bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
         if (unlikely(!ok)) {
-            error_reportf_err(err, "Cannot setup SVQ %u: ", i);
+            goto err;
+        }
+
+        vhost_svq_start(svq, dev->vdev, vq);
+        ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
+        if (unlikely(!ok)) {
+            goto err_map;
+        }
+
+        /* Override vring GPA set by vhost subsystem */
+        r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
+        if (unlikely(r != 0)) {
+            error_setg_errno(&err, -r, "Cannot set device address");
+            goto err_set_addr;
+        }
+    }
+
+    return true;
+
+err_set_addr:
+    vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
+
+err_map:
+    vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
+
+err:
+    error_reportf_err(err, "Cannot setup SVQ %u: ", i);
+    for (unsigned j = 0; j < i; ++j) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
+        vhost_vdpa_svq_unmap_rings(dev, svq);
+        vhost_svq_stop(svq);
+    }
+
+    return false;
+}
+
+static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
+{
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (!v->shadow_vqs) {
+        return true;
+    }
+
+    for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
+                                                      i);
+        bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
+        if (unlikely(!ok)) {
+            return false;
+        }
+    }
+
+    return true;
+}
107
671
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
108
- nc = net_hub_add_port(hub_id, NULL);
672
}
109
+ nc = net_hub_add_port(hub_id, NULL, NULL);
673
vhost_vdpa_set_vring_ready(dev);
110
return nc;
674
} else {
111
}
675
+ ok = vhost_vdpa_svqs_stop(dev);
112
676
+ if (unlikely(!ok)) {
113
@@ -XXX,XX +XXX,XX @@ int net_init_hubport(const Netdev *netdev, const char *name,
114
NetClientState *peer, Error **errp)
115
{
116
const NetdevHubPortOptions *hubport;
117
+ NetClientState *hubpeer = NULL;
118
119
assert(netdev->type == NET_CLIENT_DRIVER_HUBPORT);
120
assert(!peer);
121
hubport = &netdev->u.hubport;
122
123
- net_hub_add_port(hubport->hubid, name);
124
+ if (hubport->has_netdev) {
125
+ hubpeer = qemu_find_netdev(hubport->netdev);
126
+ if (!hubpeer) {
127
+ error_setg(errp, "netdev '%s' not found", hubport->netdev);
128
+ return -1;
677
+ return -1;
129
+ }
678
+ }
130
+ }
679
vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
131
+
680
}
132
+ net_hub_add_port(hubport->hubid, name, hubpeer);
133
+
134
return 0;
135
}
136
137
diff --git a/net/hub.h b/net/hub.h
138
index XXXXXXX..XXXXXXX 100644
139
--- a/net/hub.h
140
+++ b/net/hub.h
141
@@ -XXX,XX +XXX,XX @@
142
143
#include "qemu-common.h"
144
145
-NetClientState *net_hub_add_port(int hub_id, const char *name);
146
+NetClientState *net_hub_add_port(int hub_id, const char *name,
147
+ NetClientState *hubpeer);
148
NetClientState *net_hub_find_client_by_name(int hub_id, const char *name);
149
void net_hub_info(Monitor *mon);
150
void net_hub_check_clients(void);
151
diff --git a/net/net.c b/net/net.c
152
index XXXXXXX..XXXXXXX 100644
153
--- a/net/net.c
154
+++ b/net/net.c
155
@@ -XXX,XX +XXX,XX @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp)
156
/* Do not add to a vlan if it's a nic with a netdev= parameter. */
157
if (netdev->type != NET_CLIENT_DRIVER_NIC ||
158
!opts->u.nic.has_netdev) {
159
- peer = net_hub_add_port(net->has_vlan ? net->vlan : 0, NULL);
160
+ peer = net_hub_add_port(net->has_vlan ? net->vlan : 0, NULL, NULL);
161
}
162
163
if (net->has_vlan && !vlan_warned) {
164
diff --git a/qapi/net.json b/qapi/net.json
165
index XXXXXXX..XXXXXXX 100644
166
--- a/qapi/net.json
167
+++ b/qapi/net.json
168
@@ -XXX,XX +XXX,XX @@
169
# Connect two or more net clients through a software hub.
170
#
171
# @hubid: hub identifier number
172
+# @netdev: used to connect hub to a netdev instead of a device (since 2.12)
173
#
174
# Since: 1.2
175
##
176
{ 'struct': 'NetdevHubPortOptions',
177
'data': {
178
- 'hubid': 'int32' } }
179
+ 'hubid': 'int32',
180
+ '*netdev': 'str' } }
181
182
##
183
# @NetdevNetmapOptions:
184
diff --git a/qemu-options.hx b/qemu-options.hx
185
index XXXXXXX..XXXXXXX 100644
186
--- a/qemu-options.hx
187
+++ b/qemu-options.hx
188
@@ -XXX,XX +XXX,XX @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
189
#endif
190
"-netdev vhost-user,id=str,chardev=dev[,vhostforce=on|off]\n"
191
" configure a vhost-user network, backed by a chardev 'dev'\n"
192
- "-netdev hubport,id=str,hubid=n\n"
193
+ "-netdev hubport,id=str,hubid=n[,netdev=nd]\n"
194
" configure a hub port on QEMU VLAN 'n'\n", QEMU_ARCH_ALL)
195
DEF("net", HAS_ARG, QEMU_OPTION_net,
196
"-net nic[,vlan=n][,netdev=nd][,macaddr=mac][,model=type][,name=str][,addr=str][,vectors=v]\n"
197
@@ -XXX,XX +XXX,XX @@ vde_switch -F -sock /tmp/myswitch
198
qemu-system-i386 linux.img -net nic -net vde,sock=/tmp/myswitch
199
@end example
200
201
-@item -netdev hubport,id=@var{id},hubid=@var{hubid}
202
+@item -netdev hubport,id=@var{id},hubid=@var{hubid}[,netdev=@var{nd}]
203
204
Create a hub port on QEMU "vlan" @var{hubid}.
205
206
The hubport netdev lets you connect a NIC to a QEMU "vlan" instead of a single
207
netdev. @code{-net} and @code{-device} with parameter @option{vlan} create the
208
-required hub automatically.
209
+required hub automatically. Alternatively, you can also connect the hubport
210
+to another netdev with ID @var{nd} by using the @option{netdev=@var{nd}}
211
+option.
212
213
@item -netdev vhost-user,chardev=@var{id}[,vhostforce=on|off][,queues=n]
214
--
2.7.4
From: Eugenio Pérez <eperezma@redhat.com>

This iova tree function allows the caller to look for a hole in the
allocated regions and returns a totally new translation for a given
translated address.

Its main usage is to let devices access the qemu address space by
remapping the guest's address space into a new iova space to which qemu
can add chunks of addresses.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/qemu/iova-tree.h |  18 ++++++
 util/iova-tree.c         | 135 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 153 insertions(+)
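For readers following along, here is a minimal, hypothetical caller of the new allocator. The example_map_buffer name, the buffer, and the address range are invented for illustration; DMAMap, IOVA_OK and iova_tree_alloc_map are as defined in this patch, and the series treats DMAMap.size as inclusive (offset of the last byte):

```c
#include "qemu/osdep.h"
#include "qemu/iova-tree.h"

/* Hypothetical helper: assign a free IOVA to an arbitrary host buffer. */
static int example_map_buffer(IOVATree *tree, void *host_buf, size_t len)
{
    DMAMap map = {
        .translated_addr = (hwaddr)(uintptr_t)host_buf,
        .size = len - 1,               /* DMAMap sizes are inclusive */
        .perm = IOMMU_RW,
    };
    /* Search [0x1000, HWADDR_MAX] for a hole large enough */
    int r = iova_tree_alloc_map(tree, &map, 0x1000, HWADDR_MAX);

    if (r != IOVA_OK) {
        return r;   /* e.g. IOVA_ERR_NOMEM: no free contiguous range */
    }
    /* On success the tree assigned map.iova and inserted the mapping */
    return IOVA_OK;
}
```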
diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/iova-tree.h
+++ b/include/qemu/iova-tree.h
@@ -XXX,XX +XXX,XX @@
 #define IOVA_OK           (0)
 #define IOVA_ERR_INVALID  (-1) /* Invalid parameters */
 #define IOVA_ERR_OVERLAP  (-2) /* IOVA range overlapped */
+#define IOVA_ERR_NOMEM    (-3) /* Cannot allocate */
 
 typedef struct IOVATree IOVATree;
 typedef struct DMAMap {
@@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova);
 void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator);
 
 /**
+ * iova_tree_alloc_map:
+ *
+ * @tree: the iova tree to allocate from
+ * @map: the new map (as translated addr & size) to allocate in the iova region
+ * @iova_begin: the minimum address of the allocation
+ * @iova_end: the maximum addressable direction of the allocation
+ *
+ * Allocates a new region of a given size, between iova_min and iova_max.
+ *
+ * Return: Same as iova_tree_insert, but cannot overlap and can return error if
+ * iova tree is out of free contiguous range. The caller gets the assigned iova
+ * in map->iova.
+ */
+int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin,
+                        hwaddr iova_end);
+
+/**
  * iova_tree_destroy:
  *
  * @tree: the iova tree to destroy
diff --git a/util/iova-tree.c b/util/iova-tree.c
index XXXXXXX..XXXXXXX 100644
--- a/util/iova-tree.c
+++ b/util/iova-tree.c
@@ -XXX,XX +XXX,XX @@ struct IOVATree {
     GTree *tree;
 };
 
+/* Args to pass to iova_tree_alloc foreach function. */
+struct IOVATreeAllocArgs {
+    /* Size of the desired allocation */
+    size_t new_size;
+
+    /* The minimum address allowed in the allocation */
+    hwaddr iova_begin;
+
+    /* Map at the left of the hole, can be NULL if "this" is first one */
+    const DMAMap *prev;
+
+    /* Map at the right of the hole, can be NULL if "prev" is the last one */
+    const DMAMap *this;
+
+    /* If found, we fill in the IOVA here */
+    hwaddr iova_result;
+
+    /* Whether have we found a valid IOVA */
+    bool iova_found;
+};
+
+/**
+ * Iterate args to the next hole
+ *
+ * @args: The alloc arguments
+ * @next: The next mapping in the tree. Can be NULL to signal the last one
+ */
+static void iova_tree_alloc_args_iterate(struct IOVATreeAllocArgs *args,
+                                         const DMAMap *next) {
+    args->prev = args->this;
+    args->this = next;
+}
+
 static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
 {
     const DMAMap *m1 = a, *m2 = b;
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map)
     return IOVA_OK;
 }
 
+/**
+ * Try to find an unallocated IOVA range between prev and this elements.
+ *
+ * @args: Arguments to allocation
+ *
+ * Cases:
+ *
+ * (1) !prev, !this: No entries allocated, always succeed
+ *
+ * (2) !prev, this: We're iterating at the 1st element.
+ *
+ * (3) prev, !this: We're iterating at the last element.
+ *
+ * (4) prev, this: this is the most common case, we'll try to find a hole
+ *     between "prev" and "this" mapping.
+ *
+ * Note that this function assumes the last valid iova is HWADDR_MAX, but it
+ * searches linearly so it's easy to discard the result if it's not the case.
+ */
+static void iova_tree_alloc_map_in_hole(struct IOVATreeAllocArgs *args)
+{
+    const DMAMap *prev = args->prev, *this = args->this;
+    uint64_t hole_start, hole_last;
+
+    if (this && this->iova + this->size < args->iova_begin) {
+        return;
+    }
+
+    hole_start = MAX(prev ? prev->iova + prev->size + 1 : 0, args->iova_begin);
+    hole_last = this ? this->iova : HWADDR_MAX;
+
+    if (hole_last - hole_start > args->new_size) {
+        args->iova_result = hole_start;
+        args->iova_found = true;
+    }
+}
+
+/**
+ * Foreach dma node in the tree, compare if there is a hole with its previous
+ * node (or minimum iova address allowed) and the node.
+ *
+ * @key: Node iterating
+ * @value: Node iterating
+ * @pargs: Struct to communicate with the outside world
+ *
+ * Return: false to keep iterating, true if needs break.
+ */
+static gboolean iova_tree_alloc_traverse(gpointer key, gpointer value,
+                                         gpointer pargs)
+{
+    struct IOVATreeAllocArgs *args = pargs;
+    DMAMap *node = value;
+
+    assert(key == value);
+
+    iova_tree_alloc_args_iterate(args, node);
+    iova_tree_alloc_map_in_hole(args);
+    return args->iova_found;
+}
+
+int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin,
+                        hwaddr iova_last)
+{
+    struct IOVATreeAllocArgs args = {
+        .new_size = map->size,
+        .iova_begin = iova_begin,
+    };
+
+    if (unlikely(iova_last < iova_begin)) {
+        return IOVA_ERR_INVALID;
+    }
+
+    /*
+     * Find a valid hole for the mapping
+     *
+     * Assuming low iova_begin, so no need to do a binary search to
+     * locate the first node.
+     *
+     * TODO: Replace all this with g_tree_node_first/next/last when available
+     * (from glib since 2.68). To do it with g_tree_foreach complicates the
+     * code a lot.
+     *
+     */
+    g_tree_foreach(tree->tree, iova_tree_alloc_traverse, &args);
+    if (!args.iova_found) {
+        /*
+         * Either tree is empty or the last hole is still not checked.
+         * g_tree_foreach does not compare (last, iova_last] range, so we check
+         * it here.
+         */
+        iova_tree_alloc_args_iterate(&args, NULL);
+        iova_tree_alloc_map_in_hole(&args);
+    }
+
+    if (!args.iova_found || args.iova_result + map->size > iova_last) {
+        return IOVA_ERR_NOMEM;
+    }
+
+    map->iova = args.iova_result;
+    return iova_tree_insert(tree, map);
+}
+
 void iova_tree_destroy(IOVATree *tree)
 {
     g_tree_destroy(tree->tree);
--
2.7.4
From: Eugenio Pérez <eperezma@redhat.com>

This function does the reverse operation of iova_tree_find: it looks
for a mapping that matches a given translated address, so we can
translate back from qemu's address space to the iova space.

This has linear complexity instead of logarithmic, but it supports
overlapping HVA ranges. Future developments could reduce the cost.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/qemu/iova-tree.h | 20 +++++++++++++++++++-
 util/iova-tree.c         | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)
17
diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
18
index XXXXXXX..XXXXXXX 100644
19
--- a/include/qemu/iova-tree.h
20
+++ b/include/qemu/iova-tree.h
21
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map);
22
* @tree: the iova tree to search from
23
* @map: the mapping to search
24
*
25
- * Search for a mapping in the iova tree that overlaps with the
26
+ * Search for a mapping in the iova tree that iova overlaps with the
27
* mapping range specified. Only the first found mapping will be
28
* returned.
29
*
30
@@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map);
31
const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map);
32
33
/**
34
+ * iova_tree_find_iova:
35
+ *
36
+ * @tree: the iova tree to search from
37
+ * @map: the mapping to search
38
+ *
39
+ * Search for a mapping in the iova tree that translated_addr overlaps with the
40
+ * mapping range specified. Only the first found mapping will be
41
+ * returned.
42
+ *
43
+ * Return: DMAMap pointer if found, or NULL if not found. Note that
44
+ * the returned DMAMap pointer is maintained internally. User should
45
+ * only read the content but never modify or free the content. Also,
46
+ * user is responsible to make sure the pointer is valid (say, no
47
+ * concurrent deletion in progress).
48
+ */
49
+const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map);
50
+
51
+/**
52
* iova_tree_find_address:
53
*
54
* @tree: the iova tree to search from
55
diff --git a/util/iova-tree.c b/util/iova-tree.c
56
index XXXXXXX..XXXXXXX 100644
57
--- a/util/iova-tree.c
58
+++ b/util/iova-tree.c
59
@@ -XXX,XX +XXX,XX @@ struct IOVATreeAllocArgs {
60
bool iova_found;
61
};
62
63
+typedef struct IOVATreeFindIOVAArgs {
64
+ const DMAMap *needle;
65
+ const DMAMap *result;
66
+} IOVATreeFindIOVAArgs;
67
+
68
/**
69
* Iterate args to the next hole
70
*
71
@@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map)
72
return g_tree_lookup(tree->tree, map);
73
}
74
75
+static gboolean iova_tree_find_address_iterator(gpointer key, gpointer value,
76
+ gpointer data)
77
+{
78
+ const DMAMap *map = key;
79
+ IOVATreeFindIOVAArgs *args = data;
80
+ const DMAMap *needle;
81
+
82
+ g_assert(key == value);
83
+
84
+ needle = args->needle;
85
+ if (map->translated_addr + map->size < needle->translated_addr ||
86
+ needle->translated_addr + needle->size < map->translated_addr) {
87
+ return false;
88
+ }
89
+
90
+ args->result = map;
91
+ return true;
92
+}
93
+
94
+const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map)
95
+{
96
+ IOVATreeFindIOVAArgs args = {
97
+ .needle = map,
98
+ };
99
+
100
+ g_tree_foreach(tree->tree, iova_tree_find_address_iterator, &args);
101
+ return args.result;
102
+}
103
+
104
const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova)
105
{
106
const DMAMap map = { .iova = iova, .size = 0 };
107
--
108
2.7.4
109
110
diff view generated by jsdifflib
1
From: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>

Packet sizes can differ even for the same data stream, for example when
the network is busy: TCP does not guarantee that the same payload is
segmented into packets the same way on both sides, like this:

We send this payload:
------------------------------
| header |1|2|3|4|5|6|7|8|9|0|
------------------------------

primary:
ppkt1:
----------------
| header |1|2|3|
----------------
ppkt2:
------------------------
| header |4|5|6|7|8|9|0|
------------------------

secondary:
spkt1:
------------------------------
| header |1|2|3|4|5|6|7|8|9|0|
------------------------------

In the original method, ppkt1 and ppkt2 differ in size from spkt1, so
they cannot be compared, which triggers a checkpoint.

I have tested FTP get with 200M and 1G files many times and found that
the performance was less than 1% of native.

Now I have reconstructed the comparison of TCP packets based on the TCP
sequence number. First of all, ppkt1 and spkt1 have the same starting
sequence number, so they can be compared even though their lengths
differ. ppkt1, with the smaller payload length, is used as the
comparison length; if the payloads match, ppkt1 is sent out and the
offset (the length of ppkt1's payload) is recorded in spkt1. In the
next comparison, ppkt2 and spkt1 can be compared starting from the
recorded position in spkt1, like this:

----------------
| header |1|2|3|                ppkt1
---------|-----|
         |     |
---------v-----v--------------
| header |1|2|3|4|5|6|7|8|9|0| spkt1
---------------|\------------|
               | \offset     |
---------v-------------v
| header |4|5|6|7|8|9|0|       ppkt2
------------------------

In this way, the performance can reach about 20% of native in my
repeated tests.

Cc: Zhang Chen <zhangckid@gmail.com>
Cc: Li Zhijian <lizhijian@cn.fujitsu.com>
Cc: Jason Wang <jasowang@redhat.com>

Signed-off-by: Mao Zhongyi <maozy.fnst@cn.fujitsu.com>
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangckid@gmail.com>
Reviewed-by: Zhang Chen <zhangckid@gmail.com>
Tested-by: Zhang Chen <zhangckid@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 net/colo-compare.c | 343 +++++++++++++++++++++++++++++++++++------------------
 net/colo.c         |   9 ++
 net/colo.h         |  15 +++
 net/trace-events   |   2 +-
 4 files changed, 250 insertions(+), 119 deletions(-)

From: Eugenio Pérez <eperezma@redhat.com>

This tree is able to look up a translated address from an IOVA address.

At first glance it is similar to util/iova-tree. However, SVQ, working
on devices with a limited IOVA space, needs more capabilities, like
allocating IOVA chunks or performing reverse translations (qemu
addresses to iova).

The allocation capability, i.e. "assign a free IOVA address to this
chunk of memory in qemu's address space", allows the shadow virtqueue
to create a new address space that is not restricted by the guest's
addressable one, so we can allocate the shadow vqs' vrings outside
of it.

It duplicates the tree so it can search efficiently in both directions,
and it will signal overlap if the iova or the translated address is
present in either tree.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/meson.build       |   2 +-
 hw/virtio/vhost-iova-tree.c | 110 ++++++++++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-iova-tree.h |  27 +++++++++++
 3 files changed, 138 insertions(+), 1 deletion(-)
 create mode 100644 hw/virtio/vhost-iova-tree.c
 create mode 100644 hw/virtio/vhost-iova-tree.h
27
create mode 100644 hw/virtio/vhost-iova-tree.h
76
28
77
diff --git a/net/colo-compare.c b/net/colo-compare.c
29
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
78
index XXXXXXX..XXXXXXX 100644
30
index XXXXXXX..XXXXXXX 100644
79
--- a/net/colo-compare.c
31
--- a/hw/virtio/meson.build
80
+++ b/net/colo-compare.c
32
+++ b/hw/virtio/meson.build
33
@@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
34
35
virtio_ss = ss.source_set()
36
virtio_ss.add(files('virtio.c'))
37
-virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c'))
38
+virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c'))
39
virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c'))
40
virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c'))
41
virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
42
diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
43
new file mode 100644
44
index XXXXXXX..XXXXXXX
45
--- /dev/null
46
+++ b/hw/virtio/vhost-iova-tree.c
81
@@ -XXX,XX +XXX,XX @@
47
@@ -XXX,XX +XXX,XX @@
82
#define COMPARE_READ_LEN_MAX NET_BUFSIZE
48
+/*
83
#define MAX_QUEUE_SIZE 1024
49
+ * vhost software live migration iova tree
84
50
+ *
85
+#define COLO_COMPARE_FREE_PRIMARY 0x01
51
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
86
+#define COLO_COMPARE_FREE_SECONDARY 0x02
52
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
53
+ *
54
+ * SPDX-License-Identifier: GPL-2.0-or-later
55
+ */
87
+
56
+
88
/* TODO: Should be configurable */
57
+#include "qemu/osdep.h"
89
#define REGULAR_PACKET_CHECK_MS 3000
58
+#include "qemu/iova-tree.h"
90
59
+#include "vhost-iova-tree.h"
91
@@ -XXX,XX +XXX,XX @@ static gint seq_sorter(Packet *a, Packet *b, gpointer data)
60
+
92
return ntohl(atcp->th_seq) - ntohl(btcp->th_seq);
61
+#define iova_min_addr qemu_real_host_page_size
93
}
62
+
94
63
+/**
95
+static void fill_pkt_tcp_info(void *data, uint32_t *max_ack)
64
+ * VhostIOVATree, able to:
65
+ * - Translate iova address
66
+ * - Reverse translate iova address (from translated to iova)
67
+ * - Allocate IOVA regions for translated range (linear operation)
68
+ */
69
+struct VhostIOVATree {
70
+ /* First addressable iova address in the device */
71
+ uint64_t iova_first;
72
+
73
+ /* Last addressable iova address in the device */
74
+ uint64_t iova_last;
75
+
76
+ /* IOVA address to qemu memory maps. */
77
+ IOVATree *iova_taddr_map;
78
+};
79
+
80
+/**
81
+ * Create a new IOVA tree
82
+ *
83
+ * Returns the new IOVA tree
84
+ */
85
+VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last)
96
+{
86
+{
97
+ Packet *pkt = data;
87
+ VhostIOVATree *tree = g_new(VhostIOVATree, 1);
98
+ struct tcphdr *tcphd;
99
+
88
+
100
+ tcphd = (struct tcphdr *)pkt->transport_header;
89
+ /* Some devices do not like 0 addresses */
90
+ tree->iova_first = MAX(iova_first, iova_min_addr);
91
+ tree->iova_last = iova_last;
101
+
92
+
102
+ pkt->tcp_seq = ntohl(tcphd->th_seq);
93
+ tree->iova_taddr_map = iova_tree_new();
103
+ pkt->tcp_ack = ntohl(tcphd->th_ack);
94
+ return tree;
104
+ *max_ack = *max_ack > pkt->tcp_ack ? *max_ack : pkt->tcp_ack;
105
+ pkt->header_size = pkt->transport_header - (uint8_t *)pkt->data
106
+ + (tcphd->th_off << 2) - pkt->vnet_hdr_len;
107
+ pkt->payload_size = pkt->size - pkt->header_size;
108
+ pkt->seq_end = pkt->tcp_seq + pkt->payload_size;
109
+ pkt->flags = tcphd->th_flags;
110
+}
95
+}
111
+
96
+
112
/*
97
+/**
113
* Return 1 on success, if return 0 means the
98
+ * Delete an iova tree
114
* packet will be dropped
99
+ */
115
*/
100
+void vhost_iova_tree_delete(VhostIOVATree *iova_tree)
116
-static int colo_insert_packet(GQueue *queue, Packet *pkt)
117
+static int colo_insert_packet(GQueue *queue, Packet *pkt, uint32_t *max_ack)
118
{
119
if (g_queue_get_length(queue) <= MAX_QUEUE_SIZE) {
120
if (pkt->ip->ip_p == IPPROTO_TCP) {
121
+ fill_pkt_tcp_info(pkt, max_ack);
122
g_queue_insert_sorted(queue,
123
pkt,
124
(GCompareDataFunc)seq_sorter,
125
@@ -XXX,XX +XXX,XX @@ static int packet_enqueue(CompareState *s, int mode, Connection **con)
126
}
127
128
if (mode == PRIMARY_IN) {
129
- if (!colo_insert_packet(&conn->primary_list, pkt)) {
130
+ if (!colo_insert_packet(&conn->primary_list, pkt, &conn->pack)) {
131
error_report("colo compare primary queue size too big,"
132
"drop packet");
133
}
134
} else {
135
- if (!colo_insert_packet(&conn->secondary_list, pkt)) {
136
+ if (!colo_insert_packet(&conn->secondary_list, pkt, &conn->sack)) {
137
error_report("colo compare secondary queue size too big,"
138
"drop packet");
139
}
140
@@ -XXX,XX +XXX,XX @@ static int packet_enqueue(CompareState *s, int mode, Connection **con)
141
return 0;
142
}
143
144
+static inline bool after(uint32_t seq1, uint32_t seq2)
145
+{
101
+{
146
+ return (int32_t)(seq1 - seq2) > 0;
102
+ iova_tree_destroy(iova_tree->iova_taddr_map);
103
+ g_free(iova_tree);
147
+}
104
+}
148
+
105
+
149
+static void colo_release_primary_pkt(CompareState *s, Packet *pkt)
106
+/**
107
+ * Find the IOVA address stored from a memory address
108
+ *
109
+ * @tree: The iova tree
110
+ * @map: The map with the memory address
111
+ *
112
+ * Return the stored mapping, or NULL if not found.
113
+ */
114
+const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree,
115
+ const DMAMap *map)
150
+{
116
+{
151
+ int ret;
117
+ return iova_tree_find_iova(tree->iova_taddr_map, map);
152
+ ret = compare_chr_send(s,
153
+ pkt->data,
154
+ pkt->size,
155
+ pkt->vnet_hdr_len);
156
+ if (ret < 0) {
157
+ error_report("colo send primary packet failed");
158
+ }
159
+ trace_colo_compare_main("packet same and release packet");
160
+ packet_destroy(pkt, NULL);
161
+}
118
+}
162
+
119
+
163
/*
120
+/**
164
* The IP packets sent by primary and secondary
121
+ * Allocate a new mapping
165
* will be compared in here
122
+ *
166
@@ -XXX,XX +XXX,XX @@ static int colo_compare_packet_payload(Packet *ppkt,
123
+ * @tree: The iova tree
167
}
124
+ * @map: The iova map
168
125
+ *
169
/*
126
+ * Returns:
170
- * Called from the compare thread on the primary
127
+ * - IOVA_OK if the map fits in the container
171
- * for compare tcp packet
128
+ * - IOVA_ERR_INVALID if the map does not make sense (like size overflow)
172
- * compare_tcp copied from Dr. David Alan Gilbert's branch
129
+ * - IOVA_ERR_NOMEM if tree cannot allocate more space.
173
- */
130
+ *
174
-static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
131
+ * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK.
175
+ * return true means that the payload is consist and
132
+ */
176
+ * need to make the next comparison, false means do
133
+int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map)
177
+ * the checkpoint
134
+{
178
+*/
135
+ /* Some vhost devices do not like addr 0. Skip first page */
179
+static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt,
136
+ hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size;
180
+ int8_t *mark, uint32_t max_ack)
181
{
182
- struct tcphdr *ptcp, *stcp;
183
- int res;
184
+ *mark = 0;
185
+
137
+
186
+ if (ppkt->tcp_seq == spkt->tcp_seq && ppkt->seq_end == spkt->seq_end) {
138
+ if (map->translated_addr + map->size < map->translated_addr ||
187
+ if (colo_compare_packet_payload(ppkt, spkt,
139
+ map->perm == IOMMU_NONE) {
188
+ ppkt->header_size, spkt->header_size,
140
+ return IOVA_ERR_INVALID;
189
+ ppkt->payload_size)) {
190
+ *mark = COLO_COMPARE_FREE_SECONDARY | COLO_COMPARE_FREE_PRIMARY;
191
+ return true;
192
+ }
193
+ }
194
+ if (ppkt->tcp_seq == spkt->tcp_seq && ppkt->seq_end == spkt->seq_end) {
195
+ if (colo_compare_packet_payload(ppkt, spkt,
196
+ ppkt->header_size, spkt->header_size,
197
+ ppkt->payload_size)) {
198
+ *mark = COLO_COMPARE_FREE_SECONDARY | COLO_COMPARE_FREE_PRIMARY;
199
+ return true;
200
+ }
201
+ }
141
+ }
202
+
142
+
203
+ /* one part of secondary packet payload still need to be compared */
143
+ /* Allocate a node in IOVA address */
204
+ if (!after(ppkt->seq_end, spkt->seq_end)) {
144
+ return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first,
205
+ if (colo_compare_packet_payload(ppkt, spkt,
145
+ tree->iova_last);
206
+ ppkt->header_size + ppkt->offset,
207
+ spkt->header_size + spkt->offset,
208
+ ppkt->payload_size - ppkt->offset)) {
209
+ if (!after(ppkt->tcp_ack, max_ack)) {
210
+ *mark = COLO_COMPARE_FREE_PRIMARY;
211
+ spkt->offset += ppkt->payload_size - ppkt->offset;
212
+ return true;
213
+ } else {
214
+ /* secondary guest hasn't ack the data, don't send
215
+ * out this packet
216
+ */
217
+ return false;
218
+ }
219
+ }
220
+ } else {
221
+ /* primary packet is longer than secondary packet, compare
222
+ * the same part and mark the primary packet offset
223
+ */
224
+ if (colo_compare_packet_payload(ppkt, spkt,
225
+ ppkt->header_size + ppkt->offset,
226
+ spkt->header_size + spkt->offset,
227
+ spkt->payload_size - spkt->offset)) {
228
+ *mark = COLO_COMPARE_FREE_SECONDARY;
229
+ ppkt->offset += spkt->payload_size - spkt->offset;
230
+ return true;
231
+ }
232
+ }
233
234
- trace_colo_compare_main("compare tcp");
235
+ return false;
236
+}
237
238
- ptcp = (struct tcphdr *)ppkt->transport_header;
239
- stcp = (struct tcphdr *)spkt->transport_header;
240
+static void colo_compare_tcp(CompareState *s, Connection *conn)
241
+{
242
+ Packet *ppkt = NULL, *spkt = NULL;
243
+ int8_t mark;
244
245
/*
246
- * The 'identification' field in the IP header is *very* random
247
- * it almost never matches. Fudge this by ignoring differences in
248
- * unfragmented packets; they'll normally sort themselves out if different
249
- * anyway, and it should recover at the TCP level.
250
- * An alternative would be to get both the primary and secondary to rewrite
251
- * somehow; but that would need some sync traffic to sync the state
252
- */
253
- if (ntohs(ppkt->ip->ip_off) & IP_DF) {
254
- spkt->ip->ip_id = ppkt->ip->ip_id;
255
- /* and the sum will be different if the IDs were different */
256
- spkt->ip->ip_sum = ppkt->ip->ip_sum;
257
+ * If ppkt and spkt have the same payload, but ppkt's ACK
258
+ * is greater than spkt's ACK, in this case we can not
259
+ * send the ppkt because it will cause the secondary guest
260
+ * to miss sending some data in the next. Therefore, we
261
+ * record the maximum ACK in the current queue at both
262
+ * primary side and secondary side. Only when the ack is
263
+ * less than the smaller of the two maximum ack, then we
264
+ * can ensure that the packet's payload is acknowledged by
265
+ * primary and secondary.
266
+ */
267
+ uint32_t min_ack = conn->pack > conn->sack ? conn->sack : conn->pack;
268
+
269
+pri:
270
+ if (g_queue_is_empty(&conn->primary_list)) {
271
+ return;
272
}
273
+ ppkt = g_queue_pop_head(&conn->primary_list);
274
+sec:
275
+ if (g_queue_is_empty(&conn->secondary_list)) {
276
+ g_queue_push_head(&conn->primary_list, ppkt);
277
+ return;
278
+ }
279
+ spkt = g_queue_pop_head(&conn->secondary_list);
280
281
- /*
282
- * Check tcp header length for tcp option field.
283
- * th_off > 5 means this tcp packet have options field.
284
- * The tcp options maybe always different.
285
- * for example:
286
- * From RFC 7323.
287
- * TCP Timestamps option (TSopt):
288
- * Kind: 8
289
- *
290
- * Length: 10 bytes
291
- *
292
- * +-------+-------+---------------------+---------------------+
293
- * |Kind=8 | 10 | TS Value (TSval) |TS Echo Reply (TSecr)|
294
- * +-------+-------+---------------------+---------------------+
295
- * 1 1 4 4
296
- *
297
- * In this case the primary guest's timestamp always different with
298
- * the secondary guest's timestamp. COLO just focus on payload,
299
- * so we just need skip this field.
300
- */
301
+ if (ppkt->tcp_seq == ppkt->seq_end) {
302
+ colo_release_primary_pkt(s, ppkt);
303
+ ppkt = NULL;
304
+ }
305
306
- ptrdiff_t ptcp_offset, stcp_offset;
307
+ if (ppkt && conn->compare_seq && !after(ppkt->seq_end, conn->compare_seq)) {
308
+ trace_colo_compare_main("pri: this packet has compared");
309
+ colo_release_primary_pkt(s, ppkt);
310
+ ppkt = NULL;
311
+ }
312
313
- ptcp_offset = ppkt->transport_header - (uint8_t *)ppkt->data
314
- + (ptcp->th_off << 2) - ppkt->vnet_hdr_len;
315
- stcp_offset = spkt->transport_header - (uint8_t *)spkt->data
316
- + (stcp->th_off << 2) - spkt->vnet_hdr_len;
317
- if (ppkt->size - ptcp_offset == spkt->size - stcp_offset) {
318
- res = colo_compare_packet_payload(ppkt, spkt,
319
- ptcp_offset, stcp_offset,
320
- ppkt->size - ptcp_offset);
321
+ if (spkt->tcp_seq == spkt->seq_end) {
322
+ packet_destroy(spkt, NULL);
323
+ if (!ppkt) {
324
+ goto pri;
325
+ } else {
326
+ goto sec;
327
+ }
328
} else {
329
- trace_colo_compare_main("TCP: payload size of packets are different");
330
- res = -1;
331
+ if (conn->compare_seq && !after(spkt->seq_end, conn->compare_seq)) {
332
+ trace_colo_compare_main("sec: this packet has compared");
333
+ packet_destroy(spkt, NULL);
334
+ if (!ppkt) {
335
+ goto pri;
336
+ } else {
337
+ goto sec;
338
+ }
339
+ }
340
+ if (!ppkt) {
341
+ g_queue_push_head(&conn->secondary_list, spkt);
342
+ goto pri;
343
+ }
344
}
345
346
- if (res != 0 &&
347
- trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) {
348
- char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20];
349
-
350
- strcpy(pri_ip_src, inet_ntoa(ppkt->ip->ip_src));
351
- strcpy(pri_ip_dst, inet_ntoa(ppkt->ip->ip_dst));
352
- strcpy(sec_ip_src, inet_ntoa(spkt->ip->ip_src));
353
- strcpy(sec_ip_dst, inet_ntoa(spkt->ip->ip_dst));
354
-
355
- trace_colo_compare_ip_info(ppkt->size, pri_ip_src,
356
- pri_ip_dst, spkt->size,
357
- sec_ip_src, sec_ip_dst);
358
-
359
- trace_colo_compare_tcp_info("pri tcp packet",
360
- ntohl(ptcp->th_seq),
361
- ntohl(ptcp->th_ack),
362
- res, ptcp->th_flags,
363
- ppkt->size);
364
-
365
- trace_colo_compare_tcp_info("sec tcp packet",
366
- ntohl(stcp->th_seq),
367
- ntohl(stcp->th_ack),
368
- res, stcp->th_flags,
369
- spkt->size);
370
+ if (colo_mark_tcp_pkt(ppkt, spkt, &mark, min_ack)) {
371
+ trace_colo_compare_tcp_info("pri",
372
+ ppkt->tcp_seq, ppkt->tcp_ack,
373
+ ppkt->header_size, ppkt->payload_size,
374
+ ppkt->offset, ppkt->flags);
375
+
376
+ trace_colo_compare_tcp_info("sec",
377
+ spkt->tcp_seq, spkt->tcp_ack,
378
+ spkt->header_size, spkt->payload_size,
379
+ spkt->offset, spkt->flags);
380
+
381
+ if (mark == COLO_COMPARE_FREE_PRIMARY) {
382
+ conn->compare_seq = ppkt->seq_end;
383
+ colo_release_primary_pkt(s, ppkt);
384
+ g_queue_push_head(&conn->secondary_list, spkt);
385
+ goto pri;
386
+ }
387
+ if (mark == COLO_COMPARE_FREE_SECONDARY) {
388
+ conn->compare_seq = spkt->seq_end;
389
+ packet_destroy(spkt, NULL);
390
+ goto sec;
391
+ }
392
+ if (mark == (COLO_COMPARE_FREE_PRIMARY | COLO_COMPARE_FREE_SECONDARY)) {
393
+ conn->compare_seq = ppkt->seq_end;
394
+ colo_release_primary_pkt(s, ppkt);
395
+ packet_destroy(spkt, NULL);
396
+ goto pri;
397
+ }
398
+ } else {
399
+ g_queue_push_head(&conn->primary_list, ppkt);
400
+ g_queue_push_head(&conn->secondary_list, spkt);
401
402
qemu_hexdump((char *)ppkt->data, stderr,
403
"colo-compare ppkt", ppkt->size);
404
qemu_hexdump((char *)spkt->data, stderr,
405
"colo-compare spkt", spkt->size);
406
- }
407
408
- return res;
409
+ /*
410
+ * colo_compare_inconsistent_notify();
411
+ * TODO: notice to checkpoint();
412
+ */
413
+ }
414
}
415
416
+
417
/*
418
* Called from the compare thread on the primary
419
* for compare udp packet
420
@@ -XXX,XX +XXX,XX @@ static void colo_old_packet_check(void *opaque)
421
(GCompareFunc)colo_old_packet_check_one_conn);
422
}
423
424
-/*
425
- * Called from the compare thread on the primary
426
- * for compare packet with secondary list of the
427
- * specified connection when a new packet was
428
- * queued to it.
429
- */
430
-static void colo_compare_connection(void *opaque, void *user_data)
431
+static void colo_compare_packet(CompareState *s, Connection *conn,
432
+ int (*HandlePacket)(Packet *spkt,
433
+ Packet *ppkt))
434
{
435
- CompareState *s = user_data;
436
- Connection *conn = opaque;
437
Packet *pkt = NULL;
438
GList *result = NULL;
439
- int ret;
440
441
while (!g_queue_is_empty(&conn->primary_list) &&
442
!g_queue_is_empty(&conn->secondary_list)) {
443
pkt = g_queue_pop_head(&conn->primary_list);
444
- switch (conn->ip_proto) {
445
- case IPPROTO_TCP:
446
- result = g_queue_find_custom(&conn->secondary_list,
447
- pkt, (GCompareFunc)colo_packet_compare_tcp);
448
- break;
449
- case IPPROTO_UDP:
450
- result = g_queue_find_custom(&conn->secondary_list,
451
- pkt, (GCompareFunc)colo_packet_compare_udp);
452
- break;
453
- case IPPROTO_ICMP:
454
- result = g_queue_find_custom(&conn->secondary_list,
455
- pkt, (GCompareFunc)colo_packet_compare_icmp);
456
- break;
457
- default:
458
- result = g_queue_find_custom(&conn->secondary_list,
459
- pkt, (GCompareFunc)colo_packet_compare_other);
460
- break;
461
- }
462
+ result = g_queue_find_custom(&conn->secondary_list,
463
+ pkt, (GCompareFunc)HandlePacket);
464
465
if (result) {
466
- ret = compare_chr_send(s,
467
- pkt->data,
468
- pkt->size,
469
- pkt->vnet_hdr_len);
470
- if (ret < 0) {
471
- error_report("colo_send_primary_packet failed");
472
- }
473
- trace_colo_compare_main("packet same and release packet");
474
+ colo_release_primary_pkt(s, pkt);
475
g_queue_remove(&conn->secondary_list, result->data);
476
- packet_destroy(pkt, NULL);
477
} else {
478
/*
479
* If one packet arrive late, the secondary_list or
480
@@ -XXX,XX +XXX,XX @@ static void colo_compare_connection(void *opaque, void *user_data)
481
}
482
}
483
484
+/*
485
+ * Called from the compare thread on the primary
486
+ * for compare packet with secondary list of the
487
+ * specified connection when a new packet was
488
+ * queued to it.
489
+ */
490
+static void colo_compare_connection(void *opaque, void *user_data)
491
+{
492
+ CompareState *s = user_data;
493
+ Connection *conn = opaque;
494
+
495
+ switch (conn->ip_proto) {
496
+ case IPPROTO_TCP:
497
+ colo_compare_tcp(s, conn);
498
+ break;
499
+ case IPPROTO_UDP:
500
+ colo_compare_packet(s, conn, colo_packet_compare_udp);
501
+ break;
502
+ case IPPROTO_ICMP:
503
+ colo_compare_packet(s, conn, colo_packet_compare_icmp);
504
+ break;
505
+ default:
506
+ colo_compare_packet(s, conn, colo_packet_compare_other);
507
+ break;
508
+ }
509
+}
146
+}
510
+
147
+
511
static int compare_chr_send(CompareState *s,
148
+/**
512
const uint8_t *buf,
149
+ * Remove existing mappings from iova tree
513
uint32_t size,
150
+ *
514
diff --git a/net/colo.c b/net/colo.c
151
+ * @iova_tree: The vhost iova tree
515
index XXXXXXX..XXXXXXX 100644
152
+ * @map: The map to remove
516
--- a/net/colo.c
153
+ */
517
+++ b/net/colo.c
154
+void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map)
518
@@ -XXX,XX +XXX,XX @@ Connection *connection_new(ConnectionKey *key)
155
+{
519
conn->processing = false;
156
+ iova_tree_remove(iova_tree->iova_taddr_map, map);
520
conn->offset = 0;
157
+}
521
conn->syn_flag = 0;
158
diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
522
+ conn->pack = 0;
159
new file mode 100644
523
+ conn->sack = 0;
160
index XXXXXXX..XXXXXXX
524
g_queue_init(&conn->primary_list);
161
--- /dev/null
525
g_queue_init(&conn->secondary_list);
162
+++ b/hw/virtio/vhost-iova-tree.h
526
163
@@ -XXX,XX +XXX,XX @@
527
@@ -XXX,XX +XXX,XX @@ Packet *packet_new(const void *data, int size, int vnet_hdr_len)
164
+/*
528
pkt->size = size;
165
+ * vhost software live migration iova tree
529
pkt->creation_ms = qemu_clock_get_ms(QEMU_CLOCK_HOST);
166
+ *
530
pkt->vnet_hdr_len = vnet_hdr_len;
167
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
531
+ pkt->tcp_seq = 0;
168
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
532
+ pkt->tcp_ack = 0;
169
+ *
533
+ pkt->seq_end = 0;
170
+ * SPDX-License-Identifier: GPL-2.0-or-later
534
+ pkt->header_size = 0;
171
+ */
535
+ pkt->payload_size = 0;
172
+
536
+ pkt->offset = 0;
173
+#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H
537
+ pkt->flags = 0;
174
+#define HW_VIRTIO_VHOST_IOVA_TREE_H
538
175
+
539
return pkt;
176
+#include "qemu/iova-tree.h"
540
}
177
+#include "exec/memory.h"
541
diff --git a/net/colo.h b/net/colo.h
178
+
542
index XXXXXXX..XXXXXXX 100644
179
+typedef struct VhostIOVATree VhostIOVATree;
543
--- a/net/colo.h
180
+
544
+++ b/net/colo.h
181
+VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last);
545
@@ -XXX,XX +XXX,XX @@ typedef struct Packet {
182
+void vhost_iova_tree_delete(VhostIOVATree *iova_tree);
546
int64_t creation_ms;
183
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete);
547
/* Get vnet_hdr_len from filter */
184
+
548
uint32_t vnet_hdr_len;
185
+const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree,
549
+ uint32_t tcp_seq; /* sequence number */
186
+ const DMAMap *map);
550
+ uint32_t tcp_ack; /* acknowledgement number */
187
+int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map);
551
+ /* the sequence number of the last byte of the packet */
188
+void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map);
552
+ uint32_t seq_end;
189
+
553
+ uint8_t header_size; /* the header length */
190
+#endif
554
+ uint16_t payload_size; /* the payload length */
555
+ /* record the payload offset(the length that has been compared) */
556
+ uint16_t offset;
557
+ uint8_t flags; /* Flags(aka Control bits) */
558
} Packet;
559
560
typedef struct ConnectionKey {
561
@@ -XXX,XX +XXX,XX @@ typedef struct Connection {
562
/* flag to enqueue unprocessed_connections */
563
bool processing;
564
uint8_t ip_proto;
565
+ /* record the sequence number that has been compared */
566
+ uint32_t compare_seq;
567
+ /* the maximum of acknowledgement number in primary_list queue */
568
+ uint32_t pack;
569
+ /* the maximum of acknowledgement number in secondary_list queue */
570
+ uint32_t sack;
571
/* offset = secondary_seq - primary_seq */
572
tcp_seq offset;
573
/*
574
diff --git a/net/trace-events b/net/trace-events
575
index XXXXXXX..XXXXXXX 100644
576
--- a/net/trace-events
577
+++ b/net/trace-events
578
@@ -XXX,XX +XXX,XX @@ colo_compare_icmp_miscompare(const char *sta, int size) ": %s = %d"
579
colo_compare_ip_info(int psize, const char *sta, const char *stb, int ssize, const char *stc, const char *std) "ppkt size = %d, ip_src = %s, ip_dst = %s, spkt size = %d, ip_src = %s, ip_dst = %s"
580
colo_old_packet_check_found(int64_t old_time) "%" PRId64
581
colo_compare_miscompare(void) ""
582
-colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int res, uint32_t flag, int size) "side: %s seq/ack= %u/%u res= %d flags= 0x%x pkt_size: %d\n"
583
+colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int hdlen, int pdlen, int offset, int flags) "%s: seq/ack= %u/%u hdlen= %d pdlen= %d offset= %d flags=%d\n"
584
585
# net/filter-rewriter.c
586
colo_filter_rewriter_debug(void) ""
587
--
191
--
588
2.7.4
192
2.7.4
589
193
590
194
diff view generated by jsdifflib
1
From: Eugenio Pérez <eperezma@redhat.com>

Use the translations added in VhostIOVATree in SVQ.

This patch only introduces their usage, not allocation and
deallocation. As with previous patches, we use the dead code paths of
shadow_vqs_enabled to avoid committing too many changes at once. These
are impossible to exercise at the moment.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>

From: Thomas Huth <thuth@redhat.com>

It does not make much sense to limit these commands to the legacy
'vlan' concept only; they should work with the modern netdevs, too. So
it is now possible to use these commands with one, two or three
parameters.

With one parameter, the command installs a hostfwd rule on the default
"user" network:
   hostfwd_add tcp:...

With two parameters, the command installs a hostfwd rule on a netdev
(that's the new way of using this command):
   hostfwd_add netdev_id tcp:...

With three parameters, the command installs a rule on a 'vlan' (aka hub):
   hostfwd_add hub_id name tcp:...

Same applies to the hostfwd_remove command now.
Same applies to the hostfwd_remove command now.
19
20
Signed-off-by: Thomas Huth <thuth@redhat.com>
21
Signed-off-by: Jason Wang <jasowang@redhat.com>
12
Signed-off-by: Jason Wang <jasowang@redhat.com>
22
---
13
---
23
hmp-commands.hx | 4 ++--
14
hw/virtio/vhost-shadow-virtqueue.c | 75 +++++++++++++++++++++--
24
net/slirp.c | 33 +++++++++++++++++++++++----------
15
hw/virtio/vhost-shadow-virtqueue.h | 6 +-
25
2 files changed, 25 insertions(+), 12 deletions(-)
16
hw/virtio/vhost-vdpa.c | 122 +++++++++++++++++++++++++++++++------
26
17
include/hw/virtio/vhost-vdpa.h | 3 +
27
diff --git a/hmp-commands.hx b/hmp-commands.hx
18
4 files changed, 181 insertions(+), 25 deletions(-)
19
20
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
28
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
29
--- a/hmp-commands.hx
22
--- a/hw/virtio/vhost-shadow-virtqueue.c
30
+++ b/hmp-commands.hx
23
+++ b/hw/virtio/vhost-shadow-virtqueue.c
31
@@ -XXX,XX +XXX,XX @@ ETEXI
24
@@ -XXX,XX +XXX,XX @@ static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
32
{
25
return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
33
.name = "hostfwd_add",
26
}
34
.args_type = "arg1:s,arg2:s?,arg3:s?",
27
35
- .params = "[vlan_id name] [tcp|udp]:[hostaddr]:hostport-[guestaddr]:guestport",
28
+/**
36
+ .params = "[hub_id name]|[netdev_id] [tcp|udp]:[hostaddr]:hostport-[guestaddr]:guestport",
29
+ * Translate addresses between the qemu's virtual address and the SVQ IOVA
37
.help = "redirect TCP or UDP connections from host to guest (requires -net user)",
30
+ *
38
.cmd = hmp_hostfwd_add,
31
+ * @svq: Shadow VirtQueue
39
},
32
+ * @vaddr: Translated IOVA addresses
40
@@ -XXX,XX +XXX,XX @@ ETEXI
33
+ * @iovec: Source qemu's VA addresses
41
{
34
+ * @num: Length of iovec and minimum length of vaddr
42
.name = "hostfwd_remove",
35
+ */
43
.args_type = "arg1:s,arg2:s?,arg3:s?",
36
+static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
44
- .params = "[vlan_id name] [tcp|udp]:[hostaddr]:hostport",
37
+ void **addrs, const struct iovec *iovec,
45
+ .params = "[hub_id name]|[netdev_id] [tcp|udp]:[hostaddr]:hostport",
38
+ size_t num)
46
.help = "remove host-to-guest TCP or UDP redirection",
39
+{
47
.cmd = hmp_hostfwd_remove,
40
+ if (num == 0) {
48
},
41
+ return true;
49
diff --git a/net/slirp.c b/net/slirp.c
42
+ }
43
+
44
+ for (size_t i = 0; i < num; ++i) {
45
+ DMAMap needle = {
46
+ .translated_addr = (hwaddr)iovec[i].iov_base,
47
+ .size = iovec[i].iov_len,
48
+ };
49
+ size_t off;
50
+
51
+ const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
52
+ /*
53
+ * Map cannot be NULL since iova map contains all guest space and
54
+ * qemu already has a physical address mapped
55
+ */
56
+ if (unlikely(!map)) {
57
+ qemu_log_mask(LOG_GUEST_ERROR,
58
+ "Invalid address 0x%"HWADDR_PRIx" given by guest",
59
+ needle.translated_addr);
60
+ return false;
61
+ }
62
+
63
+ off = needle.translated_addr - map->translated_addr;
64
+ addrs[i] = (void *)(map->iova + off);
65
+
66
+ if (unlikely(int128_gt(int128_add(needle.translated_addr,
67
+ iovec[i].iov_len),
68
+ map->translated_addr + map->size))) {
69
+ qemu_log_mask(LOG_GUEST_ERROR,
70
+ "Guest buffer expands over iova range");
71
+ return false;
72
+ }
73
+ }
74
+
75
+ return true;
76
+}
77
+
78
static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
79
+ void * const *sg,
80
const struct iovec *iovec,
81
size_t num, bool more_descs, bool write)
82
{
83
@@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
84
} else {
85
descs[i].flags = flags;
86
}
87
- descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
88
+ descs[i].addr = cpu_to_le64((hwaddr)sg[n]);
89
descs[i].len = cpu_to_le32(iovec[n].iov_len);
90
91
last = i;
92
@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
93
{
94
unsigned avail_idx;
95
vring_avail_t *avail = svq->vring.avail;
96
+ bool ok;
97
+ g_autofree void **sgs = g_new(void *, MAX(elem->out_num, elem->in_num));
98
99
*head = svq->free_head;
100
101
@@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
102
return false;
103
}
104
105
- vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
106
+ ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
107
+ if (unlikely(!ok)) {
108
+ return false;
109
+ }
110
+ vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
111
elem->in_num > 0, false);
112
- vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
113
+
114
+
115
+ ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
116
+ if (unlikely(!ok)) {
117
+ return false;
118
+ }
119
+
120
+ vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
121
122
/*
123
* Put the entry in the available array (but don't update avail->idx until
124
@@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
125
* Creates vhost shadow virtqueue, and instructs the vhost device to use the
126
* shadow methods and file descriptors.
127
*
128
+ * @iova_tree: Tree to perform descriptors translations
129
+ *
130
* Returns the new virtqueue or NULL.
131
*
132
* In case of error, reason is reported through error_report.
133
*/
134
-VhostShadowVirtqueue *vhost_svq_new(void)
135
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree)
136
{
137
g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
138
int r;
139
@@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void)
140
141
event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
142
event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
143
+ svq->iova_tree = iova_tree;
144
return g_steal_pointer(&svq);
145
146
err_init_hdev_call:
147
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -XXX,XX +XXX,XX @@
 #include "qemu/event_notifier.h"
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"
+#include "hw/virtio/vhost-iova-tree.h"

 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
@@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue {
     /* Virtio device */
     VirtIODevice *vdev;

+    /* IOVA mapping */
+    VhostIOVATree *iova_tree;
+
     /* Map for use the guest's descriptors */
     VirtQueueElement **ring_id_maps;

@@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                      VirtQueue *vq);
 void vhost_svq_stop(VhostShadowVirtqueue *svq);

-VhostShadowVirtqueue *vhost_svq_new(void);
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree);

 void vhost_svq_free(gpointer vq);
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                          vaddr, section->readonly);

     llsize = int128_sub(llend, int128_make64(iova));
+    if (v->shadow_vqs_enabled) {
+        DMAMap mem_region = {
+            .translated_addr = (hwaddr)vaddr,
+            .size = int128_get64(llsize) - 1,
+            .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
+        };
+
+        int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
+        if (unlikely(r != IOVA_OK)) {
+            error_report("Can't allocate a mapping (%d)", r);
+            goto fail;
+        }
+
+        iova = mem_region.iova;
+    }

     vhost_vdpa_iotlb_batch_begin_once(v);
     ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
@@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,

     llsize = int128_sub(llend, int128_make64(iova));

+    if (v->shadow_vqs_enabled) {
+        const DMAMap *result;
+        const void *vaddr = memory_region_get_ram_ptr(section->mr) +
+            section->offset_within_region +
+            (iova - section->offset_within_address_space);
+        DMAMap mem_region = {
+            .translated_addr = (hwaddr)vaddr,
+            .size = int128_get64(llsize) - 1,
+        };
+
+        result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
+        iova = result->iova;
+        vhost_iova_tree_remove(v->iova_tree, &mem_region);
+    }
     vhost_vdpa_iotlb_batch_begin_once(v);
     ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
     if (ret) {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,

     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
     for (unsigned n = 0; n < hdev->nvqs; ++n) {
-        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();
+        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);

         if (unlikely(!svq)) {
             error_setg(errp, "Cannot create svq %u", n);
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
 /**
  * Unmap a SVQ area in the device
  */
-static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
-                                      hwaddr size)
+static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
+                                      const DMAMap *needle)
 {
+    const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
+    hwaddr size;
     int r;

-    size = ROUND_UP(size, qemu_real_host_page_size);
-    r = vhost_vdpa_dma_unmap(v, iova, size);
+    if (unlikely(!result)) {
+        error_report("Unable to find SVQ address to unmap");
+        return false;
+    }
+
+    size = ROUND_UP(result->size, qemu_real_host_page_size);
+    r = vhost_vdpa_dma_unmap(v, result->iova, size);
     return r == 0;
 }

 static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                        const VhostShadowVirtqueue *svq)
 {
+    DMAMap needle = {};
     struct vhost_vdpa *v = dev->opaque;
     struct vhost_vring_addr svq_addr;
-    size_t device_size = vhost_svq_device_area_size(svq);
-    size_t driver_size = vhost_svq_driver_area_size(svq);
     bool ok;

     vhost_svq_get_vring_addr(svq, &svq_addr);

-    ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
+    needle.translated_addr = svq_addr.desc_user_addr;
+    ok = vhost_vdpa_svq_unmap_ring(v, &needle);
     if (unlikely(!ok)) {
         return false;
     }

-    return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
+    needle.translated_addr = svq_addr.used_user_addr;
+    return vhost_vdpa_svq_unmap_ring(v, &needle);
+}
+
+/**
+ * Map the SVQ area in the device
+ *
+ * @v: Vhost-vdpa device
+ * @needle: The area to search iova
+ * @errp: Error pointer
+ */
+static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
+                                    Error **errp)
+{
+    int r;
+
+    r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
+    if (unlikely(r != IOVA_OK)) {
+        error_setg(errp, "Cannot allocate iova (%d)", r);
+        return false;
+    }
+
+    r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
+                           (void *)needle->translated_addr,
+                           needle->perm == IOMMU_RO);
+    if (unlikely(r != 0)) {
+        error_setg_errno(errp, -r, "Cannot map region to device");
+        vhost_iova_tree_remove(v->iova_tree, needle);
+    }
+
+    return r == 0;
 }

 /**
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                      struct vhost_vring_addr *addr,
                                      Error **errp)
 {
+    DMAMap device_region, driver_region;
+    struct vhost_vring_addr svq_addr;
     struct vhost_vdpa *v = dev->opaque;
     size_t device_size = vhost_svq_device_area_size(svq);
     size_t driver_size = vhost_svq_driver_area_size(svq);
-    int r;
+    size_t avail_offset;
+    bool ok;

     ERRP_GUARD();
-    vhost_svq_get_vring_addr(svq, addr);
+    vhost_svq_get_vring_addr(svq, &svq_addr);

-    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
-                           (void *)addr->desc_user_addr, true);
-    if (unlikely(r != 0)) {
-        error_setg_errno(errp, -r, "Cannot create vq driver region: ");
+    driver_region = (DMAMap) {
+        .translated_addr = svq_addr.desc_user_addr,
+        .size = driver_size - 1,
+        .perm = IOMMU_RO,
+    };
+    ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
+    if (unlikely(!ok)) {
+        error_prepend(errp, "Cannot create vq driver region: ");
         return false;
     }
+    addr->desc_user_addr = driver_region.iova;
+    avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
+    addr->avail_user_addr = driver_region.iova + avail_offset;

-    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
-                           (void *)addr->used_user_addr, false);
-    if (unlikely(r != 0)) {
-        error_setg_errno(errp, -r, "Cannot create vq device region: ");
+    device_region = (DMAMap) {
+        .translated_addr = svq_addr.used_user_addr,
+        .size = device_size - 1,
+        .perm = IOMMU_RW,
+    };
+    ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
+    if (unlikely(!ok)) {
+        error_prepend(errp, "Cannot create vq device region: ");
+        vhost_vdpa_svq_unmap_ring(v, &driver_region);
     }
+    addr->used_user_addr = device_region.iova;

-    return r == 0;
+    return ok;
 }

 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@

 #include <gmodule.h>

+#include "hw/virtio/vhost-iova-tree.h"
 #include "hw/virtio/virtio.h"
 #include "standard-headers/linux/vhost_types.h"

@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
     MemoryListener listener;
     struct vhost_vdpa_iova_range iova_range;
     bool shadow_vqs_enabled;
+    /* IOVA mapping used by the Shadow Virtqueue */
+    VhostIOVATree *iova_tree;
     GPtrArray *shadow_vqs;
     struct vhost_dev *dev;
    VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
--
2.7.4
diff view generated by jsdifflib
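A note for reviewers new to the IOVA tree used by the patch above: the memory listener allocates a DMAMap per guest memory section and later translates host addresses back to device IOVA with vhost_iova_tree_find_iova(). The following is a minimal, self-contained sketch of that lookup, not QEMU code: the names SimpleDMAMap and simple_tree_translate are hypothetical, and a linear scan stands in for the GTree that util/iova-tree.c actually uses. It also shows the inclusive .size convention, which is why the patch stores int128_get64(llsize) - 1 and why vhost_vdpa_svq_map_ring() maps needle->size + 1 bytes.

#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical stand-in for QEMU's DMAMap: the mapping covers
 * [translated_addr, translated_addr + size], i.e. size is inclusive.
 */
typedef struct {
    uint64_t iova;            /* base address in the device's IOVA space */
    uint64_t translated_addr; /* base address in qemu's virtual memory */
    uint64_t size;            /* inclusive length minus one */
} SimpleDMAMap;

/*
 * Translate a host virtual address to an IOVA by scanning the mappings.
 * Returns 0 and fills *iova on success, -1 if no mapping covers addr.
 */
static int simple_tree_translate(const SimpleDMAMap *maps, size_t n,
                                 uint64_t addr, uint64_t *iova)
{
    for (size_t i = 0; i < n; i++) {
        if (addr >= maps[i].translated_addr &&
            addr - maps[i].translated_addr <= maps[i].size) {
            *iova = maps[i].iova + (addr - maps[i].translated_addr);
            return 0;
        }
    }
    return -1;
}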
From: Eugenio Pérez <eperezma@redhat.com>

This is needed to achieve migration, so the destination can restore its
index.

Setting the base as the last used idx means the destination will see as
available all the entries that the device did not use, including the
ones still in flight.

This is ok for networking, but other kinds of devices might have
problems with these retransmissions.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-vdpa.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                      struct vhost_vring_state *ring)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int ret;

+    if (v->shadow_vqs_enabled) {
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
+                                                      ring->index);
+
+        /*
+         * Setting base as last used idx, so destination will see as available
+         * all the entries that the device did not use, including the in-flight
+         * processing ones.
+         *
+         * TODO: This is ok for networking, but other kinds of devices might
+         * have problems with these retransmissions.
+         */
+        ring->num = svq->last_used_idx;
+        return 0;
+    }
+
    ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
    trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
    return ret;
--
2.7.4
diff view generated by jsdifflib
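A quick numeric illustration of the reasoning in the patch above. This is a toy model, not QEMU API; svq_migration_base is a hypothetical name for the rule "report last_used_idx as the vring base":

#include <assert.h>
#include <stdint.h>

/*
 * Toy model: the guest made avail_idx buffers available and the source
 * device completed last_used_idx of them. Migrating with the base set to
 * last_used_idx makes the destination re-expose every uncompleted entry,
 * so in-flight requests are simply resubmitted (harmless for net, where
 * a duplicate transmit just repeats a packet).
 */
static uint16_t svq_migration_base(uint16_t avail_idx, uint16_t last_used_idx)
{
    (void)avail_idx; /* entries in [last_used_idx, avail_idx) are replayed */
    return last_used_idx;
}

int main(void)
{
    /* 10 buffers queued, 7 completed: the destination resumes at 7 and
     * re-processes descriptors 7, 8 and 9. */
    assert(svq_migration_base(10, 7) == 7);
    return 0;
}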
From: Eugenio Pérez <eperezma@redhat.com>

Setting the log address would make the device start reporting invalid
dirty memory because the SVQ vrings are located in qemu's memory.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-vdpa.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                    struct vhost_log *log)
 {
-    if (vhost_vdpa_one_time_request(dev)) {
+    struct vhost_vdpa *v = dev->opaque;
+    if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) {
        return 0;
    }
--
2.7.4
diff view generated by jsdifflib
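Background for the "invalid dirty memory" remark in the patch above, as a hedged sketch: the vhost dirty log is a bitmap indexed by guest-physical page (the 0x1000 granularity mirrors VHOST_LOG_PAGE in hw/virtio/vhost.c; log_write below is illustrative, not the QEMU helper). A device that logged its writes to the SVQ vrings would derive bit positions from qemu heap addresses rather than guest physical addresses, so the set bits would point at meaningless pages.

#include <stdint.h>

#define VHOST_LOG_PAGE 0x1000ULL /* one bit per 4 KiB page */

/*
 * Mark [gpa, gpa + len) dirty in a page-granular bitmap (len > 0).
 * Only meaningful when gpa really is a guest physical address; with SVQ
 * enabled the device writes to vrings allocated from qemu's memory, so
 * the computed page numbers would index nothing valid in the bitmap.
 */
static void log_write(uint64_t *log, uint64_t gpa, uint64_t len)
{
    uint64_t first = gpa / VHOST_LOG_PAGE;
    uint64_t last = (gpa + len - 1) / VHOST_LOG_PAGE;

    for (uint64_t page = first; page <= last; page++) {
        log[page / 64] |= 1ULL << (page % 64);
    }
}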
From: Eugenio Pérez <eperezma@redhat.com>

SVQ is able to log the dirty bits by itself, so let's use it to not
block migration.

Also, ignore set and clear of VHOST_F_LOG_ALL on set_features if SVQ is
enabled. Even if the device supports it, the reports would be nonsense
because the SVQ memory is in the qemu region.

The log region is still allocated. Future changes might skip that, but
this series is already long enough.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-vdpa.c         | 39 +++++++++++++++++++++++++++++++++++----
 include/hw/virtio/vhost-vdpa.h |  1 +
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
     return v->index != 0;
 }

+static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
+                                       uint64_t *features)
+{
+    int ret;
+
+    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
+    trace_vhost_vdpa_get_features(dev, *features);
+    return ret;
+}
+
 static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                                Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
         return 0;
     }

-    r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features);
+    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
     if (r != 0) {
         error_setg_errno(errp, -r, "Can't get vdpa device features");
         return r;
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
 static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                    uint64_t features)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int ret;

     if (vhost_vdpa_one_time_request(dev)) {
         return 0;
     }

+    if (v->shadow_vqs_enabled) {
+        if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
+            /*
+             * QEMU is just trying to enable or disable logging. SVQ handles
+             * this separately, so no need to forward this.
+             */
+            v->acked_features = features;
+            return 0;
+        }
+
+        v->acked_features = features;
+
+        /* We must not ack _F_LOG if SVQ is enabled */
+        features &= ~BIT_ULL(VHOST_F_LOG_ALL);
+    }
+
     trace_vhost_vdpa_set_features(dev, features);
     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
     if (ret) {
@@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
 static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                    uint64_t *features)
 {
-    int ret;
+    struct vhost_vdpa *v = dev->opaque;
+    int ret = vhost_vdpa_get_dev_features(dev, features);
+
+    if (ret == 0 && v->shadow_vqs_enabled) {
+        /* Add SVQ logging capabilities */
+        *features |= BIT_ULL(VHOST_F_LOG_ALL);
+    }

-    ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
-    trace_vhost_vdpa_get_features(dev, *features);
     return ret;
 }

diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa {
     bool iotlb_batch_begin_sent;
     MemoryListener listener;
     struct vhost_vdpa_iova_range iova_range;
+    uint64_t acked_features;
     bool shadow_vqs_enabled;
     /* IOVA mapping used by the Shadow Virtqueue */
     VhostIOVATree *iova_tree;
--
2.7.4
diff view generated by jsdifflib
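The guard in vhost_vdpa_set_features() above compresses a lot into one expression: (acked ^ features) == BIT_ULL(VHOST_F_LOG_ALL) is true exactly when the two feature words differ in the logging bit and in no other bit. A standalone check of that bit trick follows; BIT_ULL and VHOST_F_LOG_ALL are redefined locally so the sketch compiles on its own (26 is the value from linux/vhost.h), and only_log_bit_toggled is a hypothetical helper name.

#include <assert.h>
#include <stdint.h>

#define BIT_ULL(n)       (1ULL << (n))
#define VHOST_F_LOG_ALL  26 /* from linux/vhost.h */

/*
 * True iff prev and next differ in VHOST_F_LOG_ALL and nothing else:
 * XOR keeps exactly the differing bits, so comparing the result against
 * the single-bit mask detects a pure logging toggle.
 */
static int only_log_bit_toggled(uint64_t prev, uint64_t next)
{
    return (prev ^ next) == BIT_ULL(VHOST_F_LOG_ALL);
}

int main(void)
{
    uint64_t acked = BIT_ULL(0) | BIT_ULL(32); /* some acked feature set */

    /* Toggling only _F_LOG_ALL is swallowed (SVQ logs by itself)... */
    assert(only_log_bit_toggled(acked, acked | BIT_ULL(VHOST_F_LOG_ALL)));

    /* ...but any other change is still forwarded to the device. */
    assert(!only_log_bit_toggled(acked, acked | BIT_ULL(1) |
                                        BIT_ULL(VHOST_F_LOG_ALL)));
    return 0;
}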