The following changes since commit c6a5fc2ac76c5ab709896ee1b0edd33685a67ed1:

  decodetree: Add --output-null for meson testing (2023-05-31 19:56:42 -0700)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 98b126f5e3228a346c774e569e26689943b401dd:

  qapi: add '@fdset' feature for BlockdevOptionsVirtioBlkVhostVdpa (2023-06-01 11:08:21 -0400)

----------------------------------------------------------------
Pull request

- Stefano Garzarella's blkio block driver 'fd' parameter
- My thread-local blk_io_plug() series

----------------------------------------------------------------

Stefan Hajnoczi (6):
  block: add blk_io_plug_call() API
  block/nvme: convert to blk_io_plug_call() API
  block/blkio: convert to blk_io_plug_call() API
  block/io_uring: convert to blk_io_plug_call() API
  block/linux-aio: convert to blk_io_plug_call() API
  block: remove bdrv_co_io_plug() API

Stefano Garzarella (2):
  block/blkio: use qemu_open() to support fd passing for virtio-blk
  qapi: add '@fdset' feature for BlockdevOptionsVirtioBlkVhostVdpa

 MAINTAINERS                       |   1 +
 qapi/block-core.json              |   6 ++
 meson.build                       |   4 +
 include/block/block-io.h          |   3 -
 include/block/block_int-common.h  |  11 ---
 include/block/raw-aio.h           |  14 ---
 include/sysemu/block-backend-io.h |  13 +--
 block/blkio.c                     |  96 ++++++++++++------
 block/block-backend.c             |  22 -----
 block/file-posix.c                |  38 -------
 block/io.c                        |  37 -------
 block/io_uring.c                  |  44 ++++-----
 block/linux-aio.c                 |  41 +++-----
 block/nvme.c                      |  44 +++------
 block/plug.c                      | 159 ++++++++++++++++++++++++++++++
 hw/block/dataplane/xen-block.c    |   8 +-
 hw/block/virtio-blk.c             |   4 +-
 hw/scsi/virtio-scsi.c             |   6 +-
 block/meson.build                 |   1 +
 block/trace-events                |   6 +-
 20 files changed, 293 insertions(+), 265 deletions(-)
 create mode 100644 block/plug.c

-- 
2.40.1
Deleted patch

QEMU currently only has ASCII Kconfig files but Linux actually uses
UTF-8. Explicitly specify the encoding and that we're doing text file
I/O.

It's unclear whether or not QEMU will ever need Unicode in its Kconfig
files. If we start using the help text then it will become an issue
sooner or later. Make this change now for consistency with Linux
Kconfig.

Reported-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200521153616.307100-1-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 scripts/minikconf.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/minikconf.py b/scripts/minikconf.py
index XXXXXXX..XXXXXXX 100755
--- a/scripts/minikconf.py
+++ b/scripts/minikconf.py
@@ -XXX,XX +XXX,XX @@ class KconfigParser:
         if incl_abs_fname in self.data.previously_included:
             return
         try:
-            fp = open(incl_abs_fname, 'r')
+            fp = open(incl_abs_fname, 'rt', encoding='utf-8')
         except IOError as e:
             raise KconfigParserError(self,
                 '%s: %s' % (e.strerror, include))
@@ -XXX,XX +XXX,XX @@ if __name__ == '__main__':
             parser.do_assignment(name, value == 'y')
             external_vars.add(name[7:])
         else:
-            fp = open(arg, 'r')
+            fp = open(arg, 'rt', encoding='utf-8')
             parser.parse_file(fp)
             fp.close()

@@ -XXX,XX +XXX,XX @@ if __name__ == '__main__':
         if key not in external_vars and config[key]:
             print ('CONFIG_%s=y' % key)

-    deps = open(argv[2], 'w')
+    deps = open(argv[2], 'wt', encoding='utf-8')
     for fname in data.previously_included:
         print ('%s: %s' % (argv[1], fname), file=deps)
     deps.close()
-- 
2.26.2
Introduce a new API for thread-local blk_io_plug() that does not
traverse the block graph. The goal is to make blk_io_plug() multi-queue
friendly.

Instead of having block drivers track whether or not we're in a plugged
section, provide an API that allows them to defer a function call until
we're unplugged: blk_io_plug_call(fn, opaque). If blk_io_plug_call() is
called multiple times with the same fn/opaque pair, then fn() is only
called once at the end of the function - resulting in batching.

This patch introduces the API and changes blk_io_plug()/blk_io_unplug().
blk_io_plug()/blk_io_unplug() no longer require a BlockBackend argument
because the plug state is now thread-local.

Later patches convert block drivers to blk_io_plug_call() and then we
can finally remove .bdrv_co_io_plug() once all block drivers have been
converted.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 20230530180959.1108766-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS                       |   1 +
 include/sysemu/block-backend-io.h |  13 +--
 block/block-backend.c             |  22 -----
 block/plug.c                      | 159 ++++++++++++++++++++++++++++++
 hw/block/dataplane/xen-block.c    |   8 +-
 hw/block/virtio-blk.c             |   4 +-
 hw/scsi/virtio-scsi.c             |   6 +-
 block/meson.build                 |   1 +
 8 files changed, 173 insertions(+), 41 deletions(-)
 create mode 100644 block/plug.c

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: util/aio-*.c
 F: util/aio-*.h
 F: util/fdmon-*.c
 F: block/io.c
+F: block/plug.c
 F: migration/block*
 F: include/block/aio.h
 F: include/block/aio-wait.h
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -XXX,XX +XXX,XX @@ void blk_iostatus_set_err(BlockBackend *blk, int error);
 int blk_get_max_iov(BlockBackend *blk);
 int blk_get_max_hw_iov(BlockBackend *blk);

-/*
- * blk_io_plug/unplug are thread-local operations. This means that multiple
- * IOThreads can simultaneously call plug/unplug, but the caller must ensure
- * that each unplug() is called in the same IOThread of the matching plug().
- */
-void coroutine_fn blk_co_io_plug(BlockBackend *blk);
-void co_wrapper blk_io_plug(BlockBackend *blk);
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk);
-void co_wrapper blk_io_unplug(BlockBackend *blk);
+void blk_io_plug(void);
+void blk_io_unplug(void);
+void blk_io_plug_call(void (*fn)(void *), void *opaque);

 AioContext *blk_get_aio_context(BlockBackend *blk);
 BlockAcctStats *blk_get_stats(BlockBackend *blk);
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
     notifier_list_add(&blk->insert_bs_notifiers, notify);
 }

-void coroutine_fn blk_co_io_plug(BlockBackend *blk)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    IO_CODE();
-    GRAPH_RDLOCK_GUARD();
-
-    if (bs) {
-        bdrv_co_io_plug(bs);
-    }
-}
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    IO_CODE();
-    GRAPH_RDLOCK_GUARD();
-
-    if (bs) {
-        bdrv_co_io_unplug(bs);
-    }
-}
-
 BlockAcctStats *blk_get_stats(BlockBackend *blk)
 {
     IO_CODE();
diff --git a/block/plug.c b/block/plug.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/block/plug.c
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Block I/O plugging
+ *
+ * Copyright Red Hat.
+ *
+ * This API defers a function call within a blk_io_plug()/blk_io_unplug()
+ * section, allowing multiple calls to batch up. This is a performance
+ * optimization that is used in the block layer to submit several I/O requests
+ * at once instead of individually:
+ *
+ *   blk_io_plug(); <-- start of plugged region
+ *   ...
+ *   blk_io_plug_call(my_func, my_obj); <-- deferred my_func(my_obj) call
+ *   blk_io_plug_call(my_func, my_obj); <-- another
+ *   blk_io_plug_call(my_func, my_obj); <-- another
+ *   ...
+ *   blk_io_unplug(); <-- end of plugged region, my_func(my_obj) is called once
+ *
+ * This code is actually generic and not tied to the block layer. If another
+ * subsystem needs this functionality, it could be renamed.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine-tls.h"
+#include "qemu/notify.h"
+#include "qemu/thread.h"
+#include "sysemu/block-backend.h"
+
+/* A function call that has been deferred until unplug() */
+typedef struct {
+    void (*fn)(void *);
+    void *opaque;
+} UnplugFn;
+
+/* Per-thread state */
+typedef struct {
+    unsigned count;       /* how many times has plug() been called? */
+    GArray *unplug_fns;   /* functions to call at unplug time */
+} Plug;
+
+/* Use get_ptr_plug() to fetch this thread-local value */
+QEMU_DEFINE_STATIC_CO_TLS(Plug, plug);
+
+/* Called at thread cleanup time */
+static void blk_io_plug_atexit(Notifier *n, void *value)
+{
+    Plug *plug = get_ptr_plug();
+    g_array_free(plug->unplug_fns, TRUE);
+}
+
+/* This won't involve coroutines, so use __thread */
+static __thread Notifier blk_io_plug_atexit_notifier;
+
+/**
+ * blk_io_plug_call:
+ * @fn: a function pointer to be invoked
+ * @opaque: a user-defined argument to @fn()
+ *
+ * Call @fn(@opaque) immediately if not within a blk_io_plug()/blk_io_unplug()
+ * section.
+ *
+ * Otherwise defer the call until the end of the outermost
+ * blk_io_plug()/blk_io_unplug() section in this thread. If the same
+ * @fn/@opaque pair has already been deferred, it will only be called once upon
+ * blk_io_unplug() so that accumulated calls are batched into a single call.
+ *
+ * The caller must ensure that @opaque is not freed before @fn() is invoked.
+ */
+void blk_io_plug_call(void (*fn)(void *), void *opaque)
+{
+    Plug *plug = get_ptr_plug();
+
+    /* Call immediately if we're not plugged */
+    if (plug->count == 0) {
+        fn(opaque);
+        return;
+    }
+
+    GArray *array = plug->unplug_fns;
+    if (!array) {
+        array = g_array_new(FALSE, FALSE, sizeof(UnplugFn));
+        plug->unplug_fns = array;
+        blk_io_plug_atexit_notifier.notify = blk_io_plug_atexit;
+        qemu_thread_atexit_add(&blk_io_plug_atexit_notifier);
+    }
+
+    UnplugFn *fns = (UnplugFn *)array->data;
+    UnplugFn new_fn = {
+        .fn = fn,
+        .opaque = opaque,
+    };
+
+    /*
+     * There won't be many, so do a linear search. If this becomes a bottleneck
+     * then a binary search (glib 2.62+) or different data structure could be
+     * used.
+     */
+    for (guint i = 0; i < array->len; i++) {
+        if (memcmp(&fns[i], &new_fn, sizeof(new_fn)) == 0) {
+            return; /* already exists */
+        }
+    }
+
+    g_array_append_val(array, new_fn);
+}
+
+/**
+ * blk_io_plug: Defer blk_io_plug_call() functions until blk_io_unplug()
+ *
+ * blk_io_plug/unplug are thread-local operations. This means that multiple
+ * threads can simultaneously call plug/unplug, but the caller must ensure that
+ * each unplug() is called in the same thread of the matching plug().
+ *
+ * Nesting is supported. blk_io_plug_call() functions are only called at the
+ * outermost blk_io_unplug().
+ */
+void blk_io_plug(void)
+{
+    Plug *plug = get_ptr_plug();
+
+    assert(plug->count < UINT32_MAX);
+
+    plug->count++;
+}
+
+/**
+ * blk_io_unplug: Run any pending blk_io_plug_call() functions
+ *
+ * There must have been a matching blk_io_plug() call in the same thread prior
+ * to this blk_io_unplug() call.
+ */
+void blk_io_unplug(void)
+{
+    Plug *plug = get_ptr_plug();
+
+    assert(plug->count > 0);
+
+    if (--plug->count > 0) {
+        return;
+    }
+
+    GArray *array = plug->unplug_fns;
+    if (!array) {
+        return;
+    }
+
+    UnplugFn *fns = (UnplugFn *)array->data;
+
+    for (guint i = 0; i < array->len; i++) {
+        fns[i].fn(fns[i].opaque);
+    }
+
+    /*
+     * This resets the array without freeing memory so that appending is cheap
+     * in the future.
+     */
+    g_array_set_size(array, 0);
+}
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -XXX,XX +XXX,XX @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
      * is below us.
      */
     if (inflight_atstart > IO_PLUG_THRESHOLD) {
-        blk_io_plug(dataplane->blk);
+        blk_io_plug();
     }
     while (rc != rp) {
         /* pull request from ring */
@@ -XXX,XX +XXX,XX @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)

         if (inflight_atstart > IO_PLUG_THRESHOLD &&
             batched >= inflight_atstart) {
-            blk_io_unplug(dataplane->blk);
+            blk_io_unplug();
         }
         xen_block_do_aio(request);
         if (inflight_atstart > IO_PLUG_THRESHOLD) {
             if (batched >= inflight_atstart) {
-                blk_io_plug(dataplane->blk);
+                blk_io_plug();
                 batched = 0;
             } else {
                 batched++;
@@ -XXX,XX +XXX,XX @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
             }
         }
     }
     if (inflight_atstart > IO_PLUG_THRESHOLD) {
-        blk_io_unplug(dataplane->blk);
+        blk_io_unplug();
     }

     return done_something;
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
     bool suppress_notifications = virtio_queue_get_notification(vq);

     aio_context_acquire(blk_get_aio_context(s->blk));
-    blk_io_plug(s->blk);
+    blk_io_plug();

     do {
         if (suppress_notifications) {
@@ -XXX,XX +XXX,XX @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
         virtio_blk_submit_multireq(s, &mrb);
     }

-    blk_io_unplug(s->blk);
+    blk_io_unplug();
     aio_context_release(blk_get_aio_context(s->blk));
 }

diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
         return -ENOBUFS;
     }
     scsi_req_ref(req->sreq);
-    blk_io_plug(d->conf.blk);
+    blk_io_plug();
     object_unref(OBJECT(d));
     return 0;
 }
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_cmd_req_submit(VirtIOSCSI *s, VirtIOSCSIReq *req)
     if (scsi_req_enqueue(sreq)) {
         scsi_req_continue(sreq);
     }
-    blk_io_unplug(sreq->dev->conf.blk);
+    blk_io_unplug();
     scsi_req_unref(sreq);
 }

@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
     while (!QTAILQ_EMPTY(&reqs)) {
         req = QTAILQ_FIRST(&reqs);
         QTAILQ_REMOVE(&reqs, req, next);
-        blk_io_unplug(req->sreq->dev->conf.blk);
+        blk_io_unplug();
         scsi_req_unref(req->sreq);
         virtqueue_detach_element(req->vq, &req->elem, 0);
         virtio_scsi_free_req(req);
diff --git a/block/meson.build b/block/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -XXX,XX +XXX,XX @@ block_ss.add(files(
   'mirror.c',
   'nbd.c',
   'null.c',
+  'plug.c',
   'qapi.c',
   'qcow2-bitmap.c',
   'qcow2-cache.c',
-- 
2.40.1
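The contract described above (call immediately when unplugged, deduplicate
deferred fn/opaque pairs, run everything at the outermost unplug) can be
modelled outside QEMU in a few lines of standalone C. The toy_* names below
are invented for illustration and a fixed-size thread-local table stands in
for QEMU's GArray; this is a sketch of the semantics, not the implementation:

#include <assert.h>
#include <stdio.h>

typedef struct {
    void (*fn)(void *);
    void *opaque;
} DeferredCall;

static _Thread_local unsigned plug_count;
static _Thread_local DeferredCall deferred[16];
static _Thread_local unsigned num_deferred;

static void toy_plug(void)
{
    plug_count++;
}

static void toy_plug_call(void (*fn)(void *), void *opaque)
{
    if (plug_count == 0) {
        fn(opaque); /* not plugged: invoke immediately */
        return;
    }
    for (unsigned i = 0; i < num_deferred; i++) {
        if (deferred[i].fn == fn && deferred[i].opaque == opaque) {
            return; /* same fn/opaque pair already deferred: batch */
        }
    }
    assert(num_deferred < 16);
    deferred[num_deferred].fn = fn;
    deferred[num_deferred].opaque = opaque;
    num_deferred++;
}

static void toy_unplug(void)
{
    assert(plug_count > 0);
    if (--plug_count > 0) {
        return; /* only the outermost unplug runs the deferred calls */
    }
    for (unsigned i = 0; i < num_deferred; i++) {
        deferred[i].fn(deferred[i].opaque);
    }
    num_deferred = 0;
}

static void submit_queue(void *opaque)
{
    printf("submitting queue %s\n", (char *)opaque);
}

int main(void)
{
    static char q0[] = "q0", q1[] = "q1";

    toy_plug();
    toy_plug_call(submit_queue, q0);
    toy_plug_call(submit_queue, q0); /* deduplicated */
    toy_plug_call(submit_queue, q1);
    toy_unplug(); /* prints q0 once, then q1 */
    return 0;
}

Built with any C11 compiler, this prints "submitting queue q0" once followed
by "submitting queue q1", showing how repeated calls with the same fn/opaque
pair collapse into a single submission at unplug time.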
Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 20230530180959.1108766-3-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c       | 44 ++++++++++++--------------------------------
 block/trace-events |  1 -
 2 files changed, 12 insertions(+), 33 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/vfio-helpers.h"
 #include "block/block-io.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "sysemu/replay.h"
 #include "trace.h"

@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
     int blkshift;

     uint64_t max_transfer;
-    bool plugged;

     bool supports_write_zeroes;
     bool supports_discard;
@@ -XXX,XX +XXX,XX @@ static void nvme_kick(NVMeQueuePair *q)
 {
     BDRVNVMeState *s = q->s;

-    if (s->plugged || !q->need_kick) {
+    if (!q->need_kick) {
         return;
     }
     trace_nvme_kick(s, q->index);
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
     NvmeCqe *c;

     trace_nvme_process_completion(s, q->index, q->inflight);
-    if (s->plugged) {
-        trace_nvme_process_completion_queue_plugged(s, q->index);
-        return false;
-    }

     /*
      * Support re-entrancy when a request cb() function invokes aio_poll().
@@ -XXX,XX +XXX,XX @@ static void nvme_trace_command(const NvmeCmd *cmd)
     }
 }

+static void nvme_unplug_fn(void *opaque)
+{
+    NVMeQueuePair *q = opaque;
+
+    QEMU_LOCK_GUARD(&q->lock);
+    nvme_kick(q);
+    nvme_process_completion(q);
+}
+
 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
                                 NvmeCmd *cmd, BlockCompletionFunc cb,
                                 void *opaque)
@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
     q->need_kick++;
-    nvme_kick(q);
-    nvme_process_completion(q);
+    blk_io_plug_call(nvme_unplug_fn, q);
     qemu_mutex_unlock(&q->lock);
 }

@@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs,
     }
 }

-static void coroutine_fn nvme_co_io_plug(BlockDriverState *bs)
-{
-    BDRVNVMeState *s = bs->opaque;
-    assert(!s->plugged);
-    s->plugged = true;
-}
-
-static void coroutine_fn nvme_co_io_unplug(BlockDriverState *bs)
-{
-    BDRVNVMeState *s = bs->opaque;
-    assert(s->plugged);
-    s->plugged = false;
-    for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
-        NVMeQueuePair *q = s->queues[i];
-        qemu_mutex_lock(&q->lock);
-        nvme_kick(q);
-        nvme_process_completion(q);
-        qemu_mutex_unlock(&q->lock);
-    }
-}
-
 static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
 {
     int i;
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_nvme = {
     .bdrv_detach_aio_context  = nvme_detach_aio_context,
     .bdrv_attach_aio_context  = nvme_attach_aio_context,

-    .bdrv_co_io_plug          = nvme_co_io_plug,
-    .bdrv_co_io_unplug        = nvme_co_io_unplug,
-
     .bdrv_register_buf        = nvme_register_buf,
     .bdrv_unregister_buf      = nvme_unregister_buf,
 };
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_kick(void *s, unsigned q_index) "s %p q #%u"
 nvme_dma_flush_queue_wait(void *s) "s %p"
 nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
 nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d"
-nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u"
 nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
 nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
-- 
2.40.1
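The effect of the conversion above is that several nvme_submit_command()
calls inside one plugged section lead to a single deferred nvme_unplug_fn()
invocation, i.e. one doorbell kick per batch instead of one per request. A
minimal standalone model of that batching effect, with invented toy_* names
(not the QEMU implementation):

#include <stdio.h>

/*
 * Toy model: submissions only increment need_kick; a single deferred
 * "kick" writes the doorbell once for the whole batch.
 */
typedef struct {
    int need_kick;       /* requests enqueued since the last kick */
    int doorbell_writes; /* number of (modelled) MMIO doorbell writes */
} ToyQueue;

static void toy_kick(ToyQueue *q)
{
    if (!q->need_kick) {
        return;
    }
    q->doorbell_writes++; /* one write covers all pending requests */
    q->need_kick = 0;
}

static void toy_submit(ToyQueue *q)
{
    q->need_kick++;
    /* the real code defers the kick: blk_io_plug_call(nvme_unplug_fn, q) */
}

int main(void)
{
    ToyQueue q = { 0, 0 };

    toy_submit(&q); /* plugged section: three submissions... */
    toy_submit(&q);
    toy_submit(&q);
    toy_kick(&q);   /* ...one kick, standing in for nvme_unplug_fn() */

    printf("3 submissions -> %d doorbell write(s)\n", q.doorbell_writes);
    return 0;
}

Here toy_kick() stands in for the deferred unplug function: need_kick
accumulates across submissions and a single doorbell write flushes the batch.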
1
There are three issues with the current NVMeRequest->busy field:
1
Stop using the .bdrv_co_io_plug() API because it is not multi-queue
2
1. The busy field is accidentally accessed outside q->lock when request
2
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
3
submission fails.
3
submission instead.
4
2. Waiters on free_req_queue are not woken when a request is returned
5
early due to submission failure.
6
2. Finding a free request involves scanning all requests. This makes
7
request submission O(n^2).
8
9
Switch to an O(1) freelist that is always accessed under the lock.
10
11
Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and
12
NVME_NUM_REQS, the number of usable requests. This makes the code
13
simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind
14
that one slot is reserved.
15
4
16
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
5
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
17
Reviewed-by: Sergio Lopez <slp@redhat.com>
6
Reviewed-by: Eric Blake <eblake@redhat.com>
18
Message-id: 20200617132201.1832152-5-stefanha@redhat.com
7
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
8
Acked-by: Kevin Wolf <kwolf@redhat.com>
9
Message-id: 20230530180959.1108766-4-stefanha@redhat.com
19
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
20
---
11
---
21
block/nvme.c | 81 ++++++++++++++++++++++++++++++++++------------------
12
block/blkio.c | 43 ++++++++++++++++++++++++-------------------
22
1 file changed, 54 insertions(+), 27 deletions(-)
13
1 file changed, 24 insertions(+), 19 deletions(-)
23
14
24
diff --git a/block/nvme.c b/block/nvme.c
15
diff --git a/block/blkio.c b/block/blkio.c
25
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
26
--- a/block/nvme.c
17
--- a/block/blkio.c
27
+++ b/block/nvme.c
18
+++ b/block/blkio.c
28
@@ -XXX,XX +XXX,XX @@
19
@@ -XXX,XX +XXX,XX @@
29
#define NVME_QUEUE_SIZE 128
20
#include "qemu/error-report.h"
30
#define NVME_BAR_SIZE 8192
21
#include "qapi/qmp/qdict.h"
31
22
#include "qemu/module.h"
23
+#include "sysemu/block-backend.h"
24
#include "exec/memory.h" /* for ram_block_discard_disable() */
25
26
#include "block/block-io.h"
27
@@ -XXX,XX +XXX,XX @@ static void blkio_detach_aio_context(BlockDriverState *bs)
28
NULL, NULL, NULL);
29
}
30
31
-/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
32
-static void blkio_submit_io(BlockDriverState *bs)
32
+/*
33
+/*
33
+ * We have to leave one slot empty as that is the full queue case where
34
+ * Called by blk_io_unplug() or immediately if not plugged. Called without
34
+ * head == tail + 1.
35
+ * blkio_lock.
35
+ */
36
+ */
36
+#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
37
+static void blkio_unplug_fn(void *opaque)
38
{
39
- if (qatomic_read(&bs->io_plugged) == 0) {
40
- BDRVBlkioState *s = bs->opaque;
41
+ BDRVBlkioState *s = opaque;
42
43
+ WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
44
blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
45
}
46
}
47
48
+/*
49
+ * Schedule I/O submission after enqueuing a new request. Called without
50
+ * blkio_lock.
51
+ */
52
+static void blkio_submit_io(BlockDriverState *bs)
53
+{
54
+ BDRVBlkioState *s = bs->opaque;
37
+
55
+
38
typedef struct {
56
+ blk_io_plug_call(blkio_unplug_fn, s);
39
int32_t head, tail;
40
uint8_t *queue;
41
@@ -XXX,XX +XXX,XX @@ typedef struct {
42
int cid;
43
void *prp_list_page;
44
uint64_t prp_list_iova;
45
- bool busy;
46
+ int free_req_next; /* q->reqs[] index of next free req */
47
} NVMeRequest;
48
49
typedef struct {
50
@@ -XXX,XX +XXX,XX @@ typedef struct {
51
/* Fields protected by @lock */
52
NVMeQueue sq, cq;
53
int cq_phase;
54
- NVMeRequest reqs[NVME_QUEUE_SIZE];
55
+ int free_req_head;
56
+ NVMeRequest reqs[NVME_NUM_REQS];
57
bool busy;
58
int need_kick;
59
int inflight;
60
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
61
qemu_mutex_init(&q->lock);
62
q->index = idx;
63
qemu_co_queue_init(&q->free_req_queue);
64
- q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
65
+ q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
66
r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
67
- s->page_size * NVME_QUEUE_SIZE,
68
+ s->page_size * NVME_NUM_REQS,
69
false, &prp_list_iova);
70
if (r) {
71
goto fail;
72
}
73
- for (i = 0; i < NVME_QUEUE_SIZE; i++) {
74
+ q->free_req_head = -1;
75
+ for (i = 0; i < NVME_NUM_REQS; i++) {
76
NVMeRequest *req = &q->reqs[i];
77
req->cid = i + 1;
78
+ req->free_req_next = q->free_req_head;
79
+ q->free_req_head = i;
80
req->prp_list_page = q->prp_list_pages + i * s->page_size;
81
req->prp_list_iova = prp_list_iova + i * s->page_size;
82
}
83
+
84
nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
85
if (local_err) {
86
error_propagate(errp, local_err);
87
@@ -XXX,XX +XXX,XX @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
88
*/
89
static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
90
{
91
- int i;
92
- NVMeRequest *req = NULL;
93
+ NVMeRequest *req;
94
95
qemu_mutex_lock(&q->lock);
96
- while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
97
- /* We have to leave one slot empty as that is the full queue case (head
98
- * == tail + 1). */
99
+
100
+ while (q->free_req_head == -1) {
101
if (qemu_in_coroutine()) {
102
trace_nvme_free_req_queue_wait(q);
103
qemu_co_queue_wait(&q->free_req_queue, &q->lock);
104
@@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
105
return NULL;
106
}
107
}
108
- for (i = 0; i < NVME_QUEUE_SIZE; i++) {
109
- if (!q->reqs[i].busy) {
110
- q->reqs[i].busy = true;
111
- req = &q->reqs[i];
112
- break;
113
- }
114
- }
115
- /* We have checked inflight and need_kick while holding q->lock, so one
116
- * free req must be available. */
117
- assert(req);
118
+
119
+ req = &q->reqs[q->free_req_head];
120
+ q->free_req_head = req->free_req_next;
121
+ req->free_req_next = -1;
122
+
123
qemu_mutex_unlock(&q->lock);
124
return req;
125
}
126
127
+/* With q->lock */
128
+static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
129
+{
130
+ req->free_req_next = q->free_req_head;
131
+ q->free_req_head = req - q->reqs;
132
+}
57
+}
133
+
58
+
134
+/* With q->lock */
59
static int coroutine_fn
135
+static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
60
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
136
+{
137
+ if (!qemu_co_queue_empty(&q->free_req_queue)) {
138
+ replay_bh_schedule_oneshot_event(s->aio_context,
139
+ nvme_free_req_queue_cb, q);
140
+ }
141
+}
142
+
143
+/* Insert a request in the freelist and wake waiters */
144
+static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q,
145
+ NVMeRequest *req)
146
+{
147
+ qemu_mutex_lock(&q->lock);
148
+ nvme_put_free_req_locked(q, req);
149
+ nvme_wake_free_req_locked(s, q);
150
+ qemu_mutex_unlock(&q->lock);
151
+}
152
+
153
static inline int nvme_translate_error(const NvmeCqe *c)
154
{
61
{
155
uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
62
@@ -XXX,XX +XXX,XX @@ blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
156
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
63
157
req = *preq;
64
WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
158
assert(req.cid == cid);
65
blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
159
assert(req.cb);
66
- blkio_submit_io(bs);
160
- preq->busy = false;
161
+ nvme_put_free_req_locked(q, preq);
162
preq->cb = preq->opaque = NULL;
163
qemu_mutex_unlock(&q->lock);
164
req.cb(req.opaque, ret);
165
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
166
/* Notify the device so it can post more completions. */
167
smp_mb_release();
168
*q->cq.doorbell = cpu_to_le32(q->cq.head);
169
- if (!qemu_co_queue_empty(&q->free_req_queue)) {
170
- replay_bh_schedule_oneshot_event(s->aio_context,
171
- nvme_free_req_queue_cb, q);
172
- }
173
+ nvme_wake_free_req_locked(s, q);
174
}
67
}
175
q->busy = false;
68
176
return progress;
69
+ blkio_submit_io(bs);
177
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
70
qemu_coroutine_yield();
178
r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
71
return cod.ret;
179
qemu_co_mutex_unlock(&s->dma_map_lock);
72
}
180
if (r) {
73
@@ -XXX,XX +XXX,XX @@ blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
181
- req->busy = false;
74
182
+ nvme_put_free_req_and_wake(s, ioq, req);
75
WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
183
return r;
76
blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
77
- blkio_submit_io(bs);
184
}
78
}
185
nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
79
186
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
80
+ blkio_submit_io(bs);
187
qemu_co_mutex_unlock(&s->dma_map_lock);
81
qemu_coroutine_yield();
188
82
189
if (ret) {
83
if (use_bounce_buffer) {
190
- req->busy = false;
84
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
191
+ nvme_put_free_req_and_wake(s, ioq, req);
85
192
goto out;
86
WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
87
blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
88
- blkio_submit_io(bs);
193
}
89
}
194
90
91
+ blkio_submit_io(bs);
92
qemu_coroutine_yield();
93
94
if (use_bounce_buffer) {
95
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
96
97
WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
98
blkioq_flush(s->blkioq, &cod, 0);
99
- blkio_submit_io(bs);
100
}
101
102
+ blkio_submit_io(bs);
103
qemu_coroutine_yield();
104
return cod.ret;
105
}
106
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
107
108
WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
109
blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
110
- blkio_submit_io(bs);
111
}
112
113
+ blkio_submit_io(bs);
114
qemu_coroutine_yield();
115
return cod.ret;
116
}
117
118
-static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
119
-{
120
- BDRVBlkioState *s = bs->opaque;
121
-
122
- WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
123
- blkio_submit_io(bs);
124
- }
125
-}
126
-
127
typedef enum {
128
BMRR_OK,
129
BMRR_SKIP,
130
@@ -XXX,XX +XXX,XX @@ static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
131
.bdrv_co_pwritev = blkio_co_pwritev, \
132
.bdrv_co_flush_to_disk = blkio_co_flush, \
133
.bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \
134
- .bdrv_co_io_unplug = blkio_co_io_unplug, \
135
.bdrv_refresh_limits = blkio_refresh_limits, \
136
.bdrv_register_buf = blkio_register_buf, \
137
.bdrv_unregister_buf = blkio_unregister_buf, \
195
--
138
--
196
2.26.2
139
2.40.1
197
diff view generated by jsdifflib
1
Passing around both BDRVNVMeState and NVMeQueuePair is unwieldy. Reduce
1
Stop using the .bdrv_co_io_plug() API because it is not multi-queue
2
the number of function arguments by keeping the BDRVNVMeState pointer in
2
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
3
NVMeQueuePair. This will come in handly when a BH is introduced in a
3
submission instead.
4
later patch and only one argument can be passed to it.
5
4
6
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
5
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
7
Reviewed-by: Sergio Lopez <slp@redhat.com>
6
Reviewed-by: Eric Blake <eblake@redhat.com>
8
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
7
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
9
Message-id: 20200617132201.1832152-7-stefanha@redhat.com
8
Acked-by: Kevin Wolf <kwolf@redhat.com>
9
Message-id: 20230530180959.1108766-5-stefanha@redhat.com
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
---
11
---
12
block/nvme.c | 70 ++++++++++++++++++++++++++++------------------------
12
include/block/raw-aio.h | 7 -------
13
1 file changed, 38 insertions(+), 32 deletions(-)
13
block/file-posix.c | 10 ----------
14
block/io_uring.c | 44 ++++++++++++++++-------------------------
15
block/trace-events | 5 ++---
16
4 files changed, 19 insertions(+), 47 deletions(-)
14
17
15
diff --git a/block/nvme.c b/block/nvme.c
18
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
16
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
17
--- a/block/nvme.c
20
--- a/include/block/raw-aio.h
18
+++ b/block/nvme.c
21
+++ b/include/block/raw-aio.h
22
@@ -XXX,XX +XXX,XX @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
23
QEMUIOVector *qiov, int type);
24
void luring_detach_aio_context(LuringState *s, AioContext *old_context);
25
void luring_attach_aio_context(LuringState *s, AioContext *new_context);
26
-
27
-/*
28
- * luring_io_plug/unplug work in the thread's current AioContext, therefore the
29
- * caller must ensure that they are paired in the same IOThread.
30
- */
31
-void luring_io_plug(void);
32
-void luring_io_unplug(void);
33
#endif
34
35
#ifdef _WIN32
36
diff --git a/block/file-posix.c b/block/file-posix.c
37
index XXXXXXX..XXXXXXX 100644
38
--- a/block/file-posix.c
39
+++ b/block/file-posix.c
40
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
41
laio_io_plug();
42
}
43
#endif
44
-#ifdef CONFIG_LINUX_IO_URING
45
- if (s->use_linux_io_uring) {
46
- luring_io_plug();
47
- }
48
-#endif
49
}
50
51
static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
52
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
53
laio_io_unplug(s->aio_max_batch);
54
}
55
#endif
56
-#ifdef CONFIG_LINUX_IO_URING
57
- if (s->use_linux_io_uring) {
58
- luring_io_unplug();
59
- }
60
-#endif
61
}
62
63
static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
64
diff --git a/block/io_uring.c b/block/io_uring.c
65
index XXXXXXX..XXXXXXX 100644
66
--- a/block/io_uring.c
67
+++ b/block/io_uring.c
19
@@ -XXX,XX +XXX,XX @@
68
@@ -XXX,XX +XXX,XX @@
20
*/
69
#include "block/raw-aio.h"
21
#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
70
#include "qemu/coroutine.h"
22
71
#include "qapi/error.h"
23
+typedef struct BDRVNVMeState BDRVNVMeState;
72
+#include "sysemu/block-backend.h"
24
+
73
#include "trace.h"
25
typedef struct {
74
26
int32_t head, tail;
75
/* Only used for assertions. */
27
uint8_t *queue;
76
@@ -XXX,XX +XXX,XX @@ typedef struct LuringAIOCB {
28
@@ -XXX,XX +XXX,XX @@ typedef struct {
77
} LuringAIOCB;
29
typedef struct {
78
30
QemuMutex lock;
79
typedef struct LuringQueue {
31
80
- int plugged;
32
+ /* Read from I/O code path, initialized under BQL */
81
unsigned int in_queue;
33
+ BDRVNVMeState *s;
82
unsigned int in_flight;
34
+ int index;
83
bool blocked;
35
+
84
@@ -XXX,XX +XXX,XX @@ static void luring_process_completions_and_submit(LuringState *s)
36
/* Fields protected by BQL */
85
{
37
- int index;
86
luring_process_completions(s);
38
uint8_t *prp_list_pages;
87
39
88
- if (!s->io_q.plugged && s->io_q.in_queue > 0) {
40
/* Fields protected by @lock */
89
+ if (s->io_q.in_queue > 0) {
41
@@ -XXX,XX +XXX,XX @@ typedef volatile struct {
90
ioq_submit(s);
42
43
QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);
44
45
-typedef struct {
46
+struct BDRVNVMeState {
47
AioContext *aio_context;
48
QEMUVFIOState *vfio;
49
NVMeRegs *regs;
50
@@ -XXX,XX +XXX,XX @@ typedef struct {
51
52
/* PCI address (required for nvme_refresh_filename()) */
53
char *device;
54
-} BDRVNVMeState;
55
+};
56
57
#define NVME_BLOCK_OPT_DEVICE "device"
58
#define NVME_BLOCK_OPT_NAMESPACE "namespace"
59
@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
60
}
91
}
61
}
92
}
62
93
@@ -XXX,XX +XXX,XX @@ static void qemu_luring_poll_ready(void *opaque)
63
-static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q)
94
static void ioq_init(LuringQueue *io_q)
64
+static void nvme_free_queue_pair(NVMeQueuePair *q)
65
{
95
{
66
qemu_vfree(q->prp_list_pages);
96
QSIMPLEQ_INIT(&io_q->submit_queue);
67
qemu_vfree(q->sq.queue);
97
- io_q->plugged = 0;
68
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
98
io_q->in_queue = 0;
69
uint64_t prp_list_iova;
99
io_q->in_flight = 0;
70
100
io_q->blocked = false;
71
qemu_mutex_init(&q->lock);
72
+ q->s = s;
73
q->index = idx;
74
qemu_co_queue_init(&q->free_req_queue);
75
q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
76
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
77
78
return q;
79
fail:
80
- nvme_free_queue_pair(bs, q);
81
+ nvme_free_queue_pair(q);
82
return NULL;
83
}
101
}
84
102
85
/* With q->lock */
103
-void luring_io_plug(void)
86
-static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
104
+static void luring_unplug_fn(void *opaque)
87
+static void nvme_kick(NVMeQueuePair *q)
88
{
105
{
89
+ BDRVNVMeState *s = q->s;
106
- AioContext *ctx = qemu_get_current_aio_context();
90
+
107
- LuringState *s = aio_get_linux_io_uring(ctx);
91
if (s->plugged || !q->need_kick) {
108
- trace_luring_io_plug(s);
92
return;
109
- s->io_q.plugged++;
93
}
110
-}
94
@@ -XXX,XX +XXX,XX @@ static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
111
-
95
}
112
-void luring_io_unplug(void)
96
113
-{
97
/* With q->lock */
114
- AioContext *ctx = qemu_get_current_aio_context();
98
-static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
115
- LuringState *s = aio_get_linux_io_uring(ctx);
99
+static void nvme_wake_free_req_locked(NVMeQueuePair *q)
116
- assert(s->io_q.plugged);
100
{
117
- trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged,
101
if (!qemu_co_queue_empty(&q->free_req_queue)) {
118
- s->io_q.in_queue, s->io_q.in_flight);
102
- replay_bh_schedule_oneshot_event(s->aio_context,
119
- if (--s->io_q.plugged == 0 &&
103
+ replay_bh_schedule_oneshot_event(q->s->aio_context,
120
- !s->io_q.blocked && s->io_q.in_queue > 0) {
104
nvme_free_req_queue_cb, q);
121
+ LuringState *s = opaque;
122
+ trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
123
+ s->io_q.in_flight);
124
+ if (!s->io_q.blocked && s->io_q.in_queue > 0) {
125
ioq_submit(s);
105
}
126
}
106
}
127
}
107
128
@@ -XXX,XX +XXX,XX @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
108
/* Insert a request in the freelist and wake waiters */
129
109
-static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q,
130
QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
110
- NVMeRequest *req)
131
s->io_q.in_queue++;
111
+static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
132
- trace_luring_do_submit(s, s->io_q.blocked, s->io_q.plugged,
112
{
133
- s->io_q.in_queue, s->io_q.in_flight);
113
qemu_mutex_lock(&q->lock);
134
- if (!s->io_q.blocked &&
114
nvme_put_free_req_locked(q, req);
135
- (!s->io_q.plugged ||
115
- nvme_wake_free_req_locked(s, q);
136
- s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES)) {
116
+ nvme_wake_free_req_locked(q);
137
- ret = ioq_submit(s);
117
qemu_mutex_unlock(&q->lock);
138
- trace_luring_do_submit_done(s, ret);
139
- return ret;
140
+ trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue,
141
+ s->io_q.in_flight);
142
+ if (!s->io_q.blocked) {
143
+ if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
144
+ ret = ioq_submit(s);
145
+ trace_luring_do_submit_done(s, ret);
146
+ return ret;
147
+ }
148
+
149
+ blk_io_plug_call(luring_unplug_fn, s);
150
}
151
return 0;
118
}
152
}
119
153
diff --git a/block/trace-events b/block/trace-events
120
@@ -XXX,XX +XXX,XX @@ static inline int nvme_translate_error(const NvmeCqe *c)
154
index XXXXXXX..XXXXXXX 100644
121
}
155
--- a/block/trace-events
122
156
+++ b/block/trace-events
123
/* With q->lock */
157
@@ -XXX,XX +XXX,XX @@ file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "
124
-static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
158
# io_uring.c
125
+static bool nvme_process_completion(NVMeQueuePair *q)
159
luring_init_state(void *s, size_t size) "s %p size %zu"
126
{
160
luring_cleanup_state(void *s) "%p freed"
127
+ BDRVNVMeState *s = q->s;
161
-luring_io_plug(void *s) "LuringState %p plug"
128
bool progress = false;
162
-luring_io_unplug(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d"
129
NVMeRequest *preq;
163
-luring_do_submit(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d"
130
NVMeRequest req;
164
+luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
131
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
165
+luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
132
/* Notify the device so it can post more completions. */
166
luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d"
133
smp_mb_release();
167
luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d"
134
*q->cq.doorbell = cpu_to_le32(q->cq.head);
168
luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
135
- nvme_wake_free_req_locked(s, q);
136
+ nvme_wake_free_req_locked(q);
137
}
138
q->busy = false;
139
return progress;
140
@@ -XXX,XX +XXX,XX @@ static void nvme_trace_command(const NvmeCmd *cmd)
141
}
142
}
143
144
-static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
145
- NVMeRequest *req,
146
+static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
147
NvmeCmd *cmd, BlockCompletionFunc cb,
148
void *opaque)
149
{
150
@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
151
req->opaque = opaque;
152
cmd->cid = cpu_to_le32(req->cid);
153
154
- trace_nvme_submit_command(s, q->index, req->cid);
155
+ trace_nvme_submit_command(q->s, q->index, req->cid);
156
nvme_trace_command(cmd);
157
qemu_mutex_lock(&q->lock);
158
memcpy((uint8_t *)q->sq.queue +
159
q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
160
q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
161
q->need_kick++;
162
- nvme_kick(s, q);
163
- nvme_process_completion(s, q);
164
+ nvme_kick(q);
165
+ nvme_process_completion(q);
166
qemu_mutex_unlock(&q->lock);
167
}
168
169
@@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
                          NvmeCmd *cmd)
 {
     NVMeRequest *req;
-    BDRVNVMeState *s = bs->opaque;
     int ret = -EINPROGRESS;
     req = nvme_get_free_req(q);
     if (!req) {
         return -EBUSY;
     }
-    nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret);
+    nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);

     BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
     return ret;
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s)
         }

         qemu_mutex_lock(&q->lock);
-        while (nvme_process_completion(s, q)) {
+        while (nvme_process_completion(q)) {
             /* Keep polling */
             progress = true;
         }
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     };
     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
         error_setg(errp, "Failed to create io queue [%d]", n);
-        nvme_free_queue_pair(bs, q);
+        nvme_free_queue_pair(q);
         return false;
     }
     cmd = (NvmeCmd) {
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     };
     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
         error_setg(errp, "Failed to create io queue [%d]", n);
-        nvme_free_queue_pair(bs, q);
+        nvme_free_queue_pair(q);
         return false;
     }
     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
@@ -XXX,XX +XXX,XX @@ static void nvme_close(BlockDriverState *bs)
     BDRVNVMeState *s = bs->opaque;

     for (i = 0; i < s->nr_queues; ++i) {
-        nvme_free_queue_pair(bs, s->queues[i]);
+        nvme_free_queue_pair(s->queues[i]);
     }
     g_free(s->queues);
     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
     qemu_co_mutex_unlock(&s->dma_map_lock);
     if (r) {
-        nvme_put_free_req_and_wake(s, ioq, req);
+        nvme_put_free_req_and_wake(ioq, req);
         return r;
     }
-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

     data.co = qemu_coroutine_self();
     while (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
     assert(s->nr_queues > 1);
     req = nvme_get_free_req(ioq);
     assert(req);
-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

     data.co = qemu_coroutine_self();
     if (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
     req = nvme_get_free_req(ioq);
     assert(req);

-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

     data.co = qemu_coroutine_self();
     while (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
     qemu_co_mutex_unlock(&s->dma_map_lock);

     if (ret) {
-        nvme_put_free_req_and_wake(s, ioq, req);
+        nvme_put_free_req_and_wake(ioq, req);
         goto out;
     }

     trace_nvme_dsm(s, offset, bytes);

-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

     data.co = qemu_coroutine_self();
     while (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static void nvme_aio_unplug(BlockDriverState *bs)
     for (i = 1; i < s->nr_queues; i++) {
         NVMeQueuePair *q = s->queues[i];
         qemu_mutex_lock(&q->lock);
-        nvme_kick(s, q);
-        nvme_process_completion(s, q);
+        nvme_kick(q);
+        nvme_process_completion(q);
         qemu_mutex_unlock(&q->lock);
     }
 }
--
2.26.2
--
2.40.1
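
These hunks drop the BDRVNVMeState parameter from the queue-pair helpers. That only works because an earlier patch in the series (not visible in this excerpt) presumably makes the queue pair carry a pointer back to the driver state. A minimal standalone sketch of that back-pointer pattern, with invented names rather than the real QEMU structs:

    /* Sketch only: invented names, not the real QEMU structs. */
    #include <stdio.h>

    typedef struct BDRVNVMeState BDRVNVMeState;

    typedef struct {
        BDRVNVMeState *s;   /* back-pointer, set when the pair is created */
        int index;
    } NVMeQueuePair;

    struct BDRVNVMeState {
        NVMeQueuePair *queues[2];
    };

    /* before: nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q) */
    static void nvme_kick(NVMeQueuePair *q)
    {
        printf("kick queue %d of state %p\n", q->index, (void *)q->s);
    }

    int main(void)
    {
        BDRVNVMeState s = {0};
        NVMeQueuePair q = { .s = &s, .index = 1 };

        s.queues[1] = &q;
        nvme_kick(&q);      /* no separate state argument needed */
        return 0;
    }

The payoff is visible in the hunks above: helpers such as nvme_kick() and nvme_process_completion() take a single argument, which also makes them usable from contexts where only the queue pair is at hand.
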
Existing users access free_req_queue under q->lock. Document this.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200617132201.1832152-6-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
 } NVMeRequest;

 typedef struct {
-    CoQueue free_req_queue;
     QemuMutex lock;

     /* Fields protected by BQL */
@@ -XXX,XX +XXX,XX @@ typedef struct {
     uint8_t *prp_list_pages;

     /* Fields protected by @lock */
+    CoQueue free_req_queue;
     NVMeQueue sq, cq;
     int cq_phase;
     int free_req_head;
--
2.26.2

Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Note that a dev_max_batch check is dropped in laio_io_unplug() because
the semantics of unplug_fn() are different from .bdrv_co_io_unplug():
1. unplug_fn() is only called when the last blk_io_unplug() call occurs,
   not every time blk_io_unplug() is called.
2. unplug_fn() is per-thread, not per-BlockDriverState, so there is no
   way to get per-BlockDriverState fields like dev_max_batch.

Therefore this condition cannot be moved to laio_unplug_fn(). It is not
obvious that this condition affects performance in practice, so I am
removing it instead of trying to come up with a more complex mechanism
to preserve the condition.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Message-id: 20230530180959.1108766-6-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/block/raw-aio.h |  7 -------
 block/file-posix.c      | 28 ----------------------------
 block/linux-aio.c       | 41 +++++++++++------------------------------
 3 files changed, 11 insertions(+), 65 deletions(-)

diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -XXX,XX +XXX,XX @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,

 void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
-
-/*
- * laio_io_plug/unplug work in the thread's current AioContext, therefore the
- * caller must ensure that they are paired in the same IOThread.
- */
-void laio_io_plug(void);
-void laio_io_unplug(uint64_t dev_max_batch);
 #endif
 /* io_uring.c - Linux io_uring implementation */
 #ifdef CONFIG_LINUX_IO_URING
diff --git a/block/file-posix.c b/block/file-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
 }

-static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
-{
-    BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
-    if (s->use_linux_aio) {
-        laio_io_plug();
-    }
-#endif
-}
-
-static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
-{
-    BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
-    if (s->use_linux_aio) {
-        laio_io_unplug(s->aio_max_batch);
-    }
-#endif
-}
-
 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_file = {
     .bdrv_co_copy_range_from = raw_co_copy_range_from,
     .bdrv_co_copy_range_to = raw_co_copy_range_to,
     .bdrv_refresh_limits = raw_refresh_limits,
-    .bdrv_co_io_plug = raw_co_io_plug,
-    .bdrv_co_io_unplug = raw_co_io_unplug,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,

     .bdrv_co_truncate = raw_co_truncate,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = {
     .bdrv_co_copy_range_from = raw_co_copy_range_from,
     .bdrv_co_copy_range_to = raw_co_copy_range_to,
     .bdrv_refresh_limits = raw_refresh_limits,
-    .bdrv_co_io_plug = raw_co_io_plug,
-    .bdrv_co_io_unplug = raw_co_io_unplug,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,

     .bdrv_co_truncate = raw_co_truncate,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_co_pwritev = raw_co_pwritev,
     .bdrv_co_flush_to_disk = raw_co_flush_to_disk,
     .bdrv_refresh_limits = cdrom_refresh_limits,
-    .bdrv_co_io_plug = raw_co_io_plug,
-    .bdrv_co_io_unplug = raw_co_io_unplug,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,

     .bdrv_co_truncate = raw_co_truncate,
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_co_pwritev = raw_co_pwritev,
     .bdrv_co_flush_to_disk = raw_co_flush_to_disk,
     .bdrv_refresh_limits = cdrom_refresh_limits,
-    .bdrv_co_io_plug = raw_co_io_plug,
-    .bdrv_co_io_unplug = raw_co_io_unplug,
     .bdrv_attach_aio_context = raw_aio_attach_aio_context,

     .bdrv_co_truncate = raw_co_truncate,
diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/event_notifier.h"
 #include "qemu/coroutine.h"
 #include "qapi/error.h"
+#include "sysemu/block-backend.h"

 /* Only used for assertions. */
 #include "qemu/coroutine_int.h"
@@ -XXX,XX +XXX,XX @@ struct qemu_laiocb {
 };

 typedef struct {
-    int plugged;
     unsigned int in_queue;
     unsigned int in_flight;
     bool blocked;
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
 {
     qemu_laio_process_completions(s);

-    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+    if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
         ioq_submit(s);
     }
 }
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_poll_ready(EventNotifier *opaque)
 static void ioq_init(LaioQueue *io_q)
 {
     QSIMPLEQ_INIT(&io_q->pending);
-    io_q->plugged = 0;
     io_q->in_queue = 0;
     io_q->in_flight = 0;
     io_q->blocked = false;
@@ -XXX,XX +XXX,XX @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
     return max_batch;
 }

-void laio_io_plug(void)
+static void laio_unplug_fn(void *opaque)
 {
-    AioContext *ctx = qemu_get_current_aio_context();
-    LinuxAioState *s = aio_get_linux_aio(ctx);
+    LinuxAioState *s = opaque;

-    s->io_q.plugged++;
-}
-
-void laio_io_unplug(uint64_t dev_max_batch)
-{
-    AioContext *ctx = qemu_get_current_aio_context();
-    LinuxAioState *s = aio_get_linux_aio(ctx);
-
-    assert(s->io_q.plugged);
-    s->io_q.plugged--;
-
-    /*
-     * Why max batch checking is performed here:
-     * Another BDS may have queued requests with a higher dev_max_batch and
-     * therefore in_queue could now exceed our dev_max_batch. Re-check the max
-     * batch so we can honor our device's dev_max_batch.
-     */
-    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
-        (!s->io_q.plugged &&
-         !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
+    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
         ioq_submit(s);
     }
 }
@@ -XXX,XX +XXX,XX @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,

     QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
     s->io_q.in_queue++;
-    if (!s->io_q.blocked &&
-        (!s->io_q.plugged ||
-         s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) {
-        ioq_submit(s);
+    if (!s->io_q.blocked) {
+        if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
+            ioq_submit(s);
+        } else {
+            blk_io_plug_call(laio_unplug_fn, s);
+        }
     }

     return 0;
--
2.40.1
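
To see the unplug_fn() semantics described above in isolation, here is a self-contained toy model of thread-local deferred-call batching. The names and data structures are invented; the toy only mirrors the two properties the commit message calls out: the callback runs once, when the outermost unplug happens, and all state is per-thread:

    /* Toy model of thread-local plug/unplug batching; not QEMU code. */
    #include <stdio.h>

    #define MAX_CALLS 4

    typedef void (*UnplugFn)(void *opaque);

    /* one deferred-call list per thread */
    static _Thread_local struct {
        int depth;                 /* nesting level of plug sections */
        int ncalls;
        UnplugFn fns[MAX_CALLS];
        void *opaques[MAX_CALLS];
    } plug_state;

    static void io_plug(void) { plug_state.depth++; }

    /* defer fn(opaque); coalesce duplicates so fn runs once per batch */
    static void io_plug_call(UnplugFn fn, void *opaque)
    {
        if (plug_state.depth == 0) {
            fn(opaque);            /* not plugged: run immediately */
            return;
        }
        for (int i = 0; i < plug_state.ncalls; i++) {
            if (plug_state.fns[i] == fn && plug_state.opaques[i] == opaque) {
                return;            /* already queued for this batch */
            }
        }
        plug_state.fns[plug_state.ncalls] = fn;
        plug_state.opaques[plug_state.ncalls++] = opaque;
    }

    /* only the outermost unplug flushes the deferred callbacks */
    static void io_unplug(void)
    {
        if (--plug_state.depth > 0) {
            return;
        }
        for (int i = 0; i < plug_state.ncalls; i++) {
            plug_state.fns[i](plug_state.opaques[i]);
        }
        plug_state.ncalls = 0;
    }

    static void flush_queue(void *opaque)
    {
        printf("submitting batched requests for %s\n", (const char *)opaque);
    }

    int main(void)
    {
        io_plug();
        io_plug_call(flush_queue, "queue0");   /* queued */
        io_plug_call(flush_queue, "queue0");   /* coalesced */
        io_unplug();                           /* flush_queue() runs once */
        return 0;
    }

Coalescing duplicate registrations is what lets many laio_do_submit() calls within one plugged section result in a single laio_unplug_fn() invocation.
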
nvme_process_completion() explicitly checks cid so the assertion that
follows is always true:

    if (cid == 0 || cid > NVME_QUEUE_SIZE) {
        ...
        continue;
    }
    assert(cid <= NVME_QUEUE_SIZE);

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200617132201.1832152-3-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
                    cid);
             continue;
         }
-        assert(cid <= NVME_QUEUE_SIZE);
         trace_nvme_complete_command(s, q->index, cid);
         preq = &q->reqs[cid - 1];
         req = *preq;
--
2.26.2

No block driver implements .bdrv_co_io_plug() anymore. Get rid of the
function pointers.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 20230530180959.1108766-7-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/block/block-io.h         |  3 ---
 include/block/block_int-common.h | 11 ----------
 block/io.c                       | 37 --------------------------------
 3 files changed, 51 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -XXX,XX +XXX,XX @@ void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx);

 AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c);

-void coroutine_fn GRAPH_RDLOCK bdrv_co_io_plug(BlockDriverState *bs);
-void coroutine_fn GRAPH_RDLOCK bdrv_co_io_unplug(BlockDriverState *bs);
-
 bool coroutine_fn GRAPH_RDLOCK
 bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
                                    uint32_t granularity, Error **errp);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
     void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_debug_event)(
         BlockDriverState *bs, BlkdebugEvent event);

-    /* io queue for linux-aio */
-    void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_plug)(BlockDriverState *bs);
-    void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_unplug)(
-        BlockDriverState *bs);
-
     bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs);

     bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_can_store_new_dirty_bitmap)(
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     unsigned int in_flight;
     unsigned int serialising_in_flight;

-    /*
-     * counter for nested bdrv_io_plug.
-     * Accessed with atomic ops.
-     */
-    unsigned io_plugged;
-
     /* do we need to tell the quest if we have a volatile write cache? */
     int enable_write_cache;

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
     return mem;
 }

-void coroutine_fn bdrv_co_io_plug(BlockDriverState *bs)
-{
-    BdrvChild *child;
-    IO_CODE();
-    assert_bdrv_graph_readable();
-
-    QLIST_FOREACH(child, &bs->children, next) {
-        bdrv_co_io_plug(child->bs);
-    }
-
-    if (qatomic_fetch_inc(&bs->io_plugged) == 0) {
-        BlockDriver *drv = bs->drv;
-        if (drv && drv->bdrv_co_io_plug) {
-            drv->bdrv_co_io_plug(bs);
-        }
-    }
-}
-
-void coroutine_fn bdrv_co_io_unplug(BlockDriverState *bs)
-{
-    BdrvChild *child;
-    IO_CODE();
-    assert_bdrv_graph_readable();
-
-    assert(bs->io_plugged);
-    if (qatomic_fetch_dec(&bs->io_plugged) == 1) {
-        BlockDriver *drv = bs->drv;
-        if (drv && drv->bdrv_co_io_unplug) {
-            drv->bdrv_co_io_unplug(bs);
-        }
-    }
-
-    QLIST_FOREACH(child, &bs->children, next) {
-        bdrv_co_io_unplug(child->bs);
-    }
-}
-
 /* Helper that undoes bdrv_register_buf() when it fails partway through */
 static void GRAPH_RDLOCK
 bdrv_register_buf_rollback(BlockDriverState *bs, void *host, size_t size,
--
2.40.1
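
The io.c hunk above deletes the per-BlockDriverState nesting counter that made bdrv_co_io_plug()/bdrv_co_io_unplug() cheap to nest: only the 0-to-1 and 1-to-0 transitions reached the driver. A single-threaded toy model of that retired pattern (invented names, plain C, atomics omitted):

    /* Toy model of the removed nesting-counter plug pattern. */
    #include <assert.h>
    #include <stdio.h>

    static unsigned io_plugged;

    static void drv_plug(void)   { printf("driver plug\n"); }
    static void drv_unplug(void) { printf("driver unplug\n"); }

    static void io_plug(void)
    {
        if (io_plugged++ == 0) {   /* 0 -> 1 transition only */
            drv_plug();
        }
    }

    static void io_unplug(void)
    {
        assert(io_plugged);
        if (--io_plugged == 0) {   /* 1 -> 0 transition only */
            drv_unplug();
        }
    }

    int main(void)
    {
        io_plug();
        io_plug();      /* nested: no driver call */
        io_unplug();    /* nested: no driver call */
        io_unplug();    /* driver unplug happens here */
        return 0;
    }

The blk_io_plug_call() API introduced earlier in this series keeps the same nesting behavior but moves the state from each BlockDriverState into the calling thread.
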
From: Daniele Buono <dbuono@linux.vnet.ibm.com>

LLVM's SafeStack instrumentation does not yet support programs that make
use of the APIs in ucontext.h.
With the current implementation of coroutine-ucontext, the resulting
binary is incorrect, with different coroutines sharing the same unsafe
stack and producing undefined behavior at runtime.

This fix allocates an additional unsafe stack area for each coroutine,
and sets the new unsafe stack pointer before calling swapcontext() in
qemu_coroutine_new.
This is the only place where the pointer needs to be manually updated,
since sigsetjmp/siglongjmp are already instrumented by LLVM to properly
support SafeStack.
The additional stack is then freed in qemu_coroutine_delete.

Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
Message-id: 20200529205122.714-2-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine_int.h |  5 +++++
 util/coroutine-ucontext.c    | 28 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine_int.h
+++ b/include/qemu/coroutine_int.h
@@ -XXX,XX +XXX,XX @@
 #include "qemu/queue.h"
 #include "qemu/coroutine.h"

+#ifdef CONFIG_SAFESTACK
+/* Pointer to the unsafe stack, defined by the compiler */
+extern __thread void *__safestack_unsafe_stack_ptr;
+#endif
+
 #define COROUTINE_STACK_SIZE (1 << 20)

 typedef enum {
diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c
index XXXXXXX..XXXXXXX 100644
--- a/util/coroutine-ucontext.c
+++ b/util/coroutine-ucontext.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
     Coroutine base;
     void *stack;
     size_t stack_size;
+#ifdef CONFIG_SAFESTACK
+    /* Need an unsafe stack for each coroutine */
+    void *unsafe_stack;
+    size_t unsafe_stack_size;
+#endif
     sigjmp_buf env;

     void *tsan_co_fiber;
@@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void)
     co = g_malloc0(sizeof(*co));
     co->stack_size = COROUTINE_STACK_SIZE;
     co->stack = qemu_alloc_stack(&co->stack_size);
+#ifdef CONFIG_SAFESTACK
+    co->unsafe_stack_size = COROUTINE_STACK_SIZE;
+    co->unsafe_stack = qemu_alloc_stack(&co->unsafe_stack_size);
+#endif
     co->base.entry_arg = &old_env; /* stash away our jmp_buf */

     uc.uc_link = &old_uc;
@@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void)
                           COROUTINE_YIELD,
                           &fake_stack_save,
                           co->stack, co->stack_size, co->tsan_co_fiber);
+
+#ifdef CONFIG_SAFESTACK
+        /*
+         * Before we swap the context, set the new unsafe stack.
+         * The unsafe stack grows just like the normal stack, so start from
+         * the last usable location of the memory area.
+         * NOTE: we don't have to re-set the usp afterwards because we are
+         * coming back to this context through a siglongjmp.
+         * The compiler already wrapped the corresponding sigsetjmp call with
+         * code that saves the usp on the (safe) stack before the call, and
+         * restores it right after (which is where we return with siglongjmp).
+         */
+        void *usp = co->unsafe_stack + co->unsafe_stack_size;
+        __safestack_unsafe_stack_ptr = usp;
+#endif
+
         swapcontext(&old_uc, &uc);
     }

@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_delete(Coroutine *co_)
 #endif

     qemu_free_stack(co->stack, co->stack_size);
+#ifdef CONFIG_SAFESTACK
+    qemu_free_stack(co->unsafe_stack, co->unsafe_stack_size);
+#endif
     g_free(co);
 }

--
2.26.2

From: Stefano Garzarella <sgarzare@redhat.com>

Some virtio-blk drivers (e.g. virtio-blk-vhost-vdpa) support fd
passing. Let's expose this to the user, so the management layer
can pass the file descriptor of an already opened path.

If the libblkio virtio-blk driver supports fd passing, let's always
use qemu_open() to open the `path`, so we can handle fd passing
from the management layer through the "/dev/fdset/N" special path.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-id: 20230530071941.8954-2-sgarzare@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/blkio.c | 53 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 9 deletions(-)

diff --git a/block/blkio.c b/block/blkio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -XXX,XX +XXX,XX @@ static int blkio_virtio_blk_common_open(BlockDriverState *bs,
 {
     const char *path = qdict_get_try_str(options, "path");
     BDRVBlkioState *s = bs->opaque;
-    int ret;
+    bool fd_supported = false;
+    int fd, ret;

     if (!path) {
         error_setg(errp, "missing 'path' option");
         return -EINVAL;
     }

-    ret = blkio_set_str(s->blkio, "path", path);
-    qdict_del(options, "path");
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "failed to set path: %s",
-                         blkio_get_error_msg());
-        return ret;
-    }
-
     if (!(flags & BDRV_O_NOCACHE)) {
         error_setg(errp, "cache.direct=off is not supported");
         return -EINVAL;
     }
+
+    if (blkio_get_int(s->blkio, "fd", &fd) == 0) {
+        fd_supported = true;
+    }
+
+    /*
+     * If the libblkio driver supports fd passing, let's always use qemu_open()
+     * to open the `path`, so we can handle fd passing from the management
+     * layer through the "/dev/fdset/N" special path.
+     */
+    if (fd_supported) {
+        int open_flags;
+
+        if (flags & BDRV_O_RDWR) {
+            open_flags = O_RDWR;
+        } else {
+            open_flags = O_RDONLY;
+        }
+
+        fd = qemu_open(path, open_flags, errp);
+        if (fd < 0) {
+            return -EINVAL;
+        }
+
+        ret = blkio_set_int(s->blkio, "fd", fd);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "failed to set fd: %s",
+                             blkio_get_error_msg());
+            qemu_close(fd);
+            return ret;
+        }
+    } else {
+        ret = blkio_set_str(s->blkio, "path", path);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "failed to set path: %s",
+                             blkio_get_error_msg());
+            return ret;
+        }
+    }
+
+    qdict_del(options, "path");
+
     return 0;
 }

--
2.40.1
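
Some background on why the ucontext patch above allocates one unsafe stack per coroutine: under SafeStack, address-taken buffers and other locals that the compiler cannot prove safe are moved off the register-addressed safe stack onto the unsafe stack. An illustrative example of such a local, unrelated to QEMU; building it with clang -fsanitize=safe-stack places buf on the unsafe stack:

    #include <stdio.h>
    #include <string.h>

    /* fill() receives the address of its caller's buffer, so the
     * compiler cannot prove accesses to buf stay in bounds and must
     * place buf on the unsafe stack */
    static void fill(char *p, size_t n)
    {
        memset(p, 'x', n - 1);
        p[n - 1] = '\0';
    }

    int main(void)
    {
        char buf[64];           /* address-taken: unsafe-stack resident */

        fill(buf, sizeof(buf));
        printf("%s\n", buf);
        return 0;
    }

If two coroutines shared one unsafe stack, locals like buf belonging to different coroutines would overlap, which is exactly the undefined behavior the commit message describes.
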
From: Daniele Buono <dbuono@linux.vnet.ibm.com>

The current implementation of LLVM's SafeStack is not compatible with
code that uses an alternate stack created with sigaltstack().
Since coroutine-sigaltstack relies on sigaltstack(), it is not
compatible with SafeStack. The resulting binary is incorrect, with
different coroutines sharing the same unsafe stack and producing
undefined behavior at runtime.

In the future LLVM may provide a SafeStack implementation compatible with
sigaltstack(). In the meantime, if SafeStack is desired, the coroutine
implementation from coroutine-ucontext should be used.
As a safety check, add a check in coroutine-sigaltstack that raises a
preprocessor #error if SafeStack is enabled and we are trying to
use coroutine-sigaltstack to implement coroutines.

Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
Message-id: 20200529205122.714-3-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 util/coroutine-sigaltstack.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/util/coroutine-sigaltstack.c b/util/coroutine-sigaltstack.c
index XXXXXXX..XXXXXXX 100644
--- a/util/coroutine-sigaltstack.c
+++ b/util/coroutine-sigaltstack.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu-common.h"
 #include "qemu/coroutine_int.h"

+#ifdef CONFIG_SAFESTACK
+#error "SafeStack is not compatible with code run in alternate signal stacks"
+#endif
+
 typedef struct {
     Coroutine base;
     void *stack;
--
2.26.2

From: Daniele Buono <dbuono@linux.vnet.ibm.com>

This patch adds a flag to enable/disable the SafeStack instrumentation
provided by LLVM.

On enable, make sure that the compiler supports the flags, and that we
are using the proper coroutine implementation (coroutine-ucontext).
On disable, explicitly disable the option if it was enabled by default.

While SafeStack is supported only on Linux, NetBSD, FreeBSD and macOS,
we are not checking for the O.S. since this is already done by LLVM.

Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
Message-id: 20200529205122.714-4-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 configure | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ audio_win_int=""
 libs_qga=""
 debug_info="yes"
 stack_protector=""
+safe_stack=""
 use_containers="yes"
 gdb_bin=$(command -v "gdb-multiarch" || command -v "gdb")

@@ -XXX,XX +XXX,XX @@ for opt do
   ;;
   --disable-stack-protector) stack_protector="no"
   ;;
+  --enable-safe-stack) safe_stack="yes"
+  ;;
+  --disable-safe-stack) safe_stack="no"
+  ;;
   --disable-curses) curses="no"
   ;;
   --enable-curses) curses="yes"
@@ -XXX,XX +XXX,XX @@ disabled with --disable-FEATURE, default is enabled if available:
   debug-tcg       TCG debugging (default is disabled)
   debug-info      debugging information
   sparse          sparse checker
+  safe-stack      SafeStack Stack Smash Protection. Depends on
+                  clang/llvm >= 3.7 and requires coroutine backend ucontext.

   gnutls          GNUTLS cryptography support
   nettle          nettle cryptography support
@@ -XXX,XX +XXX,XX @@ if test "$debug_stack_usage" = "yes"; then
   fi
 fi

+##################################################
+# SafeStack
+
+
+if test "$safe_stack" = "yes"; then
+cat > $TMPC << EOF
+int main(int argc, char *argv[])
+{
+#if ! __has_feature(safe_stack)
+#error SafeStack Disabled
+#endif
+    return 0;
+}
+EOF
+  flag="-fsanitize=safe-stack"
+  # Check that safe-stack is supported and enabled.
+  if compile_prog "-Werror $flag" "$flag"; then
+    # Flag needed both at compilation and at linking
+    QEMU_CFLAGS="$QEMU_CFLAGS $flag"
+    QEMU_LDFLAGS="$QEMU_LDFLAGS $flag"
+  else
+    error_exit "SafeStack not supported by your compiler"
+  fi
+  if test "$coroutine" != "ucontext"; then
+    error_exit "SafeStack is only supported by the coroutine backend ucontext"
+  fi
+else
+cat > $TMPC << EOF
+int main(int argc, char *argv[])
+{
+#if defined(__has_feature)
+#if __has_feature(safe_stack)
+#error SafeStack Enabled
+#endif
+#endif
+    return 0;
+}
+EOF
+if test "$safe_stack" = "no"; then
+  # Make sure that safe-stack is disabled
+  if ! compile_prog "-Werror" ""; then
+    # SafeStack was already enabled, try to explicitly remove the feature
+    flag="-fno-sanitize=safe-stack"
+    if ! compile_prog "-Werror $flag" "$flag"; then
+      error_exit "Configure cannot disable SafeStack"
+    fi
+    QEMU_CFLAGS="$QEMU_CFLAGS $flag"
+    QEMU_LDFLAGS="$QEMU_LDFLAGS $flag"
+  fi
+else # "$safe_stack" = ""
+  # Set safe_stack to yes or no based on pre-existing flags
+  if compile_prog "-Werror" ""; then
+    safe_stack="no"
+  else
+    safe_stack="yes"
+    if test "$coroutine" != "ucontext"; then
+      error_exit "SafeStack is only supported by the coroutine backend ucontext"
+    fi
+  fi
+fi
+fi

 ##########################################
 # check if we have open_by_handle_at
@@ -XXX,XX +XXX,XX @@ echo "sparse enabled    $sparse"
 echo "strip binaries    $strip_opt"
 echo "profiler          $profiler"
 echo "static build      $static"
+echo "safe stack        $safe_stack"
 if test "$darwin" = "yes" ; then
     echo "Cocoa support     $cocoa"
 fi
@@ -XXX,XX +XXX,XX @@ if test "$ccache_cpp2" = "yes"; then
   echo "export CCACHE_CPP2=y" >> $config_host_mak
 fi

+if test "$safe_stack" = "yes"; then
+  echo "CONFIG_SAFESTACK=y" >> $config_host_mak
+fi
+
 # If we're using a separate build tree, set it up now.
 # DIRS are directories which we simply mkdir in the build tree;
 # LINKS are things to symlink back into the source tree
--
2.26.2
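
The two probes that configure compiles can also be tried by hand. A standalone version of the same __has_feature detection logic, offered as an illustration and assuming nothing beyond a C compiler; it reports whether SafeStack instrumentation is active under the current build flags:

    #include <stdio.h>

    int main(void)
    {
    #if defined(__has_feature)
    # if __has_feature(safe_stack)
        /* true when built with clang -fsanitize=safe-stack */
        printf("SafeStack instrumentation is active\n");
        return 0;
    # endif
    #endif
        printf("SafeStack instrumentation is not active\n");
        return 1;
    }
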
From: Daniele Buono <dbuono@linux.vnet.ibm.com>

SafeStack is a stack protection technique implemented in llvm. It is
enabled with a -fsanitize flag.
iotests are currently disabled when any -fsanitize option is used,
because such options tend to produce additional warnings and false
positives.

While common -fsanitize options are used to verify the code and not
added in production, SafeStack's main use is in production environments
to protect against stack smashing.

Since SafeStack does not print any warning or false positive, enable
iotests when SafeStack is the only -fsanitize option used.
This is likely going to be a production binary and we want to make sure
it works correctly.

Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
Message-id: 20200529205122.714-5-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/check-block.sh | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/check-block.sh b/tests/check-block.sh
index XXXXXXX..XXXXXXX 100755
--- a/tests/check-block.sh
+++ b/tests/check-block.sh
@@ -XXX,XX +XXX,XX @@ if grep -q "CONFIG_GPROF=y" config-host.mak 2>/dev/null ; then
     exit 0
 fi

-if grep -q "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null ; then
+# Disable tests with any sanitizer except for SafeStack
+CFLAGS=$( grep "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null )
+SANITIZE_FLAGS=""
+#Remove all occurrencies of -fsanitize=safe-stack
+for i in ${CFLAGS}; do
+        if [ "${i}" != "-fsanitize=safe-stack" ]; then
+                SANITIZE_FLAGS="${SANITIZE_FLAGS} ${i}"
+        fi
+done
+if echo ${SANITIZE_FLAGS} | grep -q "\-fsanitize" 2>/dev/null; then
+    # Have a sanitize flag that is not allowed, stop
     echo "Sanitizers are enabled ==> Not running the qemu-iotests."
     exit 0
 fi
--
2.26.2

A lot of CPU time is spent simply locking/unlocking q->lock during
polling. Check for completion outside the lock to make q->lock disappear
from the profile.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Message-id: 20200617132201.1832152-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s)

     for (i = 0; i < s->nr_queues; i++) {
         NVMeQueuePair *q = s->queues[i];
+        const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
+        NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
+
+        /*
+         * Do an early check for completions. q->lock isn't needed because
+         * nvme_process_completion() only runs in the event loop thread and
+         * cannot race with itself.
+         */
+        if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
+            continue;
+        }
+
         qemu_mutex_lock(&q->lock);
         while (nvme_process_completion(s, q)) {
             /* Keep polling */
--
2.26.2

From: Stefano Garzarella <sgarzare@redhat.com>

The virtio-blk-vhost-vdpa driver in libblkio 1.3.0 supports fd
passing through the new 'fd' property.

Now that we use qemu_open() on '@path' when the virtio-blk driver
supports fd passing, let's announce it.
In this way, the management layer can pass the file descriptor of an
already opened vhost-vdpa character device. This is useful especially
when the device can only be accessed with certain privileges.

Add the '@fdset' feature only when the virtio-blk-vhost-vdpa driver
in libblkio supports it.

Suggested-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-id: 20230530071941.8954-3-sgarzare@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 qapi/block-core.json | 6 ++++++
 meson.build          | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 #
 # @path: path to the vhost-vdpa character device.
 #
+# Features:
+# @fdset: Member @path supports the special "/dev/fdset/N" path
+#     (since 8.1)
+#
 # Since: 7.2
 ##
 { 'struct': 'BlockdevOptionsVirtioBlkVhostVdpa',
   'data': { 'path': 'str' },
+  'features': [ { 'name' :'fdset',
+                  'if': 'CONFIG_BLKIO_VHOST_VDPA_FD' } ],
   'if': 'CONFIG_BLKIO' }

 ##
diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ config_host_data.set('CONFIG_LZO', lzo.found())
 config_host_data.set('CONFIG_MPATH', mpathpersist.found())
 config_host_data.set('CONFIG_MPATH_NEW_API', mpathpersist_new_api)
 config_host_data.set('CONFIG_BLKIO', blkio.found())
+if blkio.found()
+  config_host_data.set('CONFIG_BLKIO_VHOST_VDPA_FD',
+                       blkio.version().version_compare('>=1.3.0'))
+endif
 config_host_data.set('CONFIG_CURL', curl.found())
 config_host_data.set('CONFIG_CURSES', curses.found())
 config_host_data.set('CONFIG_GBM', gbm.found())
--
2.40.1
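
The early check added in the nvme patch above hinges on the NVMe completion queue phase tag. A self-contained toy model of the convention, with invented types and little-endian handling omitted: an entry whose phase bit still equals the queue's recorded phase has not been rewritten by the device, and the recorded phase flips whenever the head wraps, so stale entries are never re-consumed:

    /* Toy model of the completion-queue phase tag; not QEMU code. */
    #include <stdint.h>
    #include <stdio.h>

    #define CQ_SIZE 4

    typedef struct { uint16_t status; } Cqe;  /* bit 0 = phase tag */

    typedef struct {
        Cqe queue[CQ_SIZE];
        unsigned head;
        int phase;          /* phase of already-consumed entries */
    } CompletionQueue;

    /* lock-free peek, mirroring the early check above: a phase tag
     * equal to cq->phase means the device has written nothing new */
    static int cq_entry_pending(CompletionQueue *cq)
    {
        return (cq->queue[cq->head].status & 0x1) != cq->phase;
    }

    static void cq_pop(CompletionQueue *cq)
    {
        cq->head = (cq->head + 1) % CQ_SIZE;
        if (cq->head == 0) {
            cq->phase = !cq->phase;   /* wrapped: old entries are stale */
        }
    }

    int main(void)
    {
        CompletionQueue cq = { .phase = 0 };

        cq.queue[0].status = 0x1;                        /* device posts */
        printf("pending: %d\n", cq_entry_pending(&cq));  /* 1 */
        cq_pop(&cq);
        printf("pending: %d\n", cq_entry_pending(&cq));  /* 0 */
        return 0;
    }

Because the peek reads a single entry that only the device writes and only this thread consumes, it is safe without taking q->lock, which is exactly what makes the lock disappear from the CPU profile.
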