The following changes since commit c6a5fc2ac76c5ab709896ee1b0edd33685a67ed1:

  decodetree: Add --output-null for meson testing (2023-05-31 19:56:42 -0700)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 98b126f5e3228a346c774e569e26689943b401dd:

  qapi: add '@fdset' feature for BlockdevOptionsVirtioBlkVhostVdpa (2023-06-01 11:08:21 -0400)

----------------------------------------------------------------
Pull request

- Stefano Garzarella's blkio block driver 'fd' parameter
- My thread-local blk_io_plug() series

----------------------------------------------------------------

Stefan Hajnoczi (6):
  block: add blk_io_plug_call() API
  block/nvme: convert to blk_io_plug_call() API
  block/blkio: convert to blk_io_plug_call() API
  block/io_uring: convert to blk_io_plug_call() API
  block/linux-aio: convert to blk_io_plug_call() API
  block: remove bdrv_co_io_plug() API

Stefano Garzarella (2):
  block/blkio: use qemu_open() to support fd passing for virtio-blk
  qapi: add '@fdset' feature for BlockdevOptionsVirtioBlkVhostVdpa

 MAINTAINERS                       |   1 +
 qapi/block-core.json              |   6 ++
 meson.build                       |   4 +
 include/block/block-io.h          |   3 -
 include/block/block_int-common.h  |  11 ---
 include/block/raw-aio.h           |  14 ---
 include/sysemu/block-backend-io.h |  13 +--
 block/blkio.c                     |  96 ++++++++++++------
 block/block-backend.c             |  22 -----
 block/file-posix.c                |  38 -------
 block/io.c                        |  37 -------
 block/io_uring.c                  |  44 ++++-----
 block/linux-aio.c                 |  41 +++-----
 block/nvme.c                      |  44 +++------
 block/plug.c                      | 159 ++++++++++++++++++++++++++++++
 hw/block/dataplane/xen-block.c    |   8 +-
 hw/block/virtio-blk.c             |   4 +-
 hw/scsi/virtio-scsi.c             |   6 +-
 block/meson.build                 |   1 +
 block/trace-events                |   6 +-
 20 files changed, 293 insertions(+), 265 deletions(-)
 create mode 100644 block/plug.c

-- 
2.40.1
Introduce a new API for thread-local blk_io_plug() that does not
traverse the block graph. The goal is to make blk_io_plug() multi-queue
friendly.

Instead of having block drivers track whether or not we're in a plugged
section, provide an API that allows them to defer a function call until
we're unplugged: blk_io_plug_call(fn, opaque). If blk_io_plug_call() is
called multiple times with the same fn/opaque pair, then fn() is only
called once at the end of the plugged section - resulting in batching.

This patch introduces the API and changes blk_io_plug()/blk_io_unplug().
blk_io_plug()/blk_io_unplug() no longer require a BlockBackend argument
because the plug state is now thread-local.

Later patches convert block drivers to blk_io_plug_call() and then we
can finally remove .bdrv_co_io_plug() once all block drivers have been
converted.

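For illustration, here is a minimal sketch of the caller-side semantics
(my_submit_fn and my_ctx are hypothetical names, not part of this patch):

    #include "sysemu/block-backend.h"   /* declares the three functions */

    static void my_submit_fn(void *opaque)   /* hypothetical callback */
    {
        /* submit all requests accumulated in @opaque in one go */
    }

    void example(void *my_ctx)
    {
        blk_io_plug();                           /* enter plugged section */
        blk_io_plug_call(my_submit_fn, my_ctx);  /* deferred */
        blk_io_plug_call(my_submit_fn, my_ctx);  /* same fn/opaque pair: merged */
        blk_io_unplug();                         /* my_submit_fn(my_ctx) runs once */

        /* outside a plugged section the call happens immediately */
        blk_io_plug_call(my_submit_fn, my_ctx);
    }

Nesting is supported: deferred functions only run when the outermost
blk_io_unplug() in the thread is reached.
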
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 20230530180959.1108766-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS                       |   1 +
 include/sysemu/block-backend-io.h |  13 +--
 block/block-backend.c             |  22 -----
 block/plug.c                      | 159 ++++++++++++++++++++++++++++++
 hw/block/dataplane/xen-block.c    |   8 +-
 hw/block/virtio-blk.c             |   4 +-
 hw/scsi/virtio-scsi.c             |   6 +-
 block/meson.build                 |   1 +
 8 files changed, 173 insertions(+), 41 deletions(-)
 create mode 100644 block/plug.c

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: util/aio-*.c
 F: util/aio-*.h
 F: util/fdmon-*.c
 F: block/io.c
+F: block/plug.c
 F: migration/block*
 F: include/block/aio.h
 F: include/block/aio-wait.h
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -XXX,XX +XXX,XX @@ void blk_iostatus_set_err(BlockBackend *blk, int error);
 int blk_get_max_iov(BlockBackend *blk);
 int blk_get_max_hw_iov(BlockBackend *blk);

-/*
- * blk_io_plug/unplug are thread-local operations. This means that multiple
- * IOThreads can simultaneously call plug/unplug, but the caller must ensure
- * that each unplug() is called in the same IOThread of the matching plug().
- */
-void coroutine_fn blk_co_io_plug(BlockBackend *blk);
-void co_wrapper blk_io_plug(BlockBackend *blk);
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk);
-void co_wrapper blk_io_unplug(BlockBackend *blk);
+void blk_io_plug(void);
+void blk_io_unplug(void);
+void blk_io_plug_call(void (*fn)(void *), void *opaque);

 AioContext *blk_get_aio_context(BlockBackend *blk);
 BlockAcctStats *blk_get_stats(BlockBackend *blk);
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
     notifier_list_add(&blk->insert_bs_notifiers, notify);
 }

-void coroutine_fn blk_co_io_plug(BlockBackend *blk)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    IO_CODE();
-    GRAPH_RDLOCK_GUARD();
-
-    if (bs) {
-        bdrv_co_io_plug(bs);
-    }
-}
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk)
-{
-    BlockDriverState *bs = blk_bs(blk);
-    IO_CODE();
-    GRAPH_RDLOCK_GUARD();
-
-    if (bs) {
-        bdrv_co_io_unplug(bs);
-    }
-}
-
 BlockAcctStats *blk_get_stats(BlockBackend *blk)
 {
     IO_CODE();
diff --git a/block/plug.c b/block/plug.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/block/plug.c
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Block I/O plugging
+ *
+ * Copyright Red Hat.
+ *
+ * This API defers a function call within a blk_io_plug()/blk_io_unplug()
+ * section, allowing multiple calls to batch up. This is a performance
+ * optimization that is used in the block layer to submit several I/O requests
+ * at once instead of individually:
+ *
+ *   blk_io_plug(); <-- start of plugged region
+ *   ...
+ *   blk_io_plug_call(my_func, my_obj); <-- deferred my_func(my_obj) call
+ *   blk_io_plug_call(my_func, my_obj); <-- another
+ *   blk_io_plug_call(my_func, my_obj); <-- another
+ *   ...
+ *   blk_io_unplug(); <-- end of plugged region, my_func(my_obj) is called once
+ *
+ * This code is actually generic and not tied to the block layer. If another
+ * subsystem needs this functionality, it could be renamed.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine-tls.h"
+#include "qemu/notify.h"
+#include "qemu/thread.h"
+#include "sysemu/block-backend.h"
+
+/* A function call that has been deferred until unplug() */
+typedef struct {
+    void (*fn)(void *);
+    void *opaque;
+} UnplugFn;
+
+/* Per-thread state */
+typedef struct {
+    unsigned count;       /* how many times has plug() been called? */
+    GArray *unplug_fns;   /* functions to call at unplug time */
+} Plug;
+
+/* Use get_ptr_plug() to fetch this thread-local value */
+QEMU_DEFINE_STATIC_CO_TLS(Plug, plug);
+
+/* Called at thread cleanup time */
+static void blk_io_plug_atexit(Notifier *n, void *value)
+{
+    Plug *plug = get_ptr_plug();
+    g_array_free(plug->unplug_fns, TRUE);
+}
+
+/* This won't involve coroutines, so use __thread */
+static __thread Notifier blk_io_plug_atexit_notifier;
+
+/**
+ * blk_io_plug_call:
+ * @fn: a function pointer to be invoked
+ * @opaque: a user-defined argument to @fn()
+ *
+ * Call @fn(@opaque) immediately if not within a blk_io_plug()/blk_io_unplug()
+ * section.
+ *
+ * Otherwise defer the call until the end of the outermost
+ * blk_io_plug()/blk_io_unplug() section in this thread. If the same
+ * @fn/@opaque pair has already been deferred, it will only be called once upon
+ * blk_io_unplug() so that accumulated calls are batched into a single call.
+ *
+ * The caller must ensure that @opaque is not freed before @fn() is invoked.
+ */
+void blk_io_plug_call(void (*fn)(void *), void *opaque)
+{
+    Plug *plug = get_ptr_plug();
+
+    /* Call immediately if we're not plugged */
+    if (plug->count == 0) {
+        fn(opaque);
+        return;
+    }
+
+    GArray *array = plug->unplug_fns;
+    if (!array) {
+        array = g_array_new(FALSE, FALSE, sizeof(UnplugFn));
+        plug->unplug_fns = array;
+        blk_io_plug_atexit_notifier.notify = blk_io_plug_atexit;
+        qemu_thread_atexit_add(&blk_io_plug_atexit_notifier);
+    }
+
+    UnplugFn *fns = (UnplugFn *)array->data;
+    UnplugFn new_fn = {
+        .fn = fn,
+        .opaque = opaque,
+    };
+
+    /*
+     * There won't be many, so do a linear search. If this becomes a bottleneck
+     * then a binary search (glib 2.62+) or different data structure could be
+     * used.
+     */
+    for (guint i = 0; i < array->len; i++) {
+        if (memcmp(&fns[i], &new_fn, sizeof(new_fn)) == 0) {
+            return; /* already exists */
+        }
+    }
+
+    g_array_append_val(array, new_fn);
+}
+
+/**
+ * blk_io_plug: Defer blk_io_plug_call() functions until blk_io_unplug()
+ *
+ * blk_io_plug/unplug are thread-local operations. This means that multiple
+ * threads can simultaneously call plug/unplug, but the caller must ensure that
+ * each unplug() is called in the same thread of the matching plug().
+ *
+ * Nesting is supported. blk_io_plug_call() functions are only called at the
+ * outermost blk_io_unplug().
+ */
+void blk_io_plug(void)
+{
+    Plug *plug = get_ptr_plug();
+
+    assert(plug->count < UINT32_MAX);
+
+    plug->count++;
+}
+
+/**
+ * blk_io_unplug: Run any pending blk_io_plug_call() functions
+ *
+ * There must have been a matching blk_io_plug() call in the same thread prior
+ * to this blk_io_unplug() call.
+ */
+void blk_io_unplug(void)
+{
+    Plug *plug = get_ptr_plug();
+
+    assert(plug->count > 0);
+
+    if (--plug->count > 0) {
+        return;
+    }
+
+    GArray *array = plug->unplug_fns;
+    if (!array) {
+        return;
+    }
+
+    UnplugFn *fns = (UnplugFn *)array->data;
+
+    for (guint i = 0; i < array->len; i++) {
+        fns[i].fn(fns[i].opaque);
+    }
+
+    /*
+     * This resets the array without freeing memory so that appending is cheap
+     * in the future.
+     */
+    g_array_set_size(array, 0);
+}
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -XXX,XX +XXX,XX @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
      * is below us.
      */
     if (inflight_atstart > IO_PLUG_THRESHOLD) {
-        blk_io_plug(dataplane->blk);
+        blk_io_plug();
     }
     while (rc != rp) {
         /* pull request from ring */
@@ -XXX,XX +XXX,XX @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)

         if (inflight_atstart > IO_PLUG_THRESHOLD &&
             batched >= inflight_atstart) {
-            blk_io_unplug(dataplane->blk);
+            blk_io_unplug();
         }
         xen_block_do_aio(request);
         if (inflight_atstart > IO_PLUG_THRESHOLD) {
             if (batched >= inflight_atstart) {
-                blk_io_plug(dataplane->blk);
+                blk_io_plug();
                 batched = 0;
             } else {
                 batched++;
@@ -XXX,XX +XXX,XX @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
         }
     }
     if (inflight_atstart > IO_PLUG_THRESHOLD) {
-        blk_io_unplug(dataplane->blk);
+        blk_io_unplug();
     }

     return done_something;
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
     bool suppress_notifications = virtio_queue_get_notification(vq);

     aio_context_acquire(blk_get_aio_context(s->blk));
-    blk_io_plug(s->blk);
+    blk_io_plug();

     do {
         if (suppress_notifications) {
@@ -XXX,XX +XXX,XX @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
         virtio_blk_submit_multireq(s, &mrb);
     }

-    blk_io_unplug(s->blk);
+    blk_io_unplug();
     aio_context_release(blk_get_aio_context(s->blk));
 }

diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -XXX,XX +XXX,XX @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
         return -ENOBUFS;
     }
     scsi_req_ref(req->sreq);
-    blk_io_plug(d->conf.blk);
+    blk_io_plug();
     object_unref(OBJECT(d));
     return 0;
 }
@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_cmd_req_submit(VirtIOSCSI *s, VirtIOSCSIReq *req)
     if (scsi_req_enqueue(sreq)) {
         scsi_req_continue(sreq);
     }
-    blk_io_unplug(sreq->dev->conf.blk);
+    blk_io_unplug();
     scsi_req_unref(sreq);
 }

@@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
     while (!QTAILQ_EMPTY(&reqs)) {
         req = QTAILQ_FIRST(&reqs);
         QTAILQ_REMOVE(&reqs, req, next);
-        blk_io_unplug(req->sreq->dev->conf.blk);
+        blk_io_unplug();
         scsi_req_unref(req->sreq);
         virtqueue_detach_element(req->vq, &req->elem, 0);
         virtio_scsi_free_req(req);
diff --git a/block/meson.build b/block/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -XXX,XX +XXX,XX @@ block_ss.add(files(
   'mirror.c',
   'nbd.c',
   'null.c',
+  'plug.c',
   'qapi.c',
   'qcow2-bitmap.c',
   'qcow2-cache.c',
-- 
2.40.1
Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

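The conversion pattern, condensed from the diff below: the per-request
nvme_kick() + nvme_process_completion() pair becomes an unplug callback
registered with blk_io_plug_call(), so a plugged section rings the doorbell
only once per batch:

    static void nvme_unplug_fn(void *opaque)
    {
        NVMeQueuePair *q = opaque;

        QEMU_LOCK_GUARD(&q->lock);
        nvme_kick(q);                /* one doorbell write per batch */
        nvme_process_completion(q);
    }

    /* in nvme_submit_command(), after queuing the SQE: */
    blk_io_plug_call(nvme_unplug_fn, q);
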
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 20230530180959.1108766-3-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c       | 44 ++++++++++++--------------------------------
 block/trace-events |  1 -
 2 files changed, 12 insertions(+), 33 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/vfio-helpers.h"
 #include "block/block-io.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "sysemu/replay.h"
 #include "trace.h"

@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
     int blkshift;

     uint64_t max_transfer;
-    bool plugged;

     bool supports_write_zeroes;
     bool supports_discard;
@@ -XXX,XX +XXX,XX @@ static void nvme_kick(NVMeQueuePair *q)
 {
     BDRVNVMeState *s = q->s;

-    if (s->plugged || !q->need_kick) {
+    if (!q->need_kick) {
         return;
     }
     trace_nvme_kick(s, q->index);
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
     NvmeCqe *c;

     trace_nvme_process_completion(s, q->index, q->inflight);
-    if (s->plugged) {
-        trace_nvme_process_completion_queue_plugged(s, q->index);
-        return false;
-    }

     /*
      * Support re-entrancy when a request cb() function invokes aio_poll().
@@ -XXX,XX +XXX,XX @@ static void nvme_trace_command(const NvmeCmd *cmd)
     }
 }

+static void nvme_unplug_fn(void *opaque)
+{
+    NVMeQueuePair *q = opaque;
+
+    QEMU_LOCK_GUARD(&q->lock);
+    nvme_kick(q);
+    nvme_process_completion(q);
+}
+
 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
                                 NvmeCmd *cmd, BlockCompletionFunc cb,
                                 void *opaque)
@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
     q->need_kick++;
-    nvme_kick(q);
-    nvme_process_completion(q);
+    blk_io_plug_call(nvme_unplug_fn, q);
     qemu_mutex_unlock(&q->lock);
 }

@@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs,
     }
 }

-static void coroutine_fn nvme_co_io_plug(BlockDriverState *bs)
-{
-    BDRVNVMeState *s = bs->opaque;
-    assert(!s->plugged);
-    s->plugged = true;
-}
-
-static void coroutine_fn nvme_co_io_unplug(BlockDriverState *bs)
-{
-    BDRVNVMeState *s = bs->opaque;
-    assert(s->plugged);
-    s->plugged = false;
-    for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
-        NVMeQueuePair *q = s->queues[i];
-        qemu_mutex_lock(&q->lock);
-        nvme_kick(q);
-        nvme_process_completion(q);
-        qemu_mutex_unlock(&q->lock);
-    }
-}
-
 static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_nvme = {
     .bdrv_detach_aio_context  = nvme_detach_aio_context,
     .bdrv_attach_aio_context  = nvme_attach_aio_context,

-    .bdrv_co_io_plug          = nvme_co_io_plug,
-    .bdrv_co_io_unplug        = nvme_co_io_unplug,
-
     .bdrv_register_buf        = nvme_register_buf,
     .bdrv_unregister_buf      = nvme_unregister_buf,
 };
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_kick(void *s, unsigned q_index) "s %p q #%u"
 nvme_dma_flush_queue_wait(void *s) "s %p"
 nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
 nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d"
-nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u"
 nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
 nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
-- 
2.40.1
180 | diff view generated by jsdifflib |
1 | There are three issues with the current NVMeRequest->busy field: | 1 | Stop using the .bdrv_co_io_plug() API because it is not multi-queue |
---|---|---|---|
2 | 1. The busy field is accidentally accessed outside q->lock when request | 2 | block layer friendly. Use the new blk_io_plug_call() API to batch I/O |
3 | submission fails. | 3 | submission instead. |
4 | 2. Waiters on free_req_queue are not woken when a request is returned | ||
5 | early due to submission failure. | ||
6 | 2. Finding a free request involves scanning all requests. This makes | ||
7 | request submission O(n^2). | ||
8 | |||
9 | Switch to an O(1) freelist that is always accessed under the lock. | ||
10 | |||
11 | Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and | ||
12 | NVME_NUM_REQS, the number of usable requests. This makes the code | ||
13 | simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind | ||
14 | that one slot is reserved. | ||
15 | 4 | ||
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 5 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
17 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 6 | Reviewed-by: Eric Blake <eblake@redhat.com> |
18 | Message-id: 20200617132201.1832152-5-stefanha@redhat.com | 7 | Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> |
8 | Acked-by: Kevin Wolf <kwolf@redhat.com> | ||
9 | Message-id: 20230530180959.1108766-4-stefanha@redhat.com | ||
19 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 10 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
20 | --- | 11 | --- |
21 | block/nvme.c | 81 ++++++++++++++++++++++++++++++++++------------------ | 12 | block/blkio.c | 43 ++++++++++++++++++++++++------------------- |
22 | 1 file changed, 54 insertions(+), 27 deletions(-) | 13 | 1 file changed, 24 insertions(+), 19 deletions(-) |
23 | 14 | ||
24 | diff --git a/block/nvme.c b/block/nvme.c | 15 | diff --git a/block/blkio.c b/block/blkio.c |
25 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/block/nvme.c | 17 | --- a/block/blkio.c |
27 | +++ b/block/nvme.c | 18 | +++ b/block/blkio.c |
28 | @@ -XXX,XX +XXX,XX @@ | 19 | @@ -XXX,XX +XXX,XX @@ |
29 | #define NVME_QUEUE_SIZE 128 | 20 | #include "qemu/error-report.h" |
30 | #define NVME_BAR_SIZE 8192 | 21 | #include "qapi/qmp/qdict.h" |
31 | 22 | #include "qemu/module.h" | |
23 | +#include "sysemu/block-backend.h" | ||
24 | #include "exec/memory.h" /* for ram_block_discard_disable() */ | ||
25 | |||
26 | #include "block/block-io.h" | ||
27 | @@ -XXX,XX +XXX,XX @@ static void blkio_detach_aio_context(BlockDriverState *bs) | ||
28 | NULL, NULL, NULL); | ||
29 | } | ||
30 | |||
31 | -/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */ | ||
32 | -static void blkio_submit_io(BlockDriverState *bs) | ||
32 | +/* | 33 | +/* |
33 | + * We have to leave one slot empty as that is the full queue case where | 34 | + * Called by blk_io_unplug() or immediately if not plugged. Called without |
34 | + * head == tail + 1. | 35 | + * blkio_lock. |
35 | + */ | 36 | + */ |
36 | +#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1) | 37 | +static void blkio_unplug_fn(void *opaque) |
38 | { | ||
39 | - if (qatomic_read(&bs->io_plugged) == 0) { | ||
40 | - BDRVBlkioState *s = bs->opaque; | ||
41 | + BDRVBlkioState *s = opaque; | ||
42 | |||
43 | + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { | ||
44 | blkioq_do_io(s->blkioq, NULL, 0, 0, NULL); | ||
45 | } | ||
46 | } | ||
47 | |||
48 | +/* | ||
49 | + * Schedule I/O submission after enqueuing a new request. Called without | ||
50 | + * blkio_lock. | ||
51 | + */ | ||
52 | +static void blkio_submit_io(BlockDriverState *bs) | ||
53 | +{ | ||
54 | + BDRVBlkioState *s = bs->opaque; | ||
37 | + | 55 | + |
38 | typedef struct { | 56 | + blk_io_plug_call(blkio_unplug_fn, s); |
39 | int32_t head, tail; | ||
40 | uint8_t *queue; | ||
41 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
42 | int cid; | ||
43 | void *prp_list_page; | ||
44 | uint64_t prp_list_iova; | ||
45 | - bool busy; | ||
46 | + int free_req_next; /* q->reqs[] index of next free req */ | ||
47 | } NVMeRequest; | ||
48 | |||
49 | typedef struct { | ||
50 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
51 | /* Fields protected by @lock */ | ||
52 | NVMeQueue sq, cq; | ||
53 | int cq_phase; | ||
54 | - NVMeRequest reqs[NVME_QUEUE_SIZE]; | ||
55 | + int free_req_head; | ||
56 | + NVMeRequest reqs[NVME_NUM_REQS]; | ||
57 | bool busy; | ||
58 | int need_kick; | ||
59 | int inflight; | ||
60 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | ||
61 | qemu_mutex_init(&q->lock); | ||
62 | q->index = idx; | ||
63 | qemu_co_queue_init(&q->free_req_queue); | ||
64 | - q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE); | ||
65 | + q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); | ||
66 | r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, | ||
67 | - s->page_size * NVME_QUEUE_SIZE, | ||
68 | + s->page_size * NVME_NUM_REQS, | ||
69 | false, &prp_list_iova); | ||
70 | if (r) { | ||
71 | goto fail; | ||
72 | } | ||
73 | - for (i = 0; i < NVME_QUEUE_SIZE; i++) { | ||
74 | + q->free_req_head = -1; | ||
75 | + for (i = 0; i < NVME_NUM_REQS; i++) { | ||
76 | NVMeRequest *req = &q->reqs[i]; | ||
77 | req->cid = i + 1; | ||
78 | + req->free_req_next = q->free_req_head; | ||
79 | + q->free_req_head = i; | ||
80 | req->prp_list_page = q->prp_list_pages + i * s->page_size; | ||
81 | req->prp_list_iova = prp_list_iova + i * s->page_size; | ||
82 | } | ||
83 | + | ||
84 | nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err); | ||
85 | if (local_err) { | ||
86 | error_propagate(errp, local_err); | ||
87 | @@ -XXX,XX +XXX,XX @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q) | ||
88 | */ | ||
89 | static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) | ||
90 | { | ||
91 | - int i; | ||
92 | - NVMeRequest *req = NULL; | ||
93 | + NVMeRequest *req; | ||
94 | |||
95 | qemu_mutex_lock(&q->lock); | ||
96 | - while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) { | ||
97 | - /* We have to leave one slot empty as that is the full queue case (head | ||
98 | - * == tail + 1). */ | ||
99 | + | ||
100 | + while (q->free_req_head == -1) { | ||
101 | if (qemu_in_coroutine()) { | ||
102 | trace_nvme_free_req_queue_wait(q); | ||
103 | qemu_co_queue_wait(&q->free_req_queue, &q->lock); | ||
104 | @@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) | ||
105 | return NULL; | ||
106 | } | ||
107 | } | ||
108 | - for (i = 0; i < NVME_QUEUE_SIZE; i++) { | ||
109 | - if (!q->reqs[i].busy) { | ||
110 | - q->reqs[i].busy = true; | ||
111 | - req = &q->reqs[i]; | ||
112 | - break; | ||
113 | - } | ||
114 | - } | ||
115 | - /* We have checked inflight and need_kick while holding q->lock, so one | ||
116 | - * free req must be available. */ | ||
117 | - assert(req); | ||
118 | + | ||
119 | + req = &q->reqs[q->free_req_head]; | ||
120 | + q->free_req_head = req->free_req_next; | ||
121 | + req->free_req_next = -1; | ||
122 | + | ||
123 | qemu_mutex_unlock(&q->lock); | ||
124 | return req; | ||
125 | } | ||
126 | |||
127 | +/* With q->lock */ | ||
128 | +static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req) | ||
129 | +{ | ||
130 | + req->free_req_next = q->free_req_head; | ||
131 | + q->free_req_head = req - q->reqs; | ||
132 | +} | 57 | +} |
133 | + | 58 | + |
134 | +/* With q->lock */ | 59 | static int coroutine_fn |
135 | +static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q) | 60 | blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) |
136 | +{ | ||
137 | + if (!qemu_co_queue_empty(&q->free_req_queue)) { | ||
138 | + replay_bh_schedule_oneshot_event(s->aio_context, | ||
139 | + nvme_free_req_queue_cb, q); | ||
140 | + } | ||
141 | +} | ||
142 | + | ||
143 | +/* Insert a request in the freelist and wake waiters */ | ||
144 | +static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q, | ||
145 | + NVMeRequest *req) | ||
146 | +{ | ||
147 | + qemu_mutex_lock(&q->lock); | ||
148 | + nvme_put_free_req_locked(q, req); | ||
149 | + nvme_wake_free_req_locked(s, q); | ||
150 | + qemu_mutex_unlock(&q->lock); | ||
151 | +} | ||
152 | + | ||
153 | static inline int nvme_translate_error(const NvmeCqe *c) | ||
154 | { | 61 | { |
155 | uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF; | 62 | @@ -XXX,XX +XXX,XX @@ blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) |
156 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | 63 | |
157 | req = *preq; | 64 | WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { |
158 | assert(req.cid == cid); | 65 | blkioq_discard(s->blkioq, offset, bytes, &cod, 0); |
159 | assert(req.cb); | 66 | - blkio_submit_io(bs); |
160 | - preq->busy = false; | ||
161 | + nvme_put_free_req_locked(q, preq); | ||
162 | preq->cb = preq->opaque = NULL; | ||
163 | qemu_mutex_unlock(&q->lock); | ||
164 | req.cb(req.opaque, ret); | ||
165 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
166 | /* Notify the device so it can post more completions. */ | ||
167 | smp_mb_release(); | ||
168 | *q->cq.doorbell = cpu_to_le32(q->cq.head); | ||
169 | - if (!qemu_co_queue_empty(&q->free_req_queue)) { | ||
170 | - replay_bh_schedule_oneshot_event(s->aio_context, | ||
171 | - nvme_free_req_queue_cb, q); | ||
172 | - } | ||
173 | + nvme_wake_free_req_locked(s, q); | ||
174 | } | 67 | } |
175 | q->busy = false; | 68 | |
176 | return progress; | 69 | + blkio_submit_io(bs); |
177 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, | 70 | qemu_coroutine_yield(); |
178 | r = nvme_cmd_map_qiov(bs, &cmd, req, qiov); | 71 | return cod.ret; |
179 | qemu_co_mutex_unlock(&s->dma_map_lock); | 72 | } |
180 | if (r) { | 73 | @@ -XXX,XX +XXX,XX @@ blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, |
181 | - req->busy = false; | 74 | |
182 | + nvme_put_free_req_and_wake(s, ioq, req); | 75 | WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { |
183 | return r; | 76 | blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0); |
77 | - blkio_submit_io(bs); | ||
184 | } | 78 | } |
185 | nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | 79 | |
186 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, | 80 | + blkio_submit_io(bs); |
187 | qemu_co_mutex_unlock(&s->dma_map_lock); | 81 | qemu_coroutine_yield(); |
188 | 82 | ||
189 | if (ret) { | 83 | if (use_bounce_buffer) { |
190 | - req->busy = false; | 84 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset, |
191 | + nvme_put_free_req_and_wake(s, ioq, req); | 85 | |
192 | goto out; | 86 | WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { |
87 | blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags); | ||
88 | - blkio_submit_io(bs); | ||
193 | } | 89 | } |
194 | 90 | ||
91 | + blkio_submit_io(bs); | ||
92 | qemu_coroutine_yield(); | ||
93 | |||
94 | if (use_bounce_buffer) { | ||
95 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkio_co_flush(BlockDriverState *bs) | ||
96 | |||
97 | WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { | ||
98 | blkioq_flush(s->blkioq, &cod, 0); | ||
99 | - blkio_submit_io(bs); | ||
100 | } | ||
101 | |||
102 | + blkio_submit_io(bs); | ||
103 | qemu_coroutine_yield(); | ||
104 | return cod.ret; | ||
105 | } | ||
106 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs, | ||
107 | |||
108 | WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { | ||
109 | blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags); | ||
110 | - blkio_submit_io(bs); | ||
111 | } | ||
112 | |||
113 | + blkio_submit_io(bs); | ||
114 | qemu_coroutine_yield(); | ||
115 | return cod.ret; | ||
116 | } | ||
117 | |||
118 | -static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs) | ||
119 | -{ | ||
120 | - BDRVBlkioState *s = bs->opaque; | ||
121 | - | ||
122 | - WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { | ||
123 | - blkio_submit_io(bs); | ||
124 | - } | ||
125 | -} | ||
126 | - | ||
127 | typedef enum { | ||
128 | BMRR_OK, | ||
129 | BMRR_SKIP, | ||
130 | @@ -XXX,XX +XXX,XX @@ static void blkio_refresh_limits(BlockDriverState *bs, Error **errp) | ||
131 | .bdrv_co_pwritev = blkio_co_pwritev, \ | ||
132 | .bdrv_co_flush_to_disk = blkio_co_flush, \ | ||
133 | .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \ | ||
134 | - .bdrv_co_io_unplug = blkio_co_io_unplug, \ | ||
135 | .bdrv_refresh_limits = blkio_refresh_limits, \ | ||
136 | .bdrv_register_buf = blkio_register_buf, \ | ||
137 | .bdrv_unregister_buf = blkio_unregister_buf, \ | ||
195 | -- | 138 | -- |
196 | 2.26.2 | 139 | 2.40.1 |
197 | diff view generated by jsdifflib |
Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

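As the diff below shows, luring_do_submit() keeps an immediate-submission
path: once in_flight + in_queue reaches MAX_ENTRIES the ring is flushed right
away, and only below that threshold is luring_unplug_fn() deferred via
blk_io_plug_call(). Condensed from the diff:

    if (!s->io_q.blocked) {
        if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
            ret = ioq_submit(s);    /* ring full: flush now */
            return ret;
        }
        blk_io_plug_call(luring_unplug_fn, s);  /* otherwise batch until unplug */
    }
    return 0;
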
5 | 4 | ||
6 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 5 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
7 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 6 | Reviewed-by: Eric Blake <eblake@redhat.com> |
8 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 7 | Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> |
9 | Message-id: 20200617132201.1832152-7-stefanha@redhat.com | 8 | Acked-by: Kevin Wolf <kwolf@redhat.com> |
9 | Message-id: 20230530180959.1108766-5-stefanha@redhat.com | ||
10 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 10 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
11 | --- | 11 | --- |
12 | block/nvme.c | 70 ++++++++++++++++++++++++++++------------------------ | 12 | include/block/raw-aio.h | 7 ------- |
13 | 1 file changed, 38 insertions(+), 32 deletions(-) | 13 | block/file-posix.c | 10 ---------- |
14 | block/io_uring.c | 44 ++++++++++++++++------------------------- | ||
15 | block/trace-events | 5 ++--- | ||
16 | 4 files changed, 19 insertions(+), 47 deletions(-) | ||
14 | 17 | ||
15 | diff --git a/block/nvme.c b/block/nvme.c | 18 | diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h |
16 | index XXXXXXX..XXXXXXX 100644 | 19 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/nvme.c | 20 | --- a/include/block/raw-aio.h |
18 | +++ b/block/nvme.c | 21 | +++ b/include/block/raw-aio.h |
22 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset, | ||
23 | QEMUIOVector *qiov, int type); | ||
24 | void luring_detach_aio_context(LuringState *s, AioContext *old_context); | ||
25 | void luring_attach_aio_context(LuringState *s, AioContext *new_context); | ||
26 | - | ||
27 | -/* | ||
28 | - * luring_io_plug/unplug work in the thread's current AioContext, therefore the | ||
29 | - * caller must ensure that they are paired in the same IOThread. | ||
30 | - */ | ||
31 | -void luring_io_plug(void); | ||
32 | -void luring_io_unplug(void); | ||
33 | #endif | ||
34 | |||
35 | #ifdef _WIN32 | ||
36 | diff --git a/block/file-posix.c b/block/file-posix.c | ||
37 | index XXXXXXX..XXXXXXX 100644 | ||
38 | --- a/block/file-posix.c | ||
39 | +++ b/block/file-posix.c | ||
40 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn raw_co_io_plug(BlockDriverState *bs) | ||
41 | laio_io_plug(); | ||
42 | } | ||
43 | #endif | ||
44 | -#ifdef CONFIG_LINUX_IO_URING | ||
45 | - if (s->use_linux_io_uring) { | ||
46 | - luring_io_plug(); | ||
47 | - } | ||
48 | -#endif | ||
49 | } | ||
50 | |||
51 | static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs) | ||
52 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs) | ||
53 | laio_io_unplug(s->aio_max_batch); | ||
54 | } | ||
55 | #endif | ||
56 | -#ifdef CONFIG_LINUX_IO_URING | ||
57 | - if (s->use_linux_io_uring) { | ||
58 | - luring_io_unplug(); | ||
59 | - } | ||
60 | -#endif | ||
61 | } | ||
62 | |||
63 | static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs) | ||
64 | diff --git a/block/io_uring.c b/block/io_uring.c | ||
65 | index XXXXXXX..XXXXXXX 100644 | ||
66 | --- a/block/io_uring.c | ||
67 | +++ b/block/io_uring.c | ||
19 | @@ -XXX,XX +XXX,XX @@ | 68 | @@ -XXX,XX +XXX,XX @@ |
20 | */ | 69 | #include "block/raw-aio.h" |
21 | #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1) | 70 | #include "qemu/coroutine.h" |
22 | 71 | #include "qapi/error.h" | |
23 | +typedef struct BDRVNVMeState BDRVNVMeState; | 72 | +#include "sysemu/block-backend.h" |
24 | + | 73 | #include "trace.h" |
25 | typedef struct { | 74 | |
26 | int32_t head, tail; | 75 | /* Only used for assertions. */ |
27 | uint8_t *queue; | 76 | @@ -XXX,XX +XXX,XX @@ typedef struct LuringAIOCB { |
28 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 77 | } LuringAIOCB; |
29 | typedef struct { | 78 | |
30 | QemuMutex lock; | 79 | typedef struct LuringQueue { |
31 | 80 | - int plugged; | |
32 | + /* Read from I/O code path, initialized under BQL */ | 81 | unsigned int in_queue; |
33 | + BDRVNVMeState *s; | 82 | unsigned int in_flight; |
34 | + int index; | 83 | bool blocked; |
35 | + | 84 | @@ -XXX,XX +XXX,XX @@ static void luring_process_completions_and_submit(LuringState *s) |
36 | /* Fields protected by BQL */ | 85 | { |
37 | - int index; | 86 | luring_process_completions(s); |
38 | uint8_t *prp_list_pages; | 87 | |
39 | 88 | - if (!s->io_q.plugged && s->io_q.in_queue > 0) { | |
40 | /* Fields protected by @lock */ | 89 | + if (s->io_q.in_queue > 0) { |
41 | @@ -XXX,XX +XXX,XX @@ typedef volatile struct { | 90 | ioq_submit(s); |
42 | |||
43 | QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000); | ||
44 | |||
45 | -typedef struct { | ||
46 | +struct BDRVNVMeState { | ||
47 | AioContext *aio_context; | ||
48 | QEMUVFIOState *vfio; | ||
49 | NVMeRegs *regs; | ||
50 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
51 | |||
52 | /* PCI address (required for nvme_refresh_filename()) */ | ||
53 | char *device; | ||
54 | -} BDRVNVMeState; | ||
55 | +}; | ||
56 | |||
57 | #define NVME_BLOCK_OPT_DEVICE "device" | ||
58 | #define NVME_BLOCK_OPT_NAMESPACE "namespace" | ||
59 | @@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q, | ||
60 | } | 91 | } |
61 | } | 92 | } |
62 | 93 | @@ -XXX,XX +XXX,XX @@ static void qemu_luring_poll_ready(void *opaque) | |
63 | -static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q) | 94 | static void ioq_init(LuringQueue *io_q) |
64 | +static void nvme_free_queue_pair(NVMeQueuePair *q) | ||
65 | { | 95 | { |
66 | qemu_vfree(q->prp_list_pages); | 96 | QSIMPLEQ_INIT(&io_q->submit_queue); |
67 | qemu_vfree(q->sq.queue); | 97 | - io_q->plugged = 0; |
68 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | 98 | io_q->in_queue = 0; |
69 | uint64_t prp_list_iova; | 99 | io_q->in_flight = 0; |
70 | 100 | io_q->blocked = false; | |
71 | qemu_mutex_init(&q->lock); | ||
72 | + q->s = s; | ||
73 | q->index = idx; | ||
74 | qemu_co_queue_init(&q->free_req_queue); | ||
75 | q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); | ||
76 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | ||
77 | |||
78 | return q; | ||
79 | fail: | ||
80 | - nvme_free_queue_pair(bs, q); | ||
81 | + nvme_free_queue_pair(q); | ||
82 | return NULL; | ||
83 | } | 101 | } |
84 | 102 | ||
85 | /* With q->lock */ | 103 | -void luring_io_plug(void) |
86 | -static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q) | 104 | +static void luring_unplug_fn(void *opaque) |
87 | +static void nvme_kick(NVMeQueuePair *q) | ||
88 | { | 105 | { |
89 | + BDRVNVMeState *s = q->s; | 106 | - AioContext *ctx = qemu_get_current_aio_context(); |
90 | + | 107 | - LuringState *s = aio_get_linux_io_uring(ctx); |
91 | if (s->plugged || !q->need_kick) { | 108 | - trace_luring_io_plug(s); |
92 | return; | 109 | - s->io_q.plugged++; |
93 | } | 110 | -} |
94 | @@ -XXX,XX +XXX,XX @@ static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req) | 111 | - |
95 | } | 112 | -void luring_io_unplug(void) |
96 | 113 | -{ | |
97 | /* With q->lock */ | 114 | - AioContext *ctx = qemu_get_current_aio_context(); |
98 | -static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q) | 115 | - LuringState *s = aio_get_linux_io_uring(ctx); |
99 | +static void nvme_wake_free_req_locked(NVMeQueuePair *q) | 116 | - assert(s->io_q.plugged); |
100 | { | 117 | - trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged, |
101 | if (!qemu_co_queue_empty(&q->free_req_queue)) { | 118 | - s->io_q.in_queue, s->io_q.in_flight); |
102 | - replay_bh_schedule_oneshot_event(s->aio_context, | 119 | - if (--s->io_q.plugged == 0 && |
103 | + replay_bh_schedule_oneshot_event(q->s->aio_context, | 120 | - !s->io_q.blocked && s->io_q.in_queue > 0) { |
104 | nvme_free_req_queue_cb, q); | 121 | + LuringState *s = opaque; |
122 | + trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue, | ||
123 | + s->io_q.in_flight); | ||
124 | + if (!s->io_q.blocked && s->io_q.in_queue > 0) { | ||
125 | ioq_submit(s); | ||
105 | } | 126 | } |
106 | } | 127 | } |
107 | 128 | @@ -XXX,XX +XXX,XX @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, | |
108 | /* Insert a request in the freelist and wake waiters */ | 129 | |
109 | -static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q, | 130 | QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next); |
110 | - NVMeRequest *req) | 131 | s->io_q.in_queue++; |
111 | +static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req) | 132 | - trace_luring_do_submit(s, s->io_q.blocked, s->io_q.plugged, |
112 | { | 133 | - s->io_q.in_queue, s->io_q.in_flight); |
113 | qemu_mutex_lock(&q->lock); | 134 | - if (!s->io_q.blocked && |
114 | nvme_put_free_req_locked(q, req); | 135 | - (!s->io_q.plugged || |
115 | - nvme_wake_free_req_locked(s, q); | 136 | - s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES)) { |
116 | + nvme_wake_free_req_locked(q); | 137 | - ret = ioq_submit(s); |
117 | qemu_mutex_unlock(&q->lock); | 138 | - trace_luring_do_submit_done(s, ret); |
139 | - return ret; | ||
140 | + trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue, | ||
141 | + s->io_q.in_flight); | ||
142 | + if (!s->io_q.blocked) { | ||
143 | + if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) { | ||
144 | + ret = ioq_submit(s); | ||
145 | + trace_luring_do_submit_done(s, ret); | ||
146 | + return ret; | ||
147 | + } | ||
148 | + | ||
149 | + blk_io_plug_call(luring_unplug_fn, s); | ||
150 | } | ||
151 | return 0; | ||
118 | } | 152 | } |
119 | 153 | diff --git a/block/trace-events b/block/trace-events | |
120 | @@ -XXX,XX +XXX,XX @@ static inline int nvme_translate_error(const NvmeCqe *c) | 154 | index XXXXXXX..XXXXXXX 100644 |
121 | } | 155 | --- a/block/trace-events |
122 | 156 | +++ b/block/trace-events | |
123 | /* With q->lock */ | 157 | @@ -XXX,XX +XXX,XX @@ file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) " |
124 | -static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | 158 | # io_uring.c |
125 | +static bool nvme_process_completion(NVMeQueuePair *q) | 159 | luring_init_state(void *s, size_t size) "s %p size %zu" |
126 | { | 160 | luring_cleanup_state(void *s) "%p freed" |
127 | + BDRVNVMeState *s = q->s; | 161 | -luring_io_plug(void *s) "LuringState %p plug" |
128 | bool progress = false; | 162 | -luring_io_unplug(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d" |
129 | NVMeRequest *preq; | 163 | -luring_do_submit(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d" |
130 | NVMeRequest req; | 164 | +luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d" |
131 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | 165 | +luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d" |
132 | /* Notify the device so it can post more completions. */ | 166 | luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d" |
133 | smp_mb_release(); | 167 | luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d" |
134 | *q->cq.doorbell = cpu_to_le32(q->cq.head); | 168 | luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d" |
135 | - nvme_wake_free_req_locked(s, q); | ||
136 | + nvme_wake_free_req_locked(q); | ||
137 | } | ||
138 | q->busy = false; | ||
139 | return progress; | ||
140 | @@ -XXX,XX +XXX,XX @@ static void nvme_trace_command(const NvmeCmd *cmd) | ||
141 | } | ||
142 | } | ||
143 | |||
144 | -static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q, | ||
145 | - NVMeRequest *req, | ||
146 | +static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, | ||
147 | NvmeCmd *cmd, BlockCompletionFunc cb, | ||
148 | void *opaque) | ||
149 | { | ||
150 | @@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q, | ||
151 | req->opaque = opaque; | ||
152 | cmd->cid = cpu_to_le32(req->cid); | ||
153 | |||
154 | - trace_nvme_submit_command(s, q->index, req->cid); | ||
155 | + trace_nvme_submit_command(q->s, q->index, req->cid); | ||
156 | nvme_trace_command(cmd); | ||
157 | qemu_mutex_lock(&q->lock); | ||
158 | memcpy((uint8_t *)q->sq.queue + | ||
159 | q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd)); | ||
160 | q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE; | ||
161 | q->need_kick++; | ||
162 | - nvme_kick(s, q); | ||
163 | - nvme_process_completion(s, q); | ||
164 | + nvme_kick(q); | ||
165 | + nvme_process_completion(q); | ||
166 | qemu_mutex_unlock(&q->lock); | ||
167 | } | ||
168 | |||
169 | @@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, | ||
170 | NvmeCmd *cmd) | ||
171 | { | ||
172 | NVMeRequest *req; | ||
173 | - BDRVNVMeState *s = bs->opaque; | ||
174 | int ret = -EINPROGRESS; | ||
175 | req = nvme_get_free_req(q); | ||
176 | if (!req) { | ||
177 | return -EBUSY; | ||
178 | } | ||
179 | - nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret); | ||
180 | + nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret); | ||
181 | |||
182 | BDRV_POLL_WHILE(bs, ret == -EINPROGRESS); | ||
183 | return ret; | ||
184 | @@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s) | ||
185 | } | ||
186 | |||
187 | qemu_mutex_lock(&q->lock); | ||
188 | - while (nvme_process_completion(s, q)) { | ||
189 | + while (nvme_process_completion(q)) { | ||
190 | /* Keep polling */ | ||
191 | progress = true; | ||
192 | } | ||
193 | @@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) | ||
194 | }; | ||
195 | if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { | ||
196 | error_setg(errp, "Failed to create io queue [%d]", n); | ||
197 | - nvme_free_queue_pair(bs, q); | ||
198 | + nvme_free_queue_pair(q); | ||
199 | return false; | ||
200 | } | ||
201 | cmd = (NvmeCmd) { | ||
202 | @@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) | ||
203 | }; | ||
204 | if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { | ||
205 | error_setg(errp, "Failed to create io queue [%d]", n); | ||
206 | - nvme_free_queue_pair(bs, q); | ||
207 | + nvme_free_queue_pair(q); | ||
208 | return false; | ||
209 | } | ||
210 | s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); | ||
211 | @@ -XXX,XX +XXX,XX @@ static void nvme_close(BlockDriverState *bs) | ||
212 | BDRVNVMeState *s = bs->opaque; | ||
213 | |||
214 | for (i = 0; i < s->nr_queues; ++i) { | ||
215 | - nvme_free_queue_pair(bs, s->queues[i]); | ||
216 | + nvme_free_queue_pair(s->queues[i]); | ||
217 | } | ||
218 | g_free(s->queues); | ||
219 | aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, | ||
220 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, | ||
221 | r = nvme_cmd_map_qiov(bs, &cmd, req, qiov); | ||
222 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
223 | if (r) { | ||
224 | - nvme_put_free_req_and_wake(s, ioq, req); | ||
225 | + nvme_put_free_req_and_wake(ioq, req); | ||
226 | return r; | ||
227 | } | ||
228 | - nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
229 | + nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); | ||
230 | |||
231 | data.co = qemu_coroutine_self(); | ||
232 | while (data.ret == -EINPROGRESS) { | ||
233 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs) | ||
234 | assert(s->nr_queues > 1); | ||
235 | req = nvme_get_free_req(ioq); | ||
236 | assert(req); | ||
237 | - nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
238 | + nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); | ||
239 | |||
240 | data.co = qemu_coroutine_self(); | ||
241 | if (data.ret == -EINPROGRESS) { | ||
242 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, | ||
243 | req = nvme_get_free_req(ioq); | ||
244 | assert(req); | ||
245 | |||
246 | - nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
247 | + nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); | ||
248 | |||
249 | data.co = qemu_coroutine_self(); | ||
250 | while (data.ret == -EINPROGRESS) { | ||
251 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, | ||
252 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
253 | |||
254 | if (ret) { | ||
255 | - nvme_put_free_req_and_wake(s, ioq, req); | ||
256 | + nvme_put_free_req_and_wake(ioq, req); | ||
257 | goto out; | ||
258 | } | ||
259 | |||
260 | trace_nvme_dsm(s, offset, bytes); | ||
261 | |||
262 | - nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
263 | + nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); | ||
264 | |||
265 | data.co = qemu_coroutine_self(); | ||
266 | while (data.ret == -EINPROGRESS) { | ||
267 | @@ -XXX,XX +XXX,XX @@ static void nvme_aio_unplug(BlockDriverState *bs) | ||
268 | for (i = 1; i < s->nr_queues; i++) { | ||
269 | NVMeQueuePair *q = s->queues[i]; | ||
270 | qemu_mutex_lock(&q->lock); | ||
271 | - nvme_kick(s, q); | ||
272 | - nvme_process_completion(s, q); | ||
273 | + nvme_kick(q); | ||
274 | + nvme_process_completion(q); | ||
275 | qemu_mutex_unlock(&q->lock); | ||
276 | } | ||
277 | } | ||
278 | -- | 169 | -- |
279 | 2.26.2 | 170 | 2.40.1 |
1 | Existing users access free_req_queue under q->lock. Document this. | 1 | Stop using the .bdrv_co_io_plug() API because it is not multi-queue |
---|---|---|---|
2 | block layer friendly. Use the new blk_io_plug_call() API to batch I/O | ||
3 | submission instead. | ||
4 | |||
5 | Note that a dev_max_batch check is dropped in laio_io_unplug() because | ||
6 | the semantics of unplug_fn() are different from .bdrv_co_io_unplug(): | ||
7 | 1. unplug_fn() is only called when the last blk_io_unplug() call occurs, | ||
8 | not every time blk_io_unplug() is called. | ||
9 | 2. unplug_fn() is per-thread, not per-BlockDriverState, so there is no | ||
10 | way to get per-BlockDriverState fields like dev_max_batch. | ||
11 | |||
12 | Therefore this condition cannot be moved to laio_unplug_fn(). It is not | ||
13 | obvious that this condition affects performance in practice, so I am | ||
14 | removing it instead of trying to come up with a more complex mechanism | ||
15 | to preserve the condition. | ||
2 | 16 | ||
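To make the conversion pattern concrete, here is a minimal sketch of how a driver defers submission with blk_io_plug_call() (the function and the unplug_fn semantics come from this series; MyState, my_submit() and the pending-request counter are hypothetical stand-ins):

    #include "sysemu/block-backend.h"   /* declares blk_io_plug_call() */

    typedef struct {
        unsigned int in_queue;          /* hypothetical pending-request count */
    } MyState;

    /* Runs once per batch, in the submitting thread, when the outermost
     * blk_io_unplug() is reached (or immediately if no plug is active). */
    static void my_unplug_fn(void *opaque)
    {
        MyState *s = opaque;

        if (s->in_queue > 0) {
            my_submit(s);               /* hypothetical submission helper */
        }
    }

    static void my_queue_req(MyState *s)
    {
        s->in_queue++;
        /* Registering the same fn/opaque pair repeatedly coalesces into a
         * single deferred call, however many requests were queued. */
        blk_io_plug_call(my_unplug_fn, s);
    }

Callers still bracket batches with blk_io_plug()/blk_io_unplug(); because the callback list is thread-local, no BlockDriverState or AioContext plumbing is needed.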
3 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 17 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
4 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 18 | Reviewed-by: Eric Blake <eblake@redhat.com> |
5 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 19 | Acked-by: Kevin Wolf <kwolf@redhat.com> |
6 | Message-id: 20200617132201.1832152-6-stefanha@redhat.com | 20 | Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> |
21 | Message-id: 20230530180959.1108766-6-stefanha@redhat.com | ||
7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 22 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
8 | --- | 23 | --- |
9 | block/nvme.c | 2 +- | 24 | include/block/raw-aio.h | 7 ------- |
10 | 1 file changed, 1 insertion(+), 1 deletion(-) | 25 | block/file-posix.c | 28 ---------------------------- |
11 | 26 | block/linux-aio.c | 41 +++++++++++------------------------------ | |
12 | diff --git a/block/nvme.c b/block/nvme.c | 27 | 3 files changed, 11 insertions(+), 65 deletions(-) |
28 | |||
29 | diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h | ||
13 | index XXXXXXX..XXXXXXX 100644 | 30 | index XXXXXXX..XXXXXXX 100644 |
14 | --- a/block/nvme.c | 31 | --- a/include/block/raw-aio.h |
15 | +++ b/block/nvme.c | 32 | +++ b/include/block/raw-aio.h |
16 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 33 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov, |
17 | } NVMeRequest; | 34 | |
35 | void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context); | ||
36 | void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context); | ||
37 | - | ||
38 | -/* | ||
39 | - * laio_io_plug/unplug work in the thread's current AioContext, therefore the | ||
40 | - * caller must ensure that they are paired in the same IOThread. | ||
41 | - */ | ||
42 | -void laio_io_plug(void); | ||
43 | -void laio_io_unplug(uint64_t dev_max_batch); | ||
44 | #endif | ||
45 | /* io_uring.c - Linux io_uring implementation */ | ||
46 | #ifdef CONFIG_LINUX_IO_URING | ||
47 | diff --git a/block/file-posix.c b/block/file-posix.c | ||
48 | index XXXXXXX..XXXXXXX 100644 | ||
49 | --- a/block/file-posix.c | ||
50 | +++ b/block/file-posix.c | ||
51 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, | ||
52 | return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); | ||
53 | } | ||
54 | |||
55 | -static void coroutine_fn raw_co_io_plug(BlockDriverState *bs) | ||
56 | -{ | ||
57 | - BDRVRawState __attribute__((unused)) *s = bs->opaque; | ||
58 | -#ifdef CONFIG_LINUX_AIO | ||
59 | - if (s->use_linux_aio) { | ||
60 | - laio_io_plug(); | ||
61 | - } | ||
62 | -#endif | ||
63 | -} | ||
64 | - | ||
65 | -static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs) | ||
66 | -{ | ||
67 | - BDRVRawState __attribute__((unused)) *s = bs->opaque; | ||
68 | -#ifdef CONFIG_LINUX_AIO | ||
69 | - if (s->use_linux_aio) { | ||
70 | - laio_io_unplug(s->aio_max_batch); | ||
71 | - } | ||
72 | -#endif | ||
73 | -} | ||
74 | - | ||
75 | static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs) | ||
76 | { | ||
77 | BDRVRawState *s = bs->opaque; | ||
78 | @@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_file = { | ||
79 | .bdrv_co_copy_range_from = raw_co_copy_range_from, | ||
80 | .bdrv_co_copy_range_to = raw_co_copy_range_to, | ||
81 | .bdrv_refresh_limits = raw_refresh_limits, | ||
82 | - .bdrv_co_io_plug = raw_co_io_plug, | ||
83 | - .bdrv_co_io_unplug = raw_co_io_unplug, | ||
84 | .bdrv_attach_aio_context = raw_aio_attach_aio_context, | ||
85 | |||
86 | .bdrv_co_truncate = raw_co_truncate, | ||
87 | @@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = { | ||
88 | .bdrv_co_copy_range_from = raw_co_copy_range_from, | ||
89 | .bdrv_co_copy_range_to = raw_co_copy_range_to, | ||
90 | .bdrv_refresh_limits = raw_refresh_limits, | ||
91 | - .bdrv_co_io_plug = raw_co_io_plug, | ||
92 | - .bdrv_co_io_unplug = raw_co_io_unplug, | ||
93 | .bdrv_attach_aio_context = raw_aio_attach_aio_context, | ||
94 | |||
95 | .bdrv_co_truncate = raw_co_truncate, | ||
96 | @@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_cdrom = { | ||
97 | .bdrv_co_pwritev = raw_co_pwritev, | ||
98 | .bdrv_co_flush_to_disk = raw_co_flush_to_disk, | ||
99 | .bdrv_refresh_limits = cdrom_refresh_limits, | ||
100 | - .bdrv_co_io_plug = raw_co_io_plug, | ||
101 | - .bdrv_co_io_unplug = raw_co_io_unplug, | ||
102 | .bdrv_attach_aio_context = raw_aio_attach_aio_context, | ||
103 | |||
104 | .bdrv_co_truncate = raw_co_truncate, | ||
105 | @@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_cdrom = { | ||
106 | .bdrv_co_pwritev = raw_co_pwritev, | ||
107 | .bdrv_co_flush_to_disk = raw_co_flush_to_disk, | ||
108 | .bdrv_refresh_limits = cdrom_refresh_limits, | ||
109 | - .bdrv_co_io_plug = raw_co_io_plug, | ||
110 | - .bdrv_co_io_unplug = raw_co_io_unplug, | ||
111 | .bdrv_attach_aio_context = raw_aio_attach_aio_context, | ||
112 | |||
113 | .bdrv_co_truncate = raw_co_truncate, | ||
114 | diff --git a/block/linux-aio.c b/block/linux-aio.c | ||
115 | index XXXXXXX..XXXXXXX 100644 | ||
116 | --- a/block/linux-aio.c | ||
117 | +++ b/block/linux-aio.c | ||
118 | @@ -XXX,XX +XXX,XX @@ | ||
119 | #include "qemu/event_notifier.h" | ||
120 | #include "qemu/coroutine.h" | ||
121 | #include "qapi/error.h" | ||
122 | +#include "sysemu/block-backend.h" | ||
123 | |||
124 | /* Only used for assertions. */ | ||
125 | #include "qemu/coroutine_int.h" | ||
126 | @@ -XXX,XX +XXX,XX @@ struct qemu_laiocb { | ||
127 | }; | ||
18 | 128 | ||
19 | typedef struct { | 129 | typedef struct { |
20 | - CoQueue free_req_queue; | 130 | - int plugged; |
21 | QemuMutex lock; | 131 | unsigned int in_queue; |
22 | 132 | unsigned int in_flight; | |
23 | /* Fields protected by BQL */ | 133 | bool blocked; |
24 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 134 | @@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completions_and_submit(LinuxAioState *s) |
25 | uint8_t *prp_list_pages; | 135 | { |
26 | 136 | qemu_laio_process_completions(s); | |
27 | /* Fields protected by @lock */ | 137 | |
28 | + CoQueue free_req_queue; | 138 | - if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { |
29 | NVMeQueue sq, cq; | 139 | + if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) { |
30 | int cq_phase; | 140 | ioq_submit(s); |
31 | int free_req_head; | 141 | } |
142 | } | ||
143 | @@ -XXX,XX +XXX,XX @@ static void qemu_laio_poll_ready(EventNotifier *opaque) | ||
144 | static void ioq_init(LaioQueue *io_q) | ||
145 | { | ||
146 | QSIMPLEQ_INIT(&io_q->pending); | ||
147 | - io_q->plugged = 0; | ||
148 | io_q->in_queue = 0; | ||
149 | io_q->in_flight = 0; | ||
150 | io_q->blocked = false; | ||
151 | @@ -XXX,XX +XXX,XX @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch) | ||
152 | return max_batch; | ||
153 | } | ||
154 | |||
155 | -void laio_io_plug(void) | ||
156 | +static void laio_unplug_fn(void *opaque) | ||
157 | { | ||
158 | - AioContext *ctx = qemu_get_current_aio_context(); | ||
159 | - LinuxAioState *s = aio_get_linux_aio(ctx); | ||
160 | + LinuxAioState *s = opaque; | ||
161 | |||
162 | - s->io_q.plugged++; | ||
163 | -} | ||
164 | - | ||
165 | -void laio_io_unplug(uint64_t dev_max_batch) | ||
166 | -{ | ||
167 | - AioContext *ctx = qemu_get_current_aio_context(); | ||
168 | - LinuxAioState *s = aio_get_linux_aio(ctx); | ||
169 | - | ||
170 | - assert(s->io_q.plugged); | ||
171 | - s->io_q.plugged--; | ||
172 | - | ||
173 | - /* | ||
174 | - * Why max batch checking is performed here: | ||
175 | - * Another BDS may have queued requests with a higher dev_max_batch and | ||
176 | - * therefore in_queue could now exceed our dev_max_batch. Re-check the max | ||
177 | - * batch so we can honor our device's dev_max_batch. | ||
178 | - */ | ||
179 | - if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) || | ||
180 | - (!s->io_q.plugged && | ||
181 | - !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) { | ||
182 | + if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { | ||
183 | ioq_submit(s); | ||
184 | } | ||
185 | } | ||
186 | @@ -XXX,XX +XXX,XX @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, | ||
187 | |||
188 | QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next); | ||
189 | s->io_q.in_queue++; | ||
190 | - if (!s->io_q.blocked && | ||
191 | - (!s->io_q.plugged || | ||
192 | - s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) { | ||
193 | - ioq_submit(s); | ||
194 | + if (!s->io_q.blocked) { | ||
195 | + if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) { | ||
196 | + ioq_submit(s); | ||
197 | + } else { | ||
198 | + blk_io_plug_call(laio_unplug_fn, s); | ||
199 | + } | ||
200 | } | ||
201 | |||
202 | return 0; | ||
32 | -- | 203 | -- |
33 | 2.26.2 | 204 | 2.40.1 |
1 | nvme_process_completion() explicitly checks cid so the assertion that | 1 | No block driver implements .bdrv_co_io_plug() anymore. Get rid of the |
---|---|---|---|
2 | follows is always true: | 2 | function pointers. |
3 | |||
4 | if (cid == 0 || cid > NVME_QUEUE_SIZE) { | ||
5 | ... | ||
6 | continue; | ||
7 | } | ||
8 | assert(cid <= NVME_QUEUE_SIZE); | ||
9 | 3 | ||
10 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 4 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
11 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 5 | Reviewed-by: Eric Blake <eblake@redhat.com> |
12 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 6 | Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> |
13 | Message-id: 20200617132201.1832152-3-stefanha@redhat.com | 7 | Acked-by: Kevin Wolf <kwolf@redhat.com> |
8 | Message-id: 20230530180959.1108766-7-stefanha@redhat.com | ||
14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 9 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
15 | --- | 10 | --- |
16 | block/nvme.c | 1 - | 11 | include/block/block-io.h | 3 --- |
17 | 1 file changed, 1 deletion(-) | 12 | include/block/block_int-common.h | 11 ---------- |
13 | block/io.c | 37 -------------------------------- | ||
14 | 3 files changed, 51 deletions(-) | ||
18 | 15 | ||
19 | diff --git a/block/nvme.c b/block/nvme.c | 16 | diff --git a/include/block/block-io.h b/include/block/block-io.h |
20 | index XXXXXXX..XXXXXXX 100644 | 17 | index XXXXXXX..XXXXXXX 100644 |
21 | --- a/block/nvme.c | 18 | --- a/include/block/block-io.h |
22 | +++ b/block/nvme.c | 19 | +++ b/include/block/block-io.h |
23 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | 20 | @@ -XXX,XX +XXX,XX @@ void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx); |
24 | cid); | 21 | |
25 | continue; | 22 | AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c); |
26 | } | 23 | |
27 | - assert(cid <= NVME_QUEUE_SIZE); | 24 | -void coroutine_fn GRAPH_RDLOCK bdrv_co_io_plug(BlockDriverState *bs); |
28 | trace_nvme_complete_command(s, q->index, cid); | 25 | -void coroutine_fn GRAPH_RDLOCK bdrv_co_io_unplug(BlockDriverState *bs); |
29 | preq = &q->reqs[cid - 1]; | 26 | - |
30 | req = *preq; | 27 | bool coroutine_fn GRAPH_RDLOCK |
28 | bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, | ||
29 | uint32_t granularity, Error **errp); | ||
30 | diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h | ||
31 | index XXXXXXX..XXXXXXX 100644 | ||
32 | --- a/include/block/block_int-common.h | ||
33 | +++ b/include/block/block_int-common.h | ||
34 | @@ -XXX,XX +XXX,XX @@ struct BlockDriver { | ||
35 | void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_debug_event)( | ||
36 | BlockDriverState *bs, BlkdebugEvent event); | ||
37 | |||
38 | - /* io queue for linux-aio */ | ||
39 | - void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_plug)(BlockDriverState *bs); | ||
40 | - void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_unplug)( | ||
41 | - BlockDriverState *bs); | ||
42 | - | ||
43 | bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs); | ||
44 | |||
45 | bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_can_store_new_dirty_bitmap)( | ||
46 | @@ -XXX,XX +XXX,XX @@ struct BlockDriverState { | ||
47 | unsigned int in_flight; | ||
48 | unsigned int serialising_in_flight; | ||
49 | |||
50 | - /* | ||
51 | - * counter for nested bdrv_io_plug. | ||
52 | - * Accessed with atomic ops. | ||
53 | - */ | ||
54 | - unsigned io_plugged; | ||
55 | - | ||
56 | /* do we need to tell the quest if we have a volatile write cache? */ | ||
57 | int enable_write_cache; | ||
58 | |||
59 | diff --git a/block/io.c b/block/io.c | ||
60 | index XXXXXXX..XXXXXXX 100644 | ||
61 | --- a/block/io.c | ||
62 | +++ b/block/io.c | ||
63 | @@ -XXX,XX +XXX,XX @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) | ||
64 | return mem; | ||
65 | } | ||
66 | |||
67 | -void coroutine_fn bdrv_co_io_plug(BlockDriverState *bs) | ||
68 | -{ | ||
69 | - BdrvChild *child; | ||
70 | - IO_CODE(); | ||
71 | - assert_bdrv_graph_readable(); | ||
72 | - | ||
73 | - QLIST_FOREACH(child, &bs->children, next) { | ||
74 | - bdrv_co_io_plug(child->bs); | ||
75 | - } | ||
76 | - | ||
77 | - if (qatomic_fetch_inc(&bs->io_plugged) == 0) { | ||
78 | - BlockDriver *drv = bs->drv; | ||
79 | - if (drv && drv->bdrv_co_io_plug) { | ||
80 | - drv->bdrv_co_io_plug(bs); | ||
81 | - } | ||
82 | - } | ||
83 | -} | ||
84 | - | ||
85 | -void coroutine_fn bdrv_co_io_unplug(BlockDriverState *bs) | ||
86 | -{ | ||
87 | - BdrvChild *child; | ||
88 | - IO_CODE(); | ||
89 | - assert_bdrv_graph_readable(); | ||
90 | - | ||
91 | - assert(bs->io_plugged); | ||
92 | - if (qatomic_fetch_dec(&bs->io_plugged) == 1) { | ||
93 | - BlockDriver *drv = bs->drv; | ||
94 | - if (drv && drv->bdrv_co_io_unplug) { | ||
95 | - drv->bdrv_co_io_unplug(bs); | ||
96 | - } | ||
97 | - } | ||
98 | - | ||
99 | - QLIST_FOREACH(child, &bs->children, next) { | ||
100 | - bdrv_co_io_unplug(child->bs); | ||
101 | - } | ||
102 | -} | ||
103 | - | ||
104 | /* Helper that undoes bdrv_register_buf() when it fails partway through */ | ||
105 | static void GRAPH_RDLOCK | ||
106 | bdrv_register_buf_rollback(BlockDriverState *bs, void *host, size_t size, | ||
31 | -- | 107 | -- |
32 | 2.26.2 | 108 | 2.40.1 |
1 | From: Daniele Buono <dbuono@linux.vnet.ibm.com> | 1 | From: Stefano Garzarella <sgarzare@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | LLVM's SafeStack instrumentation does not yet support programs that make | 3 | Some virtio-blk drivers (e.g. virtio-blk-vhost-vdpa) support fd |
4 | use of the APIs in ucontext.h. | 4 | passing. Let's expose this to the user, so the management layer |
5 | With the current implementation of coroutine-ucontext, the resulting | 5 | can pass the file descriptor of an already opened path. |
6 | binary is incorrect, with different coroutines sharing the same unsafe | ||
7 | stack and producing undefined behavior at runtime. | ||
8 | This fix allocates an additional unsafe stack area for each coroutine, | ||
9 | and sets the new unsafe stack pointer before calling swapcontext() in | ||
10 | qemu_coroutine_new. | ||
11 | This is the only place where the pointer needs to be manually updated, | ||
12 | since sigsetjmp/siglongjmp are already instrumented by LLVM to properly | ||
13 | support SafeStack. | ||
14 | The additional stack is then freed in qemu_coroutine_delete. | ||
15 | 6 | ||
16 | Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com> | 7 | If the libblkio virtio-blk driver supports fd passing, let's always |
17 | Message-id: 20200529205122.714-2-dbuono@linux.vnet.ibm.com | 8 | use qemu_open() to open the `path`, so we can handle fd passing |
9 | from the management layer through the "/dev/fdset/N" special path. | ||
10 | |||
11 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> | ||
13 | Message-id: 20230530071941.8954-2-sgarzare@redhat.com | ||
18 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
19 | --- | 15 | --- |
20 | include/qemu/coroutine_int.h | 5 +++++ | 16 | block/blkio.c | 53 ++++++++++++++++++++++++++++++++++++++++++--------- |
21 | util/coroutine-ucontext.c | 28 ++++++++++++++++++++++++++++ | 17 | 1 file changed, 44 insertions(+), 9 deletions(-) |
22 | 2 files changed, 33 insertions(+) | ||
23 | 18 | ||
24 | diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h | 19 | diff --git a/block/blkio.c b/block/blkio.c |
25 | index XXXXXXX..XXXXXXX 100644 | 20 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/include/qemu/coroutine_int.h | 21 | --- a/block/blkio.c |
27 | +++ b/include/qemu/coroutine_int.h | 22 | +++ b/block/blkio.c |
28 | @@ -XXX,XX +XXX,XX @@ | 23 | @@ -XXX,XX +XXX,XX @@ static int blkio_virtio_blk_common_open(BlockDriverState *bs, |
29 | #include "qemu/queue.h" | 24 | { |
30 | #include "qemu/coroutine.h" | 25 | const char *path = qdict_get_try_str(options, "path"); |
31 | 26 | BDRVBlkioState *s = bs->opaque; | |
32 | +#ifdef CONFIG_SAFESTACK | 27 | - int ret; |
33 | +/* Pointer to the unsafe stack, defined by the compiler */ | 28 | + bool fd_supported = false; |
34 | +extern __thread void *__safestack_unsafe_stack_ptr; | 29 | + int fd, ret; |
35 | +#endif | 30 | |
31 | if (!path) { | ||
32 | error_setg(errp, "missing 'path' option"); | ||
33 | return -EINVAL; | ||
34 | } | ||
35 | |||
36 | - ret = blkio_set_str(s->blkio, "path", path); | ||
37 | - qdict_del(options, "path"); | ||
38 | - if (ret < 0) { | ||
39 | - error_setg_errno(errp, -ret, "failed to set path: %s", | ||
40 | - blkio_get_error_msg()); | ||
41 | - return ret; | ||
42 | - } | ||
43 | - | ||
44 | if (!(flags & BDRV_O_NOCACHE)) { | ||
45 | error_setg(errp, "cache.direct=off is not supported"); | ||
46 | return -EINVAL; | ||
47 | } | ||
36 | + | 48 | + |
37 | #define COROUTINE_STACK_SIZE (1 << 20) | 49 | + if (blkio_get_int(s->blkio, "fd", &fd) == 0) { |
38 | 50 | + fd_supported = true; | |
39 | typedef enum { | 51 | + } |
40 | diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c | ||
41 | index XXXXXXX..XXXXXXX 100644 | ||
42 | --- a/util/coroutine-ucontext.c | ||
43 | +++ b/util/coroutine-ucontext.c | ||
44 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
45 | Coroutine base; | ||
46 | void *stack; | ||
47 | size_t stack_size; | ||
48 | +#ifdef CONFIG_SAFESTACK | ||
49 | + /* Need an unsafe stack for each coroutine */ | ||
50 | + void *unsafe_stack; | ||
51 | + size_t unsafe_stack_size; | ||
52 | +#endif | ||
53 | sigjmp_buf env; | ||
54 | |||
55 | void *tsan_co_fiber; | ||
56 | @@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void) | ||
57 | co = g_malloc0(sizeof(*co)); | ||
58 | co->stack_size = COROUTINE_STACK_SIZE; | ||
59 | co->stack = qemu_alloc_stack(&co->stack_size); | ||
60 | +#ifdef CONFIG_SAFESTACK | ||
61 | + co->unsafe_stack_size = COROUTINE_STACK_SIZE; | ||
62 | + co->unsafe_stack = qemu_alloc_stack(&co->unsafe_stack_size); | ||
63 | +#endif | ||
64 | co->base.entry_arg = &old_env; /* stash away our jmp_buf */ | ||
65 | |||
66 | uc.uc_link = &old_uc; | ||
67 | @@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void) | ||
68 | COROUTINE_YIELD, | ||
69 | &fake_stack_save, | ||
70 | co->stack, co->stack_size, co->tsan_co_fiber); | ||
71 | + | 52 | + |
72 | +#ifdef CONFIG_SAFESTACK | 53 | + /* |
73 | + /* | 54 | + * If the libblkio driver supports fd passing, let's always use qemu_open() |
74 | + * Before we swap the context, set the new unsafe stack | 55 | + * to open the `path`, so we can handle fd passing from the management |
75 | + * The unsafe stack grows just like the normal stack, so start from | 56 | + * layer through the "/dev/fdset/N" special path. |
76 | + * the last usable location of the memory area. | 57 | + */ |
77 | + * NOTE: we don't have to re-set the usp afterwards because we are | 58 | + if (fd_supported) { |
78 | + * coming back to this context through a siglongjmp. | 59 | + int open_flags; |
79 | + * The compiler already wrapped the corresponding sigsetjmp call with | ||
80 | + * code that saves the usp on the (safe) stack before the call, and | ||
81 | + * restores it right after (which is where we return with siglongjmp). | ||
82 | + */ | ||
83 | + void *usp = co->unsafe_stack + co->unsafe_stack_size; | ||
84 | + __safestack_unsafe_stack_ptr = usp; | ||
85 | +#endif | ||
86 | + | 60 | + |
87 | swapcontext(&old_uc, &uc); | 61 | + if (flags & BDRV_O_RDWR) { |
88 | } | 62 | + open_flags = O_RDWR; |
89 | 63 | + } else { | |
90 | @@ -XXX,XX +XXX,XX @@ void qemu_coroutine_delete(Coroutine *co_) | 64 | + open_flags = O_RDONLY; |
91 | #endif | 65 | + } |
92 | 66 | + | |
93 | qemu_free_stack(co->stack, co->stack_size); | 67 | + fd = qemu_open(path, open_flags, errp); |
94 | +#ifdef CONFIG_SAFESTACK | 68 | + if (fd < 0) { |
95 | + qemu_free_stack(co->unsafe_stack, co->unsafe_stack_size); | 69 | + return -EINVAL; |
96 | +#endif | 70 | + } |
97 | g_free(co); | 71 | + |
72 | + ret = blkio_set_int(s->blkio, "fd", fd); | ||
73 | + if (ret < 0) { | ||
74 | + error_setg_errno(errp, -ret, "failed to set fd: %s", | ||
75 | + blkio_get_error_msg()); | ||
76 | + qemu_close(fd); | ||
77 | + return ret; | ||
78 | + } | ||
79 | + } else { | ||
80 | + ret = blkio_set_str(s->blkio, "path", path); | ||
81 | + if (ret < 0) { | ||
82 | + error_setg_errno(errp, -ret, "failed to set path: %s", | ||
83 | + blkio_get_error_msg()); | ||
84 | + return ret; | ||
85 | + } | ||
86 | + } | ||
87 | + | ||
88 | + qdict_del(options, "path"); | ||
89 | + | ||
90 | return 0; | ||
98 | } | 91 | } |
99 | 92 | ||
100 | -- | 93 | -- |
101 | 2.26.2 | 94 | 2.40.1 |
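As a usage sketch for the fd-passing patch above (the -add-fd/-blockdev options are QEMU's generic fd-set plumbing; the fd number, set id and node name are invented for illustration):

    # fd 3 is an already-open vhost-vdpa character device inherited by QEMU.
    qemu-system-x86_64 ... \
        -add-fd fd=3,set=1,opaque="vhost-vdpa chardev" \
        -blockdev driver=virtio-blk-vhost-vdpa,node-name=vda,path=/dev/fdset/1,cache.direct=on

With the patch, qemu_open() resolves "/dev/fdset/1" to a duplicate of the passed fd; cache.direct=on is spelled out because the driver rejects cache.direct=off, as the hunk above shows.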
Deleted patch | |||
---|---|---|---|
1 | From: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
2 | 1 | ||
3 | The current implementation of LLVM's SafeStack is not compatible with | ||
4 | code that uses an alternate stack created with sigaltstack(). | ||
5 | Since coroutine-sigaltstack relies on sigaltstack(), it is not | ||
6 | compatible with SafeStack. The resulting binary is incorrect, with | ||
7 | different coroutines sharing the same unsafe stack and producing | ||
8 | undefined behavior at runtime. | ||
9 | |||
10 | In the future LLVM may provide a SafeStack implementation compatible with | ||
11 | sigaltstack(). In the meantime, if SafeStack is desired, the coroutine | ||
12 | implementation from coroutine-ucontext should be used. | ||
13 | As a safety measure, add a check in coroutine-sigaltstack that throws a | ||
14 | preprocessor #error if SafeStack is enabled and we are trying to | ||
15 | use coroutine-sigaltstack to implement coroutines. | ||
16 | |||
17 | Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
18 | Message-id: 20200529205122.714-3-dbuono@linux.vnet.ibm.com | ||
19 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
20 | --- | ||
21 | util/coroutine-sigaltstack.c | 4 ++++ | ||
22 | 1 file changed, 4 insertions(+) | ||
23 | |||
24 | diff --git a/util/coroutine-sigaltstack.c b/util/coroutine-sigaltstack.c | ||
25 | index XXXXXXX..XXXXXXX 100644 | ||
26 | --- a/util/coroutine-sigaltstack.c | ||
27 | +++ b/util/coroutine-sigaltstack.c | ||
28 | @@ -XXX,XX +XXX,XX @@ | ||
29 | #include "qemu-common.h" | ||
30 | #include "qemu/coroutine_int.h" | ||
31 | |||
32 | +#ifdef CONFIG_SAFESTACK | ||
33 | +#error "SafeStack is not compatible with code run in alternate signal stacks" | ||
34 | +#endif | ||
35 | + | ||
36 | typedef struct { | ||
37 | Coroutine base; | ||
38 | void *stack; | ||
39 | -- | ||
40 | 2.26.2 | ||
Deleted patch | |||
---|---|---|---|
1 | From: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
2 | 1 | ||
3 | This patch adds a flag to enable/disable the SafeStack instrumentation | ||
4 | provided by LLVM. | ||
5 | |||
6 | On enable, make sure that the compiler supports the flags, and that we | ||
7 | are using the proper coroutine implementation (coroutine-ucontext). | ||
8 | On disable, explicitly disable the option if it was enabled by default. | ||
9 | |||
10 | While SafeStack is supported only on Linux, NetBSD, FreeBSD and macOS, | ||
11 | we do not check for the OS, since LLVM already does this. | ||
12 | |||
13 | Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
14 | Message-id: 20200529205122.714-4-dbuono@linux.vnet.ibm.com | ||
15 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
16 | --- | ||
17 | configure | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ | ||
18 | 1 file changed, 73 insertions(+) | ||
19 | |||
20 | diff --git a/configure b/configure | ||
21 | index XXXXXXX..XXXXXXX 100755 | ||
22 | --- a/configure | ||
23 | +++ b/configure | ||
24 | @@ -XXX,XX +XXX,XX @@ audio_win_int="" | ||
25 | libs_qga="" | ||
26 | debug_info="yes" | ||
27 | stack_protector="" | ||
28 | +safe_stack="" | ||
29 | use_containers="yes" | ||
30 | gdb_bin=$(command -v "gdb-multiarch" || command -v "gdb") | ||
31 | |||
32 | @@ -XXX,XX +XXX,XX @@ for opt do | ||
33 | ;; | ||
34 | --disable-stack-protector) stack_protector="no" | ||
35 | ;; | ||
36 | + --enable-safe-stack) safe_stack="yes" | ||
37 | + ;; | ||
38 | + --disable-safe-stack) safe_stack="no" | ||
39 | + ;; | ||
40 | --disable-curses) curses="no" | ||
41 | ;; | ||
42 | --enable-curses) curses="yes" | ||
43 | @@ -XXX,XX +XXX,XX @@ disabled with --disable-FEATURE, default is enabled if available: | ||
44 | debug-tcg TCG debugging (default is disabled) | ||
45 | debug-info debugging information | ||
46 | sparse sparse checker | ||
47 | + safe-stack SafeStack Stack Smash Protection. Depends on | ||
48 | + clang/llvm >= 3.7 and requires coroutine backend ucontext. | ||
49 | |||
50 | gnutls GNUTLS cryptography support | ||
51 | nettle nettle cryptography support | ||
52 | @@ -XXX,XX +XXX,XX @@ if test "$debug_stack_usage" = "yes"; then | ||
53 | fi | ||
54 | fi | ||
55 | |||
56 | +################################################## | ||
57 | +# SafeStack | ||
58 | + | ||
59 | + | ||
60 | +if test "$safe_stack" = "yes"; then | ||
61 | +cat > $TMPC << EOF | ||
62 | +int main(int argc, char *argv[]) | ||
63 | +{ | ||
64 | +#if ! __has_feature(safe_stack) | ||
65 | +#error SafeStack Disabled | ||
66 | +#endif | ||
67 | + return 0; | ||
68 | +} | ||
69 | +EOF | ||
70 | + flag="-fsanitize=safe-stack" | ||
71 | + # Check that safe-stack is supported and enabled. | ||
72 | + if compile_prog "-Werror $flag" "$flag"; then | ||
73 | + # Flag needed both at compilation and at linking | ||
74 | + QEMU_CFLAGS="$QEMU_CFLAGS $flag" | ||
75 | + QEMU_LDFLAGS="$QEMU_LDFLAGS $flag" | ||
76 | + else | ||
77 | + error_exit "SafeStack not supported by your compiler" | ||
78 | + fi | ||
79 | + if test "$coroutine" != "ucontext"; then | ||
80 | + error_exit "SafeStack is only supported by the coroutine backend ucontext" | ||
81 | + fi | ||
82 | +else | ||
83 | +cat > $TMPC << EOF | ||
84 | +int main(int argc, char *argv[]) | ||
85 | +{ | ||
86 | +#if defined(__has_feature) | ||
87 | +#if __has_feature(safe_stack) | ||
88 | +#error SafeStack Enabled | ||
89 | +#endif | ||
90 | +#endif | ||
91 | + return 0; | ||
92 | +} | ||
93 | +EOF | ||
94 | +if test "$safe_stack" = "no"; then | ||
95 | + # Make sure that safe-stack is disabled | ||
96 | + if ! compile_prog "-Werror" ""; then | ||
97 | + # SafeStack was already enabled, try to explicitly remove the feature | ||
98 | + flag="-fno-sanitize=safe-stack" | ||
99 | + if ! compile_prog "-Werror $flag" "$flag"; then | ||
100 | + error_exit "Configure cannot disable SafeStack" | ||
101 | + fi | ||
102 | + QEMU_CFLAGS="$QEMU_CFLAGS $flag" | ||
103 | + QEMU_LDFLAGS="$QEMU_LDFLAGS $flag" | ||
104 | + fi | ||
105 | +else # "$safe_stack" = "" | ||
106 | + # Set safe_stack to yes or no based on pre-existing flags | ||
107 | + if compile_prog "-Werror" ""; then | ||
108 | + safe_stack="no" | ||
109 | + else | ||
110 | + safe_stack="yes" | ||
111 | + if test "$coroutine" != "ucontext"; then | ||
112 | + error_exit "SafeStack is only supported by the coroutine backend ucontext" | ||
113 | + fi | ||
114 | + fi | ||
115 | +fi | ||
116 | +fi | ||
117 | |||
118 | ########################################## | ||
119 | # check if we have open_by_handle_at | ||
120 | @@ -XXX,XX +XXX,XX @@ echo "sparse enabled $sparse" | ||
121 | echo "strip binaries $strip_opt" | ||
122 | echo "profiler $profiler" | ||
123 | echo "static build $static" | ||
124 | +echo "safe stack $safe_stack" | ||
125 | if test "$darwin" = "yes" ; then | ||
126 | echo "Cocoa support $cocoa" | ||
127 | fi | ||
128 | @@ -XXX,XX +XXX,XX @@ if test "$ccache_cpp2" = "yes"; then | ||
129 | echo "export CCACHE_CPP2=y" >> $config_host_mak | ||
130 | fi | ||
131 | |||
132 | +if test "$safe_stack" = "yes"; then | ||
133 | + echo "CONFIG_SAFESTACK=y" >> $config_host_mak | ||
134 | +fi | ||
135 | + | ||
136 | # If we're using a separate build tree, set it up now. | ||
137 | # DIRS are directories which we simply mkdir in the build tree; | ||
138 | # LINKS are things to symlink back into the source tree | ||
139 | -- | ||
140 | 2.26.2 | ||
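A hedged build sketch for the flag added above (configure options as they existed at the time; the clang binary names are illustrative):

    # SafeStack needs clang/llvm >= 3.7 and the ucontext coroutine backend,
    # matching the checks this patch adds to configure.
    ./configure --cc=clang --cxx=clang++ --with-coroutine=ucontext --enable-safe-stack
    make

Leaving the flag unset lets configure auto-detect a pre-existing -fsanitize=safe-stack in the flags, per the "$safe_stack" = "" branch above.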
Deleted patch | |||
---|---|---|---|
1 | From: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
2 | 1 | ||
3 | SafeStack is a stack protection technique implemented in llvm. It is | ||
4 | enabled with a -fsanitize flag. | ||
5 | iotests are currently disabled when any -fsanitize option is used, | ||
6 | because such options tend to produce additional warnings and false | ||
7 | positives. | ||
8 | |||
9 | While common -fsanitize options are used to verify the code and not | ||
10 | added in production, SafeStack's main use is in production environments | ||
11 | to protect against stack smashing. | ||
12 | |||
13 | Since SafeStack does not print any warning or false positive, enable | ||
14 | iotests when SafeStack is the only -fsanitize option used. | ||
15 | This is likely going to be a production binary and we want to make sure | ||
16 | it works correctly. | ||
17 | |||
18 | Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
19 | Message-id: 20200529205122.714-5-dbuono@linux.vnet.ibm.com | ||
20 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
21 | --- | ||
22 | tests/check-block.sh | 12 +++++++++++- | ||
23 | 1 file changed, 11 insertions(+), 1 deletion(-) | ||
24 | |||
25 | diff --git a/tests/check-block.sh b/tests/check-block.sh | ||
26 | index XXXXXXX..XXXXXXX 100755 | ||
27 | --- a/tests/check-block.sh | ||
28 | +++ b/tests/check-block.sh | ||
29 | @@ -XXX,XX +XXX,XX @@ if grep -q "CONFIG_GPROF=y" config-host.mak 2>/dev/null ; then | ||
30 | exit 0 | ||
31 | fi | ||
32 | |||
33 | -if grep -q "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null ; then | ||
34 | +# Disable tests with any sanitizer except for SafeStack | ||
35 | +CFLAGS=$( grep "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null ) | ||
36 | +SANITIZE_FLAGS="" | ||
37 | +# Remove all occurrences of -fsanitize=safe-stack | ||
38 | +for i in ${CFLAGS}; do | ||
39 | + if [ "${i}" != "-fsanitize=safe-stack" ]; then | ||
40 | + SANITIZE_FLAGS="${SANITIZE_FLAGS} ${i}" | ||
41 | + fi | ||
42 | +done | ||
43 | +if echo ${SANITIZE_FLAGS} | grep -q "\-fsanitize" 2>/dev/null; then | ||
44 | + # Have a sanitize flag that is not allowed, stop | ||
45 | echo "Sanitizers are enabled ==> Not running the qemu-iotests." | ||
46 | exit 0 | ||
47 | fi | ||
48 | -- | ||
49 | 2.26.2 | ||
1 | A lot of CPU time is spent simply locking/unlocking q->lock during | 1 | From: Stefano Garzarella <sgarzare@redhat.com> |
---|---|---|---|
2 | polling. Check for completion outside the lock to make q->lock disappear | ||
3 | from the profile. | 3 | The virtio-blk-vhost-vdpa driver in libblkio 1.3.0 supports fd |
4 | 2 | ||
5 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 3 | The virtio-blk-vhost-vdpa driver in libblkio 1.3.0 supports the fd |
6 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 6 | Since we now use qemu_open() on '@path' when the virtio-blk driver |
7 | Message-id: 20200617132201.1832152-2-stefanha@redhat.com | 7 | supports fd passing, let's announce it. |
6 | Since now we are using qemu_open() on '@path' if the virtio-blk driver | ||
7 | supports the fd passing, let's announce it. | ||
8 | In this way, the management layer can pass the file descriptor of an | ||
9 | already opened vhost-vdpa character device. This is useful especially | ||
10 | when the device can only be accessed with certain privileges. | ||
11 | |||
12 | Add the '@fdset' feature only when the virtio-blk-vhost-vdpa driver | ||
13 | in libblkio supports it. | ||
14 | |||
15 | Suggested-by: Markus Armbruster <armbru@redhat.com> | ||
16 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
17 | Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> | ||
18 | Message-id: 20230530071941.8954-3-sgarzare@redhat.com | ||
8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 19 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
9 | --- | 20 | --- |
10 | block/nvme.c | 12 ++++++++++++ | 21 | qapi/block-core.json | 6 ++++++ |
11 | 1 file changed, 12 insertions(+) | 22 | meson.build | 4 ++++ |
23 | 2 files changed, 10 insertions(+) | ||
12 | 24 | ||
13 | diff --git a/block/nvme.c b/block/nvme.c | 25 | diff --git a/qapi/block-core.json b/qapi/block-core.json |
14 | index XXXXXXX..XXXXXXX 100644 | 26 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/block/nvme.c | 27 | --- a/qapi/block-core.json |
16 | +++ b/block/nvme.c | 28 | +++ b/qapi/block-core.json |
17 | @@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s) | 29 | @@ -XXX,XX +XXX,XX @@ |
18 | 30 | # | |
19 | for (i = 0; i < s->nr_queues; i++) { | 31 | # @path: path to the vhost-vdpa character device. |
20 | NVMeQueuePair *q = s->queues[i]; | 32 | # |
21 | + const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; | 33 | +# Features: |
22 | + NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; | 34 | +# @fdset: Member @path supports the special "/dev/fdset/N" path |
23 | + | 35 | +# (since 8.1) |
24 | + /* | 36 | +# |
25 | + * Do an early check for completions. q->lock isn't needed because | 37 | # Since: 7.2 |
26 | + * nvme_process_completion() only runs in the event loop thread and | 38 | ## |
27 | + * cannot race with itself. | 39 | { 'struct': 'BlockdevOptionsVirtioBlkVhostVdpa', |
28 | + */ | 40 | 'data': { 'path': 'str' }, |
29 | + if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) { | 41 | + 'features': [ { 'name' :'fdset', |
30 | + continue; | 42 | + 'if': 'CONFIG_BLKIO_VHOST_VDPA_FD' } ], |
31 | + } | 43 | 'if': 'CONFIG_BLKIO' } |
32 | + | 44 | |
33 | qemu_mutex_lock(&q->lock); | 45 | ## |
34 | while (nvme_process_completion(s, q)) { | 46 | diff --git a/meson.build b/meson.build |
35 | /* Keep polling */ | 47 | index XXXXXXX..XXXXXXX 100644 |
48 | --- a/meson.build | ||
49 | +++ b/meson.build | ||
50 | @@ -XXX,XX +XXX,XX @@ config_host_data.set('CONFIG_LZO', lzo.found()) | ||
51 | config_host_data.set('CONFIG_MPATH', mpathpersist.found()) | ||
52 | config_host_data.set('CONFIG_MPATH_NEW_API', mpathpersist_new_api) | ||
53 | config_host_data.set('CONFIG_BLKIO', blkio.found()) | ||
54 | +if blkio.found() | ||
55 | + config_host_data.set('CONFIG_BLKIO_VHOST_VDPA_FD', | ||
56 | + blkio.version().version_compare('>=1.3.0')) | ||
57 | +endif | ||
58 | config_host_data.set('CONFIG_CURL', curl.found()) | ||
59 | config_host_data.set('CONFIG_CURSES', curses.found()) | ||
60 | config_host_data.set('CONFIG_GBM', gbm.found()) | ||
36 | -- | 61 | -- |
37 | 2.26.2 | 62 | 2.40.1 |
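To close, a hedged QMP sketch of the flow the '@fdset' feature advertises (add-fd and blockdev-add are standard QMP commands; the file descriptor itself travels over the QMP UNIX socket via SCM_RIGHTS, and the fd number, set id and node name below are invented). A client can first look for the 'fdset' feature in query-qmp-schema output before relying on "/dev/fdset/N" paths:

    { "execute": "add-fd", "arguments": { "fdset-id": 1 } }
    { "return": { "fdset-id": 1, "fd": 3 } }

    { "execute": "blockdev-add",
      "arguments": { "driver": "virtio-blk-vhost-vdpa",
                     "node-name": "vda",
                     "path": "/dev/fdset/1",
                     "cache": { "direct": true } } }
    { "return": {} }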