1 | The following changes since commit 171199f56f5f9bdf1e5d670d09ef1351d8f01bae: | 1 | The following changes since commit 9cf289af47bcfae5c75de37d8e5d6fd23705322c: |
---|---|---|---|
2 | 2 | ||
3 | Merge remote-tracking branch 'remotes/alistair/tags/pull-riscv-to-apply-20200619-3' into staging (2020-06-22 14:45:25 +0100) | 3 | Merge tag 'qga-pull-request' of gitlab.com:marcandre.lureau/qemu into staging (2022-05-04 03:42:49 -0700) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the Git repository at: |
6 | 6 | ||
7 | https://github.com/stefanha/qemu.git tags/block-pull-request | 7 | https://gitlab.com/stefanha/qemu.git tags/block-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to 7838c67f22a81fcf669785cd6c0876438422071a: | 9 | for you to fetch changes up to bef2e050d6a7feb865854c65570c496ac5a8cf53: |
10 | 10 | ||
11 | block/nvme: support nested aio_poll() (2020-06-23 15:46:08 +0100) | 11 | util/event-loop-base: Introduce options to set the thread pool size (2022-05-04 17:02:19 +0100) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Pull request | 14 | Pull request |
15 | 15 | ||
16 | Add new thread-pool-min/thread-pool-max parameters to control the thread pool | ||
17 | used for async I/O. | ||
18 | |||
16 | ---------------------------------------------------------------- | 19 | ---------------------------------------------------------------- |
17 | 20 | ||
18 | Daniele Buono (4): | 21 | Nicolas Saenz Julienne (3): |
19 | coroutine: support SafeStack in ucontext backend | 22 | Introduce event-loop-base abstract class |
20 | coroutine: add check for SafeStack in sigaltstack | 23 | util/main-loop: Introduce the main loop into QOM |
21 | configure: add flags to support SafeStack | 24 | util/event-loop-base: Introduce options to set the thread pool size |
22 | check-block: enable iotests with SafeStack | ||
23 | 25 | ||
24 | Stefan Hajnoczi (8): | 26 | qapi/qom.json | 43 ++++++++-- |
25 | minikconf: explicitly set encoding to UTF-8 | 27 | meson.build | 26 +++--- |
26 | block/nvme: poll queues without q->lock | 28 | include/block/aio.h | 10 +++ |
27 | block/nvme: drop tautologous assertion | 29 | include/block/thread-pool.h | 3 + |
28 | block/nvme: don't access CQE after moving cq.head | 30 | include/qemu/main-loop.h | 10 +++ |
29 | block/nvme: switch to a NVMeRequest freelist | 31 | include/sysemu/event-loop-base.h | 41 +++++++++ |
30 | block/nvme: clarify that free_req_queue is protected by q->lock | 32 | include/sysemu/iothread.h | 6 +- |
31 | block/nvme: keep BDRVNVMeState pointer in NVMeQueuePair | 33 | event-loop-base.c | 140 +++++++++++++++++++++++++++++++ |
32 | block/nvme: support nested aio_poll() | 34 | iothread.c | 68 +++++---------- |
33 | 35 | util/aio-posix.c | 1 + | |
34 | configure | 73 ++++++++++++ | 36 | util/async.c | 20 +++++ |
35 | include/qemu/coroutine_int.h | 5 + | 37 | util/main-loop.c | 65 ++++++++++++++ |
36 | block/nvme.c | 220 +++++++++++++++++++++++++---------- | 38 | util/thread-pool.c | 55 +++++++++++- |
37 | util/coroutine-sigaltstack.c | 4 + | 39 | 13 files changed, 419 insertions(+), 69 deletions(-) |
38 | util/coroutine-ucontext.c | 28 +++++ | 40 | create mode 100644 include/sysemu/event-loop-base.h |
39 | block/trace-events | 2 +- | 41 | create mode 100644 event-loop-base.c |
40 | scripts/minikconf.py | 6 +- | ||
41 | tests/check-block.sh | 12 +- | ||
42 | 8 files changed, 284 insertions(+), 66 deletions(-) | ||
43 | 42 | ||
44 | -- | 43 | -- |
45 | 2.26.2 | 44 | 2.35.1 |
46 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | QEMU currently only has ASCII Kconfig files but Linux actually uses | ||
2 | UTF-8. Explicitly specify the encoding and that we're doing text file | ||
3 | I/O. | ||
4 | 1 | ||
5 | It's unclear whether or not QEMU will ever need Unicode in its Kconfig | ||
6 | files. If we start using the help text then it will become an issue | ||
7 | sooner or later. Make this change now for consistency with Linux | ||
8 | Kconfig. | ||
9 | |||
10 | Reported-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
11 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | Reviewed-by: Richard Henderson <richard.henderson@linaro.org> | ||
13 | Message-id: 20200521153616.307100-1-stefanha@redhat.com | ||
14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
15 | --- | ||
16 | scripts/minikconf.py | 6 +++--- | ||
17 | 1 file changed, 3 insertions(+), 3 deletions(-) | ||
18 | |||
19 | diff --git a/scripts/minikconf.py b/scripts/minikconf.py | ||
20 | index XXXXXXX..XXXXXXX 100755 | ||
21 | --- a/scripts/minikconf.py | ||
22 | +++ b/scripts/minikconf.py | ||
23 | @@ -XXX,XX +XXX,XX @@ class KconfigParser: | ||
24 | if incl_abs_fname in self.data.previously_included: | ||
25 | return | ||
26 | try: | ||
27 | - fp = open(incl_abs_fname, 'r') | ||
28 | + fp = open(incl_abs_fname, 'rt', encoding='utf-8') | ||
29 | except IOError as e: | ||
30 | raise KconfigParserError(self, | ||
31 | '%s: %s' % (e.strerror, include)) | ||
32 | @@ -XXX,XX +XXX,XX @@ if __name__ == '__main__': | ||
33 | parser.do_assignment(name, value == 'y') | ||
34 | external_vars.add(name[7:]) | ||
35 | else: | ||
36 | - fp = open(arg, 'r') | ||
37 | + fp = open(arg, 'rt', encoding='utf-8') | ||
38 | parser.parse_file(fp) | ||
39 | fp.close() | ||
40 | |||
41 | @@ -XXX,XX +XXX,XX @@ if __name__ == '__main__': | ||
42 | if key not in external_vars and config[key]: | ||
43 | print ('CONFIG_%s=y' % key) | ||
44 | |||
45 | - deps = open(argv[2], 'w') | ||
46 | + deps = open(argv[2], 'wt', encoding='utf-8') | ||
47 | for fname in data.previously_included: | ||
48 | print ('%s: %s' % (argv[1], fname), file=deps) | ||
49 | deps.close() | ||
50 | -- | ||
51 | 2.26.2 | ||
52 | diff view generated by jsdifflib |
1 | From: Daniele Buono <dbuono@linux.vnet.ibm.com> | 1 | From: Nicolas Saenz Julienne <nsaenzju@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | LLVM's SafeStack instrumentation does not yet support programs that make | 3 | Introduce the 'event-loop-base' abstract class, it'll hold the |
4 | use of the APIs in ucontext.h | 4 | properties common to all event loops and provide the necessary hooks for |
5 | With the current implementation of coroutine-ucontext, the resulting | 5 | their creation and maintenance. Then have iothread inherit from it. |
6 | binary is incorrect, with different coroutines sharing the same unsafe | 6 | |
7 | stack and producing undefined behavior at runtime. | 7 | EventLoopBaseClass is defined as user creatable and provides a hook for |
8 | This fix allocates an additional unsafe stack area for each coroutine, | 8 | its children to attach themselves to the user creatable class 'complete' |
9 | and sets the new unsafe stack pointer before calling swapcontext() in | 9 | function. It also provides an update_params() callback to propagate |
10 | qemu_coroutine_new. | 10 | property changes onto its children. |
11 | This is the only place where the pointer needs to be manually updated, | 11 | |
12 | since sigsetjmp/siglongjmp are already instrumented by LLVM to properly | 12 | The new 'event-loop-base' class will live in the root directory. It is |
13 | support SafeStack. | 13 | built on its own using the 'link_whole' option (there are no direct |
14 | The additional stack is then freed in qemu_coroutine_delete. | 14 | function dependencies between the class and its children, it all happens |
15 | 15 | trough 'constructor' magic). And also imposes new compilation | |
16 | Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com> | 16 | dependencies: |
17 | Message-id: 20200529205122.714-2-dbuono@linux.vnet.ibm.com | 17 | |
18 | qom <- event-loop-base <- blockdev (iothread.c) | ||
19 | |||
20 | And in subsequent patches: | ||
21 | |||
22 | qom <- event-loop-base <- qemuutil (util/main-loop.c) | ||
23 | |||
24 | All this forced some amount of reordering in meson.build: | ||
25 | |||
26 | - Moved qom build definition before qemuutil. Doing it the other way | ||
27 | around (i.e. moving qemuutil after qom) isn't possible as a lot of | ||
28 | core libraries that live in between the two depend on it. | ||
29 | |||
30 | - Process the 'hw' subdir earlier, as it introduces files into the | ||
31 | 'qom' source set. | ||
32 | |||
33 | No functional changes intended. | ||
34 | |||
35 | Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com> | ||
36 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
37 | Acked-by: Markus Armbruster <armbru@redhat.com> | ||
38 | Message-id: 20220425075723.20019-2-nsaenzju@redhat.com | ||
18 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 39 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
19 | --- | 40 | --- |
20 | include/qemu/coroutine_int.h | 5 +++++ | 41 | qapi/qom.json | 22 +++++-- |
21 | util/coroutine-ucontext.c | 28 ++++++++++++++++++++++++++++ | 42 | meson.build | 23 ++++--- |
22 | 2 files changed, 33 insertions(+) | 43 | include/sysemu/event-loop-base.h | 36 +++++++++++ |
23 | 44 | include/sysemu/iothread.h | 6 +- | |
24 | diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h | 45 | event-loop-base.c | 104 +++++++++++++++++++++++++++++++ |
46 | iothread.c | 65 ++++++------------- | ||
47 | 6 files changed, 192 insertions(+), 64 deletions(-) | ||
48 | create mode 100644 include/sysemu/event-loop-base.h | ||
49 | create mode 100644 event-loop-base.c | ||
50 | |||
51 | diff --git a/qapi/qom.json b/qapi/qom.json | ||
25 | index XXXXXXX..XXXXXXX 100644 | 52 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/include/qemu/coroutine_int.h | 53 | --- a/qapi/qom.json |
27 | +++ b/include/qemu/coroutine_int.h | 54 | +++ b/qapi/qom.json |
28 | @@ -XXX,XX +XXX,XX @@ | 55 | @@ -XXX,XX +XXX,XX @@ |
29 | #include "qemu/queue.h" | 56 | '*repeat': 'bool', |
30 | #include "qemu/coroutine.h" | 57 | '*grab-toggle': 'GrabToggleKeys' } } |
31 | 58 | ||
32 | +#ifdef CONFIG_SAFESTACK | 59 | +## |
33 | +/* Pointer to the unsafe stack, defined by the compiler */ | 60 | +# @EventLoopBaseProperties: |
34 | +extern __thread void *__safestack_unsafe_stack_ptr; | 61 | +# |
62 | +# Common properties for event loops | ||
63 | +# | ||
64 | +# @aio-max-batch: maximum number of requests in a batch for the AIO engine, | ||
65 | +# 0 means that the engine will use its default. | ||
66 | +# (default: 0) | ||
67 | +# | ||
68 | +# Since: 7.1 | ||
69 | +## | ||
70 | +{ 'struct': 'EventLoopBaseProperties', | ||
71 | + 'data': { '*aio-max-batch': 'int' } } | ||
72 | + | ||
73 | ## | ||
74 | # @IothreadProperties: | ||
75 | # | ||
76 | @@ -XXX,XX +XXX,XX @@ | ||
77 | # algorithm detects it is spending too long polling without | ||
78 | # encountering events. 0 selects a default behaviour (default: 0) | ||
79 | # | ||
80 | -# @aio-max-batch: maximum number of requests in a batch for the AIO engine, | ||
81 | -# 0 means that the engine will use its default | ||
82 | -# (default:0, since 6.1) | ||
83 | +# The @aio-max-batch option is available since 6.1. | ||
84 | # | ||
85 | # Since: 2.0 | ||
86 | ## | ||
87 | { 'struct': 'IothreadProperties', | ||
88 | + 'base': 'EventLoopBaseProperties', | ||
89 | 'data': { '*poll-max-ns': 'int', | ||
90 | '*poll-grow': 'int', | ||
91 | - '*poll-shrink': 'int', | ||
92 | - '*aio-max-batch': 'int' } } | ||
93 | + '*poll-shrink': 'int' } } | ||
94 | |||
95 | ## | ||
96 | # @MemoryBackendProperties: | ||
97 | diff --git a/meson.build b/meson.build | ||
98 | index XXXXXXX..XXXXXXX 100644 | ||
99 | --- a/meson.build | ||
100 | +++ b/meson.build | ||
101 | @@ -XXX,XX +XXX,XX @@ subdir('qom') | ||
102 | subdir('authz') | ||
103 | subdir('crypto') | ||
104 | subdir('ui') | ||
105 | +subdir('hw') | ||
106 | |||
107 | |||
108 | if enable_modules | ||
109 | @@ -XXX,XX +XXX,XX @@ if enable_modules | ||
110 | modulecommon = declare_dependency(link_whole: libmodulecommon, compile_args: '-DBUILD_DSO') | ||
111 | endif | ||
112 | |||
113 | +qom_ss = qom_ss.apply(config_host, strict: false) | ||
114 | +libqom = static_library('qom', qom_ss.sources() + genh, | ||
115 | + dependencies: [qom_ss.dependencies()], | ||
116 | + name_suffix: 'fa') | ||
117 | +qom = declare_dependency(link_whole: libqom) | ||
118 | + | ||
119 | +event_loop_base = files('event-loop-base.c') | ||
120 | +event_loop_base = static_library('event-loop-base', sources: event_loop_base + genh, | ||
121 | + build_by_default: true) | ||
122 | +event_loop_base = declare_dependency(link_whole: event_loop_base, | ||
123 | + dependencies: [qom]) | ||
124 | + | ||
125 | stub_ss = stub_ss.apply(config_all, strict: false) | ||
126 | |||
127 | util_ss.add_all(trace_ss) | ||
128 | @@ -XXX,XX +XXX,XX @@ subdir('monitor') | ||
129 | subdir('net') | ||
130 | subdir('replay') | ||
131 | subdir('semihosting') | ||
132 | -subdir('hw') | ||
133 | subdir('tcg') | ||
134 | subdir('fpu') | ||
135 | subdir('accel') | ||
136 | @@ -XXX,XX +XXX,XX @@ qemu_syms = custom_target('qemu.syms', output: 'qemu.syms', | ||
137 | capture: true, | ||
138 | command: [undefsym, nm, '@INPUT@']) | ||
139 | |||
140 | -qom_ss = qom_ss.apply(config_host, strict: false) | ||
141 | -libqom = static_library('qom', qom_ss.sources() + genh, | ||
142 | - dependencies: [qom_ss.dependencies()], | ||
143 | - name_suffix: 'fa') | ||
144 | - | ||
145 | -qom = declare_dependency(link_whole: libqom) | ||
146 | - | ||
147 | authz_ss = authz_ss.apply(config_host, strict: false) | ||
148 | libauthz = static_library('authz', authz_ss.sources() + genh, | ||
149 | dependencies: [authz_ss.dependencies()], | ||
150 | @@ -XXX,XX +XXX,XX @@ libblockdev = static_library('blockdev', blockdev_ss.sources() + genh, | ||
151 | build_by_default: false) | ||
152 | |||
153 | blockdev = declare_dependency(link_whole: [libblockdev], | ||
154 | - dependencies: [block]) | ||
155 | + dependencies: [block, event_loop_base]) | ||
156 | |||
157 | qmp_ss = qmp_ss.apply(config_host, strict: false) | ||
158 | libqmp = static_library('qmp', qmp_ss.sources() + genh, | ||
159 | diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h | ||
160 | new file mode 100644 | ||
161 | index XXXXXXX..XXXXXXX | ||
162 | --- /dev/null | ||
163 | +++ b/include/sysemu/event-loop-base.h | ||
164 | @@ -XXX,XX +XXX,XX @@ | ||
165 | +/* | ||
166 | + * QEMU event-loop backend | ||
167 | + * | ||
168 | + * Copyright (C) 2022 Red Hat Inc | ||
169 | + * | ||
170 | + * Authors: | ||
171 | + * Nicolas Saenz Julienne <nsaenzju@redhat.com> | ||
172 | + * | ||
173 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | ||
174 | + * See the COPYING file in the top-level directory. | ||
175 | + */ | ||
176 | +#ifndef QEMU_EVENT_LOOP_BASE_H | ||
177 | +#define QEMU_EVENT_LOOP_BASE_H | ||
178 | + | ||
179 | +#include "qom/object.h" | ||
180 | +#include "block/aio.h" | ||
181 | +#include "qemu/typedefs.h" | ||
182 | + | ||
183 | +#define TYPE_EVENT_LOOP_BASE "event-loop-base" | ||
184 | +OBJECT_DECLARE_TYPE(EventLoopBase, EventLoopBaseClass, | ||
185 | + EVENT_LOOP_BASE) | ||
186 | + | ||
187 | +struct EventLoopBaseClass { | ||
188 | + ObjectClass parent_class; | ||
189 | + | ||
190 | + void (*init)(EventLoopBase *base, Error **errp); | ||
191 | + void (*update_params)(EventLoopBase *base, Error **errp); | ||
192 | +}; | ||
193 | + | ||
194 | +struct EventLoopBase { | ||
195 | + Object parent; | ||
196 | + | ||
197 | + /* AioContext AIO engine parameters */ | ||
198 | + int64_t aio_max_batch; | ||
199 | +}; | ||
35 | +#endif | 200 | +#endif |
36 | + | 201 | diff --git a/include/sysemu/iothread.h b/include/sysemu/iothread.h |
37 | #define COROUTINE_STACK_SIZE (1 << 20) | ||
38 | |||
39 | typedef enum { | ||
40 | diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c | ||
41 | index XXXXXXX..XXXXXXX 100644 | 202 | index XXXXXXX..XXXXXXX 100644 |
42 | --- a/util/coroutine-ucontext.c | 203 | --- a/include/sysemu/iothread.h |
43 | +++ b/util/coroutine-ucontext.c | 204 | +++ b/include/sysemu/iothread.h |
44 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 205 | @@ -XXX,XX +XXX,XX @@ |
45 | Coroutine base; | 206 | #include "block/aio.h" |
46 | void *stack; | 207 | #include "qemu/thread.h" |
47 | size_t stack_size; | 208 | #include "qom/object.h" |
48 | +#ifdef CONFIG_SAFESTACK | 209 | +#include "sysemu/event-loop-base.h" |
49 | + /* Need an unsafe stack for each coroutine */ | 210 | |
50 | + void *unsafe_stack; | 211 | #define TYPE_IOTHREAD "iothread" |
51 | + size_t unsafe_stack_size; | 212 | |
52 | +#endif | 213 | struct IOThread { |
53 | sigjmp_buf env; | 214 | - Object parent_obj; |
54 | 215 | + EventLoopBase parent_obj; | |
55 | void *tsan_co_fiber; | 216 | |
56 | @@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void) | 217 | QemuThread thread; |
57 | co = g_malloc0(sizeof(*co)); | 218 | AioContext *ctx; |
58 | co->stack_size = COROUTINE_STACK_SIZE; | 219 | @@ -XXX,XX +XXX,XX @@ struct IOThread { |
59 | co->stack = qemu_alloc_stack(&co->stack_size); | 220 | int64_t poll_max_ns; |
60 | +#ifdef CONFIG_SAFESTACK | 221 | int64_t poll_grow; |
61 | + co->unsafe_stack_size = COROUTINE_STACK_SIZE; | 222 | int64_t poll_shrink; |
62 | + co->unsafe_stack = qemu_alloc_stack(&co->unsafe_stack_size); | 223 | - |
63 | +#endif | 224 | - /* AioContext AIO engine parameters */ |
64 | co->base.entry_arg = &old_env; /* stash away our jmp_buf */ | 225 | - int64_t aio_max_batch; |
65 | 226 | }; | |
66 | uc.uc_link = &old_uc; | 227 | typedef struct IOThread IOThread; |
67 | @@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void) | 228 | |
68 | COROUTINE_YIELD, | 229 | diff --git a/event-loop-base.c b/event-loop-base.c |
69 | &fake_stack_save, | 230 | new file mode 100644 |
70 | co->stack, co->stack_size, co->tsan_co_fiber); | 231 | index XXXXXXX..XXXXXXX |
71 | + | 232 | --- /dev/null |
72 | +#ifdef CONFIG_SAFESTACK | 233 | +++ b/event-loop-base.c |
73 | + /* | 234 | @@ -XXX,XX +XXX,XX @@ |
74 | + * Before we swap the context, set the new unsafe stack | 235 | +/* |
75 | + * The unsafe stack grows just like the normal stack, so start from | 236 | + * QEMU event-loop base |
76 | + * the last usable location of the memory area. | 237 | + * |
77 | + * NOTE: we don't have to re-set the usp afterwards because we are | 238 | + * Copyright (C) 2022 Red Hat Inc |
78 | + * coming back to this context through a siglongjmp. | 239 | + * |
79 | + * The compiler already wrapped the corresponding sigsetjmp call with | 240 | + * Authors: |
80 | + * code that saves the usp on the (safe) stack before the call, and | 241 | + * Stefan Hajnoczi <stefanha@redhat.com> |
81 | + * restores it right after (which is where we return with siglongjmp). | 242 | + * Nicolas Saenz Julienne <nsaenzju@redhat.com> |
82 | + */ | 243 | + * |
83 | + void *usp = co->unsafe_stack + co->unsafe_stack_size; | 244 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. |
84 | + __safestack_unsafe_stack_ptr = usp; | 245 | + * See the COPYING file in the top-level directory. |
85 | +#endif | 246 | + */ |
86 | + | 247 | + |
87 | swapcontext(&old_uc, &uc); | 248 | +#include "qemu/osdep.h" |
249 | +#include "qom/object_interfaces.h" | ||
250 | +#include "qapi/error.h" | ||
251 | +#include "sysemu/event-loop-base.h" | ||
252 | + | ||
253 | +typedef struct { | ||
254 | + const char *name; | ||
255 | + ptrdiff_t offset; /* field's byte offset in EventLoopBase struct */ | ||
256 | +} EventLoopBaseParamInfo; | ||
257 | + | ||
258 | +static EventLoopBaseParamInfo aio_max_batch_info = { | ||
259 | + "aio-max-batch", offsetof(EventLoopBase, aio_max_batch), | ||
260 | +}; | ||
261 | + | ||
262 | +static void event_loop_base_get_param(Object *obj, Visitor *v, | ||
263 | + const char *name, void *opaque, Error **errp) | ||
264 | +{ | ||
265 | + EventLoopBase *event_loop_base = EVENT_LOOP_BASE(obj); | ||
266 | + EventLoopBaseParamInfo *info = opaque; | ||
267 | + int64_t *field = (void *)event_loop_base + info->offset; | ||
268 | + | ||
269 | + visit_type_int64(v, name, field, errp); | ||
270 | +} | ||
271 | + | ||
272 | +static void event_loop_base_set_param(Object *obj, Visitor *v, | ||
273 | + const char *name, void *opaque, Error **errp) | ||
274 | +{ | ||
275 | + EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(obj); | ||
276 | + EventLoopBase *base = EVENT_LOOP_BASE(obj); | ||
277 | + EventLoopBaseParamInfo *info = opaque; | ||
278 | + int64_t *field = (void *)base + info->offset; | ||
279 | + int64_t value; | ||
280 | + | ||
281 | + if (!visit_type_int64(v, name, &value, errp)) { | ||
282 | + return; | ||
283 | + } | ||
284 | + | ||
285 | + if (value < 0) { | ||
286 | + error_setg(errp, "%s value must be in range [0, %" PRId64 "]", | ||
287 | + info->name, INT64_MAX); | ||
288 | + return; | ||
289 | + } | ||
290 | + | ||
291 | + *field = value; | ||
292 | + | ||
293 | + if (bc->update_params) { | ||
294 | + bc->update_params(base, errp); | ||
295 | + } | ||
296 | + | ||
297 | + return; | ||
298 | +} | ||
299 | + | ||
300 | +static void event_loop_base_complete(UserCreatable *uc, Error **errp) | ||
301 | +{ | ||
302 | + EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(uc); | ||
303 | + EventLoopBase *base = EVENT_LOOP_BASE(uc); | ||
304 | + | ||
305 | + if (bc->init) { | ||
306 | + bc->init(base, errp); | ||
307 | + } | ||
308 | +} | ||
309 | + | ||
310 | +static void event_loop_base_class_init(ObjectClass *klass, void *class_data) | ||
311 | +{ | ||
312 | + UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass); | ||
313 | + ucc->complete = event_loop_base_complete; | ||
314 | + | ||
315 | + object_class_property_add(klass, "aio-max-batch", "int", | ||
316 | + event_loop_base_get_param, | ||
317 | + event_loop_base_set_param, | ||
318 | + NULL, &aio_max_batch_info); | ||
319 | +} | ||
320 | + | ||
321 | +static const TypeInfo event_loop_base_info = { | ||
322 | + .name = TYPE_EVENT_LOOP_BASE, | ||
323 | + .parent = TYPE_OBJECT, | ||
324 | + .instance_size = sizeof(EventLoopBase), | ||
325 | + .class_size = sizeof(EventLoopBaseClass), | ||
326 | + .class_init = event_loop_base_class_init, | ||
327 | + .abstract = true, | ||
328 | + .interfaces = (InterfaceInfo[]) { | ||
329 | + { TYPE_USER_CREATABLE }, | ||
330 | + { } | ||
331 | + } | ||
332 | +}; | ||
333 | + | ||
334 | +static void register_types(void) | ||
335 | +{ | ||
336 | + type_register_static(&event_loop_base_info); | ||
337 | +} | ||
338 | +type_init(register_types); | ||
339 | diff --git a/iothread.c b/iothread.c | ||
340 | index XXXXXXX..XXXXXXX 100644 | ||
341 | --- a/iothread.c | ||
342 | +++ b/iothread.c | ||
343 | @@ -XXX,XX +XXX,XX @@ | ||
344 | #include "qemu/module.h" | ||
345 | #include "block/aio.h" | ||
346 | #include "block/block.h" | ||
347 | +#include "sysemu/event-loop-base.h" | ||
348 | #include "sysemu/iothread.h" | ||
349 | #include "qapi/error.h" | ||
350 | #include "qapi/qapi-commands-misc.h" | ||
351 | @@ -XXX,XX +XXX,XX @@ static void iothread_init_gcontext(IOThread *iothread) | ||
352 | iothread->main_loop = g_main_loop_new(iothread->worker_context, TRUE); | ||
353 | } | ||
354 | |||
355 | -static void iothread_set_aio_context_params(IOThread *iothread, Error **errp) | ||
356 | +static void iothread_set_aio_context_params(EventLoopBase *base, Error **errp) | ||
357 | { | ||
358 | + IOThread *iothread = IOTHREAD(base); | ||
359 | ERRP_GUARD(); | ||
360 | |||
361 | + if (!iothread->ctx) { | ||
362 | + return; | ||
363 | + } | ||
364 | + | ||
365 | aio_context_set_poll_params(iothread->ctx, | ||
366 | iothread->poll_max_ns, | ||
367 | iothread->poll_grow, | ||
368 | @@ -XXX,XX +XXX,XX @@ static void iothread_set_aio_context_params(IOThread *iothread, Error **errp) | ||
88 | } | 369 | } |
89 | 370 | ||
90 | @@ -XXX,XX +XXX,XX @@ void qemu_coroutine_delete(Coroutine *co_) | 371 | aio_context_set_aio_params(iothread->ctx, |
91 | #endif | 372 | - iothread->aio_max_batch, |
92 | 373 | + iothread->parent_obj.aio_max_batch, | |
93 | qemu_free_stack(co->stack, co->stack_size); | 374 | errp); |
94 | +#ifdef CONFIG_SAFESTACK | ||
95 | + qemu_free_stack(co->unsafe_stack, co->unsafe_stack_size); | ||
96 | +#endif | ||
97 | g_free(co); | ||
98 | } | 375 | } |
99 | 376 | ||
377 | -static void iothread_complete(UserCreatable *obj, Error **errp) | ||
378 | + | ||
379 | +static void iothread_init(EventLoopBase *base, Error **errp) | ||
380 | { | ||
381 | Error *local_error = NULL; | ||
382 | - IOThread *iothread = IOTHREAD(obj); | ||
383 | + IOThread *iothread = IOTHREAD(base); | ||
384 | char *thread_name; | ||
385 | |||
386 | iothread->stopping = false; | ||
387 | @@ -XXX,XX +XXX,XX @@ static void iothread_complete(UserCreatable *obj, Error **errp) | ||
388 | */ | ||
389 | iothread_init_gcontext(iothread); | ||
390 | |||
391 | - iothread_set_aio_context_params(iothread, &local_error); | ||
392 | + iothread_set_aio_context_params(base, &local_error); | ||
393 | if (local_error) { | ||
394 | error_propagate(errp, local_error); | ||
395 | aio_context_unref(iothread->ctx); | ||
396 | @@ -XXX,XX +XXX,XX @@ static void iothread_complete(UserCreatable *obj, Error **errp) | ||
397 | * to inherit. | ||
398 | */ | ||
399 | thread_name = g_strdup_printf("IO %s", | ||
400 | - object_get_canonical_path_component(OBJECT(obj))); | ||
401 | + object_get_canonical_path_component(OBJECT(base))); | ||
402 | qemu_thread_create(&iothread->thread, thread_name, iothread_run, | ||
403 | iothread, QEMU_THREAD_JOINABLE); | ||
404 | g_free(thread_name); | ||
405 | @@ -XXX,XX +XXX,XX @@ static IOThreadParamInfo poll_grow_info = { | ||
406 | static IOThreadParamInfo poll_shrink_info = { | ||
407 | "poll-shrink", offsetof(IOThread, poll_shrink), | ||
408 | }; | ||
409 | -static IOThreadParamInfo aio_max_batch_info = { | ||
410 | - "aio-max-batch", offsetof(IOThread, aio_max_batch), | ||
411 | -}; | ||
412 | |||
413 | static void iothread_get_param(Object *obj, Visitor *v, | ||
414 | const char *name, IOThreadParamInfo *info, Error **errp) | ||
415 | @@ -XXX,XX +XXX,XX @@ static void iothread_set_poll_param(Object *obj, Visitor *v, | ||
416 | } | ||
417 | } | ||
418 | |||
419 | -static void iothread_get_aio_param(Object *obj, Visitor *v, | ||
420 | - const char *name, void *opaque, Error **errp) | ||
421 | -{ | ||
422 | - IOThreadParamInfo *info = opaque; | ||
423 | - | ||
424 | - iothread_get_param(obj, v, name, info, errp); | ||
425 | -} | ||
426 | - | ||
427 | -static void iothread_set_aio_param(Object *obj, Visitor *v, | ||
428 | - const char *name, void *opaque, Error **errp) | ||
429 | -{ | ||
430 | - IOThread *iothread = IOTHREAD(obj); | ||
431 | - IOThreadParamInfo *info = opaque; | ||
432 | - | ||
433 | - if (!iothread_set_param(obj, v, name, info, errp)) { | ||
434 | - return; | ||
435 | - } | ||
436 | - | ||
437 | - if (iothread->ctx) { | ||
438 | - aio_context_set_aio_params(iothread->ctx, | ||
439 | - iothread->aio_max_batch, | ||
440 | - errp); | ||
441 | - } | ||
442 | -} | ||
443 | - | ||
444 | static void iothread_class_init(ObjectClass *klass, void *class_data) | ||
445 | { | ||
446 | - UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass); | ||
447 | - ucc->complete = iothread_complete; | ||
448 | + EventLoopBaseClass *bc = EVENT_LOOP_BASE_CLASS(klass); | ||
449 | + | ||
450 | + bc->init = iothread_init; | ||
451 | + bc->update_params = iothread_set_aio_context_params; | ||
452 | |||
453 | object_class_property_add(klass, "poll-max-ns", "int", | ||
454 | iothread_get_poll_param, | ||
455 | @@ -XXX,XX +XXX,XX @@ static void iothread_class_init(ObjectClass *klass, void *class_data) | ||
456 | iothread_get_poll_param, | ||
457 | iothread_set_poll_param, | ||
458 | NULL, &poll_shrink_info); | ||
459 | - object_class_property_add(klass, "aio-max-batch", "int", | ||
460 | - iothread_get_aio_param, | ||
461 | - iothread_set_aio_param, | ||
462 | - NULL, &aio_max_batch_info); | ||
463 | } | ||
464 | |||
465 | static const TypeInfo iothread_info = { | ||
466 | .name = TYPE_IOTHREAD, | ||
467 | - .parent = TYPE_OBJECT, | ||
468 | + .parent = TYPE_EVENT_LOOP_BASE, | ||
469 | .class_init = iothread_class_init, | ||
470 | .instance_size = sizeof(IOThread), | ||
471 | .instance_init = iothread_instance_init, | ||
472 | .instance_finalize = iothread_instance_finalize, | ||
473 | - .interfaces = (InterfaceInfo[]) { | ||
474 | - {TYPE_USER_CREATABLE}, | ||
475 | - {} | ||
476 | - }, | ||
477 | }; | ||
478 | |||
479 | static void iothread_register_types(void) | ||
480 | @@ -XXX,XX +XXX,XX @@ static int query_one_iothread(Object *object, void *opaque) | ||
481 | info->poll_max_ns = iothread->poll_max_ns; | ||
482 | info->poll_grow = iothread->poll_grow; | ||
483 | info->poll_shrink = iothread->poll_shrink; | ||
484 | - info->aio_max_batch = iothread->aio_max_batch; | ||
485 | + info->aio_max_batch = iothread->parent_obj.aio_max_batch; | ||
486 | |||
487 | QAPI_LIST_APPEND(*tail, info); | ||
488 | return 0; | ||
100 | -- | 489 | -- |
101 | 2.26.2 | 490 | 2.35.1 |
102 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
2 | 1 | ||
3 | Current implementation of LLVM's SafeStack is not compatible with | ||
4 | code that uses an alternate stack created with sigaltstack(). | ||
5 | Since coroutine-sigaltstack relies on sigaltstack(), it is not | ||
6 | compatible with SafeStack. The resulting binary is incorrect, with | ||
7 | different coroutines sharing the same unsafe stack and producing | ||
8 | undefined behavior at runtime. | ||
9 | |||
10 | In the future LLVM may provide a SafeStack implementation compatible with | ||
11 | sigaltstack(). In the meantime, if SafeStack is desired, the coroutine | ||
12 | implementation from coroutine-ucontext should be used. | ||
13 | As a safety check, add a control in coroutine-sigaltstack to throw a | ||
14 | preprocessor #error if SafeStack is enabled and we are trying to | ||
15 | use coroutine-sigaltstack to implement coroutines. | ||
16 | |||
17 | Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
18 | Message-id: 20200529205122.714-3-dbuono@linux.vnet.ibm.com | ||
19 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
20 | --- | ||
21 | util/coroutine-sigaltstack.c | 4 ++++ | ||
22 | 1 file changed, 4 insertions(+) | ||
23 | |||
24 | diff --git a/util/coroutine-sigaltstack.c b/util/coroutine-sigaltstack.c | ||
25 | index XXXXXXX..XXXXXXX 100644 | ||
26 | --- a/util/coroutine-sigaltstack.c | ||
27 | +++ b/util/coroutine-sigaltstack.c | ||
28 | @@ -XXX,XX +XXX,XX @@ | ||
29 | #include "qemu-common.h" | ||
30 | #include "qemu/coroutine_int.h" | ||
31 | |||
32 | +#ifdef CONFIG_SAFESTACK | ||
33 | +#error "SafeStack is not compatible with code run in alternate signal stacks" | ||
34 | +#endif | ||
35 | + | ||
36 | typedef struct { | ||
37 | Coroutine base; | ||
38 | void *stack; | ||
39 | -- | ||
40 | 2.26.2 | ||
41 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
2 | 1 | ||
3 | This patch adds a flag to enable/disable the SafeStack instrumentation | ||
4 | provided by LLVM. | ||
5 | |||
6 | On enable, make sure that the compiler supports the flags, and that we | ||
7 | are using the proper coroutine implementation (coroutine-ucontext). | ||
8 | On disable, explicitly disable the option if it was enabled by default. | ||
9 | |||
10 | While SafeStack is supported only on Linux, NetBSD, FreeBSD and macOS, | ||
11 | we are not checking for the O.S. since this is already done by LLVM. | ||
12 | |||
13 | Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
14 | Message-id: 20200529205122.714-4-dbuono@linux.vnet.ibm.com | ||
15 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
16 | --- | ||
17 | configure | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ | ||
18 | 1 file changed, 73 insertions(+) | ||
19 | |||
20 | diff --git a/configure b/configure | ||
21 | index XXXXXXX..XXXXXXX 100755 | ||
22 | --- a/configure | ||
23 | +++ b/configure | ||
24 | @@ -XXX,XX +XXX,XX @@ audio_win_int="" | ||
25 | libs_qga="" | ||
26 | debug_info="yes" | ||
27 | stack_protector="" | ||
28 | +safe_stack="" | ||
29 | use_containers="yes" | ||
30 | gdb_bin=$(command -v "gdb-multiarch" || command -v "gdb") | ||
31 | |||
32 | @@ -XXX,XX +XXX,XX @@ for opt do | ||
33 | ;; | ||
34 | --disable-stack-protector) stack_protector="no" | ||
35 | ;; | ||
36 | + --enable-safe-stack) safe_stack="yes" | ||
37 | + ;; | ||
38 | + --disable-safe-stack) safe_stack="no" | ||
39 | + ;; | ||
40 | --disable-curses) curses="no" | ||
41 | ;; | ||
42 | --enable-curses) curses="yes" | ||
43 | @@ -XXX,XX +XXX,XX @@ disabled with --disable-FEATURE, default is enabled if available: | ||
44 | debug-tcg TCG debugging (default is disabled) | ||
45 | debug-info debugging information | ||
46 | sparse sparse checker | ||
47 | + safe-stack SafeStack Stack Smash Protection. Depends on | ||
48 | + clang/llvm >= 3.7 and requires coroutine backend ucontext. | ||
49 | |||
50 | gnutls GNUTLS cryptography support | ||
51 | nettle nettle cryptography support | ||
52 | @@ -XXX,XX +XXX,XX @@ if test "$debug_stack_usage" = "yes"; then | ||
53 | fi | ||
54 | fi | ||
55 | |||
56 | +################################################## | ||
57 | +# SafeStack | ||
58 | + | ||
59 | + | ||
60 | +if test "$safe_stack" = "yes"; then | ||
61 | +cat > $TMPC << EOF | ||
62 | +int main(int argc, char *argv[]) | ||
63 | +{ | ||
64 | +#if ! __has_feature(safe_stack) | ||
65 | +#error SafeStack Disabled | ||
66 | +#endif | ||
67 | + return 0; | ||
68 | +} | ||
69 | +EOF | ||
70 | + flag="-fsanitize=safe-stack" | ||
71 | + # Check that safe-stack is supported and enabled. | ||
72 | + if compile_prog "-Werror $flag" "$flag"; then | ||
73 | + # Flag needed both at compilation and at linking | ||
74 | + QEMU_CFLAGS="$QEMU_CFLAGS $flag" | ||
75 | + QEMU_LDFLAGS="$QEMU_LDFLAGS $flag" | ||
76 | + else | ||
77 | + error_exit "SafeStack not supported by your compiler" | ||
78 | + fi | ||
79 | + if test "$coroutine" != "ucontext"; then | ||
80 | + error_exit "SafeStack is only supported by the coroutine backend ucontext" | ||
81 | + fi | ||
82 | +else | ||
83 | +cat > $TMPC << EOF | ||
84 | +int main(int argc, char *argv[]) | ||
85 | +{ | ||
86 | +#if defined(__has_feature) | ||
87 | +#if __has_feature(safe_stack) | ||
88 | +#error SafeStack Enabled | ||
89 | +#endif | ||
90 | +#endif | ||
91 | + return 0; | ||
92 | +} | ||
93 | +EOF | ||
94 | +if test "$safe_stack" = "no"; then | ||
95 | + # Make sure that safe-stack is disabled | ||
96 | + if ! compile_prog "-Werror" ""; then | ||
97 | + # SafeStack was already enabled, try to explicitly remove the feature | ||
98 | + flag="-fno-sanitize=safe-stack" | ||
99 | + if ! compile_prog "-Werror $flag" "$flag"; then | ||
100 | + error_exit "Configure cannot disable SafeStack" | ||
101 | + fi | ||
102 | + QEMU_CFLAGS="$QEMU_CFLAGS $flag" | ||
103 | + QEMU_LDFLAGS="$QEMU_LDFLAGS $flag" | ||
104 | + fi | ||
105 | +else # "$safe_stack" = "" | ||
106 | + # Set safe_stack to yes or no based on pre-existing flags | ||
107 | + if compile_prog "-Werror" ""; then | ||
108 | + safe_stack="no" | ||
109 | + else | ||
110 | + safe_stack="yes" | ||
111 | + if test "$coroutine" != "ucontext"; then | ||
112 | + error_exit "SafeStack is only supported by the coroutine backend ucontext" | ||
113 | + fi | ||
114 | + fi | ||
115 | +fi | ||
116 | +fi | ||
117 | |||
118 | ########################################## | ||
119 | # check if we have open_by_handle_at | ||
120 | @@ -XXX,XX +XXX,XX @@ echo "sparse enabled $sparse" | ||
121 | echo "strip binaries $strip_opt" | ||
122 | echo "profiler $profiler" | ||
123 | echo "static build $static" | ||
124 | +echo "safe stack $safe_stack" | ||
125 | if test "$darwin" = "yes" ; then | ||
126 | echo "Cocoa support $cocoa" | ||
127 | fi | ||
128 | @@ -XXX,XX +XXX,XX @@ if test "$ccache_cpp2" = "yes"; then | ||
129 | echo "export CCACHE_CPP2=y" >> $config_host_mak | ||
130 | fi | ||
131 | |||
132 | +if test "$safe_stack" = "yes"; then | ||
133 | + echo "CONFIG_SAFESTACK=y" >> $config_host_mak | ||
134 | +fi | ||
135 | + | ||
136 | # If we're using a separate build tree, set it up now. | ||
137 | # DIRS are directories which we simply mkdir in the build tree; | ||
138 | # LINKS are things to symlink back into the source tree | ||
139 | -- | ||
140 | 2.26.2 | ||
141 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
2 | 1 | ||
3 | SafeStack is a stack protection technique implemented in llvm. It is | ||
4 | enabled with a -fsanitize flag. | ||
5 | iotests are currently disabled when any -fsanitize option is used, | ||
6 | because such options tend to produce additional warnings and false | ||
7 | positives. | ||
8 | |||
9 | While common -fsanitize options are used to verify the code and not | ||
10 | added in production, SafeStack's main use is in production environments | ||
11 | to protect against stack smashing. | ||
12 | |||
13 | Since SafeStack does not print any warning or false positive, enable | ||
14 | iotests when SafeStack is the only -fsanitize option used. | ||
15 | This is likely going to be a production binary and we want to make sure | ||
16 | it works correctly. | ||
17 | |||
18 | Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
19 | Message-id: 20200529205122.714-5-dbuono@linux.vnet.ibm.com | ||
20 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
21 | --- | ||
22 | tests/check-block.sh | 12 +++++++++++- | ||
23 | 1 file changed, 11 insertions(+), 1 deletion(-) | ||
24 | |||
25 | diff --git a/tests/check-block.sh b/tests/check-block.sh | ||
26 | index XXXXXXX..XXXXXXX 100755 | ||
27 | --- a/tests/check-block.sh | ||
28 | +++ b/tests/check-block.sh | ||
29 | @@ -XXX,XX +XXX,XX @@ if grep -q "CONFIG_GPROF=y" config-host.mak 2>/dev/null ; then | ||
30 | exit 0 | ||
31 | fi | ||
32 | |||
33 | -if grep -q "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null ; then | ||
34 | +# Disable tests with any sanitizer except for SafeStack | ||
35 | +CFLAGS=$( grep "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null ) | ||
36 | +SANITIZE_FLAGS="" | ||
37 | +#Remove all occurrencies of -fsanitize=safe-stack | ||
38 | +for i in ${CFLAGS}; do | ||
39 | + if [ "${i}" != "-fsanitize=safe-stack" ]; then | ||
40 | + SANITIZE_FLAGS="${SANITIZE_FLAGS} ${i}" | ||
41 | + fi | ||
42 | +done | ||
43 | +if echo ${SANITIZE_FLAGS} | grep -q "\-fsanitize" 2>/dev/null; then | ||
44 | + # Have a sanitize flag that is not allowed, stop | ||
45 | echo "Sanitizers are enabled ==> Not running the qemu-iotests." | ||
46 | exit 0 | ||
47 | fi | ||
48 | -- | ||
49 | 2.26.2 | ||
50 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | A lot of CPU time is spent simply locking/unlocking q->lock during | ||
2 | polling. Check for completion outside the lock to make q->lock disappear | ||
3 | from the profile. | ||
4 | 1 | ||
5 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
6 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
7 | Message-id: 20200617132201.1832152-2-stefanha@redhat.com | ||
8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | --- | ||
10 | block/nvme.c | 12 ++++++++++++ | ||
11 | 1 file changed, 12 insertions(+) | ||
12 | |||
13 | diff --git a/block/nvme.c b/block/nvme.c | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/block/nvme.c | ||
16 | +++ b/block/nvme.c | ||
17 | @@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s) | ||
18 | |||
19 | for (i = 0; i < s->nr_queues; i++) { | ||
20 | NVMeQueuePair *q = s->queues[i]; | ||
21 | + const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; | ||
22 | + NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; | ||
23 | + | ||
24 | + /* | ||
25 | + * Do an early check for completions. q->lock isn't needed because | ||
26 | + * nvme_process_completion() only runs in the event loop thread and | ||
27 | + * cannot race with itself. | ||
28 | + */ | ||
29 | + if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) { | ||
30 | + continue; | ||
31 | + } | ||
32 | + | ||
33 | qemu_mutex_lock(&q->lock); | ||
34 | while (nvme_process_completion(s, q)) { | ||
35 | /* Keep polling */ | ||
36 | -- | ||
37 | 2.26.2 | ||
38 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | nvme_process_completion() explicitly checks cid so the assertion that | ||
2 | follows is always true: | ||
3 | 1 | ||
4 | if (cid == 0 || cid > NVME_QUEUE_SIZE) { | ||
5 | ... | ||
6 | continue; | ||
7 | } | ||
8 | assert(cid <= NVME_QUEUE_SIZE); | ||
9 | |||
10 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
11 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
12 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
13 | Message-id: 20200617132201.1832152-3-stefanha@redhat.com | ||
14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
15 | --- | ||
16 | block/nvme.c | 1 - | ||
17 | 1 file changed, 1 deletion(-) | ||
18 | |||
19 | diff --git a/block/nvme.c b/block/nvme.c | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/block/nvme.c | ||
22 | +++ b/block/nvme.c | ||
23 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
24 | cid); | ||
25 | continue; | ||
26 | } | ||
27 | - assert(cid <= NVME_QUEUE_SIZE); | ||
28 | trace_nvme_complete_command(s, q->index, cid); | ||
29 | preq = &q->reqs[cid - 1]; | ||
30 | req = *preq; | ||
31 | -- | ||
32 | 2.26.2 | ||
33 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | Do not access a CQE after incrementing q->cq.head and releasing q->lock. | ||
2 | It is unlikely that this causes problems in practice but it's a latent | ||
3 | bug. | ||
4 | 1 | ||
5 | The reason why it should be safe at the moment is that completion | ||
6 | processing is not re-entrant and the CQ doorbell isn't written until the | ||
7 | end of nvme_process_completion(). | ||
8 | |||
9 | Make this change now because QEMU expects completion processing to be | ||
10 | re-entrant and later patches will do that. | ||
11 | |||
12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
13 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
14 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
15 | Message-id: 20200617132201.1832152-4-stefanha@redhat.com | ||
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
17 | --- | ||
18 | block/nvme.c | 5 ++++- | ||
19 | 1 file changed, 4 insertions(+), 1 deletion(-) | ||
20 | |||
21 | diff --git a/block/nvme.c b/block/nvme.c | ||
22 | index XXXXXXX..XXXXXXX 100644 | ||
23 | --- a/block/nvme.c | ||
24 | +++ b/block/nvme.c | ||
25 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
26 | q->busy = true; | ||
27 | assert(q->inflight >= 0); | ||
28 | while (q->inflight) { | ||
29 | + int ret; | ||
30 | int16_t cid; | ||
31 | + | ||
32 | c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES]; | ||
33 | if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) { | ||
34 | break; | ||
35 | } | ||
36 | + ret = nvme_translate_error(c); | ||
37 | q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE; | ||
38 | if (!q->cq.head) { | ||
39 | q->cq_phase = !q->cq_phase; | ||
40 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
41 | preq->busy = false; | ||
42 | preq->cb = preq->opaque = NULL; | ||
43 | qemu_mutex_unlock(&q->lock); | ||
44 | - req.cb(req.opaque, nvme_translate_error(c)); | ||
45 | + req.cb(req.opaque, ret); | ||
46 | qemu_mutex_lock(&q->lock); | ||
47 | q->inflight--; | ||
48 | progress = true; | ||
49 | -- | ||
50 | 2.26.2 | ||
51 | diff view generated by jsdifflib |
1 | Passing around both BDRVNVMeState and NVMeQueuePair is unwieldy. Reduce | 1 | From: Nicolas Saenz Julienne <nsaenzju@redhat.com> |
---|---|---|---|
2 | the number of function arguments by keeping the BDRVNVMeState pointer in | 2 | |
3 | NVMeQueuePair. This will come in handly when a BH is introduced in a | 3 | 'event-loop-base' provides basic property handling for all 'AioContext' |
4 | later patch and only one argument can be passed to it. | 4 | based event loops. So let's define a new 'MainLoopClass' that inherits |
5 | 5 | from it. This will permit tweaking the main loop's properties through | |
6 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 6 | qapi as well as through the command line using the '-object' keyword[1]. |
7 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 7 | Only one instance of 'MainLoopClass' might be created at any time. |
8 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 8 | |
9 | Message-id: 20200617132201.1832152-7-stefanha@redhat.com | 9 | 'EventLoopBaseClass' learns a new callback, 'can_be_deleted()' so as to |
10 | mark 'MainLoop' as non-deletable. | ||
11 | |||
12 | [1] For example: | ||
13 | -object main-loop,id=main-loop,aio-max-batch=<value> | ||
14 | |||
15 | Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com> | ||
16 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
17 | Acked-by: Markus Armbruster <armbru@redhat.com> | ||
18 | Message-id: 20220425075723.20019-3-nsaenzju@redhat.com | ||
10 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 19 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
11 | --- | 20 | --- |
12 | block/nvme.c | 70 ++++++++++++++++++++++++++++------------------------ | 21 | qapi/qom.json | 13 ++++++++ |
13 | 1 file changed, 38 insertions(+), 32 deletions(-) | 22 | meson.build | 3 +- |
14 | 23 | include/qemu/main-loop.h | 10 ++++++ | |
15 | diff --git a/block/nvme.c b/block/nvme.c | 24 | include/sysemu/event-loop-base.h | 1 + |
16 | index XXXXXXX..XXXXXXX 100644 | 25 | event-loop-base.c | 13 ++++++++ |
17 | --- a/block/nvme.c | 26 | util/main-loop.c | 56 ++++++++++++++++++++++++++++++++ |
18 | +++ b/block/nvme.c | 27 | 6 files changed, 95 insertions(+), 1 deletion(-) |
19 | @@ -XXX,XX +XXX,XX @@ | 28 | |
20 | */ | 29 | diff --git a/qapi/qom.json b/qapi/qom.json |
21 | #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1) | 30 | index XXXXXXX..XXXXXXX 100644 |
22 | 31 | --- a/qapi/qom.json | |
23 | +typedef struct BDRVNVMeState BDRVNVMeState; | 32 | +++ b/qapi/qom.json |
24 | + | 33 | @@ -XXX,XX +XXX,XX @@ |
25 | typedef struct { | 34 | '*poll-grow': 'int', |
26 | int32_t head, tail; | 35 | '*poll-shrink': 'int' } } |
27 | uint8_t *queue; | 36 | |
28 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 37 | +## |
29 | typedef struct { | 38 | +# @MainLoopProperties: |
30 | QemuMutex lock; | 39 | +# |
31 | 40 | +# Properties for the main-loop object. | |
32 | + /* Read from I/O code path, initialized under BQL */ | 41 | +# |
33 | + BDRVNVMeState *s; | 42 | +# Since: 7.1 |
34 | + int index; | 43 | +## |
35 | + | 44 | +{ 'struct': 'MainLoopProperties', |
36 | /* Fields protected by BQL */ | 45 | + 'base': 'EventLoopBaseProperties', |
37 | - int index; | 46 | + 'data': {} } |
38 | uint8_t *prp_list_pages; | 47 | + |
39 | 48 | ## | |
40 | /* Fields protected by @lock */ | 49 | # @MemoryBackendProperties: |
41 | @@ -XXX,XX +XXX,XX @@ typedef volatile struct { | 50 | # |
42 | 51 | @@ -XXX,XX +XXX,XX @@ | |
43 | QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000); | 52 | { 'name': 'input-linux', |
44 | 53 | 'if': 'CONFIG_LINUX' }, | |
45 | -typedef struct { | 54 | 'iothread', |
46 | +struct BDRVNVMeState { | 55 | + 'main-loop', |
47 | AioContext *aio_context; | 56 | { 'name': 'memory-backend-epc', |
48 | QEMUVFIOState *vfio; | 57 | 'if': 'CONFIG_LINUX' }, |
49 | NVMeRegs *regs; | 58 | 'memory-backend-file', |
50 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 59 | @@ -XXX,XX +XXX,XX @@ |
51 | 60 | 'input-linux': { 'type': 'InputLinuxProperties', | |
52 | /* PCI address (required for nvme_refresh_filename()) */ | 61 | 'if': 'CONFIG_LINUX' }, |
53 | char *device; | 62 | 'iothread': 'IothreadProperties', |
54 | -} BDRVNVMeState; | 63 | + 'main-loop': 'MainLoopProperties', |
64 | 'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties', | ||
65 | 'if': 'CONFIG_LINUX' }, | ||
66 | 'memory-backend-file': 'MemoryBackendFileProperties', | ||
67 | diff --git a/meson.build b/meson.build | ||
68 | index XXXXXXX..XXXXXXX 100644 | ||
69 | --- a/meson.build | ||
70 | +++ b/meson.build | ||
71 | @@ -XXX,XX +XXX,XX @@ libqemuutil = static_library('qemuutil', | ||
72 | sources: util_ss.sources() + stub_ss.sources() + genh, | ||
73 | dependencies: [util_ss.dependencies(), libm, threads, glib, socket, malloc, pixman]) | ||
74 | qemuutil = declare_dependency(link_with: libqemuutil, | ||
75 | - sources: genh + version_res) | ||
76 | + sources: genh + version_res, | ||
77 | + dependencies: [event_loop_base]) | ||
78 | |||
79 | if have_system or have_user | ||
80 | decodetree = generator(find_program('scripts/decodetree.py'), | ||
81 | diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h | ||
82 | index XXXXXXX..XXXXXXX 100644 | ||
83 | --- a/include/qemu/main-loop.h | ||
84 | +++ b/include/qemu/main-loop.h | ||
85 | @@ -XXX,XX +XXX,XX @@ | ||
86 | #define QEMU_MAIN_LOOP_H | ||
87 | |||
88 | #include "block/aio.h" | ||
89 | +#include "qom/object.h" | ||
90 | +#include "sysemu/event-loop-base.h" | ||
91 | |||
92 | #define SIG_IPI SIGUSR1 | ||
93 | |||
94 | +#define TYPE_MAIN_LOOP "main-loop" | ||
95 | +OBJECT_DECLARE_TYPE(MainLoop, MainLoopClass, MAIN_LOOP) | ||
96 | + | ||
97 | +struct MainLoop { | ||
98 | + EventLoopBase parent_obj; | ||
55 | +}; | 99 | +}; |
56 | 100 | +typedef struct MainLoop MainLoop; | |
57 | #define NVME_BLOCK_OPT_DEVICE "device" | 101 | + |
58 | #define NVME_BLOCK_OPT_NAMESPACE "namespace" | 102 | /** |
59 | @@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q, | 103 | * qemu_init_main_loop: Set up the process so that it can run the main loop. |
104 | * | ||
105 | diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h | ||
106 | index XXXXXXX..XXXXXXX 100644 | ||
107 | --- a/include/sysemu/event-loop-base.h | ||
108 | +++ b/include/sysemu/event-loop-base.h | ||
109 | @@ -XXX,XX +XXX,XX @@ struct EventLoopBaseClass { | ||
110 | |||
111 | void (*init)(EventLoopBase *base, Error **errp); | ||
112 | void (*update_params)(EventLoopBase *base, Error **errp); | ||
113 | + bool (*can_be_deleted)(EventLoopBase *base); | ||
114 | }; | ||
115 | |||
116 | struct EventLoopBase { | ||
117 | diff --git a/event-loop-base.c b/event-loop-base.c | ||
118 | index XXXXXXX..XXXXXXX 100644 | ||
119 | --- a/event-loop-base.c | ||
120 | +++ b/event-loop-base.c | ||
121 | @@ -XXX,XX +XXX,XX @@ static void event_loop_base_complete(UserCreatable *uc, Error **errp) | ||
60 | } | 122 | } |
61 | } | 123 | } |
62 | 124 | ||
63 | -static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q) | 125 | +static bool event_loop_base_can_be_deleted(UserCreatable *uc) |
64 | +static void nvme_free_queue_pair(NVMeQueuePair *q) | 126 | +{ |
127 | + EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(uc); | ||
128 | + EventLoopBase *backend = EVENT_LOOP_BASE(uc); | ||
129 | + | ||
130 | + if (bc->can_be_deleted) { | ||
131 | + return bc->can_be_deleted(backend); | ||
132 | + } | ||
133 | + | ||
134 | + return true; | ||
135 | +} | ||
136 | + | ||
137 | static void event_loop_base_class_init(ObjectClass *klass, void *class_data) | ||
65 | { | 138 | { |
66 | qemu_vfree(q->prp_list_pages); | 139 | UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass); |
67 | qemu_vfree(q->sq.queue); | 140 | ucc->complete = event_loop_base_complete; |
68 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | 141 | + ucc->can_be_deleted = event_loop_base_can_be_deleted; |
69 | uint64_t prp_list_iova; | 142 | |
70 | 143 | object_class_property_add(klass, "aio-max-batch", "int", | |
71 | qemu_mutex_init(&q->lock); | 144 | event_loop_base_get_param, |
72 | + q->s = s; | 145 | diff --git a/util/main-loop.c b/util/main-loop.c |
73 | q->index = idx; | 146 | index XXXXXXX..XXXXXXX 100644 |
74 | qemu_co_queue_init(&q->free_req_queue); | 147 | --- a/util/main-loop.c |
75 | q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); | 148 | +++ b/util/main-loop.c |
76 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | 149 | @@ -XXX,XX +XXX,XX @@ |
77 | 150 | #include "qemu/error-report.h" | |
78 | return q; | 151 | #include "qemu/queue.h" |
79 | fail: | 152 | #include "qemu/compiler.h" |
80 | - nvme_free_queue_pair(bs, q); | 153 | +#include "qom/object.h" |
81 | + nvme_free_queue_pair(q); | 154 | |
82 | return NULL; | 155 | #ifndef _WIN32 |
156 | #include <sys/wait.h> | ||
157 | @@ -XXX,XX +XXX,XX @@ int qemu_init_main_loop(Error **errp) | ||
158 | return 0; | ||
83 | } | 159 | } |
84 | 160 | ||
85 | /* With q->lock */ | 161 | +static void main_loop_update_params(EventLoopBase *base, Error **errp) |
86 | -static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q) | 162 | +{ |
87 | +static void nvme_kick(NVMeQueuePair *q) | 163 | + if (!qemu_aio_context) { |
88 | { | 164 | + error_setg(errp, "qemu aio context not ready"); |
89 | + BDRVNVMeState *s = q->s; | 165 | + return; |
90 | + | 166 | + } |
91 | if (s->plugged || !q->need_kick) { | 167 | + |
92 | return; | 168 | + aio_context_set_aio_params(qemu_aio_context, base->aio_max_batch, errp); |
93 | } | 169 | +} |
94 | @@ -XXX,XX +XXX,XX @@ static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req) | 170 | + |
95 | } | 171 | +MainLoop *mloop; |
96 | 172 | + | |
97 | /* With q->lock */ | 173 | +static void main_loop_init(EventLoopBase *base, Error **errp) |
98 | -static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q) | 174 | +{ |
99 | +static void nvme_wake_free_req_locked(NVMeQueuePair *q) | 175 | + MainLoop *m = MAIN_LOOP(base); |
100 | { | 176 | + |
101 | if (!qemu_co_queue_empty(&q->free_req_queue)) { | 177 | + if (mloop) { |
102 | - replay_bh_schedule_oneshot_event(s->aio_context, | 178 | + error_setg(errp, "only one main-loop instance allowed"); |
103 | + replay_bh_schedule_oneshot_event(q->s->aio_context, | 179 | + return; |
104 | nvme_free_req_queue_cb, q); | 180 | + } |
105 | } | 181 | + |
106 | } | 182 | + main_loop_update_params(base, errp); |
107 | 183 | + | |
108 | /* Insert a request in the freelist and wake waiters */ | 184 | + mloop = m; |
109 | -static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q, | 185 | + return; |
110 | - NVMeRequest *req) | 186 | +} |
111 | +static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req) | 187 | + |
112 | { | 188 | +static bool main_loop_can_be_deleted(EventLoopBase *base) |
113 | qemu_mutex_lock(&q->lock); | 189 | +{ |
114 | nvme_put_free_req_locked(q, req); | 190 | + return false; |
115 | - nvme_wake_free_req_locked(s, q); | 191 | +} |
116 | + nvme_wake_free_req_locked(q); | 192 | + |
117 | qemu_mutex_unlock(&q->lock); | 193 | +static void main_loop_class_init(ObjectClass *oc, void *class_data) |
118 | } | 194 | +{ |
119 | 195 | + EventLoopBaseClass *bc = EVENT_LOOP_BASE_CLASS(oc); | |
120 | @@ -XXX,XX +XXX,XX @@ static inline int nvme_translate_error(const NvmeCqe *c) | 196 | + |
121 | } | 197 | + bc->init = main_loop_init; |
122 | 198 | + bc->update_params = main_loop_update_params; | |
123 | /* With q->lock */ | 199 | + bc->can_be_deleted = main_loop_can_be_deleted; |
124 | -static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | 200 | +} |
125 | +static bool nvme_process_completion(NVMeQueuePair *q) | 201 | + |
126 | { | 202 | +static const TypeInfo main_loop_info = { |
127 | + BDRVNVMeState *s = q->s; | 203 | + .name = TYPE_MAIN_LOOP, |
128 | bool progress = false; | 204 | + .parent = TYPE_EVENT_LOOP_BASE, |
129 | NVMeRequest *preq; | 205 | + .class_init = main_loop_class_init, |
130 | NVMeRequest req; | 206 | + .instance_size = sizeof(MainLoop), |
131 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | 207 | +}; |
132 | /* Notify the device so it can post more completions. */ | 208 | + |
133 | smp_mb_release(); | 209 | +static void main_loop_register_types(void) |
134 | *q->cq.doorbell = cpu_to_le32(q->cq.head); | 210 | +{ |
135 | - nvme_wake_free_req_locked(s, q); | 211 | + type_register_static(&main_loop_info); |
136 | + nvme_wake_free_req_locked(q); | 212 | +} |
137 | } | 213 | + |
138 | q->busy = false; | 214 | +type_init(main_loop_register_types) |
139 | return progress; | 215 | + |
140 | @@ -XXX,XX +XXX,XX @@ static void nvme_trace_command(const NvmeCmd *cmd) | 216 | static int max_priority; |
141 | } | 217 | |
142 | } | 218 | #ifndef _WIN32 |
143 | |||
144 | -static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q, | ||
145 | - NVMeRequest *req, | ||
146 | +static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, | ||
147 | NvmeCmd *cmd, BlockCompletionFunc cb, | ||
148 | void *opaque) | ||
149 | { | ||
150 | @@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q, | ||
151 | req->opaque = opaque; | ||
152 | cmd->cid = cpu_to_le32(req->cid); | ||
153 | |||
154 | - trace_nvme_submit_command(s, q->index, req->cid); | ||
155 | + trace_nvme_submit_command(q->s, q->index, req->cid); | ||
156 | nvme_trace_command(cmd); | ||
157 | qemu_mutex_lock(&q->lock); | ||
158 | memcpy((uint8_t *)q->sq.queue + | ||
159 | q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd)); | ||
160 | q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE; | ||
161 | q->need_kick++; | ||
162 | - nvme_kick(s, q); | ||
163 | - nvme_process_completion(s, q); | ||
164 | + nvme_kick(q); | ||
165 | + nvme_process_completion(q); | ||
166 | qemu_mutex_unlock(&q->lock); | ||
167 | } | ||
168 | |||
169 | @@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, | ||
170 | NvmeCmd *cmd) | ||
171 | { | ||
172 | NVMeRequest *req; | ||
173 | - BDRVNVMeState *s = bs->opaque; | ||
174 | int ret = -EINPROGRESS; | ||
175 | req = nvme_get_free_req(q); | ||
176 | if (!req) { | ||
177 | return -EBUSY; | ||
178 | } | ||
179 | - nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret); | ||
180 | + nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret); | ||
181 | |||
182 | BDRV_POLL_WHILE(bs, ret == -EINPROGRESS); | ||
183 | return ret; | ||
184 | @@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s) | ||
185 | } | ||
186 | |||
187 | qemu_mutex_lock(&q->lock); | ||
188 | - while (nvme_process_completion(s, q)) { | ||
189 | + while (nvme_process_completion(q)) { | ||
190 | /* Keep polling */ | ||
191 | progress = true; | ||
192 | } | ||
193 | @@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) | ||
194 | }; | ||
195 | if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { | ||
196 | error_setg(errp, "Failed to create io queue [%d]", n); | ||
197 | - nvme_free_queue_pair(bs, q); | ||
198 | + nvme_free_queue_pair(q); | ||
199 | return false; | ||
200 | } | ||
201 | cmd = (NvmeCmd) { | ||
202 | @@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) | ||
203 | }; | ||
204 | if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { | ||
205 | error_setg(errp, "Failed to create io queue [%d]", n); | ||
206 | - nvme_free_queue_pair(bs, q); | ||
207 | + nvme_free_queue_pair(q); | ||
208 | return false; | ||
209 | } | ||
210 | s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); | ||
211 | @@ -XXX,XX +XXX,XX @@ static void nvme_close(BlockDriverState *bs) | ||
212 | BDRVNVMeState *s = bs->opaque; | ||
213 | |||
214 | for (i = 0; i < s->nr_queues; ++i) { | ||
215 | - nvme_free_queue_pair(bs, s->queues[i]); | ||
216 | + nvme_free_queue_pair(s->queues[i]); | ||
217 | } | ||
218 | g_free(s->queues); | ||
219 | aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, | ||
220 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, | ||
221 | r = nvme_cmd_map_qiov(bs, &cmd, req, qiov); | ||
222 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
223 | if (r) { | ||
224 | - nvme_put_free_req_and_wake(s, ioq, req); | ||
225 | + nvme_put_free_req_and_wake(ioq, req); | ||
226 | return r; | ||
227 | } | ||
228 | - nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
229 | + nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); | ||
230 | |||
231 | data.co = qemu_coroutine_self(); | ||
232 | while (data.ret == -EINPROGRESS) { | ||
233 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs) | ||
234 | assert(s->nr_queues > 1); | ||
235 | req = nvme_get_free_req(ioq); | ||
236 | assert(req); | ||
237 | - nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
238 | + nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); | ||
239 | |||
240 | data.co = qemu_coroutine_self(); | ||
241 | if (data.ret == -EINPROGRESS) { | ||
242 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, | ||
243 | req = nvme_get_free_req(ioq); | ||
244 | assert(req); | ||
245 | |||
246 | - nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
247 | + nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); | ||
248 | |||
249 | data.co = qemu_coroutine_self(); | ||
250 | while (data.ret == -EINPROGRESS) { | ||
251 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, | ||
252 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
253 | |||
254 | if (ret) { | ||
255 | - nvme_put_free_req_and_wake(s, ioq, req); | ||
256 | + nvme_put_free_req_and_wake(ioq, req); | ||
257 | goto out; | ||
258 | } | ||
259 | |||
260 | trace_nvme_dsm(s, offset, bytes); | ||
261 | |||
262 | - nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
263 | + nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); | ||
264 | |||
265 | data.co = qemu_coroutine_self(); | ||
266 | while (data.ret == -EINPROGRESS) { | ||
267 | @@ -XXX,XX +XXX,XX @@ static void nvme_aio_unplug(BlockDriverState *bs) | ||
268 | for (i = 1; i < s->nr_queues; i++) { | ||
269 | NVMeQueuePair *q = s->queues[i]; | ||
270 | qemu_mutex_lock(&q->lock); | ||
271 | - nvme_kick(s, q); | ||
272 | - nvme_process_completion(s, q); | ||
273 | + nvme_kick(q); | ||
274 | + nvme_process_completion(q); | ||
275 | qemu_mutex_unlock(&q->lock); | ||
276 | } | ||
277 | } | ||
278 | -- | 219 | -- |
279 | 2.26.2 | 220 | 2.35.1 |
280 | diff view generated by jsdifflib |
1 | There are three issues with the current NVMeRequest->busy field: | 1 | From: Nicolas Saenz Julienne <nsaenzju@redhat.com> |
---|---|---|---|
2 | 1. The busy field is accidentally accessed outside q->lock when request | ||
3 | submission fails. | ||
4 | 2. Waiters on free_req_queue are not woken when a request is returned | ||
5 | early due to submission failure. | ||
6 | 2. Finding a free request involves scanning all requests. This makes | ||
7 | request submission O(n^2). | ||
8 | 2 | ||
9 | Switch to an O(1) freelist that is always accessed under the lock. | 3 | The thread pool regulates itself: when idle, it kills threads until |
4 | empty, when in demand, it creates new threads until full. This behaviour | ||
5 | doesn't play well with latency sensitive workloads where the price of | ||
6 | creating a new thread is too high. For example, when paired with qemu's | ||
7 | '-mlock', or using safety features like SafeStack, creating a new thread | ||
8 | has been measured take multiple milliseconds. | ||
10 | 9 | ||
11 | Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and | 10 | In order to mitigate this let's introduce a new 'EventLoopBase' |
12 | NVME_NUM_REQS, the number of usable requests. This makes the code | 11 | property to set the thread pool size. The threads will be created during |
13 | simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind | 12 | the pool's initialization or upon updating the property's value, remain |
14 | that one slot is reserved. | 13 | available during its lifetime regardless of demand, and destroyed upon |
14 | freeing it. A properly characterized workload will then be able to | ||
15 | configure the pool to avoid any latency spikes. | ||
15 | 16 | ||
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 17 | Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com> |
17 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 18 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
18 | Message-id: 20200617132201.1832152-5-stefanha@redhat.com | 19 | Acked-by: Markus Armbruster <armbru@redhat.com> |
20 | Message-id: 20220425075723.20019-4-nsaenzju@redhat.com | ||
19 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 21 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
20 | --- | 22 | --- |
21 | block/nvme.c | 81 ++++++++++++++++++++++++++++++++++------------------ | 23 | qapi/qom.json | 10 +++++- |
22 | 1 file changed, 54 insertions(+), 27 deletions(-) | 24 | include/block/aio.h | 10 ++++++ |
25 | include/block/thread-pool.h | 3 ++ | ||
26 | include/sysemu/event-loop-base.h | 4 +++ | ||
27 | event-loop-base.c | 23 +++++++++++++ | ||
28 | iothread.c | 3 ++ | ||
29 | util/aio-posix.c | 1 + | ||
30 | util/async.c | 20 ++++++++++++ | ||
31 | util/main-loop.c | 9 ++++++ | ||
32 | util/thread-pool.c | 55 +++++++++++++++++++++++++++++--- | ||
33 | 10 files changed, 133 insertions(+), 5 deletions(-) | ||
23 | 34 | ||
24 | diff --git a/block/nvme.c b/block/nvme.c | 35 | diff --git a/qapi/qom.json b/qapi/qom.json |
25 | index XXXXXXX..XXXXXXX 100644 | 36 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/block/nvme.c | 37 | --- a/qapi/qom.json |
27 | +++ b/block/nvme.c | 38 | +++ b/qapi/qom.json |
28 | @@ -XXX,XX +XXX,XX @@ | 39 | @@ -XXX,XX +XXX,XX @@ |
29 | #define NVME_QUEUE_SIZE 128 | 40 | # 0 means that the engine will use its default. |
30 | #define NVME_BAR_SIZE 8192 | 41 | # (default: 0) |
31 | 42 | # | |
32 | +/* | 43 | +# @thread-pool-min: minimum number of threads reserved in the thread pool |
33 | + * We have to leave one slot empty as that is the full queue case where | 44 | +# (default: 0) |
34 | + * head == tail + 1. | 45 | +# |
46 | +# @thread-pool-max: maximum number of threads the thread pool can contain | ||
47 | +# (default: 64) | ||
48 | +# | ||
49 | # Since: 7.1 | ||
50 | ## | ||
51 | { 'struct': 'EventLoopBaseProperties', | ||
52 | - 'data': { '*aio-max-batch': 'int' } } | ||
53 | + 'data': { '*aio-max-batch': 'int', | ||
54 | + '*thread-pool-min': 'int', | ||
55 | + '*thread-pool-max': 'int' } } | ||
56 | |||
57 | ## | ||
58 | # @IothreadProperties: | ||
59 | diff --git a/include/block/aio.h b/include/block/aio.h | ||
60 | index XXXXXXX..XXXXXXX 100644 | ||
61 | --- a/include/block/aio.h | ||
62 | +++ b/include/block/aio.h | ||
63 | @@ -XXX,XX +XXX,XX @@ struct AioContext { | ||
64 | QSLIST_HEAD(, Coroutine) scheduled_coroutines; | ||
65 | QEMUBH *co_schedule_bh; | ||
66 | |||
67 | + int thread_pool_min; | ||
68 | + int thread_pool_max; | ||
69 | /* Thread pool for performing work and receiving completion callbacks. | ||
70 | * Has its own locking. | ||
71 | */ | ||
72 | @@ -XXX,XX +XXX,XX @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, | ||
73 | void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch, | ||
74 | Error **errp); | ||
75 | |||
76 | +/** | ||
77 | + * aio_context_set_thread_pool_params: | ||
78 | + * @ctx: the aio context | ||
79 | + * @min: min number of threads to have readily available in the thread pool | ||
80 | + * @max: max number of threads the thread pool can contain | ||
35 | + */ | 81 | + */ |
36 | +#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1) | 82 | +void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min, |
37 | + | 83 | + int64_t max, Error **errp); |
38 | typedef struct { | 84 | #endif |
39 | int32_t head, tail; | 85 | diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h |
40 | uint8_t *queue; | 86 | index XXXXXXX..XXXXXXX 100644 |
41 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 87 | --- a/include/block/thread-pool.h |
42 | int cid; | 88 | +++ b/include/block/thread-pool.h |
43 | void *prp_list_page; | 89 | @@ -XXX,XX +XXX,XX @@ |
44 | uint64_t prp_list_iova; | 90 | |
45 | - bool busy; | 91 | #include "block/block.h" |
46 | + int free_req_next; /* q->reqs[] index of next free req */ | 92 | |
47 | } NVMeRequest; | 93 | +#define THREAD_POOL_MAX_THREADS_DEFAULT 64 |
94 | + | ||
95 | typedef int ThreadPoolFunc(void *opaque); | ||
96 | |||
97 | typedef struct ThreadPool ThreadPool; | ||
98 | @@ -XXX,XX +XXX,XX @@ BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool, | ||
99 | int coroutine_fn thread_pool_submit_co(ThreadPool *pool, | ||
100 | ThreadPoolFunc *func, void *arg); | ||
101 | void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg); | ||
102 | +void thread_pool_update_params(ThreadPool *pool, struct AioContext *ctx); | ||
103 | |||
104 | #endif | ||
105 | diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h | ||
106 | index XXXXXXX..XXXXXXX 100644 | ||
107 | --- a/include/sysemu/event-loop-base.h | ||
108 | +++ b/include/sysemu/event-loop-base.h | ||
109 | @@ -XXX,XX +XXX,XX @@ struct EventLoopBase { | ||
110 | |||
111 | /* AioContext AIO engine parameters */ | ||
112 | int64_t aio_max_batch; | ||
113 | + | ||
114 | + /* AioContext thread pool parameters */ | ||
115 | + int64_t thread_pool_min; | ||
116 | + int64_t thread_pool_max; | ||
117 | }; | ||
118 | #endif | ||
119 | diff --git a/event-loop-base.c b/event-loop-base.c | ||
120 | index XXXXXXX..XXXXXXX 100644 | ||
121 | --- a/event-loop-base.c | ||
122 | +++ b/event-loop-base.c | ||
123 | @@ -XXX,XX +XXX,XX @@ | ||
124 | #include "qemu/osdep.h" | ||
125 | #include "qom/object_interfaces.h" | ||
126 | #include "qapi/error.h" | ||
127 | +#include "block/thread-pool.h" | ||
128 | #include "sysemu/event-loop-base.h" | ||
48 | 129 | ||
49 | typedef struct { | 130 | typedef struct { |
50 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 131 | @@ -XXX,XX +XXX,XX @@ typedef struct { |
51 | /* Fields protected by @lock */ | 132 | ptrdiff_t offset; /* field's byte offset in EventLoopBase struct */ |
52 | NVMeQueue sq, cq; | 133 | } EventLoopBaseParamInfo; |
53 | int cq_phase; | 134 | |
54 | - NVMeRequest reqs[NVME_QUEUE_SIZE]; | 135 | +static void event_loop_base_instance_init(Object *obj) |
55 | + int free_req_head; | 136 | +{ |
56 | + NVMeRequest reqs[NVME_NUM_REQS]; | 137 | + EventLoopBase *base = EVENT_LOOP_BASE(obj); |
57 | bool busy; | 138 | + |
58 | int need_kick; | 139 | + base->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT; |
59 | int inflight; | 140 | +} |
60 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | 141 | + |
61 | qemu_mutex_init(&q->lock); | 142 | static EventLoopBaseParamInfo aio_max_batch_info = { |
62 | q->index = idx; | 143 | "aio-max-batch", offsetof(EventLoopBase, aio_max_batch), |
63 | qemu_co_queue_init(&q->free_req_queue); | 144 | }; |
64 | - q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE); | 145 | +static EventLoopBaseParamInfo thread_pool_min_info = { |
65 | + q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); | 146 | + "thread-pool-min", offsetof(EventLoopBase, thread_pool_min), |
66 | r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, | 147 | +}; |
67 | - s->page_size * NVME_QUEUE_SIZE, | 148 | +static EventLoopBaseParamInfo thread_pool_max_info = { |
68 | + s->page_size * NVME_NUM_REQS, | 149 | + "thread-pool-max", offsetof(EventLoopBase, thread_pool_max), |
69 | false, &prp_list_iova); | 150 | +}; |
70 | if (r) { | 151 | |
71 | goto fail; | 152 | static void event_loop_base_get_param(Object *obj, Visitor *v, |
153 | const char *name, void *opaque, Error **errp) | ||
154 | @@ -XXX,XX +XXX,XX @@ static void event_loop_base_class_init(ObjectClass *klass, void *class_data) | ||
155 | event_loop_base_get_param, | ||
156 | event_loop_base_set_param, | ||
157 | NULL, &aio_max_batch_info); | ||
158 | + object_class_property_add(klass, "thread-pool-min", "int", | ||
159 | + event_loop_base_get_param, | ||
160 | + event_loop_base_set_param, | ||
161 | + NULL, &thread_pool_min_info); | ||
162 | + object_class_property_add(klass, "thread-pool-max", "int", | ||
163 | + event_loop_base_get_param, | ||
164 | + event_loop_base_set_param, | ||
165 | + NULL, &thread_pool_max_info); | ||
166 | } | ||
167 | |||
168 | static const TypeInfo event_loop_base_info = { | ||
169 | .name = TYPE_EVENT_LOOP_BASE, | ||
170 | .parent = TYPE_OBJECT, | ||
171 | .instance_size = sizeof(EventLoopBase), | ||
172 | + .instance_init = event_loop_base_instance_init, | ||
173 | .class_size = sizeof(EventLoopBaseClass), | ||
174 | .class_init = event_loop_base_class_init, | ||
175 | .abstract = true, | ||
176 | diff --git a/iothread.c b/iothread.c | ||
177 | index XXXXXXX..XXXXXXX 100644 | ||
178 | --- a/iothread.c | ||
179 | +++ b/iothread.c | ||
180 | @@ -XXX,XX +XXX,XX @@ static void iothread_set_aio_context_params(EventLoopBase *base, Error **errp) | ||
181 | aio_context_set_aio_params(iothread->ctx, | ||
182 | iothread->parent_obj.aio_max_batch, | ||
183 | errp); | ||
184 | + | ||
185 | + aio_context_set_thread_pool_params(iothread->ctx, base->thread_pool_min, | ||
186 | + base->thread_pool_max, errp); | ||
187 | } | ||
188 | |||
189 | |||
190 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
191 | index XXXXXXX..XXXXXXX 100644 | ||
192 | --- a/util/aio-posix.c | ||
193 | +++ b/util/aio-posix.c | ||
194 | @@ -XXX,XX +XXX,XX @@ | ||
195 | |||
196 | #include "qemu/osdep.h" | ||
197 | #include "block/block.h" | ||
198 | +#include "block/thread-pool.h" | ||
199 | #include "qemu/main-loop.h" | ||
200 | #include "qemu/rcu.h" | ||
201 | #include "qemu/rcu_queue.h" | ||
202 | diff --git a/util/async.c b/util/async.c | ||
203 | index XXXXXXX..XXXXXXX 100644 | ||
204 | --- a/util/async.c | ||
205 | +++ b/util/async.c | ||
206 | @@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp) | ||
207 | |||
208 | ctx->aio_max_batch = 0; | ||
209 | |||
210 | + ctx->thread_pool_min = 0; | ||
211 | + ctx->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT; | ||
212 | + | ||
213 | return ctx; | ||
214 | fail: | ||
215 | g_source_destroy(&ctx->source); | ||
216 | @@ -XXX,XX +XXX,XX @@ void qemu_set_current_aio_context(AioContext *ctx) | ||
217 | assert(!get_my_aiocontext()); | ||
218 | set_my_aiocontext(ctx); | ||
219 | } | ||
220 | + | ||
221 | +void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min, | ||
222 | + int64_t max, Error **errp) | ||
223 | +{ | ||
224 | + | ||
225 | + if (min > max || !max || min > INT_MAX || max > INT_MAX) { | ||
226 | + error_setg(errp, "bad thread-pool-min/thread-pool-max values"); | ||
227 | + return; | ||
228 | + } | ||
229 | + | ||
230 | + ctx->thread_pool_min = min; | ||
231 | + ctx->thread_pool_max = max; | ||
232 | + | ||
233 | + if (ctx->thread_pool) { | ||
234 | + thread_pool_update_params(ctx->thread_pool, ctx); | ||
235 | + } | ||
236 | +} | ||
237 | diff --git a/util/main-loop.c b/util/main-loop.c | ||
238 | index XXXXXXX..XXXXXXX 100644 | ||
239 | --- a/util/main-loop.c | ||
240 | +++ b/util/main-loop.c | ||
241 | @@ -XXX,XX +XXX,XX @@ | ||
242 | #include "sysemu/replay.h" | ||
243 | #include "qemu/main-loop.h" | ||
244 | #include "block/aio.h" | ||
245 | +#include "block/thread-pool.h" | ||
246 | #include "qemu/error-report.h" | ||
247 | #include "qemu/queue.h" | ||
248 | #include "qemu/compiler.h" | ||
249 | @@ -XXX,XX +XXX,XX @@ int qemu_init_main_loop(Error **errp) | ||
250 | |||
251 | static void main_loop_update_params(EventLoopBase *base, Error **errp) | ||
252 | { | ||
253 | + ERRP_GUARD(); | ||
254 | + | ||
255 | if (!qemu_aio_context) { | ||
256 | error_setg(errp, "qemu aio context not ready"); | ||
257 | return; | ||
72 | } | 258 | } |
73 | - for (i = 0; i < NVME_QUEUE_SIZE; i++) { | 259 | |
74 | + q->free_req_head = -1; | 260 | aio_context_set_aio_params(qemu_aio_context, base->aio_max_batch, errp); |
75 | + for (i = 0; i < NVME_NUM_REQS; i++) { | 261 | + if (*errp) { |
76 | NVMeRequest *req = &q->reqs[i]; | 262 | + return; |
77 | req->cid = i + 1; | 263 | + } |
78 | + req->free_req_next = q->free_req_head; | 264 | + |
79 | + q->free_req_head = i; | 265 | + aio_context_set_thread_pool_params(qemu_aio_context, base->thread_pool_min, |
80 | req->prp_list_page = q->prp_list_pages + i * s->page_size; | 266 | + base->thread_pool_max, errp); |
81 | req->prp_list_iova = prp_list_iova + i * s->page_size; | 267 | } |
82 | } | 268 | |
83 | + | 269 | MainLoop *mloop; |
84 | nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err); | 270 | diff --git a/util/thread-pool.c b/util/thread-pool.c |
85 | if (local_err) { | 271 | index XXXXXXX..XXXXXXX 100644 |
86 | error_propagate(errp, local_err); | 272 | --- a/util/thread-pool.c |
87 | @@ -XXX,XX +XXX,XX @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q) | 273 | +++ b/util/thread-pool.c |
88 | */ | 274 | @@ -XXX,XX +XXX,XX @@ struct ThreadPool { |
89 | static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) | 275 | QemuMutex lock; |
276 | QemuCond worker_stopped; | ||
277 | QemuSemaphore sem; | ||
278 | - int max_threads; | ||
279 | QEMUBH *new_thread_bh; | ||
280 | |||
281 | /* The following variables are only accessed from one AioContext. */ | ||
282 | @@ -XXX,XX +XXX,XX @@ struct ThreadPool { | ||
283 | int new_threads; /* backlog of threads we need to create */ | ||
284 | int pending_threads; /* threads created but not running yet */ | ||
285 | bool stopping; | ||
286 | + int min_threads; | ||
287 | + int max_threads; | ||
288 | }; | ||
289 | |||
290 | +static inline bool back_to_sleep(ThreadPool *pool, int ret) | ||
291 | +{ | ||
292 | + /* | ||
293 | + * The semaphore timed out, we should exit the loop except when: | ||
294 | + * - There is work to do, we raced with the signal. | ||
295 | + * - The max threads threshold just changed, we raced with the signal. | ||
296 | + * - The thread pool forces a minimum number of readily available threads. | ||
297 | + */ | ||
298 | + if (ret == -1 && (!QTAILQ_EMPTY(&pool->request_list) || | ||
299 | + pool->cur_threads > pool->max_threads || | ||
300 | + pool->cur_threads <= pool->min_threads)) { | ||
301 | + return true; | ||
302 | + } | ||
303 | + | ||
304 | + return false; | ||
305 | +} | ||
306 | + | ||
307 | static void *worker_thread(void *opaque) | ||
90 | { | 308 | { |
91 | - int i; | 309 | ThreadPool *pool = opaque; |
92 | - NVMeRequest *req = NULL; | 310 | @@ -XXX,XX +XXX,XX @@ static void *worker_thread(void *opaque) |
93 | + NVMeRequest *req; | 311 | ret = qemu_sem_timedwait(&pool->sem, 10000); |
94 | 312 | qemu_mutex_lock(&pool->lock); | |
95 | qemu_mutex_lock(&q->lock); | 313 | pool->idle_threads--; |
96 | - while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) { | 314 | - } while (ret == -1 && !QTAILQ_EMPTY(&pool->request_list)); |
97 | - /* We have to leave one slot empty as that is the full queue case (head | 315 | - if (ret == -1 || pool->stopping) { |
98 | - * == tail + 1). */ | 316 | + } while (back_to_sleep(pool, ret)); |
99 | + | 317 | + if (ret == -1 || pool->stopping || |
100 | + while (q->free_req_head == -1) { | 318 | + pool->cur_threads > pool->max_threads) { |
101 | if (qemu_in_coroutine()) { | 319 | break; |
102 | trace_nvme_free_req_queue_wait(q); | ||
103 | qemu_co_queue_wait(&q->free_req_queue, &q->lock); | ||
104 | @@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) | ||
105 | return NULL; | ||
106 | } | 320 | } |
107 | } | 321 | |
108 | - for (i = 0; i < NVME_QUEUE_SIZE; i++) { | 322 | @@ -XXX,XX +XXX,XX @@ void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg) |
109 | - if (!q->reqs[i].busy) { | 323 | thread_pool_submit_aio(pool, func, arg, NULL, NULL); |
110 | - q->reqs[i].busy = true; | 324 | } |
111 | - req = &q->reqs[i]; | 325 | |
112 | - break; | 326 | +void thread_pool_update_params(ThreadPool *pool, AioContext *ctx) |
113 | - } | ||
114 | - } | ||
115 | - /* We have checked inflight and need_kick while holding q->lock, so one | ||
116 | - * free req must be available. */ | ||
117 | - assert(req); | ||
118 | + | ||
119 | + req = &q->reqs[q->free_req_head]; | ||
120 | + q->free_req_head = req->free_req_next; | ||
121 | + req->free_req_next = -1; | ||
122 | + | ||
123 | qemu_mutex_unlock(&q->lock); | ||
124 | return req; | ||
125 | } | ||
126 | |||
127 | +/* With q->lock */ | ||
128 | +static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req) | ||
129 | +{ | 327 | +{ |
130 | + req->free_req_next = q->free_req_head; | 328 | + qemu_mutex_lock(&pool->lock); |
131 | + q->free_req_head = req - q->reqs; | 329 | + |
330 | + pool->min_threads = ctx->thread_pool_min; | ||
331 | + pool->max_threads = ctx->thread_pool_max; | ||
332 | + | ||
333 | + /* | ||
334 | + * We either have to: | ||
335 | + * - Increase the number of available threads until over the min_threads | ||
336 | + * threshold. | ||
337 | + * - Decrease the number of available threads until under the max_threads | ||
338 | + * threshold. | ||
339 | + * - Do nothing. The current number of threads falls in between the min and | ||
340 | + * max thresholds. We'll let the pool manage itself. | ||
341 | + */ | ||
342 | + for (int i = pool->cur_threads; i < pool->min_threads; i++) { | ||
343 | + spawn_thread(pool); | ||
344 | + } | ||
345 | + | ||
346 | + for (int i = pool->cur_threads; i > pool->max_threads; i--) { | ||
347 | + qemu_sem_post(&pool->sem); | ||
348 | + } | ||
349 | + | ||
350 | + qemu_mutex_unlock(&pool->lock); | ||
132 | +} | 351 | +} |
133 | + | 352 | + |
134 | +/* With q->lock */ | 353 | static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) |
135 | +static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q) | ||
136 | +{ | ||
137 | + if (!qemu_co_queue_empty(&q->free_req_queue)) { | ||
138 | + replay_bh_schedule_oneshot_event(s->aio_context, | ||
139 | + nvme_free_req_queue_cb, q); | ||
140 | + } | ||
141 | +} | ||
142 | + | ||
143 | +/* Insert a request in the freelist and wake waiters */ | ||
144 | +static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q, | ||
145 | + NVMeRequest *req) | ||
146 | +{ | ||
147 | + qemu_mutex_lock(&q->lock); | ||
148 | + nvme_put_free_req_locked(q, req); | ||
149 | + nvme_wake_free_req_locked(s, q); | ||
150 | + qemu_mutex_unlock(&q->lock); | ||
151 | +} | ||
152 | + | ||
153 | static inline int nvme_translate_error(const NvmeCqe *c) | ||
154 | { | 354 | { |
155 | uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF; | 355 | if (!ctx) { |
156 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | 356 | @@ -XXX,XX +XXX,XX @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) |
157 | req = *preq; | 357 | qemu_mutex_init(&pool->lock); |
158 | assert(req.cid == cid); | 358 | qemu_cond_init(&pool->worker_stopped); |
159 | assert(req.cb); | 359 | qemu_sem_init(&pool->sem, 0); |
160 | - preq->busy = false; | 360 | - pool->max_threads = 64; |
161 | + nvme_put_free_req_locked(q, preq); | 361 | pool->new_thread_bh = aio_bh_new(ctx, spawn_thread_bh_fn, pool); |
162 | preq->cb = preq->opaque = NULL; | 362 | |
163 | qemu_mutex_unlock(&q->lock); | 363 | QLIST_INIT(&pool->head); |
164 | req.cb(req.opaque, ret); | 364 | QTAILQ_INIT(&pool->request_list); |
165 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | 365 | + |
166 | /* Notify the device so it can post more completions. */ | 366 | + thread_pool_update_params(pool, ctx); |
167 | smp_mb_release(); | 367 | } |
168 | *q->cq.doorbell = cpu_to_le32(q->cq.head); | 368 | |
169 | - if (!qemu_co_queue_empty(&q->free_req_queue)) { | 369 | ThreadPool *thread_pool_new(AioContext *ctx) |
170 | - replay_bh_schedule_oneshot_event(s->aio_context, | ||
171 | - nvme_free_req_queue_cb, q); | ||
172 | - } | ||
173 | + nvme_wake_free_req_locked(s, q); | ||
174 | } | ||
175 | q->busy = false; | ||
176 | return progress; | ||
177 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, | ||
178 | r = nvme_cmd_map_qiov(bs, &cmd, req, qiov); | ||
179 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
180 | if (r) { | ||
181 | - req->busy = false; | ||
182 | + nvme_put_free_req_and_wake(s, ioq, req); | ||
183 | return r; | ||
184 | } | ||
185 | nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
186 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, | ||
187 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
188 | |||
189 | if (ret) { | ||
190 | - req->busy = false; | ||
191 | + nvme_put_free_req_and_wake(s, ioq, req); | ||
192 | goto out; | ||
193 | } | ||
194 | |||
195 | -- | 370 | -- |
196 | 2.26.2 | 371 | 2.35.1 |
197 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | Existing users access free_req_queue under q->lock. Document this. | ||
2 | 1 | ||
3 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
4 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
6 | Message-id: 20200617132201.1832152-6-stefanha@redhat.com | ||
7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
8 | --- | ||
9 | block/nvme.c | 2 +- | ||
10 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
11 | |||
12 | diff --git a/block/nvme.c b/block/nvme.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/block/nvme.c | ||
15 | +++ b/block/nvme.c | ||
16 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
17 | } NVMeRequest; | ||
18 | |||
19 | typedef struct { | ||
20 | - CoQueue free_req_queue; | ||
21 | QemuMutex lock; | ||
22 | |||
23 | /* Fields protected by BQL */ | ||
24 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
25 | uint8_t *prp_list_pages; | ||
26 | |||
27 | /* Fields protected by @lock */ | ||
28 | + CoQueue free_req_queue; | ||
29 | NVMeQueue sq, cq; | ||
30 | int cq_phase; | ||
31 | int free_req_head; | ||
32 | -- | ||
33 | 2.26.2 | ||
34 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | QEMU block drivers are supposed to support aio_poll() from I/O | ||
2 | completion callback functions. This means completion processing must be | ||
3 | re-entrant. | ||
4 | 1 | ||
5 | The standard approach is to schedule a BH during completion processing | ||
6 | and cancel it at the end of processing. If aio_poll() is invoked by a | ||
7 | callback function then the BH will run. The BH continues the suspended | ||
8 | completion processing. | ||
9 | |||
10 | All of this means that request A's cb() can synchronously wait for | ||
11 | request B to complete. Previously the nvme block driver would hang | ||
12 | because it didn't process completions from nested aio_poll(). | ||
13 | |||
14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
15 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
16 | Message-id: 20200617132201.1832152-8-stefanha@redhat.com | ||
17 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
18 | --- | ||
19 | block/nvme.c | 67 ++++++++++++++++++++++++++++++++++++++++------ | ||
20 | block/trace-events | 2 +- | ||
21 | 2 files changed, 60 insertions(+), 9 deletions(-) | ||
22 | |||
23 | diff --git a/block/nvme.c b/block/nvme.c | ||
24 | index XXXXXXX..XXXXXXX 100644 | ||
25 | --- a/block/nvme.c | ||
26 | +++ b/block/nvme.c | ||
27 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
28 | int cq_phase; | ||
29 | int free_req_head; | ||
30 | NVMeRequest reqs[NVME_NUM_REQS]; | ||
31 | - bool busy; | ||
32 | int need_kick; | ||
33 | int inflight; | ||
34 | + | ||
35 | + /* Thread-safe, no lock necessary */ | ||
36 | + QEMUBH *completion_bh; | ||
37 | } NVMeQueuePair; | ||
38 | |||
39 | /* Memory mapped registers */ | ||
40 | @@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState { | ||
41 | #define NVME_BLOCK_OPT_DEVICE "device" | ||
42 | #define NVME_BLOCK_OPT_NAMESPACE "namespace" | ||
43 | |||
44 | +static void nvme_process_completion_bh(void *opaque); | ||
45 | + | ||
46 | static QemuOptsList runtime_opts = { | ||
47 | .name = "nvme", | ||
48 | .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), | ||
49 | @@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q, | ||
50 | |||
51 | static void nvme_free_queue_pair(NVMeQueuePair *q) | ||
52 | { | ||
53 | + if (q->completion_bh) { | ||
54 | + qemu_bh_delete(q->completion_bh); | ||
55 | + } | ||
56 | qemu_vfree(q->prp_list_pages); | ||
57 | qemu_vfree(q->sq.queue); | ||
58 | qemu_vfree(q->cq.queue); | ||
59 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | ||
60 | q->index = idx; | ||
61 | qemu_co_queue_init(&q->free_req_queue); | ||
62 | q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); | ||
63 | + q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs), | ||
64 | + nvme_process_completion_bh, q); | ||
65 | r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, | ||
66 | s->page_size * NVME_NUM_REQS, | ||
67 | false, &prp_list_iova); | ||
68 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q) | ||
69 | NvmeCqe *c; | ||
70 | |||
71 | trace_nvme_process_completion(s, q->index, q->inflight); | ||
72 | - if (q->busy || s->plugged) { | ||
73 | - trace_nvme_process_completion_queue_busy(s, q->index); | ||
74 | + if (s->plugged) { | ||
75 | + trace_nvme_process_completion_queue_plugged(s, q->index); | ||
76 | return false; | ||
77 | } | ||
78 | - q->busy = true; | ||
79 | + | ||
80 | + /* | ||
81 | + * Support re-entrancy when a request cb() function invokes aio_poll(). | ||
82 | + * Pending completions must be visible to aio_poll() so that a cb() | ||
83 | + * function can wait for the completion of another request. | ||
84 | + * | ||
85 | + * The aio_poll() loop will execute our BH and we'll resume completion | ||
86 | + * processing there. | ||
87 | + */ | ||
88 | + qemu_bh_schedule(q->completion_bh); | ||
89 | + | ||
90 | assert(q->inflight >= 0); | ||
91 | while (q->inflight) { | ||
92 | int ret; | ||
93 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q) | ||
94 | assert(req.cb); | ||
95 | nvme_put_free_req_locked(q, preq); | ||
96 | preq->cb = preq->opaque = NULL; | ||
97 | - qemu_mutex_unlock(&q->lock); | ||
98 | - req.cb(req.opaque, ret); | ||
99 | - qemu_mutex_lock(&q->lock); | ||
100 | q->inflight--; | ||
101 | + qemu_mutex_unlock(&q->lock); | ||
102 | + req.cb(req.opaque, ret); | ||
103 | + qemu_mutex_lock(&q->lock); | ||
104 | progress = true; | ||
105 | } | ||
106 | if (progress) { | ||
107 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q) | ||
108 | *q->cq.doorbell = cpu_to_le32(q->cq.head); | ||
109 | nvme_wake_free_req_locked(q); | ||
110 | } | ||
111 | - q->busy = false; | ||
112 | + | ||
113 | + qemu_bh_cancel(q->completion_bh); | ||
114 | + | ||
115 | return progress; | ||
116 | } | ||
117 | |||
118 | +static void nvme_process_completion_bh(void *opaque) | ||
119 | +{ | ||
120 | + NVMeQueuePair *q = opaque; | ||
121 | + | ||
122 | + /* | ||
123 | + * We're being invoked because a nvme_process_completion() cb() function | ||
124 | + * called aio_poll(). The callback may be waiting for further completions | ||
125 | + * so notify the device that it has space to fill in more completions now. | ||
126 | + */ | ||
127 | + smp_mb_release(); | ||
128 | + *q->cq.doorbell = cpu_to_le32(q->cq.head); | ||
129 | + nvme_wake_free_req_locked(q); | ||
130 | + | ||
131 | + nvme_process_completion(q); | ||
132 | +} | ||
133 | + | ||
134 | static void nvme_trace_command(const NvmeCmd *cmd) | ||
135 | { | ||
136 | int i; | ||
137 | @@ -XXX,XX +XXX,XX @@ static void nvme_detach_aio_context(BlockDriverState *bs) | ||
138 | { | ||
139 | BDRVNVMeState *s = bs->opaque; | ||
140 | |||
141 | + for (int i = 0; i < s->nr_queues; i++) { | ||
142 | + NVMeQueuePair *q = s->queues[i]; | ||
143 | + | ||
144 | + qemu_bh_delete(q->completion_bh); | ||
145 | + q->completion_bh = NULL; | ||
146 | + } | ||
147 | + | ||
148 | aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, | ||
149 | false, NULL, NULL); | ||
150 | } | ||
151 | @@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs, | ||
152 | s->aio_context = new_context; | ||
153 | aio_set_event_notifier(new_context, &s->irq_notifier, | ||
154 | false, nvme_handle_event, nvme_poll_cb); | ||
155 | + | ||
156 | + for (int i = 0; i < s->nr_queues; i++) { | ||
157 | + NVMeQueuePair *q = s->queues[i]; | ||
158 | + | ||
159 | + q->completion_bh = | ||
160 | + aio_bh_new(new_context, nvme_process_completion_bh, q); | ||
161 | + } | ||
162 | } | ||
163 | |||
164 | static void nvme_aio_plug(BlockDriverState *bs) | ||
165 | diff --git a/block/trace-events b/block/trace-events | ||
166 | index XXXXXXX..XXXXXXX 100644 | ||
167 | --- a/block/trace-events | ||
168 | +++ b/block/trace-events | ||
169 | @@ -XXX,XX +XXX,XX @@ nvme_kick(void *s, int queue) "s %p queue %d" | ||
170 | nvme_dma_flush_queue_wait(void *s) "s %p" | ||
171 | nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x" | ||
172 | nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d" | ||
173 | -nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d" | ||
174 | +nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d" | ||
175 | nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d" | ||
176 | nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d" | ||
177 | nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x" | ||
178 | -- | ||
179 | 2.26.2 | ||
180 | diff view generated by jsdifflib |