1 | The following changes since commit 887cba855bb6ff4775256f7968409281350b568c: | 1 | The following changes since commit ddc27d2ad9361a81c2b3800d14143bf420dae172: |
---|---|---|---|
2 | 2 | ||
3 | configure: Fix cross-building for RISCV host (v5) (2023-07-11 17:56:09 +0100) | 3 | Merge tag 'pull-request-2024-03-18' of https://gitlab.com/thuth/qemu into staging (2024-03-19 10:25:25 +0000) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the Git repository at: |
6 | 6 | ||
7 | https://gitlab.com/stefanha/qemu.git tags/block-pull-request | 7 | https://gitlab.com/stefanha/qemu.git tags/block-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to 75dcb4d790bbe5327169fd72b185960ca58e2fa6: | 9 | for you to fetch changes up to 86a637e48104ae74d8be53bed6441ce32be33433: |
10 | 10 | ||
11 | virtio-blk: fix host notifier issues during dataplane start/stop (2023-07-12 15:20:32 -0400) | 11 | coroutine: cap per-thread local pool size (2024-03-19 10:49:31 -0400) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Pull request | 14 | Pull request |
15 | 15 | ||
16 | This fix solves the "failed to set up stack guard page" error that has been | ||
17 | reported on Linux hosts where the QEMU coroutine pool exceeds the | ||
18 | vm.max_map_count limit. | ||
19 | |||
16 | ---------------------------------------------------------------- | 20 | ---------------------------------------------------------------- |
17 | 21 | ||
18 | Stefan Hajnoczi (1): | 22 | Stefan Hajnoczi (1): |
19 | virtio-blk: fix host notifier issues during dataplane start/stop | 23 | coroutine: cap per-thread local pool size |
20 | 24 | ||
21 | hw/block/dataplane/virtio-blk.c | 67 +++++++++++++++++++-------------- | 25 | util/qemu-coroutine.c | 282 +++++++++++++++++++++++++++++++++--------- |
22 | 1 file changed, 38 insertions(+), 29 deletions(-) | 26 | 1 file changed, 223 insertions(+), 59 deletions(-) |
23 | 27 | ||
24 | -- | 28 | -- |
25 | 2.40.1 | 29 | 2.44.0 | diff view generated by jsdifflib |
1 | The main loop thread can consume 100% CPU when using --device | 1 | The coroutine pool implementation can hit the Linux vm.max_map_count |
---|---|---|---|
2 | virtio-blk-pci,iothread=<iothread>. ppoll() constantly returns but | 2 | limit, causing QEMU to abort with "failed to allocate memory for stack" |
3 | reading virtqueue host notifiers fails with EAGAIN. The file descriptors | 3 | or "failed to set up stack guard page" during coroutine creation. |
4 | are stale and remain registered with the AioContext because of bugs in | 4 | |
5 | the virtio-blk dataplane start/stop code. | 5 | This happens because per-thread pools can grow to tens of thousands of |
6 | 6 | coroutines. Each coroutine causes 2 virtual memory areas to be created. | |
7 | The problem is that the dataplane start/stop code involves drain | 7 | Eventually vm.max_map_count is reached and memory-related syscalls fail. |
8 | operations, which call virtio_blk_drained_begin() and | 8 | The per-thread pool sizes are non-uniform and depend on past coroutine |
9 | virtio_blk_drained_end() at points where the host notifier is not | 9 | usage in each thread, so it's possible for one thread to have a large |
10 | operational: | 10 | pool while another thread's pool is empty. |
11 | - In virtio_blk_data_plane_start(), blk_set_aio_context() drains after | 11 | |
12 | vblk->dataplane_started has been set to true but the host notifier has | 12 | Switch to a new coroutine pool implementation with a global pool that |
13 | not been attached yet. | 13 | grows to a maximum number of coroutines and per-thread local pools that |
14 | - In virtio_blk_data_plane_stop(), blk_drain() and blk_set_aio_context() | 14 | are capped at hardcoded small number of coroutines. |
15 | drain after the host notifier has already been detached but with | 15 | |
16 | vblk->dataplane_started still set to true. | 16 | This approach does not leave large numbers of coroutines pooled in a |
17 | 17 | thread that may not use them again. In order to perform well it | |
18 | I would like to simplify ->ioeventfd_start/stop() to avoid interactions | 18 | amortizes the cost of global pool accesses by working in batches of |
19 | with drain entirely, but couldn't find a way to do that. Instead, this | 19 | coroutines instead of individual coroutines. |
20 | patch accepts the fragile nature of the code and reorders it so that | 20 | |
21 | vblk->dataplane_started is false during drain operations. This way the | 21 | The global pool is a list. Threads donate batches of coroutines to when |
22 | virtio_blk_drained_begin() and virtio_blk_drained_end() calls don't | 22 | they have too many and take batches from when they have too few: |
23 | touch the host notifier. The result is that | 23 | |
24 | virtio_blk_data_plane_start() and virtio_blk_data_plane_stop() have | 24 | .-----------------------------------. |
25 | complete control over the host notifier and stale file descriptors are | 25 | | Batch 1 | Batch 2 | Batch 3 | ... | global_pool |
26 | no longer left in the AioContext. | 26 | `-----------------------------------' |
27 | 27 | ||
28 | This patch fixes the 100% CPU consumption in the main loop thread and | 28 | Each thread has up to 2 batches of coroutines: |
29 | correctly moves host notifier processing to the IOThread. | 29 | |
30 | 30 | .-------------------. | |
31 | Fixes: 1665d9326fd2 ("virtio-blk: implement BlockDevOps->drained_begin()") | 31 | | Batch 1 | Batch 2 | per-thread local_pool (maximum 2 batches) |
32 | Reported-by: Lukáš Doktor <ldoktor@redhat.com> | 32 | `-------------------' |
33 | |||
34 | The goal of this change is to reduce the excessive number of pooled | ||
35 | coroutines that cause QEMU to abort when vm.max_map_count is reached | ||
36 | without losing the performance of an adequately sized coroutine pool. | ||
37 | |||
38 | Here are virtio-blk disk I/O benchmark results: | ||
39 | |||
40 | RW BLKSIZE IODEPTH OLD NEW CHANGE | ||
41 | randread 4k 1 113725 117451 +3.3% | ||
42 | randread 4k 8 192968 198510 +2.9% | ||
43 | randread 4k 16 207138 209429 +1.1% | ||
44 | randread 4k 32 212399 215145 +1.3% | ||
45 | randread 4k 64 218319 221277 +1.4% | ||
46 | randread 128k 1 17587 17535 -0.3% | ||
47 | randread 128k 8 17614 17616 +0.0% | ||
48 | randread 128k 16 17608 17609 +0.0% | ||
49 | randread 128k 32 17552 17553 +0.0% | ||
50 | randread 128k 64 17484 17484 +0.0% | ||
51 | |||
52 | See files/{fio.sh,test.xml.j2} for the benchmark configuration: | ||
53 | https://gitlab.com/stefanha/virt-playbooks/-/tree/coroutine-pool-fix-sizing | ||
54 | |||
55 | Buglink: https://issues.redhat.com/browse/RHEL-28947 | ||
56 | Reported-by: Sanjay Rao <srao@redhat.com> | ||
57 | Reported-by: Boaz Ben Shabat <bbenshab@redhat.com> | ||
58 | Reported-by: Joe Mario <jmario@redhat.com> | ||
59 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | ||
33 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 60 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
34 | Tested-by: Lukas Doktor <ldoktor@redhat.com> | 61 | Message-ID: <20240318183429.1039340-1-stefanha@redhat.com> |
35 | Message-id: 20230704151527.193586-1-stefanha@redhat.com | ||
36 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
37 | --- | 62 | --- |
38 | hw/block/dataplane/virtio-blk.c | 67 +++++++++++++++++++-------------- | 63 | util/qemu-coroutine.c | 282 +++++++++++++++++++++++++++++++++--------- |
39 | 1 file changed, 38 insertions(+), 29 deletions(-) | 64 | 1 file changed, 223 insertions(+), 59 deletions(-) |
40 | 65 | ||
41 | diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c | 66 | diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c |
42 | index XXXXXXX..XXXXXXX 100644 | 67 | index XXXXXXX..XXXXXXX 100644 |
43 | --- a/hw/block/dataplane/virtio-blk.c | 68 | --- a/util/qemu-coroutine.c |
44 | +++ b/hw/block/dataplane/virtio-blk.c | 69 | +++ b/util/qemu-coroutine.c |
45 | @@ -XXX,XX +XXX,XX @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) | 70 | @@ -XXX,XX +XXX,XX @@ |
46 | 71 | #include "qemu/atomic.h" | |
47 | memory_region_transaction_commit(); | 72 | #include "qemu/coroutine_int.h" |
48 | 73 | #include "qemu/coroutine-tls.h" | |
49 | - /* | 74 | +#include "qemu/cutils.h" |
50 | - * These fields are visible to the IOThread so we rely on implicit barriers | 75 | #include "block/aio.h" |
51 | - * in aio_context_acquire() on the write side and aio_notify_accept() on | 76 | |
52 | - * the read side. | 77 | -/** |
53 | - */ | 78 | - * The minimal batch size is always 64, coroutines from the release_pool are |
54 | - s->starting = false; | 79 | - * reused as soon as there are 64 coroutines in it. The maximum pool size starts |
55 | - vblk->dataplane_started = true; | 80 | - * with 64 and is increased on demand so that coroutines are not deleted even if |
56 | trace_virtio_blk_data_plane_start(s); | 81 | - * they are not immediately reused. |
57 | 82 | - */ | |
58 | old_context = blk_get_aio_context(s->conf->conf.blk); | 83 | enum { |
59 | @@ -XXX,XX +XXX,XX @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) | 84 | - POOL_MIN_BATCH_SIZE = 64, |
60 | event_notifier_set(virtio_queue_get_host_notifier(vq)); | 85 | - POOL_INITIAL_MAX_SIZE = 64, |
86 | + COROUTINE_POOL_BATCH_MAX_SIZE = 128, | ||
87 | }; | ||
88 | |||
89 | -/** Free list to speed up creation */ | ||
90 | -static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool); | ||
91 | -static unsigned int pool_max_size = POOL_INITIAL_MAX_SIZE; | ||
92 | -static unsigned int release_pool_size; | ||
93 | +/* | ||
94 | + * Coroutine creation and deletion is expensive so a pool of unused coroutines | ||
95 | + * is kept as a cache. When the pool has coroutines available, they are | ||
96 | + * recycled instead of creating new ones from scratch. Coroutines are added to | ||
97 | + * the pool upon termination. | ||
98 | + * | ||
99 | + * The pool is global but each thread maintains a small local pool to avoid | ||
100 | + * global pool contention. Threads fetch and return batches of coroutines from | ||
101 | + * the global pool to maintain their local pool. The local pool holds up to two | ||
102 | + * batches whereas the maximum size of the global pool is controlled by the | ||
103 | + * qemu_coroutine_inc_pool_size() API. | ||
104 | + * | ||
105 | + * .-----------------------------------. | ||
106 | + * | Batch 1 | Batch 2 | Batch 3 | ... | global_pool | ||
107 | + * `-----------------------------------' | ||
108 | + * | ||
109 | + * .-------------------. | ||
110 | + * | Batch 1 | Batch 2 | per-thread local_pool (maximum 2 batches) | ||
111 | + * `-------------------' | ||
112 | + */ | ||
113 | +typedef struct CoroutinePoolBatch { | ||
114 | + /* Batches are kept in a list */ | ||
115 | + QSLIST_ENTRY(CoroutinePoolBatch) next; | ||
116 | |||
117 | -typedef QSLIST_HEAD(, Coroutine) CoroutineQSList; | ||
118 | -QEMU_DEFINE_STATIC_CO_TLS(CoroutineQSList, alloc_pool); | ||
119 | -QEMU_DEFINE_STATIC_CO_TLS(unsigned int, alloc_pool_size); | ||
120 | -QEMU_DEFINE_STATIC_CO_TLS(Notifier, coroutine_pool_cleanup_notifier); | ||
121 | + /* This batch holds up to @COROUTINE_POOL_BATCH_MAX_SIZE coroutines */ | ||
122 | + QSLIST_HEAD(, Coroutine) list; | ||
123 | + unsigned int size; | ||
124 | +} CoroutinePoolBatch; | ||
125 | |||
126 | -static void coroutine_pool_cleanup(Notifier *n, void *value) | ||
127 | +typedef QSLIST_HEAD(, CoroutinePoolBatch) CoroutinePool; | ||
128 | + | ||
129 | +/* Host operating system limit on number of pooled coroutines */ | ||
130 | +static unsigned int global_pool_hard_max_size; | ||
131 | + | ||
132 | +static QemuMutex global_pool_lock; /* protects the following variables */ | ||
133 | +static CoroutinePool global_pool = QSLIST_HEAD_INITIALIZER(global_pool); | ||
134 | +static unsigned int global_pool_size; | ||
135 | +static unsigned int global_pool_max_size = COROUTINE_POOL_BATCH_MAX_SIZE; | ||
136 | + | ||
137 | +QEMU_DEFINE_STATIC_CO_TLS(CoroutinePool, local_pool); | ||
138 | +QEMU_DEFINE_STATIC_CO_TLS(Notifier, local_pool_cleanup_notifier); | ||
139 | + | ||
140 | +static CoroutinePoolBatch *coroutine_pool_batch_new(void) | ||
141 | +{ | ||
142 | + CoroutinePoolBatch *batch = g_new(CoroutinePoolBatch, 1); | ||
143 | + | ||
144 | + QSLIST_INIT(&batch->list); | ||
145 | + batch->size = 0; | ||
146 | + return batch; | ||
147 | +} | ||
148 | + | ||
149 | +static void coroutine_pool_batch_delete(CoroutinePoolBatch *batch) | ||
150 | { | ||
151 | Coroutine *co; | ||
152 | Coroutine *tmp; | ||
153 | - CoroutineQSList *alloc_pool = get_ptr_alloc_pool(); | ||
154 | |||
155 | - QSLIST_FOREACH_SAFE(co, alloc_pool, pool_next, tmp) { | ||
156 | - QSLIST_REMOVE_HEAD(alloc_pool, pool_next); | ||
157 | + QSLIST_FOREACH_SAFE(co, &batch->list, pool_next, tmp) { | ||
158 | + QSLIST_REMOVE_HEAD(&batch->list, pool_next); | ||
159 | qemu_coroutine_delete(co); | ||
61 | } | 160 | } |
62 | 161 | + g_free(batch); | |
162 | +} | ||
163 | + | ||
164 | +static void local_pool_cleanup(Notifier *n, void *value) | ||
165 | +{ | ||
166 | + CoroutinePool *local_pool = get_ptr_local_pool(); | ||
167 | + CoroutinePoolBatch *batch; | ||
168 | + CoroutinePoolBatch *tmp; | ||
169 | + | ||
170 | + QSLIST_FOREACH_SAFE(batch, local_pool, next, tmp) { | ||
171 | + QSLIST_REMOVE_HEAD(local_pool, next); | ||
172 | + coroutine_pool_batch_delete(batch); | ||
173 | + } | ||
174 | +} | ||
175 | + | ||
176 | +/* Ensure the atexit notifier is registered */ | ||
177 | +static void local_pool_cleanup_init_once(void) | ||
178 | +{ | ||
179 | + Notifier *notifier = get_ptr_local_pool_cleanup_notifier(); | ||
180 | + if (!notifier->notify) { | ||
181 | + notifier->notify = local_pool_cleanup; | ||
182 | + qemu_thread_atexit_add(notifier); | ||
183 | + } | ||
184 | +} | ||
185 | + | ||
186 | +/* Helper to get the next unused coroutine from the local pool */ | ||
187 | +static Coroutine *coroutine_pool_get_local(void) | ||
188 | +{ | ||
189 | + CoroutinePool *local_pool = get_ptr_local_pool(); | ||
190 | + CoroutinePoolBatch *batch = QSLIST_FIRST(local_pool); | ||
191 | + Coroutine *co; | ||
192 | + | ||
193 | + if (unlikely(!batch)) { | ||
194 | + return NULL; | ||
195 | + } | ||
196 | + | ||
197 | + co = QSLIST_FIRST(&batch->list); | ||
198 | + QSLIST_REMOVE_HEAD(&batch->list, pool_next); | ||
199 | + batch->size--; | ||
200 | + | ||
201 | + if (batch->size == 0) { | ||
202 | + QSLIST_REMOVE_HEAD(local_pool, next); | ||
203 | + coroutine_pool_batch_delete(batch); | ||
204 | + } | ||
205 | + return co; | ||
206 | +} | ||
207 | + | ||
208 | +/* Get the next batch from the global pool */ | ||
209 | +static void coroutine_pool_refill_local(void) | ||
210 | +{ | ||
211 | + CoroutinePool *local_pool = get_ptr_local_pool(); | ||
212 | + CoroutinePoolBatch *batch; | ||
213 | + | ||
214 | + WITH_QEMU_LOCK_GUARD(&global_pool_lock) { | ||
215 | + batch = QSLIST_FIRST(&global_pool); | ||
216 | + | ||
217 | + if (batch) { | ||
218 | + QSLIST_REMOVE_HEAD(&global_pool, next); | ||
219 | + global_pool_size -= batch->size; | ||
220 | + } | ||
221 | + } | ||
222 | + | ||
223 | + if (batch) { | ||
224 | + QSLIST_INSERT_HEAD(local_pool, batch, next); | ||
225 | + local_pool_cleanup_init_once(); | ||
226 | + } | ||
227 | +} | ||
228 | + | ||
229 | +/* Add a batch of coroutines to the global pool */ | ||
230 | +static void coroutine_pool_put_global(CoroutinePoolBatch *batch) | ||
231 | +{ | ||
232 | + WITH_QEMU_LOCK_GUARD(&global_pool_lock) { | ||
233 | + unsigned int max = MIN(global_pool_max_size, | ||
234 | + global_pool_hard_max_size); | ||
235 | + | ||
236 | + if (global_pool_size < max) { | ||
237 | + QSLIST_INSERT_HEAD(&global_pool, batch, next); | ||
238 | + | ||
239 | + /* Overshooting the max pool size is allowed */ | ||
240 | + global_pool_size += batch->size; | ||
241 | + return; | ||
242 | + } | ||
243 | + } | ||
244 | + | ||
245 | + /* The global pool was full, so throw away this batch */ | ||
246 | + coroutine_pool_batch_delete(batch); | ||
247 | +} | ||
248 | + | ||
249 | +/* Get the next unused coroutine from the pool or return NULL */ | ||
250 | +static Coroutine *coroutine_pool_get(void) | ||
251 | +{ | ||
252 | + Coroutine *co; | ||
253 | + | ||
254 | + co = coroutine_pool_get_local(); | ||
255 | + if (!co) { | ||
256 | + coroutine_pool_refill_local(); | ||
257 | + co = coroutine_pool_get_local(); | ||
258 | + } | ||
259 | + return co; | ||
260 | +} | ||
261 | + | ||
262 | +static void coroutine_pool_put(Coroutine *co) | ||
263 | +{ | ||
264 | + CoroutinePool *local_pool = get_ptr_local_pool(); | ||
265 | + CoroutinePoolBatch *batch = QSLIST_FIRST(local_pool); | ||
266 | + | ||
267 | + if (unlikely(!batch)) { | ||
268 | + batch = coroutine_pool_batch_new(); | ||
269 | + QSLIST_INSERT_HEAD(local_pool, batch, next); | ||
270 | + local_pool_cleanup_init_once(); | ||
271 | + } | ||
272 | + | ||
273 | + if (unlikely(batch->size >= COROUTINE_POOL_BATCH_MAX_SIZE)) { | ||
274 | + CoroutinePoolBatch *next = QSLIST_NEXT(batch, next); | ||
275 | + | ||
276 | + /* Is the local pool full? */ | ||
277 | + if (next) { | ||
278 | + QSLIST_REMOVE_HEAD(local_pool, next); | ||
279 | + coroutine_pool_put_global(batch); | ||
280 | + } | ||
281 | + | ||
282 | + batch = coroutine_pool_batch_new(); | ||
283 | + QSLIST_INSERT_HEAD(local_pool, batch, next); | ||
284 | + } | ||
285 | + | ||
286 | + QSLIST_INSERT_HEAD(&batch->list, co, pool_next); | ||
287 | + batch->size++; | ||
288 | } | ||
289 | |||
290 | Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque) | ||
291 | @@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque) | ||
292 | Coroutine *co = NULL; | ||
293 | |||
294 | if (IS_ENABLED(CONFIG_COROUTINE_POOL)) { | ||
295 | - CoroutineQSList *alloc_pool = get_ptr_alloc_pool(); | ||
296 | - | ||
297 | - co = QSLIST_FIRST(alloc_pool); | ||
298 | - if (!co) { | ||
299 | - if (release_pool_size > POOL_MIN_BATCH_SIZE) { | ||
300 | - /* Slow path; a good place to register the destructor, too. */ | ||
301 | - Notifier *notifier = get_ptr_coroutine_pool_cleanup_notifier(); | ||
302 | - if (!notifier->notify) { | ||
303 | - notifier->notify = coroutine_pool_cleanup; | ||
304 | - qemu_thread_atexit_add(notifier); | ||
305 | - } | ||
306 | - | ||
307 | - /* This is not exact; there could be a little skew between | ||
308 | - * release_pool_size and the actual size of release_pool. But | ||
309 | - * it is just a heuristic, it does not need to be perfect. | ||
310 | - */ | ||
311 | - set_alloc_pool_size(qatomic_xchg(&release_pool_size, 0)); | ||
312 | - QSLIST_MOVE_ATOMIC(alloc_pool, &release_pool); | ||
313 | - co = QSLIST_FIRST(alloc_pool); | ||
314 | - } | ||
315 | - } | ||
316 | - if (co) { | ||
317 | - QSLIST_REMOVE_HEAD(alloc_pool, pool_next); | ||
318 | - set_alloc_pool_size(get_alloc_pool_size() - 1); | ||
319 | - } | ||
320 | + co = coroutine_pool_get(); | ||
321 | } | ||
322 | |||
323 | if (!co) { | ||
324 | @@ -XXX,XX +XXX,XX @@ static void coroutine_delete(Coroutine *co) | ||
325 | co->caller = NULL; | ||
326 | |||
327 | if (IS_ENABLED(CONFIG_COROUTINE_POOL)) { | ||
328 | - if (release_pool_size < qatomic_read(&pool_max_size) * 2) { | ||
329 | - QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next); | ||
330 | - qatomic_inc(&release_pool_size); | ||
331 | - return; | ||
332 | - } | ||
333 | - if (get_alloc_pool_size() < qatomic_read(&pool_max_size)) { | ||
334 | - QSLIST_INSERT_HEAD(get_ptr_alloc_pool(), co, pool_next); | ||
335 | - set_alloc_pool_size(get_alloc_pool_size() + 1); | ||
336 | - return; | ||
337 | - } | ||
338 | + coroutine_pool_put(co); | ||
339 | + } else { | ||
340 | + qemu_coroutine_delete(co); | ||
341 | } | ||
342 | - | ||
343 | - qemu_coroutine_delete(co); | ||
344 | } | ||
345 | |||
346 | void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co) | ||
347 | @@ -XXX,XX +XXX,XX @@ AioContext *qemu_coroutine_get_aio_context(Coroutine *co) | ||
348 | |||
349 | void qemu_coroutine_inc_pool_size(unsigned int additional_pool_size) | ||
350 | { | ||
351 | - qatomic_add(&pool_max_size, additional_pool_size); | ||
352 | + QEMU_LOCK_GUARD(&global_pool_lock); | ||
353 | + global_pool_max_size += additional_pool_size; | ||
354 | } | ||
355 | |||
356 | void qemu_coroutine_dec_pool_size(unsigned int removing_pool_size) | ||
357 | { | ||
358 | - qatomic_sub(&pool_max_size, removing_pool_size); | ||
359 | + QEMU_LOCK_GUARD(&global_pool_lock); | ||
360 | + global_pool_max_size -= removing_pool_size; | ||
361 | +} | ||
362 | + | ||
363 | +static unsigned int get_global_pool_hard_max_size(void) | ||
364 | +{ | ||
365 | +#ifdef __linux__ | ||
366 | + g_autofree char *contents = NULL; | ||
367 | + int max_map_count; | ||
368 | + | ||
63 | + /* | 369 | + /* |
64 | + * These fields must be visible to the IOThread when it processes the | 370 | + * Linux processes can have up to max_map_count virtual memory areas |
65 | + * virtqueue, otherwise it will think dataplane has not started yet. | 371 | + * (VMAs). mmap(2), mprotect(2), etc fail with ENOMEM beyond this limit. We |
66 | + * | 372 | + * must limit the coroutine pool to a safe size to avoid running out of |
67 | + * Make sure ->dataplane_started is false when blk_set_aio_context() is | 373 | + * VMAs. |
68 | + * called above so that draining does not cause the host notifier to be | ||
69 | + * detached/attached prematurely. | ||
70 | + */ | 374 | + */ |
71 | + s->starting = false; | 375 | + if (g_file_get_contents("/proc/sys/vm/max_map_count", &contents, NULL, |
72 | + vblk->dataplane_started = true; | 376 | + NULL) && |
73 | + smp_wmb(); /* paired with aio_notify_accept() on the read side */ | 377 | + qemu_strtoi(contents, NULL, 10, &max_map_count) == 0) { |
74 | + | 378 | + /* |
75 | /* Get this show started by hooking up our callbacks */ | 379 | + * This is a conservative upper bound that avoids exceeding |
76 | if (!blk_in_drain(s->conf->conf.blk)) { | 380 | + * max_map_count. Leave half for non-coroutine users like library |
77 | aio_context_acquire(s->ctx); | 381 | + * dependencies, vhost-user, etc. Each coroutine takes up 2 VMAs so |
78 | @@ -XXX,XX +XXX,XX @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) | 382 | + * halve the amount again. |
79 | fail_guest_notifiers: | 383 | + */ |
80 | vblk->dataplane_disabled = true; | 384 | + return max_map_count / 4; |
81 | s->starting = false; | 385 | + } |
82 | - vblk->dataplane_started = true; | 386 | +#endif |
83 | return -ENOSYS; | 387 | + |
84 | } | 388 | + return UINT_MAX; |
85 | 389 | +} | |
86 | @@ -XXX,XX +XXX,XX @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev) | 390 | + |
87 | aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s); | 391 | +static void __attribute__((constructor)) qemu_coroutine_init(void) |
88 | } | 392 | +{ |
89 | 393 | + qemu_mutex_init(&global_pool_lock); | |
90 | + /* | 394 | + global_pool_hard_max_size = get_global_pool_hard_max_size(); |
91 | + * Batch all the host notifiers in a single transaction to avoid | ||
92 | + * quadratic time complexity in address_space_update_ioeventfds(). | ||
93 | + */ | ||
94 | + memory_region_transaction_begin(); | ||
95 | + | ||
96 | + for (i = 0; i < nvqs; i++) { | ||
97 | + virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false); | ||
98 | + } | ||
99 | + | ||
100 | + /* | ||
101 | + * The transaction expects the ioeventfds to be open when it | ||
102 | + * commits. Do it now, before the cleanup loop. | ||
103 | + */ | ||
104 | + memory_region_transaction_commit(); | ||
105 | + | ||
106 | + for (i = 0; i < nvqs; i++) { | ||
107 | + virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i); | ||
108 | + } | ||
109 | + | ||
110 | + /* | ||
111 | + * Set ->dataplane_started to false before draining so that host notifiers | ||
112 | + * are not detached/attached anymore. | ||
113 | + */ | ||
114 | + vblk->dataplane_started = false; | ||
115 | + | ||
116 | aio_context_acquire(s->ctx); | ||
117 | |||
118 | /* Wait for virtio_blk_dma_restart_bh() and in flight I/O to complete */ | ||
119 | @@ -XXX,XX +XXX,XX @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev) | ||
120 | |||
121 | aio_context_release(s->ctx); | ||
122 | |||
123 | - /* | ||
124 | - * Batch all the host notifiers in a single transaction to avoid | ||
125 | - * quadratic time complexity in address_space_update_ioeventfds(). | ||
126 | - */ | ||
127 | - memory_region_transaction_begin(); | ||
128 | - | ||
129 | - for (i = 0; i < nvqs; i++) { | ||
130 | - virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false); | ||
131 | - } | ||
132 | - | ||
133 | - /* | ||
134 | - * The transaction expects the ioeventfds to be open when it | ||
135 | - * commits. Do it now, before the cleanup loop. | ||
136 | - */ | ||
137 | - memory_region_transaction_commit(); | ||
138 | - | ||
139 | - for (i = 0; i < nvqs; i++) { | ||
140 | - virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i); | ||
141 | - } | ||
142 | - | ||
143 | qemu_bh_cancel(s->bh); | ||
144 | notify_guest_bh(s); /* final chance to notify guest */ | ||
145 | |||
146 | /* Clean up guest notifier (irq) */ | ||
147 | k->set_guest_notifiers(qbus->parent, nvqs, false); | ||
148 | |||
149 | - vblk->dataplane_started = false; | ||
150 | s->stopping = false; | ||
151 | } | 395 | } |
152 | -- | 396 | -- |
153 | 2.40.1 | 397 | 2.44.0 |
154 | |||
155 | diff view generated by jsdifflib |