The following changes since commit ddc27d2ad9361a81c2b3800d14143bf420dae172:

  Merge tag 'pull-request-2024-03-18' of https://gitlab.com/thuth/qemu into staging (2024-03-19 10:25:25 +0000)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 86a637e48104ae74d8be53bed6441ce32be33433:

  coroutine: cap per-thread local pool size (2024-03-19 10:49:31 -0400)

----------------------------------------------------------------
Pull request

This fix solves the "failed to set up stack guard page" error that has been
reported on Linux hosts where the QEMU coroutine pool exceeds the
vm.max_map_count limit.
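
Each coroutine stack accounts for two VMAs: the stack is mmapped and its
lowest page is then mprotected as a guard page, and the differing
protection bits split the mapping in the kernel's accounting. The
standalone sketch below (illustrative only, not QEMU code) shows the
allocation pattern and the failure modes named in the patch:

/*
 * Illustrative sketch, not QEMU code: a coroutine-style stack allocation.
 * The PROT_NONE guard page splits the mapping into 2 VMAs, so N stacks
 * consume roughly 2N entries toward /proc/sys/vm/max_map_count.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

void *alloc_stack_with_guard(size_t size)
{
    long page = sysconf(_SC_PAGESIZE);
    void *stack = mmap(NULL, size, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (stack == MAP_FAILED) {
        perror("mmap");     /* "failed to allocate memory for stack" */
        return NULL;
    }
    /* Make the lowest page inaccessible so overflows fault loudly. This
     * creates the second VMA and is the call that fails with ENOMEM once
     * the process hits vm.max_map_count. */
    if (mprotect(stack, page, PROT_NONE) != 0) {
        perror("mprotect"); /* "failed to set up stack guard page" */
        munmap(stack, size);
        return NULL;
    }
    return stack;
}

int main(void)
{
    unsigned long n = 0;

    while (alloc_stack_with_guard(64 * 1024)) {
        n++;                /* stops near vm.max_map_count / 2 stacks */
    }
    printf("allocated %lu stacks before failure\n", n);
    return 0;
}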

----------------------------------------------------------------

Stefan Hajnoczi (1):
  coroutine: cap per-thread local pool size

 util/qemu-coroutine.c | 282 +++++++++++++++++++++++++++++++++---------
 1 file changed, 223 insertions(+), 59 deletions(-)

--
2.44.0

Deleted patch
From: Stefano Garzarella <sgarzare@redhat.com>
Subject: MAINTAINERS: add Stefano Garzarella as io_uring reviewer

I've been working with io_uring for a while so I'd like to help
with reviews.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-Id: <20210728131515.131045-1-sgarzare@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ Linux io_uring
 M: Aarushi Mehta <mehta.aaru20@gmail.com>
 M: Julia Suvorova <jusual@redhat.com>
 M: Stefan Hajnoczi <stefanha@redhat.com>
+R: Stefano Garzarella <sgarzare@redhat.com>
 L: qemu-block@nongnu.org
 S: Maintained
 F: block/io_uring.c
--
2.31.1

Deleted patch
From: Fabian Ebner <f.ebner@proxmox.com>
Subject: block/io_uring: resubmit when result is -EAGAIN

Linux SCSI can throw spurious -EAGAIN in some corner cases in its
completion path, which will end up being the result in the completed
io_uring request.

Resubmitting such requests should allow block jobs to complete, even
if such spurious errors are encountered.

Co-authored-by: Stefan Hajnoczi <stefanha@gmail.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
Message-id: 20210729091029.65369-1-f.ebner@proxmox.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io_uring.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/block/io_uring.c b/block/io_uring.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -XXX,XX +XXX,XX @@ static void luring_process_completions(LuringState *s)
         total_bytes = ret + luringcb->total_read;

         if (ret < 0) {
-            if (ret == -EINTR) {
+            /*
+             * Only writev/readv/fsync requests on regular files or host block
+             * devices are submitted. Therefore -EAGAIN is not expected but it's
+             * known to happen sometimes with Linux SCSI. Submit again and hope
+             * the request completes successfully.
+             *
+             * For more information, see:
+             * https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u
+             *
+             * If the code is changed to submit other types of requests in the
+             * future, then this workaround may need to be extended to deal with
+             * genuine -EAGAIN results that should not be resubmitted
+             * immediately.
+             */
+            if (ret == -EINTR || ret == -EAGAIN) {
                 luring_resubmit(s, luringcb);
                 continue;
             }
--
2.31.1
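
Condensed, the workaround above treats -EAGAIN the same as the existing
-EINTR case: put the request back on the submission queue instead of
failing it. A minimal sketch of that shape (hypothetical helper names;
the real logic is luring_process_completions() in the diff above):

#include <errno.h>

struct request;                                /* opaque in this sketch */
void resubmit(struct request *req);            /* hypothetical stand-in */
void complete(struct request *req, int ret);   /* hypothetical stand-in */

void handle_completion(struct request *req, int ret)
{
    if (ret == -EINTR || ret == -EAGAIN) {
        /* Spurious error: submit again and hope it completes this time */
        resubmit(req);
        return;
    }
    complete(req, ret);    /* genuine result, success or real failure */
}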

Deleted patch
From: Philippe Mathieu-Daudé <philmd@redhat.com>
Subject: MAINTAINERS: Added myself as a reviewer for the NVMe Block Driver

I'm interested in following the activity around the NVMe bdrv.

Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20210728183340.2018313-1-philmd@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: block/null.c
 NVMe Block Driver
 M: Stefan Hajnoczi <stefanha@redhat.com>
 R: Fam Zheng <fam@euphon.net>
+R: Philippe Mathieu-Daudé <philmd@redhat.com>
 L: qemu-block@nongnu.org
 S: Supported
 F: block/nvme*
--
2.31.1

The coroutine pool implementation can hit the Linux vm.max_map_count
limit, causing QEMU to abort with "failed to allocate memory for stack"
or "failed to set up stack guard page" during coroutine creation.

This happens because per-thread pools can grow to tens of thousands of
coroutines. Each coroutine causes 2 virtual memory areas to be created.
Eventually vm.max_map_count is reached and memory-related syscalls fail.
The per-thread pool sizes are non-uniform and depend on past coroutine
usage in each thread, so it's possible for one thread to have a large
pool while another thread's pool is empty.

Switch to a new coroutine pool implementation with a global pool that
grows to a maximum number of coroutines and per-thread local pools that
are capped at a hardcoded small number of coroutines.

This approach does not leave large numbers of coroutines pooled in a
thread that may not use them again. In order to perform well it
amortizes the cost of global pool accesses by working in batches of
coroutines instead of individual coroutines.

The global pool is a list. Threads donate batches of coroutines to it
when they have too many and take batches from it when they have too few:

  .-----------------------------------.
  | Batch 1 | Batch 2 | Batch 3 | ... | global_pool
  `-----------------------------------'

Each thread has up to 2 batches of coroutines:

  .-------------------.
  | Batch 1 | Batch 2 | per-thread local_pool (maximum 2 batches)
  `-------------------'
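
The allocation path implied by the diagrams checks the local pool first,
refills one batch from the global pool on a miss, and only then creates
a coroutine from scratch. Simplified shape (condensed from
qemu_coroutine_create() and coroutine_pool_get() in the diff below, not
a drop-in replacement):

/* Condensed sketch: the global lock is only taken when the local pool
 * runs dry, so its cost is amortized across a whole batch. */
Coroutine *create_from_pool(CoroutineEntry *entry, void *opaque)
{
    Coroutine *co = coroutine_pool_get_local();   /* lock-free fast path */

    if (!co) {
        coroutine_pool_refill_local();    /* take one batch from global_pool */
        co = coroutine_pool_get_local();
    }
    if (!co) {
        co = qemu_coroutine_new();        /* both pools empty: allocate */
    }
    /* ... then set co->entry/co->entry_arg as qemu_coroutine_create() does */
    return co;
}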

The goal of this change is to reduce the excessive number of pooled
coroutines that cause QEMU to abort when vm.max_map_count is reached
without losing the performance of an adequately sized coroutine pool.

Here are virtio-blk disk I/O benchmark results:

      RW BLKSIZE IODEPTH    OLD    NEW CHANGE
randread      4k       1 113725 117451 +3.3%
randread      4k       8 192968 198510 +2.9%
randread      4k      16 207138 209429 +1.1%
randread      4k      32 212399 215145 +1.3%
randread      4k      64 218319 221277 +1.4%
randread    128k       1  17587  17535 -0.3%
randread    128k       8  17614  17616 +0.0%
randread    128k      16  17608  17609 +0.0%
randread    128k      32  17552  17553 +0.0%
randread    128k      64  17484  17484 +0.0%

See files/{fio.sh,test.xml.j2} for the benchmark configuration:
https://gitlab.com/stefanha/virt-playbooks/-/tree/coroutine-pool-fix-sizing

Buglink: https://issues.redhat.com/browse/RHEL-28947
Reported-by: Sanjay Rao <srao@redhat.com>
Reported-by: Boaz Ben Shabat <bbenshab@redhat.com>
Reported-by: Joe Mario <jmario@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20240318183429.1039340-1-stefanha@redhat.com>
---
 util/qemu-coroutine.c | 282 +++++++++++++++++++++++++++++++++---------
 1 file changed, 223 insertions(+), 59 deletions(-)

diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/atomic.h"
 #include "qemu/coroutine_int.h"
 #include "qemu/coroutine-tls.h"
+#include "qemu/cutils.h"
 #include "block/aio.h"
 
-/**
- * The minimal batch size is always 64, coroutines from the release_pool are
- * reused as soon as there are 64 coroutines in it. The maximum pool size starts
- * with 64 and is increased on demand so that coroutines are not deleted even if
- * they are not immediately reused.
- */
 enum {
-    POOL_MIN_BATCH_SIZE = 64,
-    POOL_INITIAL_MAX_SIZE = 64,
+    COROUTINE_POOL_BATCH_MAX_SIZE = 128,
 };
 
-/** Free list to speed up creation */
-static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
-static unsigned int pool_max_size = POOL_INITIAL_MAX_SIZE;
-static unsigned int release_pool_size;
+/*
+ * Coroutine creation and deletion is expensive so a pool of unused coroutines
+ * is kept as a cache. When the pool has coroutines available, they are
+ * recycled instead of creating new ones from scratch. Coroutines are added to
+ * the pool upon termination.
+ *
+ * The pool is global but each thread maintains a small local pool to avoid
+ * global pool contention. Threads fetch and return batches of coroutines from
+ * the global pool to maintain their local pool. The local pool holds up to two
+ * batches whereas the maximum size of the global pool is controlled by the
+ * qemu_coroutine_inc_pool_size() API.
+ *
+ * .-----------------------------------.
+ * | Batch 1 | Batch 2 | Batch 3 | ... | global_pool
+ * `-----------------------------------'
+ *
+ * .-------------------.
+ * | Batch 1 | Batch 2 | per-thread local_pool (maximum 2 batches)
+ * `-------------------'
+ */
+typedef struct CoroutinePoolBatch {
+    /* Batches are kept in a list */
+    QSLIST_ENTRY(CoroutinePoolBatch) next;
 
-typedef QSLIST_HEAD(, Coroutine) CoroutineQSList;
-QEMU_DEFINE_STATIC_CO_TLS(CoroutineQSList, alloc_pool);
-QEMU_DEFINE_STATIC_CO_TLS(unsigned int, alloc_pool_size);
-QEMU_DEFINE_STATIC_CO_TLS(Notifier, coroutine_pool_cleanup_notifier);
+    /* This batch holds up to @COROUTINE_POOL_BATCH_MAX_SIZE coroutines */
+    QSLIST_HEAD(, Coroutine) list;
+    unsigned int size;
+} CoroutinePoolBatch;
 
-static void coroutine_pool_cleanup(Notifier *n, void *value)
+typedef QSLIST_HEAD(, CoroutinePoolBatch) CoroutinePool;
+
+/* Host operating system limit on number of pooled coroutines */
+static unsigned int global_pool_hard_max_size;
+
+static QemuMutex global_pool_lock; /* protects the following variables */
+static CoroutinePool global_pool = QSLIST_HEAD_INITIALIZER(global_pool);
+static unsigned int global_pool_size;
+static unsigned int global_pool_max_size = COROUTINE_POOL_BATCH_MAX_SIZE;
+
+QEMU_DEFINE_STATIC_CO_TLS(CoroutinePool, local_pool);
+QEMU_DEFINE_STATIC_CO_TLS(Notifier, local_pool_cleanup_notifier);
+
+static CoroutinePoolBatch *coroutine_pool_batch_new(void)
+{
+    CoroutinePoolBatch *batch = g_new(CoroutinePoolBatch, 1);
+
+    QSLIST_INIT(&batch->list);
+    batch->size = 0;
+    return batch;
+}
+
+static void coroutine_pool_batch_delete(CoroutinePoolBatch *batch)
 {
     Coroutine *co;
     Coroutine *tmp;
-    CoroutineQSList *alloc_pool = get_ptr_alloc_pool();
 
-    QSLIST_FOREACH_SAFE(co, alloc_pool, pool_next, tmp) {
-        QSLIST_REMOVE_HEAD(alloc_pool, pool_next);
+    QSLIST_FOREACH_SAFE(co, &batch->list, pool_next, tmp) {
+        QSLIST_REMOVE_HEAD(&batch->list, pool_next);
         qemu_coroutine_delete(co);
     }
+    g_free(batch);
+}
+
+static void local_pool_cleanup(Notifier *n, void *value)
+{
+    CoroutinePool *local_pool = get_ptr_local_pool();
+    CoroutinePoolBatch *batch;
+    CoroutinePoolBatch *tmp;
+
+    QSLIST_FOREACH_SAFE(batch, local_pool, next, tmp) {
+        QSLIST_REMOVE_HEAD(local_pool, next);
+        coroutine_pool_batch_delete(batch);
+    }
+}
+
+/* Ensure the atexit notifier is registered */
+static void local_pool_cleanup_init_once(void)
+{
+    Notifier *notifier = get_ptr_local_pool_cleanup_notifier();
+    if (!notifier->notify) {
+        notifier->notify = local_pool_cleanup;
+        qemu_thread_atexit_add(notifier);
+    }
+}
+
+/* Helper to get the next unused coroutine from the local pool */
+static Coroutine *coroutine_pool_get_local(void)
+{
+    CoroutinePool *local_pool = get_ptr_local_pool();
+    CoroutinePoolBatch *batch = QSLIST_FIRST(local_pool);
+    Coroutine *co;
+
+    if (unlikely(!batch)) {
+        return NULL;
+    }
+
+    co = QSLIST_FIRST(&batch->list);
+    QSLIST_REMOVE_HEAD(&batch->list, pool_next);
+    batch->size--;
+
+    if (batch->size == 0) {
+        QSLIST_REMOVE_HEAD(local_pool, next);
+        coroutine_pool_batch_delete(batch);
+    }
+    return co;
+}
+
+/* Get the next batch from the global pool */
+static void coroutine_pool_refill_local(void)
+{
+    CoroutinePool *local_pool = get_ptr_local_pool();
+    CoroutinePoolBatch *batch;
+
+    WITH_QEMU_LOCK_GUARD(&global_pool_lock) {
+        batch = QSLIST_FIRST(&global_pool);
+
+        if (batch) {
+            QSLIST_REMOVE_HEAD(&global_pool, next);
+            global_pool_size -= batch->size;
+        }
+    }
+
+    if (batch) {
+        QSLIST_INSERT_HEAD(local_pool, batch, next);
+        local_pool_cleanup_init_once();
+    }
+}
+
+/* Add a batch of coroutines to the global pool */
+static void coroutine_pool_put_global(CoroutinePoolBatch *batch)
+{
+    WITH_QEMU_LOCK_GUARD(&global_pool_lock) {
+        unsigned int max = MIN(global_pool_max_size,
+                               global_pool_hard_max_size);
+
+        if (global_pool_size < max) {
+            QSLIST_INSERT_HEAD(&global_pool, batch, next);
+
+            /* Overshooting the max pool size is allowed */
+            global_pool_size += batch->size;
+            return;
+        }
+    }
+
+    /* The global pool was full, so throw away this batch */
+    coroutine_pool_batch_delete(batch);
+}
+
+/* Get the next unused coroutine from the pool or return NULL */
+static Coroutine *coroutine_pool_get(void)
+{
+    Coroutine *co;
+
+    co = coroutine_pool_get_local();
+    if (!co) {
+        coroutine_pool_refill_local();
+        co = coroutine_pool_get_local();
+    }
+    return co;
+}
+
+static void coroutine_pool_put(Coroutine *co)
+{
+    CoroutinePool *local_pool = get_ptr_local_pool();
+    CoroutinePoolBatch *batch = QSLIST_FIRST(local_pool);
+
+    if (unlikely(!batch)) {
+        batch = coroutine_pool_batch_new();
+        QSLIST_INSERT_HEAD(local_pool, batch, next);
+        local_pool_cleanup_init_once();
+    }
+
+    if (unlikely(batch->size >= COROUTINE_POOL_BATCH_MAX_SIZE)) {
+        CoroutinePoolBatch *next = QSLIST_NEXT(batch, next);
+
+        /* Is the local pool full? */
+        if (next) {
+            QSLIST_REMOVE_HEAD(local_pool, next);
+            coroutine_pool_put_global(batch);
+        }
+
+        batch = coroutine_pool_batch_new();
+        QSLIST_INSERT_HEAD(local_pool, batch, next);
+    }
+
+    QSLIST_INSERT_HEAD(&batch->list, co, pool_next);
+    batch->size++;
 }
 
 Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
@@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
     Coroutine *co = NULL;
 
     if (IS_ENABLED(CONFIG_COROUTINE_POOL)) {
-        CoroutineQSList *alloc_pool = get_ptr_alloc_pool();
-
-        co = QSLIST_FIRST(alloc_pool);
-        if (!co) {
-            if (release_pool_size > POOL_MIN_BATCH_SIZE) {
-                /* Slow path; a good place to register the destructor, too. */
-                Notifier *notifier = get_ptr_coroutine_pool_cleanup_notifier();
-                if (!notifier->notify) {
-                    notifier->notify = coroutine_pool_cleanup;
-                    qemu_thread_atexit_add(notifier);
-                }
-
-                /* This is not exact; there could be a little skew between
-                 * release_pool_size and the actual size of release_pool. But
-                 * it is just a heuristic, it does not need to be perfect.
-                 */
-                set_alloc_pool_size(qatomic_xchg(&release_pool_size, 0));
-                QSLIST_MOVE_ATOMIC(alloc_pool, &release_pool);
-                co = QSLIST_FIRST(alloc_pool);
-            }
-        }
-        if (co) {
-            QSLIST_REMOVE_HEAD(alloc_pool, pool_next);
-            set_alloc_pool_size(get_alloc_pool_size() - 1);
-        }
+        co = coroutine_pool_get();
     }
 
     if (!co) {
@@ -XXX,XX +XXX,XX @@ static void coroutine_delete(Coroutine *co)
     co->caller = NULL;
 
     if (IS_ENABLED(CONFIG_COROUTINE_POOL)) {
-        if (release_pool_size < qatomic_read(&pool_max_size) * 2) {
-            QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
-            qatomic_inc(&release_pool_size);
-            return;
-        }
-        if (get_alloc_pool_size() < qatomic_read(&pool_max_size)) {
-            QSLIST_INSERT_HEAD(get_ptr_alloc_pool(), co, pool_next);
-            set_alloc_pool_size(get_alloc_pool_size() + 1);
-            return;
-        }
+        coroutine_pool_put(co);
+    } else {
+        qemu_coroutine_delete(co);
     }
-
-    qemu_coroutine_delete(co);
 }
 
 void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
@@ -XXX,XX +XXX,XX @@ AioContext *qemu_coroutine_get_aio_context(Coroutine *co)
 
 void qemu_coroutine_inc_pool_size(unsigned int additional_pool_size)
 {
-    qatomic_add(&pool_max_size, additional_pool_size);
+    QEMU_LOCK_GUARD(&global_pool_lock);
+    global_pool_max_size += additional_pool_size;
 }
 
 void qemu_coroutine_dec_pool_size(unsigned int removing_pool_size)
 {
-    qatomic_sub(&pool_max_size, removing_pool_size);
+    QEMU_LOCK_GUARD(&global_pool_lock);
+    global_pool_max_size -= removing_pool_size;
+}
+
+static unsigned int get_global_pool_hard_max_size(void)
+{
+#ifdef __linux__
+    g_autofree char *contents = NULL;
+    int max_map_count;
+
+    /*
+     * Linux processes can have up to max_map_count virtual memory areas
+     * (VMAs). mmap(2), mprotect(2), etc fail with ENOMEM beyond this limit. We
+     * must limit the coroutine pool to a safe size to avoid running out of
+     * VMAs.
+     */
+    if (g_file_get_contents("/proc/sys/vm/max_map_count", &contents, NULL,
+                            NULL) &&
+        qemu_strtoi(contents, NULL, 10, &max_map_count) == 0) {
+        /*
+         * This is a conservative upper bound that avoids exceeding
+         * max_map_count. Leave half for non-coroutine users like library
+         * dependencies, vhost-user, etc. Each coroutine takes up 2 VMAs so
+         * halve the amount again.
+         */
+        return max_map_count / 4;
+    }
+#endif
+
+    return UINT_MAX;
+}
+
+static void __attribute__((constructor)) qemu_coroutine_init(void)
+{
+    qemu_mutex_init(&global_pool_lock);
+    global_pool_hard_max_size = get_global_pool_hard_max_size();
 }
--
2.44.0