1 | The following changes since commit ab08440a4ee09032d1a9cb22fdcab23bc7e1c656: | 1 | The following changes since commit afc9fcde55296b83f659de9da3cdf044812a6eeb: |
---|---|---|---|
2 | 2 | ||
3 | Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20180702' into staging (2018-07-02 17:57:46 +0100) | 3 | Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging (2021-10-20 06:10:51 -0700) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the Git repository at: |
6 | 6 | ||
7 | git://github.com/codyprime/qemu-kvm-jtc.git tags/block-pull-request | 7 | https://gitlab.com/stefanha/qemu.git tags/block-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to 9ded4a0114968e98b41494fc035ba14f84cdf700: | 9 | for you to fetch changes up to 4b2b3d2653f255ef4259a7689af1956536565901: |
10 | 10 | ||
11 | backup: Use copy offloading (2018-07-02 23:23:45 -0400) | 11 | coroutine: resize pool periodically instead of limiting size (2021-10-21 18:40:07 +0100) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Block backup patches | 14 | Pull request |
15 | |||
16 | Performance optimization when guest applications submit a lot of parallel I/O. | ||
17 | This has also been found to improve clang SafeStack performance. | ||
18 | |||
15 | ---------------------------------------------------------------- | 19 | ---------------------------------------------------------------- |
16 | 20 | ||
17 | Fam Zheng (3): | 21 | Stefan Hajnoczi (1): |
18 | block: Fix parameter checking in bdrv_co_copy_range_internal | 22 | coroutine: resize pool periodically instead of limiting size |
19 | block: Honour BDRV_REQ_NO_SERIALISING in copy range | ||
20 | backup: Use copy offloading | ||
21 | 23 | ||
22 | block/backup.c | 150 ++++++++++++++++++++++++++++++------------ | 24 | include/qemu/coroutine-pool-timer.h | 36 ++++++++++++++++ |
23 | block/io.c | 35 +++++----- | 25 | include/qemu/coroutine.h | 7 ++++ |
24 | block/trace-events | 1 + | 26 | iothread.c | 6 +++ |
25 | include/block/block.h | 5 +- | 27 | util/coroutine-pool-timer.c | 35 ++++++++++++++++ |
26 | 4 files changed, 132 insertions(+), 59 deletions(-) | 28 | util/main-loop.c | 5 +++ |
29 | util/qemu-coroutine.c | 64 ++++++++++++++++------------- | ||
30 | util/meson.build | 1 + | ||
31 | 7 files changed, 125 insertions(+), 29 deletions(-) | ||
32 | create mode 100644 include/qemu/coroutine-pool-timer.h | ||
33 | create mode 100644 util/coroutine-pool-timer.c | ||
27 | 34 | ||
28 | -- | 35 | -- |
29 | 2.17.1 | 36 | 2.31.1 |
30 | 37 | ||
31 | 38 | ||
39 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Fam Zheng <famz@redhat.com> | ||
2 | 1 | ||
3 | src may be NULL if BDRV_REQ_ZERO_WRITE flag is set, in this case only | ||
4 | check dst and dst->bs. This bug was introduced when moving in the | ||
5 | request tracking code from bdrv_co_copy_range, in 37aec7d75eb. | ||
6 | |||
7 | This especially fixes the possible segfault when initializing src_bs | ||
8 | with a NULL src. | ||
9 | |||
10 | Signed-off-by: Fam Zheng <famz@redhat.com> | ||
11 | Message-id: 20180703023758.14422-2-famz@redhat.com | ||
12 | Reviewed-by: Jeff Cody <jcody@redhat.com> | ||
13 | Signed-off-by: Jeff Cody <jcody@redhat.com> | ||
14 | --- | ||
15 | block/io.c | 29 +++++++++++++++-------------- | ||
16 | 1 file changed, 15 insertions(+), 14 deletions(-) | ||
17 | |||
18 | diff --git a/block/io.c b/block/io.c | ||
19 | index XXXXXXX..XXXXXXX 100644 | ||
20 | --- a/block/io.c | ||
21 | +++ b/block/io.c | ||
22 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, | ||
23 | bool recurse_src) | ||
24 | { | ||
25 | BdrvTrackedRequest src_req, dst_req; | ||
26 | - BlockDriverState *src_bs = src->bs; | ||
27 | - BlockDriverState *dst_bs = dst->bs; | ||
28 | int ret; | ||
29 | |||
30 | - if (!src || !dst || !src->bs || !dst->bs) { | ||
31 | + if (!dst || !dst->bs) { | ||
32 | return -ENOMEDIUM; | ||
33 | } | ||
34 | - ret = bdrv_check_byte_request(src->bs, src_offset, bytes); | ||
35 | - if (ret) { | ||
36 | - return ret; | ||
37 | - } | ||
38 | - | ||
39 | ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes); | ||
40 | if (ret) { | ||
41 | return ret; | ||
42 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, | ||
43 | return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, flags); | ||
44 | } | ||
45 | |||
46 | + if (!src || !src->bs) { | ||
47 | + return -ENOMEDIUM; | ||
48 | + } | ||
49 | + ret = bdrv_check_byte_request(src->bs, src_offset, bytes); | ||
50 | + if (ret) { | ||
51 | + return ret; | ||
52 | + } | ||
53 | + | ||
54 | if (!src->bs->drv->bdrv_co_copy_range_from | ||
55 | || !dst->bs->drv->bdrv_co_copy_range_to | ||
56 | || src->bs->encrypted || dst->bs->encrypted) { | ||
57 | return -ENOTSUP; | ||
58 | } | ||
59 | - bdrv_inc_in_flight(src_bs); | ||
60 | - bdrv_inc_in_flight(dst_bs); | ||
61 | - tracked_request_begin(&src_req, src_bs, src_offset, | ||
62 | + bdrv_inc_in_flight(src->bs); | ||
63 | + bdrv_inc_in_flight(dst->bs); | ||
64 | + tracked_request_begin(&src_req, src->bs, src_offset, | ||
65 | bytes, BDRV_TRACKED_READ); | ||
66 | - tracked_request_begin(&dst_req, dst_bs, dst_offset, | ||
67 | + tracked_request_begin(&dst_req, dst->bs, dst_offset, | ||
68 | bytes, BDRV_TRACKED_WRITE); | ||
69 | |||
70 | wait_serialising_requests(&src_req); | ||
71 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, | ||
72 | } | ||
73 | tracked_request_end(&src_req); | ||
74 | tracked_request_end(&dst_req); | ||
75 | - bdrv_dec_in_flight(src_bs); | ||
76 | - bdrv_dec_in_flight(dst_bs); | ||
77 | + bdrv_dec_in_flight(src->bs); | ||
78 | + bdrv_dec_in_flight(dst->bs); | ||
79 | return ret; | ||
80 | } | ||
81 | |||
82 | -- | ||
83 | 2.17.1 | ||
84 | |||
85 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Fam Zheng <famz@redhat.com> | ||
2 | 1 | ||
3 | This semantics is needed by drive-backup so implement it before using | ||
4 | this API there. | ||
5 | |||
6 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
7 | Signed-off-by: Fam Zheng <famz@redhat.com> | ||
8 | Message-id: 20180703023758.14422-3-famz@redhat.com | ||
9 | Signed-off-by: Jeff Cody <jcody@redhat.com> | ||
10 | --- | ||
11 | block/io.c | 6 ++++-- | ||
12 | include/block/block.h | 5 +++-- | ||
13 | 2 files changed, 7 insertions(+), 4 deletions(-) | ||
14 | |||
15 | diff --git a/block/io.c b/block/io.c | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/block/io.c | ||
18 | +++ b/block/io.c | ||
19 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, | ||
20 | tracked_request_begin(&dst_req, dst->bs, dst_offset, | ||
21 | bytes, BDRV_TRACKED_WRITE); | ||
22 | |||
23 | - wait_serialising_requests(&src_req); | ||
24 | - wait_serialising_requests(&dst_req); | ||
25 | + if (!(flags & BDRV_REQ_NO_SERIALISING)) { | ||
26 | + wait_serialising_requests(&src_req); | ||
27 | + wait_serialising_requests(&dst_req); | ||
28 | + } | ||
29 | if (recurse_src) { | ||
30 | ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, | ||
31 | src, src_offset, | ||
32 | diff --git a/include/block/block.h b/include/block/block.h | ||
33 | index XXXXXXX..XXXXXXX 100644 | ||
34 | --- a/include/block/block.h | ||
35 | +++ b/include/block/block.h | ||
36 | @@ -XXX,XX +XXX,XX @@ void bdrv_unregister_buf(BlockDriverState *bs, void *host); | ||
37 | * @dst: Destination child to copy data to | ||
38 | * @dst_offset: offset in @dst image to write data | ||
39 | * @bytes: number of bytes to copy | ||
40 | - * @flags: request flags. Must be one of: | ||
41 | - * 0 - actually read data from src; | ||
42 | + * @flags: request flags. Supported flags: | ||
43 | * BDRV_REQ_ZERO_WRITE - treat the @src range as zero data and do zero | ||
44 | * write on @dst as if bdrv_co_pwrite_zeroes is | ||
45 | * called. Used to simplify caller code, or | ||
46 | * during BlockDriver.bdrv_co_copy_range_from() | ||
47 | * recursion. | ||
48 | + * BDRV_REQ_NO_SERIALISING - do not serialize with other overlapping | ||
49 | + * requests currently in flight. | ||
50 | * | ||
51 | * Returns: 0 if succeeded; negative error code if failed. | ||
52 | **/ | ||
53 | -- | ||
54 | 2.17.1 | ||
55 | |||
56 | diff view generated by jsdifflib |
1 | From: Fam Zheng <famz@redhat.com> | 1 | It was reported that enabling SafeStack reduces IOPS significantly |
---|---|---|---|
2 | 2 | (>25%) with the following fio benchmark on virtio-blk using a NVMe host | |
3 | The implementation is similar to the 'qemu-img convert'. In the | 3 | block device: |
4 | beginning of the job, offloaded copy is attempted. If it fails, further | 4 | |
5 | I/O will go through the existing bounce buffer code path. | 5 | # fio --rw=randrw --bs=4k --iodepth=64 --runtime=1m --direct=1 \ |
6 | 6 | --filename=/dev/vdb --name=job1 --ioengine=libaio --thread \ | |
7 | Then, as Kevin pointed out, both this and qemu-img convert can benefit | 7 | --group_reporting --numjobs=16 --time_based \ |
8 | from a local check if one request fails because, for example, the | 8 | --output=/tmp/fio_result |
9 | offset is beyond EOF, but another may well be accepted by the protocol | 9 | |
10 | layer. This will be implemented separately. | 10 | Serge Guelton and I found that SafeStack is not really at fault, it just |
11 | 11 | increases the cost of coroutine creation. This fio workload exhausts the | |
12 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 12 | coroutine pool and coroutine creation becomes a bottleneck. Previous |
13 | Signed-off-by: Fam Zheng <famz@redhat.com> | 13 | work by Honghao Wang also pointed to excessive coroutine creation. |
14 | Message-id: 20180703023758.14422-4-famz@redhat.com | 14 | |
15 | Signed-off-by: Jeff Cody <jcody@redhat.com> | 15 | Creating new coroutines is expensive due to allocating new stacks with |
16 | mmap(2) and mprotect(2). Currently there are thread-local and global | ||
17 | pools that recycle old Coroutine objects and their stacks but the | ||
18 | hardcoded size limit of 64 for thread-local pools and 128 for the global | ||
19 | pool is insufficient for the fio benchmark shown above. | ||
20 | |||
21 | This patch changes the coroutine pool algorithm to a simple thread-local | ||
22 | pool without a maximum size limit. Threads periodically shrink the pool | ||
23 | down to a size sufficient for the maximum observed number of coroutines. | ||
24 | |||
25 | The global pool is removed by this patch. It can help to hide the fact | ||
26 | that local pools are easily exhausted, but it doesn't fix the root | ||
27 | cause. I don't think there is a need for a global pool because QEMU's | ||
28 | threads are long-lived, so let's keep things simple. | ||
29 | |||
30 | Performance of the above fio benchmark is as follows: | ||
31 | |||
32 | Before After | ||
33 | IOPS 60k 97k | ||
34 | |||
35 | Memory usage varies over time as needed by the workload: | ||
36 | |||
37 | VSZ (KB) RSS (KB) | ||
38 | Before fio 4705248 843128 | ||
39 | During fio 5747668 (+ ~100 MB) 849280 | ||
40 | After fio 4694996 (- ~100 MB) 845184 | ||
41 | |||
42 | This confirms that coroutines are indeed being freed when no longer | ||
43 | needed. | ||
44 | |||
45 | Thanks to Serge Guelton for working on identifying the bottleneck with | ||
46 | me! | ||
47 | |||
48 | Reported-by: Tingting Mao <timao@redhat.com> | ||
49 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
50 | Message-id: 20210913153524.1190696-1-stefanha@redhat.com | ||
51 | Cc: Serge Guelton <sguelton@redhat.com> | ||
52 | Cc: Honghao Wang <wanghonghao@bytedance.com> | ||
53 | Cc: Paolo Bonzini <pbonzini@redhat.com> | ||
54 | Cc: Daniele Buono <dbuono@linux.vnet.ibm.com> | ||
55 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
56 | |||
57 | [Moved atexit notifier to coroutine_delete() after GitLab CI reported a | ||
58 | memory leak in tests/unit/test-aio-multithread because the Coroutine | ||
59 | object was created in the main thread but runs in an IOThread (where | ||
60 | it's also deleted). | ||
61 | --Stefan] | ||
16 | --- | 62 | --- |
17 | block/backup.c | 150 ++++++++++++++++++++++++++++++++------------- | 63 | include/qemu/coroutine-pool-timer.h | 36 ++++++++++++++++ |
18 | block/trace-events | 1 + | 64 | include/qemu/coroutine.h | 7 ++++ |
19 | 2 files changed, 110 insertions(+), 41 deletions(-) | 65 | iothread.c | 6 +++ |
20 | 66 | util/coroutine-pool-timer.c | 35 ++++++++++++++++ | |
21 | diff --git a/block/backup.c b/block/backup.c | 67 | util/main-loop.c | 5 +++ |
22 | index XXXXXXX..XXXXXXX 100644 | 68 | util/qemu-coroutine.c | 64 ++++++++++++++++------------- |
23 | --- a/block/backup.c | 69 | util/meson.build | 1 + |
24 | +++ b/block/backup.c | 70 | 7 files changed, 125 insertions(+), 29 deletions(-) |
25 | @@ -XXX,XX +XXX,XX @@ typedef struct BackupBlockJob { | 71 | create mode 100644 include/qemu/coroutine-pool-timer.h |
26 | QLIST_HEAD(, CowRequest) inflight_reqs; | 72 | create mode 100644 util/coroutine-pool-timer.c |
27 | 73 | ||
28 | HBitmap *copy_bitmap; | 74 | diff --git a/include/qemu/coroutine-pool-timer.h b/include/qemu/coroutine-pool-timer.h |
29 | + bool use_copy_range; | 75 | new file mode 100644 |
30 | + int64_t copy_range_size; | 76 | index XXXXXXX..XXXXXXX |
31 | } BackupBlockJob; | 77 | --- /dev/null |
32 | 78 | +++ b/include/qemu/coroutine-pool-timer.h | |
33 | static const BlockJobDriver backup_job_driver; | 79 | @@ -XXX,XX +XXX,XX @@ |
34 | @@ -XXX,XX +XXX,XX @@ static void cow_request_end(CowRequest *req) | 80 | +/* |
35 | qemu_co_queue_restart_all(&req->wait_queue); | 81 | + * QEMU coroutine pool timer |
82 | + * | ||
83 | + * Copyright (c) 2021 Red Hat, Inc. | ||
84 | + * | ||
85 | + * SPDX-License-Identifier: LGPL-2.1-or-later | ||
86 | + * | ||
87 | + * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. | ||
88 | + * See the COPYING.LIB file in the top-level directory. | ||
89 | + * | ||
90 | + */ | ||
91 | +#ifndef COROUTINE_POOL_TIMER_H | ||
92 | +#define COROUTINE_POOL_TIMER_H | ||
93 | + | ||
94 | +#include "qemu/osdep.h" | ||
95 | +#include "block/aio.h" | ||
96 | + | ||
97 | +/** | ||
98 | + * A timer that periodically resizes this thread's coroutine pool, freeing | ||
99 | + * memory if there are too many unused coroutines. | ||
100 | + * | ||
101 | + * Threads that make heavy use of coroutines should use this. Failure to resize | ||
102 | + * the coroutine pool can lead to large amounts of memory sitting idle and | ||
103 | + * never being used after the first time. | ||
104 | + */ | ||
105 | +typedef struct { | ||
106 | + QEMUTimer *timer; | ||
107 | +} CoroutinePoolTimer; | ||
108 | + | ||
109 | +/* Call this before the thread runs the AioContext */ | ||
110 | +void coroutine_pool_timer_init(CoroutinePoolTimer *pt, AioContext *ctx); | ||
111 | + | ||
112 | +/* Call this before the AioContext from the init function is destroyed */ | ||
113 | +void coroutine_pool_timer_cleanup(CoroutinePoolTimer *pt); | ||
114 | + | ||
115 | +#endif /* COROUTINE_POOL_TIMER_H */ | ||
116 | diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h | ||
117 | index XXXXXXX..XXXXXXX 100644 | ||
118 | --- a/include/qemu/coroutine.h | ||
119 | +++ b/include/qemu/coroutine.h | ||
120 | @@ -XXX,XX +XXX,XX @@ bool qemu_in_coroutine(void); | ||
121 | */ | ||
122 | bool qemu_coroutine_entered(Coroutine *co); | ||
123 | |||
124 | +/** | ||
125 | + * Optionally call this function periodically to shrink the thread-local pool | ||
126 | + * down. Spiky workloads can create many coroutines and then never reach that | ||
127 | + * level again. Shrinking the pool reclaims memory in this case. | ||
128 | + */ | ||
129 | +void qemu_coroutine_pool_periodic_resize(void); | ||
130 | + | ||
131 | /** | ||
132 | * Provides a mutex that can be used to synchronise coroutines | ||
133 | */ | ||
134 | diff --git a/iothread.c b/iothread.c | ||
135 | index XXXXXXX..XXXXXXX 100644 | ||
136 | --- a/iothread.c | ||
137 | +++ b/iothread.c | ||
138 | @@ -XXX,XX +XXX,XX @@ | ||
139 | #include "qemu/error-report.h" | ||
140 | #include "qemu/rcu.h" | ||
141 | #include "qemu/main-loop.h" | ||
142 | +#include "qemu/coroutine-pool-timer.h" | ||
143 | |||
144 | typedef ObjectClass IOThreadClass; | ||
145 | |||
146 | @@ -XXX,XX +XXX,XX @@ DECLARE_CLASS_CHECKERS(IOThreadClass, IOTHREAD, | ||
147 | static void *iothread_run(void *opaque) | ||
148 | { | ||
149 | IOThread *iothread = opaque; | ||
150 | + CoroutinePoolTimer co_pool_timer; | ||
151 | |||
152 | rcu_register_thread(); | ||
153 | /* | ||
154 | @@ -XXX,XX +XXX,XX @@ static void *iothread_run(void *opaque) | ||
155 | iothread->thread_id = qemu_get_thread_id(); | ||
156 | qemu_sem_post(&iothread->init_done_sem); | ||
157 | |||
158 | + coroutine_pool_timer_init(&co_pool_timer, iothread->ctx); | ||
159 | + | ||
160 | while (iothread->running) { | ||
161 | /* | ||
162 | * Note: from functional-wise the g_main_loop_run() below can | ||
163 | @@ -XXX,XX +XXX,XX @@ static void *iothread_run(void *opaque) | ||
164 | } | ||
165 | } | ||
166 | |||
167 | + coroutine_pool_timer_cleanup(&co_pool_timer); | ||
168 | + | ||
169 | g_main_context_pop_thread_default(iothread->worker_context); | ||
170 | rcu_unregister_thread(); | ||
171 | return NULL; | ||
172 | diff --git a/util/coroutine-pool-timer.c b/util/coroutine-pool-timer.c | ||
173 | new file mode 100644 | ||
174 | index XXXXXXX..XXXXXXX | ||
175 | --- /dev/null | ||
176 | +++ b/util/coroutine-pool-timer.c | ||
177 | @@ -XXX,XX +XXX,XX @@ | ||
178 | +/* | ||
179 | + * QEMU coroutine pool timer | ||
180 | + * | ||
181 | + * Copyright (c) 2021 Red Hat, Inc. | ||
182 | + * | ||
183 | + * SPDX-License-Identifier: LGPL-2.1-or-later | ||
184 | + * | ||
185 | + * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. | ||
186 | + * See the COPYING.LIB file in the top-level directory. | ||
187 | + * | ||
188 | + */ | ||
189 | +#include "qemu/coroutine-pool-timer.h" | ||
190 | + | ||
191 | +static void coroutine_pool_timer_cb(void *opaque) | ||
192 | +{ | ||
193 | + CoroutinePoolTimer *pt = opaque; | ||
194 | + int64_t expiry_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + | ||
195 | + 15 * NANOSECONDS_PER_SECOND; | ||
196 | + | ||
197 | + qemu_coroutine_pool_periodic_resize(); | ||
198 | + timer_mod(pt->timer, expiry_time_ns); | ||
199 | +} | ||
200 | + | ||
201 | +void coroutine_pool_timer_init(CoroutinePoolTimer *pt, AioContext *ctx) | ||
202 | +{ | ||
203 | + pt->timer = aio_timer_new(ctx, QEMU_CLOCK_REALTIME, SCALE_NS, | ||
204 | + coroutine_pool_timer_cb, pt); | ||
205 | + coroutine_pool_timer_cb(pt); | ||
206 | +} | ||
207 | + | ||
208 | +void coroutine_pool_timer_cleanup(CoroutinePoolTimer *pt) | ||
209 | +{ | ||
210 | + timer_free(pt->timer); | ||
211 | + pt->timer = NULL; | ||
212 | +} | ||
213 | diff --git a/util/main-loop.c b/util/main-loop.c | ||
214 | index XXXXXXX..XXXXXXX 100644 | ||
215 | --- a/util/main-loop.c | ||
216 | +++ b/util/main-loop.c | ||
217 | @@ -XXX,XX +XXX,XX @@ | ||
218 | #include "qemu/error-report.h" | ||
219 | #include "qemu/queue.h" | ||
220 | #include "qemu/compiler.h" | ||
221 | +#include "qemu/coroutine-pool-timer.h" | ||
222 | |||
223 | #ifndef _WIN32 | ||
224 | #include <sys/wait.h> | ||
225 | @@ -XXX,XX +XXX,XX @@ static int qemu_signal_init(Error **errp) | ||
226 | |||
227 | static AioContext *qemu_aio_context; | ||
228 | static QEMUBH *qemu_notify_bh; | ||
229 | +static CoroutinePoolTimer main_loop_co_pool_timer; | ||
230 | |||
231 | static void notify_event_cb(void *opaque) | ||
232 | { | ||
233 | @@ -XXX,XX +XXX,XX @@ int qemu_init_main_loop(Error **errp) | ||
234 | g_source_set_name(src, "io-handler"); | ||
235 | g_source_attach(src, NULL); | ||
236 | g_source_unref(src); | ||
237 | + | ||
238 | + coroutine_pool_timer_init(&main_loop_co_pool_timer, qemu_aio_context); | ||
239 | + | ||
240 | return 0; | ||
36 | } | 241 | } |
37 | 242 | ||
38 | +/* Copy range to target with a bounce buffer and return the bytes copied. If | 243 | diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c |
39 | + * error occurred, return a negative error number */ | 244 | index XXXXXXX..XXXXXXX 100644 |
40 | +static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job, | 245 | --- a/util/qemu-coroutine.c |
41 | + int64_t start, | 246 | +++ b/util/qemu-coroutine.c |
42 | + int64_t end, | 247 | @@ -XXX,XX +XXX,XX @@ |
43 | + bool is_write_notifier, | 248 | #include "block/aio.h" |
44 | + bool *error_is_read, | 249 | |
45 | + void **bounce_buffer) | 250 | enum { |
251 | - POOL_BATCH_SIZE = 64, | ||
252 | + /* | ||
253 | + * qemu_coroutine_pool_periodic_resize() keeps at least this many | ||
254 | + * coroutines around. | ||
255 | + */ | ||
256 | + ALLOC_POOL_MIN = 64, | ||
257 | }; | ||
258 | |||
259 | + | ||
260 | /** Free list to speed up creation */ | ||
261 | -static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool); | ||
262 | -static unsigned int release_pool_size; | ||
263 | static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool); | ||
264 | static __thread unsigned int alloc_pool_size; | ||
265 | +static __thread unsigned int num_coroutines; | ||
266 | +static __thread unsigned int max_coroutines_this_slice; | ||
267 | static __thread Notifier coroutine_pool_cleanup_notifier; | ||
268 | |||
269 | static void coroutine_pool_cleanup(Notifier *n, void *value) | ||
270 | @@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque) | ||
271 | |||
272 | if (CONFIG_COROUTINE_POOL) { | ||
273 | co = QSLIST_FIRST(&alloc_pool); | ||
274 | - if (!co) { | ||
275 | - if (release_pool_size > POOL_BATCH_SIZE) { | ||
276 | - /* Slow path; a good place to register the destructor, too. */ | ||
277 | - if (!coroutine_pool_cleanup_notifier.notify) { | ||
278 | - coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup; | ||
279 | - qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier); | ||
280 | - } | ||
281 | - | ||
282 | - /* This is not exact; there could be a little skew between | ||
283 | - * release_pool_size and the actual size of release_pool. But | ||
284 | - * it is just a heuristic, it does not need to be perfect. | ||
285 | - */ | ||
286 | - alloc_pool_size = qatomic_xchg(&release_pool_size, 0); | ||
287 | - QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool); | ||
288 | - co = QSLIST_FIRST(&alloc_pool); | ||
289 | - } | ||
290 | - } | ||
291 | if (co) { | ||
292 | QSLIST_REMOVE_HEAD(&alloc_pool, pool_next); | ||
293 | alloc_pool_size--; | ||
294 | } | ||
295 | + | ||
296 | + num_coroutines++; | ||
297 | + if (num_coroutines > max_coroutines_this_slice) { | ||
298 | + max_coroutines_this_slice = num_coroutines; | ||
299 | + } | ||
300 | } | ||
301 | |||
302 | if (!co) { | ||
303 | @@ -XXX,XX +XXX,XX @@ static void coroutine_delete(Coroutine *co) | ||
304 | co->caller = NULL; | ||
305 | |||
306 | if (CONFIG_COROUTINE_POOL) { | ||
307 | - if (release_pool_size < POOL_BATCH_SIZE * 2) { | ||
308 | - QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next); | ||
309 | - qatomic_inc(&release_pool_size); | ||
310 | - return; | ||
311 | - } | ||
312 | - if (alloc_pool_size < POOL_BATCH_SIZE) { | ||
313 | - QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next); | ||
314 | - alloc_pool_size++; | ||
315 | - return; | ||
316 | + if (!coroutine_pool_cleanup_notifier.notify) { | ||
317 | + coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup; | ||
318 | + qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier); | ||
319 | } | ||
320 | + | ||
321 | + num_coroutines--; | ||
322 | + QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next); | ||
323 | + alloc_pool_size++; | ||
324 | + return; | ||
325 | } | ||
326 | |||
327 | qemu_coroutine_delete(co); | ||
328 | } | ||
329 | |||
330 | +void qemu_coroutine_pool_periodic_resize(void) | ||
46 | +{ | 331 | +{ |
47 | + int ret; | 332 | + unsigned pool_size_target = |
48 | + struct iovec iov; | 333 | + MAX(ALLOC_POOL_MIN, max_coroutines_this_slice) - num_coroutines; |
49 | + QEMUIOVector qiov; | 334 | + max_coroutines_this_slice = num_coroutines; |
50 | + BlockBackend *blk = job->common.blk; | 335 | + |
51 | + int nbytes; | 336 | + while (alloc_pool_size > pool_size_target) { |
52 | + | 337 | + Coroutine *co = QSLIST_FIRST(&alloc_pool); |
53 | + hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1); | 338 | + QSLIST_REMOVE_HEAD(&alloc_pool, pool_next); |
54 | + nbytes = MIN(job->cluster_size, job->len - start); | 339 | + qemu_coroutine_delete(co); |
55 | + if (!*bounce_buffer) { | 340 | + alloc_pool_size--; |
56 | + *bounce_buffer = blk_blockalign(blk, job->cluster_size); | ||
57 | + } | 341 | + } |
58 | + iov.iov_base = *bounce_buffer; | ||
59 | + iov.iov_len = nbytes; | ||
60 | + qemu_iovec_init_external(&qiov, &iov, 1); | ||
61 | + | ||
62 | + ret = blk_co_preadv(blk, start, qiov.size, &qiov, | ||
63 | + is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0); | ||
64 | + if (ret < 0) { | ||
65 | + trace_backup_do_cow_read_fail(job, start, ret); | ||
66 | + if (error_is_read) { | ||
67 | + *error_is_read = true; | ||
68 | + } | ||
69 | + goto fail; | ||
70 | + } | ||
71 | + | ||
72 | + if (qemu_iovec_is_zero(&qiov)) { | ||
73 | + ret = blk_co_pwrite_zeroes(job->target, start, | ||
74 | + qiov.size, BDRV_REQ_MAY_UNMAP); | ||
75 | + } else { | ||
76 | + ret = blk_co_pwritev(job->target, start, | ||
77 | + qiov.size, &qiov, | ||
78 | + job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0); | ||
79 | + } | ||
80 | + if (ret < 0) { | ||
81 | + trace_backup_do_cow_write_fail(job, start, ret); | ||
82 | + if (error_is_read) { | ||
83 | + *error_is_read = false; | ||
84 | + } | ||
85 | + goto fail; | ||
86 | + } | ||
87 | + | ||
88 | + return nbytes; | ||
89 | +fail: | ||
90 | + hbitmap_set(job->copy_bitmap, start / job->cluster_size, 1); | ||
91 | + return ret; | ||
92 | + | ||
93 | +} | 342 | +} |
94 | + | 343 | + |
95 | +/* Copy range to target and return the bytes copied. If error occurred, return a | 344 | void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co) |
96 | + * negative error number. */ | ||
97 | +static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job, | ||
98 | + int64_t start, | ||
99 | + int64_t end, | ||
100 | + bool is_write_notifier) | ||
101 | +{ | ||
102 | + int ret; | ||
103 | + int nr_clusters; | ||
104 | + BlockBackend *blk = job->common.blk; | ||
105 | + int nbytes; | ||
106 | + | ||
107 | + assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size)); | ||
108 | + nbytes = MIN(job->copy_range_size, end - start); | ||
109 | + nr_clusters = DIV_ROUND_UP(nbytes, job->cluster_size); | ||
110 | + hbitmap_reset(job->copy_bitmap, start / job->cluster_size, | ||
111 | + nr_clusters); | ||
112 | + ret = blk_co_copy_range(blk, start, job->target, start, nbytes, | ||
113 | + is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0); | ||
114 | + if (ret < 0) { | ||
115 | + trace_backup_do_cow_copy_range_fail(job, start, ret); | ||
116 | + hbitmap_set(job->copy_bitmap, start / job->cluster_size, | ||
117 | + nr_clusters); | ||
118 | + return ret; | ||
119 | + } | ||
120 | + | ||
121 | + return nbytes; | ||
122 | +} | ||
123 | + | ||
124 | static int coroutine_fn backup_do_cow(BackupBlockJob *job, | ||
125 | int64_t offset, uint64_t bytes, | ||
126 | bool *error_is_read, | ||
127 | bool is_write_notifier) | ||
128 | { | 345 | { |
129 | - BlockBackend *blk = job->common.blk; | 346 | QSIMPLEQ_HEAD(, Coroutine) pending = QSIMPLEQ_HEAD_INITIALIZER(pending); |
130 | CowRequest cow_request; | 347 | diff --git a/util/meson.build b/util/meson.build |
131 | - struct iovec iov; | 348 | index XXXXXXX..XXXXXXX 100644 |
132 | - QEMUIOVector bounce_qiov; | 349 | --- a/util/meson.build |
133 | - void *bounce_buffer = NULL; | 350 | +++ b/util/meson.build |
134 | int ret = 0; | 351 | @@ -XXX,XX +XXX,XX @@ if have_block |
135 | int64_t start, end; /* bytes */ | 352 | util_ss.add(files('buffer.c')) |
136 | - int n; /* bytes */ | 353 | util_ss.add(files('bufferiszero.c')) |
137 | + void *bounce_buffer = NULL; | 354 | util_ss.add(files('coroutine-@0@.c'.format(config_host['CONFIG_COROUTINE_BACKEND']))) |
138 | 355 | + util_ss.add(files('coroutine-pool-timer.c')) | |
139 | qemu_co_rwlock_rdlock(&job->flush_rwlock); | 356 | util_ss.add(files('hbitmap.c')) |
140 | 357 | util_ss.add(files('hexdump.c')) | |
141 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job, | 358 | util_ss.add(files('iova-tree.c')) |
142 | wait_for_overlapping_requests(job, start, end); | ||
143 | cow_request_begin(&cow_request, job, start, end); | ||
144 | |||
145 | - for (; start < end; start += job->cluster_size) { | ||
146 | + while (start < end) { | ||
147 | if (!hbitmap_get(job->copy_bitmap, start / job->cluster_size)) { | ||
148 | trace_backup_do_cow_skip(job, start); | ||
149 | + start += job->cluster_size; | ||
150 | continue; /* already copied */ | ||
151 | } | ||
152 | - hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1); | ||
153 | |||
154 | trace_backup_do_cow_process(job, start); | ||
155 | |||
156 | - n = MIN(job->cluster_size, job->len - start); | ||
157 | - | ||
158 | - if (!bounce_buffer) { | ||
159 | - bounce_buffer = blk_blockalign(blk, job->cluster_size); | ||
160 | - } | ||
161 | - iov.iov_base = bounce_buffer; | ||
162 | - iov.iov_len = n; | ||
163 | - qemu_iovec_init_external(&bounce_qiov, &iov, 1); | ||
164 | - | ||
165 | - ret = blk_co_preadv(blk, start, bounce_qiov.size, &bounce_qiov, | ||
166 | - is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0); | ||
167 | - if (ret < 0) { | ||
168 | - trace_backup_do_cow_read_fail(job, start, ret); | ||
169 | - if (error_is_read) { | ||
170 | - *error_is_read = true; | ||
171 | + if (job->use_copy_range) { | ||
172 | + ret = backup_cow_with_offload(job, start, end, is_write_notifier); | ||
173 | + if (ret < 0) { | ||
174 | + job->use_copy_range = false; | ||
175 | } | ||
176 | - hbitmap_set(job->copy_bitmap, start / job->cluster_size, 1); | ||
177 | - goto out; | ||
178 | } | ||
179 | - | ||
180 | - if (buffer_is_zero(iov.iov_base, iov.iov_len)) { | ||
181 | - ret = blk_co_pwrite_zeroes(job->target, start, | ||
182 | - bounce_qiov.size, BDRV_REQ_MAY_UNMAP); | ||
183 | - } else { | ||
184 | - ret = blk_co_pwritev(job->target, start, | ||
185 | - bounce_qiov.size, &bounce_qiov, | ||
186 | - job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0); | ||
187 | + if (!job->use_copy_range) { | ||
188 | + ret = backup_cow_with_bounce_buffer(job, start, end, is_write_notifier, | ||
189 | + error_is_read, &bounce_buffer); | ||
190 | } | ||
191 | if (ret < 0) { | ||
192 | - trace_backup_do_cow_write_fail(job, start, ret); | ||
193 | - if (error_is_read) { | ||
194 | - *error_is_read = false; | ||
195 | - } | ||
196 | - hbitmap_set(job->copy_bitmap, start / job->cluster_size, 1); | ||
197 | - goto out; | ||
198 | + break; | ||
199 | } | ||
200 | |||
201 | /* Publish progress, guest I/O counts as progress too. Note that the | ||
202 | * offset field is an opaque progress value, it is not a disk offset. | ||
203 | */ | ||
204 | - job->bytes_read += n; | ||
205 | - job_progress_update(&job->common.job, n); | ||
206 | + start += ret; | ||
207 | + job->bytes_read += ret; | ||
208 | + job_progress_update(&job->common.job, ret); | ||
209 | + ret = 0; | ||
210 | } | ||
211 | |||
212 | -out: | ||
213 | if (bounce_buffer) { | ||
214 | qemu_vfree(bounce_buffer); | ||
215 | } | ||
216 | @@ -XXX,XX +XXX,XX @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, | ||
217 | } else { | ||
218 | job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size); | ||
219 | } | ||
220 | + job->use_copy_range = true; | ||
221 | + job->copy_range_size = MIN_NON_ZERO(blk_get_max_transfer(job->common.blk), | ||
222 | + blk_get_max_transfer(job->target)); | ||
223 | + job->copy_range_size = MAX(job->cluster_size, | ||
224 | + QEMU_ALIGN_UP(job->copy_range_size, | ||
225 | + job->cluster_size)); | ||
226 | |||
227 | /* Required permissions are already taken with target's blk_new() */ | ||
228 | block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL, | ||
229 | diff --git a/block/trace-events b/block/trace-events | ||
230 | index XXXXXXX..XXXXXXX 100644 | ||
231 | --- a/block/trace-events | ||
232 | +++ b/block/trace-events | ||
233 | @@ -XXX,XX +XXX,XX @@ backup_do_cow_skip(void *job, int64_t start) "job %p start %"PRId64 | ||
234 | backup_do_cow_process(void *job, int64_t start) "job %p start %"PRId64 | ||
235 | backup_do_cow_read_fail(void *job, int64_t start, int ret) "job %p start %"PRId64" ret %d" | ||
236 | backup_do_cow_write_fail(void *job, int64_t start, int ret) "job %p start %"PRId64" ret %d" | ||
237 | +backup_do_cow_copy_range_fail(void *job, int64_t start, int ret) "job %p start %"PRId64" ret %d" | ||
238 | |||
239 | # blockdev.c | ||
240 | qmp_block_job_cancel(void *job) "job %p" | ||
241 | -- | 359 | -- |
242 | 2.17.1 | 360 | 2.31.1 |
243 | 361 | ||
244 | 362 | diff view generated by jsdifflib |