1 | The following changes since commit c25e8bba1f546ea72744ccfab77f8a9e8a323be8: | 1 | The following changes since commit 8bac3ba57eecc466b7e73dabf7d19328a59f684e: |
---|---|---|---|
2 | 2 | ||
3 | Merge remote-tracking branch 'remotes/otubo/tags/pull-seccomp-20180601' into staging (2018-06-01 13:11:30 +0100) | 3 | Merge remote-tracking branch 'remotes/rth/tags/pull-rx-20200408' into staging (2020-04-09 13:23:30 +0100) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the Git repository at: |
6 | 6 | ||
7 | git://github.com/stefanha/qemu.git tags/block-pull-request | 7 | https://github.com/stefanha/qemu.git tags/block-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to 21891a5a3011608845b5d7f1f9cce60cdc2bcc62: | 9 | for you to fetch changes up to 5710a3e09f9b85801e5ce70797a4a511e5fc9e2c: |
10 | 10 | ||
11 | main-loop: drop spin_counter (2018-06-01 16:01:29 +0100) | 11 | async: use explicit memory barriers (2020-04-09 16:17:14 +0100) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Pull request | 14 | Pull request |
15 | 15 | ||
16 | * Copy offloading for qemu-img convert (iSCSI, raw, and qcow2) | 16 | Fixes for QEMU on aarch64 ARM hosts and fdmon-io_uring. |
17 | |||
18 | If the underlying storage supports copy offloading, qemu-img convert will | ||
19 | use it instead of performing reads and writes. This avoids data transfers | ||
20 | and thus frees up storage bandwidth for other purposes. SCSI EXTENDED COPY | ||
21 | and Linux copy_file_range(2) are used to implement this optimization. | ||
22 | |||
23 | * Drop spurious "WARNING: I\/O thread spun for 1000 iterations" warning | ||
24 | 17 | ||
25 | ---------------------------------------------------------------- | 18 | ---------------------------------------------------------------- |
26 | 19 | ||
27 | Fam Zheng (10): | 20 | Paolo Bonzini (2): |
28 | block: Introduce API for copy offloading | 21 | aio-wait: delegate polling of main AioContext if BQL not held |
29 | raw: Check byte range uniformly | 22 | async: use explicit memory barriers |
30 | raw: Implement copy offloading | ||
31 | qcow2: Implement copy offloading | ||
32 | file-posix: Implement bdrv_co_copy_range | ||
33 | iscsi: Query and save device designator when opening | ||
34 | iscsi: Create and use iscsi_co_wait_for_task | ||
35 | iscsi: Implement copy offloading | ||
36 | block-backend: Add blk_co_copy_range | ||
37 | qemu-img: Convert with copy offloading | ||
38 | 23 | ||
39 | Stefan Hajnoczi (1): | 24 | Stefan Hajnoczi (1): |
40 | main-loop: drop spin_counter | 25 | aio-posix: signal-proof fdmon-io_uring |
41 | 26 | ||
42 | configure | 17 ++ | 27 | include/block/aio-wait.h | 22 ++++++++++++++++++++++ |
43 | include/block/block.h | 32 ++++ | 28 | include/block/aio.h | 29 ++++++++++------------------- |
44 | include/block/block_int.h | 38 ++++ | 29 | util/aio-posix.c | 16 ++++++++++++++-- |
45 | include/block/raw-aio.h | 10 +- | 30 | util/aio-win32.c | 17 ++++++++++++++--- |
46 | include/scsi/constants.h | 4 + | 31 | util/async.c | 16 ++++++++++++---- |
47 | include/sysemu/block-backend.h | 4 + | 32 | util/fdmon-io_uring.c | 10 ++++++++-- |
48 | block/block-backend.c | 18 ++ | 33 | 6 files changed, 80 insertions(+), 30 deletions(-) |
49 | block/file-posix.c | 98 +++++++++- | ||
50 | block/io.c | 97 ++++++++++ | ||
51 | block/iscsi.c | 314 +++++++++++++++++++++++++++---- | ||
52 | block/qcow2.c | 229 +++++++++++++++++++--- | ||
53 | block/raw-format.c | 96 +++++++--- | ||
54 | qemu-img.c | 50 ++++- | ||
55 | util/main-loop.c | 25 --- | ||
56 | tests/qemu-iotests/common.filter | 1 - | ||
57 | 15 files changed, 908 insertions(+), 125 deletions(-) | ||
58 | 34 | ||
59 | -- | 35 | -- |
60 | 2.17.1 | 36 | 2.25.1 |
61 | 37 | ||
62 | diff view generated by jsdifflib |
1 | Commit d759c951f3287fad04210a52f2dc93f94cf58c7f ("replay: push | 1 | The io_uring_enter(2) syscall returns with errno=EINTR when interrupted |
---|---|---|---|
2 | replay_mutex_lock up the call tree") removed the !timeout lock | 2 | by a signal. Retry the syscall in this case. |
3 | optimization in the main loop. | ||
4 | 3 | ||
5 | The idea of the optimization was to avoid ping-pongs between threads by | 4 | It's essential to do this in the io_uring_submit_and_wait() case. My |
6 | keeping the Big QEMU Lock held across non-blocking (!timeout) main loop | 5 | interpretation of the Linux v5.5 io_uring_enter(2) code is that it |
7 | iterations. | 6 | shouldn't affect the io_uring_submit() case, but there is no guarantee |
7 | this will always be the case. Let's check for -EINTR around both APIs. | ||
8 | 8 | ||
9 | A warning is printed when the main loop spins without releasing BQL for | 9 | Note that the liburing APIs have -errno return values. |
10 | long periods of time. These warnings were supposed to aid debugging but | ||
11 | in practice they just alarm users. They are considered noise because | ||
12 | the cause of spinning is not shown and is hard to find. | ||
13 | |||
14 | Now that the lock optimization has been removed, there is no danger of | ||
15 | hogging the BQL. Drop the spin counter and the infamous warning. | ||
16 | 10 | ||
17 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 11 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
18 | Reviewed-by: Jeff Cody <jcody@redhat.com> | 12 | Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> |
13 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
14 | Message-id: 20200408091139.273851-1-stefanha@redhat.com | ||
15 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
19 | --- | 16 | --- |
20 | util/main-loop.c | 25 ------------------------- | 17 | util/fdmon-io_uring.c | 10 ++++++++-- |
21 | tests/qemu-iotests/common.filter | 1 - | 18 | 1 file changed, 8 insertions(+), 2 deletions(-) |
22 | 2 files changed, 26 deletions(-) | ||
23 | 19 | ||
24 | diff --git a/util/main-loop.c b/util/main-loop.c | 20 | diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c |
25 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/util/main-loop.c | 22 | --- a/util/fdmon-io_uring.c |
27 | +++ b/util/main-loop.c | 23 | +++ b/util/fdmon-io_uring.c |
28 | @@ -XXX,XX +XXX,XX @@ static int os_host_main_loop_wait(int64_t timeout) | 24 | @@ -XXX,XX +XXX,XX @@ static struct io_uring_sqe *get_sqe(AioContext *ctx) |
29 | { | 25 | } |
30 | GMainContext *context = g_main_context_default(); | 26 | |
31 | int ret; | 27 | /* No free sqes left, submit pending sqes first */ |
32 | - static int spin_counter; | 28 | - ret = io_uring_submit(ring); |
33 | 29 | + do { | |
34 | g_main_context_acquire(context); | 30 | + ret = io_uring_submit(ring); |
35 | 31 | + } while (ret == -EINTR); | |
36 | glib_pollfds_fill(&timeout); | 32 | + |
37 | 33 | assert(ret > 1); | |
38 | - /* If the I/O thread is very busy or we are incorrectly busy waiting in | 34 | sqe = io_uring_get_sqe(ring); |
39 | - * the I/O thread, this can lead to starvation of the BQL such that the | 35 | assert(sqe); |
40 | - * VCPU threads never run. To make sure we can detect the later case, | 36 | @@ -XXX,XX +XXX,XX @@ static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list, |
41 | - * print a message to the screen. If we run into this condition, create | 37 | |
42 | - * a fake timeout in order to give the VCPU threads a chance to run. | 38 | fill_sq_ring(ctx); |
43 | - */ | 39 | |
44 | - if (!timeout && (spin_counter > MAX_MAIN_LOOP_SPIN)) { | 40 | - ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr); |
45 | - static bool notified; | 41 | + do { |
46 | - | 42 | + ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr); |
47 | - if (!notified && !qtest_enabled() && !qtest_driver()) { | 43 | + } while (ret == -EINTR); |
48 | - warn_report("I/O thread spun for %d iterations", | 44 | + |
49 | - MAX_MAIN_LOOP_SPIN); | 45 | assert(ret >= 0); |
50 | - notified = true; | 46 | |
51 | - } | 47 | return process_cq_ring(ctx, ready_list); |
52 | - | ||
53 | - timeout = SCALE_MS; | ||
54 | - } | ||
55 | - | ||
56 | - | ||
57 | - if (timeout) { | ||
58 | - spin_counter = 0; | ||
59 | - } else { | ||
60 | - spin_counter++; | ||
61 | - } | ||
62 | qemu_mutex_unlock_iothread(); | ||
63 | replay_mutex_unlock(); | ||
64 | |||
65 | diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter | ||
66 | index XXXXXXX..XXXXXXX 100644 | ||
67 | --- a/tests/qemu-iotests/common.filter | ||
68 | +++ b/tests/qemu-iotests/common.filter | ||
69 | @@ -XXX,XX +XXX,XX @@ _filter_qemu() | ||
70 | { | ||
71 | sed -e "s#\\(^\\|(qemu) \\)$(basename $QEMU_PROG):#\1QEMU_PROG:#" \ | ||
72 | -e 's#^QEMU [0-9]\+\.[0-9]\+\.[0-9]\+ monitor#QEMU X.Y.Z monitor#' \ | ||
73 | - -e '/main-loop: WARNING: I\/O thread spun for [0-9]\+ iterations/d' \ | ||
74 | -e $'s#\r##' # QEMU monitor uses \r\n line endings | ||
75 | } | ||
76 | |||
77 | -- | 48 | -- |
78 | 2.17.1 | 49 | 2.25.1 |
79 | 50 | ||
80 | diff view generated by jsdifflib |
1 | From: Fam Zheng <famz@redhat.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Introduce the bdrv_co_copy_range() API for copy offloading. Block | 3 | Any thread that is not a iothread returns NULL for qemu_get_current_aio_context(). |
4 | drivers implementing this API support efficient copy operations that | 4 | As a result, it would also return true for |
5 | avoid reading each block from the source device and writing it to the | 5 | in_aio_context_home_thread(qemu_get_aio_context()), causing |
6 | destination devices. Examples of copy offload primitives are SCSI | 6 | AIO_WAIT_WHILE to invoke aio_poll() directly. This is incorrect |
7 | EXTENDED COPY and Linux copy_file_range(2). | 7 | if the BQL is not held, because aio_poll() does not expect to |
8 | run concurrently from multiple threads, and it can actually | ||
9 | happen when savevm writes to the vmstate file from the | ||
10 | migration thread. | ||
8 | 11 | ||
9 | Signed-off-by: Fam Zheng <famz@redhat.com> | 12 | Therefore, restrict in_aio_context_home_thread to return true |
10 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 13 | for the main AioContext only if the BQL is held. |
11 | Message-id: 20180601092648.24614-2-famz@redhat.com | 14 | |
15 | The function is moved to aio-wait.h because it is mostly used | ||
16 | there and to avoid a circular reference between main-loop.h | ||
17 | and block/aio.h. | ||
18 | |||
19 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | ||
20 | Message-Id: <20200407140746.8041-5-pbonzini@redhat.com> | ||
12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 21 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
13 | --- | 22 | --- |
14 | include/block/block.h | 32 +++++++++++++ | 23 | include/block/aio-wait.h | 22 ++++++++++++++++++++++ |
15 | include/block/block_int.h | 38 +++++++++++++++ | 24 | include/block/aio.h | 29 ++++++++++------------------- |
16 | block/io.c | 97 +++++++++++++++++++++++++++++++++++++++ | 25 | 2 files changed, 32 insertions(+), 19 deletions(-) |
17 | 3 files changed, 167 insertions(+) | ||
18 | 26 | ||
19 | diff --git a/include/block/block.h b/include/block/block.h | 27 | diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h |
20 | index XXXXXXX..XXXXXXX 100644 | 28 | index XXXXXXX..XXXXXXX 100644 |
21 | --- a/include/block/block.h | 29 | --- a/include/block/aio-wait.h |
22 | +++ b/include/block/block.h | 30 | +++ b/include/block/aio-wait.h |
23 | @@ -XXX,XX +XXX,XX @@ bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, | 31 | @@ -XXX,XX +XXX,XX @@ |
32 | #define QEMU_AIO_WAIT_H | ||
33 | |||
34 | #include "block/aio.h" | ||
35 | +#include "qemu/main-loop.h" | ||
36 | |||
37 | /** | ||
38 | * AioWait: | ||
39 | @@ -XXX,XX +XXX,XX @@ void aio_wait_kick(void); | ||
24 | */ | 40 | */ |
25 | void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size); | 41 | void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque); |
26 | void bdrv_unregister_buf(BlockDriverState *bs, void *host); | 42 | |
27 | + | ||
28 | +/** | 43 | +/** |
44 | + * in_aio_context_home_thread: | ||
45 | + * @ctx: the aio context | ||
29 | + * | 46 | + * |
30 | + * bdrv_co_copy_range: | 47 | + * Return whether we are running in the thread that normally runs @ctx. Note |
31 | + * | 48 | + * that acquiring/releasing ctx does not affect the outcome, each AioContext |
32 | + * Do offloaded copy between two children. If the operation is not implemented | 49 | + * still only has one home thread that is responsible for running it. |
33 | + * by the driver, or if the backend storage doesn't support it, a negative | 50 | + */ |
34 | + * error code will be returned. | 51 | +static inline bool in_aio_context_home_thread(AioContext *ctx) |
35 | + * | ||
36 | + * Note: block layer doesn't emulate or fallback to a bounce buffer approach | ||
37 | + * because usually the caller shouldn't attempt offloaded copy any more (e.g. | ||
38 | + * calling copy_file_range(2)) after the first error, thus it should fall back | ||
39 | + * to a read+write path in the caller level. | ||
40 | + * | ||
41 | + * @src: Source child to copy data from | ||
42 | + * @src_offset: offset in @src image to read data | ||
43 | + * @dst: Destination child to copy data to | ||
44 | + * @dst_offset: offset in @dst image to write data | ||
45 | + * @bytes: number of bytes to copy | ||
46 | + * @flags: request flags. Must be one of: | ||
47 | + * 0 - actually read data from src; | ||
48 | + * BDRV_REQ_ZERO_WRITE - treat the @src range as zero data and do zero | ||
49 | + * write on @dst as if bdrv_co_pwrite_zeroes is | ||
50 | + * called. Used to simplify caller code, or | ||
51 | + * during BlockDriver.bdrv_co_copy_range_from() | ||
52 | + * recursion. | ||
53 | + * | ||
54 | + * Returns: 0 if succeeded; negative error code if failed. | ||
55 | + **/ | ||
56 | +int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, | ||
57 | + BdrvChild *dst, uint64_t dst_offset, | ||
58 | + uint64_t bytes, BdrvRequestFlags flags); | ||
59 | #endif | ||
60 | diff --git a/include/block/block_int.h b/include/block/block_int.h | ||
61 | index XXXXXXX..XXXXXXX 100644 | ||
62 | --- a/include/block/block_int.h | ||
63 | +++ b/include/block/block_int.h | ||
64 | @@ -XXX,XX +XXX,XX @@ struct BlockDriver { | ||
65 | int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs, | ||
66 | int64_t offset, int bytes); | ||
67 | |||
68 | + /* Map [offset, offset + nbytes) range onto a child of @bs to copy from, | ||
69 | + * and invoke bdrv_co_copy_range_from(child, ...), or invoke | ||
70 | + * bdrv_co_copy_range_to() if @bs is the leaf child to copy data from. | ||
71 | + * | ||
72 | + * See the comment of bdrv_co_copy_range for the parameter and return value | ||
73 | + * semantics. | ||
74 | + */ | ||
75 | + int coroutine_fn (*bdrv_co_copy_range_from)(BlockDriverState *bs, | ||
76 | + BdrvChild *src, | ||
77 | + uint64_t offset, | ||
78 | + BdrvChild *dst, | ||
79 | + uint64_t dst_offset, | ||
80 | + uint64_t bytes, | ||
81 | + BdrvRequestFlags flags); | ||
82 | + | ||
83 | + /* Map [offset, offset + nbytes) range onto a child of bs to copy data to, | ||
84 | + * and invoke bdrv_co_copy_range_to(child, src, ...), or perform the copy | ||
85 | + * operation if @bs is the leaf and @src has the same BlockDriver. Return | ||
86 | + * -ENOTSUP if @bs is the leaf but @src has a different BlockDriver. | ||
87 | + * | ||
88 | + * See the comment of bdrv_co_copy_range for the parameter and return value | ||
89 | + * semantics. | ||
90 | + */ | ||
91 | + int coroutine_fn (*bdrv_co_copy_range_to)(BlockDriverState *bs, | ||
92 | + BdrvChild *src, | ||
93 | + uint64_t src_offset, | ||
94 | + BdrvChild *dst, | ||
95 | + uint64_t dst_offset, | ||
96 | + uint64_t bytes, | ||
97 | + BdrvRequestFlags flags); | ||
98 | + | ||
99 | /* | ||
100 | * Building block for bdrv_block_status[_above] and | ||
101 | * bdrv_is_allocated[_above]. The driver should answer only | ||
102 | @@ -XXX,XX +XXX,XX @@ void bdrv_dec_in_flight(BlockDriverState *bs); | ||
103 | |||
104 | void blockdev_close_all_bdrv_states(void); | ||
105 | |||
106 | +int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, | ||
107 | + BdrvChild *dst, uint64_t dst_offset, | ||
108 | + uint64_t bytes, BdrvRequestFlags flags); | ||
109 | +int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, | ||
110 | + BdrvChild *dst, uint64_t dst_offset, | ||
111 | + uint64_t bytes, BdrvRequestFlags flags); | ||
112 | + | ||
113 | #endif /* BLOCK_INT_H */ | ||
114 | diff --git a/block/io.c b/block/io.c | ||
115 | index XXXXXXX..XXXXXXX 100644 | ||
116 | --- a/block/io.c | ||
117 | +++ b/block/io.c | ||
118 | @@ -XXX,XX +XXX,XX @@ void bdrv_unregister_buf(BlockDriverState *bs, void *host) | ||
119 | bdrv_unregister_buf(child->bs, host); | ||
120 | } | ||
121 | } | ||
122 | + | ||
123 | +static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, | ||
124 | + uint64_t src_offset, | ||
125 | + BdrvChild *dst, | ||
126 | + uint64_t dst_offset, | ||
127 | + uint64_t bytes, | ||
128 | + BdrvRequestFlags flags, | ||
129 | + bool recurse_src) | ||
130 | +{ | 52 | +{ |
131 | + int ret; | 53 | + if (ctx == qemu_get_current_aio_context()) { |
132 | + | 54 | + return true; |
133 | + if (!src || !dst || !src->bs || !dst->bs) { | ||
134 | + return -ENOMEDIUM; | ||
135 | + } | ||
136 | + ret = bdrv_check_byte_request(src->bs, src_offset, bytes); | ||
137 | + if (ret) { | ||
138 | + return ret; | ||
139 | + } | 55 | + } |
140 | + | 56 | + |
141 | + ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes); | 57 | + if (ctx == qemu_get_aio_context()) { |
142 | + if (ret) { | 58 | + return qemu_mutex_iothread_locked(); |
143 | + return ret; | ||
144 | + } | ||
145 | + if (flags & BDRV_REQ_ZERO_WRITE) { | ||
146 | + return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, flags); | ||
147 | + } | ||
148 | + | ||
149 | + if (!src->bs->drv->bdrv_co_copy_range_from | ||
150 | + || !dst->bs->drv->bdrv_co_copy_range_to | ||
151 | + || src->bs->encrypted || dst->bs->encrypted) { | ||
152 | + return -ENOTSUP; | ||
153 | + } | ||
154 | + if (recurse_src) { | ||
155 | + return src->bs->drv->bdrv_co_copy_range_from(src->bs, | ||
156 | + src, src_offset, | ||
157 | + dst, dst_offset, | ||
158 | + bytes, flags); | ||
159 | + } else { | 59 | + } else { |
160 | + return dst->bs->drv->bdrv_co_copy_range_to(dst->bs, | 60 | + return false; |
161 | + src, src_offset, | ||
162 | + dst, dst_offset, | ||
163 | + bytes, flags); | ||
164 | + } | 61 | + } |
165 | +} | 62 | +} |
166 | + | 63 | + |
167 | +/* Copy range from @src to @dst. | 64 | #endif /* QEMU_AIO_WAIT_H */ |
168 | + * | 65 | diff --git a/include/block/aio.h b/include/block/aio.h |
169 | + * See the comment of bdrv_co_copy_range for the parameter and return value | 66 | index XXXXXXX..XXXXXXX 100644 |
170 | + * semantics. */ | 67 | --- a/include/block/aio.h |
171 | +int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, | 68 | +++ b/include/block/aio.h |
172 | + BdrvChild *dst, uint64_t dst_offset, | 69 | @@ -XXX,XX +XXX,XX @@ struct AioContext { |
173 | + uint64_t bytes, BdrvRequestFlags flags) | 70 | AioHandlerList deleted_aio_handlers; |
174 | +{ | 71 | |
175 | + return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, | 72 | /* Used to avoid unnecessary event_notifier_set calls in aio_notify; |
176 | + bytes, flags, true); | 73 | - * accessed with atomic primitives. If this field is 0, everything |
177 | +} | 74 | - * (file descriptors, bottom halves, timers) will be re-evaluated |
178 | + | 75 | - * before the next blocking poll(), thus the event_notifier_set call |
179 | +/* Copy range from @src to @dst. | 76 | - * can be skipped. If it is non-zero, you may need to wake up a |
180 | + * | 77 | - * concurrent aio_poll or the glib main event loop, making |
181 | + * See the comment of bdrv_co_copy_range for the parameter and return value | 78 | - * event_notifier_set necessary. |
182 | + * semantics. */ | 79 | + * only written from the AioContext home thread, or under the BQL in |
183 | +int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, | 80 | + * the case of the main AioContext. However, it is read from any |
184 | + BdrvChild *dst, uint64_t dst_offset, | 81 | + * thread so it is still accessed with atomic primitives. |
185 | + uint64_t bytes, BdrvRequestFlags flags) | 82 | + * |
186 | +{ | 83 | + * If this field is 0, everything (file descriptors, bottom halves, |
187 | + return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, | 84 | + * timers) will be re-evaluated before the next blocking poll() or |
188 | + bytes, flags, false); | 85 | + * io_uring wait; therefore, the event_notifier_set call can be |
189 | +} | 86 | + * skipped. If it is non-zero, you may need to wake up a concurrent |
190 | + | 87 | + * aio_poll or the glib main event loop, making event_notifier_set |
191 | +int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, | 88 | + * necessary. |
192 | + BdrvChild *dst, uint64_t dst_offset, | 89 | * |
193 | + uint64_t bytes, BdrvRequestFlags flags) | 90 | * Bit 0 is reserved for GSource usage of the AioContext, and is 1 |
194 | +{ | 91 | * between a call to aio_ctx_prepare and the next call to aio_ctx_check. |
195 | + BdrvTrackedRequest src_req, dst_req; | 92 | @@ -XXX,XX +XXX,XX @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co); |
196 | + BlockDriverState *src_bs = src->bs; | 93 | */ |
197 | + BlockDriverState *dst_bs = dst->bs; | 94 | AioContext *qemu_get_current_aio_context(void); |
198 | + int ret; | 95 | |
199 | + | 96 | -/** |
200 | + bdrv_inc_in_flight(src_bs); | 97 | - * in_aio_context_home_thread: |
201 | + bdrv_inc_in_flight(dst_bs); | 98 | - * @ctx: the aio context |
202 | + tracked_request_begin(&src_req, src_bs, src_offset, | 99 | - * |
203 | + bytes, BDRV_TRACKED_READ); | 100 | - * Return whether we are running in the thread that normally runs @ctx. Note |
204 | + tracked_request_begin(&dst_req, dst_bs, dst_offset, | 101 | - * that acquiring/releasing ctx does not affect the outcome, each AioContext |
205 | + bytes, BDRV_TRACKED_WRITE); | 102 | - * still only has one home thread that is responsible for running it. |
206 | + | 103 | - */ |
207 | + wait_serialising_requests(&src_req); | 104 | -static inline bool in_aio_context_home_thread(AioContext *ctx) |
208 | + wait_serialising_requests(&dst_req); | 105 | -{ |
209 | + ret = bdrv_co_copy_range_from(src, src_offset, | 106 | - return ctx == qemu_get_current_aio_context(); |
210 | + dst, dst_offset, | 107 | -} |
211 | + bytes, flags); | 108 | - |
212 | + | 109 | /** |
213 | + tracked_request_end(&src_req); | 110 | * aio_context_setup: |
214 | + tracked_request_end(&dst_req); | 111 | * @ctx: the aio context |
215 | + bdrv_dec_in_flight(src_bs); | ||
216 | + bdrv_dec_in_flight(dst_bs); | ||
217 | + return ret; | ||
218 | +} | ||
219 | -- | 112 | -- |
220 | 2.17.1 | 113 | 2.25.1 |
221 | 114 | ||
222 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Fam Zheng <famz@redhat.com> | ||
2 | 1 | ||
3 | We don't verify the request range against s->size in the I/O callbacks | ||
4 | except for raw_co_pwritev. This is inconsistent (especially for | ||
5 | raw_co_pwrite_zeroes and raw_co_pdiscard), so fix them, in the meanwhile | ||
6 | make the helper reusable by the coming new callbacks. | ||
7 | |||
8 | Note that in most cases the block layer already verifies the request | ||
9 | byte range against our reported image length, before invoking the driver | ||
10 | callbacks. The exception is during image creating, after | ||
11 | blk_set_allow_write_beyond_eof(blk, true) is called. But in that case, | ||
12 | the requests are not directly from the user or guest. So there is no | ||
13 | visible behavior change in adding the check code. | ||
14 | |||
15 | The int64_t -> uint64_t inconsistency, as shown by the type casting, is | ||
16 | pre-existing due to the interface. | ||
17 | |||
18 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
19 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
20 | Signed-off-by: Fam Zheng <famz@redhat.com> | ||
21 | Message-id: 20180601092648.24614-3-famz@redhat.com | ||
22 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
23 | --- | ||
24 | block/raw-format.c | 64 ++++++++++++++++++++++++++++------------------ | ||
25 | 1 file changed, 39 insertions(+), 25 deletions(-) | ||
26 | |||
27 | diff --git a/block/raw-format.c b/block/raw-format.c | ||
28 | index XXXXXXX..XXXXXXX 100644 | ||
29 | --- a/block/raw-format.c | ||
30 | +++ b/block/raw-format.c | ||
31 | @@ -XXX,XX +XXX,XX @@ static void raw_reopen_abort(BDRVReopenState *state) | ||
32 | state->opaque = NULL; | ||
33 | } | ||
34 | |||
35 | +/* Check and adjust the offset, against 'offset' and 'size' options. */ | ||
36 | +static inline int raw_adjust_offset(BlockDriverState *bs, uint64_t *offset, | ||
37 | + uint64_t bytes, bool is_write) | ||
38 | +{ | ||
39 | + BDRVRawState *s = bs->opaque; | ||
40 | + | ||
41 | + if (s->has_size && (*offset > s->size || bytes > (s->size - *offset))) { | ||
42 | + /* There's not enough space for the write, or the read request is | ||
43 | + * out-of-range. Don't read/write anything to prevent leaking out of | ||
44 | + * the size specified in options. */ | ||
45 | + return is_write ? -ENOSPC : -EINVAL;; | ||
46 | + } | ||
47 | + | ||
48 | + if (*offset > INT64_MAX - s->offset) { | ||
49 | + return -EINVAL; | ||
50 | + } | ||
51 | + *offset += s->offset; | ||
52 | + | ||
53 | + return 0; | ||
54 | +} | ||
55 | + | ||
56 | static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, | ||
57 | uint64_t bytes, QEMUIOVector *qiov, | ||
58 | int flags) | ||
59 | { | ||
60 | - BDRVRawState *s = bs->opaque; | ||
61 | + int ret; | ||
62 | |||
63 | - if (offset > UINT64_MAX - s->offset) { | ||
64 | - return -EINVAL; | ||
65 | + ret = raw_adjust_offset(bs, &offset, bytes, false); | ||
66 | + if (ret) { | ||
67 | + return ret; | ||
68 | } | ||
69 | - offset += s->offset; | ||
70 | |||
71 | BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); | ||
72 | return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); | ||
73 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, | ||
74 | uint64_t bytes, QEMUIOVector *qiov, | ||
75 | int flags) | ||
76 | { | ||
77 | - BDRVRawState *s = bs->opaque; | ||
78 | void *buf = NULL; | ||
79 | BlockDriver *drv; | ||
80 | QEMUIOVector local_qiov; | ||
81 | int ret; | ||
82 | |||
83 | - if (s->has_size && (offset > s->size || bytes > (s->size - offset))) { | ||
84 | - /* There's not enough space for the data. Don't write anything and just | ||
85 | - * fail to prevent leaking out of the size specified in options. */ | ||
86 | - return -ENOSPC; | ||
87 | - } | ||
88 | - | ||
89 | - if (offset > UINT64_MAX - s->offset) { | ||
90 | - ret = -EINVAL; | ||
91 | - goto fail; | ||
92 | - } | ||
93 | - | ||
94 | if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) { | ||
95 | /* Handling partial writes would be a pain - so we just | ||
96 | * require that guests have 512-byte request alignment if | ||
97 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, | ||
98 | qiov = &local_qiov; | ||
99 | } | ||
100 | |||
101 | - offset += s->offset; | ||
102 | + ret = raw_adjust_offset(bs, &offset, bytes, true); | ||
103 | + if (ret) { | ||
104 | + goto fail; | ||
105 | + } | ||
106 | |||
107 | BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
108 | ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); | ||
109 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, | ||
110 | int64_t offset, int bytes, | ||
111 | BdrvRequestFlags flags) | ||
112 | { | ||
113 | - BDRVRawState *s = bs->opaque; | ||
114 | - if (offset > UINT64_MAX - s->offset) { | ||
115 | - return -EINVAL; | ||
116 | + int ret; | ||
117 | + | ||
118 | + ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true); | ||
119 | + if (ret) { | ||
120 | + return ret; | ||
121 | } | ||
122 | - offset += s->offset; | ||
123 | return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); | ||
124 | } | ||
125 | |||
126 | static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs, | ||
127 | int64_t offset, int bytes) | ||
128 | { | ||
129 | - BDRVRawState *s = bs->opaque; | ||
130 | - if (offset > UINT64_MAX - s->offset) { | ||
131 | - return -EINVAL; | ||
132 | + int ret; | ||
133 | + | ||
134 | + ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true); | ||
135 | + if (ret) { | ||
136 | + return ret; | ||
137 | } | ||
138 | - offset += s->offset; | ||
139 | return bdrv_co_pdiscard(bs->file->bs, offset, bytes); | ||
140 | } | ||
141 | |||
142 | -- | ||
143 | 2.17.1 | ||
144 | |||
145 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Fam Zheng <famz@redhat.com> | ||
2 | 1 | ||
3 | Just pass down to ->file. | ||
4 | |||
5 | Signed-off-by: Fam Zheng <famz@redhat.com> | ||
6 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
7 | Message-id: 20180601092648.24614-4-famz@redhat.com | ||
8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | --- | ||
10 | block/raw-format.c | 32 ++++++++++++++++++++++++++++++++ | ||
11 | 1 file changed, 32 insertions(+) | ||
12 | |||
13 | diff --git a/block/raw-format.c b/block/raw-format.c | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/block/raw-format.c | ||
16 | +++ b/block/raw-format.c | ||
17 | @@ -XXX,XX +XXX,XX @@ static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo) | ||
18 | return bdrv_probe_geometry(bs->file->bs, geo); | ||
19 | } | ||
20 | |||
21 | +static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs, | ||
22 | + BdrvChild *src, uint64_t src_offset, | ||
23 | + BdrvChild *dst, uint64_t dst_offset, | ||
24 | + uint64_t bytes, BdrvRequestFlags flags) | ||
25 | +{ | ||
26 | + int ret; | ||
27 | + | ||
28 | + ret = raw_adjust_offset(bs, &src_offset, bytes, false); | ||
29 | + if (ret) { | ||
30 | + return ret; | ||
31 | + } | ||
32 | + return bdrv_co_copy_range_from(bs->file, src_offset, dst, dst_offset, | ||
33 | + bytes, flags); | ||
34 | +} | ||
35 | + | ||
36 | +static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, | ||
37 | + BdrvChild *src, uint64_t src_offset, | ||
38 | + BdrvChild *dst, uint64_t dst_offset, | ||
39 | + uint64_t bytes, BdrvRequestFlags flags) | ||
40 | +{ | ||
41 | + int ret; | ||
42 | + | ||
43 | + ret = raw_adjust_offset(bs, &dst_offset, bytes, true); | ||
44 | + if (ret) { | ||
45 | + return ret; | ||
46 | + } | ||
47 | + return bdrv_co_copy_range_to(src, src_offset, bs->file, dst_offset, bytes, | ||
48 | + flags); | ||
49 | +} | ||
50 | + | ||
51 | BlockDriver bdrv_raw = { | ||
52 | .format_name = "raw", | ||
53 | .instance_size = sizeof(BDRVRawState), | ||
54 | @@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_raw = { | ||
55 | .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes, | ||
56 | .bdrv_co_pdiscard = &raw_co_pdiscard, | ||
57 | .bdrv_co_block_status = &raw_co_block_status, | ||
58 | + .bdrv_co_copy_range_from = &raw_co_copy_range_from, | ||
59 | + .bdrv_co_copy_range_to = &raw_co_copy_range_to, | ||
60 | .bdrv_truncate = &raw_truncate, | ||
61 | .bdrv_getlength = &raw_getlength, | ||
62 | .has_variable_length = true, | ||
63 | -- | ||
64 | 2.17.1 | ||
65 | |||
66 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Fam Zheng <famz@redhat.com> | ||
2 | 1 | ||
3 | The two callbacks are implemented quite similarly to the read/write | ||
4 | functions: bdrv_co_copy_range_from maps for read and calls into bs->file | ||
5 | or bs->backing depending on the allocation status; bdrv_co_copy_range_to | ||
6 | maps for write and calls into bs->file. | ||
7 | |||
8 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | Signed-off-by: Fam Zheng <famz@redhat.com> | ||
10 | Message-id: 20180601092648.24614-5-famz@redhat.com | ||
11 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | --- | ||
13 | block/qcow2.c | 229 +++++++++++++++++++++++++++++++++++++++++++------- | ||
14 | 1 file changed, 199 insertions(+), 30 deletions(-) | ||
15 | |||
16 | diff --git a/block/qcow2.c b/block/qcow2.c | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/block/qcow2.c | ||
19 | +++ b/block/qcow2.c | ||
20 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs, | ||
21 | return status; | ||
22 | } | ||
23 | |||
24 | +static coroutine_fn int qcow2_handle_l2meta(BlockDriverState *bs, | ||
25 | + QCowL2Meta **pl2meta, | ||
26 | + bool link_l2) | ||
27 | +{ | ||
28 | + int ret = 0; | ||
29 | + QCowL2Meta *l2meta = *pl2meta; | ||
30 | + | ||
31 | + while (l2meta != NULL) { | ||
32 | + QCowL2Meta *next; | ||
33 | + | ||
34 | + if (!ret && link_l2) { | ||
35 | + ret = qcow2_alloc_cluster_link_l2(bs, l2meta); | ||
36 | + if (ret) { | ||
37 | + goto out; | ||
38 | + } | ||
39 | + } | ||
40 | + | ||
41 | + /* Take the request off the list of running requests */ | ||
42 | + if (l2meta->nb_clusters != 0) { | ||
43 | + QLIST_REMOVE(l2meta, next_in_flight); | ||
44 | + } | ||
45 | + | ||
46 | + qemu_co_queue_restart_all(&l2meta->dependent_requests); | ||
47 | + | ||
48 | + next = l2meta->next; | ||
49 | + g_free(l2meta); | ||
50 | + l2meta = next; | ||
51 | + } | ||
52 | +out: | ||
53 | + *pl2meta = l2meta; | ||
54 | + return ret; | ||
55 | +} | ||
56 | + | ||
57 | static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, | ||
58 | uint64_t bytes, QEMUIOVector *qiov, | ||
59 | int flags) | ||
60 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, | ||
61 | } | ||
62 | } | ||
63 | |||
64 | - while (l2meta != NULL) { | ||
65 | - QCowL2Meta *next; | ||
66 | - | ||
67 | - ret = qcow2_alloc_cluster_link_l2(bs, l2meta); | ||
68 | - if (ret < 0) { | ||
69 | - goto fail; | ||
70 | - } | ||
71 | - | ||
72 | - /* Take the request off the list of running requests */ | ||
73 | - if (l2meta->nb_clusters != 0) { | ||
74 | - QLIST_REMOVE(l2meta, next_in_flight); | ||
75 | - } | ||
76 | - | ||
77 | - qemu_co_queue_restart_all(&l2meta->dependent_requests); | ||
78 | - | ||
79 | - next = l2meta->next; | ||
80 | - g_free(l2meta); | ||
81 | - l2meta = next; | ||
82 | + ret = qcow2_handle_l2meta(bs, &l2meta, true); | ||
83 | + if (ret) { | ||
84 | + goto fail; | ||
85 | } | ||
86 | |||
87 | bytes -= cur_bytes; | ||
88 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, | ||
89 | ret = 0; | ||
90 | |||
91 | fail: | ||
92 | - while (l2meta != NULL) { | ||
93 | - QCowL2Meta *next; | ||
94 | - | ||
95 | - if (l2meta->nb_clusters != 0) { | ||
96 | - QLIST_REMOVE(l2meta, next_in_flight); | ||
97 | - } | ||
98 | - qemu_co_queue_restart_all(&l2meta->dependent_requests); | ||
99 | - | ||
100 | - next = l2meta->next; | ||
101 | - g_free(l2meta); | ||
102 | - l2meta = next; | ||
103 | - } | ||
104 | + qcow2_handle_l2meta(bs, &l2meta, false); | ||
105 | |||
106 | qemu_co_mutex_unlock(&s->lock); | ||
107 | |||
108 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs, | ||
109 | return ret; | ||
110 | } | ||
111 | |||
112 | +static int coroutine_fn | ||
113 | +qcow2_co_copy_range_from(BlockDriverState *bs, | ||
114 | + BdrvChild *src, uint64_t src_offset, | ||
115 | + BdrvChild *dst, uint64_t dst_offset, | ||
116 | + uint64_t bytes, BdrvRequestFlags flags) | ||
117 | +{ | ||
118 | + BDRVQcow2State *s = bs->opaque; | ||
119 | + int ret; | ||
120 | + unsigned int cur_bytes; /* number of bytes in current iteration */ | ||
121 | + BdrvChild *child = NULL; | ||
122 | + BdrvRequestFlags cur_flags; | ||
123 | + | ||
124 | + assert(!bs->encrypted); | ||
125 | + qemu_co_mutex_lock(&s->lock); | ||
126 | + | ||
127 | + while (bytes != 0) { | ||
128 | + uint64_t copy_offset = 0; | ||
129 | + /* prepare next request */ | ||
130 | + cur_bytes = MIN(bytes, INT_MAX); | ||
131 | + cur_flags = flags; | ||
132 | + | ||
133 | + ret = qcow2_get_cluster_offset(bs, src_offset, &cur_bytes, ©_offset); | ||
134 | + if (ret < 0) { | ||
135 | + goto out; | ||
136 | + } | ||
137 | + | ||
138 | + switch (ret) { | ||
139 | + case QCOW2_CLUSTER_UNALLOCATED: | ||
140 | + if (bs->backing && bs->backing->bs) { | ||
141 | + int64_t backing_length = bdrv_getlength(bs->backing->bs); | ||
142 | + if (src_offset >= backing_length) { | ||
143 | + cur_flags |= BDRV_REQ_ZERO_WRITE; | ||
144 | + } else { | ||
145 | + child = bs->backing; | ||
146 | + cur_bytes = MIN(cur_bytes, backing_length - src_offset); | ||
147 | + copy_offset = src_offset; | ||
148 | + } | ||
149 | + } else { | ||
150 | + cur_flags |= BDRV_REQ_ZERO_WRITE; | ||
151 | + } | ||
152 | + break; | ||
153 | + | ||
154 | + case QCOW2_CLUSTER_ZERO_PLAIN: | ||
155 | + case QCOW2_CLUSTER_ZERO_ALLOC: | ||
156 | + cur_flags |= BDRV_REQ_ZERO_WRITE; | ||
157 | + break; | ||
158 | + | ||
159 | + case QCOW2_CLUSTER_COMPRESSED: | ||
160 | + ret = -ENOTSUP; | ||
161 | + goto out; | ||
162 | + break; | ||
163 | + | ||
164 | + case QCOW2_CLUSTER_NORMAL: | ||
165 | + child = bs->file; | ||
166 | + copy_offset += offset_into_cluster(s, src_offset); | ||
167 | + if ((copy_offset & 511) != 0) { | ||
168 | + ret = -EIO; | ||
169 | + goto out; | ||
170 | + } | ||
171 | + break; | ||
172 | + | ||
173 | + default: | ||
174 | + abort(); | ||
175 | + } | ||
176 | + qemu_co_mutex_unlock(&s->lock); | ||
177 | + ret = bdrv_co_copy_range_from(child, | ||
178 | + copy_offset, | ||
179 | + dst, dst_offset, | ||
180 | + cur_bytes, cur_flags); | ||
181 | + qemu_co_mutex_lock(&s->lock); | ||
182 | + if (ret < 0) { | ||
183 | + goto out; | ||
184 | + } | ||
185 | + | ||
186 | + bytes -= cur_bytes; | ||
187 | + src_offset += cur_bytes; | ||
188 | + dst_offset += cur_bytes; | ||
189 | + } | ||
190 | + ret = 0; | ||
191 | + | ||
192 | +out: | ||
193 | + qemu_co_mutex_unlock(&s->lock); | ||
194 | + return ret; | ||
195 | +} | ||
196 | + | ||
197 | +static int coroutine_fn | ||
198 | +qcow2_co_copy_range_to(BlockDriverState *bs, | ||
199 | + BdrvChild *src, uint64_t src_offset, | ||
200 | + BdrvChild *dst, uint64_t dst_offset, | ||
201 | + uint64_t bytes, BdrvRequestFlags flags) | ||
202 | +{ | ||
203 | + BDRVQcow2State *s = bs->opaque; | ||
204 | + int offset_in_cluster; | ||
205 | + int ret; | ||
206 | + unsigned int cur_bytes; /* number of sectors in current iteration */ | ||
207 | + uint64_t cluster_offset; | ||
208 | + uint8_t *cluster_data = NULL; | ||
209 | + QCowL2Meta *l2meta = NULL; | ||
210 | + | ||
211 | + assert(!bs->encrypted); | ||
212 | + s->cluster_cache_offset = -1; /* disable compressed cache */ | ||
213 | + | ||
214 | + qemu_co_mutex_lock(&s->lock); | ||
215 | + | ||
216 | + while (bytes != 0) { | ||
217 | + | ||
218 | + l2meta = NULL; | ||
219 | + | ||
220 | + offset_in_cluster = offset_into_cluster(s, dst_offset); | ||
221 | + cur_bytes = MIN(bytes, INT_MAX); | ||
222 | + | ||
223 | + /* TODO: | ||
224 | + * If src->bs == dst->bs, we could simply copy by incrementing | ||
225 | + * the refcnt, without copying user data. | ||
226 | + * Or if src->bs == dst->bs->backing->bs, we could copy by discarding. */ | ||
227 | + ret = qcow2_alloc_cluster_offset(bs, dst_offset, &cur_bytes, | ||
228 | + &cluster_offset, &l2meta); | ||
229 | + if (ret < 0) { | ||
230 | + goto fail; | ||
231 | + } | ||
232 | + | ||
233 | + assert((cluster_offset & 511) == 0); | ||
234 | + | ||
235 | + ret = qcow2_pre_write_overlap_check(bs, 0, | ||
236 | + cluster_offset + offset_in_cluster, cur_bytes); | ||
237 | + if (ret < 0) { | ||
238 | + goto fail; | ||
239 | + } | ||
240 | + | ||
241 | + qemu_co_mutex_unlock(&s->lock); | ||
242 | + ret = bdrv_co_copy_range_to(src, src_offset, | ||
243 | + bs->file, | ||
244 | + cluster_offset + offset_in_cluster, | ||
245 | + cur_bytes, flags); | ||
246 | + qemu_co_mutex_lock(&s->lock); | ||
247 | + if (ret < 0) { | ||
248 | + goto fail; | ||
249 | + } | ||
250 | + | ||
251 | + ret = qcow2_handle_l2meta(bs, &l2meta, true); | ||
252 | + if (ret) { | ||
253 | + goto fail; | ||
254 | + } | ||
255 | + | ||
256 | + bytes -= cur_bytes; | ||
257 | + dst_offset += cur_bytes; | ||
258 | + } | ||
259 | + ret = 0; | ||
260 | + | ||
261 | +fail: | ||
262 | + qcow2_handle_l2meta(bs, &l2meta, false); | ||
263 | + | ||
264 | + qemu_co_mutex_unlock(&s->lock); | ||
265 | + | ||
266 | + qemu_vfree(cluster_data); | ||
267 | + trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); | ||
268 | + | ||
269 | + return ret; | ||
270 | +} | ||
271 | + | ||
272 | static int qcow2_truncate(BlockDriverState *bs, int64_t offset, | ||
273 | PreallocMode prealloc, Error **errp) | ||
274 | { | ||
275 | @@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_qcow2 = { | ||
276 | |||
277 | .bdrv_co_pwrite_zeroes = qcow2_co_pwrite_zeroes, | ||
278 | .bdrv_co_pdiscard = qcow2_co_pdiscard, | ||
279 | + .bdrv_co_copy_range_from = qcow2_co_copy_range_from, | ||
280 | + .bdrv_co_copy_range_to = qcow2_co_copy_range_to, | ||
281 | .bdrv_truncate = qcow2_truncate, | ||
282 | .bdrv_co_pwritev_compressed = qcow2_co_pwritev_compressed, | ||
283 | .bdrv_make_empty = qcow2_make_empty, | ||
284 | -- | ||
285 | 2.17.1 | ||
286 | |||
287 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Fam Zheng <famz@redhat.com> | ||
2 | 1 | ||
3 | With copy_file_range(2), we can implement the bdrv_co_copy_range | ||
4 | semantics. | ||
5 | |||
6 | Signed-off-by: Fam Zheng <famz@redhat.com> | ||
7 | Message-id: 20180601092648.24614-6-famz@redhat.com | ||
8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | --- | ||
10 | configure | 17 +++++++ | ||
11 | include/block/raw-aio.h | 10 ++++- | ||
12 | block/file-posix.c | 98 +++++++++++++++++++++++++++++++++++++++-- | ||
13 | 3 files changed, 120 insertions(+), 5 deletions(-) | ||
14 | |||
15 | diff --git a/configure b/configure | ||
16 | index XXXXXXX..XXXXXXX 100755 | ||
17 | --- a/configure | ||
18 | +++ b/configure | ||
19 | @@ -XXX,XX +XXX,XX @@ if test "$fortify_source" != "no"; then | ||
20 | fi | ||
21 | fi | ||
22 | |||
23 | +############################################### | ||
24 | +# Check if copy_file_range is provided by glibc | ||
25 | +have_copy_file_range=no | ||
26 | +cat > $TMPC << EOF | ||
27 | +#include <unistd.h> | ||
28 | +int main(void) { | ||
29 | + copy_file_range(0, NULL, 0, NULL, 0, 0); | ||
30 | + return 0; | ||
31 | +} | ||
32 | +EOF | ||
33 | +if compile_prog "" "" ; then | ||
34 | + have_copy_file_range=yes | ||
35 | +fi | ||
36 | + | ||
37 | ########################################## | ||
38 | # check if struct fsxattr is available via linux/fs.h | ||
39 | |||
40 | @@ -XXX,XX +XXX,XX @@ fi | ||
41 | if test "$have_fsxattr" = "yes" ; then | ||
42 | echo "HAVE_FSXATTR=y" >> $config_host_mak | ||
43 | fi | ||
44 | +if test "$have_copy_file_range" = "yes" ; then | ||
45 | + echo "HAVE_COPY_FILE_RANGE=y" >> $config_host_mak | ||
46 | +fi | ||
47 | if test "$vte" = "yes" ; then | ||
48 | echo "CONFIG_VTE=y" >> $config_host_mak | ||
49 | echo "VTE_CFLAGS=$vte_cflags" >> $config_host_mak | ||
50 | diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h | ||
51 | index XXXXXXX..XXXXXXX 100644 | ||
52 | --- a/include/block/raw-aio.h | ||
53 | +++ b/include/block/raw-aio.h | ||
54 | @@ -XXX,XX +XXX,XX @@ | ||
55 | #define QEMU_AIO_FLUSH 0x0008 | ||
56 | #define QEMU_AIO_DISCARD 0x0010 | ||
57 | #define QEMU_AIO_WRITE_ZEROES 0x0020 | ||
58 | +#define QEMU_AIO_COPY_RANGE 0x0040 | ||
59 | #define QEMU_AIO_TYPE_MASK \ | ||
60 | - (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \ | ||
61 | - QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES) | ||
62 | + (QEMU_AIO_READ | \ | ||
63 | + QEMU_AIO_WRITE | \ | ||
64 | + QEMU_AIO_IOCTL | \ | ||
65 | + QEMU_AIO_FLUSH | \ | ||
66 | + QEMU_AIO_DISCARD | \ | ||
67 | + QEMU_AIO_WRITE_ZEROES | \ | ||
68 | + QEMU_AIO_COPY_RANGE) | ||
69 | |||
70 | /* AIO flags */ | ||
71 | #define QEMU_AIO_MISALIGNED 0x1000 | ||
72 | diff --git a/block/file-posix.c b/block/file-posix.c | ||
73 | index XXXXXXX..XXXXXXX 100644 | ||
74 | --- a/block/file-posix.c | ||
75 | +++ b/block/file-posix.c | ||
76 | @@ -XXX,XX +XXX,XX @@ | ||
77 | #ifdef __linux__ | ||
78 | #include <sys/ioctl.h> | ||
79 | #include <sys/param.h> | ||
80 | +#include <sys/syscall.h> | ||
81 | #include <linux/cdrom.h> | ||
82 | #include <linux/fd.h> | ||
83 | #include <linux/fs.h> | ||
84 | @@ -XXX,XX +XXX,XX @@ typedef struct RawPosixAIOData { | ||
85 | #define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ | ||
86 | off_t aio_offset; | ||
87 | int aio_type; | ||
88 | + int aio_fd2; | ||
89 | + off_t aio_offset2; | ||
90 | } RawPosixAIOData; | ||
91 | |||
92 | #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) | ||
93 | @@ -XXX,XX +XXX,XX @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) | ||
94 | return -ENOTSUP; | ||
95 | } | ||
96 | |||
97 | +#ifndef HAVE_COPY_FILE_RANGE | ||
98 | +static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, | ||
99 | + off_t *out_off, size_t len, unsigned int flags) | ||
100 | +{ | ||
101 | +#ifdef __NR_copy_file_range | ||
102 | + return syscall(__NR_copy_file_range, in_fd, in_off, out_fd, | ||
103 | + out_off, len, flags); | ||
104 | +#else | ||
105 | + errno = ENOSYS; | ||
106 | + return -1; | ||
107 | +#endif | ||
108 | +} | ||
109 | +#endif | ||
110 | + | ||
111 | +static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb) | ||
112 | +{ | ||
113 | + uint64_t bytes = aiocb->aio_nbytes; | ||
114 | + off_t in_off = aiocb->aio_offset; | ||
115 | + off_t out_off = aiocb->aio_offset2; | ||
116 | + | ||
117 | + while (bytes) { | ||
118 | + ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off, | ||
119 | + aiocb->aio_fd2, &out_off, | ||
120 | + bytes, 0); | ||
121 | + if (ret == -EINTR) { | ||
122 | + continue; | ||
123 | + } | ||
124 | + if (ret < 0) { | ||
125 | + if (errno == ENOSYS) { | ||
126 | + return -ENOTSUP; | ||
127 | + } else { | ||
128 | + return -errno; | ||
129 | + } | ||
130 | + } | ||
131 | + if (!ret) { | ||
132 | + /* No progress (e.g. when beyond EOF), fall back to buffer I/O. */ | ||
133 | + return -ENOTSUP; | ||
134 | + } | ||
135 | + bytes -= ret; | ||
136 | + } | ||
137 | + return 0; | ||
138 | +} | ||
139 | + | ||
140 | static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) | ||
141 | { | ||
142 | int ret = -EOPNOTSUPP; | ||
143 | @@ -XXX,XX +XXX,XX @@ static int aio_worker(void *arg) | ||
144 | case QEMU_AIO_WRITE_ZEROES: | ||
145 | ret = handle_aiocb_write_zeroes(aiocb); | ||
146 | break; | ||
147 | + case QEMU_AIO_COPY_RANGE: | ||
148 | + ret = handle_aiocb_copy_range(aiocb); | ||
149 | + break; | ||
150 | default: | ||
151 | fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); | ||
152 | ret = -EINVAL; | ||
153 | @@ -XXX,XX +XXX,XX @@ static int aio_worker(void *arg) | ||
154 | return ret; | ||
155 | } | ||
156 | |||
157 | -static int paio_submit_co(BlockDriverState *bs, int fd, | ||
158 | - int64_t offset, QEMUIOVector *qiov, | ||
159 | - int bytes, int type) | ||
160 | +static int paio_submit_co_full(BlockDriverState *bs, int fd, | ||
161 | + int64_t offset, int fd2, int64_t offset2, | ||
162 | + QEMUIOVector *qiov, | ||
163 | + int bytes, int type) | ||
164 | { | ||
165 | RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); | ||
166 | ThreadPool *pool; | ||
167 | @@ -XXX,XX +XXX,XX @@ static int paio_submit_co(BlockDriverState *bs, int fd, | ||
168 | acb->bs = bs; | ||
169 | acb->aio_type = type; | ||
170 | acb->aio_fildes = fd; | ||
171 | + acb->aio_fd2 = fd2; | ||
172 | + acb->aio_offset2 = offset2; | ||
173 | |||
174 | acb->aio_nbytes = bytes; | ||
175 | acb->aio_offset = offset; | ||
176 | @@ -XXX,XX +XXX,XX @@ static int paio_submit_co(BlockDriverState *bs, int fd, | ||
177 | return thread_pool_submit_co(pool, aio_worker, acb); | ||
178 | } | ||
179 | |||
180 | +static inline int paio_submit_co(BlockDriverState *bs, int fd, | ||
181 | + int64_t offset, QEMUIOVector *qiov, | ||
182 | + int bytes, int type) | ||
183 | +{ | ||
184 | + return paio_submit_co_full(bs, fd, offset, -1, 0, qiov, bytes, type); | ||
185 | +} | ||
186 | + | ||
187 | static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, | ||
188 | int64_t offset, QEMUIOVector *qiov, int bytes, | ||
189 | BlockCompletionFunc *cb, void *opaque, int type) | ||
190 | @@ -XXX,XX +XXX,XX @@ static void raw_abort_perm_update(BlockDriverState *bs) | ||
191 | raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL); | ||
192 | } | ||
193 | |||
194 | +static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs, | ||
195 | + BdrvChild *src, uint64_t src_offset, | ||
196 | + BdrvChild *dst, uint64_t dst_offset, | ||
197 | + uint64_t bytes, BdrvRequestFlags flags) | ||
198 | +{ | ||
199 | + return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, flags); | ||
200 | +} | ||
201 | + | ||
202 | +static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, | ||
203 | + BdrvChild *src, uint64_t src_offset, | ||
204 | + BdrvChild *dst, uint64_t dst_offset, | ||
205 | + uint64_t bytes, BdrvRequestFlags flags) | ||
206 | +{ | ||
207 | + BDRVRawState *s = bs->opaque; | ||
208 | + BDRVRawState *src_s; | ||
209 | + | ||
210 | + assert(dst->bs == bs); | ||
211 | + if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) { | ||
212 | + return -ENOTSUP; | ||
213 | + } | ||
214 | + | ||
215 | + src_s = src->bs->opaque; | ||
216 | + if (fd_open(bs) < 0 || fd_open(bs) < 0) { | ||
217 | + return -EIO; | ||
218 | + } | ||
219 | + return paio_submit_co_full(bs, src_s->fd, src_offset, s->fd, dst_offset, | ||
220 | + NULL, bytes, QEMU_AIO_COPY_RANGE); | ||
221 | +} | ||
222 | + | ||
223 | BlockDriver bdrv_file = { | ||
224 | .format_name = "file", | ||
225 | .protocol_name = "file", | ||
226 | @@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_file = { | ||
227 | .bdrv_co_pwritev = raw_co_pwritev, | ||
228 | .bdrv_aio_flush = raw_aio_flush, | ||
229 | .bdrv_aio_pdiscard = raw_aio_pdiscard, | ||
230 | + .bdrv_co_copy_range_from = raw_co_copy_range_from, | ||
231 | + .bdrv_co_copy_range_to = raw_co_copy_range_to, | ||
232 | .bdrv_refresh_limits = raw_refresh_limits, | ||
233 | .bdrv_io_plug = raw_aio_plug, | ||
234 | .bdrv_io_unplug = raw_aio_unplug, | ||
235 | @@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = { | ||
236 | .bdrv_co_pwritev = raw_co_pwritev, | ||
237 | .bdrv_aio_flush = raw_aio_flush, | ||
238 | .bdrv_aio_pdiscard = hdev_aio_pdiscard, | ||
239 | + .bdrv_co_copy_range_from = raw_co_copy_range_from, | ||
240 | + .bdrv_co_copy_range_to = raw_co_copy_range_to, | ||
241 | .bdrv_refresh_limits = raw_refresh_limits, | ||
242 | .bdrv_io_plug = raw_aio_plug, | ||
243 | .bdrv_io_unplug = raw_aio_unplug, | ||
244 | -- | ||
245 | 2.17.1 | ||
246 | |||
247 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Fam Zheng <famz@redhat.com> | ||
2 | 1 | ||
3 | The device designator data returned in INQUIRY command will be useful to | ||
4 | fill in source/target fields during copy offloading. Do this when | ||
5 | connecting to the target and save the data for later use. | ||
6 | |||
7 | Signed-off-by: Fam Zheng <famz@redhat.com> | ||
8 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | Message-id: 20180601092648.24614-7-famz@redhat.com | ||
10 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
11 | --- | ||
12 | block/iscsi.c | 41 +++++++++++++++++++++++++++++++++++++++++ | ||
13 | 1 file changed, 41 insertions(+) | ||
14 | |||
15 | diff --git a/block/iscsi.c b/block/iscsi.c | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/block/iscsi.c | ||
18 | +++ b/block/iscsi.c | ||
19 | @@ -XXX,XX +XXX,XX @@ typedef struct IscsiLun { | ||
20 | QemuMutex mutex; | ||
21 | struct scsi_inquiry_logical_block_provisioning lbp; | ||
22 | struct scsi_inquiry_block_limits bl; | ||
23 | + struct scsi_inquiry_device_designator *dd; | ||
24 | unsigned char *zeroblock; | ||
25 | /* The allocmap tracks which clusters (pages) on the iSCSI target are | ||
26 | * allocated and which are not. In case a target returns zeros for | ||
27 | @@ -XXX,XX +XXX,XX @@ static QemuOptsList runtime_opts = { | ||
28 | }, | ||
29 | }; | ||
30 | |||
31 | +static void iscsi_save_designator(IscsiLun *lun, | ||
32 | + struct scsi_inquiry_device_identification *inq_di) | ||
33 | +{ | ||
34 | + struct scsi_inquiry_device_designator *desig, *copy = NULL; | ||
35 | + | ||
36 | + for (desig = inq_di->designators; desig; desig = desig->next) { | ||
37 | + if (desig->association || | ||
38 | + desig->designator_type > SCSI_DESIGNATOR_TYPE_NAA) { | ||
39 | + continue; | ||
40 | + } | ||
41 | + /* NAA works better than T10 vendor ID based designator. */ | ||
42 | + if (!copy || copy->designator_type < desig->designator_type) { | ||
43 | + copy = desig; | ||
44 | + } | ||
45 | + } | ||
46 | + if (copy) { | ||
47 | + lun->dd = g_new(struct scsi_inquiry_device_designator, 1); | ||
48 | + *lun->dd = *copy; | ||
49 | + lun->dd->next = NULL; | ||
50 | + lun->dd->designator = g_malloc(copy->designator_length); | ||
51 | + memcpy(lun->dd->designator, copy->designator, copy->designator_length); | ||
52 | + } | ||
53 | +} | ||
54 | + | ||
55 | static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, | ||
56 | Error **errp) | ||
57 | { | ||
58 | @@ -XXX,XX +XXX,XX @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, | ||
59 | struct scsi_task *inq_task; | ||
60 | struct scsi_inquiry_logical_block_provisioning *inq_lbp; | ||
61 | struct scsi_inquiry_block_limits *inq_bl; | ||
62 | + struct scsi_inquiry_device_identification *inq_di; | ||
63 | switch (inq_vpd->pages[i]) { | ||
64 | case SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING: | ||
65 | inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, | ||
66 | @@ -XXX,XX +XXX,XX @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, | ||
67 | sizeof(struct scsi_inquiry_block_limits)); | ||
68 | scsi_free_scsi_task(inq_task); | ||
69 | break; | ||
70 | + case SCSI_INQUIRY_PAGECODE_DEVICE_IDENTIFICATION: | ||
71 | + inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, | ||
72 | + SCSI_INQUIRY_PAGECODE_DEVICE_IDENTIFICATION, | ||
73 | + (void **) &inq_di, errp); | ||
74 | + if (inq_task == NULL) { | ||
75 | + ret = -EINVAL; | ||
76 | + goto out; | ||
77 | + } | ||
78 | + iscsi_save_designator(iscsilun, inq_di); | ||
79 | + scsi_free_scsi_task(inq_task); | ||
80 | + break; | ||
81 | default: | ||
82 | break; | ||
83 | } | ||
84 | @@ -XXX,XX +XXX,XX @@ static void iscsi_close(BlockDriverState *bs) | ||
85 | iscsi_logout_sync(iscsi); | ||
86 | } | ||
87 | iscsi_destroy_context(iscsi); | ||
88 | + if (iscsilun->dd) { | ||
89 | + g_free(iscsilun->dd->designator); | ||
90 | + g_free(iscsilun->dd); | ||
91 | + } | ||
92 | g_free(iscsilun->zeroblock); | ||
93 | iscsi_allocmap_free(iscsilun); | ||
94 | qemu_mutex_destroy(&iscsilun->mutex); | ||
95 | -- | ||
96 | 2.17.1 | ||
97 | |||
98 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Fam Zheng <famz@redhat.com> | ||
2 | 1 | ||
3 | This loop is repeated a growing number times. Make a helper. | ||
4 | |||
5 | Signed-off-by: Fam Zheng <famz@redhat.com> | ||
6 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
7 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
8 | Message-id: 20180601092648.24614-8-famz@redhat.com | ||
9 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
10 | --- | ||
11 | block/iscsi.c | 54 ++++++++++++++++----------------------------------- | ||
12 | 1 file changed, 17 insertions(+), 37 deletions(-) | ||
13 | |||
14 | diff --git a/block/iscsi.c b/block/iscsi.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/block/iscsi.c | ||
17 | +++ b/block/iscsi.c | ||
18 | @@ -XXX,XX +XXX,XX @@ static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun, | ||
19 | offset / iscsilun->cluster_size) == size); | ||
20 | } | ||
21 | |||
22 | +static void coroutine_fn iscsi_co_wait_for_task(IscsiTask *iTask, | ||
23 | + IscsiLun *iscsilun) | ||
24 | +{ | ||
25 | + while (!iTask->complete) { | ||
26 | + iscsi_set_events(iscsilun); | ||
27 | + qemu_mutex_unlock(&iscsilun->mutex); | ||
28 | + qemu_coroutine_yield(); | ||
29 | + qemu_mutex_lock(&iscsilun->mutex); | ||
30 | + } | ||
31 | +} | ||
32 | + | ||
33 | static int coroutine_fn | ||
34 | iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, | ||
35 | QEMUIOVector *iov, int flags) | ||
36 | @@ -XXX,XX +XXX,XX @@ retry: | ||
37 | scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov, | ||
38 | iov->niov); | ||
39 | #endif | ||
40 | - while (!iTask.complete) { | ||
41 | - iscsi_set_events(iscsilun); | ||
42 | - qemu_mutex_unlock(&iscsilun->mutex); | ||
43 | - qemu_coroutine_yield(); | ||
44 | - qemu_mutex_lock(&iscsilun->mutex); | ||
45 | - } | ||
46 | + iscsi_co_wait_for_task(&iTask, iscsilun); | ||
47 | |||
48 | if (iTask.task != NULL) { | ||
49 | scsi_free_scsi_task(iTask.task); | ||
50 | @@ -XXX,XX +XXX,XX @@ retry: | ||
51 | ret = -ENOMEM; | ||
52 | goto out_unlock; | ||
53 | } | ||
54 | - | ||
55 | - while (!iTask.complete) { | ||
56 | - iscsi_set_events(iscsilun); | ||
57 | - qemu_mutex_unlock(&iscsilun->mutex); | ||
58 | - qemu_coroutine_yield(); | ||
59 | - qemu_mutex_lock(&iscsilun->mutex); | ||
60 | - } | ||
61 | + iscsi_co_wait_for_task(&iTask, iscsilun); | ||
62 | |||
63 | if (iTask.do_retry) { | ||
64 | if (iTask.task != NULL) { | ||
65 | @@ -XXX,XX +XXX,XX @@ retry: | ||
66 | #if LIBISCSI_API_VERSION < (20160603) | ||
67 | scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov); | ||
68 | #endif | ||
69 | - while (!iTask.complete) { | ||
70 | - iscsi_set_events(iscsilun); | ||
71 | - qemu_mutex_unlock(&iscsilun->mutex); | ||
72 | - qemu_coroutine_yield(); | ||
73 | - qemu_mutex_lock(&iscsilun->mutex); | ||
74 | - } | ||
75 | |||
76 | + iscsi_co_wait_for_task(&iTask, iscsilun); | ||
77 | if (iTask.task != NULL) { | ||
78 | scsi_free_scsi_task(iTask.task); | ||
79 | iTask.task = NULL; | ||
80 | @@ -XXX,XX +XXX,XX @@ retry: | ||
81 | return -ENOMEM; | ||
82 | } | ||
83 | |||
84 | - while (!iTask.complete) { | ||
85 | - iscsi_set_events(iscsilun); | ||
86 | - qemu_mutex_unlock(&iscsilun->mutex); | ||
87 | - qemu_coroutine_yield(); | ||
88 | - qemu_mutex_lock(&iscsilun->mutex); | ||
89 | - } | ||
90 | + iscsi_co_wait_for_task(&iTask, iscsilun); | ||
91 | |||
92 | if (iTask.task != NULL) { | ||
93 | scsi_free_scsi_task(iTask.task); | ||
94 | @@ -XXX,XX +XXX,XX @@ retry: | ||
95 | goto out_unlock; | ||
96 | } | ||
97 | |||
98 | - while (!iTask.complete) { | ||
99 | - iscsi_set_events(iscsilun); | ||
100 | - qemu_mutex_unlock(&iscsilun->mutex); | ||
101 | - qemu_coroutine_yield(); | ||
102 | - qemu_mutex_lock(&iscsilun->mutex); | ||
103 | - } | ||
104 | + iscsi_co_wait_for_task(&iTask, iscsilun); | ||
105 | |||
106 | if (iTask.task != NULL) { | ||
107 | scsi_free_scsi_task(iTask.task); | ||
108 | @@ -XXX,XX +XXX,XX @@ retry: | ||
109 | return -ENOMEM; | ||
110 | } | ||
111 | |||
112 | - while (!iTask.complete) { | ||
113 | - iscsi_set_events(iscsilun); | ||
114 | - qemu_mutex_unlock(&iscsilun->mutex); | ||
115 | - qemu_coroutine_yield(); | ||
116 | - qemu_mutex_lock(&iscsilun->mutex); | ||
117 | - } | ||
118 | + iscsi_co_wait_for_task(&iTask, iscsilun); | ||
119 | |||
120 | if (iTask.status == SCSI_STATUS_CHECK_CONDITION && | ||
121 | iTask.task->sense.key == SCSI_SENSE_ILLEGAL_REQUEST && | ||
122 | -- | ||
123 | 2.17.1 | ||
124 | |||
125 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Fam Zheng <famz@redhat.com> | ||
2 | 1 | ||
3 | Issue EXTENDED COPY (LID1) command to implement the copy_range API. | ||
4 | |||
5 | The parameter data construction code is modified from libiscsi's | ||
6 | iscsi-dd.c. | ||
7 | |||
8 | Signed-off-by: Fam Zheng <famz@redhat.com> | ||
9 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
10 | Message-id: 20180601092648.24614-9-famz@redhat.com | ||
11 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | --- | ||
13 | include/scsi/constants.h | 4 + | ||
14 | block/iscsi.c | 219 +++++++++++++++++++++++++++++++++++++++ | ||
15 | 2 files changed, 223 insertions(+) | ||
16 | |||
17 | diff --git a/include/scsi/constants.h b/include/scsi/constants.h | ||
18 | index XXXXXXX..XXXXXXX 100644 | ||
19 | --- a/include/scsi/constants.h | ||
20 | +++ b/include/scsi/constants.h | ||
21 | @@ -XXX,XX +XXX,XX @@ | ||
22 | #define MMC_PROFILE_HDDVD_RW_DL 0x005A | ||
23 | #define MMC_PROFILE_INVALID 0xFFFF | ||
24 | |||
25 | +#define XCOPY_DESC_OFFSET 16 | ||
26 | +#define IDENT_DESCR_TGT_DESCR_SIZE 32 | ||
27 | +#define XCOPY_BLK2BLK_SEG_DESC_SIZE 28 | ||
28 | + | ||
29 | #endif | ||
30 | diff --git a/block/iscsi.c b/block/iscsi.c | ||
31 | index XXXXXXX..XXXXXXX 100644 | ||
32 | --- a/block/iscsi.c | ||
33 | +++ b/block/iscsi.c | ||
34 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn iscsi_co_invalidate_cache(BlockDriverState *bs, | ||
35 | iscsi_allocmap_invalidate(iscsilun); | ||
36 | } | ||
37 | |||
38 | +static int coroutine_fn iscsi_co_copy_range_from(BlockDriverState *bs, | ||
39 | + BdrvChild *src, | ||
40 | + uint64_t src_offset, | ||
41 | + BdrvChild *dst, | ||
42 | + uint64_t dst_offset, | ||
43 | + uint64_t bytes, | ||
44 | + BdrvRequestFlags flags) | ||
45 | +{ | ||
46 | + return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, flags); | ||
47 | +} | ||
48 | + | ||
49 | +static struct scsi_task *iscsi_xcopy_task(int param_len) | ||
50 | +{ | ||
51 | + struct scsi_task *task; | ||
52 | + | ||
53 | + task = g_new0(struct scsi_task, 1); | ||
54 | + | ||
55 | + task->cdb[0] = EXTENDED_COPY; | ||
56 | + task->cdb[10] = (param_len >> 24) & 0xFF; | ||
57 | + task->cdb[11] = (param_len >> 16) & 0xFF; | ||
58 | + task->cdb[12] = (param_len >> 8) & 0xFF; | ||
59 | + task->cdb[13] = param_len & 0xFF; | ||
60 | + task->cdb_size = 16; | ||
61 | + task->xfer_dir = SCSI_XFER_WRITE; | ||
62 | + task->expxferlen = param_len; | ||
63 | + | ||
64 | + return task; | ||
65 | +} | ||
66 | + | ||
67 | +static void iscsi_populate_target_desc(unsigned char *desc, IscsiLun *lun) | ||
68 | +{ | ||
69 | + struct scsi_inquiry_device_designator *dd = lun->dd; | ||
70 | + | ||
71 | + memset(desc, 0, 32); | ||
72 | + desc[0] = 0xE4; /* IDENT_DESCR_TGT_DESCR */ | ||
73 | + desc[4] = dd->code_set; | ||
74 | + desc[5] = (dd->designator_type & 0xF) | ||
75 | + | ((dd->association & 3) << 4); | ||
76 | + desc[7] = dd->designator_length; | ||
77 | + memcpy(desc + 8, dd->designator, dd->designator_length); | ||
78 | + | ||
79 | + desc[28] = 0; | ||
80 | + desc[29] = (lun->block_size >> 16) & 0xFF; | ||
81 | + desc[30] = (lun->block_size >> 8) & 0xFF; | ||
82 | + desc[31] = lun->block_size & 0xFF; | ||
83 | +} | ||
84 | + | ||
85 | +static void iscsi_xcopy_desc_hdr(uint8_t *hdr, int dc, int cat, int src_index, | ||
86 | + int dst_index) | ||
87 | +{ | ||
88 | + hdr[0] = 0x02; /* BLK_TO_BLK_SEG_DESCR */ | ||
89 | + hdr[1] = ((dc << 1) | cat) & 0xFF; | ||
90 | + hdr[2] = (XCOPY_BLK2BLK_SEG_DESC_SIZE >> 8) & 0xFF; | ||
91 | + /* don't account for the first 4 bytes in descriptor header*/ | ||
92 | + hdr[3] = (XCOPY_BLK2BLK_SEG_DESC_SIZE - 4 /* SEG_DESC_SRC_INDEX_OFFSET */) & 0xFF; | ||
93 | + hdr[4] = (src_index >> 8) & 0xFF; | ||
94 | + hdr[5] = src_index & 0xFF; | ||
95 | + hdr[6] = (dst_index >> 8) & 0xFF; | ||
96 | + hdr[7] = dst_index & 0xFF; | ||
97 | +} | ||
98 | + | ||
99 | +static void iscsi_xcopy_populate_desc(uint8_t *desc, int dc, int cat, | ||
100 | + int src_index, int dst_index, int num_blks, | ||
101 | + uint64_t src_lba, uint64_t dst_lba) | ||
102 | +{ | ||
103 | + iscsi_xcopy_desc_hdr(desc, dc, cat, src_index, dst_index); | ||
104 | + | ||
105 | + /* The caller should verify the request size */ | ||
106 | + assert(num_blks < 65536); | ||
107 | + desc[10] = (num_blks >> 8) & 0xFF; | ||
108 | + desc[11] = num_blks & 0xFF; | ||
109 | + desc[12] = (src_lba >> 56) & 0xFF; | ||
110 | + desc[13] = (src_lba >> 48) & 0xFF; | ||
111 | + desc[14] = (src_lba >> 40) & 0xFF; | ||
112 | + desc[15] = (src_lba >> 32) & 0xFF; | ||
113 | + desc[16] = (src_lba >> 24) & 0xFF; | ||
114 | + desc[17] = (src_lba >> 16) & 0xFF; | ||
115 | + desc[18] = (src_lba >> 8) & 0xFF; | ||
116 | + desc[19] = src_lba & 0xFF; | ||
117 | + desc[20] = (dst_lba >> 56) & 0xFF; | ||
118 | + desc[21] = (dst_lba >> 48) & 0xFF; | ||
119 | + desc[22] = (dst_lba >> 40) & 0xFF; | ||
120 | + desc[23] = (dst_lba >> 32) & 0xFF; | ||
121 | + desc[24] = (dst_lba >> 24) & 0xFF; | ||
122 | + desc[25] = (dst_lba >> 16) & 0xFF; | ||
123 | + desc[26] = (dst_lba >> 8) & 0xFF; | ||
124 | + desc[27] = dst_lba & 0xFF; | ||
125 | +} | ||
126 | + | ||
127 | +static void iscsi_xcopy_populate_header(unsigned char *buf, int list_id, int str, | ||
128 | + int list_id_usage, int prio, | ||
129 | + int tgt_desc_len, | ||
130 | + int seg_desc_len, int inline_data_len) | ||
131 | +{ | ||
132 | + buf[0] = list_id; | ||
133 | + buf[1] = ((str & 1) << 5) | ((list_id_usage & 3) << 3) | (prio & 7); | ||
134 | + buf[2] = (tgt_desc_len >> 8) & 0xFF; | ||
135 | + buf[3] = tgt_desc_len & 0xFF; | ||
136 | + buf[8] = (seg_desc_len >> 24) & 0xFF; | ||
137 | + buf[9] = (seg_desc_len >> 16) & 0xFF; | ||
138 | + buf[10] = (seg_desc_len >> 8) & 0xFF; | ||
139 | + buf[11] = seg_desc_len & 0xFF; | ||
140 | + buf[12] = (inline_data_len >> 24) & 0xFF; | ||
141 | + buf[13] = (inline_data_len >> 16) & 0xFF; | ||
142 | + buf[14] = (inline_data_len >> 8) & 0xFF; | ||
143 | + buf[15] = inline_data_len & 0xFF; | ||
144 | +} | ||
145 | + | ||
146 | +static void iscsi_xcopy_data(struct iscsi_data *data, | ||
147 | + IscsiLun *src, int64_t src_lba, | ||
148 | + IscsiLun *dst, int64_t dst_lba, | ||
149 | + uint16_t num_blocks) | ||
150 | +{ | ||
151 | + uint8_t *buf; | ||
152 | + const int src_offset = XCOPY_DESC_OFFSET; | ||
153 | + const int dst_offset = XCOPY_DESC_OFFSET + IDENT_DESCR_TGT_DESCR_SIZE; | ||
154 | + const int seg_offset = dst_offset + IDENT_DESCR_TGT_DESCR_SIZE; | ||
155 | + | ||
156 | + data->size = XCOPY_DESC_OFFSET + | ||
157 | + IDENT_DESCR_TGT_DESCR_SIZE * 2 + | ||
158 | + XCOPY_BLK2BLK_SEG_DESC_SIZE; | ||
159 | + data->data = g_malloc0(data->size); | ||
160 | + buf = data->data; | ||
161 | + | ||
162 | + /* Initialise the parameter list header */ | ||
163 | + iscsi_xcopy_populate_header(buf, 1, 0, 2 /* LIST_ID_USAGE_DISCARD */, | ||
164 | + 0, 2 * IDENT_DESCR_TGT_DESCR_SIZE, | ||
165 | + XCOPY_BLK2BLK_SEG_DESC_SIZE, | ||
166 | + 0); | ||
167 | + | ||
168 | + /* Initialise CSCD list with one src + one dst descriptor */ | ||
169 | + iscsi_populate_target_desc(&buf[src_offset], src); | ||
170 | + iscsi_populate_target_desc(&buf[dst_offset], dst); | ||
171 | + | ||
172 | + /* Initialise one segment descriptor */ | ||
173 | + iscsi_xcopy_populate_desc(&buf[seg_offset], 0, 0, 0, 1, num_blocks, | ||
174 | + src_lba, dst_lba); | ||
175 | +} | ||
176 | + | ||
177 | +static int coroutine_fn iscsi_co_copy_range_to(BlockDriverState *bs, | ||
178 | + BdrvChild *src, | ||
179 | + uint64_t src_offset, | ||
180 | + BdrvChild *dst, | ||
181 | + uint64_t dst_offset, | ||
182 | + uint64_t bytes, | ||
183 | + BdrvRequestFlags flags) | ||
184 | +{ | ||
185 | + IscsiLun *dst_lun = dst->bs->opaque; | ||
186 | + IscsiLun *src_lun; | ||
187 | + struct IscsiTask iscsi_task; | ||
188 | + struct iscsi_data data; | ||
189 | + int r = 0; | ||
190 | + int block_size; | ||
191 | + | ||
192 | + if (src->bs->drv->bdrv_co_copy_range_to != iscsi_co_copy_range_to) { | ||
193 | + return -ENOTSUP; | ||
194 | + } | ||
195 | + src_lun = src->bs->opaque; | ||
196 | + | ||
197 | + if (!src_lun->dd || !dst_lun->dd) { | ||
198 | + return -ENOTSUP; | ||
199 | + } | ||
200 | + if (!is_byte_request_lun_aligned(dst_offset, bytes, dst_lun)) { | ||
201 | + return -ENOTSUP; | ||
202 | + } | ||
203 | + if (!is_byte_request_lun_aligned(src_offset, bytes, src_lun)) { | ||
204 | + return -ENOTSUP; | ||
205 | + } | ||
206 | + if (dst_lun->block_size != src_lun->block_size || | ||
207 | + !dst_lun->block_size) { | ||
208 | + return -ENOTSUP; | ||
209 | + } | ||
210 | + | ||
211 | + block_size = dst_lun->block_size; | ||
212 | + if (bytes / block_size > 65535) { | ||
213 | + return -ENOTSUP; | ||
214 | + } | ||
215 | + | ||
216 | + iscsi_xcopy_data(&data, | ||
217 | + src_lun, src_offset / block_size, | ||
218 | + dst_lun, dst_offset / block_size, | ||
219 | + bytes / block_size); | ||
220 | + | ||
221 | + iscsi_co_init_iscsitask(dst_lun, &iscsi_task); | ||
222 | + | ||
223 | + qemu_mutex_lock(&dst_lun->mutex); | ||
224 | + iscsi_task.task = iscsi_xcopy_task(data.size); | ||
225 | +retry: | ||
226 | + if (iscsi_scsi_command_async(dst_lun->iscsi, dst_lun->lun, | ||
227 | + iscsi_task.task, iscsi_co_generic_cb, | ||
228 | + &data, | ||
229 | + &iscsi_task) != 0) { | ||
230 | + r = -EIO; | ||
231 | + goto out_unlock; | ||
232 | + } | ||
233 | + | ||
234 | + iscsi_co_wait_for_task(&iscsi_task, dst_lun); | ||
235 | + | ||
236 | + if (iscsi_task.do_retry) { | ||
237 | + iscsi_task.complete = 0; | ||
238 | + goto retry; | ||
239 | + } | ||
240 | + | ||
241 | + if (iscsi_task.status != SCSI_STATUS_GOOD) { | ||
242 | + r = iscsi_task.err_code; | ||
243 | + goto out_unlock; | ||
244 | + } | ||
245 | + | ||
246 | +out_unlock: | ||
247 | + g_free(iscsi_task.task); | ||
248 | + qemu_mutex_unlock(&dst_lun->mutex); | ||
249 | + g_free(iscsi_task.err_str); | ||
250 | + return r; | ||
251 | +} | ||
252 | + | ||
253 | static QemuOptsList iscsi_create_opts = { | ||
254 | .name = "iscsi-create-opts", | ||
255 | .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head), | ||
256 | @@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_iscsi = { | ||
257 | |||
258 | .bdrv_co_block_status = iscsi_co_block_status, | ||
259 | .bdrv_co_pdiscard = iscsi_co_pdiscard, | ||
260 | + .bdrv_co_copy_range_from = iscsi_co_copy_range_from, | ||
261 | + .bdrv_co_copy_range_to = iscsi_co_copy_range_to, | ||
262 | .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes, | ||
263 | .bdrv_co_readv = iscsi_co_readv, | ||
264 | .bdrv_co_writev = iscsi_co_writev, | ||
265 | @@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_iser = { | ||
266 | |||
267 | .bdrv_co_block_status = iscsi_co_block_status, | ||
268 | .bdrv_co_pdiscard = iscsi_co_pdiscard, | ||
269 | + .bdrv_co_copy_range_from = iscsi_co_copy_range_from, | ||
270 | + .bdrv_co_copy_range_to = iscsi_co_copy_range_to, | ||
271 | .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes, | ||
272 | .bdrv_co_readv = iscsi_co_readv, | ||
273 | .bdrv_co_writev = iscsi_co_writev, | ||
274 | -- | ||
275 | 2.17.1 | ||
276 | |||
277 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Fam Zheng <famz@redhat.com> | ||
2 | 1 | ||
3 | It's a BlockBackend wrapper of the BDS interface. | ||
4 | |||
5 | Signed-off-by: Fam Zheng <famz@redhat.com> | ||
6 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
7 | Message-id: 20180601092648.24614-10-famz@redhat.com | ||
8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | --- | ||
10 | include/sysemu/block-backend.h | 4 ++++ | ||
11 | block/block-backend.c | 18 ++++++++++++++++++ | ||
12 | 2 files changed, 22 insertions(+) | ||
13 | |||
14 | diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/include/sysemu/block-backend.h | ||
17 | +++ b/include/sysemu/block-backend.h | ||
18 | @@ -XXX,XX +XXX,XX @@ void blk_set_force_allow_inactivate(BlockBackend *blk); | ||
19 | void blk_register_buf(BlockBackend *blk, void *host, size_t size); | ||
20 | void blk_unregister_buf(BlockBackend *blk, void *host); | ||
21 | |||
22 | +int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, | ||
23 | + BlockBackend *blk_out, int64_t off_out, | ||
24 | + int bytes, BdrvRequestFlags flags); | ||
25 | + | ||
26 | #endif | ||
27 | diff --git a/block/block-backend.c b/block/block-backend.c | ||
28 | index XXXXXXX..XXXXXXX 100644 | ||
29 | --- a/block/block-backend.c | ||
30 | +++ b/block/block-backend.c | ||
31 | @@ -XXX,XX +XXX,XX @@ void blk_unregister_buf(BlockBackend *blk, void *host) | ||
32 | { | ||
33 | bdrv_unregister_buf(blk_bs(blk), host); | ||
34 | } | ||
35 | + | ||
36 | +int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, | ||
37 | + BlockBackend *blk_out, int64_t off_out, | ||
38 | + int bytes, BdrvRequestFlags flags) | ||
39 | +{ | ||
40 | + int r; | ||
41 | + r = blk_check_byte_request(blk_in, off_in, bytes); | ||
42 | + if (r) { | ||
43 | + return r; | ||
44 | + } | ||
45 | + r = blk_check_byte_request(blk_out, off_out, bytes); | ||
46 | + if (r) { | ||
47 | + return r; | ||
48 | + } | ||
49 | + return bdrv_co_copy_range(blk_in->root, off_in, | ||
50 | + blk_out->root, off_out, | ||
51 | + bytes, flags); | ||
52 | +} | ||
53 | -- | ||
54 | 2.17.1 | ||
55 | |||
56 | diff view generated by jsdifflib |
1 | From: Fam Zheng <famz@redhat.com> | 1 | From: Paolo Bonzini <pbonzini@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The new blk_co_copy_range interface offers a more efficient way in the | 3 | When using C11 atomics, non-seqcst reads and writes do not participate |
4 | case of network based storage. Make use of it to allow faster convert | 4 | in the total order of seqcst operations. In util/async.c and util/aio-posix.c, |
5 | operation. | 5 | in particular, the pattern that we use |
6 | 6 | ||
7 | Since copy offloading cannot do zero detection ('-S') and compression | 7 | write ctx->notify_me write bh->scheduled |
8 | (-c), only try it when these options are not used. | 8 | read bh->scheduled read ctx->notify_me |
9 | if !bh->scheduled, sleep if ctx->notify_me, notify | ||
9 | 10 | ||
10 | Signed-off-by: Fam Zheng <famz@redhat.com> | 11 | needs to use seqcst operations for both the write and the read. In |
11 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 12 | general this is something that we do not want, because there can be |
12 | Message-id: 20180601092648.24614-11-famz@redhat.com | 13 | many sources that are polled in addition to bottom halves. The |
14 | alternative is to place a seqcst memory barrier between the write | ||
15 | and the read. This also comes with a disadvantage, in that the | ||
16 | memory barrier is implicit on strongly-ordered architectures and | ||
17 | it wastes a few dozen clock cycles. | ||
18 | |||
19 | Fortunately, ctx->notify_me is never written concurrently by two | ||
20 | threads, so we can assert that and relax the writes to ctx->notify_me. | ||
21 | The resulting solution works and performs well on both aarch64 and x86. | ||
22 | |||
23 | Note that the atomic_set/atomic_read combination is not an atomic | ||
24 | read-modify-write, and therefore it is even weaker than C11 ATOMIC_RELAXED; | ||
25 | on x86, ATOMIC_RELAXED compiles to a locked operation. | ||
26 | |||
27 | Analyzed-by: Ying Fang <fangying1@huawei.com> | ||
28 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | ||
29 | Tested-by: Ying Fang <fangying1@huawei.com> | ||
30 | Message-Id: <20200407140746.8041-6-pbonzini@redhat.com> | ||
13 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 31 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
14 | --- | 32 | --- |
15 | qemu-img.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- | 33 | util/aio-posix.c | 16 ++++++++++++++-- |
16 | 1 file changed, 48 insertions(+), 2 deletions(-) | 34 | util/aio-win32.c | 17 ++++++++++++++--- |
35 | util/async.c | 16 ++++++++++++---- | ||
36 | 3 files changed, 40 insertions(+), 9 deletions(-) | ||
17 | 37 | ||
18 | diff --git a/qemu-img.c b/qemu-img.c | 38 | diff --git a/util/aio-posix.c b/util/aio-posix.c |
19 | index XXXXXXX..XXXXXXX 100644 | 39 | index XXXXXXX..XXXXXXX 100644 |
20 | --- a/qemu-img.c | 40 | --- a/util/aio-posix.c |
21 | +++ b/qemu-img.c | 41 | +++ b/util/aio-posix.c |
22 | @@ -XXX,XX +XXX,XX @@ typedef struct ImgConvertState { | 42 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) |
23 | bool compressed; | 43 | int64_t timeout; |
24 | bool target_has_backing; | 44 | int64_t start = 0; |
25 | bool wr_in_order; | 45 | |
26 | + bool copy_range; | 46 | + /* |
27 | int min_sparse; | 47 | + * There cannot be two concurrent aio_poll calls for the same AioContext (or |
28 | size_t cluster_sectors; | 48 | + * an aio_poll concurrent with a GSource prepare/check/dispatch callback). |
29 | size_t buf_sectors; | 49 | + * We rely on this below to avoid slow locked accesses to ctx->notify_me. |
30 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn convert_co_write(ImgConvertState *s, int64_t sector_num, | 50 | + */ |
31 | return 0; | 51 | assert(in_aio_context_home_thread(ctx)); |
32 | } | 52 | |
33 | 53 | /* aio_notify can avoid the expensive event_notifier_set if | |
34 | +static int coroutine_fn convert_co_copy_range(ImgConvertState *s, int64_t sector_num, | 54 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) |
35 | + int nb_sectors) | 55 | * so disable the optimization now. |
36 | +{ | 56 | */ |
37 | + int n, ret; | 57 | if (blocking) { |
58 | - atomic_add(&ctx->notify_me, 2); | ||
59 | + atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2); | ||
60 | + /* | ||
61 | + * Write ctx->notify_me before computing the timeout | ||
62 | + * (reading bottom half flags, etc.). Pairs with | ||
63 | + * smp_mb in aio_notify(). | ||
64 | + */ | ||
65 | + smp_mb(); | ||
66 | } | ||
67 | |||
68 | qemu_lockcnt_inc(&ctx->list_lock); | ||
69 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
70 | } | ||
71 | |||
72 | if (blocking) { | ||
73 | - atomic_sub(&ctx->notify_me, 2); | ||
74 | + /* Finish the poll before clearing the flag. */ | ||
75 | + atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2); | ||
76 | aio_notify_accept(ctx); | ||
77 | } | ||
78 | |||
79 | diff --git a/util/aio-win32.c b/util/aio-win32.c | ||
80 | index XXXXXXX..XXXXXXX 100644 | ||
81 | --- a/util/aio-win32.c | ||
82 | +++ b/util/aio-win32.c | ||
83 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
84 | int count; | ||
85 | int timeout; | ||
86 | |||
87 | + /* | ||
88 | + * There cannot be two concurrent aio_poll calls for the same AioContext (or | ||
89 | + * an aio_poll concurrent with a GSource prepare/check/dispatch callback). | ||
90 | + * We rely on this below to avoid slow locked accesses to ctx->notify_me. | ||
91 | + */ | ||
92 | + assert(in_aio_context_home_thread(ctx)); | ||
93 | progress = false; | ||
94 | |||
95 | /* aio_notify can avoid the expensive event_notifier_set if | ||
96 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
97 | * so disable the optimization now. | ||
98 | */ | ||
99 | if (blocking) { | ||
100 | - atomic_add(&ctx->notify_me, 2); | ||
101 | + atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2); | ||
102 | + /* | ||
103 | + * Write ctx->notify_me before computing the timeout | ||
104 | + * (reading bottom half flags, etc.). Pairs with | ||
105 | + * smp_mb in aio_notify(). | ||
106 | + */ | ||
107 | + smp_mb(); | ||
108 | } | ||
109 | |||
110 | qemu_lockcnt_inc(&ctx->list_lock); | ||
111 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
112 | ret = WaitForMultipleObjects(count, events, FALSE, timeout); | ||
113 | if (blocking) { | ||
114 | assert(first); | ||
115 | - assert(in_aio_context_home_thread(ctx)); | ||
116 | - atomic_sub(&ctx->notify_me, 2); | ||
117 | + atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2); | ||
118 | aio_notify_accept(ctx); | ||
119 | } | ||
120 | |||
121 | diff --git a/util/async.c b/util/async.c | ||
122 | index XXXXXXX..XXXXXXX 100644 | ||
123 | --- a/util/async.c | ||
124 | +++ b/util/async.c | ||
125 | @@ -XXX,XX +XXX,XX @@ aio_ctx_prepare(GSource *source, gint *timeout) | ||
126 | { | ||
127 | AioContext *ctx = (AioContext *) source; | ||
128 | |||
129 | - atomic_or(&ctx->notify_me, 1); | ||
130 | + atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) | 1); | ||
38 | + | 131 | + |
39 | + while (nb_sectors > 0) { | 132 | + /* |
40 | + BlockBackend *blk; | 133 | + * Write ctx->notify_me before computing the timeout |
41 | + int src_cur; | 134 | + * (reading bottom half flags, etc.). Pairs with |
42 | + int64_t bs_sectors, src_cur_offset; | 135 | + * smp_mb in aio_notify(). |
43 | + int64_t offset; | 136 | + */ |
44 | + | 137 | + smp_mb(); |
45 | + convert_select_part(s, sector_num, &src_cur, &src_cur_offset); | 138 | |
46 | + offset = (sector_num - src_cur_offset) << BDRV_SECTOR_BITS; | 139 | /* We assume there is no timeout already supplied */ |
47 | + blk = s->src[src_cur]; | 140 | *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)); |
48 | + bs_sectors = s->src_sectors[src_cur]; | 141 | @@ -XXX,XX +XXX,XX @@ aio_ctx_check(GSource *source) |
49 | + | 142 | QEMUBH *bh; |
50 | + n = MIN(nb_sectors, bs_sectors - (sector_num - src_cur_offset)); | 143 | BHListSlice *s; |
51 | + | 144 | |
52 | + ret = blk_co_copy_range(blk, offset, s->target, | 145 | - atomic_and(&ctx->notify_me, ~1); |
53 | + sector_num << BDRV_SECTOR_BITS, | 146 | + /* Finish computing the timeout before clearing the flag. */ |
54 | + n << BDRV_SECTOR_BITS, 0); | 147 | + atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) & ~1); |
55 | + if (ret < 0) { | 148 | aio_notify_accept(ctx); |
56 | + return ret; | 149 | |
57 | + } | 150 | QSLIST_FOREACH_RCU(bh, &ctx->bh_list, next) { |
58 | + | 151 | @@ -XXX,XX +XXX,XX @@ LuringState *aio_get_linux_io_uring(AioContext *ctx) |
59 | + sector_num += n; | 152 | void aio_notify(AioContext *ctx) |
60 | + nb_sectors -= n; | ||
61 | + } | ||
62 | + return 0; | ||
63 | +} | ||
64 | + | ||
65 | static void coroutine_fn convert_co_do_copy(void *opaque) | ||
66 | { | 153 | { |
67 | ImgConvertState *s = opaque; | 154 | /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs |
68 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn convert_co_do_copy(void *opaque) | 155 | - * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll. |
69 | int n; | 156 | + * with smp_mb in aio_ctx_prepare or aio_poll. |
70 | int64_t sector_num; | 157 | */ |
71 | enum ImgConvertBlockStatus status; | 158 | smp_mb(); |
72 | + bool copy_range; | 159 | - if (ctx->notify_me) { |
73 | 160 | + if (atomic_read(&ctx->notify_me)) { | |
74 | qemu_co_mutex_lock(&s->lock); | 161 | event_notifier_set(&ctx->notifier); |
75 | if (s->ret != -EINPROGRESS || s->sector_num >= s->total_sectors) { | 162 | atomic_mb_set(&ctx->notified, true); |
76 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn convert_co_do_copy(void *opaque) | 163 | } |
77 | s->allocated_sectors, 0); | ||
78 | } | ||
79 | |||
80 | - if (status == BLK_DATA) { | ||
81 | +retry: | ||
82 | + copy_range = s->copy_range && s->status == BLK_DATA; | ||
83 | + if (status == BLK_DATA && !copy_range) { | ||
84 | ret = convert_co_read(s, sector_num, n, buf); | ||
85 | if (ret < 0) { | ||
86 | error_report("error while reading sector %" PRId64 | ||
87 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn convert_co_do_copy(void *opaque) | ||
88 | } | ||
89 | |||
90 | if (s->ret == -EINPROGRESS) { | ||
91 | - ret = convert_co_write(s, sector_num, n, buf, status); | ||
92 | + if (copy_range) { | ||
93 | + ret = convert_co_copy_range(s, sector_num, n); | ||
94 | + if (ret) { | ||
95 | + s->copy_range = false; | ||
96 | + goto retry; | ||
97 | + } | ||
98 | + } else { | ||
99 | + ret = convert_co_write(s, sector_num, n, buf, status); | ||
100 | + } | ||
101 | if (ret < 0) { | ||
102 | error_report("error while writing sector %" PRId64 | ||
103 | ": %s", sector_num, strerror(-ret)); | ||
104 | @@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv) | ||
105 | ImgConvertState s = (ImgConvertState) { | ||
106 | /* Need at least 4k of zeros for sparse detection */ | ||
107 | .min_sparse = 8, | ||
108 | + .copy_range = true, | ||
109 | .buf_sectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE, | ||
110 | .wr_in_order = true, | ||
111 | .num_coroutines = 8, | ||
112 | @@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv) | ||
113 | break; | ||
114 | case 'c': | ||
115 | s.compressed = true; | ||
116 | + s.copy_range = false; | ||
117 | break; | ||
118 | case 'o': | ||
119 | if (!is_valid_option_list(optarg)) { | ||
120 | @@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv) | ||
121 | } | ||
122 | |||
123 | s.min_sparse = sval / BDRV_SECTOR_SIZE; | ||
124 | + s.copy_range = false; | ||
125 | break; | ||
126 | } | ||
127 | case 'p': | ||
128 | -- | 164 | -- |
129 | 2.17.1 | 165 | 2.25.1 |
130 | 166 | ||
131 | diff view generated by jsdifflib |