1
The following changes since commit c25e8bba1f546ea72744ccfab77f8a9e8a323be8:
1
The following changes since commit 8bac3ba57eecc466b7e73dabf7d19328a59f684e:
2
2
3
Merge remote-tracking branch 'remotes/otubo/tags/pull-seccomp-20180601' into staging (2018-06-01 13:11:30 +0100)
3
Merge remote-tracking branch 'remotes/rth/tags/pull-rx-20200408' into staging (2020-04-09 13:23:30 +0100)
4
4
5
are available in the Git repository at:
5
are available in the Git repository at:
6
6
7
git://github.com/stefanha/qemu.git tags/block-pull-request
7
https://github.com/stefanha/qemu.git tags/block-pull-request
8
8
9
for you to fetch changes up to 21891a5a3011608845b5d7f1f9cce60cdc2bcc62:
9
for you to fetch changes up to 5710a3e09f9b85801e5ce70797a4a511e5fc9e2c:
10
10
11
main-loop: drop spin_counter (2018-06-01 16:01:29 +0100)
11
async: use explicit memory barriers (2020-04-09 16:17:14 +0100)
12
12
13
----------------------------------------------------------------
13
----------------------------------------------------------------
14
Pull request
14
Pull request
15
15
16
* Copy offloading for qemu-img convert (iSCSI, raw, and qcow2)
16
Fixes for QEMU on aarch64 ARM hosts and fdmon-io_uring.
17
18
If the underlying storage supports copy offloading, qemu-img convert will
19
use it instead of performing reads and writes. This avoids data transfers
20
and thus frees up storage bandwidth for other purposes. SCSI EXTENDED COPY
21
and Linux copy_file_range(2) are used to implement this optimization.
22
23
* Drop spurious "WARNING: I\/O thread spun for 1000 iterations" warning
24
17
25
----------------------------------------------------------------
18
----------------------------------------------------------------
26
19
27
Fam Zheng (10):
20
Paolo Bonzini (2):
28
block: Introduce API for copy offloading
21
aio-wait: delegate polling of main AioContext if BQL not held
29
raw: Check byte range uniformly
22
async: use explicit memory barriers
30
raw: Implement copy offloading
31
qcow2: Implement copy offloading
32
file-posix: Implement bdrv_co_copy_range
33
iscsi: Query and save device designator when opening
34
iscsi: Create and use iscsi_co_wait_for_task
35
iscsi: Implement copy offloading
36
block-backend: Add blk_co_copy_range
37
qemu-img: Convert with copy offloading
38
23
39
Stefan Hajnoczi (1):
24
Stefan Hajnoczi (1):
40
main-loop: drop spin_counter
25
aio-posix: signal-proof fdmon-io_uring
41
26
42
configure | 17 ++
27
include/block/aio-wait.h | 22 ++++++++++++++++++++++
43
include/block/block.h | 32 ++++
28
include/block/aio.h | 29 ++++++++++-------------------
44
include/block/block_int.h | 38 ++++
29
util/aio-posix.c | 16 ++++++++++++++--
45
include/block/raw-aio.h | 10 +-
30
util/aio-win32.c | 17 ++++++++++++++---
46
include/scsi/constants.h | 4 +
31
util/async.c | 16 ++++++++++++----
47
include/sysemu/block-backend.h | 4 +
32
util/fdmon-io_uring.c | 10 ++++++++--
48
block/block-backend.c | 18 ++
33
6 files changed, 80 insertions(+), 30 deletions(-)
49
block/file-posix.c | 98 +++++++++-
50
block/io.c | 97 ++++++++++
51
block/iscsi.c | 314 +++++++++++++++++++++++++++----
52
block/qcow2.c | 229 +++++++++++++++++++---
53
block/raw-format.c | 96 +++++++---
54
qemu-img.c | 50 ++++-
55
util/main-loop.c | 25 ---
56
tests/qemu-iotests/common.filter | 1 -
57
15 files changed, 908 insertions(+), 125 deletions(-)
58
34
59
--
35
--
60
2.17.1
36
2.25.1
61
37
62
diff view generated by jsdifflib
1
Commit d759c951f3287fad04210a52f2dc93f94cf58c7f ("replay: push
1
The io_uring_enter(2) syscall returns with errno=EINTR when interrupted
2
replay_mutex_lock up the call tree") removed the !timeout lock
2
by a signal. Retry the syscall in this case.
3
optimization in the main loop.
4
3
5
The idea of the optimization was to avoid ping-pongs between threads by
4
It's essential to do this in the io_uring_submit_and_wait() case. My
6
keeping the Big QEMU Lock held across non-blocking (!timeout) main loop
5
interpretation of the Linux v5.5 io_uring_enter(2) code is that it
7
iterations.
6
shouldn't affect the io_uring_submit() case, but there is no guarantee
7
this will always be the case. Let's check for -EINTR around both APIs.
8
8
9
A warning is printed when the main loop spins without releasing BQL for
9
Note that the liburing APIs have -errno return values.
10
long periods of time. These warnings were supposed to aid debugging but
11
in practice they just alarm users. They are considered noise because
12
the cause of spinning is not shown and is hard to find.
13
14
Now that the lock optimization has been removed, there is no danger of
15
hogging the BQL. Drop the spin counter and the infamous warning.
16
10
17
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
18
Reviewed-by: Jeff Cody <jcody@redhat.com>
12
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
13
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
14
Message-id: 20200408091139.273851-1-stefanha@redhat.com
15
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
19
---
16
---
20
util/main-loop.c | 25 -------------------------
17
util/fdmon-io_uring.c | 10 ++++++++--
21
tests/qemu-iotests/common.filter | 1 -
18
1 file changed, 8 insertions(+), 2 deletions(-)
22
2 files changed, 26 deletions(-)
23
19
24
diff --git a/util/main-loop.c b/util/main-loop.c
20
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
25
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
26
--- a/util/main-loop.c
22
--- a/util/fdmon-io_uring.c
27
+++ b/util/main-loop.c
23
+++ b/util/fdmon-io_uring.c
28
@@ -XXX,XX +XXX,XX @@ static int os_host_main_loop_wait(int64_t timeout)
24
@@ -XXX,XX +XXX,XX @@ static struct io_uring_sqe *get_sqe(AioContext *ctx)
29
{
25
}
30
GMainContext *context = g_main_context_default();
26
31
int ret;
27
/* No free sqes left, submit pending sqes first */
32
- static int spin_counter;
28
- ret = io_uring_submit(ring);
33
29
+ do {
34
g_main_context_acquire(context);
30
+ ret = io_uring_submit(ring);
35
31
+ } while (ret == -EINTR);
36
glib_pollfds_fill(&timeout);
32
+
37
33
assert(ret > 1);
38
- /* If the I/O thread is very busy or we are incorrectly busy waiting in
34
sqe = io_uring_get_sqe(ring);
39
- * the I/O thread, this can lead to starvation of the BQL such that the
35
assert(sqe);
40
- * VCPU threads never run. To make sure we can detect the later case,
36
@@ -XXX,XX +XXX,XX @@ static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
41
- * print a message to the screen. If we run into this condition, create
37
42
- * a fake timeout in order to give the VCPU threads a chance to run.
38
fill_sq_ring(ctx);
43
- */
39
44
- if (!timeout && (spin_counter > MAX_MAIN_LOOP_SPIN)) {
40
- ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
45
- static bool notified;
41
+ do {
46
-
42
+ ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
47
- if (!notified && !qtest_enabled() && !qtest_driver()) {
43
+ } while (ret == -EINTR);
48
- warn_report("I/O thread spun for %d iterations",
44
+
49
- MAX_MAIN_LOOP_SPIN);
45
assert(ret >= 0);
50
- notified = true;
46
51
- }
47
return process_cq_ring(ctx, ready_list);
52
-
53
- timeout = SCALE_MS;
54
- }
55
-
56
-
57
- if (timeout) {
58
- spin_counter = 0;
59
- } else {
60
- spin_counter++;
61
- }
62
qemu_mutex_unlock_iothread();
63
replay_mutex_unlock();
64
65
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
66
index XXXXXXX..XXXXXXX 100644
67
--- a/tests/qemu-iotests/common.filter
68
+++ b/tests/qemu-iotests/common.filter
69
@@ -XXX,XX +XXX,XX @@ _filter_qemu()
70
{
71
sed -e "s#\\(^\\|(qemu) \\)$(basename $QEMU_PROG):#\1QEMU_PROG:#" \
72
-e 's#^QEMU [0-9]\+\.[0-9]\+\.[0-9]\+ monitor#QEMU X.Y.Z monitor#' \
73
- -e '/main-loop: WARNING: I\/O thread spun for [0-9]\+ iterations/d' \
74
-e $'s#\r##' # QEMU monitor uses \r\n line endings
75
}
76
77
--
48
--
78
2.17.1
49
2.25.1
79
50
80
diff view generated by jsdifflib
1
From: Fam Zheng <famz@redhat.com>
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
2
3
Introduce the bdrv_co_copy_range() API for copy offloading. Block
3
Any thread that is not a iothread returns NULL for qemu_get_current_aio_context().
4
drivers implementing this API support efficient copy operations that
4
As a result, it would also return true for
5
avoid reading each block from the source device and writing it to the
5
in_aio_context_home_thread(qemu_get_aio_context()), causing
6
destination devices. Examples of copy offload primitives are SCSI
6
AIO_WAIT_WHILE to invoke aio_poll() directly. This is incorrect
7
EXTENDED COPY and Linux copy_file_range(2).
7
if the BQL is not held, because aio_poll() does not expect to
8
run concurrently from multiple threads, and it can actually
9
happen when savevm writes to the vmstate file from the
10
migration thread.
8
11
9
Signed-off-by: Fam Zheng <famz@redhat.com>
12
Therefore, restrict in_aio_context_home_thread to return true
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
13
for the main AioContext only if the BQL is held.
11
Message-id: 20180601092648.24614-2-famz@redhat.com
14
15
The function is moved to aio-wait.h because it is mostly used
16
there and to avoid a circular reference between main-loop.h
17
and block/aio.h.
18
19
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
20
Message-Id: <20200407140746.8041-5-pbonzini@redhat.com>
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
21
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
---
22
---
14
include/block/block.h | 32 +++++++++++++
23
include/block/aio-wait.h | 22 ++++++++++++++++++++++
15
include/block/block_int.h | 38 +++++++++++++++
24
include/block/aio.h | 29 ++++++++++-------------------
16
block/io.c | 97 +++++++++++++++++++++++++++++++++++++++
25
2 files changed, 32 insertions(+), 19 deletions(-)
17
3 files changed, 167 insertions(+)
18
26
19
diff --git a/include/block/block.h b/include/block/block.h
27
diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
20
index XXXXXXX..XXXXXXX 100644
28
index XXXXXXX..XXXXXXX 100644
21
--- a/include/block/block.h
29
--- a/include/block/aio-wait.h
22
+++ b/include/block/block.h
30
+++ b/include/block/aio-wait.h
23
@@ -XXX,XX +XXX,XX @@ bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
31
@@ -XXX,XX +XXX,XX @@
32
#define QEMU_AIO_WAIT_H
33
34
#include "block/aio.h"
35
+#include "qemu/main-loop.h"
36
37
/**
38
* AioWait:
39
@@ -XXX,XX +XXX,XX @@ void aio_wait_kick(void);
24
*/
40
*/
25
void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size);
41
void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque);
26
void bdrv_unregister_buf(BlockDriverState *bs, void *host);
42
27
+
28
+/**
43
+/**
44
+ * in_aio_context_home_thread:
45
+ * @ctx: the aio context
29
+ *
46
+ *
30
+ * bdrv_co_copy_range:
47
+ * Return whether we are running in the thread that normally runs @ctx. Note
31
+ *
48
+ * that acquiring/releasing ctx does not affect the outcome, each AioContext
32
+ * Do offloaded copy between two children. If the operation is not implemented
49
+ * still only has one home thread that is responsible for running it.
33
+ * by the driver, or if the backend storage doesn't support it, a negative
50
+ */
34
+ * error code will be returned.
51
+static inline bool in_aio_context_home_thread(AioContext *ctx)
35
+ *
36
+ * Note: block layer doesn't emulate or fallback to a bounce buffer approach
37
+ * because usually the caller shouldn't attempt offloaded copy any more (e.g.
38
+ * calling copy_file_range(2)) after the first error, thus it should fall back
39
+ * to a read+write path in the caller level.
40
+ *
41
+ * @src: Source child to copy data from
42
+ * @src_offset: offset in @src image to read data
43
+ * @dst: Destination child to copy data to
44
+ * @dst_offset: offset in @dst image to write data
45
+ * @bytes: number of bytes to copy
46
+ * @flags: request flags. Must be one of:
47
+ * 0 - actually read data from src;
48
+ * BDRV_REQ_ZERO_WRITE - treat the @src range as zero data and do zero
49
+ * write on @dst as if bdrv_co_pwrite_zeroes is
50
+ * called. Used to simplify caller code, or
51
+ * during BlockDriver.bdrv_co_copy_range_from()
52
+ * recursion.
53
+ *
54
+ * Returns: 0 if succeeded; negative error code if failed.
55
+ **/
56
+int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
57
+ BdrvChild *dst, uint64_t dst_offset,
58
+ uint64_t bytes, BdrvRequestFlags flags);
59
#endif
60
diff --git a/include/block/block_int.h b/include/block/block_int.h
61
index XXXXXXX..XXXXXXX 100644
62
--- a/include/block/block_int.h
63
+++ b/include/block/block_int.h
64
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
65
int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs,
66
int64_t offset, int bytes);
67
68
+ /* Map [offset, offset + nbytes) range onto a child of @bs to copy from,
69
+ * and invoke bdrv_co_copy_range_from(child, ...), or invoke
70
+ * bdrv_co_copy_range_to() if @bs is the leaf child to copy data from.
71
+ *
72
+ * See the comment of bdrv_co_copy_range for the parameter and return value
73
+ * semantics.
74
+ */
75
+ int coroutine_fn (*bdrv_co_copy_range_from)(BlockDriverState *bs,
76
+ BdrvChild *src,
77
+ uint64_t offset,
78
+ BdrvChild *dst,
79
+ uint64_t dst_offset,
80
+ uint64_t bytes,
81
+ BdrvRequestFlags flags);
82
+
83
+ /* Map [offset, offset + nbytes) range onto a child of bs to copy data to,
84
+ * and invoke bdrv_co_copy_range_to(child, src, ...), or perform the copy
85
+ * operation if @bs is the leaf and @src has the same BlockDriver. Return
86
+ * -ENOTSUP if @bs is the leaf but @src has a different BlockDriver.
87
+ *
88
+ * See the comment of bdrv_co_copy_range for the parameter and return value
89
+ * semantics.
90
+ */
91
+ int coroutine_fn (*bdrv_co_copy_range_to)(BlockDriverState *bs,
92
+ BdrvChild *src,
93
+ uint64_t src_offset,
94
+ BdrvChild *dst,
95
+ uint64_t dst_offset,
96
+ uint64_t bytes,
97
+ BdrvRequestFlags flags);
98
+
99
/*
100
* Building block for bdrv_block_status[_above] and
101
* bdrv_is_allocated[_above]. The driver should answer only
102
@@ -XXX,XX +XXX,XX @@ void bdrv_dec_in_flight(BlockDriverState *bs);
103
104
void blockdev_close_all_bdrv_states(void);
105
106
+int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
107
+ BdrvChild *dst, uint64_t dst_offset,
108
+ uint64_t bytes, BdrvRequestFlags flags);
109
+int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
110
+ BdrvChild *dst, uint64_t dst_offset,
111
+ uint64_t bytes, BdrvRequestFlags flags);
112
+
113
#endif /* BLOCK_INT_H */
114
diff --git a/block/io.c b/block/io.c
115
index XXXXXXX..XXXXXXX 100644
116
--- a/block/io.c
117
+++ b/block/io.c
118
@@ -XXX,XX +XXX,XX @@ void bdrv_unregister_buf(BlockDriverState *bs, void *host)
119
bdrv_unregister_buf(child->bs, host);
120
}
121
}
122
+
123
+static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src,
124
+ uint64_t src_offset,
125
+ BdrvChild *dst,
126
+ uint64_t dst_offset,
127
+ uint64_t bytes,
128
+ BdrvRequestFlags flags,
129
+ bool recurse_src)
130
+{
52
+{
131
+ int ret;
53
+ if (ctx == qemu_get_current_aio_context()) {
132
+
54
+ return true;
133
+ if (!src || !dst || !src->bs || !dst->bs) {
134
+ return -ENOMEDIUM;
135
+ }
136
+ ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
137
+ if (ret) {
138
+ return ret;
139
+ }
55
+ }
140
+
56
+
141
+ ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
57
+ if (ctx == qemu_get_aio_context()) {
142
+ if (ret) {
58
+ return qemu_mutex_iothread_locked();
143
+ return ret;
144
+ }
145
+ if (flags & BDRV_REQ_ZERO_WRITE) {
146
+ return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, flags);
147
+ }
148
+
149
+ if (!src->bs->drv->bdrv_co_copy_range_from
150
+ || !dst->bs->drv->bdrv_co_copy_range_to
151
+ || src->bs->encrypted || dst->bs->encrypted) {
152
+ return -ENOTSUP;
153
+ }
154
+ if (recurse_src) {
155
+ return src->bs->drv->bdrv_co_copy_range_from(src->bs,
156
+ src, src_offset,
157
+ dst, dst_offset,
158
+ bytes, flags);
159
+ } else {
59
+ } else {
160
+ return dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
60
+ return false;
161
+ src, src_offset,
162
+ dst, dst_offset,
163
+ bytes, flags);
164
+ }
61
+ }
165
+}
62
+}
166
+
63
+
167
+/* Copy range from @src to @dst.
64
#endif /* QEMU_AIO_WAIT_H */
168
+ *
65
diff --git a/include/block/aio.h b/include/block/aio.h
169
+ * See the comment of bdrv_co_copy_range for the parameter and return value
66
index XXXXXXX..XXXXXXX 100644
170
+ * semantics. */
67
--- a/include/block/aio.h
171
+int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
68
+++ b/include/block/aio.h
172
+ BdrvChild *dst, uint64_t dst_offset,
69
@@ -XXX,XX +XXX,XX @@ struct AioContext {
173
+ uint64_t bytes, BdrvRequestFlags flags)
70
AioHandlerList deleted_aio_handlers;
174
+{
71
175
+ return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
72
/* Used to avoid unnecessary event_notifier_set calls in aio_notify;
176
+ bytes, flags, true);
73
- * accessed with atomic primitives. If this field is 0, everything
177
+}
74
- * (file descriptors, bottom halves, timers) will be re-evaluated
178
+
75
- * before the next blocking poll(), thus the event_notifier_set call
179
+/* Copy range from @src to @dst.
76
- * can be skipped. If it is non-zero, you may need to wake up a
180
+ *
77
- * concurrent aio_poll or the glib main event loop, making
181
+ * See the comment of bdrv_co_copy_range for the parameter and return value
78
- * event_notifier_set necessary.
182
+ * semantics. */
79
+ * only written from the AioContext home thread, or under the BQL in
183
+int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
80
+ * the case of the main AioContext. However, it is read from any
184
+ BdrvChild *dst, uint64_t dst_offset,
81
+ * thread so it is still accessed with atomic primitives.
185
+ uint64_t bytes, BdrvRequestFlags flags)
82
+ *
186
+{
83
+ * If this field is 0, everything (file descriptors, bottom halves,
187
+ return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
84
+ * timers) will be re-evaluated before the next blocking poll() or
188
+ bytes, flags, false);
85
+ * io_uring wait; therefore, the event_notifier_set call can be
189
+}
86
+ * skipped. If it is non-zero, you may need to wake up a concurrent
190
+
87
+ * aio_poll or the glib main event loop, making event_notifier_set
191
+int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
88
+ * necessary.
192
+ BdrvChild *dst, uint64_t dst_offset,
89
*
193
+ uint64_t bytes, BdrvRequestFlags flags)
90
* Bit 0 is reserved for GSource usage of the AioContext, and is 1
194
+{
91
* between a call to aio_ctx_prepare and the next call to aio_ctx_check.
195
+ BdrvTrackedRequest src_req, dst_req;
92
@@ -XXX,XX +XXX,XX @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co);
196
+ BlockDriverState *src_bs = src->bs;
93
*/
197
+ BlockDriverState *dst_bs = dst->bs;
94
AioContext *qemu_get_current_aio_context(void);
198
+ int ret;
95
199
+
96
-/**
200
+ bdrv_inc_in_flight(src_bs);
97
- * in_aio_context_home_thread:
201
+ bdrv_inc_in_flight(dst_bs);
98
- * @ctx: the aio context
202
+ tracked_request_begin(&src_req, src_bs, src_offset,
99
- *
203
+ bytes, BDRV_TRACKED_READ);
100
- * Return whether we are running in the thread that normally runs @ctx. Note
204
+ tracked_request_begin(&dst_req, dst_bs, dst_offset,
101
- * that acquiring/releasing ctx does not affect the outcome, each AioContext
205
+ bytes, BDRV_TRACKED_WRITE);
102
- * still only has one home thread that is responsible for running it.
206
+
103
- */
207
+ wait_serialising_requests(&src_req);
104
-static inline bool in_aio_context_home_thread(AioContext *ctx)
208
+ wait_serialising_requests(&dst_req);
105
-{
209
+ ret = bdrv_co_copy_range_from(src, src_offset,
106
- return ctx == qemu_get_current_aio_context();
210
+ dst, dst_offset,
107
-}
211
+ bytes, flags);
108
-
212
+
109
/**
213
+ tracked_request_end(&src_req);
110
* aio_context_setup:
214
+ tracked_request_end(&dst_req);
111
* @ctx: the aio context
215
+ bdrv_dec_in_flight(src_bs);
216
+ bdrv_dec_in_flight(dst_bs);
217
+ return ret;
218
+}
219
--
112
--
220
2.17.1
113
2.25.1
221
114
222
diff view generated by jsdifflib
Deleted patch
1
From: Fam Zheng <famz@redhat.com>
2
1
3
We don't verify the request range against s->size in the I/O callbacks
4
except for raw_co_pwritev. This is inconsistent (especially for
5
raw_co_pwrite_zeroes and raw_co_pdiscard), so fix them, in the meanwhile
6
make the helper reusable by the coming new callbacks.
7
8
Note that in most cases the block layer already verifies the request
9
byte range against our reported image length, before invoking the driver
10
callbacks. The exception is during image creating, after
11
blk_set_allow_write_beyond_eof(blk, true) is called. But in that case,
12
the requests are not directly from the user or guest. So there is no
13
visible behavior change in adding the check code.
14
15
The int64_t -> uint64_t inconsistency, as shown by the type casting, is
16
pre-existing due to the interface.
17
18
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
19
Reviewed-by: Eric Blake <eblake@redhat.com>
20
Signed-off-by: Fam Zheng <famz@redhat.com>
21
Message-id: 20180601092648.24614-3-famz@redhat.com
22
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
23
---
24
block/raw-format.c | 64 ++++++++++++++++++++++++++++------------------
25
1 file changed, 39 insertions(+), 25 deletions(-)
26
27
diff --git a/block/raw-format.c b/block/raw-format.c
28
index XXXXXXX..XXXXXXX 100644
29
--- a/block/raw-format.c
30
+++ b/block/raw-format.c
31
@@ -XXX,XX +XXX,XX @@ static void raw_reopen_abort(BDRVReopenState *state)
32
state->opaque = NULL;
33
}
34
35
+/* Check and adjust the offset, against 'offset' and 'size' options. */
36
+static inline int raw_adjust_offset(BlockDriverState *bs, uint64_t *offset,
37
+ uint64_t bytes, bool is_write)
38
+{
39
+ BDRVRawState *s = bs->opaque;
40
+
41
+ if (s->has_size && (*offset > s->size || bytes > (s->size - *offset))) {
42
+ /* There's not enough space for the write, or the read request is
43
+ * out-of-range. Don't read/write anything to prevent leaking out of
44
+ * the size specified in options. */
45
+ return is_write ? -ENOSPC : -EINVAL;;
46
+ }
47
+
48
+ if (*offset > INT64_MAX - s->offset) {
49
+ return -EINVAL;
50
+ }
51
+ *offset += s->offset;
52
+
53
+ return 0;
54
+}
55
+
56
static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
57
uint64_t bytes, QEMUIOVector *qiov,
58
int flags)
59
{
60
- BDRVRawState *s = bs->opaque;
61
+ int ret;
62
63
- if (offset > UINT64_MAX - s->offset) {
64
- return -EINVAL;
65
+ ret = raw_adjust_offset(bs, &offset, bytes, false);
66
+ if (ret) {
67
+ return ret;
68
}
69
- offset += s->offset;
70
71
BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
72
return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
73
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
74
uint64_t bytes, QEMUIOVector *qiov,
75
int flags)
76
{
77
- BDRVRawState *s = bs->opaque;
78
void *buf = NULL;
79
BlockDriver *drv;
80
QEMUIOVector local_qiov;
81
int ret;
82
83
- if (s->has_size && (offset > s->size || bytes > (s->size - offset))) {
84
- /* There's not enough space for the data. Don't write anything and just
85
- * fail to prevent leaking out of the size specified in options. */
86
- return -ENOSPC;
87
- }
88
-
89
- if (offset > UINT64_MAX - s->offset) {
90
- ret = -EINVAL;
91
- goto fail;
92
- }
93
-
94
if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) {
95
/* Handling partial writes would be a pain - so we just
96
* require that guests have 512-byte request alignment if
97
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
98
qiov = &local_qiov;
99
}
100
101
- offset += s->offset;
102
+ ret = raw_adjust_offset(bs, &offset, bytes, true);
103
+ if (ret) {
104
+ goto fail;
105
+ }
106
107
BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
108
ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
109
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
110
int64_t offset, int bytes,
111
BdrvRequestFlags flags)
112
{
113
- BDRVRawState *s = bs->opaque;
114
- if (offset > UINT64_MAX - s->offset) {
115
- return -EINVAL;
116
+ int ret;
117
+
118
+ ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true);
119
+ if (ret) {
120
+ return ret;
121
}
122
- offset += s->offset;
123
return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
124
}
125
126
static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs,
127
int64_t offset, int bytes)
128
{
129
- BDRVRawState *s = bs->opaque;
130
- if (offset > UINT64_MAX - s->offset) {
131
- return -EINVAL;
132
+ int ret;
133
+
134
+ ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true);
135
+ if (ret) {
136
+ return ret;
137
}
138
- offset += s->offset;
139
return bdrv_co_pdiscard(bs->file->bs, offset, bytes);
140
}
141
142
--
143
2.17.1
144
145
diff view generated by jsdifflib
Deleted patch
1
From: Fam Zheng <famz@redhat.com>
2
1
3
Just pass down to ->file.
4
5
Signed-off-by: Fam Zheng <famz@redhat.com>
6
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
7
Message-id: 20180601092648.24614-4-famz@redhat.com
8
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
9
---
10
block/raw-format.c | 32 ++++++++++++++++++++++++++++++++
11
1 file changed, 32 insertions(+)
12
13
diff --git a/block/raw-format.c b/block/raw-format.c
14
index XXXXXXX..XXXXXXX 100644
15
--- a/block/raw-format.c
16
+++ b/block/raw-format.c
17
@@ -XXX,XX +XXX,XX @@ static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
18
return bdrv_probe_geometry(bs->file->bs, geo);
19
}
20
21
+static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs,
22
+ BdrvChild *src, uint64_t src_offset,
23
+ BdrvChild *dst, uint64_t dst_offset,
24
+ uint64_t bytes, BdrvRequestFlags flags)
25
+{
26
+ int ret;
27
+
28
+ ret = raw_adjust_offset(bs, &src_offset, bytes, false);
29
+ if (ret) {
30
+ return ret;
31
+ }
32
+ return bdrv_co_copy_range_from(bs->file, src_offset, dst, dst_offset,
33
+ bytes, flags);
34
+}
35
+
36
+static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
37
+ BdrvChild *src, uint64_t src_offset,
38
+ BdrvChild *dst, uint64_t dst_offset,
39
+ uint64_t bytes, BdrvRequestFlags flags)
40
+{
41
+ int ret;
42
+
43
+ ret = raw_adjust_offset(bs, &dst_offset, bytes, true);
44
+ if (ret) {
45
+ return ret;
46
+ }
47
+ return bdrv_co_copy_range_to(src, src_offset, bs->file, dst_offset, bytes,
48
+ flags);
49
+}
50
+
51
BlockDriver bdrv_raw = {
52
.format_name = "raw",
53
.instance_size = sizeof(BDRVRawState),
54
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_raw = {
55
.bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
56
.bdrv_co_pdiscard = &raw_co_pdiscard,
57
.bdrv_co_block_status = &raw_co_block_status,
58
+ .bdrv_co_copy_range_from = &raw_co_copy_range_from,
59
+ .bdrv_co_copy_range_to = &raw_co_copy_range_to,
60
.bdrv_truncate = &raw_truncate,
61
.bdrv_getlength = &raw_getlength,
62
.has_variable_length = true,
63
--
64
2.17.1
65
66
diff view generated by jsdifflib
Deleted patch
1
From: Fam Zheng <famz@redhat.com>
2
1
3
The two callbacks are implemented quite similarly to the read/write
4
functions: bdrv_co_copy_range_from maps for read and calls into bs->file
5
or bs->backing depending on the allocation status; bdrv_co_copy_range_to
6
maps for write and calls into bs->file.
7
8
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Signed-off-by: Fam Zheng <famz@redhat.com>
10
Message-id: 20180601092648.24614-5-famz@redhat.com
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
---
13
block/qcow2.c | 229 +++++++++++++++++++++++++++++++++++++++++++-------
14
1 file changed, 199 insertions(+), 30 deletions(-)
15
16
diff --git a/block/qcow2.c b/block/qcow2.c
17
index XXXXXXX..XXXXXXX 100644
18
--- a/block/qcow2.c
19
+++ b/block/qcow2.c
20
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
21
return status;
22
}
23
24
+static coroutine_fn int qcow2_handle_l2meta(BlockDriverState *bs,
25
+ QCowL2Meta **pl2meta,
26
+ bool link_l2)
27
+{
28
+ int ret = 0;
29
+ QCowL2Meta *l2meta = *pl2meta;
30
+
31
+ while (l2meta != NULL) {
32
+ QCowL2Meta *next;
33
+
34
+ if (!ret && link_l2) {
35
+ ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
36
+ if (ret) {
37
+ goto out;
38
+ }
39
+ }
40
+
41
+ /* Take the request off the list of running requests */
42
+ if (l2meta->nb_clusters != 0) {
43
+ QLIST_REMOVE(l2meta, next_in_flight);
44
+ }
45
+
46
+ qemu_co_queue_restart_all(&l2meta->dependent_requests);
47
+
48
+ next = l2meta->next;
49
+ g_free(l2meta);
50
+ l2meta = next;
51
+ }
52
+out:
53
+ *pl2meta = l2meta;
54
+ return ret;
55
+}
56
+
57
static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
58
uint64_t bytes, QEMUIOVector *qiov,
59
int flags)
60
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
61
}
62
}
63
64
- while (l2meta != NULL) {
65
- QCowL2Meta *next;
66
-
67
- ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
68
- if (ret < 0) {
69
- goto fail;
70
- }
71
-
72
- /* Take the request off the list of running requests */
73
- if (l2meta->nb_clusters != 0) {
74
- QLIST_REMOVE(l2meta, next_in_flight);
75
- }
76
-
77
- qemu_co_queue_restart_all(&l2meta->dependent_requests);
78
-
79
- next = l2meta->next;
80
- g_free(l2meta);
81
- l2meta = next;
82
+ ret = qcow2_handle_l2meta(bs, &l2meta, true);
83
+ if (ret) {
84
+ goto fail;
85
}
86
87
bytes -= cur_bytes;
88
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
89
ret = 0;
90
91
fail:
92
- while (l2meta != NULL) {
93
- QCowL2Meta *next;
94
-
95
- if (l2meta->nb_clusters != 0) {
96
- QLIST_REMOVE(l2meta, next_in_flight);
97
- }
98
- qemu_co_queue_restart_all(&l2meta->dependent_requests);
99
-
100
- next = l2meta->next;
101
- g_free(l2meta);
102
- l2meta = next;
103
- }
104
+ qcow2_handle_l2meta(bs, &l2meta, false);
105
106
qemu_co_mutex_unlock(&s->lock);
107
108
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
109
return ret;
110
}
111
112
+static int coroutine_fn
113
+qcow2_co_copy_range_from(BlockDriverState *bs,
114
+ BdrvChild *src, uint64_t src_offset,
115
+ BdrvChild *dst, uint64_t dst_offset,
116
+ uint64_t bytes, BdrvRequestFlags flags)
117
+{
118
+ BDRVQcow2State *s = bs->opaque;
119
+ int ret;
120
+ unsigned int cur_bytes; /* number of bytes in current iteration */
121
+ BdrvChild *child = NULL;
122
+ BdrvRequestFlags cur_flags;
123
+
124
+ assert(!bs->encrypted);
125
+ qemu_co_mutex_lock(&s->lock);
126
+
127
+ while (bytes != 0) {
128
+ uint64_t copy_offset = 0;
129
+ /* prepare next request */
130
+ cur_bytes = MIN(bytes, INT_MAX);
131
+ cur_flags = flags;
132
+
133
+ ret = qcow2_get_cluster_offset(bs, src_offset, &cur_bytes, &copy_offset);
134
+ if (ret < 0) {
135
+ goto out;
136
+ }
137
+
138
+ switch (ret) {
139
+ case QCOW2_CLUSTER_UNALLOCATED:
140
+ if (bs->backing && bs->backing->bs) {
141
+ int64_t backing_length = bdrv_getlength(bs->backing->bs);
142
+ if (src_offset >= backing_length) {
143
+ cur_flags |= BDRV_REQ_ZERO_WRITE;
144
+ } else {
145
+ child = bs->backing;
146
+ cur_bytes = MIN(cur_bytes, backing_length - src_offset);
147
+ copy_offset = src_offset;
148
+ }
149
+ } else {
150
+ cur_flags |= BDRV_REQ_ZERO_WRITE;
151
+ }
152
+ break;
153
+
154
+ case QCOW2_CLUSTER_ZERO_PLAIN:
155
+ case QCOW2_CLUSTER_ZERO_ALLOC:
156
+ cur_flags |= BDRV_REQ_ZERO_WRITE;
157
+ break;
158
+
159
+ case QCOW2_CLUSTER_COMPRESSED:
160
+ ret = -ENOTSUP;
161
+ goto out;
162
+ break;
163
+
164
+ case QCOW2_CLUSTER_NORMAL:
165
+ child = bs->file;
166
+ copy_offset += offset_into_cluster(s, src_offset);
167
+ if ((copy_offset & 511) != 0) {
168
+ ret = -EIO;
169
+ goto out;
170
+ }
171
+ break;
172
+
173
+ default:
174
+ abort();
175
+ }
176
+ qemu_co_mutex_unlock(&s->lock);
177
+ ret = bdrv_co_copy_range_from(child,
178
+ copy_offset,
179
+ dst, dst_offset,
180
+ cur_bytes, cur_flags);
181
+ qemu_co_mutex_lock(&s->lock);
182
+ if (ret < 0) {
183
+ goto out;
184
+ }
185
+
186
+ bytes -= cur_bytes;
187
+ src_offset += cur_bytes;
188
+ dst_offset += cur_bytes;
189
+ }
190
+ ret = 0;
191
+
192
+out:
193
+ qemu_co_mutex_unlock(&s->lock);
194
+ return ret;
195
+}
196
+
197
+static int coroutine_fn
198
+qcow2_co_copy_range_to(BlockDriverState *bs,
199
+ BdrvChild *src, uint64_t src_offset,
200
+ BdrvChild *dst, uint64_t dst_offset,
201
+ uint64_t bytes, BdrvRequestFlags flags)
202
+{
203
+ BDRVQcow2State *s = bs->opaque;
204
+ int offset_in_cluster;
205
+ int ret;
206
+ unsigned int cur_bytes; /* number of sectors in current iteration */
207
+ uint64_t cluster_offset;
208
+ uint8_t *cluster_data = NULL;
209
+ QCowL2Meta *l2meta = NULL;
210
+
211
+ assert(!bs->encrypted);
212
+ s->cluster_cache_offset = -1; /* disable compressed cache */
213
+
214
+ qemu_co_mutex_lock(&s->lock);
215
+
216
+ while (bytes != 0) {
217
+
218
+ l2meta = NULL;
219
+
220
+ offset_in_cluster = offset_into_cluster(s, dst_offset);
221
+ cur_bytes = MIN(bytes, INT_MAX);
222
+
223
+ /* TODO:
224
+ * If src->bs == dst->bs, we could simply copy by incrementing
225
+ * the refcnt, without copying user data.
226
+ * Or if src->bs == dst->bs->backing->bs, we could copy by discarding. */
227
+ ret = qcow2_alloc_cluster_offset(bs, dst_offset, &cur_bytes,
228
+ &cluster_offset, &l2meta);
229
+ if (ret < 0) {
230
+ goto fail;
231
+ }
232
+
233
+ assert((cluster_offset & 511) == 0);
234
+
235
+ ret = qcow2_pre_write_overlap_check(bs, 0,
236
+ cluster_offset + offset_in_cluster, cur_bytes);
237
+ if (ret < 0) {
238
+ goto fail;
239
+ }
240
+
241
+ qemu_co_mutex_unlock(&s->lock);
242
+ ret = bdrv_co_copy_range_to(src, src_offset,
243
+ bs->file,
244
+ cluster_offset + offset_in_cluster,
245
+ cur_bytes, flags);
246
+ qemu_co_mutex_lock(&s->lock);
247
+ if (ret < 0) {
248
+ goto fail;
249
+ }
250
+
251
+ ret = qcow2_handle_l2meta(bs, &l2meta, true);
252
+ if (ret) {
253
+ goto fail;
254
+ }
255
+
256
+ bytes -= cur_bytes;
257
+ dst_offset += cur_bytes;
258
+ }
259
+ ret = 0;
260
+
261
+fail:
262
+ qcow2_handle_l2meta(bs, &l2meta, false);
263
+
264
+ qemu_co_mutex_unlock(&s->lock);
265
+
266
+ qemu_vfree(cluster_data);
267
+ trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
268
+
269
+ return ret;
270
+}
271
+
272
static int qcow2_truncate(BlockDriverState *bs, int64_t offset,
273
PreallocMode prealloc, Error **errp)
274
{
275
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_qcow2 = {
276
277
.bdrv_co_pwrite_zeroes = qcow2_co_pwrite_zeroes,
278
.bdrv_co_pdiscard = qcow2_co_pdiscard,
279
+ .bdrv_co_copy_range_from = qcow2_co_copy_range_from,
280
+ .bdrv_co_copy_range_to = qcow2_co_copy_range_to,
281
.bdrv_truncate = qcow2_truncate,
282
.bdrv_co_pwritev_compressed = qcow2_co_pwritev_compressed,
283
.bdrv_make_empty = qcow2_make_empty,
284
--
285
2.17.1
286
287
diff view generated by jsdifflib
Deleted patch
1
From: Fam Zheng <famz@redhat.com>
2
1
3
With copy_file_range(2), we can implement the bdrv_co_copy_range
4
semantics.
5
6
Signed-off-by: Fam Zheng <famz@redhat.com>
7
Message-id: 20180601092648.24614-6-famz@redhat.com
8
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
9
---
10
configure | 17 +++++++
11
include/block/raw-aio.h | 10 ++++-
12
block/file-posix.c | 98 +++++++++++++++++++++++++++++++++++++++--
13
3 files changed, 120 insertions(+), 5 deletions(-)
14
15
diff --git a/configure b/configure
16
index XXXXXXX..XXXXXXX 100755
17
--- a/configure
18
+++ b/configure
19
@@ -XXX,XX +XXX,XX @@ if test "$fortify_source" != "no"; then
20
fi
21
fi
22
23
+###############################################
24
+# Check if copy_file_range is provided by glibc
25
+have_copy_file_range=no
26
+cat > $TMPC << EOF
27
+#include <unistd.h>
28
+int main(void) {
29
+ copy_file_range(0, NULL, 0, NULL, 0, 0);
30
+ return 0;
31
+}
32
+EOF
33
+if compile_prog "" "" ; then
34
+ have_copy_file_range=yes
35
+fi
36
+
37
##########################################
38
# check if struct fsxattr is available via linux/fs.h
39
40
@@ -XXX,XX +XXX,XX @@ fi
41
if test "$have_fsxattr" = "yes" ; then
42
echo "HAVE_FSXATTR=y" >> $config_host_mak
43
fi
44
+if test "$have_copy_file_range" = "yes" ; then
45
+ echo "HAVE_COPY_FILE_RANGE=y" >> $config_host_mak
46
+fi
47
if test "$vte" = "yes" ; then
48
echo "CONFIG_VTE=y" >> $config_host_mak
49
echo "VTE_CFLAGS=$vte_cflags" >> $config_host_mak
50
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
51
index XXXXXXX..XXXXXXX 100644
52
--- a/include/block/raw-aio.h
53
+++ b/include/block/raw-aio.h
54
@@ -XXX,XX +XXX,XX @@
55
#define QEMU_AIO_FLUSH 0x0008
56
#define QEMU_AIO_DISCARD 0x0010
57
#define QEMU_AIO_WRITE_ZEROES 0x0020
58
+#define QEMU_AIO_COPY_RANGE 0x0040
59
#define QEMU_AIO_TYPE_MASK \
60
- (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \
61
- QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES)
62
+ (QEMU_AIO_READ | \
63
+ QEMU_AIO_WRITE | \
64
+ QEMU_AIO_IOCTL | \
65
+ QEMU_AIO_FLUSH | \
66
+ QEMU_AIO_DISCARD | \
67
+ QEMU_AIO_WRITE_ZEROES | \
68
+ QEMU_AIO_COPY_RANGE)
69
70
/* AIO flags */
71
#define QEMU_AIO_MISALIGNED 0x1000
72
diff --git a/block/file-posix.c b/block/file-posix.c
73
index XXXXXXX..XXXXXXX 100644
74
--- a/block/file-posix.c
75
+++ b/block/file-posix.c
76
@@ -XXX,XX +XXX,XX @@
77
#ifdef __linux__
78
#include <sys/ioctl.h>
79
#include <sys/param.h>
80
+#include <sys/syscall.h>
81
#include <linux/cdrom.h>
82
#include <linux/fd.h>
83
#include <linux/fs.h>
84
@@ -XXX,XX +XXX,XX @@ typedef struct RawPosixAIOData {
85
#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */
86
off_t aio_offset;
87
int aio_type;
88
+ int aio_fd2;
89
+ off_t aio_offset2;
90
} RawPosixAIOData;
91
92
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
93
@@ -XXX,XX +XXX,XX @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
94
return -ENOTSUP;
95
}
96
97
+#ifndef HAVE_COPY_FILE_RANGE
98
+static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
99
+ off_t *out_off, size_t len, unsigned int flags)
100
+{
101
+#ifdef __NR_copy_file_range
102
+ return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
103
+ out_off, len, flags);
104
+#else
105
+ errno = ENOSYS;
106
+ return -1;
107
+#endif
108
+}
109
+#endif
110
+
111
+static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb)
112
+{
113
+ uint64_t bytes = aiocb->aio_nbytes;
114
+ off_t in_off = aiocb->aio_offset;
115
+ off_t out_off = aiocb->aio_offset2;
116
+
117
+ while (bytes) {
118
+ ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
119
+ aiocb->aio_fd2, &out_off,
120
+ bytes, 0);
121
+ if (ret == -EINTR) {
122
+ continue;
123
+ }
124
+ if (ret < 0) {
125
+ if (errno == ENOSYS) {
126
+ return -ENOTSUP;
127
+ } else {
128
+ return -errno;
129
+ }
130
+ }
131
+ if (!ret) {
132
+ /* No progress (e.g. when beyond EOF), fall back to buffer I/O. */
133
+ return -ENOTSUP;
134
+ }
135
+ bytes -= ret;
136
+ }
137
+ return 0;
138
+}
139
+
140
static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
141
{
142
int ret = -EOPNOTSUPP;
143
@@ -XXX,XX +XXX,XX @@ static int aio_worker(void *arg)
144
case QEMU_AIO_WRITE_ZEROES:
145
ret = handle_aiocb_write_zeroes(aiocb);
146
break;
147
+ case QEMU_AIO_COPY_RANGE:
148
+ ret = handle_aiocb_copy_range(aiocb);
149
+ break;
150
default:
151
fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
152
ret = -EINVAL;
153
@@ -XXX,XX +XXX,XX @@ static int aio_worker(void *arg)
154
return ret;
155
}
156
157
-static int paio_submit_co(BlockDriverState *bs, int fd,
158
- int64_t offset, QEMUIOVector *qiov,
159
- int bytes, int type)
160
+static int paio_submit_co_full(BlockDriverState *bs, int fd,
161
+ int64_t offset, int fd2, int64_t offset2,
162
+ QEMUIOVector *qiov,
163
+ int bytes, int type)
164
{
165
RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
166
ThreadPool *pool;
167
@@ -XXX,XX +XXX,XX @@ static int paio_submit_co(BlockDriverState *bs, int fd,
168
acb->bs = bs;
169
acb->aio_type = type;
170
acb->aio_fildes = fd;
171
+ acb->aio_fd2 = fd2;
172
+ acb->aio_offset2 = offset2;
173
174
acb->aio_nbytes = bytes;
175
acb->aio_offset = offset;
176
@@ -XXX,XX +XXX,XX @@ static int paio_submit_co(BlockDriverState *bs, int fd,
177
return thread_pool_submit_co(pool, aio_worker, acb);
178
}
179
180
+static inline int paio_submit_co(BlockDriverState *bs, int fd,
181
+ int64_t offset, QEMUIOVector *qiov,
182
+ int bytes, int type)
183
+{
184
+ return paio_submit_co_full(bs, fd, offset, -1, 0, qiov, bytes, type);
185
+}
186
+
187
static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
188
int64_t offset, QEMUIOVector *qiov, int bytes,
189
BlockCompletionFunc *cb, void *opaque, int type)
190
@@ -XXX,XX +XXX,XX @@ static void raw_abort_perm_update(BlockDriverState *bs)
191
raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
192
}
193
194
+static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs,
195
+ BdrvChild *src, uint64_t src_offset,
196
+ BdrvChild *dst, uint64_t dst_offset,
197
+ uint64_t bytes, BdrvRequestFlags flags)
198
+{
199
+ return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, flags);
200
+}
201
+
202
+static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
203
+ BdrvChild *src, uint64_t src_offset,
204
+ BdrvChild *dst, uint64_t dst_offset,
205
+ uint64_t bytes, BdrvRequestFlags flags)
206
+{
207
+ BDRVRawState *s = bs->opaque;
208
+ BDRVRawState *src_s;
209
+
210
+ assert(dst->bs == bs);
211
+ if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
212
+ return -ENOTSUP;
213
+ }
214
+
215
+ src_s = src->bs->opaque;
216
+ if (fd_open(bs) < 0 || fd_open(bs) < 0) {
217
+ return -EIO;
218
+ }
219
+ return paio_submit_co_full(bs, src_s->fd, src_offset, s->fd, dst_offset,
220
+ NULL, bytes, QEMU_AIO_COPY_RANGE);
221
+}
222
+
223
BlockDriver bdrv_file = {
224
.format_name = "file",
225
.protocol_name = "file",
226
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_file = {
227
.bdrv_co_pwritev = raw_co_pwritev,
228
.bdrv_aio_flush = raw_aio_flush,
229
.bdrv_aio_pdiscard = raw_aio_pdiscard,
230
+ .bdrv_co_copy_range_from = raw_co_copy_range_from,
231
+ .bdrv_co_copy_range_to = raw_co_copy_range_to,
232
.bdrv_refresh_limits = raw_refresh_limits,
233
.bdrv_io_plug = raw_aio_plug,
234
.bdrv_io_unplug = raw_aio_unplug,
235
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = {
236
.bdrv_co_pwritev = raw_co_pwritev,
237
.bdrv_aio_flush    = raw_aio_flush,
238
.bdrv_aio_pdiscard = hdev_aio_pdiscard,
239
+ .bdrv_co_copy_range_from = raw_co_copy_range_from,
240
+ .bdrv_co_copy_range_to = raw_co_copy_range_to,
241
.bdrv_refresh_limits = raw_refresh_limits,
242
.bdrv_io_plug = raw_aio_plug,
243
.bdrv_io_unplug = raw_aio_unplug,
244
--
245
2.17.1
246
247
diff view generated by jsdifflib
Deleted patch
1
From: Fam Zheng <famz@redhat.com>
2
1
3
The device designator data returned in INQUIRY command will be useful to
4
fill in source/target fields during copy offloading. Do this when
5
connecting to the target and save the data for later use.
6
7
Signed-off-by: Fam Zheng <famz@redhat.com>
8
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Message-id: 20180601092648.24614-7-famz@redhat.com
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
---
12
block/iscsi.c | 41 +++++++++++++++++++++++++++++++++++++++++
13
1 file changed, 41 insertions(+)
14
15
diff --git a/block/iscsi.c b/block/iscsi.c
16
index XXXXXXX..XXXXXXX 100644
17
--- a/block/iscsi.c
18
+++ b/block/iscsi.c
19
@@ -XXX,XX +XXX,XX @@ typedef struct IscsiLun {
20
QemuMutex mutex;
21
struct scsi_inquiry_logical_block_provisioning lbp;
22
struct scsi_inquiry_block_limits bl;
23
+ struct scsi_inquiry_device_designator *dd;
24
unsigned char *zeroblock;
25
/* The allocmap tracks which clusters (pages) on the iSCSI target are
26
* allocated and which are not. In case a target returns zeros for
27
@@ -XXX,XX +XXX,XX @@ static QemuOptsList runtime_opts = {
28
},
29
};
30
31
+static void iscsi_save_designator(IscsiLun *lun,
32
+ struct scsi_inquiry_device_identification *inq_di)
33
+{
34
+ struct scsi_inquiry_device_designator *desig, *copy = NULL;
35
+
36
+ for (desig = inq_di->designators; desig; desig = desig->next) {
37
+ if (desig->association ||
38
+ desig->designator_type > SCSI_DESIGNATOR_TYPE_NAA) {
39
+ continue;
40
+ }
41
+ /* NAA works better than T10 vendor ID based designator. */
42
+ if (!copy || copy->designator_type < desig->designator_type) {
43
+ copy = desig;
44
+ }
45
+ }
46
+ if (copy) {
47
+ lun->dd = g_new(struct scsi_inquiry_device_designator, 1);
48
+ *lun->dd = *copy;
49
+ lun->dd->next = NULL;
50
+ lun->dd->designator = g_malloc(copy->designator_length);
51
+ memcpy(lun->dd->designator, copy->designator, copy->designator_length);
52
+ }
53
+}
54
+
55
static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
56
Error **errp)
57
{
58
@@ -XXX,XX +XXX,XX @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
59
struct scsi_task *inq_task;
60
struct scsi_inquiry_logical_block_provisioning *inq_lbp;
61
struct scsi_inquiry_block_limits *inq_bl;
62
+ struct scsi_inquiry_device_identification *inq_di;
63
switch (inq_vpd->pages[i]) {
64
case SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING:
65
inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1,
66
@@ -XXX,XX +XXX,XX @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
67
sizeof(struct scsi_inquiry_block_limits));
68
scsi_free_scsi_task(inq_task);
69
break;
70
+ case SCSI_INQUIRY_PAGECODE_DEVICE_IDENTIFICATION:
71
+ inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1,
72
+ SCSI_INQUIRY_PAGECODE_DEVICE_IDENTIFICATION,
73
+ (void **) &inq_di, errp);
74
+ if (inq_task == NULL) {
75
+ ret = -EINVAL;
76
+ goto out;
77
+ }
78
+ iscsi_save_designator(iscsilun, inq_di);
79
+ scsi_free_scsi_task(inq_task);
80
+ break;
81
default:
82
break;
83
}
84
@@ -XXX,XX +XXX,XX @@ static void iscsi_close(BlockDriverState *bs)
85
iscsi_logout_sync(iscsi);
86
}
87
iscsi_destroy_context(iscsi);
88
+ if (iscsilun->dd) {
89
+ g_free(iscsilun->dd->designator);
90
+ g_free(iscsilun->dd);
91
+ }
92
g_free(iscsilun->zeroblock);
93
iscsi_allocmap_free(iscsilun);
94
qemu_mutex_destroy(&iscsilun->mutex);
95
--
96
2.17.1
97
98
diff view generated by jsdifflib
Deleted patch
1
From: Fam Zheng <famz@redhat.com>
2
1
3
This loop is repeated a growing number times. Make a helper.
4
5
Signed-off-by: Fam Zheng <famz@redhat.com>
6
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
7
Reviewed-by: Eric Blake <eblake@redhat.com>
8
Message-id: 20180601092648.24614-8-famz@redhat.com
9
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
10
---
11
block/iscsi.c | 54 ++++++++++++++++-----------------------------------
12
1 file changed, 17 insertions(+), 37 deletions(-)
13
14
diff --git a/block/iscsi.c b/block/iscsi.c
15
index XXXXXXX..XXXXXXX 100644
16
--- a/block/iscsi.c
17
+++ b/block/iscsi.c
18
@@ -XXX,XX +XXX,XX @@ static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun,
19
offset / iscsilun->cluster_size) == size);
20
}
21
22
+static void coroutine_fn iscsi_co_wait_for_task(IscsiTask *iTask,
23
+ IscsiLun *iscsilun)
24
+{
25
+ while (!iTask->complete) {
26
+ iscsi_set_events(iscsilun);
27
+ qemu_mutex_unlock(&iscsilun->mutex);
28
+ qemu_coroutine_yield();
29
+ qemu_mutex_lock(&iscsilun->mutex);
30
+ }
31
+}
32
+
33
static int coroutine_fn
34
iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
35
QEMUIOVector *iov, int flags)
36
@@ -XXX,XX +XXX,XX @@ retry:
37
scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov,
38
iov->niov);
39
#endif
40
- while (!iTask.complete) {
41
- iscsi_set_events(iscsilun);
42
- qemu_mutex_unlock(&iscsilun->mutex);
43
- qemu_coroutine_yield();
44
- qemu_mutex_lock(&iscsilun->mutex);
45
- }
46
+ iscsi_co_wait_for_task(&iTask, iscsilun);
47
48
if (iTask.task != NULL) {
49
scsi_free_scsi_task(iTask.task);
50
@@ -XXX,XX +XXX,XX @@ retry:
51
ret = -ENOMEM;
52
goto out_unlock;
53
}
54
-
55
- while (!iTask.complete) {
56
- iscsi_set_events(iscsilun);
57
- qemu_mutex_unlock(&iscsilun->mutex);
58
- qemu_coroutine_yield();
59
- qemu_mutex_lock(&iscsilun->mutex);
60
- }
61
+ iscsi_co_wait_for_task(&iTask, iscsilun);
62
63
if (iTask.do_retry) {
64
if (iTask.task != NULL) {
65
@@ -XXX,XX +XXX,XX @@ retry:
66
#if LIBISCSI_API_VERSION < (20160603)
67
scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov);
68
#endif
69
- while (!iTask.complete) {
70
- iscsi_set_events(iscsilun);
71
- qemu_mutex_unlock(&iscsilun->mutex);
72
- qemu_coroutine_yield();
73
- qemu_mutex_lock(&iscsilun->mutex);
74
- }
75
76
+ iscsi_co_wait_for_task(&iTask, iscsilun);
77
if (iTask.task != NULL) {
78
scsi_free_scsi_task(iTask.task);
79
iTask.task = NULL;
80
@@ -XXX,XX +XXX,XX @@ retry:
81
return -ENOMEM;
82
}
83
84
- while (!iTask.complete) {
85
- iscsi_set_events(iscsilun);
86
- qemu_mutex_unlock(&iscsilun->mutex);
87
- qemu_coroutine_yield();
88
- qemu_mutex_lock(&iscsilun->mutex);
89
- }
90
+ iscsi_co_wait_for_task(&iTask, iscsilun);
91
92
if (iTask.task != NULL) {
93
scsi_free_scsi_task(iTask.task);
94
@@ -XXX,XX +XXX,XX @@ retry:
95
goto out_unlock;
96
}
97
98
- while (!iTask.complete) {
99
- iscsi_set_events(iscsilun);
100
- qemu_mutex_unlock(&iscsilun->mutex);
101
- qemu_coroutine_yield();
102
- qemu_mutex_lock(&iscsilun->mutex);
103
- }
104
+ iscsi_co_wait_for_task(&iTask, iscsilun);
105
106
if (iTask.task != NULL) {
107
scsi_free_scsi_task(iTask.task);
108
@@ -XXX,XX +XXX,XX @@ retry:
109
return -ENOMEM;
110
}
111
112
- while (!iTask.complete) {
113
- iscsi_set_events(iscsilun);
114
- qemu_mutex_unlock(&iscsilun->mutex);
115
- qemu_coroutine_yield();
116
- qemu_mutex_lock(&iscsilun->mutex);
117
- }
118
+ iscsi_co_wait_for_task(&iTask, iscsilun);
119
120
if (iTask.status == SCSI_STATUS_CHECK_CONDITION &&
121
iTask.task->sense.key == SCSI_SENSE_ILLEGAL_REQUEST &&
122
--
123
2.17.1
124
125
diff view generated by jsdifflib
Deleted patch
1
From: Fam Zheng <famz@redhat.com>
2
1
3
Issue EXTENDED COPY (LID1) command to implement the copy_range API.
4
5
The parameter data construction code is modified from libiscsi's
6
iscsi-dd.c.
7
8
Signed-off-by: Fam Zheng <famz@redhat.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Message-id: 20180601092648.24614-9-famz@redhat.com
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
---
13
include/scsi/constants.h | 4 +
14
block/iscsi.c | 219 +++++++++++++++++++++++++++++++++++++++
15
2 files changed, 223 insertions(+)
16
17
diff --git a/include/scsi/constants.h b/include/scsi/constants.h
18
index XXXXXXX..XXXXXXX 100644
19
--- a/include/scsi/constants.h
20
+++ b/include/scsi/constants.h
21
@@ -XXX,XX +XXX,XX @@
22
#define MMC_PROFILE_HDDVD_RW_DL 0x005A
23
#define MMC_PROFILE_INVALID 0xFFFF
24
25
+#define XCOPY_DESC_OFFSET 16
26
+#define IDENT_DESCR_TGT_DESCR_SIZE 32
27
+#define XCOPY_BLK2BLK_SEG_DESC_SIZE 28
28
+
29
#endif
30
diff --git a/block/iscsi.c b/block/iscsi.c
31
index XXXXXXX..XXXXXXX 100644
32
--- a/block/iscsi.c
33
+++ b/block/iscsi.c
34
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn iscsi_co_invalidate_cache(BlockDriverState *bs,
35
iscsi_allocmap_invalidate(iscsilun);
36
}
37
38
+static int coroutine_fn iscsi_co_copy_range_from(BlockDriverState *bs,
39
+ BdrvChild *src,
40
+ uint64_t src_offset,
41
+ BdrvChild *dst,
42
+ uint64_t dst_offset,
43
+ uint64_t bytes,
44
+ BdrvRequestFlags flags)
45
+{
46
+ return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, flags);
47
+}
48
+
49
+static struct scsi_task *iscsi_xcopy_task(int param_len)
50
+{
51
+ struct scsi_task *task;
52
+
53
+ task = g_new0(struct scsi_task, 1);
54
+
55
+ task->cdb[0] = EXTENDED_COPY;
56
+ task->cdb[10] = (param_len >> 24) & 0xFF;
57
+ task->cdb[11] = (param_len >> 16) & 0xFF;
58
+ task->cdb[12] = (param_len >> 8) & 0xFF;
59
+ task->cdb[13] = param_len & 0xFF;
60
+ task->cdb_size = 16;
61
+ task->xfer_dir = SCSI_XFER_WRITE;
62
+ task->expxferlen = param_len;
63
+
64
+ return task;
65
+}
66
+
67
+static void iscsi_populate_target_desc(unsigned char *desc, IscsiLun *lun)
68
+{
69
+ struct scsi_inquiry_device_designator *dd = lun->dd;
70
+
71
+ memset(desc, 0, 32);
72
+ desc[0] = 0xE4; /* IDENT_DESCR_TGT_DESCR */
73
+ desc[4] = dd->code_set;
74
+ desc[5] = (dd->designator_type & 0xF)
75
+ | ((dd->association & 3) << 4);
76
+ desc[7] = dd->designator_length;
77
+ memcpy(desc + 8, dd->designator, dd->designator_length);
78
+
79
+ desc[28] = 0;
80
+ desc[29] = (lun->block_size >> 16) & 0xFF;
81
+ desc[30] = (lun->block_size >> 8) & 0xFF;
82
+ desc[31] = lun->block_size & 0xFF;
83
+}
84
+
85
+static void iscsi_xcopy_desc_hdr(uint8_t *hdr, int dc, int cat, int src_index,
86
+ int dst_index)
87
+{
88
+ hdr[0] = 0x02; /* BLK_TO_BLK_SEG_DESCR */
89
+ hdr[1] = ((dc << 1) | cat) & 0xFF;
90
+ hdr[2] = (XCOPY_BLK2BLK_SEG_DESC_SIZE >> 8) & 0xFF;
91
+ /* don't account for the first 4 bytes in descriptor header*/
92
+ hdr[3] = (XCOPY_BLK2BLK_SEG_DESC_SIZE - 4 /* SEG_DESC_SRC_INDEX_OFFSET */) & 0xFF;
93
+ hdr[4] = (src_index >> 8) & 0xFF;
94
+ hdr[5] = src_index & 0xFF;
95
+ hdr[6] = (dst_index >> 8) & 0xFF;
96
+ hdr[7] = dst_index & 0xFF;
97
+}
98
+
99
+static void iscsi_xcopy_populate_desc(uint8_t *desc, int dc, int cat,
100
+ int src_index, int dst_index, int num_blks,
101
+ uint64_t src_lba, uint64_t dst_lba)
102
+{
103
+ iscsi_xcopy_desc_hdr(desc, dc, cat, src_index, dst_index);
104
+
105
+ /* The caller should verify the request size */
106
+ assert(num_blks < 65536);
107
+ desc[10] = (num_blks >> 8) & 0xFF;
108
+ desc[11] = num_blks & 0xFF;
109
+ desc[12] = (src_lba >> 56) & 0xFF;
110
+ desc[13] = (src_lba >> 48) & 0xFF;
111
+ desc[14] = (src_lba >> 40) & 0xFF;
112
+ desc[15] = (src_lba >> 32) & 0xFF;
113
+ desc[16] = (src_lba >> 24) & 0xFF;
114
+ desc[17] = (src_lba >> 16) & 0xFF;
115
+ desc[18] = (src_lba >> 8) & 0xFF;
116
+ desc[19] = src_lba & 0xFF;
117
+ desc[20] = (dst_lba >> 56) & 0xFF;
118
+ desc[21] = (dst_lba >> 48) & 0xFF;
119
+ desc[22] = (dst_lba >> 40) & 0xFF;
120
+ desc[23] = (dst_lba >> 32) & 0xFF;
121
+ desc[24] = (dst_lba >> 24) & 0xFF;
122
+ desc[25] = (dst_lba >> 16) & 0xFF;
123
+ desc[26] = (dst_lba >> 8) & 0xFF;
124
+ desc[27] = dst_lba & 0xFF;
125
+}
126
+
127
+static void iscsi_xcopy_populate_header(unsigned char *buf, int list_id, int str,
128
+ int list_id_usage, int prio,
129
+ int tgt_desc_len,
130
+ int seg_desc_len, int inline_data_len)
131
+{
132
+ buf[0] = list_id;
133
+ buf[1] = ((str & 1) << 5) | ((list_id_usage & 3) << 3) | (prio & 7);
134
+ buf[2] = (tgt_desc_len >> 8) & 0xFF;
135
+ buf[3] = tgt_desc_len & 0xFF;
136
+ buf[8] = (seg_desc_len >> 24) & 0xFF;
137
+ buf[9] = (seg_desc_len >> 16) & 0xFF;
138
+ buf[10] = (seg_desc_len >> 8) & 0xFF;
139
+ buf[11] = seg_desc_len & 0xFF;
140
+ buf[12] = (inline_data_len >> 24) & 0xFF;
141
+ buf[13] = (inline_data_len >> 16) & 0xFF;
142
+ buf[14] = (inline_data_len >> 8) & 0xFF;
143
+ buf[15] = inline_data_len & 0xFF;
144
+}
145
+
146
+static void iscsi_xcopy_data(struct iscsi_data *data,
147
+ IscsiLun *src, int64_t src_lba,
148
+ IscsiLun *dst, int64_t dst_lba,
149
+ uint16_t num_blocks)
150
+{
151
+ uint8_t *buf;
152
+ const int src_offset = XCOPY_DESC_OFFSET;
153
+ const int dst_offset = XCOPY_DESC_OFFSET + IDENT_DESCR_TGT_DESCR_SIZE;
154
+ const int seg_offset = dst_offset + IDENT_DESCR_TGT_DESCR_SIZE;
155
+
156
+ data->size = XCOPY_DESC_OFFSET +
157
+ IDENT_DESCR_TGT_DESCR_SIZE * 2 +
158
+ XCOPY_BLK2BLK_SEG_DESC_SIZE;
159
+ data->data = g_malloc0(data->size);
160
+ buf = data->data;
161
+
162
+ /* Initialise the parameter list header */
163
+ iscsi_xcopy_populate_header(buf, 1, 0, 2 /* LIST_ID_USAGE_DISCARD */,
164
+ 0, 2 * IDENT_DESCR_TGT_DESCR_SIZE,
165
+ XCOPY_BLK2BLK_SEG_DESC_SIZE,
166
+ 0);
167
+
168
+ /* Initialise CSCD list with one src + one dst descriptor */
169
+ iscsi_populate_target_desc(&buf[src_offset], src);
170
+ iscsi_populate_target_desc(&buf[dst_offset], dst);
171
+
172
+ /* Initialise one segment descriptor */
173
+ iscsi_xcopy_populate_desc(&buf[seg_offset], 0, 0, 0, 1, num_blocks,
174
+ src_lba, dst_lba);
175
+}
176
+
177
+static int coroutine_fn iscsi_co_copy_range_to(BlockDriverState *bs,
178
+ BdrvChild *src,
179
+ uint64_t src_offset,
180
+ BdrvChild *dst,
181
+ uint64_t dst_offset,
182
+ uint64_t bytes,
183
+ BdrvRequestFlags flags)
184
+{
185
+ IscsiLun *dst_lun = dst->bs->opaque;
186
+ IscsiLun *src_lun;
187
+ struct IscsiTask iscsi_task;
188
+ struct iscsi_data data;
189
+ int r = 0;
190
+ int block_size;
191
+
192
+ if (src->bs->drv->bdrv_co_copy_range_to != iscsi_co_copy_range_to) {
193
+ return -ENOTSUP;
194
+ }
195
+ src_lun = src->bs->opaque;
196
+
197
+ if (!src_lun->dd || !dst_lun->dd) {
198
+ return -ENOTSUP;
199
+ }
200
+ if (!is_byte_request_lun_aligned(dst_offset, bytes, dst_lun)) {
201
+ return -ENOTSUP;
202
+ }
203
+ if (!is_byte_request_lun_aligned(src_offset, bytes, src_lun)) {
204
+ return -ENOTSUP;
205
+ }
206
+ if (dst_lun->block_size != src_lun->block_size ||
207
+ !dst_lun->block_size) {
208
+ return -ENOTSUP;
209
+ }
210
+
211
+ block_size = dst_lun->block_size;
212
+ if (bytes / block_size > 65535) {
213
+ return -ENOTSUP;
214
+ }
215
+
216
+ iscsi_xcopy_data(&data,
217
+ src_lun, src_offset / block_size,
218
+ dst_lun, dst_offset / block_size,
219
+ bytes / block_size);
220
+
221
+ iscsi_co_init_iscsitask(dst_lun, &iscsi_task);
222
+
223
+ qemu_mutex_lock(&dst_lun->mutex);
224
+ iscsi_task.task = iscsi_xcopy_task(data.size);
225
+retry:
226
+ if (iscsi_scsi_command_async(dst_lun->iscsi, dst_lun->lun,
227
+ iscsi_task.task, iscsi_co_generic_cb,
228
+ &data,
229
+ &iscsi_task) != 0) {
230
+ r = -EIO;
231
+ goto out_unlock;
232
+ }
233
+
234
+ iscsi_co_wait_for_task(&iscsi_task, dst_lun);
235
+
236
+ if (iscsi_task.do_retry) {
237
+ iscsi_task.complete = 0;
238
+ goto retry;
239
+ }
240
+
241
+ if (iscsi_task.status != SCSI_STATUS_GOOD) {
242
+ r = iscsi_task.err_code;
243
+ goto out_unlock;
244
+ }
245
+
246
+out_unlock:
247
+ g_free(iscsi_task.task);
248
+ qemu_mutex_unlock(&dst_lun->mutex);
249
+ g_free(iscsi_task.err_str);
250
+ return r;
251
+}
252
+
253
static QemuOptsList iscsi_create_opts = {
254
.name = "iscsi-create-opts",
255
.head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head),
256
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_iscsi = {
257
258
.bdrv_co_block_status = iscsi_co_block_status,
259
.bdrv_co_pdiscard = iscsi_co_pdiscard,
260
+ .bdrv_co_copy_range_from = iscsi_co_copy_range_from,
261
+ .bdrv_co_copy_range_to = iscsi_co_copy_range_to,
262
.bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
263
.bdrv_co_readv = iscsi_co_readv,
264
.bdrv_co_writev = iscsi_co_writev,
265
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_iser = {
266
267
.bdrv_co_block_status = iscsi_co_block_status,
268
.bdrv_co_pdiscard = iscsi_co_pdiscard,
269
+ .bdrv_co_copy_range_from = iscsi_co_copy_range_from,
270
+ .bdrv_co_copy_range_to = iscsi_co_copy_range_to,
271
.bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
272
.bdrv_co_readv = iscsi_co_readv,
273
.bdrv_co_writev = iscsi_co_writev,
274
--
275
2.17.1
276
277
diff view generated by jsdifflib
Deleted patch
1
From: Fam Zheng <famz@redhat.com>
2
1
3
It's a BlockBackend wrapper of the BDS interface.
4
5
Signed-off-by: Fam Zheng <famz@redhat.com>
6
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
7
Message-id: 20180601092648.24614-10-famz@redhat.com
8
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
9
---
10
include/sysemu/block-backend.h | 4 ++++
11
block/block-backend.c | 18 ++++++++++++++++++
12
2 files changed, 22 insertions(+)
13
14
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
15
index XXXXXXX..XXXXXXX 100644
16
--- a/include/sysemu/block-backend.h
17
+++ b/include/sysemu/block-backend.h
18
@@ -XXX,XX +XXX,XX @@ void blk_set_force_allow_inactivate(BlockBackend *blk);
19
void blk_register_buf(BlockBackend *blk, void *host, size_t size);
20
void blk_unregister_buf(BlockBackend *blk, void *host);
21
22
+int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
23
+ BlockBackend *blk_out, int64_t off_out,
24
+ int bytes, BdrvRequestFlags flags);
25
+
26
#endif
27
diff --git a/block/block-backend.c b/block/block-backend.c
28
index XXXXXXX..XXXXXXX 100644
29
--- a/block/block-backend.c
30
+++ b/block/block-backend.c
31
@@ -XXX,XX +XXX,XX @@ void blk_unregister_buf(BlockBackend *blk, void *host)
32
{
33
bdrv_unregister_buf(blk_bs(blk), host);
34
}
35
+
36
+int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
37
+ BlockBackend *blk_out, int64_t off_out,
38
+ int bytes, BdrvRequestFlags flags)
39
+{
40
+ int r;
41
+ r = blk_check_byte_request(blk_in, off_in, bytes);
42
+ if (r) {
43
+ return r;
44
+ }
45
+ r = blk_check_byte_request(blk_out, off_out, bytes);
46
+ if (r) {
47
+ return r;
48
+ }
49
+ return bdrv_co_copy_range(blk_in->root, off_in,
50
+ blk_out->root, off_out,
51
+ bytes, flags);
52
+}
53
--
54
2.17.1
55
56
diff view generated by jsdifflib
1
From: Fam Zheng <famz@redhat.com>
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
2
3
The new blk_co_copy_range interface offers a more efficient way in the
3
When using C11 atomics, non-seqcst reads and writes do not participate
4
case of network based storage. Make use of it to allow faster convert
4
in the total order of seqcst operations. In util/async.c and util/aio-posix.c,
5
operation.
5
in particular, the pattern that we use
6
6
7
Since copy offloading cannot do zero detection ('-S') and compression
7
write ctx->notify_me write bh->scheduled
8
(-c), only try it when these options are not used.
8
read bh->scheduled read ctx->notify_me
9
if !bh->scheduled, sleep if ctx->notify_me, notify
9
10
10
Signed-off-by: Fam Zheng <famz@redhat.com>
11
needs to use seqcst operations for both the write and the read. In
11
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
12
general this is something that we do not want, because there can be
12
Message-id: 20180601092648.24614-11-famz@redhat.com
13
many sources that are polled in addition to bottom halves. The
14
alternative is to place a seqcst memory barrier between the write
15
and the read. This also comes with a disadvantage, in that the
16
memory barrier is implicit on strongly-ordered architectures and
17
it wastes a few dozen clock cycles.
18
19
Fortunately, ctx->notify_me is never written concurrently by two
20
threads, so we can assert that and relax the writes to ctx->notify_me.
21
The resulting solution works and performs well on both aarch64 and x86.
22
23
Note that the atomic_set/atomic_read combination is not an atomic
24
read-modify-write, and therefore it is even weaker than C11 ATOMIC_RELAXED;
25
on x86, ATOMIC_RELAXED compiles to a locked operation.
26
27
Analyzed-by: Ying Fang <fangying1@huawei.com>
28
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
29
Tested-by: Ying Fang <fangying1@huawei.com>
30
Message-Id: <20200407140746.8041-6-pbonzini@redhat.com>
13
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
31
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
14
---
32
---
15
qemu-img.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++--
33
util/aio-posix.c | 16 ++++++++++++++--
16
1 file changed, 48 insertions(+), 2 deletions(-)
34
util/aio-win32.c | 17 ++++++++++++++---
35
util/async.c | 16 ++++++++++++----
36
3 files changed, 40 insertions(+), 9 deletions(-)
17
37
18
diff --git a/qemu-img.c b/qemu-img.c
38
diff --git a/util/aio-posix.c b/util/aio-posix.c
19
index XXXXXXX..XXXXXXX 100644
39
index XXXXXXX..XXXXXXX 100644
20
--- a/qemu-img.c
40
--- a/util/aio-posix.c
21
+++ b/qemu-img.c
41
+++ b/util/aio-posix.c
22
@@ -XXX,XX +XXX,XX @@ typedef struct ImgConvertState {
42
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
23
bool compressed;
43
int64_t timeout;
24
bool target_has_backing;
44
int64_t start = 0;
25
bool wr_in_order;
45
26
+ bool copy_range;
46
+ /*
27
int min_sparse;
47
+ * There cannot be two concurrent aio_poll calls for the same AioContext (or
28
size_t cluster_sectors;
48
+ * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
29
size_t buf_sectors;
49
+ * We rely on this below to avoid slow locked accesses to ctx->notify_me.
30
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn convert_co_write(ImgConvertState *s, int64_t sector_num,
50
+ */
31
return 0;
51
assert(in_aio_context_home_thread(ctx));
32
}
52
33
53
/* aio_notify can avoid the expensive event_notifier_set if
34
+static int coroutine_fn convert_co_copy_range(ImgConvertState *s, int64_t sector_num,
54
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
35
+ int nb_sectors)
55
* so disable the optimization now.
36
+{
56
*/
37
+ int n, ret;
57
if (blocking) {
58
- atomic_add(&ctx->notify_me, 2);
59
+ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2);
60
+ /*
61
+ * Write ctx->notify_me before computing the timeout
62
+ * (reading bottom half flags, etc.). Pairs with
63
+ * smp_mb in aio_notify().
64
+ */
65
+ smp_mb();
66
}
67
68
qemu_lockcnt_inc(&ctx->list_lock);
69
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
70
}
71
72
if (blocking) {
73
- atomic_sub(&ctx->notify_me, 2);
74
+ /* Finish the poll before clearing the flag. */
75
+ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2);
76
aio_notify_accept(ctx);
77
}
78
79
diff --git a/util/aio-win32.c b/util/aio-win32.c
80
index XXXXXXX..XXXXXXX 100644
81
--- a/util/aio-win32.c
82
+++ b/util/aio-win32.c
83
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
84
int count;
85
int timeout;
86
87
+ /*
88
+ * There cannot be two concurrent aio_poll calls for the same AioContext (or
89
+ * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
90
+ * We rely on this below to avoid slow locked accesses to ctx->notify_me.
91
+ */
92
+ assert(in_aio_context_home_thread(ctx));
93
progress = false;
94
95
/* aio_notify can avoid the expensive event_notifier_set if
96
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
97
* so disable the optimization now.
98
*/
99
if (blocking) {
100
- atomic_add(&ctx->notify_me, 2);
101
+ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2);
102
+ /*
103
+ * Write ctx->notify_me before computing the timeout
104
+ * (reading bottom half flags, etc.). Pairs with
105
+ * smp_mb in aio_notify().
106
+ */
107
+ smp_mb();
108
}
109
110
qemu_lockcnt_inc(&ctx->list_lock);
111
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
112
ret = WaitForMultipleObjects(count, events, FALSE, timeout);
113
if (blocking) {
114
assert(first);
115
- assert(in_aio_context_home_thread(ctx));
116
- atomic_sub(&ctx->notify_me, 2);
117
+ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2);
118
aio_notify_accept(ctx);
119
}
120
121
diff --git a/util/async.c b/util/async.c
122
index XXXXXXX..XXXXXXX 100644
123
--- a/util/async.c
124
+++ b/util/async.c
125
@@ -XXX,XX +XXX,XX @@ aio_ctx_prepare(GSource *source, gint *timeout)
126
{
127
AioContext *ctx = (AioContext *) source;
128
129
- atomic_or(&ctx->notify_me, 1);
130
+ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) | 1);
38
+
131
+
39
+ while (nb_sectors > 0) {
132
+ /*
40
+ BlockBackend *blk;
133
+ * Write ctx->notify_me before computing the timeout
41
+ int src_cur;
134
+ * (reading bottom half flags, etc.). Pairs with
42
+ int64_t bs_sectors, src_cur_offset;
135
+ * smp_mb in aio_notify().
43
+ int64_t offset;
136
+ */
44
+
137
+ smp_mb();
45
+ convert_select_part(s, sector_num, &src_cur, &src_cur_offset);
138
46
+ offset = (sector_num - src_cur_offset) << BDRV_SECTOR_BITS;
139
/* We assume there is no timeout already supplied */
47
+ blk = s->src[src_cur];
140
*timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
48
+ bs_sectors = s->src_sectors[src_cur];
141
@@ -XXX,XX +XXX,XX @@ aio_ctx_check(GSource *source)
49
+
142
QEMUBH *bh;
50
+ n = MIN(nb_sectors, bs_sectors - (sector_num - src_cur_offset));
143
BHListSlice *s;
51
+
144
52
+ ret = blk_co_copy_range(blk, offset, s->target,
145
- atomic_and(&ctx->notify_me, ~1);
53
+ sector_num << BDRV_SECTOR_BITS,
146
+ /* Finish computing the timeout before clearing the flag. */
54
+ n << BDRV_SECTOR_BITS, 0);
147
+ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) & ~1);
55
+ if (ret < 0) {
148
aio_notify_accept(ctx);
56
+ return ret;
149
57
+ }
150
QSLIST_FOREACH_RCU(bh, &ctx->bh_list, next) {
58
+
151
@@ -XXX,XX +XXX,XX @@ LuringState *aio_get_linux_io_uring(AioContext *ctx)
59
+ sector_num += n;
152
void aio_notify(AioContext *ctx)
60
+ nb_sectors -= n;
61
+ }
62
+ return 0;
63
+}
64
+
65
static void coroutine_fn convert_co_do_copy(void *opaque)
66
{
153
{
67
ImgConvertState *s = opaque;
154
/* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs
68
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn convert_co_do_copy(void *opaque)
155
- * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll.
69
int n;
156
+ * with smp_mb in aio_ctx_prepare or aio_poll.
70
int64_t sector_num;
157
*/
71
enum ImgConvertBlockStatus status;
158
smp_mb();
72
+ bool copy_range;
159
- if (ctx->notify_me) {
73
160
+ if (atomic_read(&ctx->notify_me)) {
74
qemu_co_mutex_lock(&s->lock);
161
event_notifier_set(&ctx->notifier);
75
if (s->ret != -EINPROGRESS || s->sector_num >= s->total_sectors) {
162
atomic_mb_set(&ctx->notified, true);
76
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn convert_co_do_copy(void *opaque)
163
}
77
s->allocated_sectors, 0);
78
}
79
80
- if (status == BLK_DATA) {
81
+retry:
82
+ copy_range = s->copy_range && s->status == BLK_DATA;
83
+ if (status == BLK_DATA && !copy_range) {
84
ret = convert_co_read(s, sector_num, n, buf);
85
if (ret < 0) {
86
error_report("error while reading sector %" PRId64
87
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn convert_co_do_copy(void *opaque)
88
}
89
90
if (s->ret == -EINPROGRESS) {
91
- ret = convert_co_write(s, sector_num, n, buf, status);
92
+ if (copy_range) {
93
+ ret = convert_co_copy_range(s, sector_num, n);
94
+ if (ret) {
95
+ s->copy_range = false;
96
+ goto retry;
97
+ }
98
+ } else {
99
+ ret = convert_co_write(s, sector_num, n, buf, status);
100
+ }
101
if (ret < 0) {
102
error_report("error while writing sector %" PRId64
103
": %s", sector_num, strerror(-ret));
104
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
105
ImgConvertState s = (ImgConvertState) {
106
/* Need at least 4k of zeros for sparse detection */
107
.min_sparse = 8,
108
+ .copy_range = true,
109
.buf_sectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE,
110
.wr_in_order = true,
111
.num_coroutines = 8,
112
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
113
break;
114
case 'c':
115
s.compressed = true;
116
+ s.copy_range = false;
117
break;
118
case 'o':
119
if (!is_valid_option_list(optarg)) {
120
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
121
}
122
123
s.min_sparse = sval / BDRV_SECTOR_SIZE;
124
+ s.copy_range = false;
125
break;
126
}
127
case 'p':
128
--
164
--
129
2.17.1
165
2.25.1
130
166
131
diff view generated by jsdifflib