The following changes since commit 281f327487c9c9b1599f93c589a408bbf4a651b8:

  Merge remote-tracking branch 'remotes/vivier/tags/m68k-for-2.12-pull-request' into staging (2017-12-22 00:11:36 +0000)

are available in the git repository at:

  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 1a63a907507fbbcfaee3f622907ec244b7eabda8:

  block: Keep nodes drained between reopen_queue/multiple (2017-12-22 15:05:32 +0100)

----------------------------------------------------------------
Block layer patches
----------------------------------------------------------------
Doug Gale (1):
      nvme: Add tracing

Edgar Kaziakhmedov (1):
      qcow2: get rid of qcow2_backing_read1 routine

Fam Zheng (2):
      block: Open backing image in force share mode for size probe
      block: Remove unused bdrv_requests_pending

John Snow (1):
      iotests: fix 197 for vpc

Kevin Wolf (27):
      block: Formats don't need CONSISTENT_READ with NO_IO
      block: Make bdrv_drain_invoke() recursive
      block: Call .drain_begin only once in bdrv_drain_all_begin()
      test-bdrv-drain: Test BlockDriver callbacks for drain
      block: bdrv_drain_recurse(): Remove unused begin parameter
      block: Don't wait for requests in bdrv_drain*_end()
      block: Unify order in drain functions
      block: Don't acquire AioContext in hmp_qemu_io()
      block: Document that x-blockdev-change breaks quorum children list
      block: Assert drain_all is only called from main AioContext
      block: Make bdrv_drain() driver callbacks non-recursive
      test-bdrv-drain: Test callback for bdrv_drain
      test-bdrv-drain: Test bs->quiesce_counter
      blockjob: Pause job on draining any job BDS
      test-bdrv-drain: Test drain vs. block jobs
      block: Don't block_job_pause_all() in bdrv_drain_all()
      block: Nested drain_end must still call callbacks
      test-bdrv-drain: Test nested drain sections
      block: Don't notify parents in drain call chain
      block: Add bdrv_subtree_drained_begin/end()
      test-bdrv-drain: Tests for bdrv_subtree_drain
      test-bdrv-drain: Test behaviour in coroutine context
      test-bdrv-drain: Recursive draining with multiple parents
      block: Allow graph changes in subtree drained section
      test-bdrv-drain: Test graph changes in drained section
      commit: Simplify reopen of base
      block: Keep nodes drained between reopen_queue/multiple

Thomas Huth (3):
      block: Remove the obsolete -drive boot=on|off parameter
      block: Remove the deprecated -hdachs option
      block: Mention -drive cyls/heads/secs/trans/serial/addr in deprecation chapter

 qapi/block-core.json | 4 +
 block/qcow2.h | 3 -
 include/block/block.h | 15 +-
 include/block/block_int.h | 6 +-
 block.c | 75 ++++-
 block/commit.c | 8 +-
 block/io.c | 164 +++++++---
 block/qcow2.c | 51 +--
 block/replication.c | 6 +
 blockdev.c | 11 -
 blockjob.c | 22 +-
 hmp.c | 6 -
 hw/block/nvme.c | 349 +++++++++++++++++----
 qemu-io-cmds.c | 3 +
 tests/test-bdrv-drain.c | 651 +++++++++++++++++++++++++++++++++++++++
 vl.c | 86 +-----
 hw/block/trace-events | 93 ++++++
 qemu-doc.texi | 29 +-
 qemu-options.hx | 19 +-
 tests/Makefile.include | 2 +
 tests/qemu-iotests/197 | 4 +
 tests/qemu-iotests/common.filter | 3 +-
 22 files changed, 1294 insertions(+), 316 deletions(-)
 create mode 100644 tests/test-bdrv-drain.c

The following changes since commit 4c8c1cc544dbd5e2564868e61c5037258e393832:

  Merge remote-tracking branch 'remotes/vivier/tags/m68k-for-2.10-pull-request' into staging (2017-06-22 19:01:58 +0100)

are available in the git repository at:

  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 1512008812410ca4054506a7c44343088abdd977:

  Merge remote-tracking branch 'mreitz/tags/pull-block-2017-06-23' into queue-block (2017-06-23 14:09:12 +0200)

----------------------------------------------------------------
Block layer patches
----------------------------------------------------------------
Alberto Garcia (9):
      throttle: Update throttle-groups.c documentation
      qcow2: Remove unused Error variable in do_perform_cow()
      qcow2: Use unsigned int for both members of Qcow2COWRegion
      qcow2: Make perform_cow() call do_perform_cow() twice
      qcow2: Split do_perform_cow() into _read(), _encrypt() and _write()
      qcow2: Allow reading both COW regions with only one request
      qcow2: Pass a QEMUIOVector to do_perform_cow_{read,write}()
      qcow2: Merge the writing of the COW regions with the guest data
      qcow2: Use offset_into_cluster() and offset_to_l2_index()

Kevin Wolf (37):
      commit: Fix completion with extra reference
      qemu-iotests: Allow starting new qemu after cleanup
      qemu-iotests: Test exiting qemu with running job
      doc: Document generic -blockdev options
      doc: Document driver-specific -blockdev options
      qed: Use bottom half to resume waiting requests
      qed: Make qed_read_table() synchronous
      qed: Remove callback from qed_read_table()
      qed: Remove callback from qed_read_l2_table()
      qed: Remove callback from qed_find_cluster()
      qed: Make qed_read_backing_file() synchronous
      qed: Make qed_copy_from_backing_file() synchronous
      qed: Remove callback from qed_copy_from_backing_file()
      qed: Make qed_write_header() synchronous
      qed: Remove callback from qed_write_header()
      qed: Make qed_write_table() synchronous
      qed: Remove GenericCB
      qed: Remove callback from qed_write_table()
      qed: Make qed_aio_read_data() synchronous
      qed: Make qed_aio_write_main() synchronous
      qed: Inline qed_commit_l2_update()
      qed: Add return value to qed_aio_write_l1_update()
      qed: Add return value to qed_aio_write_l2_update()
      qed: Add return value to qed_aio_write_main()
      qed: Add return value to qed_aio_write_cow()
      qed: Add return value to qed_aio_write_inplace/alloc()
      qed: Add return value to qed_aio_read/write_data()
      qed: Remove ret argument from qed_aio_next_io()
      qed: Remove recursion in qed_aio_next_io()
      qed: Implement .bdrv_co_readv/writev
      qed: Use CoQueue for serialising allocations
      qed: Simplify request handling
      qed: Use a coroutine for need_check_timer
      qed: Add coroutine_fn to I/O path functions
      qed: Use bdrv_co_* for coroutine_fns
      block: Remove bdrv_aio_readv/writev/flush()
      Merge remote-tracking branch 'mreitz/tags/pull-block-2017-06-23' into queue-block

Manos Pitsidianakis (1):
      block: change variable names in BlockDriverState

Max Reitz (3):
      blkdebug: Catch bs->exact_filename overflow
      blkverify: Catch bs->exact_filename overflow
      block: Do not strcmp() with NULL uri->scheme

Stefan Hajnoczi (10):
      block: count bdrv_co_rw_vmstate() requests
      block: use BDRV_POLL_WHILE() in bdrv_rw_vmstate()
      migration: avoid recursive AioContext locking in save_vmstate()
      migration: use bdrv_drain_all_begin/end() instead bdrv_drain_all()
      virtio-pci: use ioeventfd even when KVM is disabled
      migration: hold AioContext lock for loadvm qemu_fclose()
      qemu-iotests: 068: extract _qemu() function
      qemu-iotests: 068: use -drive/-device instead of -hda
      qemu-iotests: 068: test iothread mode
      qemu-img: don't shadow opts variable in img_dd()

Stephen Bates (1):
      nvme: Add support for Read Data and Write Data in CMBs.

sochin.jiang (1):
      fix: avoid an infinite loop or a dangling pointer problem in img_commit

 block/Makefile.objs | 2 +-
 block/blkdebug.c | 46 +--
 block/blkreplay.c | 8 +-
 block/blkverify.c | 12 +-
 block/block-backend.c | 22 +-
 block/commit.c | 7 +
 block/file-posix.c | 34 +-
 block/io.c | 240 ++-----------
 block/iscsi.c | 20 +-
 block/mirror.c | 8 +-
 block/nbd-client.c | 8 +-
 block/nbd-client.h | 4 +-
 block/nbd.c | 6 +-
 block/nfs.c | 2 +-
 block/qcow2-cluster.c | 201 ++++++++---
 block/qcow2.c | 94 +++--
 block/qcow2.h | 11 +-
 block/qed-cluster.c | 124 +++----
 block/qed-gencb.c | 33 --
 block/qed-table.c | 261 +++++---------
 block/qed.c | 779 ++++++++++++++++-------------------------
 block/qed.h | 54 +--
 block/raw-format.c | 8 +-
 block/rbd.c | 4 +-
 block/sheepdog.c | 12 +-
 block/ssh.c | 2 +-
 block/throttle-groups.c | 2 +-
 block/trace-events | 3 -
 blockjob.c | 4 +-
 hw/block/nvme.c | 83 +++--
 hw/block/nvme.h | 1 +
 hw/virtio/virtio-pci.c | 2 +-
 include/block/block.h | 16 +-
 include/block/block_int.h | 6 +-
 include/block/blockjob.h | 18 +
 include/sysemu/block-backend.h | 20 +-
 migration/savevm.c | 32 +-
 qemu-img.c | 29 +-
 qemu-io-cmds.c | 46 +--
 qemu-options.hx | 221 ++++++++++--
 tests/qemu-iotests/068 | 37 +-
 tests/qemu-iotests/068.out | 11 +-
 tests/qemu-iotests/185 | 206 +++++++++++
 tests/qemu-iotests/185.out | 59 ++++
 tests/qemu-iotests/common.qemu | 3 +
 tests/qemu-iotests/group | 1 +
 46 files changed, 1477 insertions(+), 1325 deletions(-)
 delete mode 100644 block/qed-gencb.c
 create mode 100755 tests/qemu-iotests/185
 create mode 100644 tests/qemu-iotests/185.out
diff view generated by jsdifflib
Commit 1f4ad7d fixed 'qemu-img info' for raw images that are currently
in use as a mirror target. It is not enough for image formats, though,
as these still unconditionally request BLK_PERM_CONSISTENT_READ.

As this permission is geared towards whether the guest-visible data is
consistent, and has no impact on whether the metadata is sane, and
'qemu-img info' does not read guest-visible data (except for the raw
format), it makes sense to not require BLK_PERM_CONSISTENT_READ if there
is not going to be any guest I/O performed, regardless of image format.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

All functions that are marked coroutine_fn can directly call the
bdrv_co_* version of functions instead of going through the wrapper.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/qed.c b/block/qed.c
16
diff --git a/block.c b/block.c
12
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
13
--- a/block/qed.c
18
--- a/block.c
14
+++ b/block/qed.c
19
+++ b/block.c
15
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_write_header(BDRVQEDState *s)
20
@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
16
};
21
assert(role == &child_backing || role == &child_file);
17
qemu_iovec_init_external(&qiov, &iov, 1);
22
18
23
if (!backing) {
19
- ret = bdrv_preadv(s->bs->file, 0, &qiov);
24
+ int flags = bdrv_reopen_get_flags(reopen_queue, bs);
20
+ ret = bdrv_co_preadv(s->bs->file, 0, qiov.size, &qiov, 0);
25
+
21
if (ret < 0) {
26
/* Apart from the modifications below, the same permissions are
22
goto out;
27
* forwarded and left alone as for filters */
23
}
28
bdrv_filter_default_perms(bs, c, role, reopen_queue, perm, shared,
24
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_write_header(BDRVQEDState *s)
29
@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
25
/* Update header */
30
26
qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
31
/* bs->file always needs to be consistent because of the metadata. We
27
32
* can never allow other users to resize or write to it. */
28
- ret = bdrv_pwritev(s->bs->file, 0, &qiov);
33
- perm |= BLK_PERM_CONSISTENT_READ;
29
+ ret = bdrv_co_pwritev(s->bs->file, 0, qiov.size, &qiov, 0);
34
+ if (!(flags & BDRV_O_NO_IO)) {
30
if (ret < 0) {
35
+ perm |= BLK_PERM_CONSISTENT_READ;
31
goto out;
36
+ }
32
}
37
shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
33
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
38
} else {
34
qemu_iovec_concat(*backing_qiov, qiov, 0, size);
39
/* We want consistent read from backing files if the parent needs it.
35
36
BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
37
- ret = bdrv_preadv(s->bs->backing, pos, *backing_qiov);
38
+ ret = bdrv_co_preadv(s->bs->backing, pos, size, *backing_qiov, 0);
39
if (ret < 0) {
40
return ret;
41
}
42
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s,
43
}
44
45
BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
46
- ret = bdrv_pwritev(s->bs->file, offset, &qiov);
47
+ ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
48
if (ret < 0) {
49
goto out;
50
}
51
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
52
trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
53
54
BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
55
- ret = bdrv_pwritev(s->bs->file, offset, &acb->cur_qiov);
56
+ ret = bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
57
+ &acb->cur_qiov, 0);
58
if (ret < 0) {
59
return ret;
60
}
61
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
62
* region. The solution is to flush after writing a new data
63
* cluster and before updating the L2 table.
64
*/
65
- ret = bdrv_flush(s->bs->file->bs);
66
+ ret = bdrv_co_flush(s->bs->file->bs);
67
if (ret < 0) {
68
return ret;
69
}
70
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn qed_aio_read_data(void *opaque, int ret,
71
}
72
73
BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
74
- ret = bdrv_preadv(bs->file, offset, &acb->cur_qiov);
75
+ ret = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
76
+ &acb->cur_qiov, 0);
77
if (ret < 0) {
78
return ret;
79
}
80
--
40
--
81
1.8.3.1
41
2.13.6
82
42
83
43
diff view generated by jsdifflib
1
From: John Snow <jsnow@redhat.com>

VPC has some difficulty creating geometries of particular size.
However, we can indeed force it to use a literal one, so let's
do that for the sake of test 197, which is testing some specific
offsets.

Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Lukáš Doktor <ldoktor@redhat.com>
---
 tests/qemu-iotests/197 | 4 ++++
 tests/qemu-iotests/common.filter | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

From: Max Reitz <mreitz@redhat.com>

The bs->exact_filename field may not be sufficient to store the full
blkverify node filename. In this case, we should not generate a filename
at all instead of an unusable one.

Cc: qemu-stable@nongnu.org
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170613172006.19685-3-mreitz@redhat.com
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/blkverify.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
17
17
18
diff --git a/block/blkverify.c b/block/blkverify.c
18
diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197
19
index XXXXXXX..XXXXXXX 100755
20
--- a/tests/qemu-iotests/197
21
+++ b/tests/qemu-iotests/197
22
@@ -XXX,XX +XXX,XX @@ echo '=== Copy-on-read ==='
23
echo
24
25
# Prep the images
26
+# VPC rounds image sizes to a specific geometry, force a specific size.
27
+if [ "$IMGFMT" = "vpc" ]; then
28
+ IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size")
29
+fi
30
_make_test_img 4G
31
$QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io
32
IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \
33
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
19
index XXXXXXX..XXXXXXX 100644
34
index XXXXXXX..XXXXXXX 100644
20
--- a/block/blkverify.c
35
--- a/tests/qemu-iotests/common.filter
21
+++ b/block/blkverify.c
36
+++ b/tests/qemu-iotests/common.filter
22
@@ -XXX,XX +XXX,XX @@ static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options)
37
@@ -XXX,XX +XXX,XX @@ _filter_img_create()
23
if (bs->file->bs->exact_filename[0]
38
-e "s# log_size=[0-9]\\+##g" \
24
&& s->test_file->bs->exact_filename[0])
39
-e "s# refcount_bits=[0-9]\\+##g" \
25
{
40
-e "s# key-secret=[a-zA-Z0-9]\\+##g" \
26
- snprintf(bs->exact_filename, sizeof(bs->exact_filename),
41
- -e "s# iter-time=[0-9]\\+##g"
27
- "blkverify:%s:%s",
42
+ -e "s# iter-time=[0-9]\\+##g" \
28
- bs->file->bs->exact_filename,
43
+ -e "s# force_size=\\(on\\|off\\)##g"
29
- s->test_file->bs->exact_filename);
30
+ int ret = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
31
+ "blkverify:%s:%s",
32
+ bs->file->bs->exact_filename,
33
+ s->test_file->bs->exact_filename);
34
+ if (ret >= sizeof(bs->exact_filename)) {
35
+ /* An overflow makes the filename unusable, so do not report any */
36
+ bs->exact_filename[0] = 0;
37
+ }
38
}
39
}
44
}
40
45
46
_filter_img_info()
41
--
47
--
42
1.8.3.1
48
2.13.6
43
49
44
50
diff view generated by jsdifflib
1
This change separates bdrv_drain_invoke(), which calls the BlockDriver
drain callbacks, from bdrv_drain_recurse(). Instead, the function
performs its own recursion now.

One reason for this is that bdrv_drain_recurse() can be called multiple
times by bdrv_drain_all_begin(), but the callbacks may only be called
once. The separation is necessary to fix this bug.

The other reason is that we intend to go to a model where we call all
driver callbacks first, and only then start polling. This is not fully
achieved yet with this patch, as bdrv_drain_invoke() contains a
BDRV_POLL_WHILE() loop for the block driver callbacks, which can still
call callbacks for any unrelated event. It's a step in this direction
anyway.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed.c | 32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)
6
22
7
diff --git a/block/qed.c b/block/qed.c
23
diff --git a/block/io.c b/block/io.c
8
index XXXXXXX..XXXXXXX 100644
24
index XXXXXXX..XXXXXXX 100644
9
--- a/block/qed.c
25
--- a/block/io.c
10
+++ b/block/qed.c
26
+++ b/block/io.c
11
@@ -XXX,XX +XXX,XX @@ int qed_write_header_sync(BDRVQEDState *s)
27
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
12
* This function only updates known header fields in-place and does not affect
28
bdrv_wakeup(bs);
13
* extra data after the QED header.
29
}
14
*/
30
15
-static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
31
+/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
16
- void *opaque)
32
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
17
+static int qed_write_header(BDRVQEDState *s)
18
{
33
{
19
/* We must write full sectors for O_DIRECT but cannot necessarily generate
34
+ BdrvChild *child, *tmp;
20
* the data following the header if an unrecognized compat feature is
35
BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
21
@@ -XXX,XX +XXX,XX @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
36
22
ret = 0;
37
if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
23
out:
38
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
24
qemu_vfree(buf);
39
data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
25
- cb(opaque, ret);
40
bdrv_coroutine_enter(bs, data.co);
26
+ return ret;
41
BDRV_POLL_WHILE(bs, !data.done);
42
+
43
+ QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
44
+ bdrv_drain_invoke(child->bs, begin);
45
+ }
27
}
46
}
28
47
29
static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
48
static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
30
@@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
49
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
50
BdrvChild *child, *tmp;
51
bool waited;
52
53
- /* Ensure any pending metadata writes are submitted to bs->file. */
54
- bdrv_drain_invoke(bs, begin);
55
-
56
/* Wait for drained requests to finish */
57
waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
58
59
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
60
bdrv_parent_drained_begin(bs);
31
}
61
}
62
63
+ bdrv_drain_invoke(bs, true);
64
bdrv_drain_recurse(bs, true);
32
}
65
}
33
66
34
-static void qed_finish_clear_need_check(void *opaque, int ret)
67
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
35
-{
36
- /* Do nothing */
37
-}
38
-
39
-static void qed_flush_after_clear_need_check(void *opaque, int ret)
40
-{
41
- BDRVQEDState *s = opaque;
42
-
43
- bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s);
44
-
45
- /* No need to wait until flush completes */
46
- qed_unplug_allocating_write_reqs(s);
47
-}
48
-
49
static void qed_clear_need_check(void *opaque, int ret)
50
{
51
BDRVQEDState *s = opaque;
52
@@ -XXX,XX +XXX,XX @@ static void qed_clear_need_check(void *opaque, int ret)
53
}
68
}
54
69
55
s->header.features &= ~QED_F_NEED_CHECK;
70
bdrv_parent_drained_end(bs);
56
- qed_write_header(s, qed_flush_after_clear_need_check, s);
71
+ bdrv_drain_invoke(bs, false);
57
+ ret = qed_write_header(s);
72
bdrv_drain_recurse(bs, false);
58
+ (void) ret;
73
aio_enable_external(bdrv_get_aio_context(bs));
59
+
60
+ qed_unplug_allocating_write_reqs(s);
61
+
62
+ ret = bdrv_flush(s->bs);
63
+ (void) ret;
64
}
74
}
65
75
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
66
static void qed_need_check_timer_cb(void *opaque)
76
aio_context_acquire(aio_context);
67
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
77
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
68
{
78
if (aio_context == bdrv_get_aio_context(bs)) {
69
BDRVQEDState *s = acb_to_s(acb);
79
+ /* FIXME Calling this multiple times is wrong */
70
BlockCompletionFunc *cb;
80
+ bdrv_drain_invoke(bs, true);
71
+ int ret;
81
waited |= bdrv_drain_recurse(bs, true);
72
82
}
73
/* Cancel timer when the first allocating request comes in */
83
}
74
if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
84
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
75
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
85
aio_context_acquire(aio_context);
76
86
aio_enable_external(aio_context);
77
if (qed_should_set_need_check(s)) {
87
bdrv_parent_drained_end(bs);
78
s->header.features |= QED_F_NEED_CHECK;
88
+ bdrv_drain_invoke(bs, false);
79
- qed_write_header(s, cb, acb);
89
bdrv_drain_recurse(bs, false);
80
+ ret = qed_write_header(s);
90
aio_context_release(aio_context);
81
+ cb(acb, ret);
82
} else {
83
cb(acb, 0);
84
}
91
}
85
--
92
--
86
1.8.3.1
93
2.13.6
87
94
88
95
diff view generated by jsdifflib
1
bdrv_drain_all_begin() used to call the .bdrv_co_drain_begin() driver
callback inside its polling loop. This means that how many times it got
called for each node depended on how long it had to poll the event loop.

This is obviously not right and results in nodes that stay drained even
after bdrv_drain_all_end(), which calls .bdrv_co_drain_begin() once per
node.

Fix bdrv_drain_all_begin() to call the callback only once, too.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

Now that we stay in coroutine context for the whole request when doing
reads or writes, we can add coroutine_fn annotations to many functions
that can do I/O or yield directly.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed-cluster.c | 5 +++--
 block/qed.c | 44 ++++++++++++++++++--------------------
 block/qed.h | 5 +++--
 3 files changed, 30 insertions(+), 24 deletions(-)
12
17
13
diff --git a/block/qed-cluster.c b/block/qed-cluster.c
18
diff --git a/block/io.c b/block/io.c
14
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
15
--- a/block/qed-cluster.c
20
--- a/block/io.c
16
+++ b/block/qed-cluster.c
21
+++ b/block/io.c
17
@@ -XXX,XX +XXX,XX @@ static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
22
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
18
* On failure QED_CLUSTER_L2 or QED_CLUSTER_L1 is returned for missing L2 or L1
23
aio_context_acquire(aio_context);
19
* table offset, respectively. len is number of contiguous unallocated bytes.
24
bdrv_parent_drained_begin(bs);
20
*/
25
aio_disable_external(aio_context);
21
-int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
26
+ bdrv_drain_invoke(bs, true);
22
- size_t *len, uint64_t *img_offset)
27
aio_context_release(aio_context);
23
+int coroutine_fn qed_find_cluster(BDRVQEDState *s, QEDRequest *request,
28
24
+ uint64_t pos, size_t *len,
29
if (!g_slist_find(aio_ctxs, aio_context)) {
25
+ uint64_t *img_offset)
30
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
26
{
31
aio_context_acquire(aio_context);
27
uint64_t l2_offset;
32
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
28
uint64_t offset = 0;
33
if (aio_context == bdrv_get_aio_context(bs)) {
29
diff --git a/block/qed.c b/block/qed.c
34
- /* FIXME Calling this multiple times is wrong */
30
index XXXXXXX..XXXXXXX 100644
35
- bdrv_drain_invoke(bs, true);
31
--- a/block/qed.c
36
waited |= bdrv_drain_recurse(bs, true);
32
+++ b/block/qed.c
37
}
33
@@ -XXX,XX +XXX,XX @@ int qed_write_header_sync(BDRVQEDState *s)
38
}
34
* This function only updates known header fields in-place and does not affect
35
* extra data after the QED header.
36
*/
37
-static int qed_write_header(BDRVQEDState *s)
38
+static int coroutine_fn qed_write_header(BDRVQEDState *s)
39
{
40
/* We must write full sectors for O_DIRECT but cannot necessarily generate
41
* the data following the header if an unrecognized compat feature is
42
@@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
43
qemu_co_enter_next(&s->allocating_write_reqs);
44
}
45
46
-static void qed_need_check_timer_entry(void *opaque)
47
+static void coroutine_fn qed_need_check_timer_entry(void *opaque)
48
{
49
BDRVQEDState *s = opaque;
50
int ret;
51
@@ -XXX,XX +XXX,XX @@ static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
52
* This function reads qiov->size bytes starting at pos from the backing file.
53
* If there is no backing file then zeroes are read.
54
*/
55
-static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
56
- QEMUIOVector *qiov,
57
- QEMUIOVector **backing_qiov)
58
+static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
59
+ QEMUIOVector *qiov,
60
+ QEMUIOVector **backing_qiov)
61
{
62
uint64_t backing_length = 0;
63
size_t size;
64
@@ -XXX,XX +XXX,XX @@ static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
65
* @len: Number of bytes
66
* @offset: Byte offset in image file
67
*/
68
-static int qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
69
- uint64_t len, uint64_t offset)
70
+static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s,
71
+ uint64_t pos, uint64_t len,
72
+ uint64_t offset)
73
{
74
QEMUIOVector qiov;
75
QEMUIOVector *backing_qiov = NULL;
76
@@ -XXX,XX +XXX,XX @@ out:
77
* The cluster offset may be an allocated byte offset in the image file, the
78
* zero cluster marker, or the unallocated cluster marker.
79
*/
80
-static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
81
- unsigned int n, uint64_t cluster)
82
+static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table,
83
+ int index, unsigned int n,
84
+ uint64_t cluster)
85
{
86
int i;
87
for (i = index; i < index + n; i++) {
88
@@ -XXX,XX +XXX,XX @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
89
}
90
}
91
92
-static void qed_aio_complete(QEDAIOCB *acb)
93
+static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
94
{
95
BDRVQEDState *s = acb_to_s(acb);
96
97
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb)
98
/**
99
* Update L1 table with new L2 table offset and write it out
100
*/
101
-static int qed_aio_write_l1_update(QEDAIOCB *acb)
102
+static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb)
103
{
104
BDRVQEDState *s = acb_to_s(acb);
105
CachedL2Table *l2_table = acb->request.l2_table;
106
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_l1_update(QEDAIOCB *acb)
107
/**
108
* Update L2 table with new cluster offsets and write them out
109
*/
110
-static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
111
+static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
112
{
113
BDRVQEDState *s = acb_to_s(acb);
114
bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
115
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
116
/**
117
* Write data to the image file
118
*/
119
-static int qed_aio_write_main(QEDAIOCB *acb)
120
+static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
121
{
122
BDRVQEDState *s = acb_to_s(acb);
123
uint64_t offset = acb->cur_cluster +
124
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_main(QEDAIOCB *acb)
125
/**
126
* Populate untouched regions of new data cluster
127
*/
128
-static int qed_aio_write_cow(QEDAIOCB *acb)
129
+static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb)
130
{
131
BDRVQEDState *s = acb_to_s(acb);
132
uint64_t start, len, offset;
133
@@ -XXX,XX +XXX,XX @@ static bool qed_should_set_need_check(BDRVQEDState *s)
134
*
135
* This path is taken when writing to previously unallocated clusters.
136
*/
137
-static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
138
+static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
139
{
140
BDRVQEDState *s = acb_to_s(acb);
141
int ret;
142
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
143
*
144
* This path is taken when writing to already allocated clusters.
145
*/
146
-static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
147
+static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset,
148
+ size_t len)
149
{
150
/* Allocate buffer for zero writes */
151
if (acb->flags & QED_AIOCB_ZERO) {
152
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
153
* @offset: Cluster offset in bytes
154
* @len: Length in bytes
155
*/
156
-static int qed_aio_write_data(void *opaque, int ret,
157
- uint64_t offset, size_t len)
158
+static int coroutine_fn qed_aio_write_data(void *opaque, int ret,
159
+ uint64_t offset, size_t len)
160
{
161
QEDAIOCB *acb = opaque;
162
163
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_data(void *opaque, int ret,
164
* @offset: Cluster offset in bytes
165
* @len: Length in bytes
166
*/
167
-static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
168
+static int coroutine_fn qed_aio_read_data(void *opaque, int ret,
169
+ uint64_t offset, size_t len)
170
{
171
QEDAIOCB *acb = opaque;
172
BDRVQEDState *s = acb_to_s(acb);
173
@@ -XXX,XX +XXX,XX @@ static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
174
/**
175
* Begin next I/O or complete the request
176
*/
177
-static int qed_aio_next_io(QEDAIOCB *acb)
178
+static int coroutine_fn qed_aio_next_io(QEDAIOCB *acb)
179
{
180
BDRVQEDState *s = acb_to_s(acb);
181
uint64_t offset;
182
diff --git a/block/qed.h b/block/qed.h
183
index XXXXXXX..XXXXXXX 100644
184
--- a/block/qed.h
185
+++ b/block/qed.h
186
@@ -XXX,XX +XXX,XX @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
187
/**
188
* Cluster functions
189
*/
190
-int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
191
- size_t *len, uint64_t *img_offset);
192
+int coroutine_fn qed_find_cluster(BDRVQEDState *s, QEDRequest *request,
193
+ uint64_t pos, size_t *len,
194
+ uint64_t *img_offset);
195
196
/**
197
* Consistency check
198
--
39
--
199
1.8.3.1
40
2.13.6
200
41
201
42
diff view generated by jsdifflib
1
This adds a test case that the BlockDriver callbacks for drain are
called in bdrv_drained_all_begin/end(), and that both of them are called
exactly once.

When qemu is exited, all running jobs should be cancelled successfully.
This adds a test for this for all types of block jobs that currently
exist in qemu.
4
4
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
6
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
6
Reviewed-by: Eric Blake <eblake@redhat.com>
7
Reviewed-by: Eric Blake <eblake@redhat.com>
7
---
8
---
8
tests/qemu-iotests/185 | 206 +++++++++++++++++++++++++++++++++++++++++++++
9
tests/test-bdrv-drain.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++
9
tests/qemu-iotests/185.out | 59 +++++++++++++
10
tests/Makefile.include | 2 +
10
tests/qemu-iotests/group | 1 +
11
2 files changed, 139 insertions(+)
11
3 files changed, 266 insertions(+)
12
create mode 100644 tests/test-bdrv-drain.c
12
create mode 100755 tests/qemu-iotests/185
13
create mode 100644 tests/qemu-iotests/185.out
14
13
15
diff --git a/tests/qemu-iotests/185 b/tests/qemu-iotests/185
14
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
16
new file mode 100755
17
index XXXXXXX..XXXXXXX
18
--- /dev/null
19
+++ b/tests/qemu-iotests/185
20
@@ -XXX,XX +XXX,XX @@
21
+#!/bin/bash
22
+#
23
+# Test exiting qemu while jobs are still running
24
+#
25
+# Copyright (C) 2017 Red Hat, Inc.
26
+#
27
+# This program is free software; you can redistribute it and/or modify
28
+# it under the terms of the GNU General Public License as published by
29
+# the Free Software Foundation; either version 2 of the License, or
30
+# (at your option) any later version.
31
+#
32
+# This program is distributed in the hope that it will be useful,
33
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
34
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35
+# GNU General Public License for more details.
36
+#
37
+# You should have received a copy of the GNU General Public License
38
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
39
+#
40
+
41
+# creator
42
+owner=kwolf@redhat.com
43
+
44
+seq=`basename $0`
45
+echo "QA output created by $seq"
46
+
47
+here=`pwd`
48
+status=1 # failure is the default!
49
+
50
+MIG_SOCKET="${TEST_DIR}/migrate"
51
+
52
+_cleanup()
53
+{
54
+ rm -f "${TEST_IMG}.mid"
55
+ rm -f "${TEST_IMG}.copy"
56
+ _cleanup_test_img
57
+ _cleanup_qemu
58
+}
59
+trap "_cleanup; exit \$status" 0 1 2 3 15
60
+
61
+# get standard environment, filters and checks
62
+. ./common.rc
63
+. ./common.filter
64
+. ./common.qemu
65
+
66
+_supported_fmt qcow2
67
+_supported_proto file
68
+_supported_os Linux
69
+
70
+size=64M
71
+TEST_IMG="${TEST_IMG}.base" _make_test_img $size
72
+
73
+echo
74
+echo === Starting VM ===
75
+echo
76
+
77
+qemu_comm_method="qmp"
78
+
79
+_launch_qemu \
80
+ -drive file="${TEST_IMG}.base",cache=$CACHEMODE,driver=$IMGFMT,id=disk
81
+h=$QEMU_HANDLE
82
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return'
83
+
84
+echo
85
+echo === Creating backing chain ===
86
+echo
87
+
88
+_send_qemu_cmd $h \
89
+ "{ 'execute': 'blockdev-snapshot-sync',
90
+ 'arguments': { 'device': 'disk',
91
+ 'snapshot-file': '$TEST_IMG.mid',
92
+ 'format': '$IMGFMT',
93
+ 'mode': 'absolute-paths' } }" \
94
+ "return"
95
+
96
+_send_qemu_cmd $h \
97
+ "{ 'execute': 'human-monitor-command',
98
+ 'arguments': { 'command-line':
99
+ 'qemu-io disk \"write 0 4M\"' } }" \
100
+ "return"
101
+
102
+_send_qemu_cmd $h \
103
+ "{ 'execute': 'blockdev-snapshot-sync',
104
+ 'arguments': { 'device': 'disk',
105
+ 'snapshot-file': '$TEST_IMG',
106
+ 'format': '$IMGFMT',
107
+ 'mode': 'absolute-paths' } }" \
108
+ "return"
109
+
110
+echo
111
+echo === Start commit job and exit qemu ===
112
+echo
113
+
114
+# Note that the reference output intentionally includes the 'offset' field in
115
+# BLOCK_JOB_CANCELLED events for all of the following block jobs. They are
116
+# predictable and any change in the offsets would hint at a bug in the job
117
+# throttling code.
118
+#
119
+# In order to achieve these predictable offsets, all of the following tests
120
+# use speed=65536. Each job will perform exactly one iteration before it has
121
+# to sleep at least for a second, which is plenty of time for the 'quit' QMP
122
+# command to be received (after receiving the command, the rest runs
123
+# synchronously, so jobs can arbitrarily continue or complete).
124
+#
125
+# The buffer size for commit and streaming is 512k (waiting for 8 seconds after
126
+# the first request), for active commit and mirror it's large enough to cover
127
+# the full 4M, and for backup it's the qcow2 cluster size, which we know is
128
+# 64k. As all of these are at least as large as the speed, we are sure that the
129
+# offset doesn't advance after the first iteration before qemu exits.
130
+
131
+_send_qemu_cmd $h \
132
+ "{ 'execute': 'block-commit',
133
+ 'arguments': { 'device': 'disk',
134
+ 'base':'$TEST_IMG.base',
135
+ 'top': '$TEST_IMG.mid',
136
+ 'speed': 65536 } }" \
137
+ "return"
138
+
139
+_send_qemu_cmd $h "{ 'execute': 'quit' }" "return"
140
+wait=1 _cleanup_qemu
141
+
142
+echo
143
+echo === Start active commit job and exit qemu ===
144
+echo
145
+
146
+_launch_qemu \
147
+ -drive file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk
148
+h=$QEMU_HANDLE
149
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return'
150
+
151
+_send_qemu_cmd $h \
152
+ "{ 'execute': 'block-commit',
153
+ 'arguments': { 'device': 'disk',
154
+ 'base':'$TEST_IMG.base',
155
+ 'speed': 65536 } }" \
156
+ "return"
157
+
158
+_send_qemu_cmd $h "{ 'execute': 'quit' }" "return"
159
+wait=1 _cleanup_qemu
160
+
161
+echo
162
+echo === Start mirror job and exit qemu ===
163
+echo
164
+
165
+_launch_qemu \
166
+ -drive file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk
167
+h=$QEMU_HANDLE
168
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return'
169
+
170
+_send_qemu_cmd $h \
171
+ "{ 'execute': 'drive-mirror',
172
+ 'arguments': { 'device': 'disk',
173
+ 'target': '$TEST_IMG.copy',
174
+ 'format': '$IMGFMT',
175
+ 'sync': 'full',
176
+ 'speed': 65536 } }" \
177
+ "return"
178
+
179
+_send_qemu_cmd $h "{ 'execute': 'quit' }" "return"
180
+wait=1 _cleanup_qemu
181
+
182
+echo
183
+echo === Start backup job and exit qemu ===
184
+echo
185
+
186
+_launch_qemu \
187
+ -drive file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk
188
+h=$QEMU_HANDLE
189
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return'
190
+
191
+_send_qemu_cmd $h \
192
+ "{ 'execute': 'drive-backup',
193
+ 'arguments': { 'device': 'disk',
194
+ 'target': '$TEST_IMG.copy',
195
+ 'format': '$IMGFMT',
196
+ 'sync': 'full',
197
+ 'speed': 65536 } }" \
198
+ "return"
199
+
200
+_send_qemu_cmd $h "{ 'execute': 'quit' }" "return"
201
+wait=1 _cleanup_qemu
202
+
203
+echo
204
+echo === Start streaming job and exit qemu ===
205
+echo
206
+
207
+_launch_qemu \
208
+ -drive file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk
209
+h=$QEMU_HANDLE
210
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return'
211
+
212
+_send_qemu_cmd $h \
213
+ "{ 'execute': 'block-stream',
214
+ 'arguments': { 'device': 'disk',
215
+ 'speed': 65536 } }" \
216
+ "return"
217
+
218
+_send_qemu_cmd $h "{ 'execute': 'quit' }" "return"
219
+wait=1 _cleanup_qemu
220
+
221
+_check_test_img
222
+
223
+# success, all done
224
+echo "*** done"
225
+rm -f $seq.full
226
+status=0
227
diff --git a/tests/qemu-iotests/185.out b/tests/qemu-iotests/185.out
228
new file mode 100644
15
new file mode 100644
229
index XXXXXXX..XXXXXXX
16
index XXXXXXX..XXXXXXX
230
--- /dev/null
17
--- /dev/null
231
+++ b/tests/qemu-iotests/185.out
18
+++ b/tests/test-bdrv-drain.c
232
@@ -XXX,XX +XXX,XX @@
19
@@ -XXX,XX +XXX,XX @@
233
+QA output created by 185
20
+/*
234
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864
21
+ * Block node draining tests
22
+ *
23
+ * Copyright (c) 2017 Kevin Wolf <kwolf@redhat.com>
24
+ *
25
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
26
+ * of this software and associated documentation files (the "Software"), to deal
27
+ * in the Software without restriction, including without limitation the rights
28
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
29
+ * copies of the Software, and to permit persons to whom the Software is
30
+ * furnished to do so, subject to the following conditions:
31
+ *
32
+ * The above copyright notice and this permission notice shall be included in
33
+ * all copies or substantial portions of the Software.
34
+ *
35
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
36
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
37
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
38
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
39
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
40
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
41
+ * THE SOFTWARE.
42
+ */
235
+
43
+
236
+=== Starting VM ===
44
+#include "qemu/osdep.h"
45
+#include "block/block.h"
46
+#include "sysemu/block-backend.h"
47
+#include "qapi/error.h"
237
+
48
+
238
+{"return": {}}
49
+typedef struct BDRVTestState {
50
+ int drain_count;
51
+} BDRVTestState;
239
+
52
+
240
+=== Creating backing chain ===
53
+static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
54
+{
55
+ BDRVTestState *s = bs->opaque;
56
+ s->drain_count++;
57
+}
241
+
58
+
242
+Formatting 'TEST_DIR/t.qcow2.mid', fmt=qcow2 size=67108864 backing_file=TEST_DIR/t.qcow2.base backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
59
+static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
243
+{"return": {}}
60
+{
244
+wrote 4194304/4194304 bytes at offset 0
61
+ BDRVTestState *s = bs->opaque;
245
+4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
62
+ s->drain_count--;
246
+{"return": ""}
63
+}
247
+Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 size=67108864 backing_file=TEST_DIR/t.qcow2.mid backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
248
+{"return": {}}
249
+
64
+
250
+=== Start commit job and exit qemu ===
65
+static void bdrv_test_close(BlockDriverState *bs)
66
+{
67
+ BDRVTestState *s = bs->opaque;
68
+ g_assert_cmpint(s->drain_count, >, 0);
69
+}
251
+
70
+
252
+{"return": {}}
71
+static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs,
253
+{"return": {}}
72
+ uint64_t offset, uint64_t bytes,
254
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}}
73
+ QEMUIOVector *qiov, int flags)
255
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 524288, "speed": 65536, "type": "commit"}}
74
+{
75
+ /* We want this request to stay until the polling loop in drain waits for
76
+ * it to complete. We need to sleep a while as bdrv_drain_invoke() comes
77
+ * first and polls its result, too, but it shouldn't accidentally complete
78
+ * this request yet. */
79
+ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
256
+
80
+
257
+=== Start active commit job and exit qemu ===
81
+ return 0;
82
+}
258
+
83
+
259
+{"return": {}}
84
+static BlockDriver bdrv_test = {
260
+{"return": {}}
85
+ .format_name = "test",
261
+{"return": {}}
86
+ .instance_size = sizeof(BDRVTestState),
262
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}}
263
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 4194304, "offset": 4194304, "speed": 65536, "type": "commit"}}
264
+
87
+
265
+=== Start mirror job and exit qemu ===
88
+ .bdrv_close = bdrv_test_close,
89
+ .bdrv_co_preadv = bdrv_test_co_preadv,
266
+
90
+
267
+{"return": {}}
91
+ .bdrv_co_drain_begin = bdrv_test_co_drain_begin,
268
+Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
92
+ .bdrv_co_drain_end = bdrv_test_co_drain_end,
269
+{"return": {}}
93
+};
270
+{"return": {}}
271
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}}
272
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 4194304, "offset": 4194304, "speed": 65536, "type": "mirror"}}
273
+
94
+
274
+=== Start backup job and exit qemu ===
95
+static void aio_ret_cb(void *opaque, int ret)
96
+{
97
+ int *aio_ret = opaque;
98
+ *aio_ret = ret;
99
+}
275
+
100
+
276
+{"return": {}}
101
+static void test_drv_cb_drain_all(void)
277
+Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
102
+{
278
+{"return": {}}
103
+ BlockBackend *blk;
279
+{"return": {}}
104
+ BlockDriverState *bs;
280
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}}
105
+ BDRVTestState *s;
281
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 65536, "speed": 65536, "type": "backup"}}
106
+ BlockAIOCB *acb;
107
+ int aio_ret;
282
+
108
+
283
+=== Start streaming job and exit qemu ===
109
+ QEMUIOVector qiov;
110
+ struct iovec iov = {
111
+ .iov_base = NULL,
112
+ .iov_len = 0,
113
+ };
114
+ qemu_iovec_init_external(&qiov, &iov, 1);
284
+
115
+
285
+{"return": {}}
116
+ blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
286
+{"return": {}}
117
+ bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
287
+{"return": {}}
118
+ &error_abort);
288
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false}}
119
+ s = bs->opaque;
289
+{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "disk", "len": 67108864, "offset": 524288, "speed": 65536, "type": "stream"}}
120
+ blk_insert_bs(blk, bs, &error_abort);
290
+No errors were found on the image.
121
+
291
+*** done
122
+ /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
292
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
123
+ g_assert_cmpint(s->drain_count, ==, 0);
124
+ bdrv_drain_all_begin();
125
+ g_assert_cmpint(s->drain_count, ==, 1);
126
+ bdrv_drain_all_end();
127
+ g_assert_cmpint(s->drain_count, ==, 0);
128
+
129
+ /* Now do the same while a request is pending */
130
+ aio_ret = -EINPROGRESS;
131
+ acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
132
+ g_assert(acb != NULL);
133
+ g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
134
+
135
+ g_assert_cmpint(s->drain_count, ==, 0);
136
+ bdrv_drain_all_begin();
137
+ g_assert_cmpint(aio_ret, ==, 0);
138
+ g_assert_cmpint(s->drain_count, ==, 1);
139
+ bdrv_drain_all_end();
140
+ g_assert_cmpint(s->drain_count, ==, 0);
141
+
142
+ bdrv_unref(bs);
143
+ blk_unref(blk);
144
+}
145
+
146
+int main(int argc, char **argv)
147
+{
148
+ bdrv_init();
149
+ qemu_init_main_loop(&error_abort);
150
+
151
+ g_test_init(&argc, &argv, NULL);
152
+
153
+ g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
154
+
155
+ return g_test_run();
156
+}
157
diff --git a/tests/Makefile.include b/tests/Makefile.include
293
index XXXXXXX..XXXXXXX 100644
158
index XXXXXXX..XXXXXXX 100644
294
--- a/tests/qemu-iotests/group
159
--- a/tests/Makefile.include
295
+++ b/tests/qemu-iotests/group
160
+++ b/tests/Makefile.include
296
@@ -XXX,XX +XXX,XX @@
161
@@ -XXX,XX +XXX,XX @@ gcov-files-test-thread-pool-y = thread-pool.c
297
181 rw auto migration
162
gcov-files-test-hbitmap-y = util/hbitmap.c
298
182 rw auto quick
163
check-unit-y += tests/test-hbitmap$(EXESUF)
299
183 rw auto migration
164
gcov-files-test-hbitmap-y = blockjob.c
300
+185 rw auto
165
+check-unit-y += tests/test-bdrv-drain$(EXESUF)
166
check-unit-y += tests/test-blockjob$(EXESUF)
167
check-unit-y += tests/test-blockjob-txn$(EXESUF)
168
check-unit-y += tests/test-x86-cpuid$(EXESUF)
169
@@ -XXX,XX +XXX,XX @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
170
tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
171
tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
172
tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
173
+tests/test-bdrv-drain$(EXESUF): tests/test-bdrv-drain.o $(test-block-obj-y) $(test-util-obj-y)
174
tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
175
tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
176
tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
301
--
177
--
302
1.8.3.1
178
2.13.6
303
179
304
180
diff view generated by jsdifflib
1
Now that the bdrv_drain_invoke() calls are pulled up to the callers of
bdrv_drain_recurse(), the 'begin' parameter isn't needed any more.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

This fixes the last place where we degraded from AIO to actual blocking
synchronous I/O requests. Putting it into a coroutine means that instead
of blocking, the coroutine simply yields while doing I/O.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)
10
9
11
diff --git a/block/qed.c b/block/qed.c
10
diff --git a/block/io.c b/block/io.c
12
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
13
--- a/block/qed.c
12
--- a/block/io.c
14
+++ b/block/qed.c
13
+++ b/block/io.c
15
@@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
14
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
16
qemu_co_enter_next(&s->allocating_write_reqs);
15
}
17
}
16
}
18
17
19
-static void qed_clear_need_check(void *opaque, int ret)
18
-static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
20
+static void qed_need_check_timer_entry(void *opaque)
19
+static bool bdrv_drain_recurse(BlockDriverState *bs)
21
{
20
{
22
BDRVQEDState *s = opaque;
21
BdrvChild *child, *tmp;
23
+ int ret;
22
bool waited;
24
23
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
25
- if (ret) {
24
*/
26
+ /* The timer should only fire when allocating writes have drained */
25
bdrv_ref(bs);
27
+ assert(!s->allocating_acb);
26
}
28
+
27
- waited |= bdrv_drain_recurse(bs, begin);
29
+ trace_qed_need_check_timer_cb(s);
28
+ waited |= bdrv_drain_recurse(bs);
30
+
29
if (in_main_loop) {
31
+ qed_acquire(s);
30
bdrv_unref(bs);
32
+ qed_plug_allocating_write_reqs(s);
31
}
33
+
32
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
34
+ /* Ensure writes are on disk before clearing flag */
35
+ ret = bdrv_co_flush(s->bs->file->bs);
36
+ qed_release(s);
37
+ if (ret < 0) {
38
qed_unplug_allocating_write_reqs(s);
39
return;
40
}
33
}
41
@@ -XXX,XX +XXX,XX @@ static void qed_clear_need_check(void *opaque, int ret)
34
42
35
bdrv_drain_invoke(bs, true);
43
qed_unplug_allocating_write_reqs(s);
36
- bdrv_drain_recurse(bs, true);
44
37
+ bdrv_drain_recurse(bs);
45
- ret = bdrv_flush(s->bs);
46
+ ret = bdrv_co_flush(s->bs);
47
(void) ret;
48
}
38
}
49
39
50
static void qed_need_check_timer_cb(void *opaque)
40
void bdrv_drained_end(BlockDriverState *bs)
51
{
41
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
52
- BDRVQEDState *s = opaque;
42
53
-
43
bdrv_parent_drained_end(bs);
54
- /* The timer should only fire when allocating writes have drained */
44
bdrv_drain_invoke(bs, false);
55
- assert(!s->allocating_acb);
45
- bdrv_drain_recurse(bs, false);
56
-
46
+ bdrv_drain_recurse(bs);
57
- trace_qed_need_check_timer_cb(s);
47
aio_enable_external(bdrv_get_aio_context(bs));
58
-
59
- qed_acquire(s);
60
- qed_plug_allocating_write_reqs(s);
61
-
62
- /* Ensure writes are on disk before clearing flag */
63
- bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
64
- qed_release(s);
65
+ Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
66
+ qemu_coroutine_enter(co);
67
}
48
}
68
49
69
void qed_acquire(BDRVQEDState *s)
50
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
51
aio_context_acquire(aio_context);
52
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
53
if (aio_context == bdrv_get_aio_context(bs)) {
54
- waited |= bdrv_drain_recurse(bs, true);
55
+ waited |= bdrv_drain_recurse(bs);
56
}
57
}
58
aio_context_release(aio_context);
59
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
60
aio_enable_external(aio_context);
61
bdrv_parent_drained_end(bs);
62
bdrv_drain_invoke(bs, false);
63
- bdrv_drain_recurse(bs, false);
64
+ bdrv_drain_recurse(bs);
65
aio_context_release(aio_context);
66
}
67
70
--
68
--
71
1.8.3.1
69
2.13.6
72
70
73
71
diff view generated by jsdifflib
1
The device is drained, so there is no point in waiting for requests at
the end of the drained section. Remove the bdrv_drain_recurse() calls
there.

The bdrv_drain_recurse() calls were introduced in commit 481cad48e5e
in order to call the .bdrv_co_drain_end() driver callback. This is now
done by a separate bdrv_drain_invoke() call.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 2 --
 1 file changed, 2 deletions(-)

Now that we're running in coroutine context, the ad-hoc serialisation
code (which drops a request that has to wait out of coroutine context)
can be replaced by a CoQueue.

This means that when we resume a serialised request, it is running in
coroutine context again and its I/O isn't blocking any more.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed.c | 49 +++++++++++++++++--------------------------------
 block/qed.h | 3 ++-
 2 files changed, 19 insertions(+), 33 deletions(-)
14
15
15
diff --git a/block/qed.c b/block/qed.c
16
diff --git a/block/io.c b/block/io.c
16
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
17
--- a/block/qed.c
18
--- a/block/io.c
18
+++ b/block/qed.c
19
+++ b/block/io.c
19
@@ -XXX,XX +XXX,XX @@ static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
20
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
20
21
21
static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
22
bdrv_parent_drained_end(bs);
22
{
23
bdrv_drain_invoke(bs, false);
23
- QEDAIOCB *acb;
24
- bdrv_drain_recurse(bs);
24
-
25
aio_enable_external(bdrv_get_aio_context(bs));
25
assert(s->allocating_write_reqs_plugged);
26
27
s->allocating_write_reqs_plugged = false;
28
-
29
- acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
30
- if (acb) {
31
- qed_aio_start_io(acb);
32
- }
33
+ qemu_co_enter_next(&s->allocating_write_reqs);
34
}
26
}
35
27
36
static void qed_clear_need_check(void *opaque, int ret)
28
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
37
@@ -XXX,XX +XXX,XX @@ static void qed_need_check_timer_cb(void *opaque)
29
aio_enable_external(aio_context);
38
BDRVQEDState *s = opaque;
30
bdrv_parent_drained_end(bs);
39
31
bdrv_drain_invoke(bs, false);
40
/* The timer should only fire when allocating writes have drained */
32
- bdrv_drain_recurse(bs);
41
- assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs));
33
aio_context_release(aio_context);
42
+ assert(!s->allocating_acb);
43
44
trace_qed_need_check_timer_cb(s);
45
46
@@ -XXX,XX +XXX,XX @@ static int bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags,
47
int ret;
48
49
s->bs = bs;
50
- QSIMPLEQ_INIT(&s->allocating_write_reqs);
51
+ qemu_co_queue_init(&s->allocating_write_reqs);
52
53
ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
54
if (ret < 0) {
55
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete_bh(void *opaque)
56
qed_release(s);
57
}
58
59
-static void qed_resume_alloc_bh(void *opaque)
60
-{
61
- qed_aio_start_io(opaque);
62
-}
63
-
64
static void qed_aio_complete(QEDAIOCB *acb, int ret)
65
{
66
BDRVQEDState *s = acb_to_s(acb);
67
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
68
* next request in the queue. This ensures that we don't cycle through
69
* requests multiple times but rather finish one at a time completely.
70
*/
71
- if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
72
- QEDAIOCB *next_acb;
73
- QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
74
- next_acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
75
- if (next_acb) {
76
- aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
77
- qed_resume_alloc_bh, next_acb);
78
+ if (acb == s->allocating_acb) {
79
+ s->allocating_acb = NULL;
80
+ if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
81
+ qemu_co_enter_next(&s->allocating_write_reqs);
82
} else if (s->header.features & QED_F_NEED_CHECK) {
83
qed_start_need_check_timer(s);
84
}
85
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
86
int ret;
87
88
/* Cancel timer when the first allocating request comes in */
89
- if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
90
+ if (s->allocating_acb == NULL) {
91
qed_cancel_need_check_timer(s);
92
}
34
}
93
35
94
/* Freeze this request if another allocating write is in progress */
95
- if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
96
- QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next);
97
- }
98
- if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
99
- s->allocating_write_reqs_plugged) {
100
- return -EINPROGRESS; /* wait for existing request to finish */
101
+ if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
102
+ if (s->allocating_acb != NULL) {
103
+ qemu_co_queue_wait(&s->allocating_write_reqs, NULL);
104
+ assert(s->allocating_acb == NULL);
105
+ }
106
+ s->allocating_acb = acb;
107
+ return -EAGAIN; /* start over with looking up table entries */
108
}
109
110
acb->cur_nclusters = qed_bytes_to_clusters(s,
111
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb)
112
ret = qed_aio_read_data(acb, ret, offset, len);
113
}
114
115
- if (ret < 0) {
116
- if (ret != -EINPROGRESS) {
117
- qed_aio_complete(acb, ret);
118
- }
119
+ if (ret < 0 && ret != -EAGAIN) {
120
+ qed_aio_complete(acb, ret);
121
return;
122
}
123
}
124
diff --git a/block/qed.h b/block/qed.h
125
index XXXXXXX..XXXXXXX 100644
126
--- a/block/qed.h
127
+++ b/block/qed.h
128
@@ -XXX,XX +XXX,XX @@ typedef struct {
129
uint32_t l2_mask;
130
131
/* Allocating write request queue */
132
- QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs;
133
+ QEDAIOCB *allocating_acb;
134
+ CoQueue allocating_write_reqs;
135
bool allocating_write_reqs_plugged;
136
137
/* Periodic flush and clear need check flag */
138
--
36
--
139
1.8.3.1
37
2.13.6
140
38
141
39
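A condensed sketch of the CoQueue-based serialisation introduced by the qed patch above (the helper names are invented for illustration; the fields and the qemu_co_queue_wait()/qemu_co_enter_next() primitives are the ones the hunks actually use, error handling is omitted, and the snippet is not meant to compile outside the QEMU tree):

    typedef struct BDRVQEDState BDRVQEDState;   /* trimmed to the relevant fields */
    struct BDRVQEDState {
        QEDAIOCB *allocating_acb;        /* request currently allocating clusters */
        CoQueue allocating_write_reqs;   /* coroutines waiting for their turn */
        bool allocating_write_reqs_plugged;
    };

    /* Allocating write path: wait for the current owner, then retry the lookup */
    static int coroutine_fn qed_wait_for_alloc(BDRVQEDState *s, QEDAIOCB *acb)
    {
        if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
            if (s->allocating_acb != NULL) {
                qemu_co_queue_wait(&s->allocating_write_reqs, NULL);
            }
            s->allocating_acb = acb;
            return -EAGAIN;      /* start over with looking up table entries */
        }
        return 0;
    }

    /* Completion path: wake the next waiter, still in coroutine context */
    static void qed_hand_over_alloc(BDRVQEDState *s, QEDAIOCB *acb)
    {
        if (acb == s->allocating_acb) {
            s->allocating_acb = NULL;
            if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
                qemu_co_enter_next(&s->allocating_write_reqs);
            }
        }
    }

Because the woken request is itself a coroutine, no bottom half is needed to get back into coroutine context any more, which is what lets qed_resume_alloc_bh() go away.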
1
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
1
Drain requests are propagated to child nodes, parent nodes and directly
2
just return an error code and let the caller handle it.
2
to the AioContext. The order in which this happened was different
3
between all combinations of drain/drain_all and begin/end.
3
4
4
While refactoring qed_aio_write_alloc() to accommodate the change,
5
The correct order is to keep children only drained when their parents
5
qed_aio_write_zero_cluster() ended up with a single line, so I chose to
6
are also drained. This means that at the start of a drained section, the
6
inline that line and remove the function completely.
7
AioContext needs to be drained first, the parents second and only then
8
the children. The correct order for the end of a drained section is the
9
opposite.
10
11
This patch changes the three other functions to follow the example of
12
bdrv_drained_begin(), which is the only one that got it right.
7
13
8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
14
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
15
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
---
16
---
11
block/qed.c | 58 +++++++++++++++++++++-------------------------------------
17
block/io.c | 12 ++++++++----
12
1 file changed, 21 insertions(+), 37 deletions(-)
18
1 file changed, 8 insertions(+), 4 deletions(-)
13
19
14
diff --git a/block/qed.c b/block/qed.c
20
diff --git a/block/io.c b/block/io.c
15
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
16
--- a/block/qed.c
22
--- a/block/io.c
17
+++ b/block/qed.c
23
+++ b/block/io.c
18
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_main(QEDAIOCB *acb)
24
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
19
/**
25
return;
20
* Populate untouched regions of new data cluster
21
*/
22
-static void qed_aio_write_cow(void *opaque, int ret)
23
+static int qed_aio_write_cow(QEDAIOCB *acb)
24
{
25
- QEDAIOCB *acb = opaque;
26
BDRVQEDState *s = acb_to_s(acb);
27
uint64_t start, len, offset;
28
+ int ret;
29
30
/* Populate front untouched region of new data cluster */
31
start = qed_start_of_cluster(s, acb->cur_pos);
32
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_cow(void *opaque, int ret)
33
34
trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
35
ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
36
- if (ret) {
37
- qed_aio_complete(acb, ret);
38
- return;
39
+ if (ret < 0) {
40
+ return ret;
41
}
26
}
42
27
43
/* Populate back untouched region of new data cluster */
28
+ /* Stop things in parent-to-child order */
44
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_cow(void *opaque, int ret)
29
if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
45
30
aio_disable_external(bdrv_get_aio_context(bs));
46
trace_qed_aio_write_postfill(s, acb, start, len, offset);
31
bdrv_parent_drained_begin(bs);
47
ret = qed_copy_from_backing_file(s, start, len, offset);
32
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
48
- if (ret) {
33
return;
49
- qed_aio_complete(acb, ret);
50
- return;
51
- }
52
-
53
- ret = qed_aio_write_main(acb);
54
if (ret < 0) {
55
- qed_aio_complete(acb, ret);
56
- return;
57
+ return ret;
58
}
34
}
59
- qed_aio_next_io(acb, 0);
35
60
+
36
- bdrv_parent_drained_end(bs);
61
+ return qed_aio_write_main(acb);
37
+ /* Re-enable things in child-to-parent order */
38
bdrv_drain_invoke(bs, false);
39
+ bdrv_parent_drained_end(bs);
40
aio_enable_external(bdrv_get_aio_context(bs));
62
}
41
}
63
42
64
/**
43
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
65
@@ -XXX,XX +XXX,XX @@ static bool qed_should_set_need_check(BDRVQEDState *s)
44
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
66
return !(s->header.features & QED_F_NEED_CHECK);
45
AioContext *aio_context = bdrv_get_aio_context(bs);
67
}
46
68
47
+ /* Stop things in parent-to-child order */
69
-static void qed_aio_write_zero_cluster(void *opaque, int ret)
48
aio_context_acquire(aio_context);
70
-{
49
- bdrv_parent_drained_begin(bs);
71
- QEDAIOCB *acb = opaque;
50
aio_disable_external(aio_context);
72
-
51
+ bdrv_parent_drained_begin(bs);
73
- if (ret) {
52
bdrv_drain_invoke(bs, true);
74
- qed_aio_complete(acb, ret);
53
aio_context_release(aio_context);
75
- return;
54
76
- }
55
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
77
-
56
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
78
- ret = qed_aio_write_l2_update(acb, 1);
57
AioContext *aio_context = bdrv_get_aio_context(bs);
79
- if (ret < 0) {
58
80
- qed_aio_complete(acb, ret);
59
+ /* Re-enable things in child-to-parent order */
81
- return;
60
aio_context_acquire(aio_context);
82
- }
61
- aio_enable_external(aio_context);
83
- qed_aio_next_io(acb, 0);
62
- bdrv_parent_drained_end(bs);
84
-}
63
bdrv_drain_invoke(bs, false);
85
-
64
+ bdrv_parent_drained_end(bs);
86
/**
65
+ aio_enable_external(aio_context);
87
* Write new data cluster
66
aio_context_release(aio_context);
88
*
89
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_zero_cluster(void *opaque, int ret)
90
static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
91
{
92
BDRVQEDState *s = acb_to_s(acb);
93
- BlockCompletionFunc *cb;
94
int ret;
95
96
/* Cancel timer when the first allocating request comes in */
97
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
98
qed_aio_start_io(acb);
99
return;
100
}
101
-
102
- cb = qed_aio_write_zero_cluster;
103
} else {
104
- cb = qed_aio_write_cow;
105
acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
106
}
67
}
107
68
108
if (qed_should_set_need_check(s)) {
109
s->header.features |= QED_F_NEED_CHECK;
110
ret = qed_write_header(s);
111
- cb(acb, ret);
112
+ if (ret < 0) {
113
+ qed_aio_complete(acb, ret);
114
+ return;
115
+ }
116
+ }
117
+
118
+ if (acb->flags & QED_AIOCB_ZERO) {
119
+ ret = qed_aio_write_l2_update(acb, 1);
120
} else {
121
- cb(acb, 0);
122
+ ret = qed_aio_write_cow(acb);
123
}
124
+ if (ret < 0) {
125
+ qed_aio_complete(acb, ret);
126
+ return;
127
+ }
128
+ qed_aio_next_io(acb, 0);
129
}
130
131
/**
132
--
69
--
133
1.8.3.1
70
2.13.6
134
71
135
72
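The ordering rule from the "Drain requests are propagated..." patch above, pulled out of the interleaved hunks (the wrapper functions are hypothetical; the call sequences are the ones block/io.c uses after the patch):

    /* Begin a drained section: stop things in parent-to-child order */
    static void drained_begin_order(BlockDriverState *bs)
    {
        aio_disable_external(bdrv_get_aio_context(bs));
        bdrv_parent_drained_begin(bs);
        bdrv_drain_invoke(bs, true);
    }

    /* End a drained section: re-enable things in child-to-parent order */
    static void drained_end_order(BlockDriverState *bs)
    {
        bdrv_drain_invoke(bs, false);
        bdrv_parent_drained_end(bs);
        aio_enable_external(bdrv_get_aio_context(bs));
    }

The same parent-to-child/child-to-parent symmetry is applied to the per-node loops in bdrv_drain_all_begin() and bdrv_drain_all_end().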
1
Now that we process a request in the same coroutine from beginning to
1
Commit 15afd94a047 added code to acquire and release the AioContext in
2
end and don't drop out of it any more, we can look like a proper
2
qemuio_command(). This means that the lock is taken twice now in the
3
coroutine-based driver and simply call qed_aio_next_io() and get a
3
call path from hmp_qemu_io(). This causes BDRV_POLL_WHILE() to hang for
4
return value from it instead of spawning an additional coroutine that
4
any requests issued to nodes in a non-mainloop AioContext.
5
reenters the parent when it's done.
5
6
Dropping the first locking from hmp_qemu_io() fixes the problem.
6
7
7
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
---
10
---
10
block/qed.c | 101 +++++++++++++-----------------------------------------------
11
hmp.c | 6 ------
11
block/qed.h | 3 +-
12
1 file changed, 6 deletions(-)
12
2 files changed, 22 insertions(+), 82 deletions(-)
13
13
14
diff --git a/block/qed.c b/block/qed.c
14
diff --git a/hmp.c b/hmp.c
15
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
16
--- a/block/qed.c
16
--- a/hmp.c
17
+++ b/block/qed.c
17
+++ b/hmp.c
18
@@ -XXX,XX +XXX,XX @@
18
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
19
#include "qapi/qmp/qerror.h"
20
#include "sysemu/block-backend.h"
21
22
-static const AIOCBInfo qed_aiocb_info = {
23
- .aiocb_size = sizeof(QEDAIOCB),
24
-};
25
-
26
static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
27
const char *filename)
28
{
19
{
29
@@ -XXX,XX +XXX,XX @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
20
BlockBackend *blk;
30
return l2_table;
21
BlockBackend *local_blk = NULL;
31
}
22
- AioContext *aio_context;
32
23
const char* device = qdict_get_str(qdict, "device");
33
-static void qed_aio_next_io(QEDAIOCB *acb);
24
const char* command = qdict_get_str(qdict, "command");
34
-
25
Error *err = NULL;
35
-static void qed_aio_start_io(QEDAIOCB *acb)
26
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
36
-{
37
- qed_aio_next_io(acb);
38
-}
39
-
40
static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
41
{
42
assert(!s->allocating_write_reqs_plugged);
43
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
44
45
static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
46
{
47
- return acb->common.bs->opaque;
48
+ return acb->bs->opaque;
49
}
50
51
/**
52
@@ -XXX,XX +XXX,XX @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
53
}
54
}
55
56
-static void qed_aio_complete_bh(void *opaque)
57
-{
58
- QEDAIOCB *acb = opaque;
59
- BDRVQEDState *s = acb_to_s(acb);
60
- BlockCompletionFunc *cb = acb->common.cb;
61
- void *user_opaque = acb->common.opaque;
62
- int ret = acb->bh_ret;
63
-
64
- qemu_aio_unref(acb);
65
-
66
- /* Invoke callback */
67
- qed_acquire(s);
68
- cb(user_opaque, ret);
69
- qed_release(s);
70
-}
71
-
72
-static void qed_aio_complete(QEDAIOCB *acb, int ret)
73
+static void qed_aio_complete(QEDAIOCB *acb)
74
{
75
BDRVQEDState *s = acb_to_s(acb);
76
77
- trace_qed_aio_complete(s, acb, ret);
78
-
79
/* Free resources */
80
qemu_iovec_destroy(&acb->cur_qiov);
81
qed_unref_l2_cache_entry(acb->request.l2_table);
82
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
83
acb->qiov->iov[0].iov_base = NULL;
84
}
85
86
- /* Arrange for a bh to invoke the completion function */
87
- acb->bh_ret = ret;
88
- aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
89
- qed_aio_complete_bh, acb);
90
-
91
/* Start next allocating write request waiting behind this one. Note that
92
* requests enqueue themselves when they first hit an unallocated cluster
93
* but they wait until the entire request is finished before waking up the
94
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
95
struct iovec *iov = acb->qiov->iov;
96
97
if (!iov->iov_base) {
98
- iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len);
99
+ iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
100
if (iov->iov_base == NULL) {
101
return -ENOMEM;
102
}
103
@@ -XXX,XX +XXX,XX @@ static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
104
{
105
QEDAIOCB *acb = opaque;
106
BDRVQEDState *s = acb_to_s(acb);
107
- BlockDriverState *bs = acb->common.bs;
108
+ BlockDriverState *bs = acb->bs;
109
110
/* Adjust offset into cluster */
111
offset += qed_offset_into_cluster(s, acb->cur_pos);
112
@@ -XXX,XX +XXX,XX @@ static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
113
/**
114
* Begin next I/O or complete the request
115
*/
116
-static void qed_aio_next_io(QEDAIOCB *acb)
117
+static int qed_aio_next_io(QEDAIOCB *acb)
118
{
119
BDRVQEDState *s = acb_to_s(acb);
120
uint64_t offset;
121
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb)
122
123
/* Complete request */
124
if (acb->cur_pos >= acb->end_pos) {
125
- qed_aio_complete(acb, 0);
126
- return;
127
+ ret = 0;
128
+ break;
129
}
130
131
/* Find next cluster and start I/O */
132
len = acb->end_pos - acb->cur_pos;
133
ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
134
if (ret < 0) {
135
- qed_aio_complete(acb, ret);
136
- return;
137
+ break;
138
}
139
140
if (acb->flags & QED_AIOCB_WRITE) {
141
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb)
142
}
143
144
if (ret < 0 && ret != -EAGAIN) {
145
- qed_aio_complete(acb, ret);
146
- return;
147
+ break;
148
}
27
}
149
}
28
}
150
-}
29
151
30
- aio_context = blk_get_aio_context(blk);
152
-typedef struct QEDRequestCo {
31
- aio_context_acquire(aio_context);
153
- Coroutine *co;
154
- bool done;
155
- int ret;
156
-} QEDRequestCo;
157
-
32
-
158
-static void qed_co_request_cb(void *opaque, int ret)
33
/*
159
-{
34
* Notably absent: Proper permission management. This is sad, but it seems
160
- QEDRequestCo *co = opaque;
35
* almost impossible to achieve without changing the semantics and thereby
36
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
37
*/
38
qemuio_command(blk, command);
39
40
- aio_context_release(aio_context);
161
-
41
-
162
- co->done = true;
42
fail:
163
- co->ret = ret;
43
blk_unref(local_blk);
164
- qemu_coroutine_enter_if_inactive(co->co);
44
hmp_handle_error(mon, &err);
165
+ trace_qed_aio_complete(s, acb, ret);
166
+ qed_aio_complete(acb);
167
+ return ret;
168
}
169
170
static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num,
171
QEMUIOVector *qiov, int nb_sectors,
172
int flags)
173
{
174
- QEDRequestCo co = {
175
- .co = qemu_coroutine_self(),
176
- .done = false,
177
+ QEDAIOCB acb = {
178
+ .bs = bs,
179
+ .cur_pos = (uint64_t) sector_num * BDRV_SECTOR_SIZE,
180
+ .end_pos = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
181
+ .qiov = qiov,
182
+ .flags = flags,
183
};
184
- QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, qed_co_request_cb, &co);
185
-
186
- trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, &co, flags);
187
+ qemu_iovec_init(&acb.cur_qiov, qiov->niov);
188
189
- acb->flags = flags;
190
- acb->qiov = qiov;
191
- acb->qiov_offset = 0;
192
- acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
193
- acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
194
- acb->backing_qiov = NULL;
195
- acb->request.l2_table = NULL;
196
- qemu_iovec_init(&acb->cur_qiov, qiov->niov);
197
+ trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);
198
199
/* Start request */
200
- qed_aio_start_io(acb);
201
-
202
- if (!co.done) {
203
- qemu_coroutine_yield();
204
- }
205
-
206
- return co.ret;
207
+ return qed_aio_next_io(&acb);
208
}
209
210
static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
211
diff --git a/block/qed.h b/block/qed.h
212
index XXXXXXX..XXXXXXX 100644
213
--- a/block/qed.h
214
+++ b/block/qed.h
215
@@ -XXX,XX +XXX,XX @@ enum {
216
};
217
218
typedef struct QEDAIOCB {
219
- BlockAIOCB common;
220
- int bh_ret; /* final return status for completion bh */
221
+ BlockDriverState *bs;
222
QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */
223
int flags; /* QED_AIOCB_* bits ORed together */
224
uint64_t end_pos; /* request end on block device, in bytes */
225
--
45
--
226
1.8.3.1
46
2.13.6
227
47
228
48
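The net effect of the qed patch above is that a request is now issued and completed entirely within the calling coroutine, with the ACB on the stack and the return value propagated directly; a condensed sketch (function name hypothetical, initialisers as in the hunk):

    static int coroutine_fn qed_co_request_sketch(BlockDriverState *bs,
                                                  int64_t sector_num,
                                                  QEMUIOVector *qiov,
                                                  int nb_sectors, int flags)
    {
        QEDAIOCB acb = {
            .bs      = bs,
            .cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE,
            .end_pos = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
            .qiov    = qiov,
            .flags   = flags,
        };
        qemu_iovec_init(&acb.cur_qiov, qiov->niov);

        /* Runs the whole read/write loop in this coroutine; no completion
         * callback, no extra coroutine, no bottom half. */
        return qed_aio_next_io(&acb);
    }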
1
From: Alberto Garcia <berto@igalia.com>
1
From: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
2
2
3
Qcow2COWRegion has two attributes:
3
Since bdrv_co_preadv does all necessary checks including
4
reading after the end of the backing file, avoid duplication
5
of verification before bdrv_co_preadv call.
4
6
5
- The offset of the COW region from the start of the first cluster
7
Signed-off-by: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
6
touched by the I/O request. Since it's always going to be positive
8
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
7
and the maximum request size is at most INT_MAX, we can use a
8
regular unsigned int to store this offset.
9
10
- The size of the COW region in bytes. This is guaranteed to be >= 0,
11
so we should use an unsigned type instead.
12
13
In x86_64 this reduces the size of Qcow2COWRegion from 16 to 8 bytes.
14
It will also help keep some assertions simpler now that we know that
15
there are no negative numbers.
16
17
The prototype of do_perform_cow() is also updated to reflect these
18
changes.
19
20
Signed-off-by: Alberto Garcia <berto@igalia.com>
21
Reviewed-by: Eric Blake <eblake@redhat.com>
9
Reviewed-by: Eric Blake <eblake@redhat.com>
22
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
23
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
10
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
24
---
11
---
25
block/qcow2-cluster.c | 4 ++--
12
block/qcow2.h | 3 ---
26
block/qcow2.h | 4 ++--
13
block/qcow2.c | 51 ++++++++-------------------------------------------
27
2 files changed, 4 insertions(+), 4 deletions(-)
14
2 files changed, 8 insertions(+), 46 deletions(-)
28
15
29
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
30
index XXXXXXX..XXXXXXX 100644
31
--- a/block/qcow2-cluster.c
32
+++ b/block/qcow2-cluster.c
33
@@ -XXX,XX +XXX,XX @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
34
static int coroutine_fn do_perform_cow(BlockDriverState *bs,
35
uint64_t src_cluster_offset,
36
uint64_t cluster_offset,
37
- int offset_in_cluster,
38
- int bytes)
39
+ unsigned offset_in_cluster,
40
+ unsigned bytes)
41
{
42
BDRVQcow2State *s = bs->opaque;
43
QEMUIOVector qiov;
44
diff --git a/block/qcow2.h b/block/qcow2.h
16
diff --git a/block/qcow2.h b/block/qcow2.h
45
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
46
--- a/block/qcow2.h
18
--- a/block/qcow2.h
47
+++ b/block/qcow2.h
19
+++ b/block/qcow2.h
48
@@ -XXX,XX +XXX,XX @@ typedef struct Qcow2COWRegion {
20
@@ -XXX,XX +XXX,XX @@ uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset)
49
* Offset of the COW region in bytes from the start of the first cluster
21
}
50
* touched by the request.
22
51
*/
23
/* qcow2.c functions */
52
- uint64_t offset;
24
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
53
+ unsigned offset;
25
- int64_t sector_num, int nb_sectors);
54
26
-
55
/** Number of bytes to copy */
27
int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
56
- int nb_bytes;
28
int refcount_order, bool generous_increase,
57
+ unsigned nb_bytes;
29
uint64_t *refblock_count);
58
} Qcow2COWRegion;
30
diff --git a/block/qcow2.c b/block/qcow2.c
59
31
index XXXXXXX..XXXXXXX 100644
60
/**
32
--- a/block/qcow2.c
33
+++ b/block/qcow2.c
34
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
35
return status;
36
}
37
38
-/* handle reading after the end of the backing file */
39
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
40
- int64_t offset, int bytes)
41
-{
42
- uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE;
43
- int n1;
44
-
45
- if ((offset + bytes) <= bs_size) {
46
- return bytes;
47
- }
48
-
49
- if (offset >= bs_size) {
50
- n1 = 0;
51
- } else {
52
- n1 = bs_size - offset;
53
- }
54
-
55
- qemu_iovec_memset(qiov, n1, 0, bytes - n1);
56
-
57
- return n1;
58
-}
59
-
60
static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
61
uint64_t bytes, QEMUIOVector *qiov,
62
int flags)
63
{
64
BDRVQcow2State *s = bs->opaque;
65
- int offset_in_cluster, n1;
66
+ int offset_in_cluster;
67
int ret;
68
unsigned int cur_bytes; /* number of bytes in current iteration */
69
uint64_t cluster_offset = 0;
70
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
71
case QCOW2_CLUSTER_UNALLOCATED:
72
73
if (bs->backing) {
74
- /* read from the base image */
75
- n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov,
76
- offset, cur_bytes);
77
- if (n1 > 0) {
78
- QEMUIOVector local_qiov;
79
-
80
- qemu_iovec_init(&local_qiov, hd_qiov.niov);
81
- qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1);
82
-
83
- BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
84
- qemu_co_mutex_unlock(&s->lock);
85
- ret = bdrv_co_preadv(bs->backing, offset, n1,
86
- &local_qiov, 0);
87
- qemu_co_mutex_lock(&s->lock);
88
-
89
- qemu_iovec_destroy(&local_qiov);
90
-
91
- if (ret < 0) {
92
- goto fail;
93
- }
94
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
95
+ qemu_co_mutex_unlock(&s->lock);
96
+ ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
97
+ &hd_qiov, 0);
98
+ qemu_co_mutex_lock(&s->lock);
99
+ if (ret < 0) {
100
+ goto fail;
101
}
102
} else {
103
/* Note: in this case, no need to wait */
61
--
104
--
62
1.8.3.1
105
2.13.6
63
106
64
107
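For reference, the qcow2 read path that remains after the patch above drops qcow2_backing_read1() is just the plain backing-file read below; it relies on bdrv_co_preadv() zero-filling the part of the request that lies beyond the end of the backing file, as the commit message explains (fragment from the QCOW2_CLUSTER_UNALLOCATED case, with s->lock held by the caller):

    if (bs->backing) {
        BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
        qemu_co_mutex_unlock(&s->lock);
        ret = bdrv_co_preadv(bs->backing, offset, cur_bytes, &hd_qiov, 0);
        qemu_co_mutex_lock(&s->lock);
        if (ret < 0) {
            goto fail;
        }
    }
    /* (without a backing file the buffer is simply zero-filled instead) */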
1
This adds documentation for the -blockdev options that apply to all
1
Removing a quorum child node with x-blockdev-change results in a quorum
2
nodes independent of the block driver used.
2
driver state that cannot be recreated with create options because it
3
would require a list with gaps. This causes trouble in at least
4
.bdrv_refresh_filename().
3
5
4
All options that are shared by -blockdev and -drive are now explained in
6
Document this problem so that we won't accidentally mark the command
5
the section for -blockdev. The documentation of -drive mentions that all
7
stable without having addressed it.
6
-blockdev options are accepted as well.
7
8
8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Reviewed-by: Eric Blake <eblake@redhat.com>
10
Reviewed-by: Alberto Garcia <berto@igalia.com>
10
Reviewed-by: Max Reitz <mreitz@redhat.com>
11
---
11
---
12
qemu-options.hx | 108 +++++++++++++++++++++++++++++++++++++++++---------------
12
qapi/block-core.json | 4 ++++
13
1 file changed, 79 insertions(+), 29 deletions(-)
13
1 file changed, 4 insertions(+)
14
14
15
diff --git a/qemu-options.hx b/qemu-options.hx
15
diff --git a/qapi/block-core.json b/qapi/block-core.json
16
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
17
--- a/qemu-options.hx
17
--- a/qapi/block-core.json
18
+++ b/qemu-options.hx
18
+++ b/qapi/block-core.json
19
@@ -XXX,XX +XXX,XX @@ DEF("blockdev", HAS_ARG, QEMU_OPTION_blockdev,
19
@@ -XXX,XX +XXX,XX @@
20
" [,read-only=on|off][,detect-zeroes=on|off|unmap]\n"
20
# does not support all kinds of operations, all kinds of children, nor
21
" [,driver specific parameters...]\n"
21
# all block drivers.
22
" configure a block backend\n", QEMU_ARCH_ALL)
22
#
23
+STEXI
23
+# FIXME Removing children from a quorum node means introducing gaps in the
24
+@item -blockdev @var{option}[,@var{option}[,@var{option}[,...]]]
24
+# child indices. This cannot be represented in the 'children' list of
25
+@findex -blockdev
25
+# BlockdevOptionsQuorum, as returned by .bdrv_refresh_filename().
26
+
26
+#
27
+Define a new block driver node.
27
# Warning: The data in a new quorum child MUST be consistent with that of
28
+
28
# the rest of the array.
29
+@table @option
29
#
30
+@item Valid options for any block driver node:
31
+
32
+@table @code
33
+@item driver
34
+Specifies the block driver to use for the given node.
35
+@item node-name
36
+This defines the name of the block driver node by which it will be referenced
37
+later. The name must be unique, i.e. it must not match the name of a different
38
+block driver node, or (if you use @option{-drive} as well) the ID of a drive.
39
+
40
+If no node name is specified, it is automatically generated. The generated node
41
+name is not intended to be predictable and changes between QEMU invocations.
42
+For the top level, an explicit node name must be specified.
43
+@item read-only
44
+Open the node read-only. Guest write attempts will fail.
45
+@item cache.direct
46
+The host page cache can be avoided with @option{cache.direct=on}. This will
47
+attempt to do disk IO directly to the guest's memory. QEMU may still perform an
48
+internal copy of the data.
49
+@item cache.no-flush
50
+In case you don't care about data integrity over host failures, you can use
51
+@option{cache.no-flush=on}. This option tells QEMU that it never needs to write
52
+any data to the disk but can instead keep things in cache. If anything goes
53
+wrong, like your host losing power, the disk storage getting disconnected
54
+accidentally, etc. your image will most probably be rendered unusable.
55
+@item discard=@var{discard}
56
+@var{discard} is one of "ignore" (or "off") or "unmap" (or "on") and controls
57
+whether @code{discard} (also known as @code{trim} or @code{unmap}) requests are
58
+ignored or passed to the filesystem. Some machine types may not support
59
+discard requests.
60
+@item detect-zeroes=@var{detect-zeroes}
61
+@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic
62
+conversion of plain zero writes by the OS to driver specific optimized
63
+zero write commands. You may even choose "unmap" if @var{discard} is set
64
+to "unmap" to allow a zero write to be converted to an @code{unmap} operation.
65
+@end table
66
+
67
+@end table
68
+
69
+ETEXI
70
71
DEF("drive", HAS_ARG, QEMU_OPTION_drive,
72
"-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n"
73
@@ -XXX,XX +XXX,XX @@ STEXI
74
@item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
75
@findex -drive
76
77
-Define a new drive. Valid options are:
78
+Define a new drive. This includes creating a block driver node (the backend) as
79
+well as a guest device, and is mostly a shortcut for defining the corresponding
80
+@option{-blockdev} and @option{-device} options.
81
+
82
+@option{-drive} accepts all options that are accepted by @option{-blockdev}. In
83
+addition, it knows the following options:
84
85
@table @option
86
@item file=@var{file}
87
@@ -XXX,XX +XXX,XX @@ These options have the same definition as they have in @option{-hdachs}.
88
@var{snapshot} is "on" or "off" and controls snapshot mode for the given drive
89
(see @option{-snapshot}).
90
@item cache=@var{cache}
91
-@var{cache} is "none", "writeback", "unsafe", "directsync" or "writethrough" and controls how the host cache is used to access block data.
92
+@var{cache} is "none", "writeback", "unsafe", "directsync" or "writethrough"
93
+and controls how the host cache is used to access block data. This is a
94
+shortcut that sets the @option{cache.direct} and @option{cache.no-flush}
95
+options (as in @option{-blockdev}), and additionally @option{cache.writeback},
96
+which provides a default for the @option{write-cache} option of block guest
97
+devices (as in @option{-device}). The modes correspond to the following
98
+settings:
99
+
100
+@c Our texi2pod.pl script doesn't support @multitable, so fall back to using
101
+@c plain ASCII art (well, UTF-8 art really). This looks okay both in the manpage
102
+@c and the HTML output.
103
+@example
104
+@ │ cache.writeback cache.direct cache.no-flush
105
+─────────────┼─────────────────────────────────────────────────
106
+writeback │ on off off
107
+none │ on on off
108
+writethrough │ off off off
109
+directsync │ off on off
110
+unsafe │ on off on
111
+@end example
112
+
113
+The default mode is @option{cache=writeback}.
114
+
115
@item aio=@var{aio}
116
@var{aio} is "threads", or "native" and selects between pthread based disk I/O and native Linux AIO.
117
-@item discard=@var{discard}
118
-@var{discard} is one of "ignore" (or "off") or "unmap" (or "on") and controls whether @dfn{discard} (also known as @dfn{trim} or @dfn{unmap}) requests are ignored or passed to the filesystem. Some machine types may not support discard requests.
119
@item format=@var{format}
120
Specify which disk @var{format} will be used rather than detecting
121
the format. Can be used to specify format=raw to avoid interpreting
122
@@ -XXX,XX +XXX,XX @@ Specify which @var{action} to take on write and read errors. Valid actions are:
123
"report" (report the error to the guest), "enospc" (pause QEMU only if the
124
host disk is full; report the error to the guest otherwise).
125
The default setting is @option{werror=enospc} and @option{rerror=report}.
126
-@item readonly
127
-Open drive @option{file} as read-only. Guest write attempts will fail.
128
@item copy-on-read=@var{copy-on-read}
129
@var{copy-on-read} is "on" or "off" and enables whether to copy read backing
130
file sectors into the image file.
131
-@item detect-zeroes=@var{detect-zeroes}
132
-@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic
133
-conversion of plain zero writes by the OS to driver specific optimized
134
-zero write commands. You may even choose "unmap" if @var{discard} is set
135
-to "unmap" to allow a zero write to be converted to an UNMAP operation.
136
@item bps=@var{b},bps_rd=@var{r},bps_wr=@var{w}
137
Specify bandwidth throttling limits in bytes per second, either for all request
138
types or for reads or writes only. Small values can lead to timeouts or hangs
139
@@ -XXX,XX +XXX,XX @@ prevent guests from circumventing throttling limits by using many small disks
140
instead of a single larger disk.
141
@end table
142
143
-By default, the @option{cache=writeback} mode is used. It will report data
144
+By default, the @option{cache.writeback=on} mode is used. It will report data
145
writes as completed as soon as the data is present in the host page cache.
146
This is safe as long as your guest OS makes sure to correctly flush disk caches
147
where needed. If your guest OS does not handle volatile disk write caches
148
correctly and your host crashes or loses power, then the guest may experience
149
data corruption.
150
151
-For such guests, you should consider using @option{cache=writethrough}. This
152
+For such guests, you should consider using @option{cache.writeback=off}. This
153
means that the host page cache will be used to read and write data, but write
154
notification will be sent to the guest only after QEMU has made sure to flush
155
each write to the disk. Be aware that this has a major impact on performance.
156
157
-The host page cache can be avoided entirely with @option{cache=none}. This will
158
-attempt to do disk IO directly to the guest's memory. QEMU may still perform
159
-an internal copy of the data. Note that this is considered a writeback mode and
160
-the guest OS must handle the disk write cache correctly in order to avoid data
161
-corruption on host crashes.
162
-
163
-The host page cache can be avoided while only sending write notifications to
164
-the guest when the data has been flushed to the disk using
165
-@option{cache=directsync}.
166
-
167
-In case you don't care about data integrity over host failures, use
168
-@option{cache=unsafe}. This option tells QEMU that it never needs to write any
169
-data to the disk but can instead keep things in cache. If anything goes wrong,
170
-like your host losing power, the disk storage getting disconnected accidentally,
171
-etc. your image will most probably be rendered unusable. When using
172
-the @option{-snapshot} option, unsafe caching is always used.
173
+When using the @option{-snapshot} option, unsafe caching is always used.
174
175
Copy-on-read avoids accessing the same backing file sectors repeatedly and is
176
useful when the backing file is over a slow network. By default copy-on-read
177
--
30
--
178
1.8.3.1
31
2.13.6
179
32
180
33
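As a quick illustration of the options documented in the qemu-options.hx patch above, a -blockdev based setup could look like the following (file name, node names and the virtio-blk frontend are examples only, not part of the patch):

    qemu-system-x86_64 \
        -blockdev driver=file,node-name=disk0-file,filename=test.qcow2 \
        -blockdev driver=qcow2,node-name=disk0,file=disk0-file,read-only=off,cache.direct=on \
        -device virtio-blk-pci,drive=disk0

Here node-name gives each layer an explicit name, file= on the qcow2 node refers to the file node underneath it, and cache.direct/read-only behave as described above.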
1
From: Stephen Bates <sbates@raithlin.com>
1
From: Doug Gale <doug16k@gmail.com>
2
2
3
Add the ability for the NVMe model to support both the RDS and WDS
3
Add trace output for commands, errors, and undefined behavior.
4
modes in the Controller Memory Buffer.
4
Add guest error log output for undefined behavior.
5
Report invalid undefined accesses to MMIO.
6
Annotate unlikely error checks with unlikely.
5
7
6
Although not currently supported in the upstreamed Linux kernel, a fork
8
Signed-off-by: Doug Gale <doug16k@gmail.com>
7
with support exists [1] and user-space test programs that build on
9
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
8
this also exist [2].
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
10
Useful for testing CMB functionality in preparation for real CMB
11
enabled NVMe devices (coming soon).
12
13
[1] https://github.com/sbates130272/linux-p2pmem
14
[2] https://github.com/sbates130272/p2pmem-test
15
16
Signed-off-by: Stephen Bates <sbates@raithlin.com>
17
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
18
Reviewed-by: Keith Busch <keith.busch@intel.com>
19
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
20
---
12
---
21
hw/block/nvme.c | 83 +++++++++++++++++++++++++++++++++++++++------------------
13
hw/block/nvme.c | 349 ++++++++++++++++++++++++++++++++++++++++++--------
22
hw/block/nvme.h | 1 +
14
hw/block/trace-events | 93 ++++++++++++++
23
2 files changed, 58 insertions(+), 26 deletions(-)
15
2 files changed, 390 insertions(+), 52 deletions(-)
24
16
25
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
17
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
26
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
27
--- a/hw/block/nvme.c
19
--- a/hw/block/nvme.c
28
+++ b/hw/block/nvme.c
20
+++ b/hw/block/nvme.c
29
@@ -XXX,XX +XXX,XX @@
21
@@ -XXX,XX +XXX,XX @@
30
* cmb_size_mb=<cmb_size_mb[optional]>
22
#include "qapi/visitor.h"
31
*
23
#include "sysemu/block-backend.h"
32
* Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
24
33
- * offset 0 in BAR2 and supports SQS only for now.
25
+#include "qemu/log.h"
34
+ * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
26
+#include "trace.h"
35
*/
27
#include "nvme.h"
36
28
37
#include "qemu/osdep.h"
29
+#define NVME_GUEST_ERR(trace, fmt, ...) \
30
+ do { \
31
+ (trace_##trace)(__VA_ARGS__); \
32
+ qemu_log_mask(LOG_GUEST_ERROR, #trace \
33
+ " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
34
+ } while (0)
35
+
36
static void nvme_process_sq(void *opaque);
37
38
static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
38
@@ -XXX,XX +XXX,XX @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
39
@@ -XXX,XX +XXX,XX @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
39
}
40
}
41
42
-static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
43
- uint32_t len, NvmeCtrl *n)
44
+static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
45
+ uint64_t prp2, uint32_t len, NvmeCtrl *n)
46
{
40
{
47
hwaddr trans_len = n->page_size - (prp1 % n->page_size);
41
if (cq->irq_enabled) {
42
if (msix_enabled(&(n->parent_obj))) {
43
+ trace_nvme_irq_msix(cq->vector);
44
msix_notify(&(n->parent_obj), cq->vector);
45
} else {
46
+ trace_nvme_irq_pin();
47
pci_irq_pulse(&n->parent_obj);
48
}
49
+ } else {
50
+ trace_nvme_irq_masked();
51
}
52
}
53
54
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
48
trans_len = MIN(len, trans_len);
55
trans_len = MIN(len, trans_len);
49
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
56
int num_prps = (len >> n->page_bits) + 1;
50
57
51
if (!prp1) {
58
- if (!prp1) {
52
return NVME_INVALID_FIELD | NVME_DNR;
59
+ if (unlikely(!prp1)) {
53
+ } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
60
+ trace_nvme_err_invalid_prp();
54
+ prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
61
return NVME_INVALID_FIELD | NVME_DNR;
55
+ qsg->nsg = 0;
62
} else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
56
+ qemu_iovec_init(iov, num_prps);
63
prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
57
+ qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
64
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
58
+ } else {
65
}
59
+ pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
60
+ qemu_sglist_add(qsg, prp1, trans_len);
61
}
62
-
63
- pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
64
- qemu_sglist_add(qsg, prp1, trans_len);
65
len -= trans_len;
66
len -= trans_len;
66
if (len) {
67
if (len) {
67
if (!prp2) {
68
- if (!prp2) {
68
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
69
+ if (unlikely(!prp2)) {
69
70
+ trace_nvme_err_invalid_prp2_missing();
70
nents = (len + n->page_size - 1) >> n->page_bits;
71
goto unmap;
71
prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
72
}
72
- pci_dma_read(&n->parent_obj, prp2, (void *)prp_list, prp_trans);
73
if (len > n->page_size) {
73
+ nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
74
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
74
while (len != 0) {
75
uint64_t prp_ent = le64_to_cpu(prp_list[i]);
75
uint64_t prp_ent = le64_to_cpu(prp_list[i]);
76
76
77
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
77
if (i == n->max_prp_ents - 1 && len > n->page_size) {
78
i = 0;
78
- if (!prp_ent || prp_ent & (n->page_size - 1)) {
79
nents = (len + n->page_size - 1) >> n->page_bits;
79
+ if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
80
prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
80
+ trace_nvme_err_invalid_prplist_ent(prp_ent);
81
- pci_dma_read(&n->parent_obj, prp_ent, (void *)prp_list,
81
goto unmap;
82
+ nvme_addr_read(n, prp_ent, (void *)prp_list,
82
}
83
prp_trans);
83
84
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
84
prp_ent = le64_to_cpu(prp_list[i]);
85
prp_ent = le64_to_cpu(prp_list[i]);
85
}
86
}
86
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
87
88
- if (!prp_ent || prp_ent & (n->page_size - 1)) {
89
+ if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
90
+ trace_nvme_err_invalid_prplist_ent(prp_ent);
91
goto unmap;
87
}
92
}
88
93
89
trans_len = MIN(len, n->page_size);
94
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
90
- qemu_sglist_add(qsg, prp_ent, trans_len);
91
+ if (qsg->nsg){
92
+ qemu_sglist_add(qsg, prp_ent, trans_len);
93
+ } else {
94
+ qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
95
+ }
96
len -= trans_len;
97
i++;
95
i++;
98
}
96
}
99
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
97
} else {
100
if (prp2 & (n->page_size - 1)) {
98
- if (prp2 & (n->page_size - 1)) {
99
+ if (unlikely(prp2 & (n->page_size - 1))) {
100
+ trace_nvme_err_invalid_prp2_align(prp2);
101
goto unmap;
101
goto unmap;
102
}
102
}
103
- qemu_sglist_add(qsg, prp2, len);
103
if (qsg->nsg) {
104
+ if (qsg->nsg) {
104
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
105
+ qemu_sglist_add(qsg, prp2, len);
105
QEMUIOVector iov;
106
+ } else {
106
uint16_t status = NVME_SUCCESS;
107
+ qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
107
108
+ }
108
+ trace_nvme_dma_read(prp1, prp2);
109
}
109
+
110
}
110
if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
111
return NVME_INVALID_FIELD | NVME_DNR;
112
}
113
if (qsg.nsg > 0) {
114
- if (dma_buf_read(ptr, len, &qsg)) {
115
+ if (unlikely(dma_buf_read(ptr, len, &qsg))) {
116
+ trace_nvme_err_invalid_dma();
117
status = NVME_INVALID_FIELD | NVME_DNR;
118
}
119
qemu_sglist_destroy(&qsg);
120
} else {
121
- if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
122
+ if (unlikely(qemu_iovec_to_buf(&iov, 0, ptr, len) != len)) {
123
+ trace_nvme_err_invalid_dma();
124
status = NVME_INVALID_FIELD | NVME_DNR;
125
}
126
qemu_iovec_destroy(&iov);
127
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
128
uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
129
uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
130
131
- if (slba + nlb > ns->id_ns.nsze) {
132
+ if (unlikely(slba + nlb > ns->id_ns.nsze)) {
133
+ trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
134
return NVME_LBA_RANGE | NVME_DNR;
135
}
136
137
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
138
int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
139
enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
140
141
- if ((slba + nlb) > ns->id_ns.nsze) {
142
+ trace_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
143
+
144
+ if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
145
block_acct_invalid(blk_get_stats(n->conf.blk), acct);
146
+ trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
147
return NVME_LBA_RANGE | NVME_DNR;
148
}
149
150
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
151
NvmeNamespace *ns;
152
uint32_t nsid = le32_to_cpu(cmd->nsid);
153
154
- if (nsid == 0 || nsid > n->num_namespaces) {
155
+ if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
156
+ trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
157
return NVME_INVALID_NSID | NVME_DNR;
158
}
159
160
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
161
case NVME_CMD_READ:
162
return nvme_rw(n, ns, cmd, req);
163
default:
164
+ trace_nvme_err_invalid_opc(cmd->opcode);
165
return NVME_INVALID_OPCODE | NVME_DNR;
166
}
167
}
168
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
169
NvmeCQueue *cq;
170
uint16_t qid = le16_to_cpu(c->qid);
171
172
- if (!qid || nvme_check_sqid(n, qid)) {
173
+ if (unlikely(!qid || nvme_check_sqid(n, qid))) {
174
+ trace_nvme_err_invalid_del_sq(qid);
175
return NVME_INVALID_QID | NVME_DNR;
176
}
177
178
+ trace_nvme_del_sq(qid);
179
+
180
sq = n->sq[qid];
181
while (!QTAILQ_EMPTY(&sq->out_req_list)) {
182
req = QTAILQ_FIRST(&sq->out_req_list);
183
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
184
uint16_t qflags = le16_to_cpu(c->sq_flags);
185
uint64_t prp1 = le64_to_cpu(c->prp1);
186
187
- if (!cqid || nvme_check_cqid(n, cqid)) {
188
+ trace_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
189
+
190
+ if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
191
+ trace_nvme_err_invalid_create_sq_cqid(cqid);
192
return NVME_INVALID_CQID | NVME_DNR;
193
}
194
- if (!sqid || !nvme_check_sqid(n, sqid)) {
195
+ if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) {
196
+ trace_nvme_err_invalid_create_sq_sqid(sqid);
197
return NVME_INVALID_QID | NVME_DNR;
198
}
199
- if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
200
+ if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
201
+ trace_nvme_err_invalid_create_sq_size(qsize);
202
return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
203
}
204
- if (!prp1 || prp1 & (n->page_size - 1)) {
205
+ if (unlikely(!prp1 || prp1 & (n->page_size - 1))) {
206
+ trace_nvme_err_invalid_create_sq_addr(prp1);
207
return NVME_INVALID_FIELD | NVME_DNR;
208
}
209
- if (!(NVME_SQ_FLAGS_PC(qflags))) {
210
+ if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
211
+ trace_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
212
return NVME_INVALID_FIELD | NVME_DNR;
213
}
214
sq = g_malloc0(sizeof(*sq));
215
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
216
NvmeCQueue *cq;
217
uint16_t qid = le16_to_cpu(c->qid);
218
219
- if (!qid || nvme_check_cqid(n, qid)) {
220
+ if (unlikely(!qid || nvme_check_cqid(n, qid))) {
221
+ trace_nvme_err_invalid_del_cq_cqid(qid);
222
return NVME_INVALID_CQID | NVME_DNR;
223
}
224
225
cq = n->cq[qid];
226
- if (!QTAILQ_EMPTY(&cq->sq_list)) {
227
+ if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
228
+ trace_nvme_err_invalid_del_cq_notempty(qid);
229
return NVME_INVALID_QUEUE_DEL;
230
}
231
+ trace_nvme_del_cq(qid);
232
nvme_free_cq(cq, n);
111
return NVME_SUCCESS;
233
return NVME_SUCCESS;
112
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
234
}
113
uint64_t prp1, uint64_t prp2)
235
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
236
uint16_t qflags = le16_to_cpu(c->cq_flags);
237
uint64_t prp1 = le64_to_cpu(c->prp1);
238
239
- if (!cqid || !nvme_check_cqid(n, cqid)) {
240
+ trace_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
241
+ NVME_CQ_FLAGS_IEN(qflags) != 0);
242
+
243
+ if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) {
244
+ trace_nvme_err_invalid_create_cq_cqid(cqid);
245
return NVME_INVALID_CQID | NVME_DNR;
246
}
247
- if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
248
+ if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
249
+ trace_nvme_err_invalid_create_cq_size(qsize);
250
return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
251
}
252
- if (!prp1) {
253
+ if (unlikely(!prp1)) {
254
+ trace_nvme_err_invalid_create_cq_addr(prp1);
255
return NVME_INVALID_FIELD | NVME_DNR;
256
}
257
- if (vector > n->num_queues) {
258
+ if (unlikely(vector > n->num_queues)) {
259
+ trace_nvme_err_invalid_create_cq_vector(vector);
260
return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
261
}
262
- if (!(NVME_CQ_FLAGS_PC(qflags))) {
263
+ if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
264
+ trace_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
265
return NVME_INVALID_FIELD | NVME_DNR;
266
}
267
268
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
269
uint64_t prp1 = le64_to_cpu(c->prp1);
270
uint64_t prp2 = le64_to_cpu(c->prp2);
271
272
+ trace_nvme_identify_ctrl();
273
+
274
return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
275
prp1, prp2);
276
}
277
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
278
uint64_t prp1 = le64_to_cpu(c->prp1);
279
uint64_t prp2 = le64_to_cpu(c->prp2);
280
281
- if (nsid == 0 || nsid > n->num_namespaces) {
282
+ trace_nvme_identify_ns(nsid);
283
+
284
+ if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
285
+ trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
286
return NVME_INVALID_NSID | NVME_DNR;
287
}
288
289
ns = &n->namespaces[nsid - 1];
290
+
291
return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
292
prp1, prp2);
293
}
294
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
295
uint16_t ret;
296
int i, j = 0;
297
298
+ trace_nvme_identify_nslist(min_nsid);
299
+
300
list = g_malloc0(data_len);
301
for (i = 0; i < n->num_namespaces; i++) {
302
if (i < min_nsid) {
303
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
304
case 0x02:
305
return nvme_identify_nslist(n, c);
306
default:
307
+ trace_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
308
return NVME_INVALID_FIELD | NVME_DNR;
309
}
310
}
311
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
312
switch (dw10) {
313
case NVME_VOLATILE_WRITE_CACHE:
314
result = blk_enable_write_cache(n->conf.blk);
315
+ trace_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
316
break;
317
case NVME_NUMBER_OF_QUEUES:
318
result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
319
+ trace_nvme_getfeat_numq(result);
320
break;
321
default:
322
+ trace_nvme_err_invalid_getfeat(dw10);
323
return NVME_INVALID_FIELD | NVME_DNR;
324
}
325
326
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
327
blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
328
break;
329
case NVME_NUMBER_OF_QUEUES:
330
+ trace_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
331
+ ((dw11 >> 16) & 0xFFFF) + 1,
332
+ n->num_queues - 1, n->num_queues - 1);
333
req->cqe.result =
334
cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
335
break;
336
default:
337
+ trace_nvme_err_invalid_setfeat(dw10);
338
return NVME_INVALID_FIELD | NVME_DNR;
339
}
340
return NVME_SUCCESS;
341
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
342
case NVME_ADM_CMD_GET_FEATURES:
343
return nvme_get_feature(n, cmd, req);
344
default:
345
+ trace_nvme_err_invalid_admin_opc(cmd->opcode);
346
return NVME_INVALID_OPCODE | NVME_DNR;
347
}
348
}
349
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
350
uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
351
uint32_t page_size = 1 << page_bits;
352
353
- if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq ||
354
- n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) ||
355
- NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
356
- NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) ||
357
- NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
358
- NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) ||
359
- NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
360
- NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) ||
361
- !NVME_AQA_ASQS(n->bar.aqa) || !NVME_AQA_ACQS(n->bar.aqa)) {
362
+ if (unlikely(n->cq[0])) {
363
+ trace_nvme_err_startfail_cq();
364
+ return -1;
365
+ }
366
+ if (unlikely(n->sq[0])) {
367
+ trace_nvme_err_startfail_sq();
368
+ return -1;
369
+ }
370
+ if (unlikely(!n->bar.asq)) {
371
+ trace_nvme_err_startfail_nbarasq();
372
+ return -1;
373
+ }
374
+ if (unlikely(!n->bar.acq)) {
375
+ trace_nvme_err_startfail_nbaracq();
376
+ return -1;
377
+ }
378
+ if (unlikely(n->bar.asq & (page_size - 1))) {
379
+ trace_nvme_err_startfail_asq_misaligned(n->bar.asq);
380
+ return -1;
381
+ }
382
+ if (unlikely(n->bar.acq & (page_size - 1))) {
383
+ trace_nvme_err_startfail_acq_misaligned(n->bar.acq);
384
+ return -1;
385
+ }
386
+ if (unlikely(NVME_CC_MPS(n->bar.cc) <
387
+ NVME_CAP_MPSMIN(n->bar.cap))) {
388
+ trace_nvme_err_startfail_page_too_small(
389
+ NVME_CC_MPS(n->bar.cc),
390
+ NVME_CAP_MPSMIN(n->bar.cap));
391
+ return -1;
392
+ }
393
+ if (unlikely(NVME_CC_MPS(n->bar.cc) >
394
+ NVME_CAP_MPSMAX(n->bar.cap))) {
395
+ trace_nvme_err_startfail_page_too_large(
396
+ NVME_CC_MPS(n->bar.cc),
397
+ NVME_CAP_MPSMAX(n->bar.cap));
398
+ return -1;
399
+ }
400
+ if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
401
+ NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
402
+ trace_nvme_err_startfail_cqent_too_small(
403
+ NVME_CC_IOCQES(n->bar.cc),
404
+ NVME_CTRL_CQES_MIN(n->bar.cap));
405
+ return -1;
406
+ }
407
+ if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
408
+ NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
409
+ trace_nvme_err_startfail_cqent_too_large(
410
+ NVME_CC_IOCQES(n->bar.cc),
411
+ NVME_CTRL_CQES_MAX(n->bar.cap));
412
+ return -1;
413
+ }
414
+ if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
415
+ NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
416
+ trace_nvme_err_startfail_sqent_too_small(
417
+ NVME_CC_IOSQES(n->bar.cc),
418
+ NVME_CTRL_SQES_MIN(n->bar.cap));
419
+ return -1;
420
+ }
421
+ if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
422
+ NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
423
+ trace_nvme_err_startfail_sqent_too_large(
424
+ NVME_CC_IOSQES(n->bar.cc),
425
+ NVME_CTRL_SQES_MAX(n->bar.cap));
426
+ return -1;
427
+ }
428
+ if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
429
+ trace_nvme_err_startfail_asqent_sz_zero();
430
+ return -1;
431
+ }
432
+ if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
433
+ trace_nvme_err_startfail_acqent_sz_zero();
434
return -1;
435
}
436
437
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
438
static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
439
unsigned size)
114
{
440
{
115
QEMUSGList qsg;
441
+ if (unlikely(offset & (sizeof(uint32_t) - 1))) {
116
+ QEMUIOVector iov;
442
+ NVME_GUEST_ERR(nvme_ub_mmiowr_misaligned32,
117
+ uint16_t status = NVME_SUCCESS;
443
+ "MMIO write not 32-bit aligned,"
118
444
+ " offset=0x%"PRIx64"", offset);
119
- if (nvme_map_prp(&qsg, prp1, prp2, len, n)) {
445
+ /* should be ignored, fall through for now */
120
+ if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
446
+ }
121
return NVME_INVALID_FIELD | NVME_DNR;
447
+
122
}
448
+ if (unlikely(size < sizeof(uint32_t))) {
123
- if (dma_buf_read(ptr, len, &qsg)) {
449
+ NVME_GUEST_ERR(nvme_ub_mmiowr_toosmall,
124
+ if (qsg.nsg > 0) {
450
+ "MMIO write smaller than 32-bits,"
125
+ if (dma_buf_read(ptr, len, &qsg)) {
451
+ " offset=0x%"PRIx64", size=%u",
126
+ status = NVME_INVALID_FIELD | NVME_DNR;
452
+ offset, size);
453
+ /* should be ignored, fall through for now */
454
+ }
455
+
456
switch (offset) {
457
- case 0xc:
458
+ case 0xc: /* INTMS */
459
+ if (unlikely(msix_enabled(&(n->parent_obj)))) {
460
+ NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
461
+ "undefined access to interrupt mask set"
462
+ " when MSI-X is enabled");
463
+ /* should be ignored, fall through for now */
127
+ }
464
+ }
128
qemu_sglist_destroy(&qsg);
465
n->bar.intms |= data & 0xffffffff;
129
- return NVME_INVALID_FIELD | NVME_DNR;
466
n->bar.intmc = n->bar.intms;
467
+ trace_nvme_mmio_intm_set(data & 0xffffffff,
468
+ n->bar.intmc);
469
break;
470
- case 0x10:
471
+ case 0x10: /* INTMC */
472
+ if (unlikely(msix_enabled(&(n->parent_obj)))) {
473
+ NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
474
+ "undefined access to interrupt mask clr"
475
+ " when MSI-X is enabled");
476
+ /* should be ignored, fall through for now */
477
+ }
478
n->bar.intms &= ~(data & 0xffffffff);
479
n->bar.intmc = n->bar.intms;
480
+ trace_nvme_mmio_intm_clr(data & 0xffffffff,
481
+ n->bar.intmc);
482
break;
483
- case 0x14:
484
+ case 0x14: /* CC */
485
+ trace_nvme_mmio_cfg(data & 0xffffffff);
486
/* Windows first sends data, then sends enable bit */
487
if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
488
!NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
489
@@ -XXX,XX +XXX,XX @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
490
491
if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
492
n->bar.cc = data;
493
- if (nvme_start_ctrl(n)) {
494
+ if (unlikely(nvme_start_ctrl(n))) {
495
+ trace_nvme_err_startfail();
496
n->bar.csts = NVME_CSTS_FAILED;
497
} else {
498
+ trace_nvme_mmio_start_success();
499
n->bar.csts = NVME_CSTS_READY;
500
}
501
} else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
502
+ trace_nvme_mmio_stopped();
503
nvme_clear_ctrl(n);
504
n->bar.csts &= ~NVME_CSTS_READY;
505
}
506
if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
507
- nvme_clear_ctrl(n);
508
- n->bar.cc = data;
509
- n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
510
+ trace_nvme_mmio_shutdown_set();
511
+ nvme_clear_ctrl(n);
512
+ n->bar.cc = data;
513
+ n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
514
} else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
515
- n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
516
- n->bar.cc = data;
517
+ trace_nvme_mmio_shutdown_cleared();
518
+ n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
519
+ n->bar.cc = data;
520
+ }
521
+ break;
522
+ case 0x1C: /* CSTS */
523
+ if (data & (1 << 4)) {
524
+ NVME_GUEST_ERR(nvme_ub_mmiowr_ssreset_w1c_unsupported,
525
+ "attempted to W1C CSTS.NSSRO"
526
+ " but CAP.NSSRS is zero (not supported)");
527
+ } else if (data != 0) {
528
+ NVME_GUEST_ERR(nvme_ub_mmiowr_ro_csts,
529
+ "attempted to set a read only bit"
530
+ " of controller status");
531
+ }
532
+ break;
533
+ case 0x20: /* NSSR */
534
+ if (data == 0x4E564D65) {
535
+ trace_nvme_ub_mmiowr_ssreset_unsupported();
536
+ } else {
537
+ /* The spec says that writes of other values have no effect */
538
+ return;
539
}
540
break;
541
- case 0x24:
542
+ case 0x24: /* AQA */
543
n->bar.aqa = data & 0xffffffff;
544
+ trace_nvme_mmio_aqattr(data & 0xffffffff);
545
break;
546
- case 0x28:
547
+ case 0x28: /* ASQ */
548
n->bar.asq = data;
549
+ trace_nvme_mmio_asqaddr(data);
550
break;
551
- case 0x2c:
552
+ case 0x2c: /* ASQ hi */
553
n->bar.asq |= data << 32;
554
+ trace_nvme_mmio_asqaddr_hi(data, n->bar.asq);
555
break;
556
- case 0x30:
557
+ case 0x30: /* ACQ */
558
+ trace_nvme_mmio_acqaddr(data);
559
n->bar.acq = data;
560
break;
561
- case 0x34:
562
+ case 0x34: /* ACQ hi */
563
n->bar.acq |= data << 32;
564
+ trace_nvme_mmio_acqaddr_hi(data, n->bar.acq);
565
break;
566
+ case 0x38: /* CMBLOC */
567
+ NVME_GUEST_ERR(nvme_ub_mmiowr_cmbloc_reserved,
568
+ "invalid write to reserved CMBLOC"
569
+ " when CMBSZ is zero, ignored");
570
+ return;
571
+ case 0x3C: /* CMBSZ */
572
+ NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
573
+ "invalid write to read only CMBSZ, ignored");
574
+ return;
575
default:
576
+ NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
577
+ "invalid MMIO write,"
578
+ " offset=0x%"PRIx64", data=%"PRIx64"",
579
+ offset, data);
580
break;
581
}
582
}
583
@@ -XXX,XX +XXX,XX @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
584
uint8_t *ptr = (uint8_t *)&n->bar;
585
uint64_t val = 0;
586
587
+ if (unlikely(addr & (sizeof(uint32_t) - 1))) {
588
+ NVME_GUEST_ERR(nvme_ub_mmiord_misaligned32,
589
+ "MMIO read not 32-bit aligned,"
590
+ " offset=0x%"PRIx64"", addr);
591
+ /* should RAZ, fall through for now */
592
+ } else if (unlikely(size < sizeof(uint32_t))) {
593
+ NVME_GUEST_ERR(nvme_ub_mmiord_toosmall,
594
+ "MMIO read smaller than 32-bits,"
595
+ " offset=0x%"PRIx64"", addr);
596
+ /* should RAZ, fall through for now */
597
+ }
598
+
599
if (addr < sizeof(n->bar)) {
600
memcpy(&val, ptr + addr, size);
130
+ } else {
601
+ } else {
131
+ if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
602
+ NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
132
+ status = NVME_INVALID_FIELD | NVME_DNR;
603
+ "MMIO read beyond last register,"
133
+ }
604
+ " offset=0x%"PRIx64", returning 0", addr);
134
+ qemu_iovec_destroy(&iov);
605
}
135
}
606
+
136
- qemu_sglist_destroy(&qsg);
607
return val;
137
- return NVME_SUCCESS;
608
}
138
+ return status;
609
139
}
610
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
140
611
{
141
static void nvme_post_cqes(void *opaque)
612
uint32_t qid;
142
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
613
143
return NVME_LBA_RANGE | NVME_DNR;
614
- if (addr & ((1 << 2) - 1)) {
144
}
615
+ if (unlikely(addr & ((1 << 2) - 1))) {
145
616
+ NVME_GUEST_ERR(nvme_ub_db_wr_misaligned,
146
- if (nvme_map_prp(&req->qsg, prp1, prp2, data_size, n)) {
617
+ "doorbell write not 32-bit aligned,"
147
+ if (nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, data_size, n)) {
618
+ " offset=0x%"PRIx64", ignoring", addr);
148
block_acct_invalid(blk_get_stats(n->conf.blk), acct);
619
return;
149
return NVME_INVALID_FIELD | NVME_DNR;
620
}
150
}
621
151
622
if (((addr - 0x1000) >> 2) & 1) {
152
- assert((nlb << data_shift) == req->qsg.size);
623
+ /* Completion queue doorbell write */
153
-
624
+
154
- req->has_sg = true;
625
uint16_t new_head = val & 0xffff;
155
dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct);
626
int start_sqs;
156
- req->aiocb = is_write ?
627
NvmeCQueue *cq;
157
- dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
628
158
- nvme_rw_cb, req) :
629
qid = (addr - (0x1000 + (1 << 2))) >> 3;
159
- dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
630
- if (nvme_check_cqid(n, qid)) {
160
- nvme_rw_cb, req);
631
+ if (unlikely(nvme_check_cqid(n, qid))) {
161
+ if (req->qsg.nsg > 0) {
632
+ NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cq,
162
+ req->has_sg = true;
633
+ "completion queue doorbell write"
163
+ req->aiocb = is_write ?
634
+ " for nonexistent queue,"
164
+ dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
635
+ " sqid=%"PRIu32", ignoring", qid);
165
+ nvme_rw_cb, req) :
636
return;
166
+ dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
637
}
167
+ nvme_rw_cb, req);
638
168
+ } else {
639
cq = n->cq[qid];
169
+ req->has_sg = false;
640
- if (new_head >= cq->size) {
170
+ req->aiocb = is_write ?
641
+ if (unlikely(new_head >= cq->size)) {
171
+ blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
642
+ NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cqhead,
172
+ req) :
643
+ "completion queue doorbell write value"
173
+ blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
644
+ " beyond queue size, sqid=%"PRIu32","
174
+ req);
645
+ " new_head=%"PRIu16", ignoring",
175
+ }
646
+ qid, new_head);
176
647
return;
177
return NVME_NO_COMPLETE;
648
}
178
}
649
179
@@ -XXX,XX +XXX,XX @@ static int nvme_init(PCIDevice *pci_dev)
650
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
180
NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
651
nvme_isr_notify(n, cq);
181
NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
652
}
182
NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
653
} else {
183
- NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 0);
654
+ /* Submission queue doorbell write */
184
- NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 0);
655
+
185
+ NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
656
uint16_t new_tail = val & 0xffff;
186
+ NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
657
NvmeSQueue *sq;
187
NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
658
188
NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->cmb_size_mb);
659
qid = (addr - 0x1000) >> 3;
189
660
- if (nvme_check_sqid(n, qid)) {
190
+ n->cmbloc = n->bar.cmbloc;
661
+ if (unlikely(nvme_check_sqid(n, qid))) {
191
+ n->cmbsz = n->bar.cmbsz;
662
+ NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sq,
192
+
663
+ "submission queue doorbell write"
193
n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
664
+ " for nonexistent queue,"
194
memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
665
+ " sqid=%"PRIu32", ignoring", qid);
195
"nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
666
return;
196
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
667
}
668
669
sq = n->sq[qid];
670
- if (new_tail >= sq->size) {
671
+ if (unlikely(new_tail >= sq->size)) {
672
+ NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sqtail,
673
+ "submission queue doorbell write value"
674
+ " beyond queue size, sqid=%"PRIu32","
675
+ " new_tail=%"PRIu16", ignoring",
676
+ qid, new_tail);
677
return;
678
}
679
680
diff --git a/hw/block/trace-events b/hw/block/trace-events
197
index XXXXXXX..XXXXXXX 100644
681
index XXXXXXX..XXXXXXX 100644
198
--- a/hw/block/nvme.h
682
--- a/hw/block/trace-events
199
+++ b/hw/block/nvme.h
683
+++ b/hw/block/trace-events
200
@@ -XXX,XX +XXX,XX @@ typedef struct NvmeRequest {
684
@@ -XXX,XX +XXX,XX @@ virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint6
201
NvmeCqe cqe;
685
hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
202
BlockAcctCookie acct;
686
hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int trans) "blk %p CHS %u %u %u trans %d"
203
QEMUSGList qsg;
687
204
+ QEMUIOVector iov;
688
+# hw/block/nvme.c
205
QTAILQ_ENTRY(NvmeRequest)entry;
689
+# nvme traces for successful events
206
} NvmeRequest;
690
+nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
207
691
+nvme_irq_pin(void) "pulsing IRQ pin"
692
+nvme_irq_masked(void) "IRQ is masked"
693
+nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
694
+nvme_rw(char const *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
695
+nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
696
+nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
697
+nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
698
+nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16""
699
+nvme_identify_ctrl(void) "identify controller"
700
+nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
701
+nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16""
702
+nvme_getfeat_vwcache(char const* result) "get feature volatile write cache, result=%s"
703
+nvme_getfeat_numq(int result) "get feature number of queues, result=%d"
704
+nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
705
+nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
706
+nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
707
+nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64""
708
+nvme_mmio_aqattr(uint64_t data) "wrote MMIO, admin queue attributes=0x%"PRIx64""
709
+nvme_mmio_asqaddr(uint64_t data) "wrote MMIO, admin submission queue address=0x%"PRIx64""
710
+nvme_mmio_acqaddr(uint64_t data) "wrote MMIO, admin completion queue address=0x%"PRIx64""
711
+nvme_mmio_asqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin submission queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
712
+nvme_mmio_acqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin completion queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
713
+nvme_mmio_start_success(void) "setting controller enable bit succeeded"
714
+nvme_mmio_stopped(void) "cleared controller enable bit"
715
+nvme_mmio_shutdown_set(void) "shutdown bit set"
716
+nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
717
+
718
+# nvme traces for error conditions
719
+nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
720
+nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
721
+nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
722
+nvme_err_invalid_prp2_missing(void) "PRP2 is null and more data to be transferred"
723
+nvme_err_invalid_field(void) "invalid field"
724
+nvme_err_invalid_prp(void) "invalid PRP"
725
+nvme_err_invalid_sgl(void) "invalid SGL"
726
+nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not within 1-%u"
727
+nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
728
+nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
729
+nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
730
+nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16""
731
+nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
732
+nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16""
733
+nvme_err_invalid_create_sq_size(uint16_t qsize) "failed creating submission queue, invalid qsize=%"PRIu16""
734
+nvme_err_invalid_create_sq_addr(uint64_t addr) "failed creating submission queue, addr=0x%"PRIx64""
735
+nvme_err_invalid_create_sq_qflags(uint16_t qflags) "failed creating submission queue, qflags=%"PRIu16""
736
+nvme_err_invalid_del_cq_cqid(uint16_t cqid) "failed deleting completion queue, cqid=%"PRIu16""
737
+nvme_err_invalid_del_cq_notempty(uint16_t cqid) "failed deleting completion queue, it is not empty, cqid=%"PRIu16""
738
+nvme_err_invalid_create_cq_cqid(uint16_t cqid) "failed creating completion queue, cqid=%"PRIu16""
739
+nvme_err_invalid_create_cq_size(uint16_t size) "failed creating completion queue, size=%"PRIu16""
740
+nvme_err_invalid_create_cq_addr(uint64_t addr) "failed creating completion queue, addr=0x%"PRIx64""
741
+nvme_err_invalid_create_cq_vector(uint16_t vector) "failed creating completion queue, vector=%"PRIu16""
742
+nvme_err_invalid_create_cq_qflags(uint16_t qflags) "failed creating completion queue, qflags=%"PRIu16""
743
+nvme_err_invalid_identify_cns(uint16_t cns) "identify, invalid cns=0x%"PRIx16""
744
+nvme_err_invalid_getfeat(int dw10) "invalid get features, dw10=0x%"PRIx32""
745
+nvme_err_invalid_setfeat(uint32_t dw10) "invalid set features, dw10=0x%"PRIx32""
746
+nvme_err_startfail_cq(void) "nvme_start_ctrl failed because there are non-admin completion queues"
747
+nvme_err_startfail_sq(void) "nvme_start_ctrl failed because there are non-admin submission queues"
748
+nvme_err_startfail_nbarasq(void) "nvme_start_ctrl failed because the admin submission queue address is null"
749
+nvme_err_startfail_nbaracq(void) "nvme_start_ctrl failed because the admin completion queue address is null"
750
+nvme_err_startfail_asq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin submission queue address is misaligned: 0x%"PRIx64""
751
+nvme_err_startfail_acq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin completion queue address is misaligned: 0x%"PRIx64""
752
+nvme_err_startfail_page_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too small: log2size=%u, min=%u"
753
+nvme_err_startfail_page_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too large: log2size=%u, max=%u"
754
+nvme_err_startfail_cqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too small: log2size=%u, min=%u"
755
+nvme_err_startfail_cqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too large: log2size=%u, max=%u"
756
+nvme_err_startfail_sqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too small: log2size=%u, min=%u"
757
+nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too large: log2size=%u, max=%u"
758
+nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero"
759
+nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero"
760
+nvme_err_startfail(void) "setting controller enable bit failed"
761
+
762
+# Traces for undefined behavior
763
+nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64""
764
+nvme_ub_mmiowr_toosmall(uint64_t offset, unsigned size) "MMIO write smaller than 32 bits, offset=0x%"PRIx64", size=%u"
765
+nvme_ub_mmiowr_intmask_with_msix(void) "undefined access to interrupt mask set when MSI-X is enabled"
766
+nvme_ub_mmiowr_ro_csts(void) "attempted to set a read only bit of controller status"
767
+nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CAP.NSSRS is zero (not supported)"
768
+nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)"
769
+nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored"
770
+nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored"
771
+nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64""
772
+nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64""
773
+nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64""
774
+nvme_ub_mmiord_invalid_ofs(uint64_t offset) "MMIO read beyond last register, offset=0x%"PRIx64", returning 0"
775
+nvme_ub_db_wr_misaligned(uint64_t offset) "doorbell write not 32-bit aligned, offset=0x%"PRIx64", ignoring"
776
+nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for nonexistent queue, cqid=%"PRIu32", ignoring"
777
+nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring"
778
+nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring"
779
+nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_tail=%"PRIu16", ignoring"
780
+
781
# hw/block/xen_disk.c
782
xen_disk_alloc(char *name) "%s"
783
xen_disk_init(char *name) "%s"
208
--
784
--
209
1.8.3.1
785
2.13.6
210
786
211
787
diff view generated by jsdifflib
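
The NVME_GUEST_ERR() calls in the hunks above flag guest-triggered undefined behaviour; each one names a trace event from the "Traces for undefined behavior" list added to trace-events. The macro's definition is not part of this excerpt (it is added elsewhere in the patch, most likely in hw/block/nvme.h); the stand-in below only illustrates a plausible shape, a trace-style line plus a guest-error log line, and is not QEMU's actual implementation:

    #include <stdio.h>

    /* Simplified stand-in for NVME_GUEST_ERR(): emit a "trace" line and a
     * guest-error line for the same event.  In QEMU the first half would be
     * a generated trace_*() call and the second a qemu_log_mask() call with
     * LOG_GUEST_ERROR. */
    #define GUEST_ERR(event, fmt, ...)                                        \
        do {                                                                  \
            fprintf(stderr, "trace: " #event " " fmt "\n", ##__VA_ARGS__);    \
            fprintf(stderr, "guest error: " #event ": " fmt "\n",             \
                    ##__VA_ARGS__);                                           \
        } while (0)

    int main(void)
    {
        unsigned long long offset = 0x3;   /* hypothetical misaligned offset */

        GUEST_ERR(mmiowr_misaligned32,
                  "MMIO write not 32-bit aligned, offset=0x%llx", offset);
        return 0;
    }
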
1
From: Stefan Hajnoczi <stefanha@redhat.com>
1
From: Fam Zheng <famz@redhat.com>
2
2
3
blk/bdrv_drain_all() only takes effect for a single instant and then
3
Management tools create overlays of running guests with qemu-img:
4
resumes block jobs, guest devices, and other external clients like the
5
NBD server. This can be handy when performing a synchronous drain
6
before terminating the program, for example.
7
4
8
Monitor commands usually need to quiesce I/O across an entire code
5
$ qemu-img create -b /image/in/use.qcow2 -f qcow2 /overlay/image.qcow2
9
region so blk/bdrv_drain_all() is not suitable. They must use
10
bdrv_drain_all_begin/end() to mark the region. This prevents new I/O
11
requests from slipping in or worse - block jobs completing and modifying
12
the graph.
13
6
14
I audited other blk/bdrv_drain_all() callers but did not find anything
7
but this doesn't work anymore due to image locking:
15
that needs a similar fix. This patch fixes the savevm/loadvm commands.
16
Although I haven't encountered a real-world issue, this makes the code
17
safer.
18
8
19
Suggested-by: Kevin Wolf <kwolf@redhat.com>
9
qemu-img: /overlay/image.qcow2: Failed to get shared "write" lock
20
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Is another process using the image?
11
Could not open backing image to determine size.
12
Use the force share option to allow this use case again.
13
14
Cc: qemu-stable@nongnu.org
15
Signed-off-by: Fam Zheng <famz@redhat.com>
21
Reviewed-by: Eric Blake <eblake@redhat.com>
16
Reviewed-by: Eric Blake <eblake@redhat.com>
22
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
17
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
23
---
18
---
24
migration/savevm.c | 18 +++++++++++++++---
19
block.c | 3 ++-
25
1 file changed, 15 insertions(+), 3 deletions(-)
20
1 file changed, 2 insertions(+), 1 deletion(-)
26
21
27
diff --git a/migration/savevm.c b/migration/savevm.c
22
diff --git a/block.c b/block.c
28
index XXXXXXX..XXXXXXX 100644
23
index XXXXXXX..XXXXXXX 100644
29
--- a/migration/savevm.c
24
--- a/block.c
30
+++ b/migration/savevm.c
25
+++ b/block.c
31
@@ -XXX,XX +XXX,XX @@ int save_snapshot(const char *name, Error **errp)
26
@@ -XXX,XX +XXX,XX @@ void bdrv_img_create(const char *filename, const char *fmt,
32
}
27
back_flags = flags;
33
vm_stop(RUN_STATE_SAVE_VM);
28
back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
34
29
35
+ bdrv_drain_all_begin();
30
+ backing_options = qdict_new();
36
+
31
if (backing_fmt) {
37
aio_context_acquire(aio_context);
32
- backing_options = qdict_new();
38
33
qdict_put_str(backing_options, "driver", backing_fmt);
39
memset(sn, 0, sizeof(*sn));
34
}
40
@@ -XXX,XX +XXX,XX @@ int save_snapshot(const char *name, Error **errp)
35
+ qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);
41
if (aio_context) {
36
42
aio_context_release(aio_context);
37
bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
43
}
38
&local_err);
44
+
45
+ bdrv_drain_all_end();
46
+
47
if (saved_vm_running) {
48
vm_start();
49
}
50
@@ -XXX,XX +XXX,XX @@ int load_snapshot(const char *name, Error **errp)
51
}
52
53
/* Flush all IO requests so they don't interfere with the new state. */
54
- bdrv_drain_all();
55
+ bdrv_drain_all_begin();
56
57
ret = bdrv_all_goto_snapshot(name, &bs);
58
if (ret < 0) {
59
error_setg(errp, "Error %d while activating snapshot '%s' on '%s'",
60
ret, name, bdrv_get_device_name(bs));
61
- return ret;
62
+ goto err_drain;
63
}
64
65
/* restore the VM state */
66
f = qemu_fopen_bdrv(bs_vm_state, 0);
67
if (!f) {
68
error_setg(errp, "Could not open VM state file");
69
- return -EINVAL;
70
+ ret = -EINVAL;
71
+ goto err_drain;
72
}
73
74
qemu_system_reset(SHUTDOWN_CAUSE_NONE);
75
@@ -XXX,XX +XXX,XX @@ int load_snapshot(const char *name, Error **errp)
76
ret = qemu_loadvm_state(f);
77
aio_context_release(aio_context);
78
79
+ bdrv_drain_all_end();
80
+
81
migration_incoming_state_destroy();
82
if (ret < 0) {
83
error_setg(errp, "Error %d while loading VM state", ret);
84
@@ -XXX,XX +XXX,XX @@ int load_snapshot(const char *name, Error **errp)
85
}
86
87
return 0;
88
+
89
+err_drain:
90
+ bdrv_drain_all_end();
91
+ return ret;
92
}
93
94
void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
95
--
39
--
96
1.8.3.1
40
2.13.6
97
41
98
42
diff view generated by jsdifflib
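
The savevm/loadvm hunks above bracket the whole operation with bdrv_drain_all_begin() and bdrv_drain_all_end(), and route the error paths through the new err_drain label so the end call is never skipped. A minimal sketch of that pattern, assuming the QEMU tree for the drain functions; prepare_state() and write_state() are hypothetical placeholders:

    /* Sketch only: mirrors the structure the patch gives save_snapshot()
     * and load_snapshot().  Builds only inside the QEMU tree. */
    static int prepare_state(void);     /* hypothetical step 1 */
    static int write_state(void);       /* hypothetical step 2 */

    static int run_with_block_layer_quiesced(void)
    {
        int ret;

        bdrv_drain_all_begin();   /* stop new requests, wait for in-flight I/O */

        ret = prepare_state();
        if (ret < 0) {
            goto out;             /* error paths must still reach _end() */
        }
        ret = write_state();

    out:
        bdrv_drain_all_end();     /* resume jobs, devices and external clients */
        return ret;
    }
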
1
From: Alberto Garcia <berto@igalia.com>
1
From: Thomas Huth <thuth@redhat.com>
2
2
3
Instead of passing a single buffer pointer to do_perform_cow_write(),
3
It's not working anymore since QEMU v1.3.0 - time to remove it now.
4
pass a QEMUIOVector. This will allow us to merge the write requests
5
for the COW regions and the actual data into a single one.
6
4
7
Although do_perform_cow_read() does not strictly need to change its
5
Signed-off-by: Thomas Huth <thuth@redhat.com>
8
API, we're doing it here as well for consistency.
6
Reviewed-by: John Snow <jsnow@redhat.com>
9
7
Reviewed-by: Markus Armbruster <armbru@redhat.com>
10
Signed-off-by: Alberto Garcia <berto@igalia.com>
11
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
9
---
14
block/qcow2-cluster.c | 51 ++++++++++++++++++++++++---------------------------
10
blockdev.c | 11 -----------
15
1 file changed, 24 insertions(+), 27 deletions(-)
11
qemu-doc.texi | 6 ------
12
2 files changed, 17 deletions(-)
16
13
17
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
14
diff --git a/blockdev.c b/blockdev.c
18
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
19
--- a/block/qcow2-cluster.c
16
--- a/blockdev.c
20
+++ b/block/qcow2-cluster.c
17
+++ b/blockdev.c
21
@@ -XXX,XX +XXX,XX @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
18
@@ -XXX,XX +XXX,XX @@ QemuOptsList qemu_legacy_drive_opts = {
22
static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
19
.type = QEMU_OPT_STRING,
23
uint64_t src_cluster_offset,
20
.help = "chs translation (auto, lba, none)",
24
unsigned offset_in_cluster,
21
},{
25
- uint8_t *buffer,
22
- .name = "boot",
26
- unsigned bytes)
23
- .type = QEMU_OPT_BOOL,
27
+ QEMUIOVector *qiov)
24
- .help = "(deprecated, ignored)",
28
{
25
- },{
29
- QEMUIOVector qiov;
26
.name = "addr",
30
- struct iovec iov = { .iov_base = buffer, .iov_len = bytes };
27
.type = QEMU_OPT_STRING,
31
int ret;
28
.help = "pci address (virtio only)",
32
29
@@ -XXX,XX +XXX,XX @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type)
33
- if (bytes == 0) {
34
+ if (qiov->size == 0) {
35
return 0;
36
}
37
38
- qemu_iovec_init_external(&qiov, &iov, 1);
39
-
40
BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
41
42
if (!bs->drv) {
43
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
44
* which can lead to deadlock when block layer copy-on-read is enabled.
45
*/
46
ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
47
- bytes, &qiov, 0);
48
+ qiov->size, qiov, 0);
49
if (ret < 0) {
50
return ret;
51
}
52
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
53
static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
54
uint64_t cluster_offset,
55
unsigned offset_in_cluster,
56
- uint8_t *buffer,
57
- unsigned bytes)
58
+ QEMUIOVector *qiov)
59
{
60
- QEMUIOVector qiov;
61
- struct iovec iov = { .iov_base = buffer, .iov_len = bytes };
62
int ret;
63
64
- if (bytes == 0) {
65
+ if (qiov->size == 0) {
66
return 0;
67
}
68
69
- qemu_iovec_init_external(&qiov, &iov, 1);
70
-
71
ret = qcow2_pre_write_overlap_check(bs, 0,
72
- cluster_offset + offset_in_cluster, bytes);
73
+ cluster_offset + offset_in_cluster, qiov->size);
74
if (ret < 0) {
75
return ret;
76
}
77
78
BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
79
ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster,
80
- bytes, &qiov, 0);
81
+ qiov->size, qiov, 0);
82
if (ret < 0) {
83
return ret;
84
}
85
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
86
unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
87
bool merge_reads;
88
uint8_t *start_buffer, *end_buffer;
89
+ QEMUIOVector qiov;
90
int ret;
91
92
assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
93
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
94
/* The part of the buffer where the end region is located */
95
end_buffer = start_buffer + buffer_size - end->nb_bytes;
96
97
+ qemu_iovec_init(&qiov, 1);
98
+
99
qemu_co_mutex_unlock(&s->lock);
100
/* First we read the existing data from both COW regions. We
101
* either read the whole region in one go, or the start and end
102
* regions separately. */
103
if (merge_reads) {
104
- ret = do_perform_cow_read(bs, m->offset, start->offset,
105
- start_buffer, buffer_size);
106
+ qemu_iovec_add(&qiov, start_buffer, buffer_size);
107
+ ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
108
} else {
109
- ret = do_perform_cow_read(bs, m->offset, start->offset,
110
- start_buffer, start->nb_bytes);
111
+ qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
112
+ ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
113
if (ret < 0) {
114
goto fail;
115
}
116
117
- ret = do_perform_cow_read(bs, m->offset, end->offset,
118
- end_buffer, end->nb_bytes);
119
+ qemu_iovec_reset(&qiov);
120
+ qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
121
+ ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov);
122
}
123
if (ret < 0) {
124
goto fail;
125
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
126
}
127
128
/* And now we can write everything */
129
- ret = do_perform_cow_write(bs, m->alloc_offset, start->offset,
130
- start_buffer, start->nb_bytes);
131
+ qemu_iovec_reset(&qiov);
132
+ qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
133
+ ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
134
if (ret < 0) {
135
goto fail;
30
goto fail;
136
}
31
}
137
32
138
- ret = do_perform_cow_write(bs, m->alloc_offset, end->offset,
33
- /* Deprecated option boot=[on|off] */
139
- end_buffer, end->nb_bytes);
34
- if (qemu_opt_get(legacy_opts, "boot") != NULL) {
140
+ qemu_iovec_reset(&qiov);
35
- fprintf(stderr, "qemu-kvm: boot=on|off is deprecated and will be "
141
+ qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
36
- "ignored. Future versions will reject this parameter. Please "
142
+ ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
37
- "update your scripts.\n");
143
fail:
38
- }
144
qemu_co_mutex_lock(&s->lock);
39
-
145
40
/* Other deprecated options */
146
@@ -XXX,XX +XXX,XX @@ fail:
41
if (!qtest_enabled()) {
147
}
42
for (i = 0; i < ARRAY_SIZE(deprecated); i++) {
148
43
diff --git a/qemu-doc.texi b/qemu-doc.texi
149
qemu_vfree(start_buffer);
44
index XXXXXXX..XXXXXXX 100644
150
+ qemu_iovec_destroy(&qiov);
45
--- a/qemu-doc.texi
151
return ret;
46
+++ b/qemu-doc.texi
152
}
47
@@ -XXX,XX +XXX,XX @@ deprecated.
153
48
49
@section System emulator command line arguments
50
51
-@subsection -drive boot=on|off (since 1.3.0)
52
-
53
-The ``boot=on|off'' option to the ``-drive'' argument is
54
-ignored. Applications should use the ``bootindex=N'' parameter
55
-to set an absolute ordering between devices instead.
56
-
57
@subsection -tdf (since 1.3.0)
58
59
The ``-tdf'' argument is ignored. The behaviour implemented
154
--
60
--
155
1.8.3.1
61
2.13.6
156
62
157
63
diff view generated by jsdifflib
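
The qcow2 patch above replaces the bare buffer arguments of do_perform_cow_read()/do_perform_cow_write() with a QEMUIOVector, so the COW buffers and the guest data can later be chained into a single request. The iovec calls are exactly the ones visible in the hunks; a sketch of that lifecycle (QEMU tree only, with buf1/buf2/len1/len2 as hypothetical buffers):

    /* Sketch: one QEMUIOVector reused for consecutive requests, as
     * perform_cow() now does. */
    static void qiov_lifecycle_sketch(uint8_t *buf1, size_t len1,
                                      uint8_t *buf2, size_t len2)
    {
        QEMUIOVector qiov;

        qemu_iovec_init(&qiov, 1);          /* room for one element, grows on demand */

        qemu_iovec_add(&qiov, buf1, len1);  /* describe the first region */
        /* ... issue a request with &qiov, e.g. do_perform_cow_read() ... */

        qemu_iovec_reset(&qiov);            /* drop the entries, keep the qiov */
        qemu_iovec_add(&qiov, buf2, len2);  /* reuse it for the next region */
        /* ... issue the next request ... */

        qemu_iovec_destroy(&qiov);          /* free the element array */
    }
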
1
This documents the driver-specific options for the raw, qcow2 and file
1
From: Thomas Huth <thuth@redhat.com>
2
block drivers for the man page. For everything else, we refer to the
2
3
QAPI documentation.
3
It's been marked as deprecated since QEMU v2.10.0, and so far nobody
4
4
complained that we should keep it, so let's remove this legacy option
5
now to simplify the code quite a bit.
6
7
Signed-off-by: Thomas Huth <thuth@redhat.com>
8
Reviewed-by: John Snow <jsnow@redhat.com>
9
Reviewed-by: Markus Armbruster <armbru@redhat.com>
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
10
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
6
Reviewed-by: Eric Blake <eblake@redhat.com>
7
Reviewed-by: Max Reitz <mreitz@redhat.com>
8
---
11
---
9
qemu-options.hx | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
12
vl.c | 86 ++-------------------------------------------------------
10
1 file changed, 114 insertions(+), 1 deletion(-)
13
qemu-doc.texi | 8 ------
11
14
qemu-options.hx | 19 ++-----------
15
3 files changed, 4 insertions(+), 109 deletions(-)
16
17
diff --git a/vl.c b/vl.c
18
index XXXXXXX..XXXXXXX 100644
19
--- a/vl.c
20
+++ b/vl.c
21
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
22
const char *boot_order = NULL;
23
const char *boot_once = NULL;
24
DisplayState *ds;
25
- int cyls, heads, secs, translation;
26
QemuOpts *opts, *machine_opts;
27
- QemuOpts *hda_opts = NULL, *icount_opts = NULL, *accel_opts = NULL;
28
+ QemuOpts *icount_opts = NULL, *accel_opts = NULL;
29
QemuOptsList *olist;
30
int optind;
31
const char *optarg;
32
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
33
34
cpu_model = NULL;
35
snapshot = 0;
36
- cyls = heads = secs = 0;
37
- translation = BIOS_ATA_TRANSLATION_AUTO;
38
39
nb_nics = 0;
40
41
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
42
if (optind >= argc)
43
break;
44
if (argv[optind][0] != '-') {
45
- hda_opts = drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
46
+ drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
47
} else {
48
const QEMUOption *popt;
49
50
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
51
cpu_model = optarg;
52
break;
53
case QEMU_OPTION_hda:
54
- {
55
- char buf[256];
56
- if (cyls == 0)
57
- snprintf(buf, sizeof(buf), "%s", HD_OPTS);
58
- else
59
- snprintf(buf, sizeof(buf),
60
- "%s,cyls=%d,heads=%d,secs=%d%s",
61
- HD_OPTS , cyls, heads, secs,
62
- translation == BIOS_ATA_TRANSLATION_LBA ?
63
- ",trans=lba" :
64
- translation == BIOS_ATA_TRANSLATION_NONE ?
65
- ",trans=none" : "");
66
- drive_add(IF_DEFAULT, 0, optarg, buf);
67
- break;
68
- }
69
case QEMU_OPTION_hdb:
70
case QEMU_OPTION_hdc:
71
case QEMU_OPTION_hdd:
72
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
73
case QEMU_OPTION_snapshot:
74
snapshot = 1;
75
break;
76
- case QEMU_OPTION_hdachs:
77
- {
78
- const char *p;
79
- p = optarg;
80
- cyls = strtol(p, (char **)&p, 0);
81
- if (cyls < 1 || cyls > 16383)
82
- goto chs_fail;
83
- if (*p != ',')
84
- goto chs_fail;
85
- p++;
86
- heads = strtol(p, (char **)&p, 0);
87
- if (heads < 1 || heads > 16)
88
- goto chs_fail;
89
- if (*p != ',')
90
- goto chs_fail;
91
- p++;
92
- secs = strtol(p, (char **)&p, 0);
93
- if (secs < 1 || secs > 63)
94
- goto chs_fail;
95
- if (*p == ',') {
96
- p++;
97
- if (!strcmp(p, "large")) {
98
- translation = BIOS_ATA_TRANSLATION_LARGE;
99
- } else if (!strcmp(p, "rechs")) {
100
- translation = BIOS_ATA_TRANSLATION_RECHS;
101
- } else if (!strcmp(p, "none")) {
102
- translation = BIOS_ATA_TRANSLATION_NONE;
103
- } else if (!strcmp(p, "lba")) {
104
- translation = BIOS_ATA_TRANSLATION_LBA;
105
- } else if (!strcmp(p, "auto")) {
106
- translation = BIOS_ATA_TRANSLATION_AUTO;
107
- } else {
108
- goto chs_fail;
109
- }
110
- } else if (*p != '\0') {
111
- chs_fail:
112
- error_report("invalid physical CHS format");
113
- exit(1);
114
- }
115
- if (hda_opts != NULL) {
116
- qemu_opt_set_number(hda_opts, "cyls", cyls,
117
- &error_abort);
118
- qemu_opt_set_number(hda_opts, "heads", heads,
119
- &error_abort);
120
- qemu_opt_set_number(hda_opts, "secs", secs,
121
- &error_abort);
122
- if (translation == BIOS_ATA_TRANSLATION_LARGE) {
123
- qemu_opt_set(hda_opts, "trans", "large",
124
- &error_abort);
125
- } else if (translation == BIOS_ATA_TRANSLATION_RECHS) {
126
- qemu_opt_set(hda_opts, "trans", "rechs",
127
- &error_abort);
128
- } else if (translation == BIOS_ATA_TRANSLATION_LBA) {
129
- qemu_opt_set(hda_opts, "trans", "lba",
130
- &error_abort);
131
- } else if (translation == BIOS_ATA_TRANSLATION_NONE) {
132
- qemu_opt_set(hda_opts, "trans", "none",
133
- &error_abort);
134
- }
135
- }
136
- }
137
- error_report("'-hdachs' is deprecated, please use '-device"
138
- " ide-hd,cyls=c,heads=h,secs=s,...' instead");
139
- break;
140
case QEMU_OPTION_numa:
141
opts = qemu_opts_parse_noisily(qemu_find_opts("numa"),
142
optarg, true);
143
diff --git a/qemu-doc.texi b/qemu-doc.texi
144
index XXXXXXX..XXXXXXX 100644
145
--- a/qemu-doc.texi
146
+++ b/qemu-doc.texi
147
@@ -XXX,XX +XXX,XX @@ The ``--net dump'' argument is now replaced with the
148
``-object filter-dump'' argument which works in combination
149
with the modern ``-netdev`` backends instead.
150
151
-@subsection -hdachs (since 2.10.0)
152
-
153
-The ``-hdachs'' argument is now a synonym for setting
154
-the ``cyls'', ``heads'', ``secs'', and ``trans'' properties
155
-on the ``ide-hd'' device using the ``-device'' argument.
156
-The new syntax allows different settings to be provided
157
-per disk.
158
-
159
@subsection -usbdevice (since 2.10.0)
160
161
The ``-usbdevice DEV'' argument is now a synonym for setting
12
diff --git a/qemu-options.hx b/qemu-options.hx
162
diff --git a/qemu-options.hx b/qemu-options.hx
13
index XXXXXXX..XXXXXXX 100644
163
index XXXXXXX..XXXXXXX 100644
14
--- a/qemu-options.hx
164
--- a/qemu-options.hx
15
+++ b/qemu-options.hx
165
+++ b/qemu-options.hx
16
@@ -XXX,XX +XXX,XX @@ STEXI
166
@@ -XXX,XX +XXX,XX @@ of available connectors of a given interface type.
17
@item -blockdev @var{option}[,@var{option}[,@var{option}[,...]]]
167
@item media=@var{media}
18
@findex -blockdev
168
This option defines the type of the media: disk or cdrom.
19
169
@item cyls=@var{c},heads=@var{h},secs=@var{s}[,trans=@var{t}]
20
-Define a new block driver node.
170
-These options have the same definition as they have in @option{-hdachs}.
21
+Define a new block driver node. Some of the options apply to all block drivers,
171
-These parameters are deprecated, use the corresponding parameters
22
+other options are only accepted for a specific block driver. See below for a
172
+Force disk physical geometry and the optional BIOS translation (trans=none or
23
+list of generic options and options for the most common block drivers.
173
+lba). These parameters are deprecated, use the corresponding parameters
24
+
174
of @code{-device} instead.
25
+Options that expect a reference to another node (e.g. @code{file}) can be
175
@item snapshot=@var{snapshot}
26
+given in two ways. Either you specify the node name of an already existing node
176
@var{snapshot} is "on" or "off" and controls snapshot mode for the given drive
27
+(file=@var{node-name}), or you define a new node inline, adding options
177
@@ -XXX,XX +XXX,XX @@ the raw disk image you use is not written back. You can however force
28
+for the referenced node after a dot (file.filename=@var{path},file.aio=native).
178
the write back by pressing @key{C-a s} (@pxref{disk_images}).
29
+
30
+A block driver node created with @option{-blockdev} can be used for a guest
31
+device by specifying its node name for the @code{drive} property in a
32
+@option{-device} argument that defines a block device.
33
34
@table @option
35
@item Valid options for any block driver node:
36
@@ -XXX,XX +XXX,XX @@ zero write commands. You may even choose "unmap" if @var{discard} is set
37
to "unmap" to allow a zero write to be converted to an @code{unmap} operation.
38
@end table
39
40
+@item Driver-specific options for @code{file}
41
+
42
+This is the protocol-level block driver for accessing regular files.
43
+
44
+@table @code
45
+@item filename
46
+The path to the image file in the local filesystem
47
+@item aio
48
+Specifies the AIO backend (threads/native, default: threads)
49
+@end table
50
+Example:
51
+@example
52
+-blockdev driver=file,node-name=disk,filename=disk.img
53
+@end example
54
+
55
+@item Driver-specific options for @code{raw}
56
+
57
+This is the image format block driver for raw images. It is usually
58
+stacked on top of a protocol level block driver such as @code{file}.
59
+
60
+@table @code
61
+@item file
62
+Reference to or definition of the data source block driver node
63
+(e.g. a @code{file} driver node)
64
+@end table
65
+Example 1:
66
+@example
67
+-blockdev driver=file,node-name=disk_file,filename=disk.img
68
+-blockdev driver=raw,node-name=disk,file=disk_file
69
+@end example
70
+Example 2:
71
+@example
72
+-blockdev driver=raw,node-name=disk,file.driver=file,file.filename=disk.img
73
+@end example
74
+
75
+@item Driver-specific options for @code{qcow2}
76
+
77
+This is the image format block driver for qcow2 images. It is usually
78
+stacked on top of a protocol level block driver such as @code{file}.
79
+
80
+@table @code
81
+@item file
82
+Reference to or definition of the data source block driver node
83
+(e.g. a @code{file} driver node)
84
+
85
+@item backing
86
+Reference to or definition of the backing file block device (default is taken
87
+from the image file). It is allowed to pass an empty string here in order to
88
+disable the default backing file.
89
+
90
+@item lazy-refcounts
91
+Whether to enable the lazy refcounts feature (on/off; default is taken from the
92
+image file)
93
+
94
+@item cache-size
95
+The maximum total size of the L2 table and refcount block caches in bytes
96
+(default: 1048576 bytes or 8 clusters, whichever is larger)
97
+
98
+@item l2-cache-size
99
+The maximum size of the L2 table cache in bytes
100
+(default: 4/5 of the total cache size)
101
+
102
+@item refcount-cache-size
103
+The maximum size of the refcount block cache in bytes
104
+(default: 1/5 of the total cache size)
105
+
106
+@item cache-clean-interval
107
+Clean unused entries in the L2 and refcount caches. The interval is in seconds.
108
+The default value is 0 and it disables this feature.
109
+
110
+@item pass-discard-request
111
+Whether discard requests to the qcow2 device should be forwarded to the data
112
+source (on/off; default: on if discard=unmap is specified, off otherwise)
113
+
114
+@item pass-discard-snapshot
115
+Whether discard requests for the data source should be issued when a snapshot
116
+operation (e.g. deleting a snapshot) frees clusters in the qcow2 file (on/off;
117
+default: on)
118
+
119
+@item pass-discard-other
120
+Whether discard requests for the data source should be issued on other
121
+occasions where a cluster gets freed (on/off; default: off)
122
+
123
+@item overlap-check
124
+Which overlap checks to perform for writes to the image
125
+(none/constant/cached/all; default: cached). For details or finer
126
+granularity control refer to the QAPI documentation of @code{blockdev-add}.
127
+@end table
128
+
129
+Example 1:
130
+@example
131
+-blockdev driver=file,node-name=my_file,filename=/tmp/disk.qcow2
132
+-blockdev driver=qcow2,node-name=hda,file=my_file,overlap-check=none,cache-size=16777216
133
+@end example
134
+Example 2:
135
+@example
136
+-blockdev driver=qcow2,node-name=disk,file.driver=http,file.filename=http://example.com/image.qcow2
137
+@end example
138
+
139
+@item Driver-specific options for other drivers
140
+Please refer to the QAPI documentation of the @code{blockdev-add} QMP command.
141
+
142
@end table
143
144
ETEXI
179
ETEXI
180
181
-DEF("hdachs", HAS_ARG, QEMU_OPTION_hdachs, \
182
- "-hdachs c,h,s[,t]\n" \
183
- " force hard disk 0 physical geometry and the optional BIOS\n" \
184
- " translation (t=none or lba) (usually QEMU can guess them)\n",
185
- QEMU_ARCH_ALL)
186
-STEXI
187
-@item -hdachs @var{c},@var{h},@var{s},[,@var{t}]
188
-@findex -hdachs
189
-Force hard disk 0 physical geometry (1 <= @var{c} <= 16383, 1 <=
190
-@var{h} <= 16, 1 <= @var{s} <= 63) and optionally force the BIOS
191
-translation mode (@var{t}=none, lba or auto). Usually QEMU can guess
192
-all those parameters. This option is deprecated, please use
193
-@code{-device ide-hd,cyls=c,heads=h,secs=s,...} instead.
194
-ETEXI
195
-
196
DEF("fsdev", HAS_ARG, QEMU_OPTION_fsdev,
197
"-fsdev fsdriver,id=id[,path=path,][security_model={mapped-xattr|mapped-file|passthrough|none}]\n"
198
" [,writeout=immediate][,readonly][,socket=socket|sock_fd=sock_fd][,fmode=fmode][,dmode=dmode]\n"
145
--
199
--
146
1.8.3.1
200
2.13.6
147
201
148
202
diff view generated by jsdifflib
1
From: Alberto Garcia <berto@igalia.com>
1
From: Thomas Huth <thuth@redhat.com>
2
2
3
We already have functions for doing these calculations, so let's use
3
Looks like we forgot to announce the deprecation of these options in
4
them instead of doing everything by hand. This makes the code a bit
4
the corresponding chapter of the qemu-doc text, so let's do that now.
5
more readable.
6
5
7
Signed-off-by: Alberto Garcia <berto@igalia.com>
6
Signed-off-by: Thomas Huth <thuth@redhat.com>
7
Reviewed-by: John Snow <jsnow@redhat.com>
8
Reviewed-by: Markus Armbruster <armbru@redhat.com>
8
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
---
10
---
10
block/qcow2-cluster.c | 4 ++--
11
qemu-doc.texi | 15 +++++++++++++++
11
block/qcow2.c | 2 +-
12
1 file changed, 15 insertions(+)
12
2 files changed, 3 insertions(+), 3 deletions(-)
13
13
14
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
14
diff --git a/qemu-doc.texi b/qemu-doc.texi
15
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
16
--- a/block/qcow2-cluster.c
16
--- a/qemu-doc.texi
17
+++ b/block/qcow2-cluster.c
17
+++ b/qemu-doc.texi
18
@@ -XXX,XX +XXX,XX @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
18
@@ -XXX,XX +XXX,XX @@ longer be directly supported in QEMU.
19
19
The ``-drive if=scsi'' argument is replaced by the the
20
/* find the cluster offset for the given disk offset */
20
``-device BUS-TYPE'' argument combined with ``-drive if=none''.
21
21
22
- l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
22
+@subsection -drive cyls=...,heads=...,secs=...,trans=... (since 2.10.0)
23
+ l2_index = offset_to_l2_index(s, offset);
23
+
24
*cluster_offset = be64_to_cpu(l2_table[l2_index]);
24
+The drive geometry arguments are replaced by the geometry arguments
25
25
+that can be specified with the ``-device'' parameter.
26
nb_clusters = size_to_clusters(s, bytes_needed);
26
+
27
@@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
27
+@subsection -drive serial=... (since 2.10.0)
28
28
+
29
/* find the cluster offset for the given disk offset */
29
+The drive serial argument is replaced by the serial argument
30
30
+that can be specified with the ``-device'' parameter.
31
- l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
31
+
32
+ l2_index = offset_to_l2_index(s, offset);
32
+@subsection -drive addr=... (since 2.10.0)
33
33
+
34
*new_l2_table = l2_table;
34
+The drive addr argument is replaced by the addr argument
35
*new_l2_index = l2_index;
35
+that can be specified with the ``-device'' parameter.
36
diff --git a/block/qcow2.c b/block/qcow2.c
36
+
37
index XXXXXXX..XXXXXXX 100644
37
@subsection -net dump (since 2.10.0)
38
--- a/block/qcow2.c
38
39
+++ b/block/qcow2.c
39
The ``--net dump'' argument is now replaced with the
40
@@ -XXX,XX +XXX,XX @@ static int validate_table_offset(BlockDriverState *bs, uint64_t offset,
41
}
42
43
/* Tables must be cluster aligned */
44
- if (offset & (s->cluster_size - 1)) {
45
+ if (offset_into_cluster(s, offset) != 0) {
46
return -EINVAL;
47
}
48
49
--
40
--
50
1.8.3.1
41
2.13.6
51
42
52
43
diff view generated by jsdifflib
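
The two helpers used above are not defined in this diff, but the expressions they replace show what they compute; assuming the definitions match those expressions (the real ones live in block/qcow2.h), they amount to:

    /* Reconstructed from the expressions removed above; shown for reference only. */
    static inline int64_t offset_into_cluster(BDRVQcow2State *s, int64_t offset)
    {
        return offset & (s->cluster_size - 1);      /* byte offset within a cluster */
    }

    static inline int offset_to_l2_index(BDRVQcow2State *s, int64_t offset)
    {
        return (offset >> s->cluster_bits) & (s->l2_size - 1);   /* slot in the L2 table */
    }
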
1
From: Fam Zheng <famz@redhat.com>
2
3
Signed-off-by: Fam Zheng <famz@redhat.com>
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2
Reviewed-by: Eric Blake <eblake@redhat.com>
3
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
4
---
5
---
5
block/qed-cluster.c | 94 ++++++++++++++++++-----------------------------------
6
include/block/block_int.h | 1 -
6
block/qed-table.c | 15 +++------
7
block/io.c | 18 ------------------
7
block/qed.h | 3 +-
8
2 files changed, 19 deletions(-)
8
3 files changed, 36 insertions(+), 76 deletions(-)
9
9
10
diff --git a/block/qed-cluster.c b/block/qed-cluster.c
10
diff --git a/include/block/block_int.h b/include/block/block_int.h
11
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
12
--- a/block/qed-cluster.c
12
--- a/include/block/block_int.h
13
+++ b/block/qed-cluster.c
13
+++ b/include/block/block_int.h
14
@@ -XXX,XX +XXX,XX @@ static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
14
@@ -XXX,XX +XXX,XX @@ bool blk_dev_is_tray_open(BlockBackend *blk);
15
return i - index;
15
bool blk_dev_is_medium_locked(BlockBackend *blk);
16
17
void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
18
-bool bdrv_requests_pending(BlockDriverState *bs);
19
20
void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
21
void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
22
diff --git a/block/io.c b/block/io.c
23
index XXXXXXX..XXXXXXX 100644
24
--- a/block/io.c
25
+++ b/block/io.c
26
@@ -XXX,XX +XXX,XX @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
27
assert(old >= 1);
16
}
28
}
17
29
18
-typedef struct {
30
-/* Check if any requests are in-flight (including throttled requests) */
19
- BDRVQEDState *s;
31
-bool bdrv_requests_pending(BlockDriverState *bs)
20
- uint64_t pos;
32
-{
21
- size_t len;
33
- BdrvChild *child;
22
-
34
-
23
- QEDRequest *request;
35
- if (atomic_read(&bs->in_flight)) {
24
-
36
- return true;
25
- /* User callback */
26
- QEDFindClusterFunc *cb;
27
- void *opaque;
28
-} QEDFindClusterCB;
29
-
30
-static void qed_find_cluster_cb(void *opaque, int ret)
31
-{
32
- QEDFindClusterCB *find_cluster_cb = opaque;
33
- BDRVQEDState *s = find_cluster_cb->s;
34
- QEDRequest *request = find_cluster_cb->request;
35
- uint64_t offset = 0;
36
- size_t len = 0;
37
- unsigned int index;
38
- unsigned int n;
39
-
40
- qed_acquire(s);
41
- if (ret) {
42
- goto out;
43
- }
37
- }
44
-
38
-
45
- index = qed_l2_index(s, find_cluster_cb->pos);
39
- QLIST_FOREACH(child, &bs->children, next) {
46
- n = qed_bytes_to_clusters(s,
40
- if (bdrv_requests_pending(child->bs)) {
47
- qed_offset_into_cluster(s, find_cluster_cb->pos) +
41
- return true;
48
- find_cluster_cb->len);
42
- }
49
- n = qed_count_contiguous_clusters(s, request->l2_table->table,
50
- index, n, &offset);
51
-
52
- if (qed_offset_is_unalloc_cluster(offset)) {
53
- ret = QED_CLUSTER_L2;
54
- } else if (qed_offset_is_zero_cluster(offset)) {
55
- ret = QED_CLUSTER_ZERO;
56
- } else if (qed_check_cluster_offset(s, offset)) {
57
- ret = QED_CLUSTER_FOUND;
58
- } else {
59
- ret = -EINVAL;
60
- }
43
- }
61
-
44
-
62
- len = MIN(find_cluster_cb->len, n * s->header.cluster_size -
45
- return false;
63
- qed_offset_into_cluster(s, find_cluster_cb->pos));
64
-
65
-out:
66
- find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
67
- qed_release(s);
68
- g_free(find_cluster_cb);
69
-}
46
-}
70
-
47
-
71
/**
48
typedef struct {
72
* Find the offset of a data cluster
49
Coroutine *co;
73
*
50
BlockDriverState *bs;
74
@@ -XXX,XX +XXX,XX @@ out:
75
void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
76
size_t len, QEDFindClusterFunc *cb, void *opaque)
77
{
78
- QEDFindClusterCB *find_cluster_cb;
79
uint64_t l2_offset;
80
+ uint64_t offset = 0;
81
+ unsigned int index;
82
+ unsigned int n;
83
+ int ret;
84
85
/* Limit length to L2 boundary. Requests are broken up at the L2 boundary
86
* so that a request acts on one L2 table at a time.
87
@@ -XXX,XX +XXX,XX @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
88
return;
89
}
90
91
- find_cluster_cb = g_malloc(sizeof(*find_cluster_cb));
92
- find_cluster_cb->s = s;
93
- find_cluster_cb->pos = pos;
94
- find_cluster_cb->len = len;
95
- find_cluster_cb->cb = cb;
96
- find_cluster_cb->opaque = opaque;
97
- find_cluster_cb->request = request;
98
+ ret = qed_read_l2_table(s, request, l2_offset);
99
+ qed_acquire(s);
100
+ if (ret) {
101
+ goto out;
102
+ }
103
+
104
+ index = qed_l2_index(s, pos);
105
+ n = qed_bytes_to_clusters(s,
106
+ qed_offset_into_cluster(s, pos) + len);
107
+ n = qed_count_contiguous_clusters(s, request->l2_table->table,
108
+ index, n, &offset);
109
+
110
+ if (qed_offset_is_unalloc_cluster(offset)) {
111
+ ret = QED_CLUSTER_L2;
112
+ } else if (qed_offset_is_zero_cluster(offset)) {
113
+ ret = QED_CLUSTER_ZERO;
114
+ } else if (qed_check_cluster_offset(s, offset)) {
115
+ ret = QED_CLUSTER_FOUND;
116
+ } else {
117
+ ret = -EINVAL;
118
+ }
119
+
120
+ len = MIN(len,
121
+ n * s->header.cluster_size - qed_offset_into_cluster(s, pos));
122
123
- qed_read_l2_table(s, request, l2_offset,
124
- qed_find_cluster_cb, find_cluster_cb);
125
+out:
126
+ cb(opaque, ret, offset, len);
127
+ qed_release(s);
128
}
129
diff --git a/block/qed-table.c b/block/qed-table.c
130
index XXXXXXX..XXXXXXX 100644
131
--- a/block/qed-table.c
132
+++ b/block/qed-table.c
133
@@ -XXX,XX +XXX,XX @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
134
return ret;
135
}
136
137
-void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
138
- BlockCompletionFunc *cb, void *opaque)
139
+int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
140
{
141
int ret;
142
143
@@ -XXX,XX +XXX,XX @@ void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
144
/* Check for cached L2 entry */
145
request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
146
if (request->l2_table) {
147
- cb(opaque, 0);
148
- return;
149
+ return 0;
150
}
151
152
request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
153
@@ -XXX,XX +XXX,XX @@ void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
154
}
155
qed_release(s);
156
157
- cb(opaque, ret);
158
+ return ret;
159
}
160
161
int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
162
{
163
- int ret = -EINPROGRESS;
164
-
165
- qed_read_l2_table(s, request, offset, qed_sync_cb, &ret);
166
- BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
167
-
168
- return ret;
169
+ return qed_read_l2_table(s, request, offset);
170
}
171
172
void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
173
diff --git a/block/qed.h b/block/qed.h
174
index XXXXXXX..XXXXXXX 100644
175
--- a/block/qed.h
176
+++ b/block/qed.h
177
@@ -XXX,XX +XXX,XX @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
178
unsigned int n);
179
int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
180
uint64_t offset);
181
-void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
182
- BlockCompletionFunc *cb, void *opaque);
183
+int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset);
184
void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
185
unsigned int index, unsigned int n, bool flush,
186
BlockCompletionFunc *cb, void *opaque);
187
--
51
--
188
1.8.3.1
52
2.13.6
189
53
190
54
diff view generated by jsdifflib
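
The qed_find_cluster()/qed_read_l2_table() conversion above follows the pattern used throughout this series: a function that used to deliver its result through a completion callback now simply returns it, so the caller can use plain control flow and error returns. A self-contained toy illustration of the before/after shape (not QED code):

    #include <errno.h>
    #include <stdio.h>

    /* Before: the result is delivered through a completion callback. */
    typedef void CompletionFunc(void *opaque, int ret);

    static void lookup_async(int key, CompletionFunc *cb, void *opaque)
    {
        int ret = (key >= 0) ? 0 : -EINVAL;
        cb(opaque, ret);                  /* caller resumes inside the callback */
    }

    /* After: the same operation simply returns its status. */
    static int lookup(int key)
    {
        return (key >= 0) ? 0 : -EINVAL;  /* caller checks ret and continues inline */
    }

    static void done(void *opaque, int ret)
    {
        (void)opaque;
        printf("callback style: ret=%d\n", ret);
    }

    int main(void)
    {
        lookup_async(-1, done, NULL);
        printf("return style: ret=%d\n", lookup(-1));
        return 0;
    }
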
1
From: Stefan Hajnoczi <stefanha@redhat.com>
2
3
Calling aio_poll() directly may have been fine previously, but this is
4
the future, man! The difference between an aio_poll() loop and
5
BDRV_POLL_WHILE() is that BDRV_POLL_WHILE() releases the AioContext
6
around aio_poll().
7
8
This allows the IOThread to run fd handlers or BHs to complete the
9
request. Failure to release the AioContext causes deadlocks.
10
11
Using BDRV_POLL_WHILE() partially fixes a 'savevm' hang with -object
12
iothread.
13
14
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
15
Reviewed-by: Eric Blake <eblake@redhat.com>
16
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
17
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2
Reviewed-by: Fam Zheng <famz@redhat.com>
18
---
3
---
19
block/io.c | 4 +---
4
block/io.c | 6 ++++++
20
1 file changed, 1 insertion(+), 3 deletions(-)
5
1 file changed, 6 insertions(+)
21
6
22
diff --git a/block/io.c b/block/io.c
7
diff --git a/block/io.c b/block/io.c
23
index XXXXXXX..XXXXXXX 100644
8
index XXXXXXX..XXXXXXX 100644
24
--- a/block/io.c
9
--- a/block/io.c
25
+++ b/block/io.c
10
+++ b/block/io.c
26
@@ -XXX,XX +XXX,XX @@ bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
11
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
27
Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
12
BdrvNextIterator it;
28
13
GSList *aio_ctxs = NULL, *ctx;
29
bdrv_coroutine_enter(bs, co);
14
30
- while (data.ret == -EINPROGRESS) {
15
+ /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
31
- aio_poll(bdrv_get_aio_context(bs), true);
16
+ * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
32
- }
17
+ * nodes in several different AioContexts, so make sure we're in the main
33
+ BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
18
+ * context. */
34
return data.ret;
19
+ assert(qemu_get_current_aio_context() == qemu_get_aio_context());
35
}
20
+
36
}
21
block_job_pause_all();
22
23
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
37
--
24
--
38
1.8.3.1
25
2.13.6
39
26
40
27
diff view generated by jsdifflib
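
The reason for the conversion above is spelled out in the commit message: the open-coded loop keeps the node's AioContext held across aio_poll(), while BDRV_POLL_WHILE() releases it around each poll so an IOThread can run the fd handlers and bottom halves that complete the request. The two shapes, using the same names as the hunk (sketch only, QEMU tree API):

    /* Old: polls while still holding the AioContext; an IOThread that needs
     * the context to finish the request can never run, so this can deadlock. */
    while (data.ret == -EINPROGRESS) {
        aio_poll(bdrv_get_aio_context(bs), true);
    }

    /* New: BDRV_POLL_WHILE() drops the AioContext around each poll, letting
     * the IOThread make progress until the condition becomes false. */
    BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
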
1
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
1
bdrv_drained_begin() doesn't increase bs->quiesce_counter recursively
2
just return an error code and let the caller handle it.
2
and also doesn't notify other parent nodes of children, which both means
3
that the child nodes are not actually drained, and bdrv_drained_begin()
4
is providing useful functionality only on a single node.
5
6
To keep things consistent, we also shouldn't call the block driver
7
callbacks recursively.
8
9
A proper recursive drain version that provides an actually working
10
drained section for child nodes will be introduced later.
3
11
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Reviewed-by: Fam Zheng <famz@redhat.com>
6
---
14
---
7
block/qed.c | 43 ++++++++++++++++++++++++++-----------------
15
block/io.c | 16 +++++++++-------
8
1 file changed, 26 insertions(+), 17 deletions(-)
16
1 file changed, 9 insertions(+), 7 deletions(-)
9
17
10
diff --git a/block/qed.c b/block/qed.c
18
diff --git a/block/io.c b/block/io.c
11
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
12
--- a/block/qed.c
20
--- a/block/io.c
13
+++ b/block/qed.c
21
+++ b/block/io.c
14
@@ -XXX,XX +XXX,XX @@ static int qed_aio_write_l1_update(QEDAIOCB *acb)
22
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
15
/**
23
}
16
* Update L2 table with new cluster offsets and write them out
24
17
*/
25
/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
18
-static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
26
-static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
19
+static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
27
+static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
20
{
28
{
21
BDRVQEDState *s = acb_to_s(acb);
29
BdrvChild *child, *tmp;
22
bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
30
BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
23
- int index;
31
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
24
-
32
bdrv_coroutine_enter(bs, data.co);
25
- if (ret) {
33
BDRV_POLL_WHILE(bs, !data.done);
26
- goto err;
34
27
- }
35
- QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
28
+ int index, ret;
36
- bdrv_drain_invoke(child->bs, begin);
29
37
+ if (recursive) {
30
if (need_alloc) {
38
+ QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
31
qed_unref_l2_cache_entry(acb->request.l2_table);
39
+ bdrv_drain_invoke(child->bs, begin, true);
32
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
33
/* Write out the whole new L2 table */
34
ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
35
if (ret) {
36
- goto err;
37
+ return ret;
38
}
39
- ret = qed_aio_write_l1_update(acb);
40
- qed_aio_next_io(acb, ret);
41
-
42
+ return qed_aio_write_l1_update(acb);
43
} else {
44
/* Write out only the updated part of the L2 table */
45
ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
46
false);
47
- qed_aio_next_io(acb, ret);
48
+ if (ret) {
49
+ return ret;
50
+ }
40
+ }
51
}
41
}
52
- return;
53
-
54
-err:
55
- qed_aio_complete(acb, ret);
56
+ return 0;
57
}
42
}
58
43
59
/**
44
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
60
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
45
bdrv_parent_drained_begin(bs);
61
*/
62
ret = bdrv_flush(s->bs->file->bs);
63
}
64
- qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
65
+ if (ret) {
66
+ goto err;
67
+ }
68
+ ret = qed_aio_write_l2_update(acb, acb->cur_cluster);
69
+ if (ret) {
70
+ goto err;
71
+ }
72
+ qed_aio_next_io(acb, 0);
73
}
46
}
74
+ return;
47
75
+
48
- bdrv_drain_invoke(bs, true);
76
+err:
49
+ bdrv_drain_invoke(bs, true, false);
77
+ qed_aio_complete(acb, ret);
50
bdrv_drain_recurse(bs);
78
}
51
}
79
52
80
/**
53
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
81
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_zero_cluster(void *opaque, int ret)
82
return;
83
}
54
}
84
55
85
- qed_aio_write_l2_update(acb, 0, 1);
56
/* Re-enable things in child-to-parent order */
86
+ ret = qed_aio_write_l2_update(acb, 1);
57
- bdrv_drain_invoke(bs, false);
87
+ if (ret < 0) {
58
+ bdrv_drain_invoke(bs, false, false);
88
+ qed_aio_complete(acb, ret);
59
bdrv_parent_drained_end(bs);
89
+ return;
60
aio_enable_external(bdrv_get_aio_context(bs));
90
+ }
91
+ qed_aio_next_io(acb, 0);
92
}
61
}
93
62
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
94
/**
63
aio_context_acquire(aio_context);
64
aio_disable_external(aio_context);
65
bdrv_parent_drained_begin(bs);
66
- bdrv_drain_invoke(bs, true);
67
+ bdrv_drain_invoke(bs, true, true);
68
aio_context_release(aio_context);
69
70
if (!g_slist_find(aio_ctxs, aio_context)) {
71
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
72
73
/* Re-enable things in child-to-parent order */
74
aio_context_acquire(aio_context);
75
- bdrv_drain_invoke(bs, false);
76
+ bdrv_drain_invoke(bs, false, true);
77
bdrv_parent_drained_end(bs);
78
aio_enable_external(aio_context);
79
aio_context_release(aio_context);
95
--
80
--
96
1.8.3.1
81
2.13.6
97
82
98
83
diff view generated by jsdifflib
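To illustrate the recursive flag added to bdrv_drain_invoke() in the right-hand patch above, here is a small toy model in plain C (the Node type and its fields are made up, nothing here is QEMU code): a single-node call leaves the children untouched, while the recursive variant visits the whole subtree.

    #include <stdio.h>

    /* Toy node tree; names and fields are illustrative only. */
    typedef struct Node {
        const char *name;
        int drain_count;
        struct Node *children[2];
    } Node;

    /* Invoke the "driver callback" on one node and, only when requested,
     * recurse into its children -- the same shape as the recursive flag
     * added to bdrv_drain_invoke() above. */
    static void drain_invoke(Node *n, int begin, int recursive)
    {
        n->drain_count += begin ? 1 : -1;
        if (recursive) {
            for (int i = 0; i < 2; i++) {
                if (n->children[i]) {
                    drain_invoke(n->children[i], begin, recursive);
                }
            }
        }
    }

    int main(void)
    {
        Node backing = { "backing", 0, { NULL, NULL } };
        Node overlay = { "overlay", 0, { &backing, NULL } };

        drain_invoke(&overlay, 1, 0);   /* single-node drain: backing untouched */
        printf("%s=%d %s=%d\n", overlay.name, overlay.drain_count,
               backing.name, backing.drain_count);
        drain_invoke(&overlay, 0, 0);

        drain_invoke(&overlay, 1, 1);   /* recursive drain: backing drained too */
        printf("%s=%d %s=%d\n", overlay.name, overlay.drain_count,
               backing.name, backing.drain_count);
        drain_invoke(&overlay, 0, 1);
        return 0;
    }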
1
qed_commit_l2_update() is unconditionally called at the end of
1
The existing test is for bdrv_drain_all_begin/end() only. Generalise the
2
qed_aio_write_l1_update(). Inline it.
2
test case so that it can be run for the other variants as well. At the
3
moment this is only bdrv_drain_begin/end(), but in a while, we'll add
4
another one.
5
6
Also, add a backing file to the test node to test whether the operations
7
work recursively.
3
8
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
6
---
10
---
7
block/qed.c | 36 ++++++++++++++----------------------
11
tests/test-bdrv-drain.c | 69 ++++++++++++++++++++++++++++++++++++++++++++-----
8
1 file changed, 14 insertions(+), 22 deletions(-)
12
1 file changed, 62 insertions(+), 7 deletions(-)
9
13
10
diff --git a/block/qed.c b/block/qed.c
14
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
11
index XXXXXXX..XXXXXXX 100644
15
index XXXXXXX..XXXXXXX 100644
12
--- a/block/qed.c
16
--- a/tests/test-bdrv-drain.c
13
+++ b/block/qed.c
17
+++ b/tests/test-bdrv-drain.c
14
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
18
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_test = {
19
20
.bdrv_co_drain_begin = bdrv_test_co_drain_begin,
21
.bdrv_co_drain_end = bdrv_test_co_drain_end,
22
+
23
+ .bdrv_child_perm = bdrv_format_default_perms,
24
};
25
26
static void aio_ret_cb(void *opaque, int ret)
27
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
28
*aio_ret = ret;
15
}
29
}
16
30
17
/**
31
-static void test_drv_cb_drain_all(void)
18
- * Commit the current L2 table to the cache
32
+enum drain_type {
19
+ * Update L1 table with new L2 table offset and write it out
33
+ BDRV_DRAIN_ALL,
20
*/
34
+ BDRV_DRAIN,
21
-static void qed_commit_l2_update(void *opaque, int ret)
35
+};
22
+static void qed_aio_write_l1_update(void *opaque, int ret)
36
+
37
+static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
38
+{
39
+ switch (drain_type) {
40
+ case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break;
41
+ case BDRV_DRAIN: bdrv_drained_begin(bs); break;
42
+ default: g_assert_not_reached();
43
+ }
44
+}
45
+
46
+static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
47
+{
48
+ switch (drain_type) {
49
+ case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break;
50
+ case BDRV_DRAIN: bdrv_drained_end(bs); break;
51
+ default: g_assert_not_reached();
52
+ }
53
+}
54
+
55
+static void test_drv_cb_common(enum drain_type drain_type, bool recursive)
23
{
56
{
24
QEDAIOCB *acb = opaque;
57
BlockBackend *blk;
25
BDRVQEDState *s = acb_to_s(acb);
58
- BlockDriverState *bs;
26
CachedL2Table *l2_table = acb->request.l2_table;
59
- BDRVTestState *s;
27
uint64_t l2_offset = l2_table->offset;
60
+ BlockDriverState *bs, *backing;
28
+ int index;
61
+ BDRVTestState *s, *backing_s;
62
BlockAIOCB *acb;
63
int aio_ret;
64
65
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
66
s = bs->opaque;
67
blk_insert_bs(blk, bs, &error_abort);
68
69
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
70
+ backing_s = backing->opaque;
71
+ bdrv_set_backing_hd(bs, backing, &error_abort);
29
+
72
+
30
+ if (ret) {
73
/* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
31
+ qed_aio_complete(acb, ret);
74
g_assert_cmpint(s->drain_count, ==, 0);
32
+ return;
75
- bdrv_drain_all_begin();
33
+ }
76
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
34
35
+ index = qed_l1_index(s, acb->cur_pos);
36
+ s->l1_table->offsets[index] = l2_table->offset;
37
+
77
+
38
+ ret = qed_write_l1_table(s, index, 1);
78
+ do_drain_begin(drain_type, bs);
39
+
79
+
40
+ /* Commit the current L2 table to the cache */
80
g_assert_cmpint(s->drain_count, ==, 1);
41
qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
81
- bdrv_drain_all_end();
42
82
+ g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
43
/* This is guaranteed to succeed because we just committed the entry to the
83
+
44
@@ -XXX,XX +XXX,XX @@ static void qed_commit_l2_update(void *opaque, int ret)
84
+ do_drain_end(drain_type, bs);
45
qed_aio_next_io(acb, ret);
85
+
86
g_assert_cmpint(s->drain_count, ==, 0);
87
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
88
89
/* Now do the same while a request is pending */
90
aio_ret = -EINPROGRESS;
91
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
92
g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
93
94
g_assert_cmpint(s->drain_count, ==, 0);
95
- bdrv_drain_all_begin();
96
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
97
+
98
+ do_drain_begin(drain_type, bs);
99
+
100
g_assert_cmpint(aio_ret, ==, 0);
101
g_assert_cmpint(s->drain_count, ==, 1);
102
- bdrv_drain_all_end();
103
+ g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
104
+
105
+ do_drain_end(drain_type, bs);
106
+
107
g_assert_cmpint(s->drain_count, ==, 0);
108
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
109
110
+ bdrv_unref(backing);
111
bdrv_unref(bs);
112
blk_unref(blk);
46
}
113
}
47
114
48
-/**
115
+static void test_drv_cb_drain_all(void)
49
- * Update L1 table with new L2 table offset and write it out
116
+{
50
- */
117
+ test_drv_cb_common(BDRV_DRAIN_ALL, true);
51
-static void qed_aio_write_l1_update(void *opaque, int ret)
118
+}
52
-{
119
+
53
- QEDAIOCB *acb = opaque;
120
+static void test_drv_cb_drain(void)
54
- BDRVQEDState *s = acb_to_s(acb);
121
+{
55
- int index;
122
+ test_drv_cb_common(BDRV_DRAIN, false);
56
-
123
+}
57
- if (ret) {
124
+
58
- qed_aio_complete(acb, ret);
125
int main(int argc, char **argv)
59
- return;
126
{
60
- }
127
bdrv_init();
61
-
128
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
62
- index = qed_l1_index(s, acb->cur_pos);
129
g_test_init(&argc, &argv, NULL);
63
- s->l1_table->offsets[index] = acb->request.l2_table->offset;
130
64
-
131
g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
65
- ret = qed_write_l1_table(s, index, 1);
132
+ g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
66
- qed_commit_l2_update(acb, ret);
133
67
-}
134
return g_test_run();
68
135
}
69
/**
70
* Update L2 table with new cluster offsets and write them out
71
--
136
--
72
1.8.3.1
137
2.13.6
73
138
74
139
diff view generated by jsdifflib
1
After _cleanup_qemu(), test cases should be able to start the next qemu
1
This is currently only working correctly for bdrv_drain(), not for
2
process and call _cleanup_qemu() for that one as well. For this to work
2
bdrv_drain_all(). Leave a comment for the drain_all case, we'll address
3
cleanly, we need to improve the cleanup so that the second invocation
3
it later.
4
doesn't try to kill the qemu instances from the first invocation a
5
second time (which would result in error messages).
6
4
7
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
Reviewed-by: Eric Blake <eblake@redhat.com>
9
Reviewed-by: Max Reitz <mreitz@redhat.com>
10
---
6
---
11
tests/qemu-iotests/common.qemu | 3 +++
7
tests/test-bdrv-drain.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
12
1 file changed, 3 insertions(+)
8
1 file changed, 45 insertions(+)
13
9
14
diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu
10
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
15
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
16
--- a/tests/qemu-iotests/common.qemu
12
--- a/tests/test-bdrv-drain.c
17
+++ b/tests/qemu-iotests/common.qemu
13
+++ b/tests/test-bdrv-drain.c
18
@@ -XXX,XX +XXX,XX @@ function _cleanup_qemu()
14
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
19
rm -f "${QEMU_FIFO_IN}_${i}" "${QEMU_FIFO_OUT}_${i}"
15
test_drv_cb_common(BDRV_DRAIN, false);
20
eval "exec ${QEMU_IN[$i]}<&-" # close file descriptors
16
}
21
eval "exec ${QEMU_OUT[$i]}<&-"
17
18
+static void test_quiesce_common(enum drain_type drain_type, bool recursive)
19
+{
20
+ BlockBackend *blk;
21
+ BlockDriverState *bs, *backing;
22
+
22
+
23
+ unset QEMU_IN[$i]
23
+ blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
24
+ unset QEMU_OUT[$i]
24
+ bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
25
done
25
+ &error_abort);
26
+ blk_insert_bs(blk, bs, &error_abort);
27
+
28
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
29
+ bdrv_set_backing_hd(bs, backing, &error_abort);
30
+
31
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
32
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
33
+
34
+ do_drain_begin(drain_type, bs);
35
+
36
+ g_assert_cmpint(bs->quiesce_counter, ==, 1);
37
+ g_assert_cmpint(backing->quiesce_counter, ==, !!recursive);
38
+
39
+ do_drain_end(drain_type, bs);
40
+
41
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
42
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
43
+
44
+ bdrv_unref(backing);
45
+ bdrv_unref(bs);
46
+ blk_unref(blk);
47
+}
48
+
49
+static void test_quiesce_drain_all(void)
50
+{
51
+ // XXX drain_all doesn't quiesce
52
+ //test_quiesce_common(BDRV_DRAIN_ALL, true);
53
+}
54
+
55
+static void test_quiesce_drain(void)
56
+{
57
+ test_quiesce_common(BDRV_DRAIN, false);
58
+}
59
+
60
int main(int argc, char **argv)
61
{
62
bdrv_init();
63
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
64
g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
65
g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
66
67
+ g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
68
+ g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
69
+
70
return g_test_run();
26
}
71
}
27
--
72
--
28
1.8.3.1
73
2.13.6
29
74
30
75
diff view generated by jsdifflib
1
From: "sochin.jiang" <sochin.jiang@huawei.com>
1
Block jobs already paused themselves when their main BlockBackend
2
entered a drained section. This is not good enough: We also want to
3
pause a block job so that it does not submit new requests if, for example, the
4
mirror target node should be drained.
2
5
3
img_commit could fall into an infinite loop calling run_block_job() if
6
This implements .drained_begin/end callbacks in child_job in order to
4
its blockjob fails on any I/O error. Fix this already known problem.
7
consider all block nodes related to the job, and removes the
8
BlockBackend callbacks which are unnecessary now because the root of the
9
job's main BlockBackend is always referenced with a child_job, too.
5
10
6
Signed-off-by: sochin.jiang <sochin.jiang@huawei.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
7
Message-id: 1497509253-28941-1-git-send-email-sochin.jiang@huawei.com
8
Signed-off-by: Max Reitz <mreitz@redhat.com>
9
---
12
---
10
blockjob.c | 4 ++--
13
blockjob.c | 22 +++++++++-------------
11
include/block/blockjob.h | 18 ++++++++++++++++++
14
1 file changed, 9 insertions(+), 13 deletions(-)
12
qemu-img.c | 20 +++++++++++++-------
13
3 files changed, 33 insertions(+), 9 deletions(-)
14
15
15
diff --git a/blockjob.c b/blockjob.c
16
diff --git a/blockjob.c b/blockjob.c
16
index XXXXXXX..XXXXXXX 100644
17
index XXXXXXX..XXXXXXX 100644
17
--- a/blockjob.c
18
--- a/blockjob.c
18
+++ b/blockjob.c
19
+++ b/blockjob.c
19
@@ -XXX,XX +XXX,XX @@ static void block_job_resume(BlockJob *job)
20
@@ -XXX,XX +XXX,XX @@ static char *child_job_get_parent_desc(BdrvChild *c)
20
block_job_enter(job);
21
job->id);
21
}
22
}
22
23
23
-static void block_job_ref(BlockJob *job)
24
-static const BdrvChildRole child_job = {
24
+void block_job_ref(BlockJob *job)
25
- .get_parent_desc = child_job_get_parent_desc,
26
- .stay_at_node = true,
27
-};
28
-
29
-static void block_job_drained_begin(void *opaque)
30
+static void child_job_drained_begin(BdrvChild *c)
25
{
31
{
26
++job->refcnt;
32
- BlockJob *job = opaque;
33
+ BlockJob *job = c->opaque;
34
block_job_pause(job);
27
}
35
}
28
@@ -XXX,XX +XXX,XX @@ static void block_job_attached_aio_context(AioContext *new_context,
36
29
void *opaque);
37
-static void block_job_drained_end(void *opaque)
30
static void block_job_detach_aio_context(void *opaque);
38
+static void child_job_drained_end(BdrvChild *c)
31
32
-static void block_job_unref(BlockJob *job)
33
+void block_job_unref(BlockJob *job)
34
{
39
{
35
if (--job->refcnt == 0) {
40
- BlockJob *job = opaque;
36
BlockDriverState *bs = blk_bs(job->blk);
41
+ BlockJob *job = c->opaque;
37
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
42
block_job_resume(job);
38
index XXXXXXX..XXXXXXX 100644
39
--- a/include/block/blockjob.h
40
+++ b/include/block/blockjob.h
41
@@ -XXX,XX +XXX,XX @@ void block_job_iostatus_reset(BlockJob *job);
42
BlockJobTxn *block_job_txn_new(void);
43
44
/**
45
+ * block_job_ref:
46
+ *
47
+ * Add a reference to BlockJob refcnt, it will be decreased with
48
+ * block_job_unref, and then be freed if it comes to be the last
49
+ * reference.
50
+ */
51
+void block_job_ref(BlockJob *job);
52
+
53
+/**
54
+ * block_job_unref:
55
+ *
56
+ * Release a reference that was previously acquired with block_job_ref
57
+ * or block_job_create. If it's the last reference to the object, it will be
58
+ * freed.
59
+ */
60
+void block_job_unref(BlockJob *job);
61
+
62
+/**
63
* block_job_txn_unref:
64
*
65
* Release a reference that was previously acquired with block_job_txn_add_job
66
diff --git a/qemu-img.c b/qemu-img.c
67
index XXXXXXX..XXXXXXX 100644
68
--- a/qemu-img.c
69
+++ b/qemu-img.c
70
@@ -XXX,XX +XXX,XX @@ static void common_block_job_cb(void *opaque, int ret)
71
static void run_block_job(BlockJob *job, Error **errp)
72
{
73
AioContext *aio_context = blk_get_aio_context(job->blk);
74
+ int ret = 0;
75
76
- /* FIXME In error cases, the job simply goes away and we access a dangling
77
- * pointer below. */
78
aio_context_acquire(aio_context);
79
+ block_job_ref(job);
80
do {
81
aio_poll(aio_context, true);
82
qemu_progress_print(job->len ?
83
((float)job->offset / job->len * 100.f) : 0.0f, 0);
84
- } while (!job->ready);
85
+ } while (!job->ready && !job->completed);
86
87
- block_job_complete_sync(job, errp);
88
+ if (!job->completed) {
89
+ ret = block_job_complete_sync(job, errp);
90
+ } else {
91
+ ret = job->ret;
92
+ }
93
+ block_job_unref(job);
94
aio_context_release(aio_context);
95
96
- /* A block job may finish instantaneously without publishing any progress,
97
- * so just signal completion here */
98
- qemu_progress_print(100.f, 0);
99
+ /* publish completion progress only when success */
100
+ if (!ret) {
101
+ qemu_progress_print(100.f, 0);
102
+ }
103
}
43
}
104
44
105
static int img_commit(int argc, char **argv)
45
-static const BlockDevOps block_job_dev_ops = {
46
- .drained_begin = block_job_drained_begin,
47
- .drained_end = block_job_drained_end,
48
+static const BdrvChildRole child_job = {
49
+ .get_parent_desc = child_job_get_parent_desc,
50
+ .drained_begin = child_job_drained_begin,
51
+ .drained_end = child_job_drained_end,
52
+ .stay_at_node = true,
53
};
54
55
void block_job_remove_all_bdrv(BlockJob *job)
56
@@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
57
block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort);
58
bs->job = job;
59
60
- blk_set_dev_ops(blk, &block_job_dev_ops, job);
61
bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
62
63
QLIST_INSERT_HEAD(&block_jobs, job, job_list);
106
--
64
--
107
1.8.3.1
65
2.13.6
108
66
109
67
diff view generated by jsdifflib
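The run_block_job() change in the left-hand patch above takes a reference around the polling loop so the job cannot disappear while job->ret is still needed. A stripped-down sketch of that lifetime pattern (toy Job type and helpers, nothing from QEMU) might look like this:

    #include <stdio.h>
    #include <stdlib.h>

    /* Toy job object with a reference count; nothing here is QEMU API. */
    typedef struct Job {
        int refcnt;
        int completed;
        int ret;
    } Job;

    static void job_ref(Job *job)
    {
        job->refcnt++;
    }

    static void job_unref(Job *job)
    {
        if (--job->refcnt == 0) {
            free(job);
        }
    }

    /* The job may complete and drop its own reference while we poll, so the
     * caller holds an extra reference for the whole loop -- the same pattern
     * run_block_job() uses in the patch above. */
    static int run_job(Job *job)
    {
        int ret;

        job_ref(job);
        while (!job->completed) {
            /* stand-in for one aio_poll() iteration that completes the job */
            job->ret = -5;
            job->completed = 1;
            job_unref(job);         /* completion drops the job's own reference */
        }
        ret = job->ret;             /* still valid: we hold our own reference */
        job_unref(job);
        return ret;
    }

    int main(void)
    {
        Job *job = calloc(1, sizeof(*job));

        if (!job) {
            return 1;
        }
        job->refcnt = 1;            /* reference owned by the job itself */
        printf("job finished with %d\n", run_job(job));
        return 0;
    }

Without the extra reference, the final read of job->ret would be a use-after-free whenever the job completes inside the loop, which is exactly the FIXME the patch removes.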
1
From: Alberto Garcia <berto@igalia.com>
1
Block jobs must be paused if any of the involved nodes are drained.
2
2
3
This patch splits do_perform_cow() into three separate functions to
4
read, encrypt and write the COW regions.
5
6
perform_cow() can now read both regions first, then encrypt them and
7
finally write them to disk. The memory allocation is also done in
8
this function now, using one single buffer large enough to hold both
9
regions.
10
11
Signed-off-by: Alberto Garcia <berto@igalia.com>
12
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
13
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
3
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
14
---
4
---
15
block/qcow2-cluster.c | 117 +++++++++++++++++++++++++++++++++++++-------------
5
tests/test-bdrv-drain.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++
16
1 file changed, 87 insertions(+), 30 deletions(-)
6
1 file changed, 121 insertions(+)
17
7
18
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
8
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
19
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
20
--- a/block/qcow2-cluster.c
10
--- a/tests/test-bdrv-drain.c
21
+++ b/block/qcow2-cluster.c
11
+++ b/tests/test-bdrv-drain.c
22
@@ -XXX,XX +XXX,XX @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
12
@@ -XXX,XX +XXX,XX @@
23
return 0;
13
14
#include "qemu/osdep.h"
15
#include "block/block.h"
16
+#include "block/blockjob_int.h"
17
#include "sysemu/block-backend.h"
18
#include "qapi/error.h"
19
20
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
21
test_quiesce_common(BDRV_DRAIN, false);
24
}
22
}
25
23
26
-static int coroutine_fn do_perform_cow(BlockDriverState *bs,
24
+
27
- uint64_t src_cluster_offset,
25
+typedef struct TestBlockJob {
28
- uint64_t cluster_offset,
26
+ BlockJob common;
29
- unsigned offset_in_cluster,
27
+ bool should_complete;
30
- unsigned bytes)
28
+} TestBlockJob;
31
+static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
29
+
32
+ uint64_t src_cluster_offset,
30
+static void test_job_completed(BlockJob *job, void *opaque)
33
+ unsigned offset_in_cluster,
31
+{
34
+ uint8_t *buffer,
32
+ block_job_completed(job, 0);
35
+ unsigned bytes)
36
{
37
- BDRVQcow2State *s = bs->opaque;
38
QEMUIOVector qiov;
39
- struct iovec iov;
40
+ struct iovec iov = { .iov_base = buffer, .iov_len = bytes };
41
int ret;
42
43
if (bytes == 0) {
44
return 0;
45
}
46
47
- iov.iov_len = bytes;
48
- iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
49
- if (iov.iov_base == NULL) {
50
- return -ENOMEM;
51
- }
52
-
53
qemu_iovec_init_external(&qiov, &iov, 1);
54
55
BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
56
57
if (!bs->drv) {
58
- ret = -ENOMEDIUM;
59
- goto out;
60
+ return -ENOMEDIUM;
61
}
62
63
/* Call .bdrv_co_readv() directly instead of using the public block-layer
64
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn do_perform_cow(BlockDriverState *bs,
65
ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
66
bytes, &qiov, 0);
67
if (ret < 0) {
68
- goto out;
69
+ return ret;
70
}
71
72
- if (bs->encrypted) {
73
+ return 0;
74
+}
33
+}
75
+
34
+
76
+static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
35
+static void coroutine_fn test_job_start(void *opaque)
77
+ uint64_t src_cluster_offset,
78
+ unsigned offset_in_cluster,
79
+ uint8_t *buffer,
80
+ unsigned bytes)
81
+{
36
+{
82
+ if (bytes && bs->encrypted) {
37
+ TestBlockJob *s = opaque;
83
+ BDRVQcow2State *s = bs->opaque;
38
+
84
int64_t sector = (src_cluster_offset + offset_in_cluster)
39
+ while (!s->should_complete) {
85
>> BDRV_SECTOR_BITS;
40
+ block_job_sleep_ns(&s->common, 100000);
86
assert(s->cipher);
41
+ }
87
assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
42
+
88
assert((bytes & ~BDRV_SECTOR_MASK) == 0);
43
+ block_job_defer_to_main_loop(&s->common, test_job_completed, NULL);
89
- if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base,
90
+ if (qcow2_encrypt_sectors(s, sector, buffer, buffer,
91
bytes >> BDRV_SECTOR_BITS, true, NULL) < 0) {
92
- ret = -EIO;
93
- goto out;
94
+ return false;
95
}
96
}
97
+ return true;
98
+}
44
+}
99
+
45
+
100
+static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
46
+static void test_job_complete(BlockJob *job, Error **errp)
101
+ uint64_t cluster_offset,
102
+ unsigned offset_in_cluster,
103
+ uint8_t *buffer,
104
+ unsigned bytes)
105
+{
47
+{
106
+ QEMUIOVector qiov;
48
+ TestBlockJob *s = container_of(job, TestBlockJob, common);
107
+ struct iovec iov = { .iov_base = buffer, .iov_len = bytes };
49
+ s->should_complete = true;
50
+}
51
+
52
+BlockJobDriver test_job_driver = {
53
+ .instance_size = sizeof(TestBlockJob),
54
+ .start = test_job_start,
55
+ .complete = test_job_complete,
56
+};
57
+
58
+static void test_blockjob_common(enum drain_type drain_type)
59
+{
60
+ BlockBackend *blk_src, *blk_target;
61
+ BlockDriverState *src, *target;
62
+ BlockJob *job;
108
+ int ret;
63
+ int ret;
109
+
64
+
110
+ if (bytes == 0) {
65
+ src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR,
111
+ return 0;
66
+ &error_abort);
67
+ blk_src = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
68
+ blk_insert_bs(blk_src, src, &error_abort);
69
+
70
+ target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR,
71
+ &error_abort);
72
+ blk_target = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
73
+ blk_insert_bs(blk_target, target, &error_abort);
74
+
75
+ job = block_job_create("job0", &test_job_driver, src, 0, BLK_PERM_ALL, 0,
76
+ 0, NULL, NULL, &error_abort);
77
+ block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort);
78
+ block_job_start(job);
79
+
80
+ g_assert_cmpint(job->pause_count, ==, 0);
81
+ g_assert_false(job->paused);
82
+ g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
83
+
84
+ do_drain_begin(drain_type, src);
85
+
86
+ if (drain_type == BDRV_DRAIN_ALL) {
87
+ /* bdrv_drain_all() drains both src and target, and involves an
88
+ * additional block_job_pause_all() */
89
+ g_assert_cmpint(job->pause_count, ==, 3);
90
+ } else {
91
+ g_assert_cmpint(job->pause_count, ==, 1);
112
+ }
92
+ }
93
+ /* XXX We don't wait until the job is actually paused. Is this okay? */
94
+ /* g_assert_true(job->paused); */
95
+ g_assert_false(job->busy); /* The job is paused */
113
+
96
+
114
+ qemu_iovec_init_external(&qiov, &iov, 1);
97
+ do_drain_end(drain_type, src);
115
98
+
116
ret = qcow2_pre_write_overlap_check(bs, 0,
99
+ g_assert_cmpint(job->pause_count, ==, 0);
117
cluster_offset + offset_in_cluster, bytes);
100
+ g_assert_false(job->paused);
118
if (ret < 0) {
101
+ g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
119
- goto out;
102
+
120
+ return ret;
103
+ do_drain_begin(drain_type, target);
121
}
104
+
122
105
+ if (drain_type == BDRV_DRAIN_ALL) {
123
BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
106
+ /* bdrv_drain_all() drains both src and target, and involves an
124
ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster,
107
+ * additional block_job_pause_all() */
125
bytes, &qiov, 0);
108
+ g_assert_cmpint(job->pause_count, ==, 3);
126
if (ret < 0) {
109
+ } else {
127
- goto out;
110
+ g_assert_cmpint(job->pause_count, ==, 1);
128
+ return ret;
111
+ }
129
}
112
+ /* XXX We don't wait until the job is actually paused. Is this okay? */
130
113
+ /* g_assert_true(job->paused); */
131
- ret = 0;
114
+ g_assert_false(job->busy); /* The job is paused */
132
-out:
115
+
133
- qemu_vfree(iov.iov_base);
116
+ do_drain_end(drain_type, target);
134
- return ret;
117
+
135
+ return 0;
118
+ g_assert_cmpint(job->pause_count, ==, 0);
119
+ g_assert_false(job->paused);
120
+ g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
121
+
122
+ ret = block_job_complete_sync(job, &error_abort);
123
+ g_assert_cmpint(ret, ==, 0);
124
+
125
+ blk_unref(blk_src);
126
+ blk_unref(blk_target);
127
+ bdrv_unref(src);
128
+ bdrv_unref(target);
129
+}
130
+
131
+static void test_blockjob_drain_all(void)
132
+{
133
+ test_blockjob_common(BDRV_DRAIN_ALL);
134
+}
135
+
136
+static void test_blockjob_drain(void)
137
+{
138
+ test_blockjob_common(BDRV_DRAIN);
139
+}
140
+
141
int main(int argc, char **argv)
142
{
143
bdrv_init();
144
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
145
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
146
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
147
148
+ g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
149
+ g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
150
+
151
return g_test_run();
136
}
152
}
137
138
139
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
140
BDRVQcow2State *s = bs->opaque;
141
Qcow2COWRegion *start = &m->cow_start;
142
Qcow2COWRegion *end = &m->cow_end;
143
+ unsigned buffer_size;
144
+ uint8_t *start_buffer, *end_buffer;
145
int ret;
146
147
+ assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
148
+
149
if (start->nb_bytes == 0 && end->nb_bytes == 0) {
150
return 0;
151
}
152
153
+ /* Reserve a buffer large enough to store the data from both the
154
+ * start and end COW regions. Add some padding in the middle if
155
+ * necessary to make sure that the end region is optimally aligned */
156
+ buffer_size = QEMU_ALIGN_UP(start->nb_bytes, bdrv_opt_mem_align(bs)) +
157
+ end->nb_bytes;
158
+ start_buffer = qemu_try_blockalign(bs, buffer_size);
159
+ if (start_buffer == NULL) {
160
+ return -ENOMEM;
161
+ }
162
+ /* The part of the buffer where the end region is located */
163
+ end_buffer = start_buffer + buffer_size - end->nb_bytes;
164
+
165
qemu_co_mutex_unlock(&s->lock);
166
- ret = do_perform_cow(bs, m->offset, m->alloc_offset,
167
- start->offset, start->nb_bytes);
168
+ /* First we read the existing data from both COW regions */
169
+ ret = do_perform_cow_read(bs, m->offset, start->offset,
170
+ start_buffer, start->nb_bytes);
171
if (ret < 0) {
172
goto fail;
173
}
174
175
- ret = do_perform_cow(bs, m->offset, m->alloc_offset,
176
- end->offset, end->nb_bytes);
177
+ ret = do_perform_cow_read(bs, m->offset, end->offset,
178
+ end_buffer, end->nb_bytes);
179
+ if (ret < 0) {
180
+ goto fail;
181
+ }
182
+
183
+ /* Encrypt the data if necessary before writing it */
184
+ if (bs->encrypted) {
185
+ if (!do_perform_cow_encrypt(bs, m->offset, start->offset,
186
+ start_buffer, start->nb_bytes) ||
187
+ !do_perform_cow_encrypt(bs, m->offset, end->offset,
188
+ end_buffer, end->nb_bytes)) {
189
+ ret = -EIO;
190
+ goto fail;
191
+ }
192
+ }
193
+
194
+ /* And now we can write everything */
195
+ ret = do_perform_cow_write(bs, m->alloc_offset, start->offset,
196
+ start_buffer, start->nb_bytes);
197
+ if (ret < 0) {
198
+ goto fail;
199
+ }
200
201
+ ret = do_perform_cow_write(bs, m->alloc_offset, end->offset,
202
+ end_buffer, end->nb_bytes);
203
fail:
204
qemu_co_mutex_lock(&s->lock);
205
206
@@ -XXX,XX +XXX,XX @@ fail:
207
qcow2_cache_depends_on_flush(s->l2_table_cache);
208
}
209
210
+ qemu_vfree(start_buffer);
211
return ret;
212
}
213
214
--
153
--
215
1.8.3.1
154
2.13.6
216
155
217
156
diff view generated by jsdifflib
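The qcow2 series in the left-hand column above ends up reserving one buffer for both COW regions, padded so that the end region starts at an aligned offset. A rough standalone illustration of that layout (all sizes are made-up examples, and 4096 merely stands in for bdrv_opt_mem_align()):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

    /* One allocation holds both COW regions; padding in the middle keeps the
     * end region at an aligned offset within the buffer. */
    int main(void)
    {
        unsigned start_bytes = 1536;    /* head COW region (example size) */
        unsigned end_bytes   = 2560;    /* tail COW region (example size) */
        unsigned align       = 4096;

        unsigned buffer_size = ALIGN_UP(start_bytes, align) + end_bytes;
        uint8_t *start_buffer = malloc(buffer_size);  /* QEMU: qemu_try_blockalign() */
        if (!start_buffer) {
            return 1;
        }
        /* The end region lives at the very tail of the buffer. */
        uint8_t *end_buffer = start_buffer + buffer_size - end_bytes;

        printf("buffer %u bytes, end region at offset %td\n",
               buffer_size, end_buffer - start_buffer);
        free(start_buffer);
        return 0;
    }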
1
From: Alberto Garcia <berto@igalia.com>
1
Block jobs are already paused using the BdrvChildRole drain callbacks,
2
so we don't need an additional block_job_pause_all() call.
2
3
3
Instead of calling perform_cow() twice with a different COW region
4
each time, call it just once and make perform_cow() handle both
5
regions.
6
7
This patch simply moves code around. The next one will do the actual
8
reordering of the COW operations.
9
10
Signed-off-by: Alberto Garcia <berto@igalia.com>
11
Reviewed-by: Eric Blake <eblake@redhat.com>
12
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
13
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
14
---
5
---
15
block/qcow2-cluster.c | 36 ++++++++++++++++++++++--------------
6
block/io.c | 4 ----
16
1 file changed, 22 insertions(+), 14 deletions(-)
7
tests/test-bdrv-drain.c | 10 ++++------
8
2 files changed, 4 insertions(+), 10 deletions(-)
17
9
18
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
10
diff --git a/block/io.c b/block/io.c
19
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
20
--- a/block/qcow2-cluster.c
12
--- a/block/io.c
21
+++ b/block/qcow2-cluster.c
13
+++ b/block/io.c
22
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn do_perform_cow(BlockDriverState *bs,
14
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
23
struct iovec iov;
15
* context. */
24
int ret;
16
assert(qemu_get_current_aio_context() == qemu_get_aio_context());
25
17
26
+ if (bytes == 0) {
18
- block_job_pause_all();
27
+ return 0;
19
-
28
+ }
20
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
29
+
21
AioContext *aio_context = bdrv_get_aio_context(bs);
30
iov.iov_len = bytes;
22
31
iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
23
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
32
if (iov.iov_base == NULL) {
24
aio_enable_external(aio_context);
33
@@ -XXX,XX +XXX,XX @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
25
aio_context_release(aio_context);
34
return cluster_offset;
26
}
27
-
28
- block_job_resume_all();
35
}
29
}
36
30
37
-static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
31
void bdrv_drain_all(void)
38
+static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
32
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
39
{
33
index XXXXXXX..XXXXXXX 100644
40
BDRVQcow2State *s = bs->opaque;
34
--- a/tests/test-bdrv-drain.c
41
+ Qcow2COWRegion *start = &m->cow_start;
35
+++ b/tests/test-bdrv-drain.c
42
+ Qcow2COWRegion *end = &m->cow_end;
36
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
43
int ret;
37
do_drain_begin(drain_type, src);
44
38
45
- if (r->nb_bytes == 0) {
39
if (drain_type == BDRV_DRAIN_ALL) {
46
+ if (start->nb_bytes == 0 && end->nb_bytes == 0) {
40
- /* bdrv_drain_all() drains both src and target, and involves an
47
return 0;
41
- * additional block_job_pause_all() */
42
- g_assert_cmpint(job->pause_count, ==, 3);
43
+ /* bdrv_drain_all() drains both src and target */
44
+ g_assert_cmpint(job->pause_count, ==, 2);
45
} else {
46
g_assert_cmpint(job->pause_count, ==, 1);
48
}
47
}
49
48
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
50
qemu_co_mutex_unlock(&s->lock);
49
do_drain_begin(drain_type, target);
51
- ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, r->nb_bytes);
50
52
- qemu_co_mutex_lock(&s->lock);
51
if (drain_type == BDRV_DRAIN_ALL) {
53
-
52
- /* bdrv_drain_all() drains both src and target, and involves an
54
+ ret = do_perform_cow(bs, m->offset, m->alloc_offset,
53
- * additional block_job_pause_all() */
55
+ start->offset, start->nb_bytes);
54
- g_assert_cmpint(job->pause_count, ==, 3);
56
if (ret < 0) {
55
+ /* bdrv_drain_all() drains both src and target */
57
- return ret;
56
+ g_assert_cmpint(job->pause_count, ==, 2);
58
+ goto fail;
57
} else {
59
}
58
g_assert_cmpint(job->pause_count, ==, 1);
60
61
+ ret = do_perform_cow(bs, m->offset, m->alloc_offset,
62
+ end->offset, end->nb_bytes);
63
+
64
+fail:
65
+ qemu_co_mutex_lock(&s->lock);
66
+
67
/*
68
* Before we update the L2 table to actually point to the new cluster, we
69
* need to be sure that the refcounts have been increased and COW was
70
* handled.
71
*/
72
- qcow2_cache_depends_on_flush(s->l2_table_cache);
73
+ if (ret == 0) {
74
+ qcow2_cache_depends_on_flush(s->l2_table_cache);
75
+ }
76
77
- return 0;
78
+ return ret;
79
}
80
81
int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
82
@@ -XXX,XX +XXX,XX @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
83
}
84
85
/* copy content of unmodified sectors */
86
- ret = perform_cow(bs, m, &m->cow_start);
87
- if (ret < 0) {
88
- goto err;
89
- }
90
-
91
- ret = perform_cow(bs, m, &m->cow_end);
92
+ ret = perform_cow(bs, m);
93
if (ret < 0) {
94
goto err;
95
}
59
}
96
--
60
--
97
1.8.3.1
61
2.13.6
98
62
99
63
diff view generated by jsdifflib
1
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
1
bdrv_do_drained_begin() restricts the call of parent callbacks and
2
just return an error code and let the caller handle it.
2
aio_disable_external() to the outermost drain section, but the block
3
driver callbacks are always called. bdrv_do_drained_end() must match
4
this behaviour; otherwise nodes stay drained even if begin/end calls
5
were balanced.
3
6
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
7
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
6
---
8
---
7
block/qed.c | 19 +++++++++----------
9
block/io.c | 12 +++++++-----
8
1 file changed, 9 insertions(+), 10 deletions(-)
10
1 file changed, 7 insertions(+), 5 deletions(-)
9
11
10
diff --git a/block/qed.c b/block/qed.c
12
diff --git a/block/io.c b/block/io.c
11
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
12
--- a/block/qed.c
14
--- a/block/io.c
13
+++ b/block/qed.c
15
+++ b/block/io.c
14
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
16
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
15
/**
17
16
* Update L1 table with new L2 table offset and write it out
18
void bdrv_drained_end(BlockDriverState *bs)
17
*/
18
-static void qed_aio_write_l1_update(void *opaque, int ret)
19
+static int qed_aio_write_l1_update(QEDAIOCB *acb)
20
{
19
{
21
- QEDAIOCB *acb = opaque;
20
+ int old_quiesce_counter;
22
BDRVQEDState *s = acb_to_s(acb);
21
+
23
CachedL2Table *l2_table = acb->request.l2_table;
22
if (qemu_in_coroutine()) {
24
uint64_t l2_offset = l2_table->offset;
23
bdrv_co_yield_to_drain(bs, false);
25
- int index;
24
return;
26
-
25
}
27
- if (ret) {
26
assert(bs->quiesce_counter > 0);
28
- qed_aio_complete(acb, ret);
27
- if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
29
- return;
28
- return;
30
- }
29
- }
31
+ int index, ret;
30
+ old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
32
31
33
index = qed_l1_index(s, acb->cur_pos);
32
/* Re-enable things in child-to-parent order */
34
s->l1_table->offsets[index] = l2_table->offset;
33
bdrv_drain_invoke(bs, false, false);
35
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l1_update(void *opaque, int ret)
34
- bdrv_parent_drained_end(bs);
36
acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
35
- aio_enable_external(bdrv_get_aio_context(bs));
37
assert(acb->request.l2_table != NULL);
36
+ if (old_quiesce_counter == 1) {
38
37
+ bdrv_parent_drained_end(bs);
39
- qed_aio_next_io(acb, ret);
38
+ aio_enable_external(bdrv_get_aio_context(bs));
40
+ return ret;
39
+ }
41
}
40
}
42
41
43
42
/*
44
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
45
if (need_alloc) {
46
/* Write out the whole new L2 table */
47
ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
48
- qed_aio_write_l1_update(acb, ret);
49
+ if (ret) {
50
+ goto err;
51
+ }
52
+ ret = qed_aio_write_l1_update(acb);
53
+ qed_aio_next_io(acb, ret);
54
+
55
} else {
56
/* Write out only the updated part of the L2 table */
57
ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
58
--
43
--
59
1.8.3.1
44
2.13.6
60
45
61
46
diff view generated by jsdifflib
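A toy model of the balancing rule described in the right-hand patch above: the block driver callback has to run for every begin/end call, while disabling and re-enabling external event sources happens only at the outermost nesting level. This is plain C with illustrative counters only, not QEMU code:

    #include <stdio.h>

    /* Illustrative counters only; this is not QEMU code. */
    static int quiesce_counter;
    static int driver_drained;          /* balance of driver callback calls */
    static int external_enabled = 1;

    static void drained_begin(void)
    {
        if (quiesce_counter++ == 0) {
            external_enabled = 0;       /* only the outermost begin disables */
        }
        driver_drained++;               /* driver callback on every begin */
    }

    static void drained_end(void)
    {
        int old = quiesce_counter--;

        driver_drained--;               /* driver callback on every end */
        if (old == 1) {
            external_enabled = 1;       /* only the outermost end re-enables */
        }
    }

    int main(void)
    {
        drained_begin();
        drained_begin();                /* nested drained section */
        drained_end();
        printf("after inner end: driver=%d external=%d\n",
               driver_drained, external_enabled);
        drained_end();
        printf("after outer end: driver=%d external=%d\n",
               driver_drained, external_enabled);
        return 0;
    }

If drained_end() only decremented the driver balance for the outermost call, the inner end would leave driver_drained at 1 forever, which is the imbalance the patch fixes.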
1
From: Alberto Garcia <berto@igalia.com>
2
3
If the guest tries to write data that results on the allocation of a
4
new cluster, instead of writing the guest data first and then the data
5
from the COW regions, write everything together using one single I/O
6
operation.
7
8
This can improve the write performance by 25% or more, depending on
9
several factors such as the media type, the cluster size and the I/O
10
request size.
11
12
Signed-off-by: Alberto Garcia <berto@igalia.com>
13
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
14
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
15
---
2
---
16
block/qcow2-cluster.c | 40 ++++++++++++++++++++++++--------
3
tests/test-bdrv-drain.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
17
block/qcow2.c | 64 +++++++++++++++++++++++++++++++++++++++++++--------
4
1 file changed, 57 insertions(+)
18
block/qcow2.h | 7 ++++++
19
3 files changed, 91 insertions(+), 20 deletions(-)
20
5
21
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
6
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
22
index XXXXXXX..XXXXXXX 100644
7
index XXXXXXX..XXXXXXX 100644
23
--- a/block/qcow2-cluster.c
8
--- a/tests/test-bdrv-drain.c
24
+++ b/block/qcow2-cluster.c
9
+++ b/tests/test-bdrv-drain.c
25
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
10
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
26
assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
11
enum drain_type {
27
assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes);
12
BDRV_DRAIN_ALL,
28
assert(start->offset + start->nb_bytes <= end->offset);
13
BDRV_DRAIN,
29
+ assert(!m->data_qiov || m->data_qiov->size == data_bytes);
14
+ DRAIN_TYPE_MAX,
30
15
};
31
if (start->nb_bytes == 0 && end->nb_bytes == 0) {
16
32
return 0;
17
static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
33
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
18
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
34
/* The part of the buffer where the end region is located */
19
test_quiesce_common(BDRV_DRAIN, false);
35
end_buffer = start_buffer + buffer_size - end->nb_bytes;
20
}
36
21
37
- qemu_iovec_init(&qiov, 1);
22
+static void test_nested(void)
38
+ qemu_iovec_init(&qiov, 2 + (m->data_qiov ? m->data_qiov->niov : 0));
23
+{
39
24
+ BlockBackend *blk;
40
qemu_co_mutex_unlock(&s->lock);
25
+ BlockDriverState *bs, *backing;
41
/* First we read the existing data from both COW regions. We
26
+ BDRVTestState *s, *backing_s;
42
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
27
+ enum drain_type outer, inner;
43
}
28
+
44
}
29
+ blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
45
30
+ bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
46
- /* And now we can write everything */
31
+ &error_abort);
47
- qemu_iovec_reset(&qiov);
32
+ s = bs->opaque;
48
- qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
33
+ blk_insert_bs(blk, bs, &error_abort);
49
- ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
34
+
50
- if (ret < 0) {
35
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
51
- goto fail;
36
+ backing_s = backing->opaque;
52
+ /* And now we can write everything. If we have the guest data we
37
+ bdrv_set_backing_hd(bs, backing, &error_abort);
53
+ * can write everything in one single operation */
38
+
54
+ if (m->data_qiov) {
39
+ for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
55
+ qemu_iovec_reset(&qiov);
40
+ for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
56
+ if (start->nb_bytes) {
41
+ /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
57
+ qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
42
+ int bs_quiesce = (outer != BDRV_DRAIN_ALL) +
43
+ (inner != BDRV_DRAIN_ALL);
44
+ int backing_quiesce = 0;
45
+ int backing_cb_cnt = (outer != BDRV_DRAIN) +
46
+ (inner != BDRV_DRAIN);
47
+
48
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
49
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
50
+ g_assert_cmpint(s->drain_count, ==, 0);
51
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
52
+
53
+ do_drain_begin(outer, bs);
54
+ do_drain_begin(inner, bs);
55
+
56
+ g_assert_cmpint(bs->quiesce_counter, ==, bs_quiesce);
57
+ g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
58
+ g_assert_cmpint(s->drain_count, ==, 2);
59
+ g_assert_cmpint(backing_s->drain_count, ==, backing_cb_cnt);
60
+
61
+ do_drain_end(inner, bs);
62
+ do_drain_end(outer, bs);
63
+
64
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
65
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
66
+ g_assert_cmpint(s->drain_count, ==, 0);
67
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
58
+ }
68
+ }
59
+ qemu_iovec_concat(&qiov, m->data_qiov, 0, data_bytes);
60
+ if (end->nb_bytes) {
61
+ qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
62
+ }
63
+ /* NOTE: we have a write_aio blkdebug event here followed by
64
+ * a cow_write one in do_perform_cow_write(), but there's only
65
+ * one single I/O operation */
66
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
67
+ ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
68
+ } else {
69
+ /* If there's no guest data then write both COW regions separately */
70
+ qemu_iovec_reset(&qiov);
71
+ qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
72
+ ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
73
+ if (ret < 0) {
74
+ goto fail;
75
+ }
76
+
77
+ qemu_iovec_reset(&qiov);
78
+ qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
79
+ ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
80
}
81
82
- qemu_iovec_reset(&qiov);
83
- qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
84
- ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
85
fail:
86
qemu_co_mutex_lock(&s->lock);
87
88
diff --git a/block/qcow2.c b/block/qcow2.c
89
index XXXXXXX..XXXXXXX 100644
90
--- a/block/qcow2.c
91
+++ b/block/qcow2.c
92
@@ -XXX,XX +XXX,XX @@ fail:
93
return ret;
94
}
95
96
+/* Check if it's possible to merge a write request with the writing of
97
+ * the data from the COW regions */
98
+static bool merge_cow(uint64_t offset, unsigned bytes,
99
+ QEMUIOVector *hd_qiov, QCowL2Meta *l2meta)
100
+{
101
+ QCowL2Meta *m;
102
+
103
+ for (m = l2meta; m != NULL; m = m->next) {
104
+ /* If both COW regions are empty then there's nothing to merge */
105
+ if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
106
+ continue;
107
+ }
108
+
109
+ /* The data (middle) region must be immediately after the
110
+ * start region */
111
+ if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
112
+ continue;
113
+ }
114
+
115
+ /* The end region must be immediately after the data (middle)
116
+ * region */
117
+ if (m->offset + m->cow_end.offset != offset + bytes) {
118
+ continue;
119
+ }
120
+
121
+ /* Make sure that adding both COW regions to the QEMUIOVector
122
+ * does not exceed IOV_MAX */
123
+ if (hd_qiov->niov > IOV_MAX - 2) {
124
+ continue;
125
+ }
126
+
127
+ m->data_qiov = hd_qiov;
128
+ return true;
129
+ }
69
+ }
130
+
70
+
131
+ return false;
71
+ bdrv_unref(backing);
72
+ bdrv_unref(bs);
73
+ blk_unref(blk);
132
+}
74
+}
133
+
75
+
134
static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
76
135
uint64_t bytes, QEMUIOVector *qiov,
77
typedef struct TestBlockJob {
136
int flags)
78
BlockJob common;
137
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
79
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
138
goto fail;
80
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
139
}
81
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
140
82
141
- qemu_co_mutex_unlock(&s->lock);
83
+ g_test_add_func("/bdrv-drain/nested", test_nested);
142
- BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
143
- trace_qcow2_writev_data(qemu_coroutine_self(),
144
- cluster_offset + offset_in_cluster);
145
- ret = bdrv_co_pwritev(bs->file,
146
- cluster_offset + offset_in_cluster,
147
- cur_bytes, &hd_qiov, 0);
148
- qemu_co_mutex_lock(&s->lock);
149
- if (ret < 0) {
150
- goto fail;
151
+ /* If we need to do COW, check if it's possible to merge the
152
+ * writing of the guest data together with that of the COW regions.
153
+ * If it's not possible (or not necessary) then write the
154
+ * guest data now. */
155
+ if (!merge_cow(offset, cur_bytes, &hd_qiov, l2meta)) {
156
+ qemu_co_mutex_unlock(&s->lock);
157
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
158
+ trace_qcow2_writev_data(qemu_coroutine_self(),
159
+ cluster_offset + offset_in_cluster);
160
+ ret = bdrv_co_pwritev(bs->file,
161
+ cluster_offset + offset_in_cluster,
162
+ cur_bytes, &hd_qiov, 0);
163
+ qemu_co_mutex_lock(&s->lock);
164
+ if (ret < 0) {
165
+ goto fail;
166
+ }
167
}
168
169
while (l2meta != NULL) {
170
diff --git a/block/qcow2.h b/block/qcow2.h
171
index XXXXXXX..XXXXXXX 100644
172
--- a/block/qcow2.h
173
+++ b/block/qcow2.h
174
@@ -XXX,XX +XXX,XX @@ typedef struct QCowL2Meta
175
*/
176
Qcow2COWRegion cow_end;
177
178
+ /**
179
+ * The I/O vector with the data from the actual guest write request.
180
+ * If non-NULL, this is meant to be merged together with the data
181
+ * from @cow_start and @cow_end into one single write operation.
182
+ */
183
+ QEMUIOVector *data_qiov;
184
+
84
+
185
/** Pointer to next L2Meta of the same write request */
85
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
186
struct QCowL2Meta *next;
86
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
187
87
188
--
88
--
189
1.8.3.1
89
2.13.6
190
90
191
91
diff view generated by jsdifflib
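The adjacency conditions behind merge_cow() in the left-hand patch above can be sketched in isolation like this (a toy Meta struct loosely modelled on QCowL2Meta; all offsets and sizes are made-up example values): the guest write can only be folded into the COW write if it fills the gap between the two COW regions exactly.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy structures loosely modelled on QCowL2Meta; values are made up. */
    typedef struct CowRegion {
        uint64_t offset;                /* relative to guest_offset */
        unsigned nb_bytes;
    } CowRegion;

    typedef struct Meta {
        uint64_t guest_offset;          /* guest offset of the allocated area */
        CowRegion cow_start;
        CowRegion cow_end;
    } Meta;

    /* The guest write merges with the COW writes only if it sits exactly
     * between the head and tail COW regions. */
    static bool can_merge(const Meta *m, uint64_t write_offset, unsigned bytes)
    {
        if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
            return false;               /* nothing to merge with */
        }
        /* Guest data must start right after the head COW region... */
        if (m->guest_offset + m->cow_start.offset + m->cow_start.nb_bytes
                != write_offset) {
            return false;
        }
        /* ...and end exactly where the tail COW region begins. */
        if (m->guest_offset + m->cow_end.offset != write_offset + bytes) {
            return false;
        }
        return true;
    }

    int main(void)
    {
        Meta m = {
            .guest_offset = 65536,
            .cow_start = { .offset = 0,     .nb_bytes = 4096 },
            .cow_end   = { .offset = 61440, .nb_bytes = 4096 },
        };

        printf("adjacent write merges:    %d\n",
               can_merge(&m, 65536 + 4096, 61440 - 4096));
        printf("misaligned write merges:  %d\n",
               can_merge(&m, 65536 + 8192, 61440 - 8192));
        return 0;
    }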
1
With this change, qed_aio_write_prefill() and qed_aio_write_postfill()
1
This is in preparation for subtree drains, i.e. drained sections that
2
collapse into a single function. This is reflected by a rename of the
2
affect not only a single node, but recursively all child nodes, too.
3
combined function to qed_aio_write_cow().
3
4
Calling the parent callbacks for drain is pointless when we just came
5
from that parent node recursively and leads to multiple increases of
6
bs->quiesce_counter in a single drain call. Don't do it.
7
8
In order for this to work correctly, the parent callback must be called
9
for every bdrv_drain_begin/end() call, not only for the outermost one:
10
11
If we have a node N with two parents A and B, recursive draining of A
12
should cause the quiesce_counter of B to increase because its child N is
13
drained independently of B. If now B is recursively drained, too, A must
14
increase its quiesce_counter because N is drained independently of A
15
only now, even if N is going from quiesce_counter 1 to 2.
4
16
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
17
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
6
Reviewed-by: Eric Blake <eblake@redhat.com>
7
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
8
---
18
---
9
block/qed.c | 57 +++++++++++++++++++++++----------------------------------
19
include/block/block.h | 4 ++--
10
1 file changed, 23 insertions(+), 34 deletions(-)
20
block.c | 13 +++++++++----
11
21
block/io.c | 47 ++++++++++++++++++++++++++++++++++-------------
12
diff --git a/block/qed.c b/block/qed.c
22
3 files changed, 45 insertions(+), 19 deletions(-)
23
24
diff --git a/include/block/block.h b/include/block/block.h
13
index XXXXXXX..XXXXXXX 100644
25
index XXXXXXX..XXXXXXX 100644
14
--- a/block/qed.c
26
--- a/include/block/block.h
15
+++ b/block/qed.c
27
+++ b/include/block/block.h
16
@@ -XXX,XX +XXX,XX @@ static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
28
@@ -XXX,XX +XXX,XX @@ void bdrv_io_unplug(BlockDriverState *bs);
17
* @pos: Byte position in device
29
* Begin a quiesced section of all users of @bs. This is part of
18
* @len: Number of bytes
30
* bdrv_drained_begin.
19
* @offset: Byte offset in image file
20
- * @cb: Completion function
21
- * @opaque: User data for completion function
22
*/
31
*/
23
-static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
32
-void bdrv_parent_drained_begin(BlockDriverState *bs);
24
- uint64_t len, uint64_t offset,
33
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);
25
- BlockCompletionFunc *cb,
26
- void *opaque)
27
+static int qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
28
+ uint64_t len, uint64_t offset)
29
{
30
QEMUIOVector qiov;
31
QEMUIOVector *backing_qiov = NULL;
32
@@ -XXX,XX +XXX,XX @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
33
34
/* Skip copy entirely if there is no work to do */
35
if (len == 0) {
36
- cb(opaque, 0);
37
- return;
38
+ return 0;
39
}
40
41
iov = (struct iovec) {
42
@@ -XXX,XX +XXX,XX @@ static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
43
ret = 0;
44
out:
45
qemu_vfree(iov.iov_base);
46
- cb(opaque, ret);
47
+ return ret;
48
}
49
34
50
/**
35
/**
51
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
36
* bdrv_parent_drained_end:
52
}
37
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_begin(BlockDriverState *bs);
38
* End a quiesced section of all users of @bs. This is part of
39
* bdrv_drained_end.
40
*/
41
-void bdrv_parent_drained_end(BlockDriverState *bs);
42
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
53
43
54
/**
44
/**
55
- * Populate back untouched region of new data cluster
45
* bdrv_drained_begin:
56
+ * Populate untouched regions of new data cluster
46
diff --git a/block.c b/block.c
57
*/
47
index XXXXXXX..XXXXXXX 100644
58
-static void qed_aio_write_postfill(void *opaque, int ret)
48
--- a/block.c
59
+static void qed_aio_write_cow(void *opaque, int ret)
49
+++ b/block.c
60
{
50
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
61
QEDAIOCB *acb = opaque;
51
BlockDriverState *new_bs)
62
BDRVQEDState *s = acb_to_s(acb);
52
{
63
- uint64_t start = acb->cur_pos + acb->cur_qiov.size;
53
BlockDriverState *old_bs = child->bs;
64
- uint64_t len =
54
+ int i;
65
- qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
55
66
- uint64_t offset = acb->cur_cluster +
56
if (old_bs && new_bs) {
67
- qed_offset_into_cluster(s, acb->cur_pos) +
57
assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
68
- acb->cur_qiov.size;
58
}
69
+ uint64_t start, len, offset;
59
if (old_bs) {
60
if (old_bs->quiesce_counter && child->role->drained_end) {
61
- child->role->drained_end(child);
62
+ for (i = 0; i < old_bs->quiesce_counter; i++) {
63
+ child->role->drained_end(child);
64
+ }
65
}
66
if (child->role->detach) {
67
child->role->detach(child);
68
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
69
if (new_bs) {
70
QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
71
if (new_bs->quiesce_counter && child->role->drained_begin) {
72
- child->role->drained_begin(child);
73
+ for (i = 0; i < new_bs->quiesce_counter; i++) {
74
+ child->role->drained_begin(child);
75
+ }
76
}
77
78
if (child->role->attach) {
79
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
80
AioContext *ctx = bdrv_get_aio_context(bs);
81
82
aio_disable_external(ctx);
83
- bdrv_parent_drained_begin(bs);
84
+ bdrv_parent_drained_begin(bs, NULL);
85
bdrv_drain(bs); /* ensure there are no in-flight requests */
86
87
while (aio_poll(ctx, false)) {
88
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
89
*/
90
aio_context_acquire(new_context);
91
bdrv_attach_aio_context(bs, new_context);
92
- bdrv_parent_drained_end(bs);
93
+ bdrv_parent_drained_end(bs, NULL);
94
aio_enable_external(ctx);
95
aio_context_release(new_context);
96
}
97
diff --git a/block/io.c b/block/io.c
98
index XXXXXXX..XXXXXXX 100644
99
--- a/block/io.c
100
+++ b/block/io.c
101
@@ -XXX,XX +XXX,XX @@
102
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
103
int64_t offset, int bytes, BdrvRequestFlags flags);
104
105
-void bdrv_parent_drained_begin(BlockDriverState *bs)
106
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
107
{
108
BdrvChild *c, *next;
109
110
QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
111
+ if (c == ignore) {
112
+ continue;
113
+ }
114
if (c->role->drained_begin) {
115
c->role->drained_begin(c);
116
}
117
}
118
}
119
120
-void bdrv_parent_drained_end(BlockDriverState *bs)
121
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
122
{
123
BdrvChild *c, *next;
124
125
QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
126
+ if (c == ignore) {
127
+ continue;
128
+ }
129
if (c->role->drained_end) {
130
c->role->drained_end(c);
131
}
132
@@ -XXX,XX +XXX,XX @@ typedef struct {
133
BlockDriverState *bs;
134
bool done;
135
bool begin;
136
+ BdrvChild *parent;
137
} BdrvCoDrainData;
138
139
static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
140
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
141
return waited;
142
}
143
144
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
145
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
70
+
146
+
71
+ /* Populate front untouched region of new data cluster */
147
static void bdrv_co_drain_bh_cb(void *opaque)
72
+ start = qed_start_of_cluster(s, acb->cur_pos);
148
{
73
+ len = qed_offset_into_cluster(s, acb->cur_pos);
149
BdrvCoDrainData *data = opaque;
74
150
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
75
+ trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
151
76
+ ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
152
bdrv_dec_in_flight(bs);
77
if (ret) {
153
if (data->begin) {
78
qed_aio_complete(acb, ret);
154
- bdrv_drained_begin(bs);
155
+ bdrv_do_drained_begin(bs, data->parent);
156
} else {
157
- bdrv_drained_end(bs);
158
+ bdrv_do_drained_end(bs, data->parent);
159
}
160
161
data->done = true;
162
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
163
}
164
165
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
166
- bool begin)
167
+ bool begin, BdrvChild *parent)
168
{
169
BdrvCoDrainData data;
170
171
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
172
.bs = bs,
173
.done = false,
174
.begin = begin,
175
+ .parent = parent,
176
};
177
bdrv_inc_in_flight(bs);
178
aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
179
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
180
assert(data.done);
181
}
182
183
-void bdrv_drained_begin(BlockDriverState *bs)
184
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
185
{
186
if (qemu_in_coroutine()) {
187
- bdrv_co_yield_to_drain(bs, true);
188
+ bdrv_co_yield_to_drain(bs, true, parent);
79
return;
189
return;
80
}
190
}
81
191
82
- trace_qed_aio_write_postfill(s, acb, start, len, offset);
192
/* Stop things in parent-to-child order */
83
- qed_copy_from_backing_file(s, start, len, offset,
193
if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
84
- qed_aio_write_main, acb);
194
aio_disable_external(bdrv_get_aio_context(bs));
85
-}
195
- bdrv_parent_drained_begin(bs);
86
+ /* Populate back untouched region of new data cluster */
196
}
87
+ start = acb->cur_pos + acb->cur_qiov.size;
197
88
+ len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
198
+ bdrv_parent_drained_begin(bs, parent);
89
+ offset = acb->cur_cluster +
199
bdrv_drain_invoke(bs, true, false);
90
+ qed_offset_into_cluster(s, acb->cur_pos) +
200
bdrv_drain_recurse(bs);
91
+ acb->cur_qiov.size;
201
}
92
202
93
-/**
203
-void bdrv_drained_end(BlockDriverState *bs)
94
- * Populate front untouched region of new data cluster
204
+void bdrv_drained_begin(BlockDriverState *bs)
95
- */
205
+{
96
-static void qed_aio_write_prefill(void *opaque, int ret)
206
+ bdrv_do_drained_begin(bs, NULL);
97
-{
207
+}
98
- QEDAIOCB *acb = opaque;
208
+
99
- BDRVQEDState *s = acb_to_s(acb);
209
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
100
- uint64_t start = qed_start_of_cluster(s, acb->cur_pos);
210
{
101
- uint64_t len = qed_offset_into_cluster(s, acb->cur_pos);
211
int old_quiesce_counter;
102
+ trace_qed_aio_write_postfill(s, acb, start, len, offset);
212
103
+ ret = qed_copy_from_backing_file(s, start, len, offset);
213
if (qemu_in_coroutine()) {
104
214
- bdrv_co_yield_to_drain(bs, false);
105
- trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
215
+ bdrv_co_yield_to_drain(bs, false, parent);
106
- qed_copy_from_backing_file(s, start, len, acb->cur_cluster,
216
return;
107
- qed_aio_write_postfill, acb);
217
}
108
+ qed_aio_write_main(acb, ret);
218
assert(bs->quiesce_counter > 0);
109
}
219
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
110
220
111
/**
221
/* Re-enable things in child-to-parent order */
112
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
222
bdrv_drain_invoke(bs, false, false);
113
223
+ bdrv_parent_drained_end(bs, parent);
114
cb = qed_aio_write_zero_cluster;
224
if (old_quiesce_counter == 1) {
115
} else {
225
- bdrv_parent_drained_end(bs);
116
- cb = qed_aio_write_prefill;
226
aio_enable_external(bdrv_get_aio_context(bs));
117
+ cb = qed_aio_write_cow;
227
}
118
acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
228
}
119
}
229
120
230
+void bdrv_drained_end(BlockDriverState *bs)
+{
+    bdrv_do_drained_end(bs, NULL);
+}
+
 /*
  * Wait for pending requests to complete on a single BlockDriverState subtree,
  * and suspend block driver's internal I/O until next request arrives.
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         /* Stop things in parent-to-child order */
         aio_context_acquire(aio_context);
         aio_disable_external(aio_context);
-        bdrv_parent_drained_begin(bs);
+        bdrv_parent_drained_begin(bs, NULL);
         bdrv_drain_invoke(bs, true, true);
         aio_context_release(aio_context);

@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         /* Re-enable things in child-to-parent order */
         aio_context_acquire(aio_context);
         bdrv_drain_invoke(bs, false, true);
-        bdrv_parent_drained_end(bs);
+        bdrv_parent_drained_end(bs, NULL);
         aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }
121
--
256
--
122
1.8.3.1
257
2.13.6
123
258
124
259
1
bdrv_drained_begin() waits for the completion of requests in the whole
2
subtree, but it only actually keeps its immediate bs parameter quiesced
3
until bdrv_drained_end().
4
5
Add a version that keeps the whole subtree drained. As of this commit,
6
graph changes cannot be allowed during a subtree drained section, but
7
this will be fixed soon.
8
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
3
---
10
---
4
block/qed-table.c | 47 ++++++++++++-----------------------------------
11
include/block/block.h | 13 +++++++++++++
5
block/qed.c | 12 +++++++-----
12
block/io.c | 54 ++++++++++++++++++++++++++++++++++++++++-----------
6
block/qed.h | 8 +++-----
13
2 files changed, 56 insertions(+), 11 deletions(-)
7
3 files changed, 22 insertions(+), 45 deletions(-)
8
14
9
diff --git a/block/qed-table.c b/block/qed-table.c
15
diff --git a/include/block/block.h b/include/block/block.h
10
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
11
--- a/block/qed-table.c
17
--- a/include/block/block.h
12
+++ b/block/qed-table.c
18
+++ b/include/block/block.h
13
@@ -XXX,XX +XXX,XX @@ out:
19
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
14
* @index: Index of first element
20
void bdrv_drained_begin(BlockDriverState *bs);
15
* @n: Number of elements
21
16
* @flush: Whether or not to sync to disk
22
/**
17
- * @cb: Completion function
23
+ * Like bdrv_drained_begin, but recursively begins a quiesced section for
18
- * @opaque: Argument for completion function
24
+ * exclusive access to all child nodes as well.
25
+ *
26
+ * Graph changes are not allowed during a subtree drain section.
27
+ */
28
+void bdrv_subtree_drained_begin(BlockDriverState *bs);
29
+
30
+/**
31
* bdrv_drained_end:
32
*
33
* End a quiescent section started by bdrv_drained_begin().
19
*/
34
*/
20
-static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
35
void bdrv_drained_end(BlockDriverState *bs);
21
- unsigned int index, unsigned int n, bool flush,
36
22
- BlockCompletionFunc *cb, void *opaque)
37
+/**
23
+static int qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
38
+ * End a quiescent section started by bdrv_subtree_drained_begin().
24
+ unsigned int index, unsigned int n, bool flush)
39
+ */
40
+void bdrv_subtree_drained_end(BlockDriverState *bs);
41
+
42
void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child,
43
Error **errp);
44
void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);
45
diff --git a/block/io.c b/block/io.c
46
index XXXXXXX..XXXXXXX 100644
47
--- a/block/io.c
48
+++ b/block/io.c
49
@@ -XXX,XX +XXX,XX @@ typedef struct {
50
BlockDriverState *bs;
51
bool done;
52
bool begin;
53
+ bool recursive;
54
BdrvChild *parent;
55
} BdrvCoDrainData;
56
57
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
58
return waited;
59
}
60
61
-static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
62
-static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
63
+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
64
+ BdrvChild *parent);
65
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
66
+ BdrvChild *parent);
67
68
static void bdrv_co_drain_bh_cb(void *opaque)
25
{
69
{
26
unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
70
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
27
unsigned int start, end, i;
71
28
@@ -XXX,XX +XXX,XX @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
72
bdrv_dec_in_flight(bs);
29
ret = 0;
73
if (data->begin) {
30
out:
74
- bdrv_do_drained_begin(bs, data->parent);
31
qemu_vfree(new_table);
75
+ bdrv_do_drained_begin(bs, data->recursive, data->parent);
32
- cb(opaque, ret);
76
} else {
33
-}
77
- bdrv_do_drained_end(bs, data->parent);
34
-
78
+ bdrv_do_drained_end(bs, data->recursive, data->parent);
35
-/**
79
}
36
- * Propagate return value from async callback
80
37
- */
81
data->done = true;
38
-static void qed_sync_cb(void *opaque, int ret)
82
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
39
-{
40
- *(int *)opaque = ret;
41
+ return ret;
42
}
83
}
43
84
44
int qed_read_l1_table_sync(BDRVQEDState *s)
85
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
45
@@ -XXX,XX +XXX,XX @@ int qed_read_l1_table_sync(BDRVQEDState *s)
86
- bool begin, BdrvChild *parent)
46
return qed_read_table(s, s->header.l1_table_offset, s->l1_table);
87
+ bool begin, bool recursive,
88
+ BdrvChild *parent)
89
{
90
BdrvCoDrainData data;
91
92
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
93
.bs = bs,
94
.done = false,
95
.begin = begin,
96
+ .recursive = recursive,
97
.parent = parent,
98
};
99
bdrv_inc_in_flight(bs);
100
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
101
assert(data.done);
47
}
102
}
48
103
49
-void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
104
-static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
50
- BlockCompletionFunc *cb, void *opaque)
105
+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
51
+int qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n)
106
+ BdrvChild *parent)
52
{
107
{
53
BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
108
+ BdrvChild *child, *next;
54
- qed_write_table(s, s->header.l1_table_offset,
109
+
55
- s->l1_table, index, n, false, cb, opaque);
110
if (qemu_in_coroutine()) {
56
+ return qed_write_table(s, s->header.l1_table_offset,
111
- bdrv_co_yield_to_drain(bs, true, parent);
57
+ s->l1_table, index, n, false);
112
+ bdrv_co_yield_to_drain(bs, true, recursive, parent);
113
return;
114
}
115
116
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
117
bdrv_parent_drained_begin(bs, parent);
118
bdrv_drain_invoke(bs, true, false);
119
bdrv_drain_recurse(bs);
120
+
121
+ if (recursive) {
122
+ QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
123
+ bdrv_do_drained_begin(child->bs, true, child);
124
+ }
125
+ }
58
}
126
}
59
127
60
int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
128
void bdrv_drained_begin(BlockDriverState *bs)
61
unsigned int n)
62
{
129
{
63
- int ret = -EINPROGRESS;
130
- bdrv_do_drained_begin(bs, NULL);
64
-
131
+ bdrv_do_drained_begin(bs, false, NULL);
65
- qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
132
+}
66
- BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
133
+
67
-
134
+void bdrv_subtree_drained_begin(BlockDriverState *bs)
68
- return ret;
135
+{
69
+ return qed_write_l1_table(s, index, n);
136
+ bdrv_do_drained_begin(bs, true, NULL);
70
}
137
}
71
138
72
int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
139
-static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
73
@@ -XXX,XX +XXX,XX @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset
140
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
74
return qed_read_l2_table(s, request, offset);
141
+ BdrvChild *parent)
142
{
143
+ BdrvChild *child, *next;
144
int old_quiesce_counter;
145
146
if (qemu_in_coroutine()) {
147
- bdrv_co_yield_to_drain(bs, false, parent);
148
+ bdrv_co_yield_to_drain(bs, false, recursive, parent);
149
return;
150
}
151
assert(bs->quiesce_counter > 0);
152
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
153
if (old_quiesce_counter == 1) {
154
aio_enable_external(bdrv_get_aio_context(bs));
155
}
156
+
157
+ if (recursive) {
158
+ QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
159
+ bdrv_do_drained_end(child->bs, true, child);
160
+ }
161
+ }
75
}
162
}
76
163
77
-void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
164
void bdrv_drained_end(BlockDriverState *bs)
78
- unsigned int index, unsigned int n, bool flush,
79
- BlockCompletionFunc *cb, void *opaque)
80
+int qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
81
+ unsigned int index, unsigned int n, bool flush)
82
{
165
{
83
BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE);
166
- bdrv_do_drained_end(bs, NULL);
84
- qed_write_table(s, request->l2_table->offset,
167
+ bdrv_do_drained_end(bs, false, NULL);
85
- request->l2_table->table, index, n, flush, cb, opaque);
168
+}
86
+ return qed_write_table(s, request->l2_table->offset,
169
+
87
+ request->l2_table->table, index, n, flush);
170
+void bdrv_subtree_drained_end(BlockDriverState *bs)
171
+{
172
+ bdrv_do_drained_end(bs, true, NULL);
88
}
173
}
89
174
90
int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
175
/*
91
unsigned int index, unsigned int n, bool flush)
92
{
93
- int ret = -EINPROGRESS;
94
-
95
- qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
96
- BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
97
-
98
- return ret;
99
+ return qed_write_l2_table(s, request, index, n, flush);
100
}
101
diff --git a/block/qed.c b/block/qed.c
102
index XXXXXXX..XXXXXXX 100644
103
--- a/block/qed.c
104
+++ b/block/qed.c
105
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l1_update(void *opaque, int ret)
106
index = qed_l1_index(s, acb->cur_pos);
107
s->l1_table->offsets[index] = acb->request.l2_table->offset;
108
109
- qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb);
110
+ ret = qed_write_l1_table(s, index, 1);
111
+ qed_commit_l2_update(acb, ret);
112
}
113
114
/**
115
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
116
117
if (need_alloc) {
118
/* Write out the whole new L2 table */
119
- qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
120
- qed_aio_write_l1_update, acb);
121
+ ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
122
+ qed_aio_write_l1_update(acb, ret);
123
} else {
124
/* Write out only the updated part of the L2 table */
125
- qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
126
- qed_aio_next_io_cb, acb);
127
+ ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
128
+ false);
129
+ qed_aio_next_io(acb, ret);
130
}
131
return;
132
133
diff --git a/block/qed.h b/block/qed.h
134
index XXXXXXX..XXXXXXX 100644
135
--- a/block/qed.h
136
+++ b/block/qed.h
137
@@ -XXX,XX +XXX,XX @@ void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table);
138
* Table I/O functions
139
*/
140
int qed_read_l1_table_sync(BDRVQEDState *s);
141
-void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
142
- BlockCompletionFunc *cb, void *opaque);
143
+int qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n);
144
int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
145
unsigned int n);
146
int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
147
uint64_t offset);
148
int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset);
149
-void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
150
- unsigned int index, unsigned int n, bool flush,
151
- BlockCompletionFunc *cb, void *opaque);
152
+int qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
153
+ unsigned int index, unsigned int n, bool flush);
154
int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
155
unsigned int index, unsigned int n, bool flush);
156
157
--
176
--
158
1.8.3.1
177
2.13.6
159
178
160
179
1
Note that this code is generally not running in coroutine context, so
1
Add a subtree drain version to the existing test cases.
2
this is an actual blocking synchronous operation. We'll fix this in a
3
moment.
4
2
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
3
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
6
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
7
---
4
---
8
block/qed.c | 61 +++++++++++++++++++------------------------------------------
5
tests/test-bdrv-drain.c | 27 ++++++++++++++++++++++++++-
9
1 file changed, 19 insertions(+), 42 deletions(-)
6
1 file changed, 26 insertions(+), 1 deletion(-)
10
7
11
diff --git a/block/qed.c b/block/qed.c
8
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
12
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
13
--- a/block/qed.c
10
--- a/tests/test-bdrv-drain.c
14
+++ b/block/qed.c
11
+++ b/tests/test-bdrv-drain.c
15
@@ -XXX,XX +XXX,XX @@ static void qed_aio_start_io(QEDAIOCB *acb)
12
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
16
qed_aio_next_io(acb, 0);
13
enum drain_type {
14
BDRV_DRAIN_ALL,
15
BDRV_DRAIN,
16
+ BDRV_SUBTREE_DRAIN,
17
DRAIN_TYPE_MAX,
18
};
19
20
@@ -XXX,XX +XXX,XX @@ static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
21
switch (drain_type) {
22
case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break;
23
case BDRV_DRAIN: bdrv_drained_begin(bs); break;
24
+ case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_begin(bs); break;
25
default: g_assert_not_reached();
26
}
17
}
27
}
18
28
@@ -XXX,XX +XXX,XX @@ static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
19
-static void qed_aio_next_io_cb(void *opaque, int ret)
29
switch (drain_type) {
20
-{
30
case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break;
21
- QEDAIOCB *acb = opaque;
31
case BDRV_DRAIN: bdrv_drained_end(bs); break;
22
-
32
+ case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_end(bs); break;
23
- qed_aio_next_io(acb, ret);
33
default: g_assert_not_reached();
24
-}
34
}
25
-
35
}
26
static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
36
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
37
test_drv_cb_common(BDRV_DRAIN, false);
38
}
39
40
+static void test_drv_cb_drain_subtree(void)
41
+{
42
+ test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
43
+}
44
+
45
static void test_quiesce_common(enum drain_type drain_type, bool recursive)
27
{
46
{
28
assert(!s->allocating_write_reqs_plugged);
47
BlockBackend *blk;
29
@@ -XXX,XX +XXX,XX @@ err:
48
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
30
qed_aio_complete(acb, ret);
49
test_quiesce_common(BDRV_DRAIN, false);
31
}
50
}
32
51
33
-static void qed_aio_write_l2_update_cb(void *opaque, int ret)
52
+static void test_quiesce_drain_subtree(void)
34
-{
53
+{
35
- QEDAIOCB *acb = opaque;
54
+ test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
36
- qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
55
+}
37
-}
38
-
39
-/**
40
- * Flush new data clusters before updating the L2 table
41
- *
42
- * This flush is necessary when a backing file is in use. A crash during an
43
- * allocating write could result in empty clusters in the image. If the write
44
- * only touched a subregion of the cluster, then backing image sectors have
45
- * been lost in the untouched region. The solution is to flush after writing a
46
- * new data cluster and before updating the L2 table.
47
- */
48
-static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
49
-{
50
- QEDAIOCB *acb = opaque;
51
- BDRVQEDState *s = acb_to_s(acb);
52
-
53
- if (!bdrv_aio_flush(s->bs->file->bs, qed_aio_write_l2_update_cb, opaque)) {
54
- qed_aio_complete(acb, -EIO);
55
- }
56
-}
57
-
58
/**
59
* Write data to the image file
60
*/
61
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
62
BDRVQEDState *s = acb_to_s(acb);
63
uint64_t offset = acb->cur_cluster +
64
qed_offset_into_cluster(s, acb->cur_pos);
65
- BlockCompletionFunc *next_fn;
66
67
trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);
68
69
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
70
return;
71
}
72
73
+ BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
74
+ ret = bdrv_pwritev(s->bs->file, offset, &acb->cur_qiov);
75
+ if (ret >= 0) {
76
+ ret = 0;
77
+ }
78
+
56
+
79
if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
57
static void test_nested(void)
80
- next_fn = qed_aio_next_io_cb;
58
{
81
+ qed_aio_next_io(acb, ret);
59
BlockBackend *blk;
82
} else {
60
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
83
if (s->bs->backing) {
61
/* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
84
- next_fn = qed_aio_write_flush_before_l2_update;
62
int bs_quiesce = (outer != BDRV_DRAIN_ALL) +
85
- } else {
63
(inner != BDRV_DRAIN_ALL);
86
- next_fn = qed_aio_write_l2_update_cb;
64
- int backing_quiesce = 0;
87
+ /*
65
+ int backing_quiesce = (outer == BDRV_SUBTREE_DRAIN) +
88
+ * Flush new data clusters before updating the L2 table
66
+ (inner == BDRV_SUBTREE_DRAIN);
89
+ *
67
int backing_cb_cnt = (outer != BDRV_DRAIN) +
90
+ * This flush is necessary when a backing file is in use. A crash
68
(inner != BDRV_DRAIN);
91
+ * during an allocating write could result in empty clusters in the
69
92
+ * image. If the write only touched a subregion of the cluster,
70
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_drain(void)
93
+ * then backing image sectors have been lost in the untouched
71
test_blockjob_common(BDRV_DRAIN);
94
+ * region. The solution is to flush after writing a new data
95
+ * cluster and before updating the L2 table.
96
+ */
97
+ ret = bdrv_flush(s->bs->file->bs);
98
}
99
+ qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
100
}
101
-
102
- BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
103
- bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
104
- &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
105
- next_fn, acb);
106
}
72
}
107
73
108
/**
74
+static void test_blockjob_drain_subtree(void)
75
+{
76
+ test_blockjob_common(BDRV_SUBTREE_DRAIN);
77
+}
78
+
79
int main(int argc, char **argv)
80
{
81
bdrv_init();
82
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
83
84
g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
85
g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
86
+ g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
87
+ test_drv_cb_drain_subtree);
88
89
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
90
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
91
+ g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
92
+ test_quiesce_drain_subtree);
93
94
g_test_add_func("/bdrv-drain/nested", test_nested);
95
96
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
97
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
98
+ g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
99
+ test_blockjob_drain_subtree);
100
101
return g_test_run();
102
}
109
--
103
--
110
1.8.3.1
104
2.13.6
111
105
112
106
1
Most of the qed code is now synchronous and matches the coroutine model.
1
If bdrv_do_drained_begin/end() are called in coroutine context, they
2
One notable exception is the serialisation between requests which can
2
first use a BH to get out of the coroutine context. Call some existing
3
still schedule a callback. Before we can replace this with coroutine
3
tests again from a coroutine to cover this code path.
4
locks, let's convert the driver's external interfaces to the coroutine
5
versions.
6
7
We need to be careful to handle both requests that call the completion
8
callback directly from the calling coroutine (i.e. fully synchronous
9
code) and requests that involve some callback, so that we need to yield
10
and wait for the completion callback coming from outside the coroutine.
11
4
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
Reviewed-by: Manos Pitsidianakis <el13635@mail.ntua.gr>
14
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
15
---
6
---
16
block/qed.c | 97 ++++++++++++++++++++++++++-----------------------------------
7
tests/test-bdrv-drain.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
17
1 file changed, 42 insertions(+), 55 deletions(-)
8
1 file changed, 59 insertions(+)
18
9
19
diff --git a/block/qed.c b/block/qed.c
10
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
20
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
21
--- a/block/qed.c
12
--- a/tests/test-bdrv-drain.c
22
+++ b/block/qed.c
13
+++ b/tests/test-bdrv-drain.c
23
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb)
14
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
24
}
15
*aio_ret = ret;
25
}
16
}
26
17
27
-static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
18
+typedef struct CallInCoroutineData {
28
- int64_t sector_num,
19
+ void (*entry)(void);
29
- QEMUIOVector *qiov, int nb_sectors,
30
- BlockCompletionFunc *cb,
31
- void *opaque, int flags)
32
+typedef struct QEDRequestCo {
33
+ Coroutine *co;
34
+ bool done;
20
+ bool done;
35
+ int ret;
21
+} CallInCoroutineData;
36
+} QEDRequestCo;
37
+
22
+
38
+static void qed_co_request_cb(void *opaque, int ret)
23
+static coroutine_fn void call_in_coroutine_entry(void *opaque)
39
{
24
+{
40
- QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);
25
+ CallInCoroutineData *data = opaque;
41
+ QEDRequestCo *co = opaque;
26
+
42
27
+ data->entry();
43
- trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
28
+ data->done = true;
44
- opaque, flags);
45
+ co->done = true;
46
+ co->ret = ret;
47
+ qemu_coroutine_enter_if_inactive(co->co);
48
+}
29
+}
49
+
30
+
50
+static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num,
31
+static void call_in_coroutine(void (*entry)(void))
51
+ QEMUIOVector *qiov, int nb_sectors,
52
+ int flags)
53
+{
32
+{
54
+ QEDRequestCo co = {
33
+ Coroutine *co;
55
+ .co = qemu_coroutine_self(),
34
+ CallInCoroutineData data = {
35
+ .entry = entry,
56
+ .done = false,
36
+ .done = false,
57
+ };
37
+ };
58
+ QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, qed_co_request_cb, &co);
59
+
38
+
60
+ trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, &co, flags);
39
+ co = qemu_coroutine_create(call_in_coroutine_entry, &data);
61
40
+ qemu_coroutine_enter(co);
62
acb->flags = flags;
41
+ while (!data.done) {
63
acb->qiov = qiov;
42
+ aio_poll(qemu_get_aio_context(), true);
64
@@ -XXX,XX +XXX,XX @@ static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
65
66
/* Start request */
67
qed_aio_start_io(acb);
68
- return &acb->common;
69
-}
70
71
-static BlockAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
72
- int64_t sector_num,
73
- QEMUIOVector *qiov, int nb_sectors,
74
- BlockCompletionFunc *cb,
75
- void *opaque)
76
-{
77
- return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
78
+ if (!co.done) {
79
+ qemu_coroutine_yield();
80
+ }
43
+ }
44
+}
81
+
45
+
82
+ return co.ret;
46
enum drain_type {
47
BDRV_DRAIN_ALL,
48
BDRV_DRAIN,
49
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_subtree(void)
50
test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
83
}
51
}
84
52
85
-static BlockAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
53
+static void test_drv_cb_co_drain(void)
86
- int64_t sector_num,
54
+{
87
- QEMUIOVector *qiov, int nb_sectors,
55
+ call_in_coroutine(test_drv_cb_drain);
88
- BlockCompletionFunc *cb,
56
+}
89
- void *opaque)
57
+
90
+static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
58
+static void test_drv_cb_co_drain_subtree(void)
91
+ int64_t sector_num, int nb_sectors,
59
+{
92
+ QEMUIOVector *qiov)
60
+ call_in_coroutine(test_drv_cb_drain_subtree);
61
+}
62
+
63
static void test_quiesce_common(enum drain_type drain_type, bool recursive)
93
{
64
{
94
- return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
65
BlockBackend *blk;
95
- opaque, QED_AIOCB_WRITE);
66
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain_subtree(void)
96
+ return qed_co_request(bs, sector_num, qiov, nb_sectors, 0);
67
test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
97
}
68
}
98
69
99
-typedef struct {
70
+static void test_quiesce_co_drain(void)
100
- Coroutine *co;
71
+{
101
- int ret;
72
+ call_in_coroutine(test_quiesce_drain);
102
- bool done;
73
+}
103
-} QEDWriteZeroesCB;
74
+
104
-
75
+static void test_quiesce_co_drain_subtree(void)
105
-static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
76
+{
106
+static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
77
+ call_in_coroutine(test_quiesce_drain_subtree);
107
+ int64_t sector_num, int nb_sectors,
78
+}
108
+ QEMUIOVector *qiov)
79
+
80
static void test_nested(void)
109
{
81
{
110
- QEDWriteZeroesCB *cb = opaque;
82
BlockBackend *blk;
111
-
83
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
112
- cb->done = true;
84
g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
113
- cb->ret = ret;
85
test_drv_cb_drain_subtree);
114
- if (cb->co) {
86
115
- aio_co_wake(cb->co);
87
+ // XXX bdrv_drain_all() doesn't work in coroutine context
116
- }
88
+ g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
117
+ return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
89
+ g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
118
}
90
+ test_drv_cb_co_drain_subtree);
119
91
+
120
static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
92
+
121
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
93
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
122
int count,
94
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
123
BdrvRequestFlags flags)
95
g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
124
{
96
test_quiesce_drain_subtree);
125
- BlockAIOCB *blockacb;
97
126
BDRVQEDState *s = bs->opaque;
98
+ // XXX bdrv_drain_all() doesn't work in coroutine context
127
- QEDWriteZeroesCB cb = { .done = false };
99
+ g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
128
QEMUIOVector qiov;
100
+ g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
129
struct iovec iov;
101
+ test_quiesce_co_drain_subtree);
130
102
+
131
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
103
g_test_add_func("/bdrv-drain/nested", test_nested);
132
iov.iov_len = count;
104
133
105
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
134
qemu_iovec_init_external(&qiov, &iov, 1);
135
- blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov,
136
- count >> BDRV_SECTOR_BITS,
137
- qed_co_pwrite_zeroes_cb, &cb,
138
- QED_AIOCB_WRITE | QED_AIOCB_ZERO);
139
- if (!blockacb) {
140
- return -EIO;
141
- }
142
- if (!cb.done) {
143
- cb.co = qemu_coroutine_self();
144
- qemu_coroutine_yield();
145
- }
146
- assert(cb.done);
147
- return cb.ret;
148
+ return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov,
149
+ count >> BDRV_SECTOR_BITS,
150
+ QED_AIOCB_WRITE | QED_AIOCB_ZERO);
151
}
152
153
static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
154
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_qed = {
155
.bdrv_create = bdrv_qed_create,
156
.bdrv_has_zero_init = bdrv_has_zero_init_1,
157
.bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
158
- .bdrv_aio_readv = bdrv_qed_aio_readv,
159
- .bdrv_aio_writev = bdrv_qed_aio_writev,
160
+ .bdrv_co_readv = bdrv_qed_co_readv,
161
+ .bdrv_co_writev = bdrv_qed_co_writev,
162
.bdrv_co_pwrite_zeroes = bdrv_qed_co_pwrite_zeroes,
163
.bdrv_truncate = bdrv_qed_truncate,
164
.bdrv_getlength = bdrv_qed_getlength,
165
--
106
--
166
1.8.3.1
107
2.13.6
167
108
168
109
1
From: Stefan Hajnoczi <stefanha@redhat.com>
1
Test that drain sections are correctly propagated through the graph.
2
2
3
Perform the savevm/loadvm test with both iothread on and off. This
4
covers the recently found savevm/loadvm hang when iothread is enabled.
5
6
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
7
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
3
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
---
4
---
9
tests/qemu-iotests/068 | 23 ++++++++++++++---------
5
tests/test-bdrv-drain.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++
10
tests/qemu-iotests/068.out | 11 ++++++++++-
6
1 file changed, 74 insertions(+)
11
2 files changed, 24 insertions(+), 10 deletions(-)
12
7
13
diff --git a/tests/qemu-iotests/068 b/tests/qemu-iotests/068
8
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
14
index XXXXXXX..XXXXXXX 100755
9
index XXXXXXX..XXXXXXX 100644
15
--- a/tests/qemu-iotests/068
10
--- a/tests/test-bdrv-drain.c
16
+++ b/tests/qemu-iotests/068
11
+++ b/tests/test-bdrv-drain.c
17
@@ -XXX,XX +XXX,XX @@ _supported_os Linux
12
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
18
IMGOPTS="compat=1.1"
13
blk_unref(blk);
19
IMG_SIZE=128K
20
21
-echo
22
-echo "=== Saving and reloading a VM state to/from a qcow2 image ==="
23
-echo
24
-_make_test_img $IMG_SIZE
25
-
26
case "$QEMU_DEFAULT_MACHINE" in
27
s390-ccw-virtio)
28
platform_parm="-no-shutdown"
29
@@ -XXX,XX +XXX,XX @@ _qemu()
30
_filter_qemu | _filter_hmp
31
}
14
}
32
15
33
-# Give qemu some time to boot before saving the VM state
16
+static void test_multiparent(void)
34
-bash -c 'sleep 1; echo -e "savevm 0\nquit"' | _qemu
17
+{
35
-# Now try to continue from that VM state (this should just work)
18
+ BlockBackend *blk_a, *blk_b;
36
-echo quit | _qemu -loadvm 0
19
+ BlockDriverState *bs_a, *bs_b, *backing;
37
+for extra_args in \
20
+ BDRVTestState *a_s, *b_s, *backing_s;
38
+ "" \
39
+ "-object iothread,id=iothread0 -set device.hba0.iothread=iothread0"; do
40
+ echo
41
+ echo "=== Saving and reloading a VM state to/from a qcow2 image ($extra_args) ==="
42
+ echo
43
+
21
+
44
+ _make_test_img $IMG_SIZE
22
+ blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
23
+ bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
24
+ &error_abort);
25
+ a_s = bs_a->opaque;
26
+ blk_insert_bs(blk_a, bs_a, &error_abort);
45
+
27
+
46
+ # Give qemu some time to boot before saving the VM state
28
+ blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
47
+ bash -c 'sleep 1; echo -e "savevm 0\nquit"' | _qemu $extra_args
29
+ bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
48
+ # Now try to continue from that VM state (this should just work)
30
+ &error_abort);
49
+ echo quit | _qemu $extra_args -loadvm 0
31
+ b_s = bs_b->opaque;
50
+done
32
+ blk_insert_bs(blk_b, bs_b, &error_abort);
51
52
# success, all done
53
echo "*** done"
54
diff --git a/tests/qemu-iotests/068.out b/tests/qemu-iotests/068.out
55
index XXXXXXX..XXXXXXX 100644
56
--- a/tests/qemu-iotests/068.out
57
+++ b/tests/qemu-iotests/068.out
58
@@ -XXX,XX +XXX,XX @@
59
QA output created by 068
60
61
-=== Saving and reloading a VM state to/from a qcow2 image ===
62
+=== Saving and reloading a VM state to/from a qcow2 image () ===
63
+
33
+
64
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072
34
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
65
+QEMU X.Y.Z monitor - type 'help' for more information
35
+ backing_s = backing->opaque;
66
+(qemu) savevm 0
36
+ bdrv_set_backing_hd(bs_a, backing, &error_abort);
67
+(qemu) quit
37
+ bdrv_set_backing_hd(bs_b, backing, &error_abort);
68
+QEMU X.Y.Z monitor - type 'help' for more information
69
+(qemu) quit
70
+
38
+
71
+=== Saving and reloading a VM state to/from a qcow2 image (-object iothread,id=iothread0 -set device.hba0.iothread=iothread0) ===
39
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
72
40
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
73
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072
41
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
74
QEMU X.Y.Z monitor - type 'help' for more information
42
+ g_assert_cmpint(a_s->drain_count, ==, 0);
43
+ g_assert_cmpint(b_s->drain_count, ==, 0);
44
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
45
+
46
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
47
+
48
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
49
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
50
+ g_assert_cmpint(backing->quiesce_counter, ==, 1);
51
+ g_assert_cmpint(a_s->drain_count, ==, 1);
52
+ g_assert_cmpint(b_s->drain_count, ==, 1);
53
+ g_assert_cmpint(backing_s->drain_count, ==, 1);
54
+
55
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
56
+
57
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 2);
58
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
59
+ g_assert_cmpint(backing->quiesce_counter, ==, 2);
60
+ g_assert_cmpint(a_s->drain_count, ==, 2);
61
+ g_assert_cmpint(b_s->drain_count, ==, 2);
62
+ g_assert_cmpint(backing_s->drain_count, ==, 2);
63
+
64
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
65
+
66
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
67
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
68
+ g_assert_cmpint(backing->quiesce_counter, ==, 1);
69
+ g_assert_cmpint(a_s->drain_count, ==, 1);
70
+ g_assert_cmpint(b_s->drain_count, ==, 1);
71
+ g_assert_cmpint(backing_s->drain_count, ==, 1);
72
+
73
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
74
+
75
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
76
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
77
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
78
+ g_assert_cmpint(a_s->drain_count, ==, 0);
79
+ g_assert_cmpint(b_s->drain_count, ==, 0);
80
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
81
+
82
+ bdrv_unref(backing);
83
+ bdrv_unref(bs_a);
84
+ bdrv_unref(bs_b);
85
+ blk_unref(blk_a);
86
+ blk_unref(blk_b);
87
+}
88
+
89
90
typedef struct TestBlockJob {
91
BlockJob common;
92
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
93
test_quiesce_co_drain_subtree);
94
95
g_test_add_func("/bdrv-drain/nested", test_nested);
96
+ g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
97
98
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
99
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
75
--
100
--
76
1.8.3.1
101
2.13.6
77
102
78
103
1
Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
1
We need to remember how many of the drain sections in which a node is
2
just return an error code and let the caller handle it.
2
were recursive (i.e. subtree drain rather than node drain), so that they
3
can be correctly applied when children are added or removed during the
4
drained section.
5
6
With this change, it is safe to modify the graph even inside a
7
bdrv_subtree_drained_begin/end() section.
3
8
4
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
6
---
10
---
7
block/qed.c | 43 ++++++++++++++++++++-----------------------
11
include/block/block.h | 2 --
8
1 file changed, 20 insertions(+), 23 deletions(-)
12
include/block/block_int.h | 5 +++++
9
13
block.c | 32 +++++++++++++++++++++++++++++---
10
diff --git a/block/qed.c b/block/qed.c
14
block/io.c | 28 ++++++++++++++++++++++++----
11
index XXXXXXX..XXXXXXX 100644
15
4 files changed, 58 insertions(+), 9 deletions(-)
12
--- a/block/qed.c
16
13
+++ b/block/qed.c
17
diff --git a/include/block/block.h b/include/block/block.h
14
@@ -XXX,XX +XXX,XX @@ static bool qed_should_set_need_check(BDRVQEDState *s)
18
index XXXXXXX..XXXXXXX 100644
15
*
19
--- a/include/block/block.h
16
* This path is taken when writing to previously unallocated clusters.
20
+++ b/include/block/block.h
21
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs);
22
/**
23
* Like bdrv_drained_begin, but recursively begins a quiesced section for
24
* exclusive access to all child nodes as well.
25
- *
26
- * Graph changes are not allowed during a subtree drain section.
17
*/
27
*/
18
-static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
28
void bdrv_subtree_drained_begin(BlockDriverState *bs);
19
+static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
29
30
diff --git a/include/block/block_int.h b/include/block/block_int.h
31
index XXXXXXX..XXXXXXX 100644
32
--- a/include/block/block_int.h
33
+++ b/include/block/block_int.h
34
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
35
36
/* Accessed with atomic ops. */
37
int quiesce_counter;
38
+ int recursive_quiesce_counter;
39
+
40
unsigned int write_gen; /* Current data generation */
41
42
/* Protected by reqs_lock. */
43
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
44
int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
45
BdrvRequestFlags flags);
46
47
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
48
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
49
+
50
int get_tmp_filename(char *filename, int size);
51
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
52
const char *filename);
53
diff --git a/block.c b/block.c
54
index XXXXXXX..XXXXXXX 100644
55
--- a/block.c
56
+++ b/block.c
57
@@ -XXX,XX +XXX,XX @@ static void bdrv_child_cb_drained_end(BdrvChild *child)
58
bdrv_drained_end(bs);
59
}
60
61
+static void bdrv_child_cb_attach(BdrvChild *child)
62
+{
63
+ BlockDriverState *bs = child->opaque;
64
+ bdrv_apply_subtree_drain(child, bs);
65
+}
66
+
67
+static void bdrv_child_cb_detach(BdrvChild *child)
68
+{
69
+ BlockDriverState *bs = child->opaque;
70
+ bdrv_unapply_subtree_drain(child, bs);
71
+}
72
+
73
static int bdrv_child_cb_inactivate(BdrvChild *child)
20
{
74
{
21
BDRVQEDState *s = acb_to_s(acb);
75
BlockDriverState *bs = child->opaque;
22
int ret;
76
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_file = {
23
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
77
.inherit_options = bdrv_inherited_options,
78
.drained_begin = bdrv_child_cb_drained_begin,
79
.drained_end = bdrv_child_cb_drained_end,
80
+ .attach = bdrv_child_cb_attach,
81
+ .detach = bdrv_child_cb_detach,
82
.inactivate = bdrv_child_cb_inactivate,
83
};
84
85
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_format = {
86
.inherit_options = bdrv_inherited_fmt_options,
87
.drained_begin = bdrv_child_cb_drained_begin,
88
.drained_end = bdrv_child_cb_drained_end,
89
+ .attach = bdrv_child_cb_attach,
90
+ .detach = bdrv_child_cb_detach,
91
.inactivate = bdrv_child_cb_inactivate,
92
};
93
94
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_attach(BdrvChild *c)
95
parent->backing_blocker);
96
bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
97
parent->backing_blocker);
98
+
99
+ bdrv_child_cb_attach(c);
100
}
101
102
static void bdrv_backing_detach(BdrvChild *c)
103
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_detach(BdrvChild *c)
104
bdrv_op_unblock_all(c->bs, parent->backing_blocker);
105
error_free(parent->backing_blocker);
106
parent->backing_blocker = NULL;
107
+
108
+ bdrv_child_cb_detach(c);
109
}
110
111
/*
112
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
113
assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
24
}
114
}
25
if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
115
if (old_bs) {
26
s->allocating_write_reqs_plugged) {
116
+ /* Detach first so that the recursive drain sections coming from @child
27
- return; /* wait for existing request to finish */
117
+ * are already gone and we only end the drain sections that came from
28
+ return -EINPROGRESS; /* wait for existing request to finish */
118
+ * elsewhere. */
119
+ if (child->role->detach) {
120
+ child->role->detach(child);
121
+ }
122
if (old_bs->quiesce_counter && child->role->drained_end) {
123
for (i = 0; i < old_bs->quiesce_counter; i++) {
124
child->role->drained_end(child);
125
}
126
}
127
- if (child->role->detach) {
128
- child->role->detach(child);
129
- }
130
QLIST_REMOVE(child, next_parent);
29
}
131
}
30
132
31
acb->cur_nclusters = qed_bytes_to_clusters(s,
133
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
32
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
134
}
33
if (acb->flags & QED_AIOCB_ZERO) {
135
}
34
/* Skip ahead if the clusters are already zero */
136
35
if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
137
+ /* Attach only after starting new drained sections, so that recursive
36
- qed_aio_start_io(acb);
138
+ * drain sections coming from @child don't get an extra .drained_begin
37
- return;
139
+ * callback. */
38
+ return 0;
140
if (child->role->attach) {
39
}
141
child->role->attach(child);
40
} else {
142
}
41
acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
143
diff --git a/block/io.c b/block/io.c
42
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
144
index XXXXXXX..XXXXXXX 100644
43
s->header.features |= QED_F_NEED_CHECK;
145
--- a/block/io.c
44
ret = qed_write_header(s);
146
+++ b/block/io.c
45
if (ret < 0) {
147
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
46
- qed_aio_complete(acb, ret);
148
assert(data.done);
47
- return;
149
}
48
+ return ret;
150
49
}
151
-static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
152
- BdrvChild *parent)
153
+void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
154
+ BdrvChild *parent)
155
{
156
BdrvChild *child, *next;
157
158
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
159
bdrv_drain_recurse(bs);
160
161
if (recursive) {
162
+ bs->recursive_quiesce_counter++;
163
QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
164
bdrv_do_drained_begin(child->bs, true, child);
165
}
166
@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_begin(BlockDriverState *bs)
167
bdrv_do_drained_begin(bs, true, NULL);
168
}
169
170
-static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
171
- BdrvChild *parent)
172
+void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
173
+ BdrvChild *parent)
174
{
175
BdrvChild *child, *next;
176
int old_quiesce_counter;
177
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
50
}
178
}
51
179
52
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
180
if (recursive) {
53
ret = qed_aio_write_cow(acb);
181
+ bs->recursive_quiesce_counter--;
54
}
182
QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
55
if (ret < 0) {
183
bdrv_do_drained_end(child->bs, true, child);
56
- qed_aio_complete(acb, ret);
184
}
57
- return;
185
@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_end(BlockDriverState *bs)
58
+ return ret;
186
bdrv_do_drained_end(bs, true, NULL);
59
}
187
}
60
- qed_aio_next_io(acb, 0);
188
61
+ return 0;
189
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
62
}
190
+{
63
191
+ int i;
64
/**
192
+
65
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
193
+ for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
66
*
194
+ bdrv_do_drained_begin(child->bs, true, child);
67
* This path is taken when writing to already allocated clusters.
68
*/
69
-static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
70
+static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
71
{
72
- int ret;
73
-
74
/* Allocate buffer for zero writes */
75
if (acb->flags & QED_AIOCB_ZERO) {
76
struct iovec *iov = acb->qiov->iov;
77
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
78
if (!iov->iov_base) {
79
iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len);
80
if (iov->iov_base == NULL) {
81
- qed_aio_complete(acb, -ENOMEM);
82
- return;
83
+ return -ENOMEM;
84
}
85
memset(iov->iov_base, 0, iov->iov_len);
86
}
87
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
88
qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
89
90
/* Do the actual write */
91
- ret = qed_aio_write_main(acb);
92
- if (ret < 0) {
93
- qed_aio_complete(acb, ret);
94
- return;
95
- }
96
- qed_aio_next_io(acb, 0);
97
+ return qed_aio_write_main(acb);
98
}
99
100
/**
101
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_data(void *opaque, int ret,
102
103
switch (ret) {
104
case QED_CLUSTER_FOUND:
105
- qed_aio_write_inplace(acb, offset, len);
106
+ ret = qed_aio_write_inplace(acb, offset, len);
107
break;
108
109
case QED_CLUSTER_L2:
110
case QED_CLUSTER_L1:
111
case QED_CLUSTER_ZERO:
112
- qed_aio_write_alloc(acb, len);
113
+ ret = qed_aio_write_alloc(acb, len);
114
break;
115
116
default:
117
- qed_aio_complete(acb, ret);
118
+ assert(ret < 0);
119
break;
120
}
121
+
122
+ if (ret < 0) {
123
+ if (ret != -EINPROGRESS) {
124
+ qed_aio_complete(acb, ret);
125
+ }
126
+ return;
127
+ }
195
+ }
128
+ qed_aio_next_io(acb, 0);
196
+}
129
}
197
+
130
198
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
131
/**
199
+{
200
+ int i;
201
+
202
+ for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
203
+ bdrv_do_drained_end(child->bs, true, child);
204
+ }
205
+}
206
+
207
/*
208
* Wait for pending requests to complete on a single BlockDriverState subtree,
209
* and suspend block driver's internal I/O until next request arrives.
132
--
210
--
133
1.8.3.1
211
2.13.6
134
212
135
213
1
From: Stefan Hajnoczi <stefanha@redhat.com>
2
3
Avoid duplicating the QEMU command-line.
4
5
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
6
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
1
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
7
---
2
---
8
tests/qemu-iotests/068 | 15 +++++++++------
3
tests/test-bdrv-drain.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
9
1 file changed, 9 insertions(+), 6 deletions(-)
4
1 file changed, 80 insertions(+)
10
5
11
diff --git a/tests/qemu-iotests/068 b/tests/qemu-iotests/068
6
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
12
index XXXXXXX..XXXXXXX 100755
7
index XXXXXXX..XXXXXXX 100644
13
--- a/tests/qemu-iotests/068
8
--- a/tests/test-bdrv-drain.c
14
+++ b/tests/qemu-iotests/068
9
+++ b/tests/test-bdrv-drain.c
15
@@ -XXX,XX +XXX,XX @@ case "$QEMU_DEFAULT_MACHINE" in
10
@@ -XXX,XX +XXX,XX @@ static void test_multiparent(void)
16
;;
11
blk_unref(blk_b);
17
esac
12
}
18
13
19
-# Give qemu some time to boot before saving the VM state
14
+static void test_graph_change(void)
20
-bash -c 'sleep 1; echo -e "savevm 0\nquit"' |\
21
- $QEMU $platform_parm -nographic -monitor stdio -serial none -hda "$TEST_IMG" |\
22
+_qemu()
23
+{
15
+{
24
+ $QEMU $platform_parm -nographic -monitor stdio -serial none -hda "$TEST_IMG" \
16
+ BlockBackend *blk_a, *blk_b;
25
+ "$@" |\
17
+ BlockDriverState *bs_a, *bs_b, *backing;
26
_filter_qemu | _filter_hmp
18
+ BDRVTestState *a_s, *b_s, *backing_s;
19
+
20
+ blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
21
+ bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
22
+ &error_abort);
23
+ a_s = bs_a->opaque;
24
+ blk_insert_bs(blk_a, bs_a, &error_abort);
25
+
26
+ blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
27
+ bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
28
+ &error_abort);
29
+ b_s = bs_b->opaque;
30
+ blk_insert_bs(blk_b, bs_b, &error_abort);
31
+
32
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
33
+ backing_s = backing->opaque;
34
+ bdrv_set_backing_hd(bs_a, backing, &error_abort);
35
+
36
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
37
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
38
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
39
+ g_assert_cmpint(a_s->drain_count, ==, 0);
40
+ g_assert_cmpint(b_s->drain_count, ==, 0);
41
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
42
+
43
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
44
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
45
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
46
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
47
+ do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
48
+
49
+ bdrv_set_backing_hd(bs_b, backing, &error_abort);
50
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
51
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
52
+ g_assert_cmpint(backing->quiesce_counter, ==, 5);
53
+ g_assert_cmpint(a_s->drain_count, ==, 5);
54
+ g_assert_cmpint(b_s->drain_count, ==, 5);
55
+ g_assert_cmpint(backing_s->drain_count, ==, 5);
56
+
57
+ bdrv_set_backing_hd(bs_b, NULL, &error_abort);
58
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 3);
59
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
60
+ g_assert_cmpint(backing->quiesce_counter, ==, 3);
61
+ g_assert_cmpint(a_s->drain_count, ==, 3);
62
+ g_assert_cmpint(b_s->drain_count, ==, 2);
63
+ g_assert_cmpint(backing_s->drain_count, ==, 3);
64
+
65
+ bdrv_set_backing_hd(bs_b, backing, &error_abort);
66
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
67
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
68
+ g_assert_cmpint(backing->quiesce_counter, ==, 5);
69
+ g_assert_cmpint(a_s->drain_count, ==, 5);
70
+ g_assert_cmpint(b_s->drain_count, ==, 5);
71
+ g_assert_cmpint(backing_s->drain_count, ==, 5);
72
+
73
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
74
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
75
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
76
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
77
+ do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
78
+
79
+ g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
80
+ g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
81
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
82
+ g_assert_cmpint(a_s->drain_count, ==, 0);
83
+ g_assert_cmpint(b_s->drain_count, ==, 0);
84
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
85
+
86
+ bdrv_unref(backing);
87
+ bdrv_unref(bs_a);
88
+ bdrv_unref(bs_b);
89
+ blk_unref(blk_a);
90
+ blk_unref(blk_b);
27
+}
91
+}
28
+
92
+
29
+# Give qemu some time to boot before saving the VM state
93
30
+bash -c 'sleep 1; echo -e "savevm 0\nquit"' | _qemu
94
typedef struct TestBlockJob {
31
# Now try to continue from that VM state (this should just work)
95
BlockJob common;
32
-echo quit |\
96
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
33
- $QEMU $platform_parm -nographic -monitor stdio -serial none -hda "$TEST_IMG" -loadvm 0 |\
97
34
- _filter_qemu | _filter_hmp
98
g_test_add_func("/bdrv-drain/nested", test_nested);
35
+echo quit | _qemu -loadvm 0
99
g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
36
100
+ g_test_add_func("/bdrv-drain/graph-change", test_graph_change);
37
# success, all done
101
38
echo "*** done"
102
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
103
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
39
--
104
--
40
1.8.3.1
105
2.13.6
41
106
42
107
1
commit_complete() can't assume that after its block_job_completed() the
1
Since commit bde70715, base is the only node that is reopened in
2
job is actually immediately freed; someone else may still be holding
2
commit_start(). This means that the code, which still involves an
3
references. In this case, the op blockers on the intermediate nodes make
3
explicit BlockReopenQueue, can now be simplified by using bdrv_reopen().
4
the graph reconfiguration in the completion code fail.
5
4
6
Call block_job_remove_all_bdrv() manually so that we know for sure that
7
any blockers on intermediate nodes are given up.
8
9
Cc: qemu-stable@nongnu.org
10
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
5
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
11
Reviewed-by: Eric Blake <eblake@redhat.com>
6
Reviewed-by: Fam Zheng <famz@redhat.com>
12
Reviewed-by: Max Reitz <mreitz@redhat.com>
13
---
7
---
14
block/commit.c | 7 +++++++
8
block/commit.c | 8 +-------
15
1 file changed, 7 insertions(+)
9
1 file changed, 1 insertion(+), 7 deletions(-)
16
10
17
diff --git a/block/commit.c b/block/commit.c
11
diff --git a/block/commit.c b/block/commit.c
18
index XXXXXXX..XXXXXXX 100644
12
index XXXXXXX..XXXXXXX 100644
19
--- a/block/commit.c
13
--- a/block/commit.c
20
+++ b/block/commit.c
14
+++ b/block/commit.c
21
@@ -XXX,XX +XXX,XX @@ static void commit_complete(BlockJob *job, void *opaque)
15
@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
22
}
16
const char *filter_node_name, Error **errp)
23
g_free(s->backing_file_str);
17
{
24
blk_unref(s->top);
18
CommitBlockJob *s;
25
+
19
- BlockReopenQueue *reopen_queue = NULL;
26
+ /* If there is more than one reference to the job (e.g. if called from
20
int orig_base_flags;
27
+ * block_job_finish_sync()), block_job_completed() won't free it and
21
BlockDriverState *iter;
28
+ * therefore the blockers on the intermediate nodes remain. This would
22
BlockDriverState *commit_top_bs = NULL;
29
+ * cause bdrv_set_backing_hd() to fail. */
23
@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
30
+ block_job_remove_all_bdrv(job);
24
/* convert base to r/w, if necessary */
31
+
25
orig_base_flags = bdrv_get_flags(base);
32
block_job_completed(&s->common, ret);
26
if (!(orig_base_flags & BDRV_O_RDWR)) {
33
g_free(data);
27
- reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
34
28
- orig_base_flags | BDRV_O_RDWR);
29
- }
30
-
31
- if (reopen_queue) {
32
- bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
33
+ bdrv_reopen(base, orig_base_flags | BDRV_O_RDWR, &local_err);
34
if (local_err != NULL) {
35
error_propagate(errp, local_err);
36
goto fail;
35
--
37
--
36
1.8.3.1
38
2.13.6
37
39
38
40
Deleted patch

From: Stefan Hajnoczi <stefanha@redhat.com>

AioContext was designed to allow nested acquire/release calls. It uses
a recursive mutex so callers don't need to worry about nesting...or so
we thought.

BDRV_POLL_WHILE() is used to wait for block I/O requests. It releases
the AioContext temporarily around aio_poll(). This gives IOThreads a
chance to acquire the AioContext to process I/O completions.

It turns out that recursive locking and BDRV_POLL_WHILE() don't mix.
BDRV_POLL_WHILE() only releases the AioContext once, so the IOThread
will not be able to acquire the AioContext if it was acquired
multiple times.

Instead of trying to release AioContext n times in BDRV_POLL_WHILE(),
this patch simply avoids nested locking in save_vmstate(). It's the
simplest fix and we should step back to consider the big picture with
all the recent changes to block layer threading.

This patch is the final fix to solve 'savevm' hanging with -object
iothread.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
migration/savevm.c | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -XXX,XX +XXX,XX @@ int save_snapshot(const char *name, Error **errp)
goto the_end;
}

+ /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
+ * for itself. BDRV_POLL_WHILE() does not support nested locking because
+ * it only releases the lock once. Therefore synchronous I/O will deadlock
+ * unless we release the AioContext before bdrv_all_create_snapshot().
+ */
+ aio_context_release(aio_context);
+ aio_context = NULL;
+
ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs);
if (ret < 0) {
error_setg(errp, "Error while creating snapshot on '%s'",
@@ -XXX,XX +XXX,XX @@ int save_snapshot(const char *name, Error **errp)
ret = 0;

the_end:
- aio_context_release(aio_context);
+ if (aio_context) {
+ aio_context_release(aio_context);
+ }
if (saved_vm_running) {
vm_start();
}
--
1.8.3.1
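A note for readers who have not hit the locking problem described above: releasing a recursively held lock once does not make it available to other threads. The following standalone sketch is an illustration only, using plain POSIX threads instead of QEMU's AioContext; all names in it are invented for the demo. Build with cc -pthread.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t m;

/* Stands in for the IOThread: it can only make progress once the lock's
 * recursion count has dropped to zero. */
static void *iothread_stand_in(void *arg)
{
    if (pthread_mutex_trylock(&m) == EBUSY) {
        printf("lock still held elsewhere, the real IOThread would block\n");
    } else {
        printf("got the lock\n");
        pthread_mutex_unlock(&m);
    }
    return NULL;
}

int main(void)
{
    pthread_mutexattr_t attr;
    pthread_t t;

    pthread_mutexattr_init(&attr);
    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
    pthread_mutex_init(&m, &attr);

    pthread_mutex_lock(&m);     /* outer acquire, e.g. the monitor command */
    pthread_mutex_lock(&m);     /* nested acquire by a callee */
    pthread_mutex_unlock(&m);   /* release only once, as BDRV_POLL_WHILE() does */

    pthread_create(&t, NULL, iothread_stand_in, NULL);
    pthread_join(&t, NULL);     /* reports that the lock is still held */

    pthread_mutex_unlock(&m);   /* drop the remaining hold */
    pthread_mutex_destroy(&m);
    pthread_mutexattr_destroy(&attr);
    return 0;
}

The second thread's trylock fails with EBUSY precisely because only one of the two holds was dropped, which is why the patch releases the AioContext entirely before calling bdrv_all_create_snapshot().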
diff view generated by jsdifflib
Deleted patch

From: Alberto Garcia <berto@igalia.com>

There used to be throttle_timers_{detach,attach}_aio_context() calls
in bdrv_set_aio_context(), but since 7ca7f0f6db1fedd28d490795d778cf239
they are now in blk_set_aio_context().

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/throttle-groups.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index XXXXXXX..XXXXXXX 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -XXX,XX +XXX,XX @@
* Again, all this is handled internally and is mostly transparent to
* the outside. The 'throttle_timers' field however has an additional
* constraint because it may be temporarily invalid (see for example
- * bdrv_set_aio_context()). Therefore in this file a thread will
+ * blk_set_aio_context()). Therefore in this file a thread will
* access some other BlockBackend's timers only after verifying that
* that BlockBackend has throttled requests in the queue.
*/
--
1.8.3.1
diff view generated by jsdifflib
Deleted patch

From: Stefan Hajnoczi <stefanha@redhat.com>

Old kvm.ko versions only supported a tiny number of ioeventfds so
virtio-pci avoids ioeventfds when kvm_has_many_ioeventfds() returns 0.

Do not check kvm_has_many_ioeventfds() when KVM is disabled since it
always returns 0. Since commit 8c56c1a592b5092d91da8d8943c17777d6462a6f
("memory: emulate ioeventfd") it has been possible to use ioeventfds in
qtest or TCG mode.

This patch makes -device virtio-blk-pci,iothread=iothread0 work even
when KVM is disabled.

I have tested that virtio-blk-pci works under TCG both with and without
iothread.

Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/virtio/virtio-pci.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -XXX,XX +XXX,XX @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
bool pcie_port = pci_bus_is_express(pci_dev->bus) &&
!pci_bus_is_root(pci_dev->bus);

- if (!kvm_has_many_ioeventfds()) {
+ if (kvm_enabled() && !kvm_has_many_ioeventfds()) {
proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
}

--
1.8.3.1
diff view generated by jsdifflib
Deleted patch

From: Stefan Hajnoczi <stefanha@redhat.com>

migration_incoming_state_destroy() uses qemu_fclose() on the vmstate
file. Make sure to call it inside an AioContext acquire/release region.

This fixes a 'qemu: qemu_mutex_unlock: Operation not permitted' abort
in loadvm.

This patch closes the vmstate file before ending the drained region.
Previously we closed the vmstate file after ending the drained region.
The order does not matter.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
migration/savevm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -XXX,XX +XXX,XX @@ int load_snapshot(const char *name, Error **errp)

aio_context_acquire(aio_context);
ret = qemu_loadvm_state(f);
+ migration_incoming_state_destroy();
aio_context_release(aio_context);

bdrv_drain_all_end();

- migration_incoming_state_destroy();
if (ret < 0) {
error_setg(errp, "Error %d while loading VM state", ret);
return ret;
--
1.8.3.1
diff view generated by jsdifflib
Deleted patch

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/qed-cluster.c | 39 ++++++++++++++++++++++-----------------
block/qed.c | 24 +++++++++++-------------
block/qed.h | 4 ++--
3 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/block/qed-cluster.c b/block/qed-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed-cluster.c
+++ b/block/qed-cluster.c
@@ -XXX,XX +XXX,XX @@ static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
* @s: QED state
* @request: L2 cache entry
* @pos: Byte position in device
- * @len: Number of bytes
- * @cb: Completion function
- * @opaque: User data for completion function
+ * @len: Number of bytes (may be shortened on return)
+ * @img_offset: Contains offset in the image file on success
*
* This function translates a position in the block device to an offset in the
- * image file. It invokes the cb completion callback to report back the
- * translated offset or unallocated range in the image file.
+ * image file. The translated offset or unallocated range in the image file is
+ * reported back in *img_offset and *len.
*
* If the L2 table exists, request->l2_table points to the L2 table cache entry
* and the caller must free the reference when they are finished. The cache
* entry is exposed in this way to avoid callers having to read the L2 table
* again later during request processing. If request->l2_table is non-NULL it
* will be unreferenced before taking on the new cache entry.
+ *
+ * On success QED_CLUSTER_FOUND is returned and img_offset/len are a contiguous
+ * range in the image file.
+ *
+ * On failure QED_CLUSTER_L2 or QED_CLUSTER_L1 is returned for missing L2 or L1
+ * table offset, respectively. len is number of contiguous unallocated bytes.
*/
-void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
- size_t len, QEDFindClusterFunc *cb, void *opaque)
+int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
+ size_t *len, uint64_t *img_offset)
{
uint64_t l2_offset;
uint64_t offset = 0;
@@ -XXX,XX +XXX,XX @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
/* Limit length to L2 boundary. Requests are broken up at the L2 boundary
* so that a request acts on one L2 table at a time.
*/
- len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos);
+ *len = MIN(*len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos);

l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)];
if (qed_offset_is_unalloc_cluster(l2_offset)) {
- cb(opaque, QED_CLUSTER_L1, 0, len);
- return;
+ *img_offset = 0;
+ return QED_CLUSTER_L1;
}
if (!qed_check_table_offset(s, l2_offset)) {
- cb(opaque, -EINVAL, 0, 0);
- return;
+ *img_offset = *len = 0;
+ return -EINVAL;
}

ret = qed_read_l2_table(s, request, l2_offset);
@@ -XXX,XX +XXX,XX @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
}

index = qed_l2_index(s, pos);
- n = qed_bytes_to_clusters(s,
- qed_offset_into_cluster(s, pos) + len);
+ n = qed_bytes_to_clusters(s, qed_offset_into_cluster(s, pos) + *len);
n = qed_count_contiguous_clusters(s, request->l2_table->table,
index, n, &offset);

@@ -XXX,XX +XXX,XX @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
ret = -EINVAL;
}

- len = MIN(len,
- n * s->header.cluster_size - qed_offset_into_cluster(s, pos));
+ *len = MIN(*len,
+ n * s->header.cluster_size - qed_offset_into_cluster(s, pos));

out:
- cb(opaque, ret, offset, len);
+ *img_offset = offset;
qed_release(s);
+ return ret;
}
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
.file = file,
};
QEDRequest request = { .l2_table = NULL };
+ uint64_t offset;
+ int ret;

- qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb);
+ ret = qed_find_cluster(s, &request, cb.pos, &len, &offset);
+ qed_is_allocated_cb(&cb, ret, offset, len);

- /* Now sleep if the callback wasn't invoked immediately */
- while (cb.status == BDRV_BLOCK_OFFSET_MASK) {
- cb.co = qemu_coroutine_self();
- qemu_coroutine_yield();
- }
+ /* The callback was invoked immediately */
+ assert(cb.status != BDRV_BLOCK_OFFSET_MASK);

qed_unref_l2_cache_entry(request.l2_table);

@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
* or -errno
* @offset: Cluster offset in bytes
* @len: Length in bytes
- *
- * Callback from qed_find_cluster().
*/
static void qed_aio_write_data(void *opaque, int ret,
uint64_t offset, size_t len)
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_data(void *opaque, int ret,
* or -errno
* @offset: Cluster offset in bytes
* @len: Length in bytes
- *
- * Callback from qed_find_cluster().
*/
static void qed_aio_read_data(void *opaque, int ret,
uint64_t offset, size_t len)
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret)
BDRVQEDState *s = acb_to_s(acb);
QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
qed_aio_write_data : qed_aio_read_data;
+ uint64_t offset;
+ size_t len;

trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);

@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret)
}

/* Find next cluster and start I/O */
- qed_find_cluster(s, &acb->request,
- acb->cur_pos, acb->end_pos - acb->cur_pos,
- io_fn, acb);
+ len = acb->end_pos - acb->cur_pos;
+ ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
+ io_fn(acb, ret, offset, len);
}

static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
diff --git a/block/qed.h b/block/qed.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -XXX,XX +XXX,XX @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
/**
* Cluster functions
*/
-void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
- size_t len, QEDFindClusterFunc *cb, void *opaque);
+int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
+ size_t *len, uint64_t *img_offset);

/**
* Consistency check
--
1.8.3.1
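The shape of this conversion, independent of the QED specifics: a function that used to deliver its result through a completion callback now returns a status code and fills in out-parameters, and the caller invokes the former callback body directly afterwards. The following generic standalone sketch is an illustration only; find_cluster and is_allocated_cb are invented stand-ins, not the QED functions.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Old shape: the result is delivered through a callback like this. */
typedef void FindClusterCb(void *opaque, int ret, uint64_t offset, size_t len);

/* New shape: status through the return value, data through out-parameters.
 * The "translation" below is made up purely for the demo. */
static int find_cluster(uint64_t pos, size_t *len, uint64_t *offset)
{
    if (*len > 65536) {
        *len = 65536;        /* e.g. clamp the request to a table boundary */
    }
    *offset = pos + 0x10000;
    return 0;                /* 0 stands in for "found" */
}

/* The former callback survives as an ordinary function. */
static void is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
{
    printf("ret=%d offset=0x%" PRIx64 " len=%zu (opaque=%p)\n",
           ret, offset, len, opaque);
}

int main(void)
{
    size_t len = 1 << 20;
    uint64_t offset;

    /* Caller pattern after the conversion: call synchronously, then run the
     * old callback body with the returned values. */
    int ret = find_cluster(4096, &len, &offset);
    is_allocated_cb(NULL, ret, offset, len);
    return 0;
}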
diff view generated by jsdifflib
1
All callers pass ret = 0, so we can just remove it.
1
The bdrv_reopen*() implementation doesn't like it if the graph is
2
changed between queuing nodes for reopen and actually reopening them
3
(one of the reasons is that queuing can be recursive).
4
5
So instead of draining the device only in bdrv_reopen_multiple(),
6
require that callers already drained all affected nodes, and assert this
7
in bdrv_reopen_queue().
2
8
3
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
4
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Reviewed-by: Fam Zheng <famz@redhat.com>
5
---
11
---
6
block/qed.c | 17 ++++++-----------
12
block.c | 23 ++++++++++++++++-------
7
1 file changed, 6 insertions(+), 11 deletions(-)
13
block/replication.c | 6 ++++++
14
qemu-io-cmds.c | 3 +++
15
3 files changed, 25 insertions(+), 7 deletions(-)
8
16
9
diff --git a/block/qed.c b/block/qed.c
17
diff --git a/block.c b/block.c
10
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
11
--- a/block/qed.c
19
--- a/block.c
12
+++ b/block/qed.c
20
+++ b/block.c
13
@@ -XXX,XX +XXX,XX @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
21
@@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
14
return l2_table;
22
* returns a pointer to bs_queue, which is either the newly allocated
23
* bs_queue, or the existing bs_queue being used.
24
*
25
+ * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
26
*/
27
static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
28
BlockDriverState *bs,
29
@@ -XXX,XX +XXX,XX @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
30
BdrvChild *child;
31
QDict *old_options, *explicit_options;
32
33
+ /* Make sure that the caller remembered to use a drained section. This is
34
+ * important to avoid graph changes between the recursive queuing here and
35
+ * bdrv_reopen_multiple(). */
36
+ assert(bs->quiesce_counter > 0);
37
+
38
if (bs_queue == NULL) {
39
bs_queue = g_new0(BlockReopenQueue, 1);
40
QSIMPLEQ_INIT(bs_queue);
41
@@ -XXX,XX +XXX,XX @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
42
* If all devices prepare successfully, then the changes are committed
43
* to all devices.
44
*
45
+ * All affected nodes must be drained between bdrv_reopen_queue() and
46
+ * bdrv_reopen_multiple().
47
*/
48
int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp)
49
{
50
@@ -XXX,XX +XXX,XX @@ int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **er
51
52
assert(bs_queue != NULL);
53
54
- aio_context_release(ctx);
55
- bdrv_drain_all_begin();
56
- aio_context_acquire(ctx);
57
-
58
QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
59
+ assert(bs_entry->state.bs->quiesce_counter > 0);
60
if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
61
error_propagate(errp, local_err);
62
goto cleanup;
63
@@ -XXX,XX +XXX,XX @@ cleanup:
64
}
65
g_free(bs_queue);
66
67
- bdrv_drain_all_end();
68
-
69
return ret;
15
}
70
}
16
71
17
-static void qed_aio_next_io(QEDAIOCB *acb, int ret);
72
@@ -XXX,XX +XXX,XX @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
18
+static void qed_aio_next_io(QEDAIOCB *acb);
19
20
static void qed_aio_start_io(QEDAIOCB *acb)
21
{
73
{
22
- qed_aio_next_io(acb, 0);
74
int ret = -1;
23
+ qed_aio_next_io(acb);
75
Error *local_err = NULL;
76
- BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
77
+ BlockReopenQueue *queue;
78
79
+ bdrv_subtree_drained_begin(bs);
80
+
81
+ queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
82
ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
83
if (local_err != NULL) {
84
error_propagate(errp, local_err);
85
}
86
+
87
+ bdrv_subtree_drained_end(bs);
88
+
89
return ret;
24
}
90
}
25
91
26
static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
92
diff --git a/block/replication.c b/block/replication.c
27
@@ -XXX,XX +XXX,XX @@ static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
93
index XXXXXXX..XXXXXXX 100644
28
/**
94
--- a/block/replication.c
29
* Begin next I/O or complete the request
95
+++ b/block/replication.c
30
*/
96
@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
31
-static void qed_aio_next_io(QEDAIOCB *acb, int ret)
97
new_secondary_flags = s->orig_secondary_flags;
32
+static void qed_aio_next_io(QEDAIOCB *acb)
33
{
34
BDRVQEDState *s = acb_to_s(acb);
35
uint64_t offset;
36
size_t len;
37
+ int ret;
38
39
- trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
40
+ trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
41
42
if (acb->backing_qiov) {
43
qemu_iovec_destroy(acb->backing_qiov);
44
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret)
45
acb->backing_qiov = NULL;
46
}
98
}
47
99
48
- /* Handle I/O error */
100
+ bdrv_subtree_drained_begin(s->hidden_disk->bs);
49
- if (ret) {
101
+ bdrv_subtree_drained_begin(s->secondary_disk->bs);
50
- qed_aio_complete(acb, ret);
102
+
51
- return;
103
if (orig_hidden_flags != new_hidden_flags) {
52
- }
104
reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, NULL,
53
-
105
new_hidden_flags);
54
acb->qiov_offset += acb->cur_qiov.size;
106
@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
55
acb->cur_pos += acb->cur_qiov.size;
107
reopen_queue, &local_err);
56
qemu_iovec_reset(&acb->cur_qiov);
108
error_propagate(errp, local_err);
57
@@ -XXX,XX +XXX,XX @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret)
58
}
59
return;
60
}
109
}
61
- qed_aio_next_io(acb, 0);
110
+
62
+ qed_aio_next_io(acb);
111
+ bdrv_subtree_drained_end(s->hidden_disk->bs);
112
+ bdrv_subtree_drained_end(s->secondary_disk->bs);
63
}
113
}
64
114
65
static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
115
static void backup_job_cleanup(BlockDriverState *bs)
116
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
117
index XXXXXXX..XXXXXXX 100644
118
--- a/qemu-io-cmds.c
119
+++ b/qemu-io-cmds.c
120
@@ -XXX,XX +XXX,XX @@ static int reopen_f(BlockBackend *blk, int argc, char **argv)
121
opts = qopts ? qemu_opts_to_qdict(qopts, NULL) : NULL;
122
qemu_opts_reset(&reopen_opts);
123
124
+ bdrv_subtree_drained_begin(bs);
125
brq = bdrv_reopen_queue(NULL, bs, opts, flags);
126
bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, &local_err);
127
+ bdrv_subtree_drained_end(bs);
128
+
129
if (local_err) {
130
error_report_err(local_err);
131
} else {
66
--
132
--
67
1.8.3.1
133
2.13.6
68
134
69
135
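The patch above moves responsibility for draining to the callers of bdrv_reopen_queue()/bdrv_reopen_multiple() and enforces the new contract with an assertion on bs->quiesce_counter. The general "assert the precondition the caller was told to establish" pattern is sketched below as an illustration only, with a plain counter standing in for QEMU's drained sections; every name is invented for the demo.

#include <assert.h>
#include <stdio.h>

typedef struct Node {
    int quiesce_counter;   /* >0 while the node is "drained" */
} Node;

static void drained_begin(Node *n) { n->quiesce_counter++; }
static void drained_end(Node *n)   { n->quiesce_counter--; }

static void queue_for_reopen(Node *n)
{
    /* Fail loudly if the caller forgot the drained section; catching the
     * mistake here is much cheaper than debugging a graph change that
     * happens between queuing and the actual reopen. */
    assert(n->quiesce_counter > 0);
    printf("queued\n");
}

int main(void)
{
    Node n = { 0 };

    drained_begin(&n);     /* the caller's responsibility under the new contract */
    queue_for_reopen(&n);
    drained_end(&n);
    return 0;
}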
diff view generated by jsdifflib
Deleted patch

From: Max Reitz <mreitz@redhat.com>

The bs->exact_filename field may not be sufficient to store the full
blkdebug node filename. In this case, we should not generate a filename
at all instead of an unusable one.

Cc: qemu-stable@nongnu.org
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170613172006.19685-2-mreitz@redhat.com
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
block/blkdebug.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index XXXXXXX..XXXXXXX 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -XXX,XX +XXX,XX @@ static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
}

if (!force_json && bs->file->bs->exact_filename[0]) {
- snprintf(bs->exact_filename, sizeof(bs->exact_filename),
- "blkdebug:%s:%s", s->config_file ?: "",
- bs->file->bs->exact_filename);
+ int ret = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+ "blkdebug:%s:%s", s->config_file ?: "",
+ bs->file->bs->exact_filename);
+ if (ret >= sizeof(bs->exact_filename)) {
+ /* An overflow makes the filename unusable, so do not report any */
+ bs->exact_filename[0] = 0;
+ }
}

opts = qdict_new();
--
1.8.3.1
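The mechanism this patch relies on, shown standalone (illustration only, not the blkdebug code): snprintf() returns the length the formatted string would have had without truncation, so a return value equal to or larger than the buffer size means the output was cut short.

#include <stdio.h>

int main(void)
{
    char buf[16];
    int ret = snprintf(buf, sizeof(buf), "blkdebug:%s:%s",
                       "some-config-file", "some-rather-long-filename");

    if (ret >= (int)sizeof(buf)) {
        /* Same policy as the patch: a truncated filename is unusable,
         * so report none at all. */
        buf[0] = 0;
    }
    printf("result: '%s' (snprintf returned %d)\n", buf, ret);
    return 0;
}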
diff view generated by jsdifflib
Deleted patch

From: Max Reitz <mreitz@redhat.com>

uri_parse(...)->scheme may be NULL. In fact, probably every field may be
NULL, and the callers do test this for all of the other fields but not
for scheme (except for block/gluster.c; block/vxhs.c does not access
that field at all).

We can easily fix this by using g_strcmp0() instead of strcmp().

Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170613205726.13544-1-mreitz@redhat.com
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
block/nbd.c | 6 +++---
block/nfs.c | 2 +-
block/sheepdog.c | 6 +++---
block/ssh.c | 2 +-
4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -XXX,XX +XXX,XX @@ static int nbd_parse_uri(const char *filename, QDict *options)
}

/* transport */
- if (!strcmp(uri->scheme, "nbd")) {
+ if (!g_strcmp0(uri->scheme, "nbd")) {
is_unix = false;
- } else if (!strcmp(uri->scheme, "nbd+tcp")) {
+ } else if (!g_strcmp0(uri->scheme, "nbd+tcp")) {
is_unix = false;
- } else if (!strcmp(uri->scheme, "nbd+unix")) {
+ } else if (!g_strcmp0(uri->scheme, "nbd+unix")) {
is_unix = true;
} else {
ret = -EINVAL;
diff --git a/block/nfs.c b/block/nfs.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -XXX,XX +XXX,XX @@ static int nfs_parse_uri(const char *filename, QDict *options, Error **errp)
error_setg(errp, "Invalid URI specified");
goto out;
}
- if (strcmp(uri->scheme, "nfs") != 0) {
+ if (g_strcmp0(uri->scheme, "nfs") != 0) {
error_setg(errp, "URI scheme must be 'nfs'");
goto out;
}
diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static void sd_parse_uri(SheepdogConfig *cfg, const char *filename,
}

/* transport */
- if (!strcmp(uri->scheme, "sheepdog")) {
+ if (!g_strcmp0(uri->scheme, "sheepdog")) {
is_unix = false;
- } else if (!strcmp(uri->scheme, "sheepdog+tcp")) {
+ } else if (!g_strcmp0(uri->scheme, "sheepdog+tcp")) {
is_unix = false;
- } else if (!strcmp(uri->scheme, "sheepdog+unix")) {
+ } else if (!g_strcmp0(uri->scheme, "sheepdog+unix")) {
is_unix = true;
} else {
error_setg(&err, "URI scheme must be 'sheepdog', 'sheepdog+tcp',"
diff --git a/block/ssh.c b/block/ssh.c
index XXXXXXX..XXXXXXX 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -XXX,XX +XXX,XX @@ static int parse_uri(const char *filename, QDict *options, Error **errp)
return -EINVAL;
}

- if (strcmp(uri->scheme, "ssh") != 0) {
+ if (g_strcmp0(uri->scheme, "ssh") != 0) {
error_setg(errp, "URI scheme must be 'ssh'");
goto err;
}
--
1.8.3.1
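For reference, the GLib behaviour this patch depends on: g_strcmp0() accepts NULL and orders it before any non-NULL string, so a URI without a scheme simply fails the comparison instead of crashing. A standalone sketch, illustration only, which needs GLib to build (e.g. cc demo.c `pkg-config --cflags --libs glib-2.0`):

#include <glib.h>
#include <stdio.h>

int main(void)
{
    const char *scheme = NULL;   /* e.g. the parsed URI had no scheme */

    /* strcmp(scheme, "nfs") would dereference NULL here. */
    if (g_strcmp0(scheme, "nfs") != 0) {
        printf("URI scheme must be 'nfs'\n");
    }

    scheme = "nbd+unix";
    printf("nbd+unix matches: %s\n",
           g_strcmp0(scheme, "nbd+unix") == 0 ? "yes" : "no");
    return 0;
}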
diff view generated by jsdifflib