The following changes since commit 3a821c52e1a30ecd9a436f2c67cc66b5628c829f:

  Merge tag 'nvme-next-pull-request' of git://git.infradead.org/qemu-nvme into staging (2022-06-23 14:52:30 -0700)

are available in the Git repository at:

  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 779d82e1d305f2a9cbd7f48cf6555ad58145e04a:

  vduse-blk: Add name option (2022-06-24 17:07:06 +0200)

----------------------------------------------------------------
Block layer patches

- Add vduse-blk export
- Dirty bitmaps: Fix and improve bitmap merge
- gluster: correctly set max_pdiscard
- rbd: report a better error when namespace does not exist
- aio_wait_kick: add missing memory barrier
- Code cleanups

----------------------------------------------------------------
Emanuele Giuseppe Esposito (1):
      aio_wait_kick: add missing memory barrier

Eric Blake (1):
      nbd: Drop dead code spotted by Coverity

Fabian Ebner (1):
      block/gluster: correctly set max_pdiscard

Stefan Hajnoczi (3):
      block: drop unused bdrv_co_drain() API
      block: get rid of blk->guest_block_size
      qsd: document vduse-blk exports

Stefano Garzarella (1):
      block/rbd: report a better error when namespace does not exist

Vladimir Sementsov-Ogievskiy (3):
      block: block_dirty_bitmap_merge(): fix error path
      block: improve block_dirty_bitmap_merge(): don't allocate extra bitmap
      block: simplify handling of try to merge different sized bitmaps

Xie Yongji (10):
      block: Support passing NULL ops to blk_set_dev_ops()
      block/export: Fix incorrect length passed to vu_queue_push()
      block/export: Abstract out the logic of virtio-blk I/O process
      linux-headers: Add vduse.h
      libvduse: Add VDUSE (vDPA Device in Userspace) library
      vduse-blk: Implement vduse-blk export
      vduse-blk: Add vduse-blk resize support
      libvduse: Add support for reconnecting
      vduse-blk: Add serial option
      vduse-blk: Add name option

 qapi/block-export.json                      |   29 +-
 docs/tools/qemu-storage-daemon.rst          |   22 +
 meson_options.txt                           |    4 +
 block/export/vduse-blk.h                    |   20 +
 block/export/virtio-blk-handler.h           |   37 +
 include/block/aio-wait.h                    |    2 +
 include/block/block-io.h                    |    1 -
 include/block/block_int-io.h                |    2 +-
 include/qemu/hbitmap.h                      |   15 +-
 include/sysemu/block-backend-io.h           |    1 -
 linux-headers/linux/vduse.h                 |  306 ++++
 subprojects/libvduse/include/atomic.h       |    1 +
 subprojects/libvduse/include/compiler.h     |    1 +
 subprojects/libvduse/libvduse.h             |  247 +++++
 block/backup.c                              |    6 +-
 block/block-backend.c                       |   12 +-
 block/dirty-bitmap.c                        |   26 +-
 block/export/export.c                       |    6 +
 block/export/vduse-blk.c                    |  374 ++++++
 block/export/vhost-user-blk-server.c        |  263 +----
 block/export/virtio-blk-handler.c           |  240 +++++
 block/gluster.c                             |    2 +-
 block/io.c                                  |   15 -
 block/monitor/bitmap-qmp-cmds.c             |   40 +-
 block/nbd.c                                 |    8 +-
 block/rbd.c                                 |   24 +
 hw/block/virtio-blk.c                       |    1 -
 hw/block/xen-block.c                        |    1 -
 hw/ide/core.c                               |    1 -
 hw/scsi/scsi-disk.c                         |    1 -
 hw/scsi/scsi-generic.c                      |    1 -
 storage-daemon/qemu-storage-daemon.c        |   10 +
 subprojects/libvduse/libvduse.c             | 1375 +++++++++++++++++++++
 util/aio-wait.c                             |   16 +-
 util/hbitmap.c                              |   25 +-
 MAINTAINERS                                 |    9 +
 block/export/meson.build                    |    7 +-
 meson.build                                 |   34 +
 scripts/meson-buildoptions.sh               |    7 +
 scripts/update-linux-headers.sh             |    2 +-
 subprojects/libvduse/linux-headers/linux    |    1 +
 subprojects/libvduse/meson.build            |   10 +
 subprojects/libvduse/standard-headers/linux |    1 +
 43 files changed, 2852 insertions(+), 354 deletions(-)
 create mode 100644 block/export/vduse-blk.h
 create mode 100644 block/export/virtio-blk-handler.h
 create mode 100644 linux-headers/linux/vduse.h
 create mode 120000 subprojects/libvduse/include/atomic.h
 create mode 120000 subprojects/libvduse/include/compiler.h
 create mode 100644 subprojects/libvduse/libvduse.h
 create mode 100644 block/export/vduse-blk.c
 create mode 100644 block/export/virtio-blk-handler.c
 create mode 100644 subprojects/libvduse/libvduse.c
 create mode 120000 subprojects/libvduse/linux-headers/linux
 create mode 100644 subprojects/libvduse/meson.build
 create mode 120000 subprojects/libvduse/standard-headers/linux

The following changes since commit 281f327487c9c9b1599f93c589a408bbf4a651b8:

  Merge remote-tracking branch 'remotes/vivier/tags/m68k-for-2.12-pull-request' into staging (2017-12-22 00:11:36 +0000)

are available in the git repository at:

  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 1a63a907507fbbcfaee3f622907ec244b7eabda8:

  block: Keep nodes drained between reopen_queue/multiple (2017-12-22 15:05:32 +0100)

----------------------------------------------------------------
Block layer patches

----------------------------------------------------------------
Doug Gale (1):
      nvme: Add tracing

Edgar Kaziakhmedov (1):
      qcow2: get rid of qcow2_backing_read1 routine

Fam Zheng (2):
      block: Open backing image in force share mode for size probe
      block: Remove unused bdrv_requests_pending

John Snow (1):
      iotests: fix 197 for vpc

Kevin Wolf (27):
      block: Formats don't need CONSISTENT_READ with NO_IO
      block: Make bdrv_drain_invoke() recursive
      block: Call .drain_begin only once in bdrv_drain_all_begin()
      test-bdrv-drain: Test BlockDriver callbacks for drain
      block: bdrv_drain_recurse(): Remove unused begin parameter
      block: Don't wait for requests in bdrv_drain*_end()
      block: Unify order in drain functions
      block: Don't acquire AioContext in hmp_qemu_io()
      block: Document that x-blockdev-change breaks quorum children list
      block: Assert drain_all is only called from main AioContext
      block: Make bdrv_drain() driver callbacks non-recursive
      test-bdrv-drain: Test callback for bdrv_drain
      test-bdrv-drain: Test bs->quiesce_counter
      blockjob: Pause job on draining any job BDS
      test-bdrv-drain: Test drain vs. block jobs
      block: Don't block_job_pause_all() in bdrv_drain_all()
      block: Nested drain_end must still call callbacks
      test-bdrv-drain: Test nested drain sections
      block: Don't notify parents in drain call chain
      block: Add bdrv_subtree_drained_begin/end()
      test-bdrv-drain: Tests for bdrv_subtree_drain
      test-bdrv-drain: Test behaviour in coroutine context
      test-bdrv-drain: Recursive draining with multiple parents
      block: Allow graph changes in subtree drained section
      test-bdrv-drain: Test graph changes in drained section
      commit: Simplify reopen of base
      block: Keep nodes drained between reopen_queue/multiple

Thomas Huth (3):
      block: Remove the obsolete -drive boot=on|off parameter
      block: Remove the deprecated -hdachs option
      block: Mention -drive cyls/heads/secs/trans/serial/addr in deprecation chapter

 qapi/block-core.json             |   4 +
 block/qcow2.h                    |   3 -
 include/block/block.h            |  15 +-
 include/block/block_int.h        |   6 +-
 block.c                          |  75 ++++-
 block/commit.c                   |   8 +-
 block/io.c                       | 164 +++++++---
 block/qcow2.c                    |  51 +--
 block/replication.c              |   6 +
 blockdev.c                       |  11 -
 blockjob.c                       |  22 +-
 hmp.c                            |   6 -
 hw/block/nvme.c                  | 349 +++++++++++++++++----
 qemu-io-cmds.c                   |   3 +
 tests/test-bdrv-drain.c          | 651 +++++++++++++++++++++++++++++++++++++++
 vl.c                             |  86 +-----
 hw/block/trace-events            |  93 ++++++
 qemu-doc.texi                    |  29 +-
 qemu-options.hx                  |  19 +-
 tests/Makefile.include           |   2 +
 tests/qemu-iotests/197           |   4 +
 tests/qemu-iotests/common.filter |   3 +-
 22 files changed, 1294 insertions(+), 316 deletions(-)
 create mode 100644 tests/test-bdrv-drain.c
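[Editor's sketch, not part of the series: the vduse-blk export in the first
pull request above builds on the libvduse API added later in this thread
(see libvduse.h below). The following minimal C sketch only strings that
public API together to show the expected control flow of a backend; the
device id, config buffer and handle_element() are illustrative placeholders,
and a real backend would register/unregister kick fds from the
enable_queue/disable_queue callbacks instead of polling unconditionally.]

    #include <poll.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include "libvduse.h"

    static void handle_element(VduseVirtq *vq, VduseVirtqElement *elem)
    {
        /* A real backend parses elem->out_sg and fills elem->in_sg here */
        vduse_queue_push(vq, elem, 0);   /* add to used ring */
        vduse_queue_notify(vq);          /* inject a vq interrupt */
    }

    static void process_queue(VduseVirtq *vq)
    {
        VduseVirtqElement *elem;

        /* Elements returned by vduse_queue_pop() are heap-allocated */
        while ((elem = vduse_queue_pop(vq, sizeof(*elem)))) {
            handle_element(vq, elem);
            free(elem);
        }
    }

    static void enable_queue(VduseDev *dev, VduseVirtq *vq)  { /* add kick fd */ }
    static void disable_queue(VduseDev *dev, VduseVirtq *vq) { /* remove kick fd */ }

    int main(void)
    {
        static const VduseOps ops = {
            .enable_queue  = enable_queue,
            .disable_queue = disable_queue,
        };
        char config[64] = { 0 };            /* placeholder config space */
        VduseDev *dev = vduse_dev_create("demo0", 2 /* virtio-blk */, 0,
                                         vduse_get_virtio_features(), 1,
                                         sizeof(config), config, &ops, NULL);
        if (!dev) {
            return 1;
        }
        vduse_dev_setup_queue(dev, 0, 256);

        for (;;) {
            struct pollfd pfd[2] = {
                { .fd = vduse_dev_get_fd(dev), .events = POLLIN },
                /* Note: the kick fd is only valid once the queue is enabled */
                { .fd = vduse_queue_get_fd(vduse_dev_get_queue(dev, 0)),
                  .events = POLLIN },
            };
            if (poll(pfd, 2, -1) < 0) {
                break;
            }
            if (pfd[0].revents & POLLIN) {
                vduse_dev_handler(dev);          /* service control messages */
            }
            if (pfd[1].revents & POLLIN) {
                uint64_t n;
                read(pfd[1].fd, &n, sizeof(n));  /* consume the eventfd kick */
                process_queue(vduse_dev_get_queue(dev, 0));
            }
        }
        vduse_dev_destroy(dev);
        return 0;
    }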
New patch
Commit 1f4ad7d fixed 'qemu-img info' for raw images that are currently
in use as a mirror target. It is not enough for image formats, though,
as these still unconditionally request BLK_PERM_CONSISTENT_READ.

As this permission is geared towards whether the guest-visible data is
consistent, and has no impact on whether the metadata is sane, and
'qemu-img info' does not read guest-visible data (except for the raw
format), it makes sense to not require BLK_PERM_CONSISTENT_READ if there
is not going to be any guest I/O performed, regardless of image format.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
     assert(role == &child_backing || role == &child_file);
 
     if (!backing) {
+        int flags = bdrv_reopen_get_flags(reopen_queue, bs);
+
         /* Apart from the modifications below, the same permissions are
          * forwarded and left alone as for filters */
         bdrv_filter_default_perms(bs, c, role, reopen_queue, perm, shared,
@@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
 
         /* bs->file always needs to be consistent because of the metadata. We
          * can never allow other users to resize or write to it. */
-        perm |= BLK_PERM_CONSISTENT_READ;
+        if (!(flags & BDRV_O_NO_IO)) {
+            perm |= BLK_PERM_CONSISTENT_READ;
+        }
         shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
     } else {
         /* We want consistent read from backing files if the parent needs it.
--
2.13.6
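[Editor's note: the gating added above can be read in isolation as "only
request consistent reads when the node will actually serve guest I/O". A
hedged stand-alone illustration of that logic, reusing the flag and helper
names from the patch itself rather than any new API:]

    /* Sketch: permission computation with the BDRV_O_NO_IO gate */
    uint64_t perm = 0;
    int flags = bdrv_reopen_get_flags(reopen_queue, bs); /* reopen-aware */

    if (!(flags & BDRV_O_NO_IO)) {
        /* Guest I/O is possible, so the data must stay consistent */
        perm |= BLK_PERM_CONSISTENT_READ;
    }
    /* 'qemu-img info' opens with BDRV_O_NO_IO and thus skips the branch */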
New patch
From: John Snow <jsnow@redhat.com>

VPC has some difficulty creating geometries of particular size.
However, we can indeed force it to use a literal one, so let's
do that for the sake of test 197, which is testing some specific
offsets.

Signed-off-by: John Snow <jsnow@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Lukáš Doktor <ldoktor@redhat.com>
---
 tests/qemu-iotests/197           | 4 ++++
 tests/qemu-iotests/common.filter | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/197
+++ b/tests/qemu-iotests/197
@@ -XXX,XX +XXX,XX @@ echo '=== Copy-on-read ==='
 echo
 
 # Prep the images
+# VPC rounds image sizes to a specific geometry, force a specific size.
+if [ "$IMGFMT" = "vpc" ]; then
+    IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size")
+fi
 _make_test_img 4G
 $QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io
 IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -XXX,XX +XXX,XX @@ _filter_img_create()
         -e "s# log_size=[0-9]\\+##g" \
         -e "s# refcount_bits=[0-9]\\+##g" \
         -e "s# key-secret=[a-zA-Z0-9]\\+##g" \
-        -e "s# iter-time=[0-9]\\+##g"
+        -e "s# iter-time=[0-9]\\+##g" \
+        -e "s# force_size=\\(on\\|off\\)##g"
 }
 
 _filter_img_info()
--
2.13.6
New patch
This change separates bdrv_drain_invoke(), which calls the BlockDriver
drain callbacks, from bdrv_drain_recurse(). Instead, the function
performs its own recursion now.

One reason for this is that bdrv_drain_recurse() can be called multiple
times by bdrv_drain_all_begin(), but the callbacks may only be called
once. The separation is necessary to fix this bug.

The other reason is that we intend to go to a model where we call all
driver callbacks first, and only then start polling. This is not fully
achieved yet with this patch, as bdrv_drain_invoke() contains a
BDRV_POLL_WHILE() loop for the block driver callbacks, which can still
call callbacks for any unrelated event. It's a step in this direction
anyway.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
     bdrv_wakeup(bs);
 }
 
+/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
 static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 {
+    BdrvChild *child, *tmp;
     BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
 
     if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
     data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
     bdrv_coroutine_enter(bs, data.co);
     BDRV_POLL_WHILE(bs, !data.done);
+
+    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
+        bdrv_drain_invoke(child->bs, begin);
+    }
 }
 
 static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
     BdrvChild *child, *tmp;
     bool waited;
 
-    /* Ensure any pending metadata writes are submitted to bs->file. */
-    bdrv_drain_invoke(bs, begin);
-
     /* Wait for drained requests to finish */
     waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
         bdrv_parent_drained_begin(bs);
     }
 
+    bdrv_drain_invoke(bs, true);
     bdrv_drain_recurse(bs, true);
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
     }
 
     bdrv_parent_drained_end(bs);
+    bdrv_drain_invoke(bs, false);
     bdrv_drain_recurse(bs, false);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
             if (aio_context == bdrv_get_aio_context(bs)) {
+                /* FIXME Calling this multiple times is wrong */
+                bdrv_drain_invoke(bs, true);
                 waited |= bdrv_drain_recurse(bs, true);
             }
         }
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_context_acquire(aio_context);
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
+        bdrv_drain_invoke(bs, false);
         bdrv_drain_recurse(bs, false);
         aio_context_release(aio_context);
     }
--
2.13.6
New patch
bdrv_drain_all_begin() used to call the .bdrv_co_drain_begin() driver
callback inside its polling loop. This means that how many times it got
called for each node depended on how long it had to poll the event loop.

This is obviously not right and results in nodes that stay drained even
after bdrv_drain_all_end(), which calls .bdrv_co_drain_end() once per
node.

Fix bdrv_drain_all_begin() to call the callback only once, too.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         bdrv_parent_drained_begin(bs);
         aio_disable_external(aio_context);
+        bdrv_drain_invoke(bs, true);
         aio_context_release(aio_context);
 
         if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
             if (aio_context == bdrv_get_aio_context(bs)) {
-                /* FIXME Calling this multiple times is wrong */
-                bdrv_drain_invoke(bs, true);
                 waited |= bdrv_drain_recurse(bs, true);
             }
         }
--
2.13.6
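[Editor's note: taken together, the two drain fixes above establish the
invariant that a drained section invokes each driver's .bdrv_co_drain_begin
exactly once and .bdrv_co_drain_end exactly once per node. A condensed C
sketch of the check, modelled on the test patch that follows; the counter
driver is illustrative, not the real test code:]

    static int drain_count;

    static void coroutine_fn test_co_drain_begin(BlockDriverState *bs)
    {
        drain_count++;                    /* must end up at exactly 1 */
    }

    static void coroutine_fn test_co_drain_end(BlockDriverState *bs)
    {
        drain_count--;                    /* back to 0 after the section */
    }

    /* usage, with the callbacks wired into a registered BlockDriver: */
    bdrv_drain_all_begin();
    g_assert_cmpint(drain_count, ==, 1);  /* not once per poll iteration */
    bdrv_drain_all_end();
    g_assert_cmpint(drain_count, ==, 0);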
This adds a test case that the BlockDriver callbacks for drain are
called in bdrv_drain_all_begin/end(), and that both of them are called
exactly once.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 tests/test-bdrv-drain.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++
 tests/Makefile.include  |   2 +
 2 files changed, 139 insertions(+)
 create mode 100644 tests/test-bdrv-drain.c

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Block node draining tests
+ *
+ * Copyright (c) 2017 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "sysemu/block-backend.h"
+#include "qapi/error.h"
+
+typedef struct BDRVTestState {
+    int drain_count;
+} BDRVTestState;
+
+static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs)
+{
+    BDRVTestState *s = bs->opaque;
+    s->drain_count++;
+}
+
+static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs)
+{
+    BDRVTestState *s = bs->opaque;
+    s->drain_count--;
+}
+
+static void bdrv_test_close(BlockDriverState *bs)
+{
+    BDRVTestState *s = bs->opaque;
+    g_assert_cmpint(s->drain_count, >, 0);
+}
+
+static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs,
+                                            uint64_t offset, uint64_t bytes,
+                                            QEMUIOVector *qiov, int flags)
+{
+    /* We want this request to stay until the polling loop in drain waits for
+     * it to complete. We need to sleep a while as bdrv_drain_invoke() comes
+     * first and polls its result, too, but it shouldn't accidentally complete
+     * this request yet. */
+    qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+
+    return 0;
+}
+
+static BlockDriver bdrv_test = {
+    .format_name            = "test",
+    .instance_size          = sizeof(BDRVTestState),
+
+    .bdrv_close             = bdrv_test_close,
+    .bdrv_co_preadv         = bdrv_test_co_preadv,
+
+    .bdrv_co_drain_begin    = bdrv_test_co_drain_begin,
+    .bdrv_co_drain_end      = bdrv_test_co_drain_end,
+};
+
+static void aio_ret_cb(void *opaque, int ret)
+{
+    int *aio_ret = opaque;
+    *aio_ret = ret;
+}
+
+static void test_drv_cb_drain_all(void)
+{
+    BlockBackend *blk;
+    BlockDriverState *bs;
+    BDRVTestState *s;
+    BlockAIOCB *acb;
+    int aio_ret;
+
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = NULL,
+        .iov_len = 0,
+    };
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+                              &error_abort);
+    s = bs->opaque;
+    blk_insert_bs(blk, bs, &error_abort);
+
+    /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
+    g_assert_cmpint(s->drain_count, ==, 0);
+    bdrv_drain_all_begin();
+    g_assert_cmpint(s->drain_count, ==, 1);
+    bdrv_drain_all_end();
+    g_assert_cmpint(s->drain_count, ==, 0);
+
+    /* Now do the same while a request is pending */
+    aio_ret = -EINPROGRESS;
+    acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret);
+    g_assert(acb != NULL);
+    g_assert_cmpint(aio_ret, ==, -EINPROGRESS);
+
+    g_assert_cmpint(s->drain_count, ==, 0);
+    bdrv_drain_all_begin();
+    g_assert_cmpint(aio_ret, ==, 0);
+    g_assert_cmpint(s->drain_count, ==, 1);
+    bdrv_drain_all_end();
+    g_assert_cmpint(s->drain_count, ==, 0);
+
+    bdrv_unref(bs);
+    blk_unref(blk);
+}
+
+int main(int argc, char **argv)
+{
+    bdrv_init();
+    qemu_init_main_loop(&error_abort);
+
+    g_test_init(&argc, &argv, NULL);
+
+    g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
+
+    return g_test_run();
+}
diff --git a/tests/Makefile.include b/tests/Makefile.include

From: Xie Yongji <xieyongji@bytedance.com>

VDUSE [1] is a linux framework that makes it possible to implement
software-emulated vDPA devices in userspace. This adds a library
as a subproject to help implementing VDUSE backends in QEMU.

[1] https://www.kernel.org/doc/html/latest/userspace-api/vduse.html

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Message-Id: <20220523084611.91-6-xieyongji@bytedance.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 meson_options.txt                           |    2 +
 subprojects/libvduse/include/atomic.h       |    1 +
 subprojects/libvduse/include/compiler.h     |    1 +
 subprojects/libvduse/libvduse.h             |  235 ++++
 subprojects/libvduse/libvduse.c             | 1150 +++++++++++++++++++
 MAINTAINERS                                 |    5 +
 meson.build                                 |   15 +
 scripts/meson-buildoptions.sh               |    3 +
 subprojects/libvduse/linux-headers/linux    |    1 +
 subprojects/libvduse/meson.build            |   10 +
 subprojects/libvduse/standard-headers/linux |    1 +
 11 files changed, 1424 insertions(+)
 create mode 120000 subprojects/libvduse/include/atomic.h
 create mode 120000 subprojects/libvduse/include/compiler.h
 create mode 100644 subprojects/libvduse/libvduse.h
 create mode 100644 subprojects/libvduse/libvduse.c
 create mode 120000 subprojects/libvduse/linux-headers/linux
 create mode 100644 subprojects/libvduse/meson.build
 create mode 120000 subprojects/libvduse/standard-headers/linux

diff --git a/meson_options.txt b/meson_options.txt
index XXXXXXX..XXXXXXX 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -XXX,XX +XXX,XX @@ option('virtfs', type: 'feature', value: 'auto',
        description: 'virtio-9p support')
 option('virtiofsd', type: 'feature', value: 'auto',
        description: 'build virtiofs daemon (virtiofsd)')
+option('libvduse', type: 'feature', value: 'auto',
+       description: 'build VDUSE Library')
 
 option('capstone', type: 'feature', value: 'auto',
        description: 'Whether and how to find the capstone library')
diff --git a/subprojects/libvduse/include/atomic.h b/subprojects/libvduse/include/atomic.h
new file mode 120000
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/subprojects/libvduse/include/atomic.h
@@ -0,0 +1 @@
+../../../include/qemu/atomic.h
\ No newline at end of file
diff --git a/subprojects/libvduse/include/compiler.h b/subprojects/libvduse/include/compiler.h
new file mode 120000
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/subprojects/libvduse/include/compiler.h
@@ -0,0 +1 @@
+../../../include/qemu/compiler.h
\ No newline at end of file
diff --git a/subprojects/libvduse/libvduse.h b/subprojects/libvduse/libvduse.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/subprojects/libvduse/libvduse.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * VDUSE (vDPA Device in Userspace) library
+ *
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author:
+ *   Xie Yongji <xieyongji@bytedance.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#ifndef LIBVDUSE_H
+#define LIBVDUSE_H
+
+#include <stdint.h>
+#include <sys/uio.h>
+
+#define VIRTQUEUE_MAX_SIZE 1024
+
+/* VDUSE device structure */
+typedef struct VduseDev VduseDev;
+
+/* Virtqueue structure */
+typedef struct VduseVirtq VduseVirtq;
+
+/* Some operation of VDUSE backend */
+typedef struct VduseOps {
+    /* Called when virtqueue can be processed */
+    void (*enable_queue)(VduseDev *dev, VduseVirtq *vq);
+    /* Called when virtqueue processing should be stopped */
+    void (*disable_queue)(VduseDev *dev, VduseVirtq *vq);
+} VduseOps;
+
+/* Describing elements of the I/O buffer */
+typedef struct VduseVirtqElement {
+    /* Descriptor table index */
+    unsigned int index;
+    /* Number of physically-contiguous device-readable descriptors */
+    unsigned int out_num;
+    /* Number of physically-contiguous device-writable descriptors */
+    unsigned int in_num;
+    /* Array to store physically-contiguous device-writable descriptors */
+    struct iovec *in_sg;
+    /* Array to store physically-contiguous device-readable descriptors */
+    struct iovec *out_sg;
+} VduseVirtqElement;
+
+
+/**
+ * vduse_get_virtio_features:
+ *
+ * Get supported virtio features
+ *
+ * Returns: supported feature bits
+ */
+uint64_t vduse_get_virtio_features(void);
+
+/**
+ * vduse_queue_get_dev:
+ * @vq: specified virtqueue
+ *
+ * Get corresponding VDUSE device from the virtqueue.
+ *
+ * Returns: a pointer to VDUSE device on success, NULL on failure.
+ */
+VduseDev *vduse_queue_get_dev(VduseVirtq *vq);
+
+/**
+ * vduse_queue_get_fd:
+ * @vq: specified virtqueue
+ *
+ * Get the kick fd for the virtqueue.
+ *
+ * Returns: file descriptor on success, -1 on failure.
+ */
+int vduse_queue_get_fd(VduseVirtq *vq);
+
+/**
+ * vduse_queue_pop:
+ * @vq: specified virtqueue
+ * @sz: the size of struct to return (must be >= VduseVirtqElement)
+ *
+ * Pop an element from virtqueue available ring.
+ *
+ * Returns: a pointer to a structure containing VduseVirtqElement on success,
+ * NULL on failure.
+ */
+void *vduse_queue_pop(VduseVirtq *vq, size_t sz);
+
+/**
+ * vduse_queue_push:
+ * @vq: specified virtqueue
+ * @elem: pointer to VduseVirtqElement returned by vduse_queue_pop()
+ * @len: length in bytes to write
+ *
+ * Push an element to virtqueue used ring.
+ */
+void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
+                      unsigned int len);
+/**
+ * vduse_queue_notify:
+ * @vq: specified virtqueue
+ *
+ * Request to notify the queue.
+ */
+void vduse_queue_notify(VduseVirtq *vq);
+
+/**
+ * vduse_dev_get_priv:
+ * @dev: VDUSE device
+ *
+ * Get the private pointer passed to vduse_dev_create().
+ *
+ * Returns: private pointer on success, NULL on failure.
+ */
+void *vduse_dev_get_priv(VduseDev *dev);
+
+/**
+ * vduse_dev_get_queue:
+ * @dev: VDUSE device
+ * @index: virtqueue index
+ *
+ * Get the specified virtqueue.
+ *
+ * Returns: a pointer to the virtqueue on success, NULL on failure.
+ */
+VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index);
+
+/**
+ * vduse_dev_get_fd:
+ * @dev: VDUSE device
+ *
+ * Get the control message fd for the VDUSE device.
+ *
+ * Returns: file descriptor on success, -1 on failure.
+ */
+int vduse_dev_get_fd(VduseDev *dev);
+
+/**
+ * vduse_dev_handler:
+ * @dev: VDUSE device
+ *
+ * Used to process the control message.
+ *
+ * Returns: file descriptor on success, -errno on failure.
+ */
+int vduse_dev_handler(VduseDev *dev);
+
+/**
+ * vduse_dev_update_config:
+ * @dev: VDUSE device
+ * @size: the size to write to configuration space
+ * @offset: the offset from the beginning of configuration space
+ * @buffer: the buffer used to write from
+ *
+ * Update device configuration space and inject a config interrupt.
+ *
+ * Returns: 0 on success, -errno on failure.
+ */
+int vduse_dev_update_config(VduseDev *dev, uint32_t size,
+                            uint32_t offset, char *buffer);
+
+/**
+ * vduse_dev_setup_queue:
+ * @dev: VDUSE device
+ * @index: virtqueue index
+ * @max_size: the max size of virtqueue
+ *
+ * Setup the specified virtqueue.
+ *
+ * Returns: 0 on success, -errno on failure.
+ */
+int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size);
+
+/**
+ * vduse_dev_create_by_fd:
+ * @fd: passed file descriptor
+ * @num_queues: the number of virtqueues
+ * @ops: the operation of VDUSE backend
+ * @priv: private pointer
+ *
+ * Create VDUSE device from a passed file descriptor.
+ *
+ * Returns: pointer to VDUSE device on success, NULL on failure.
+ */
+VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
+                                 const VduseOps *ops, void *priv);
+
+/**
+ * vduse_dev_create_by_name:
+ * @name: VDUSE device name
+ * @num_queues: the number of virtqueues
+ * @ops: the operation of VDUSE backend
+ * @priv: private pointer
+ *
+ * Create VDUSE device on /dev/vduse/$NAME.
+ *
+ * Returns: pointer to VDUSE device on success, NULL on failure.
+ */
+VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
+                                   const VduseOps *ops, void *priv);
+
+/**
+ * vduse_dev_create:
+ * @name: VDUSE device name
+ * @device_id: virtio device id
+ * @vendor_id: virtio vendor id
+ * @features: virtio features
+ * @num_queues: the number of virtqueues
+ * @config_size: the size of the configuration space
+ * @config: the buffer of the configuration space
+ * @ops: the operation of VDUSE backend
+ * @priv: private pointer
+ *
+ * Create VDUSE device.
+ *
+ * Returns: pointer to VDUSE device on success, NULL on failure.
+ */
+VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
+                           uint32_t vendor_id, uint64_t features,
+                           uint16_t num_queues, uint32_t config_size,
+                           char *config, const VduseOps *ops, void *priv);
+
+/**
+ * vduse_dev_destroy:
+ * @dev: VDUSE device
+ *
+ * Destroy the VDUSE device.
+ *
+ * Returns: 0 on success, -errno on failure.
+ */
+int vduse_dev_destroy(VduseDev *dev);
+
+#endif
diff --git a/subprojects/libvduse/libvduse.c b/subprojects/libvduse/libvduse.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/subprojects/libvduse/libvduse.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * VDUSE (vDPA Device in Userspace) library
+ *
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
+ * Portions of codes and concepts borrowed from libvhost-user.c, so:
+ *     Copyright IBM, Corp. 2007
+ *     Copyright (c) 2016 Red Hat, Inc.
+ *
+ * Author:
+ *   Xie Yongji <xieyongji@bytedance.com>
+ *   Anthony Liguori <aliguori@us.ibm.com>
+ *   Marc-André Lureau <mlureau@redhat.com>
+ *   Victor Kaplansky <victork@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <endian.h>
+#include <unistd.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <inttypes.h>
+
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include <sys/mman.h>
+
+#include "include/atomic.h"
+#include "linux-headers/linux/virtio_ring.h"
+#include "linux-headers/linux/virtio_config.h"
+#include "linux-headers/linux/vduse.h"
+#include "libvduse.h"
+
+#define VDUSE_VQ_ALIGN 4096
+#define MAX_IOVA_REGIONS 256
+
+/* Round number down to multiple */
+#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
+
+/* Round number up to multiple */
+#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
+
+#ifndef unlikely
+#define unlikely(x)   __builtin_expect(!!(x), 0)
+#endif
+
+typedef struct VduseRing {
+    unsigned int num;
+    uint64_t desc_addr;
+    uint64_t avail_addr;
+    uint64_t used_addr;
+    struct vring_desc *desc;
+    struct vring_avail *avail;
+    struct vring_used *used;
+} VduseRing;
+
+struct VduseVirtq {
+    VduseRing vring;
+    uint16_t last_avail_idx;
+    uint16_t shadow_avail_idx;
+    uint16_t used_idx;
+    uint16_t signalled_used;
+    bool signalled_used_valid;
+    int index;
+    int inuse;
+    bool ready;
+    int fd;
+    VduseDev *dev;
+};
+
+typedef struct VduseIovaRegion {
+    uint64_t iova;
+    uint64_t size;
+    uint64_t mmap_offset;
+    uint64_t mmap_addr;
+} VduseIovaRegion;
+
+struct VduseDev {
+    VduseVirtq *vqs;
+    VduseIovaRegion regions[MAX_IOVA_REGIONS];
+    int num_regions;
+    char *name;
+    uint32_t device_id;
+    uint32_t vendor_id;
+    uint16_t num_queues;
+    uint16_t queue_size;
+    uint64_t features;
+    const VduseOps *ops;
+    int fd;
+    int ctrl_fd;
+    void *priv;
+};
+
+static inline bool has_feature(uint64_t features, unsigned int fbit)
+{
+    assert(fbit < 64);
+    return !!(features & (1ULL << fbit));
+}
+
+static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit)
+{
+    return has_feature(dev->features, fbit);
+}
+
+uint64_t vduse_get_virtio_features(void)
+{
+    return (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+           (1ULL << VIRTIO_F_VERSION_1) |
+           (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
+           (1ULL << VIRTIO_RING_F_EVENT_IDX) |
+           (1ULL << VIRTIO_RING_F_INDIRECT_DESC);
+}
+
+VduseDev *vduse_queue_get_dev(VduseVirtq *vq)
+{
+    return vq->dev;
+}
+
+int vduse_queue_get_fd(VduseVirtq *vq)
+{
+    return vq->fd;
+}
+
+void *vduse_dev_get_priv(VduseDev *dev)
+{
+    return dev->priv;
+}
+
+VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index)
+{
+    return &dev->vqs[index];
+}
+
+int vduse_dev_get_fd(VduseDev *dev)
+{
+    return dev->fd;
+}
+
+static int vduse_inject_irq(VduseDev *dev, int index)
+{
+    return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
+}
+
+static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
+                                     uint64_t last)
+{
+    int i;
+
+    if (last == start) {
+        return;
+    }
+
+    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
+        if (!dev->regions[i].mmap_addr) {
+            continue;
+        }
+
+        if (start <= dev->regions[i].iova &&
+            last >= (dev->regions[i].iova + dev->regions[i].size - 1)) {
+            munmap((void *)(uintptr_t)dev->regions[i].mmap_addr,
+                   dev->regions[i].mmap_offset + dev->regions[i].size);
+            dev->regions[i].mmap_addr = 0;
+            dev->num_regions--;
+        }
+    }
+}
+
+static int vduse_iova_add_region(VduseDev *dev, int fd,
+                                 uint64_t offset, uint64_t start,
+                                 uint64_t last, int prot)
+{
+    int i;
+    uint64_t size = last - start + 1;
+    void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);
+
+    if (mmap_addr == MAP_FAILED) {
+        close(fd);
+        return -EINVAL;
+    }
+
+    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
+        if (!dev->regions[i].mmap_addr) {
+            dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
+            dev->regions[i].mmap_offset = offset;
+            dev->regions[i].iova = start;
+            dev->regions[i].size = size;
+            dev->num_regions++;
+            break;
+        }
+    }
+    assert(i < MAX_IOVA_REGIONS);
+    close(fd);
+
+    return 0;
+}
+
+static int perm_to_prot(uint8_t perm)
+{
+    int prot = 0;
+
+    switch (perm) {
+    case VDUSE_ACCESS_WO:
+        prot |= PROT_WRITE;
+        break;
+    case VDUSE_ACCESS_RO:
+        prot |= PROT_READ;
+        break;
+    case VDUSE_ACCESS_RW:
+        prot |= PROT_READ | PROT_WRITE;
+        break;
+    default:
+        break;
+    }
+
+    return prot;
+}
+
+static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova)
+{
+    int i, ret;
+    struct vduse_iotlb_entry entry;
+
+    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
+        VduseIovaRegion *r = &dev->regions[i];
+
+        if (!r->mmap_addr) {
+            continue;
+        }
+
+        if ((iova >= r->iova) && (iova < (r->iova + r->size))) {
+            if ((iova + *plen) > (r->iova + r->size)) {
+                *plen = r->iova + r->size - iova;
+            }
+            return (void *)(uintptr_t)(iova - r->iova +
+                   r->mmap_addr + r->mmap_offset);
+        }
+    }
+
+    entry.start = iova;
+    entry.last = iova + 1;
+    ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry);
+    if (ret < 0) {
+        return NULL;
+    }
+
+    if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start,
+                               entry.last, perm_to_prot(entry.perm))) {
+        return iova_to_va(dev, plen, iova);
+    }
+
+    return NULL;
+}
+
+static inline uint16_t vring_avail_flags(VduseVirtq *vq)
+{
+    return le16toh(vq->vring.avail->flags);
+}
+
+ return g_test_run();
574
+}
156
+}
575
+
157
diff --git a/tests/Makefile.include b/tests/Makefile.include
576
+static inline uint16_t vring_avail_idx(VduseVirtq *vq)
577
+{
578
+ vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);
579
+
580
+ return vq->shadow_avail_idx;
581
+}
582
+
583
+static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i)
584
+{
585
+ return le16toh(vq->vring.avail->ring[i]);
586
+}
587
+
588
+static inline uint16_t vring_get_used_event(VduseVirtq *vq)
589
+{
590
+ return vring_avail_ring(vq, vq->vring.num);
591
+}
592
+
593
+static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx,
594
+ unsigned int *head)
595
+{
596
+ /*
597
+ * Grab the next descriptor number they're advertising, and increment
598
+ * the index we've seen.
599
+ */
600
+ *head = vring_avail_ring(vq, idx % vq->vring.num);
601
+
602
+ /* If their number is silly, that's a fatal mistake. */
603
+ if (*head >= vq->vring.num) {
604
+ fprintf(stderr, "Guest says index %u is available\n", *head);
605
+ return false;
606
+ }
607
+
608
+ return true;
609
+}
610
+
611
+static int
612
+vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc,
613
+ uint64_t addr, size_t len)
614
+{
615
+ struct vring_desc *ori_desc;
616
+ uint64_t read_len;
617
+
618
+ if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
619
+ return -1;
620
+ }
621
+
622
+ if (len == 0) {
623
+ return -1;
624
+ }
625
+
626
+ while (len) {
627
+ read_len = len;
628
+ ori_desc = iova_to_va(dev, &read_len, addr);
629
+ if (!ori_desc) {
630
+ return -1;
631
+ }
632
+
633
+ memcpy(desc, ori_desc, read_len);
634
+ len -= read_len;
635
+ addr += read_len;
636
+ desc += read_len;
637
+ }
638
+
639
+ return 0;
640
+}
641
+
642
+enum {
643
+ VIRTQUEUE_READ_DESC_ERROR = -1,
644
+ VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */
645
+ VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */
646
+};
647
+
648
+static int vduse_queue_read_next_desc(struct vring_desc *desc, int i,
649
+ unsigned int max, unsigned int *next)
650
+{
651
+ /* If this descriptor says it doesn't chain, we're done. */
652
+ if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
653
+ return VIRTQUEUE_READ_DESC_DONE;
654
+ }
655
+
656
+ /* Check they're not leading us off end of descriptors. */
657
+ *next = desc[i].next;
658
+ /* Make sure compiler knows to grab that: we don't want it changing! */
659
+ smp_wmb();
660
+
661
+ if (*next >= max) {
662
+ fprintf(stderr, "Desc next is %u\n", *next);
663
+ return VIRTQUEUE_READ_DESC_ERROR;
664
+ }
665
+
666
+ return VIRTQUEUE_READ_DESC_MORE;
667
+}
668
+
669
+/*
670
+ * Fetch avail_idx from VQ memory only when we really need to know if
671
+ * guest has added some buffers.
672
+ */
673
+static bool vduse_queue_empty(VduseVirtq *vq)
674
+{
675
+ if (unlikely(!vq->vring.avail)) {
676
+ return true;
677
+ }
678
+
679
+ if (vq->shadow_avail_idx != vq->last_avail_idx) {
680
+ return false;
681
+ }
682
+
683
+ return vring_avail_idx(vq) == vq->last_avail_idx;
684
+}
685
+
686
+static bool vduse_queue_should_notify(VduseVirtq *vq)
687
+{
688
+ VduseDev *dev = vq->dev;
689
+ uint16_t old, new;
690
+ bool v;
691
+
692
+ /* We need to expose used array entries before checking used event. */
693
+ smp_mb();
694
+
695
+ /* Always notify when queue is empty (when feature acknowledge) */
696
+ if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
697
+ !vq->inuse && vduse_queue_empty(vq)) {
698
+ return true;
699
+ }
700
+
701
+ if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
702
+ return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
703
+ }
704
+
705
+ v = vq->signalled_used_valid;
706
+ vq->signalled_used_valid = true;
707
+ old = vq->signalled_used;
708
+ new = vq->signalled_used = vq->used_idx;
709
+ return !v || vring_need_event(vring_get_used_event(vq), new, old);
710
+}
711
+
712
+void vduse_queue_notify(VduseVirtq *vq)
713
+{
714
+ VduseDev *dev = vq->dev;
715
+
716
+ if (unlikely(!vq->vring.avail)) {
717
+ return;
718
+ }
719
+
720
+ if (!vduse_queue_should_notify(vq)) {
721
+ return;
722
+ }
723
+
724
+ if (vduse_inject_irq(dev, vq->index) < 0) {
725
+ fprintf(stderr, "Error inject irq for vq %d: %s\n",
726
+ vq->index, strerror(errno));
727
+ }
728
+}
729
+
730
+static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val)
731
+{
732
+ *((uint16_t *)&vq->vring.used->ring[vq->vring.num]) = htole16(val);
733
+}
734
+
735
+static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg,
736
+ struct iovec *iov, unsigned int max_num_sg,
737
+ bool is_write, uint64_t pa, size_t sz)
738
+{
739
+ unsigned num_sg = *p_num_sg;
740
+ VduseDev *dev = vq->dev;
741
+
742
+ assert(num_sg <= max_num_sg);
743
+
744
+ if (!sz) {
745
+ fprintf(stderr, "virtio: zero sized buffers are not allowed\n");
746
+ return false;
747
+ }
748
+
749
+ while (sz) {
750
+ uint64_t len = sz;
751
+
752
+ if (num_sg == max_num_sg) {
753
+ fprintf(stderr,
754
+ "virtio: too many descriptors in indirect table\n");
755
+ return false;
756
+ }
757
+
758
+ iov[num_sg].iov_base = iova_to_va(dev, &len, pa);
759
+ if (iov[num_sg].iov_base == NULL) {
760
+ fprintf(stderr, "virtio: invalid address for buffers\n");
761
+ return false;
762
+ }
763
+ iov[num_sg++].iov_len = len;
764
+ sz -= len;
765
+ pa += len;
766
+ }
767
+
768
+ *p_num_sg = num_sg;
769
+ return true;
770
+}
771
+
772
+static void *vduse_queue_alloc_element(size_t sz, unsigned out_num,
773
+ unsigned in_num)
774
+{
775
+ VduseVirtqElement *elem;
776
+ size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
777
+ size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
778
+ size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
779
+
780
+ assert(sz >= sizeof(VduseVirtqElement));
781
+ elem = malloc(out_sg_end);
782
+ if (!elem) {
783
+ return NULL;
784
+ }
785
+ elem->out_num = out_num;
786
+ elem->in_num = in_num;
787
+ elem->in_sg = (void *)elem + in_sg_ofs;
788
+ elem->out_sg = (void *)elem + out_sg_ofs;
789
+ return elem;
790
+}
791
+
792
+static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz)
793
+{
794
+ struct vring_desc *desc = vq->vring.desc;
795
+ VduseDev *dev = vq->dev;
796
+ uint64_t desc_addr, read_len;
797
+ unsigned int desc_len;
798
+ unsigned int max = vq->vring.num;
799
+ unsigned int i = idx;
800
+ VduseVirtqElement *elem;
801
+ struct iovec iov[VIRTQUEUE_MAX_SIZE];
802
+ struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
803
+ unsigned int out_num = 0, in_num = 0;
804
+ int rc;
805
+
806
+ if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
807
+ if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
808
+ fprintf(stderr, "Invalid size for indirect buffer table\n");
809
+ return NULL;
810
+ }
811
+
812
+ /* loop over the indirect descriptor table */
813
+ desc_addr = le64toh(desc[i].addr);
814
+ desc_len = le32toh(desc[i].len);
815
+ max = desc_len / sizeof(struct vring_desc);
816
+ read_len = desc_len;
817
+ desc = iova_to_va(dev, &read_len, desc_addr);
818
+ if (unlikely(desc && read_len != desc_len)) {
819
+ /* Failed to use zero copy */
820
+ desc = NULL;
821
+ if (!vduse_queue_read_indirect_desc(dev, desc_buf,
822
+ desc_addr,
823
+ desc_len)) {
824
+ desc = desc_buf;
825
+ }
826
+ }
827
+ if (!desc) {
828
+ fprintf(stderr, "Invalid indirect buffer table\n");
829
+ return NULL;
830
+ }
831
+ i = 0;
832
+ }
833
+
834
+ /* Collect all the descriptors */
835
+ do {
836
+ if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
837
+ if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num,
838
+ VIRTQUEUE_MAX_SIZE - out_num,
839
+ true, le64toh(desc[i].addr),
840
+ le32toh(desc[i].len))) {
841
+ return NULL;
842
+ }
843
+ } else {
844
+ if (in_num) {
845
+ fprintf(stderr, "Incorrect order for descriptors\n");
846
+ return NULL;
847
+ }
848
+ if (!vduse_queue_map_single_desc(vq, &out_num, iov,
849
+ VIRTQUEUE_MAX_SIZE, false,
850
+ le64toh(desc[i].addr),
851
+ le32toh(desc[i].len))) {
852
+ return NULL;
853
+ }
854
+ }
855
+
856
+ /* If we've got too many, that implies a descriptor loop. */
857
+ if ((in_num + out_num) > max) {
858
+ fprintf(stderr, "Looped descriptor\n");
859
+ return NULL;
860
+ }
861
+ rc = vduse_queue_read_next_desc(desc, i, max, &i);
862
+ } while (rc == VIRTQUEUE_READ_DESC_MORE);
863
+
864
+ if (rc == VIRTQUEUE_READ_DESC_ERROR) {
865
+ fprintf(stderr, "read descriptor error\n");
866
+ return NULL;
867
+ }
868
+
869
+ /* Now copy what we have collected and mapped */
870
+ elem = vduse_queue_alloc_element(sz, out_num, in_num);
871
+ if (!elem) {
872
+ fprintf(stderr, "read descriptor error\n");
873
+ return NULL;
874
+ }
875
+ elem->index = idx;
876
+ for (i = 0; i < out_num; i++) {
877
+ elem->out_sg[i] = iov[i];
878
+ }
879
+ for (i = 0; i < in_num; i++) {
880
+ elem->in_sg[i] = iov[out_num + i];
881
+ }
882
+
883
+ return elem;
884
+}
885
+
886
+void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
887
+{
888
+ unsigned int head;
889
+ VduseVirtqElement *elem;
890
+ VduseDev *dev = vq->dev;
891
+
892
+ if (unlikely(!vq->vring.avail)) {
893
+ return NULL;
894
+ }
895
+
896
+ if (vduse_queue_empty(vq)) {
897
+ return NULL;
898
+ }
899
+ /* Needed after virtio_queue_empty() */
900
+ smp_rmb();
901
+
902
+ if (vq->inuse >= vq->vring.num) {
903
+ fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse);
904
+ return NULL;
905
+ }
906
+
907
+ if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) {
908
+ return NULL;
909
+ }
910
+
911
+ if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
912
+ vring_set_avail_event(vq, vq->last_avail_idx);
913
+ }
914
+
915
+ elem = vduse_queue_map_desc(vq, head, sz);
916
+
917
+ if (!elem) {
918
+ return NULL;
919
+ }
920
+
921
+ vq->inuse++;
922
+
923
+ return elem;
924
+}
925
+
926
+static inline void vring_used_write(VduseVirtq *vq,
927
+ struct vring_used_elem *uelem, int i)
928
+{
929
+ struct vring_used *used = vq->vring.used;
930
+
931
+ used->ring[i] = *uelem;
932
+}
933
+
934
+static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem,
935
+ unsigned int len, unsigned int idx)
936
+{
937
+ struct vring_used_elem uelem;
938
+
939
+ if (unlikely(!vq->vring.used)) {
940
+ return;
941
+ }
942
+
943
+ idx = (idx + vq->used_idx) % vq->vring.num;
944
+
945
+ uelem.id = htole32(elem->index);
946
+ uelem.len = htole32(len);
947
+ vring_used_write(vq, &uelem, idx);
948
+}
949
+
950
+static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val)
951
+{
952
+ vq->vring.used->idx = htole16(val);
953
+ vq->used_idx = val;
954
+}
955
+
956
+static void vduse_queue_flush(VduseVirtq *vq, unsigned int count)
957
+{
958
+ uint16_t old, new;
959
+
960
+ if (unlikely(!vq->vring.used)) {
961
+ return;
962
+ }
963
+
964
+ /* Make sure buffer is written before we update index. */
965
+ smp_wmb();
966
+
967
+ old = vq->used_idx;
968
+ new = old + count;
969
+ vring_used_idx_set(vq, new);
970
+ vq->inuse -= count;
971
+ if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
972
+ vq->signalled_used_valid = false;
973
+ }
974
+}
975
+
976
+void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
977
+ unsigned int len)
978
+{
979
+ vduse_queue_fill(vq, elem, len, 0);
980
+ vduse_queue_flush(vq, 1);
981
+}
982
+
983
+static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
984
+ uint64_t avail_addr, uint64_t used_addr)
985
+{
986
+ struct VduseDev *dev = vq->dev;
987
+ uint64_t len;
988
+
989
+ len = sizeof(struct vring_desc);
990
+ vq->vring.desc = iova_to_va(dev, &len, desc_addr);
991
+ if (len != sizeof(struct vring_desc)) {
992
+ return -EINVAL;
993
+ }
994
+
995
+ len = sizeof(struct vring_avail);
996
+ vq->vring.avail = iova_to_va(dev, &len, avail_addr);
997
+ if (len != sizeof(struct vring_avail)) {
998
+ return -EINVAL;
999
+ }
1000
+
1001
+ len = sizeof(struct vring_used);
1002
+ vq->vring.used = iova_to_va(dev, &len, used_addr);
1003
+ if (len != sizeof(struct vring_used)) {
1004
+ return -EINVAL;
1005
+ }
1006
+
1007
+ if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) {
1008
+ fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index);
1009
+ return -EINVAL;
1010
+ }
1011
+
1012
+ return 0;
1013
+}
1014
+
1015
+static void vduse_queue_enable(VduseVirtq *vq)
1016
+{
1017
+ struct VduseDev *dev = vq->dev;
1018
+ struct vduse_vq_info vq_info;
1019
+ struct vduse_vq_eventfd vq_eventfd;
1020
+ int fd;
1021
+
1022
+ vq_info.index = vq->index;
1023
+ if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) {
1024
+ fprintf(stderr, "Failed to get vq[%d] info: %s\n",
1025
+ vq->index, strerror(errno));
1026
+ return;
1027
+ }
1028
+
1029
+ if (!vq_info.ready) {
1030
+ return;
1031
+ }
1032
+
1033
+ vq->vring.num = vq_info.num;
1034
+ vq->vring.desc_addr = vq_info.desc_addr;
1035
+ vq->vring.avail_addr = vq_info.driver_addr;
1036
+ vq->vring.used_addr = vq_info.device_addr;
1037
+
1038
+ if (vduse_queue_update_vring(vq, vq_info.desc_addr,
1039
+ vq_info.driver_addr, vq_info.device_addr)) {
1040
+ fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index);
1041
+ return;
1042
+ }
1043
+
1044
+ fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1045
+ if (fd < 0) {
1046
+ fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index);
1047
+ return;
1048
+ }
1049
+
1050
+ vq_eventfd.index = vq->index;
1051
+ vq_eventfd.fd = fd;
1052
+ if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) {
1053
+ fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index);
1054
+ close(fd);
1055
+ return;
1056
+ }
1057
+
1058
+ vq->fd = fd;
1059
+ vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index;
1060
+ vq->inuse = 0;
1061
+ vq->used_idx = 0;
1062
+ vq->signalled_used_valid = false;
1063
+ vq->ready = true;
1064
+
1065
+ dev->ops->enable_queue(dev, vq);
1066
+}
1067
+
1068
+static void vduse_queue_disable(VduseVirtq *vq)
1069
+{
1070
+ struct VduseDev *dev = vq->dev;
1071
+ struct vduse_vq_eventfd eventfd;
1072
+
1073
+ if (!vq->ready) {
1074
+ return;
1075
+ }
1076
+
1077
+ dev->ops->disable_queue(dev, vq);
1078
+
1079
+ eventfd.index = vq->index;
1080
+ eventfd.fd = VDUSE_EVENTFD_DEASSIGN;
1081
+ ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd);
1082
+ close(vq->fd);
1083
+
1084
+ assert(vq->inuse == 0);
1085
+
1086
+ vq->vring.num = 0;
1087
+ vq->vring.desc_addr = 0;
1088
+ vq->vring.avail_addr = 0;
1089
+ vq->vring.used_addr = 0;
1090
+ vq->vring.desc = 0;
1091
+ vq->vring.avail = 0;
1092
+ vq->vring.used = 0;
1093
+ vq->ready = false;
1094
+ vq->fd = -1;
1095
+}
1096
+
1097
+static void vduse_dev_start_dataplane(VduseDev *dev)
1098
+{
1099
+ int i;
1100
+
1101
+ if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
1102
+ fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
1103
+ return;
1104
+ }
1105
+ assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1));
1106
+
1107
+ for (i = 0; i < dev->num_queues; i++) {
1108
+ vduse_queue_enable(&dev->vqs[i]);
1109
+ }
1110
+}
1111
+
1112
+static void vduse_dev_stop_dataplane(VduseDev *dev)
1113
+{
1114
+ int i;
1115
+
1116
+ for (i = 0; i < dev->num_queues; i++) {
1117
+ vduse_queue_disable(&dev->vqs[i]);
1118
+ }
+    dev->features = 0;
+    vduse_iova_remove_region(dev, 0, ULONG_MAX);
+}
+
+int vduse_dev_handler(VduseDev *dev)
+{
+    struct vduse_dev_request req;
+    struct vduse_dev_response resp = { 0 };
+    VduseVirtq *vq;
+    int i, ret;
+
+    ret = read(dev->fd, &req, sizeof(req));
+    if (ret != sizeof(req)) {
+        fprintf(stderr, "Read request error [%d]: %s\n",
+                ret, strerror(errno));
+        return -errno;
+    }
+    resp.request_id = req.request_id;
+
+    switch (req.type) {
+    case VDUSE_GET_VQ_STATE:
+        vq = &dev->vqs[req.vq_state.index];
+        resp.vq_state.split.avail_index = vq->last_avail_idx;
+        resp.result = VDUSE_REQ_RESULT_OK;
+        break;
+    case VDUSE_SET_STATUS:
+        if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) {
+            vduse_dev_start_dataplane(dev);
+        } else if (req.s.status == 0) {
+            vduse_dev_stop_dataplane(dev);
+        }
+        resp.result = VDUSE_REQ_RESULT_OK;
+        break;
+    case VDUSE_UPDATE_IOTLB:
+        /* The iova will be updated by iova_to_va() later, so just remove it */
+        vduse_iova_remove_region(dev, req.iova.start, req.iova.last);
+        for (i = 0; i < dev->num_queues; i++) {
+            VduseVirtq *vq = &dev->vqs[i];
+            if (vq->ready) {
+                if (vduse_queue_update_vring(vq, vq->vring.desc_addr,
+                                             vq->vring.avail_addr,
+                                             vq->vring.used_addr)) {
+                    fprintf(stderr, "Failed to update vring for vq[%d]\n",
+                            vq->index);
+                }
+            }
+        }
+        resp.result = VDUSE_REQ_RESULT_OK;
+        break;
+    default:
+        resp.result = VDUSE_REQ_RESULT_FAILED;
+        break;
+    }
+
+    ret = write(dev->fd, &resp, sizeof(resp));
+    if (ret != sizeof(resp)) {
+        fprintf(stderr, "Write request %d error [%d]: %s\n",
+                req.type, ret, strerror(errno));
+        return -errno;
+    }
+    return 0;
+}
+
+int vduse_dev_update_config(VduseDev *dev, uint32_t size,
+                            uint32_t offset, char *buffer)
+{
+    int ret;
+    struct vduse_config_data *data;
+
+    data = malloc(offsetof(struct vduse_config_data, buffer) + size);
+    if (!data) {
+        return -ENOMEM;
+    }
+
+    data->offset = offset;
+    data->length = size;
+    memcpy(data->buffer, buffer, size);
+
+    ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data);
+    free(data);
+
+    if (ret) {
+        return -errno;
+    }
+
+    if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) {
+        return -errno;
+    }
+
+    return 0;
+}
+
+int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
+{
+    VduseVirtq *vq = &dev->vqs[index];
+    struct vduse_vq_config vq_config = { 0 };
+
+    if (max_size > VIRTQUEUE_MAX_SIZE) {
+        return -EINVAL;
+    }
+
+    vq_config.index = vq->index;
+    vq_config.max_size = max_size;
+
+    if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) {
+        return -errno;
+    }
+
+    return 0;
+}
+
+static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues)
+{
+    VduseVirtq *vqs;
+    int i;
+
+    vqs = calloc(sizeof(VduseVirtq), num_queues);
+    if (!vqs) {
+        return -ENOMEM;
+    }
+
+    for (i = 0; i < num_queues; i++) {
+        vqs[i].index = i;
+        vqs[i].dev = dev;
+        vqs[i].fd = -1;
+    }
+    dev->vqs = vqs;
+
+    return 0;
+}
+
+static int vduse_dev_init(VduseDev *dev, const char *name,
+                          uint16_t num_queues, const VduseOps *ops,
+                          void *priv)
+{
+    char *dev_path, *dev_name;
+    int ret, fd;
+
+    dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1);
+    if (!dev_path) {
+        return -ENOMEM;
+    }
+    sprintf(dev_path, "/dev/vduse/%s", name);
+
+    fd = open(dev_path, O_RDWR);
+    free(dev_path);
+    if (fd < 0) {
+        fprintf(stderr, "Failed to open vduse dev %s: %s\n",
+                name, strerror(errno));
+        return -errno;
+    }
+
+    dev_name = strdup(name);
+    if (!dev_name) {
+        close(fd);
+        return -ENOMEM;
+    }
+
+    ret = vduse_dev_init_vqs(dev, num_queues);
+    if (ret) {
+        free(dev_name);
+        close(fd);
+        return ret;
+    }
+
+    dev->name = dev_name;
+    dev->num_queues = num_queues;
+    dev->fd = fd;
+    dev->ops = ops;
+    dev->priv = priv;
+
+    return 0;
+}
+
+static inline bool vduse_name_is_invalid(const char *name)
+{
+    return strlen(name) >= VDUSE_NAME_MAX || strstr(name, "..");
+}
+
+VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
+                                 const VduseOps *ops, void *priv)
+{
+    VduseDev *dev;
+    int ret;
+
+    if (!ops || !ops->enable_queue || !ops->disable_queue) {
+        fprintf(stderr, "Invalid parameter for vduse\n");
+        return NULL;
+    }
+
+    dev = calloc(sizeof(VduseDev), 1);
+    if (!dev) {
+        fprintf(stderr, "Failed to allocate vduse device\n");
+        return NULL;
+    }
+
+    ret = vduse_dev_init_vqs(dev, num_queues);
+    if (ret) {
+        fprintf(stderr, "Failed to init vqs\n");
+        free(dev);
+        return NULL;
+    }
+
+    dev->num_queues = num_queues;
+    dev->fd = fd;
+    dev->ops = ops;
+    dev->priv = priv;
+
+    return dev;
+}
+
+VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
+                                   const VduseOps *ops, void *priv)
+{
+    VduseDev *dev;
+    int ret;
+
+    if (!name || vduse_name_is_invalid(name) || !ops ||
+        !ops->enable_queue || !ops->disable_queue) {
+        fprintf(stderr, "Invalid parameter for vduse\n");
+        return NULL;
+    }
+
+    dev = calloc(sizeof(VduseDev), 1);
+    if (!dev) {
+        fprintf(stderr, "Failed to allocate vduse device\n");
+        return NULL;
+    }
+
+    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
+    if (ret < 0) {
+        fprintf(stderr, "Failed to init vduse device %s: %s\n",
+                name, strerror(-ret));
+        free(dev);
+        return NULL;
+    }
+
+    return dev;
+}
+
+VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
+                           uint32_t vendor_id, uint64_t features,
+                           uint16_t num_queues, uint32_t config_size,
+                           char *config, const VduseOps *ops, void *priv)
+{
+    VduseDev *dev;
+    int ret, ctrl_fd;
+    uint64_t version;
+    struct vduse_dev_config *dev_config;
+    size_t size = offsetof(struct vduse_dev_config, config);
+
+    if (!name || vduse_name_is_invalid(name) ||
+        !has_feature(features, VIRTIO_F_VERSION_1) || !config ||
+        !config_size || !ops || !ops->enable_queue || !ops->disable_queue) {
+        fprintf(stderr, "Invalid parameter for vduse\n");
+        return NULL;
+    }
+
+    dev = calloc(sizeof(VduseDev), 1);
+    if (!dev) {
+        fprintf(stderr, "Failed to allocate vduse device\n");
+        return NULL;
+    }
+
+    ctrl_fd = open("/dev/vduse/control", O_RDWR);
+    if (ctrl_fd < 0) {
+        fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
+                strerror(errno));
+        goto err_ctrl;
+    }
+
+    version = VDUSE_API_VERSION;
+    if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) {
+        fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n",
+                version, strerror(errno));
+        goto err_dev;
+    }
+
+    dev_config = calloc(size + config_size, 1);
+    if (!dev_config) {
+        fprintf(stderr, "Failed to allocate config space\n");
+        goto err_dev;
+    }
+
+    strcpy(dev_config->name, name);
+    dev_config->device_id = device_id;
+    dev_config->vendor_id = vendor_id;
+    dev_config->features = features;
+    dev_config->vq_num = num_queues;
+    dev_config->vq_align = VDUSE_VQ_ALIGN;
+    dev_config->config_size = config_size;
+    memcpy(dev_config->config, config, config_size);
+
+    ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
+    free(dev_config);
+    if (ret < 0) {
+        fprintf(stderr, "Failed to create vduse device %s: %s\n",
+                name, strerror(errno));
+        goto err_dev;
+    }
+    dev->ctrl_fd = ctrl_fd;
+
+    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
+    if (ret < 0) {
+        fprintf(stderr, "Failed to init vduse device %s: %s\n",
+                name, strerror(-ret));
+        goto err;
+    }
+
+    return dev;
+err:
+    ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name);
+err_dev:
+    close(ctrl_fd);
+err_ctrl:
+    free(dev);
+
+    return NULL;
+}
+
+int vduse_dev_destroy(VduseDev *dev)
+{
+    int ret = 0;
+
+    free(dev->vqs);
+    if (dev->fd >= 0) {
+        close(dev->fd);
+        dev->fd = -1;
+    }
+    if (dev->ctrl_fd >= 0) {
+        if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) {
+            ret = -errno;
+        }
+        close(dev->ctrl_fd);
+        dev->ctrl_fd = -1;
+    }
+    free(dev->name);
+    free(dev);
+
+    return ret;
+}
diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org
 S: Supported
 F: block/export/fuse.c
 
+VDUSE library
+M: Xie Yongji <xieyongji@bytedance.com>
+S: Maintained
+F: subprojects/libvduse/
+
 Replication
 M: Wen Congyang <wencongyang2@huawei.com>
 M: Xie Changlong <xiechanglong.d@gmail.com>
diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ if get_option('fuse_lseek').allowed()
   endif
 endif
 
+have_libvduse = (targetos == 'linux')
+if get_option('libvduse').enabled()
+    if targetos != 'linux'
+        error('libvduse requires linux')
+    endif
+elif get_option('libvduse').disabled()
+    have_libvduse = false
+endif
+
 # libbpf
 libbpf = dependency('libbpf', required: get_option('bpf'), method: 'pkg-config')
 if libbpf.found() and not cc.links('''
@@ -XXX,XX +XXX,XX @@ if targetos == 'linux' and have_vhost_user
   vhost_user = libvhost_user.get_variable('vhost_user_dep')
 endif
 
+libvduse = not_found
+if have_libvduse
+  libvduse_proj = subproject('libvduse')
+  libvduse = libvduse_proj.get_variable('libvduse_dep')
+endif
+
 # NOTE: the trace/ subdirectory needs the qapi_trace_events variable
 # that is filled in by qapi/.
 subdir('qapi')
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index XXXXXXX..XXXXXXX 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -XXX,XX +XXX,XX @@ meson_options_help() {
   printf "%s\n" '  libssh          ssh block device support'
   printf "%s\n" '  libudev         Use libudev to enumerate host devices'
   printf "%s\n" '  libusb          libusb support for USB passthrough'
+  printf "%s\n" '  libvduse        build VDUSE Library'
   printf "%s\n" '  linux-aio       Linux AIO support'
   printf "%s\n" '  linux-io-uring  Linux io_uring support'
   printf "%s\n" '  live-block-migration'
@@ -XXX,XX +XXX,XX @@ _meson_option_parse() {
     --disable-libudev) printf "%s" -Dlibudev=disabled ;;
     --enable-libusb) printf "%s" -Dlibusb=enabled ;;
     --disable-libusb) printf "%s" -Dlibusb=disabled ;;
+    --enable-libvduse) printf "%s" -Dlibvduse=enabled ;;
+    --disable-libvduse) printf "%s" -Dlibvduse=disabled ;;
     --enable-linux-aio) printf "%s" -Dlinux_aio=enabled ;;
     --disable-linux-aio) printf "%s" -Dlinux_aio=disabled ;;
     --enable-linux-io-uring) printf "%s" -Dlinux_io_uring=enabled ;;
diff --git a/subprojects/libvduse/linux-headers/linux b/subprojects/libvduse/linux-headers/linux
new file mode 120000
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/subprojects/libvduse/linux-headers/linux
@@ -0,0 +1 @@
+../../../linux-headers/linux/
\ No newline at end of file
diff --git a/subprojects/libvduse/meson.build b/subprojects/libvduse/meson.build
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/subprojects/libvduse/meson.build
@@ -XXX,XX +XXX,XX @@
+project('libvduse', 'c',
+        license: 'GPL-2.0-or-later',
+        default_options: ['c_std=gnu99'])
+
+libvduse = static_library('vduse',
+                          files('libvduse.c'),
+                          c_args: '-D_GNU_SOURCE')
+
+libvduse_dep = declare_dependency(link_with: libvduse,
+                                  include_directories: include_directories('.'))
diff --git a/subprojects/libvduse/standard-headers/linux b/subprojects/libvduse/standard-headers/linux
new file mode 120000
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/subprojects/libvduse/standard-headers/linux
@@ -0,0 +1 @@
+../../../include/standard-headers/linux/
\ No newline at end of file
--
2.35.3

index XXXXXXX..XXXXXXX 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ gcov-files-test-thread-pool-y = thread-pool.c
 gcov-files-test-hbitmap-y = util/hbitmap.c
 check-unit-y += tests/test-hbitmap$(EXESUF)
 gcov-files-test-hbitmap-y = blockjob.c
+check-unit-y += tests/test-bdrv-drain$(EXESUF)
 check-unit-y += tests/test-blockjob$(EXESUF)
 check-unit-y += tests/test-blockjob-txn$(EXESUF)
 check-unit-y += tests/test-x86-cpuid$(EXESUF)
@@ -XXX,XX +XXX,XX @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
 tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
 tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
+tests/test-bdrv-drain$(EXESUF): tests/test-bdrv-drain.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
--
2.13.6

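The libvduse API above is small enough to sketch an end-to-end caller. The
following fragment is an illustration only, not part of the patch: the
VduseOps callback signature follows the declaration in libvduse.h, the
VIRTIO_F_VERSION_1 constant comes from the standard virtio headers, and the
device id, vendor id and zeroed config blob are placeholders.

    #include "libvduse.h"
    #include "standard-headers/linux/virtio_config.h"

    static void enable_queue(VduseDev *dev, VduseVirtq *vq)
    {
        (void)dev;
        (void)vq;   /* real code would start polling the queue's fd here */
    }

    static void disable_queue(VduseDev *dev, VduseVirtq *vq)
    {
        (void)dev;
        (void)vq;   /* ...and stop polling it here */
    }

    int main(void)
    {
        static const VduseOps ops = {
            .enable_queue = enable_queue,
            .disable_queue = disable_queue,
        };
        char config[64] = { 0 };    /* placeholder virtio config space */
        VduseDev *dev;

        /* Creates /dev/vduse/demo through /dev/vduse/control */
        dev = vduse_dev_create("demo", 1 /* device id */, 0 /* vendor id */,
                               1ULL << VIRTIO_F_VERSION_1,
                               1 /* num_queues */, sizeof(config), config,
                               &ops, NULL);
        if (!dev) {
            return 1;
        }
        if (vduse_dev_setup_queue(dev, 0, 256) < 0) {
            vduse_dev_destroy(dev);
            return 1;
        }

        /* Each vduse_dev_handler() call reads one kernel request and replies */
        while (vduse_dev_handler(dev) == 0) {
        }

        vduse_dev_destroy(dev);
        return 0;
    }
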
Now that the bdrv_drain_invoke() calls are pulled up to the callers of
bdrv_drain_recurse(), the 'begin' parameter isn't needed any more.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
     }
 }
 
-static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
+static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
     BdrvChild *child, *tmp;
     bool waited;
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
              */
             bdrv_ref(bs);
         }
-        waited |= bdrv_drain_recurse(bs, begin);
+        waited |= bdrv_drain_recurse(bs);
         if (in_main_loop) {
             bdrv_unref(bs);
         }
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
     }
 
     bdrv_drain_invoke(bs, true);
-    bdrv_drain_recurse(bs, true);
+    bdrv_drain_recurse(bs);
 }
 
 void bdrv_drained_end(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
 
     bdrv_parent_drained_end(bs);
     bdrv_drain_invoke(bs, false);
-    bdrv_drain_recurse(bs, false);
+    bdrv_drain_recurse(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
         aio_context_acquire(aio_context);
         for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
             if (aio_context == bdrv_get_aio_context(bs)) {
-                waited |= bdrv_drain_recurse(bs, true);
+                waited |= bdrv_drain_recurse(bs);
             }
         }
         aio_context_release(aio_context);
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
-        bdrv_drain_recurse(bs, false);
+        bdrv_drain_recurse(bs);
         aio_context_release(aio_context);
     }
 
--
2.13.6

The device is drained, so there is no point in waiting for requests at
the end of the drained section. Remove the bdrv_drain_recurse() calls
there.

The bdrv_drain_recurse() calls were introduced in commit 481cad48e5e
in order to call the .bdrv_co_drain_end() driver callback. This is now
done by a separate bdrv_drain_invoke() call.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
 
     bdrv_parent_drained_end(bs);
     bdrv_drain_invoke(bs, false);
-    bdrv_drain_recurse(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
-        bdrv_drain_recurse(bs);
         aio_context_release(aio_context);
     }
 
--
2.13.6

Drain requests are propagated to child nodes, parent nodes and directly
to the AioContext. The order in which this happened was different
between all combinations of drain/drain_all and begin/end.

The correct order is to keep children only drained when their parents
are also drained. This means that at the start of a drained section, the
AioContext needs to be drained first, the parents second and only then
the children. The correct order for the end of a drained section is the
opposite.

This patch changes the three other functions to follow the example of
bdrv_drained_begin(), which is the only one that got it right.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/io.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
         return;
     }
 
+    /* Stop things in parent-to-child order */
     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
         aio_disable_external(bdrv_get_aio_context(bs));
         bdrv_parent_drained_begin(bs);
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
         return;
     }
 
-    bdrv_parent_drained_end(bs);
+    /* Re-enable things in child-to-parent order */
     bdrv_drain_invoke(bs, false);
+    bdrv_parent_drained_end(bs);
     aio_enable_external(bdrv_get_aio_context(bs));
 }
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
+        /* Stop things in parent-to-child order */
         aio_context_acquire(aio_context);
-        bdrv_parent_drained_begin(bs);
         aio_disable_external(aio_context);
+        bdrv_parent_drained_begin(bs);
         bdrv_drain_invoke(bs, true);
         aio_context_release(aio_context);
 
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
+        /* Re-enable things in child-to-parent order */
         aio_context_acquire(aio_context);
-        aio_enable_external(aio_context);
-        bdrv_parent_drained_end(bs);
         bdrv_drain_invoke(bs, false);
+        bdrv_parent_drained_end(bs);
+        aio_enable_external(aio_context);
         aio_context_release(aio_context);
     }
 
--
2.13.6

Commit 15afd94a047 added code to acquire and release the AioContext in
qemuio_command(). This means that the lock is taken twice now in the
call path from hmp_qemu_io(). This causes BDRV_POLL_WHILE() to hang for
any requests issued to nodes in a non-mainloop AioContext.

Dropping the first locking from hmp_qemu_io() fixes the problem.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hmp.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/hmp.c b/hmp.c
index XXXXXXX..XXXXXXX 100644
--- a/hmp.c
+++ b/hmp.c
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
 {
     BlockBackend *blk;
     BlockBackend *local_blk = NULL;
-    AioContext *aio_context;
     const char* device = qdict_get_str(qdict, "device");
     const char* command = qdict_get_str(qdict, "command");
     Error *err = NULL;
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
         }
     }
 
-    aio_context = blk_get_aio_context(blk);
-    aio_context_acquire(aio_context);
-
     /*
      * Notably absent: Proper permission management. This is sad, but it seems
      * almost impossible to achieve without changing the semantics and thereby
@@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
      */
     qemuio_command(blk, command);
 
-    aio_context_release(aio_context);
-
 fail:
     blk_unref(local_blk);
     hmp_handle_error(mon, &err);
--
2.13.6

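The invariant behind the drain patches above (a child is only drained while
its parents are drained, so quiescing runs parent-to-child and resuming runs
child-to-parent) can be shown with a toy model. This sketch is illustrative
only; the types and helpers are made up and are not QEMU APIs:

    #include <stdio.h>

    /* Toy model: a node with at most one parent. */
    typedef struct Node {
        struct Node *parent;
        const char *name;
    } Node;

    static void quiesce(Node *n) { printf("quiesce %s\n", n->name); }
    static void resume(Node *n)  { printf("resume %s\n", n->name); }

    /* Entry: stop things in parent-to-child order. */
    static void drained_begin(Node *n)
    {
        if (n->parent) {
            quiesce(n->parent);
        }
        quiesce(n);
    }

    /* Exit: re-enable in the exact reverse, child-to-parent order. */
    static void drained_end(Node *n)
    {
        resume(n);
        if (n->parent) {
            resume(n->parent);
        }
    }

    int main(void)
    {
        Node parent = { NULL, "parent" }, child = { &parent, "child" };
        drained_begin(&child);   /* parent first, then child */
        drained_end(&child);     /* child first, then parent */
        return 0;
    }
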
From: Vladimir Sementsov-Ogievskiy <vsementsov@openvz.org>

We don't need an extra bitmap. All we need is to back up the original
bitmap when we do the first merge. So, drop the extra temporary bitmap
and work directly with the target and the backup.

Still, to keep the old semantics (on failure the target is unchanged
and the user doesn't need to restore anything), we need a local_backup
variable and do the restore ourselves on the failure path.

Signed-off-by: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
Message-Id: <20220517111206.23585-3-v.sementsov-og@mail.ru>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/monitor/bitmap-qmp-cmds.c | 41 +++++++++++++++++----------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/block/monitor/bitmap-qmp-cmds.c b/block/monitor/bitmap-qmp-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/block/monitor/bitmap-qmp-cmds.c
+++ b/block/monitor/bitmap-qmp-cmds.c
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
                                           HBitmap **backup, Error **errp)
 {
     BlockDriverState *bs;
-    BdrvDirtyBitmap *dst, *src, *anon;
+    BdrvDirtyBitmap *dst, *src;
     BlockDirtyBitmapOrStrList *lst;
+    HBitmap *local_backup = NULL;
 
     GLOBAL_STATE_CODE();
 
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
         return NULL;
     }
 
-    anon = bdrv_create_dirty_bitmap(bs, bdrv_dirty_bitmap_granularity(dst),
-                                    NULL, errp);
-    if (!anon) {
-        return NULL;
-    }
-
     for (lst = bms; lst; lst = lst->next) {
         switch (lst->value->type) {
             const char *name, *node;
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
             src = bdrv_find_dirty_bitmap(bs, name);
             if (!src) {
                 error_setg(errp, "Dirty bitmap '%s' not found", name);
-                dst = NULL;
-                goto out;
+                goto fail;
             }
             break;
         case QTYPE_QDICT:
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
             name = lst->value->u.external.name;
             src = block_dirty_bitmap_lookup(node, name, NULL, errp);
             if (!src) {
-                dst = NULL;
-                goto out;
+                goto fail;
             }
             break;
         default:
             abort();
         }
 
-        if (!bdrv_merge_dirty_bitmap(anon, src, NULL, errp)) {
-            dst = NULL;
-            goto out;
+        /* We do backup only for first merge operation */
+        if (!bdrv_merge_dirty_bitmap(dst, src,
+                                     local_backup ? NULL : &local_backup,
+                                     errp))
+        {
+            goto fail;
         }
     }
 
-    /* Merge into dst; dst is unchanged on failure. */
-    if (!bdrv_merge_dirty_bitmap(dst, anon, backup, errp)) {
-        dst = NULL;
-        goto out;
+    if (backup) {
+        *backup = local_backup;
+    } else {
+        hbitmap_free(local_backup);
     }
 
- out:
-    bdrv_release_dirty_bitmap(anon);
     return dst;
+
+fail:
+    if (local_backup) {
+        bdrv_restore_dirty_bitmap(dst, local_backup);
+    }
+
+    return NULL;
 }
 
 void qmp_block_dirty_bitmap_merge(const char *node, const char *target,
--
2.35.3

From: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>

Since bdrv_co_preadv() does all necessary checks, including
reads after the end of the backing file, avoid duplicating
the verification before the bdrv_co_preadv() call.

Signed-off-by: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.h |  3 ---
 block/qcow2.c | 51 ++++++++-------------------------------------------
 2 files changed, 8 insertions(+), 46 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -XXX,XX +XXX,XX @@ uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset)
 }
 
 /* qcow2.c functions */
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-                        int64_t sector_num, int nb_sectors);
-
 int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
                                      int refcount_order, bool generous_increase,
                                      uint64_t *refblock_count);
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
     return status;
 }
 
-/* handle reading after the end of the backing file */
-int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-                        int64_t offset, int bytes)
-{
-    uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE;
-    int n1;
-
-    if ((offset + bytes) <= bs_size) {
-        return bytes;
-    }
-
-    if (offset >= bs_size) {
-        n1 = 0;
-    } else {
-        n1 = bs_size - offset;
-    }
-
-    qemu_iovec_memset(qiov, n1, 0, bytes - n1);
-
-    return n1;
-}
-
 static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                                         uint64_t bytes, QEMUIOVector *qiov,
                                         int flags)
 {
     BDRVQcow2State *s = bs->opaque;
-    int offset_in_cluster, n1;
+    int offset_in_cluster;
     int ret;
     unsigned int cur_bytes; /* number of bytes in current iteration */
     uint64_t cluster_offset = 0;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
         case QCOW2_CLUSTER_UNALLOCATED:
 
             if (bs->backing) {
-                /* read from the base image */
-                n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov,
-                                         offset, cur_bytes);
-                if (n1 > 0) {
-                    QEMUIOVector local_qiov;
-
-                    qemu_iovec_init(&local_qiov, hd_qiov.niov);
-                    qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1);
-
-                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
-                    qemu_co_mutex_unlock(&s->lock);
-                    ret = bdrv_co_preadv(bs->backing, offset, n1,
-                                         &local_qiov, 0);
-                    qemu_co_mutex_lock(&s->lock);
-
-                    qemu_iovec_destroy(&local_qiov);
-
-                    if (ret < 0) {
-                        goto fail;
-                    }
-                }
+                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
+                qemu_co_mutex_unlock(&s->lock);
+                ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
+                                     &hd_qiov, 0);
+                qemu_co_mutex_lock(&s->lock);
+                if (ret < 0) {
+                    goto fail;
+                }
             } else {
                 /* Note: in this case, no need to wait */
--
2.13.6

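The local_backup logic in the bitmap merge patch above follows a common
pattern: back up the destination once, before the first destructive step,
then either hand the backup to the caller or roll back from it on failure.
A self-contained toy version of that pattern, not the QEMU API ('bitmap'
here is just a byte array):

    #include <stdbool.h>
    #include <stdlib.h>
    #include <string.h>

    enum { BITS = 8 };

    static bool merge_one(unsigned char *dst, const unsigned char *src,
                          unsigned char **backup)
    {
        if (backup && !*backup) {      /* back up dst on the first merge only */
            *backup = malloc(BITS);
            if (!*backup) {
                return false;
            }
            memcpy(*backup, dst, BITS);
        }
        for (int i = 0; i < BITS; i++) {
            dst[i] |= src[i];
        }
        return true;
    }

    /* Merge many sources; on failure dst is restored from the one backup. */
    static bool merge_all(unsigned char *dst, const unsigned char srcs[][BITS],
                          int n_srcs, unsigned char **backup_out)
    {
        unsigned char *local_backup = NULL;

        for (int i = 0; i < n_srcs; i++) {
            if (!merge_one(dst, srcs[i], &local_backup)) {
                if (local_backup) {
                    memcpy(dst, local_backup, BITS);   /* roll back */
                    free(local_backup);
                }
                return false;
            }
        }
        if (backup_out) {
            *backup_out = local_backup;    /* caller owns the backup now */
        } else {
            free(local_backup);
        }
        return true;
    }

    int main(void)
    {
        unsigned char dst[BITS] = { 1 }, *backup = NULL;
        const unsigned char srcs[2][BITS] = { { 2 }, { 4 } };

        if (merge_all(dst, srcs, 2, &backup)) {
            free(backup);
        }
        return 0;
    }
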
Removing a quorum child node with x-blockdev-change results in a quorum
driver state that cannot be recreated with create options because it
would require a list with gaps. This causes trouble in at least
.bdrv_refresh_filename().

Document this problem so that we won't accidentally mark the command
stable without having addressed it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
---
 qapi/block-core.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -XXX,XX +XXX,XX @@
 # does not support all kinds of operations, all kinds of children, nor
 # all block drivers.
 #
+# FIXME Removing children from a quorum node means introducing gaps in the
+# child indices. This cannot be represented in the 'children' list of
+# BlockdevOptionsQuorum, as returned by .bdrv_refresh_filename().
+#
 # Warning: The data in a new quorum child MUST be consistent with that of
 # the rest of the array.
 #
--
2.13.6

From: Xie Yongji <xieyongji@bytedance.com>

Currently req->size is set to the correct value only
when handling a VIRTIO_BLK_T_GET_ID request. This patch
fixes it for all request types.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Message-Id: <20220523084611.91-3-xieyongji@bytedance.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/export/vhost-user-blk-server.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
index XXXXXXX..XXXXXXX 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -XXX,XX +XXX,XX @@ static void vu_blk_req_complete(VuBlkReq *req)
 {
     VuDev *vu_dev = &req->server->vu_dev;
 
-    /* IO size with 1 extra status byte */
-    vu_queue_push(vu_dev, req->vq, &req->elem, req->size + 1);
+    vu_queue_push(vu_dev, req->vq, &req->elem, req->size);
     vu_queue_notify(vu_dev, req->vq);
 
     free(req);
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
         goto err;
     }
 
+    req->size = iov_size(in_iov, in_num);
     /* We always touch the last byte, so just see how big in_iov is. */
     req->in = (void *)in_iov[in_num - 1].iov_base
               + in_iov[in_num - 1].iov_len
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
                  VIRTIO_BLK_ID_BYTES);
         snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
         req->in->status = VIRTIO_BLK_S_OK;
-        req->size = elem->in_sg[0].iov_len;
         break;
     }
     case VIRTIO_BLK_T_DISCARD:
--
2.35.3

From: Doug Gale <doug16k@gmail.com>

Add trace output for commands, errors, and undefined behavior.
Add guest error log output for undefined behavior.
Report invalid undefined accesses to MMIO.
Annotate unlikely error checks with unlikely.

Signed-off-by: Doug Gale <doug16k@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/block/nvme.c       | 349 ++++++++++++++++++++++++++++++++++++++++++--------
 hw/block/trace-events |  93 ++++++++++++
 2 files changed, 390 insertions(+), 52 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -XXX,XX +XXX,XX @@
 #include "qapi/visitor.h"
 #include "sysemu/block-backend.h"
 
+#include "qemu/log.h"
+#include "trace.h"
 #include "nvme.h"
 
+#define NVME_GUEST_ERR(trace, fmt, ...) \
+    do { \
+        (trace_##trace)(__VA_ARGS__); \
+        qemu_log_mask(LOG_GUEST_ERROR, #trace \
+                      " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
+    } while (0)
+
 static void nvme_process_sq(void *opaque);
 
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
@@ -XXX,XX +XXX,XX @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
 {
     if (cq->irq_enabled) {
         if (msix_enabled(&(n->parent_obj))) {
+            trace_nvme_irq_msix(cq->vector);
             msix_notify(&(n->parent_obj), cq->vector);
         } else {
+            trace_nvme_irq_pin();
             pci_irq_pulse(&n->parent_obj);
         }
+    } else {
+        trace_nvme_irq_masked();
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
     trans_len = MIN(len, trans_len);
     int num_prps = (len >> n->page_bits) + 1;
 
-    if (!prp1) {
+    if (unlikely(!prp1)) {
+        trace_nvme_err_invalid_prp();
         return NVME_INVALID_FIELD | NVME_DNR;
     } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
                prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
     }
     len -= trans_len;
     if (len) {
-        if (!prp2) {
+        if (unlikely(!prp2)) {
+            trace_nvme_err_invalid_prp2_missing();
             goto unmap;
         }
         if (len > n->page_size) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
             uint64_t prp_ent = le64_to_cpu(prp_list[i]);
 
             if (i == n->max_prp_ents - 1 && len > n->page_size) {
-                if (!prp_ent || prp_ent & (n->page_size - 1)) {
+                if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
+                    trace_nvme_err_invalid_prplist_ent(prp_ent);
                     goto unmap;
                 }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
                 prp_ent = le64_to_cpu(prp_list[i]);
             }
 
-            if (!prp_ent || prp_ent & (n->page_size - 1)) {
+            if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
+                trace_nvme_err_invalid_prplist_ent(prp_ent);
                 goto unmap;
             }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
             i++;
         }
     } else {
-        if (prp2 & (n->page_size - 1)) {
+        if (unlikely(prp2 & (n->page_size - 1))) {
+            trace_nvme_err_invalid_prp2_align(prp2);
             goto unmap;
         }
         if (qsg->nsg) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
     QEMUIOVector iov;
     uint16_t status = NVME_SUCCESS;
 
+    trace_nvme_dma_read(prp1, prp2);
+
     if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     if (qsg.nsg > 0) {
-        if (dma_buf_read(ptr, len, &qsg)) {
+        if (unlikely(dma_buf_read(ptr, len, &qsg))) {
+            trace_nvme_err_invalid_dma();
             status = NVME_INVALID_FIELD | NVME_DNR;
         }
         qemu_sglist_destroy(&qsg);
     } else {
-        if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
+        if (unlikely(qemu_iovec_to_buf(&iov, 0, ptr, len) != len)) {
+            trace_nvme_err_invalid_dma();
             status = NVME_INVALID_FIELD | NVME_DNR;
         }
         qemu_iovec_destroy(&iov);
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
     uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
 
-    if (slba + nlb > ns->id_ns.nsze) {
+    if (unlikely(slba + nlb > ns->id_ns.nsze)) {
+        trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
         return NVME_LBA_RANGE | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
     enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
 
-    if ((slba + nlb) > ns->id_ns.nsze) {
+    trace_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
+
+    if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
         block_acct_invalid(blk_get_stats(n->conf.blk), acct);
+        trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
         return NVME_LBA_RANGE | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     NvmeNamespace *ns;
     uint32_t nsid = le32_to_cpu(cmd->nsid);
 
-    if (nsid == 0 || nsid > n->num_namespaces) {
+    if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
+        trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
         return NVME_INVALID_NSID | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     case NVME_CMD_READ:
         return nvme_rw(n, ns, cmd, req);
     default:
+        trace_nvme_err_invalid_opc(cmd->opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
     NvmeCQueue *cq;
     uint16_t qid = le16_to_cpu(c->qid);
 
-    if (!qid || nvme_check_sqid(n, qid)) {
+    if (unlikely(!qid || nvme_check_sqid(n, qid))) {
+        trace_nvme_err_invalid_del_sq(qid);
         return NVME_INVALID_QID | NVME_DNR;
     }
 
+    trace_nvme_del_sq(qid);
+
     sq = n->sq[qid];
     while (!QTAILQ_EMPTY(&sq->out_req_list)) {
         req = QTAILQ_FIRST(&sq->out_req_list);
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
     uint16_t qflags = le16_to_cpu(c->sq_flags);
     uint64_t prp1 = le64_to_cpu(c->prp1);
 
-    if (!cqid || nvme_check_cqid(n, cqid)) {
+    trace_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
+
+    if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
+        trace_nvme_err_invalid_create_sq_cqid(cqid);
         return NVME_INVALID_CQID | NVME_DNR;
     }
-    if (!sqid || !nvme_check_sqid(n, sqid)) {
+    if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) {
+        trace_nvme_err_invalid_create_sq_sqid(sqid);
         return NVME_INVALID_QID | NVME_DNR;
     }
-    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
+    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
+        trace_nvme_err_invalid_create_sq_size(qsize);
         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
     }
-    if (!prp1 || prp1 & (n->page_size - 1)) {
+    if (unlikely(!prp1 || prp1 & (n->page_size - 1))) {
+        trace_nvme_err_invalid_create_sq_addr(prp1);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
-    if (!(NVME_SQ_FLAGS_PC(qflags))) {
+    if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
+        trace_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     sq = g_malloc0(sizeof(*sq));
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
     NvmeCQueue *cq;
     uint16_t qid = le16_to_cpu(c->qid);
 
-    if (!qid || nvme_check_cqid(n, qid)) {
+    if (unlikely(!qid || nvme_check_cqid(n, qid))) {
+        trace_nvme_err_invalid_del_cq_cqid(qid);
         return NVME_INVALID_CQID | NVME_DNR;
     }
 
     cq = n->cq[qid];
-    if (!QTAILQ_EMPTY(&cq->sq_list)) {
+    if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
+        trace_nvme_err_invalid_del_cq_notempty(qid);
         return NVME_INVALID_QUEUE_DEL;
     }
+    trace_nvme_del_cq(qid);
     nvme_free_cq(cq, n);
     return NVME_SUCCESS;
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
     uint16_t qflags = le16_to_cpu(c->cq_flags);
     uint64_t prp1 = le64_to_cpu(c->prp1);
 
-    if (!cqid || !nvme_check_cqid(n, cqid)) {
+    trace_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
+                         NVME_CQ_FLAGS_IEN(qflags) != 0);
+
+    if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) {
+        trace_nvme_err_invalid_create_cq_cqid(cqid);
         return NVME_INVALID_CQID | NVME_DNR;
     }
-    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
+    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
+        trace_nvme_err_invalid_create_cq_size(qsize);
         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
     }
-    if (!prp1) {
+    if (unlikely(!prp1)) {
+        trace_nvme_err_invalid_create_cq_addr(prp1);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
-    if (vector > n->num_queues) {
+    if (unlikely(vector > n->num_queues)) {
+        trace_nvme_err_invalid_create_cq_vector(vector);
         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
     }
-    if (!(NVME_CQ_FLAGS_PC(qflags))) {
+    if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
+        trace_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
     uint64_t prp1 = le64_to_cpu(c->prp1);
     uint64_t prp2 = le64_to_cpu(c->prp2);
 
+    trace_nvme_identify_ctrl();
+
     return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
                              prp1, prp2);
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
     uint64_t prp1 = le64_to_cpu(c->prp1);
     uint64_t prp2 = le64_to_cpu(c->prp2);
 
-    if (nsid == 0 || nsid > n->num_namespaces) {
+    trace_nvme_identify_ns(nsid);
+
+    if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
+        trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
         return NVME_INVALID_NSID | NVME_DNR;
     }
 
     ns = &n->namespaces[nsid - 1];
+
     return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
                              prp1, prp2);
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
     uint16_t ret;
     int i, j = 0;
 
+    trace_nvme_identify_nslist(min_nsid);
+
     list = g_malloc0(data_len);
     for (i = 0; i < n->num_namespaces; i++) {
         if (i < min_nsid) {
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
     case 0x02:
         return nvme_identify_nslist(n, c);
     default:
+        trace_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 }
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     switch (dw10) {
     case NVME_VOLATILE_WRITE_CACHE:
         result = blk_enable_write_cache(n->conf.blk);
+        trace_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
         break;
     case NVME_NUMBER_OF_QUEUES:
         result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
+        trace_nvme_getfeat_numq(result);
         break;
     default:
+        trace_nvme_err_invalid_getfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
         break;
     case NVME_NUMBER_OF_QUEUES:
+        trace_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
+                                ((dw11 >> 16) & 0xFFFF) + 1,
+                                n->num_queues - 1, n->num_queues - 1);
         req->cqe.result =
             cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
         break;
     default:
+        trace_nvme_err_invalid_setfeat(dw10);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
     return NVME_SUCCESS;
@@ -XXX,XX +XXX,XX @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     case NVME_ADM_CMD_GET_FEATURES:
         return nvme_get_feature(n, cmd, req);
     default:
+        trace_nvme_err_invalid_admin_opc(cmd->opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
     uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
     uint32_t page_size = 1 << page_bits;
 
-    if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq ||
-        n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) ||
-        NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
-        NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) ||
-        NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
-        NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) ||
-        NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
-        NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) ||
-        !NVME_AQA_ASQS(n->bar.aqa) || !NVME_AQA_ACQS(n->bar.aqa)) {
+    if (unlikely(n->cq[0])) {
+        trace_nvme_err_startfail_cq();
+        return -1;
+    }
+    if (unlikely(n->sq[0])) {
+        trace_nvme_err_startfail_sq();
+        return -1;
+    }
+    if (unlikely(!n->bar.asq)) {
+        trace_nvme_err_startfail_nbarasq();
+        return -1;
+    }
+    if (unlikely(!n->bar.acq)) {
+        trace_nvme_err_startfail_nbaracq();
+        return -1;
+    }
+    if (unlikely(n->bar.asq & (page_size - 1))) {
+        trace_nvme_err_startfail_asq_misaligned(n->bar.asq);
+        return -1;
+    }
+    if (unlikely(n->bar.acq & (page_size - 1))) {
+        trace_nvme_err_startfail_acq_misaligned(n->bar.acq);
+        return -1;
+    }
+    if (unlikely(NVME_CC_MPS(n->bar.cc) <
+                 NVME_CAP_MPSMIN(n->bar.cap))) {
+        trace_nvme_err_startfail_page_too_small(
+                    NVME_CC_MPS(n->bar.cc),
+                    NVME_CAP_MPSMIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_MPS(n->bar.cc) >
+                 NVME_CAP_MPSMAX(n->bar.cap))) {
+        trace_nvme_err_startfail_page_too_large(
+                    NVME_CC_MPS(n->bar.cc),
+                    NVME_CAP_MPSMAX(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
+                 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
+        trace_nvme_err_startfail_cqent_too_small(
+                    NVME_CC_IOCQES(n->bar.cc),
+                    NVME_CTRL_CQES_MIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
+                 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
+        trace_nvme_err_startfail_cqent_too_large(
+                    NVME_CC_IOCQES(n->bar.cc),
+                    NVME_CTRL_CQES_MAX(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
+                 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
+        trace_nvme_err_startfail_sqent_too_small(
+                    NVME_CC_IOSQES(n->bar.cc),
+                    NVME_CTRL_SQES_MIN(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
+                 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
+        trace_nvme_err_startfail_sqent_too_large(
+                    NVME_CC_IOSQES(n->bar.cc),
+                    NVME_CTRL_SQES_MAX(n->bar.cap));
+        return -1;
+    }
+    if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
+        trace_nvme_err_startfail_asqent_sz_zero();
+        return -1;
+    }
+    if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
+        trace_nvme_err_startfail_acqent_sz_zero();
         return -1;
     }
 
@@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n)
 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
                            unsigned size)
 {
+    if (unlikely(offset & (sizeof(uint32_t) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_mmiowr_misaligned32,
+                       "MMIO write not 32-bit aligned,"
+                       " offset=0x%"PRIx64"", offset);
+        /* should be ignored, fall through for now */
+    }
+
+    if (unlikely(size < sizeof(uint32_t))) {
+        NVME_GUEST_ERR(nvme_ub_mmiowr_toosmall,
+                       "MMIO write smaller than 32-bits,"
+                       " offset=0x%"PRIx64", size=%u",
+                       offset, size);
+        /* should be ignored, fall through for now */
+    }
+
     switch (offset) {
-    case 0xc:
+    case 0xc:   /* INTMS */
+        if (unlikely(msix_enabled(&(n->parent_obj)))) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
+                           "undefined access to interrupt mask set"
+                           " when MSI-X is enabled");
+            /* should be ignored, fall through for now */
+        }
         n->bar.intms |= data & 0xffffffff;
         n->bar.intmc = n->bar.intms;
+        trace_nvme_mmio_intm_set(data & 0xffffffff,
+                                 n->bar.intmc);
         break;
-    case 0x10:
+    case 0x10:  /* INTMC */
+        if (unlikely(msix_enabled(&(n->parent_obj)))) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
+                           "undefined access to interrupt mask clr"
+                           " when MSI-X is enabled");
+            /* should be ignored, fall through for now */
+        }
         n->bar.intms &= ~(data & 0xffffffff);
         n->bar.intmc = n->bar.intms;
+        trace_nvme_mmio_intm_clr(data & 0xffffffff,
+                                 n->bar.intmc);
         break;
-    case 0x14:
+    case 0x14:  /* CC */
+        trace_nvme_mmio_cfg(data & 0xffffffff);
         /* Windows first sends data, then sends enable bit */
         if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
             !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
@@ -XXX,XX +XXX,XX @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
 
         if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
             n->bar.cc = data;
-            if (nvme_start_ctrl(n)) {
+            if (unlikely(nvme_start_ctrl(n))) {
+                trace_nvme_err_startfail();
                 n->bar.csts = NVME_CSTS_FAILED;
             } else {
+                trace_nvme_mmio_start_success();
                 n->bar.csts = NVME_CSTS_READY;
             }
         } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
+            trace_nvme_mmio_stopped();
             nvme_clear_ctrl(n);
             n->bar.csts &= ~NVME_CSTS_READY;
         }
         if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
-            nvme_clear_ctrl(n);
-            n->bar.cc = data;
-            n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
+            trace_nvme_mmio_shutdown_set();
+            nvme_clear_ctrl(n);
+            n->bar.cc = data;
+            n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
         } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
-            n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
-            n->bar.cc = data;
+            trace_nvme_mmio_shutdown_cleared();
+            n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
+            n->bar.cc = data;
+        }
+        break;
+    case 0x1C:  /* CSTS */
+        if (data & (1 << 4)) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_ssreset_w1c_unsupported,
+                           "attempted to W1C CSTS.NSSRO"
+                           " but CAP.NSSRS is zero (not supported)");
+        } else if (data != 0) {
+            NVME_GUEST_ERR(nvme_ub_mmiowr_ro_csts,
+                           "attempted to set a read only bit"
+                           " of controller status");
+        }
+        break;
+    case 0x20:  /* NSSR */
+        if (data == 0x4E564D65) {
+            trace_nvme_ub_mmiowr_ssreset_unsupported();
+        } else {
+            /* The spec says that writes of other values have no effect */
+            return;
         }
         break;
-    case 0x24:
+    case 0x24:  /* AQA */
         n->bar.aqa = data & 0xffffffff;
+        trace_nvme_mmio_aqattr(data & 0xffffffff);
         break;
-    case 0x28:
+    case 0x28:  /* ASQ */
         n->bar.asq = data;
+        trace_nvme_mmio_asqaddr(data);
         break;
-    case 0x2c:
+    case 0x2c:  /* ASQ hi */
         n->bar.asq |= data << 32;
+        trace_nvme_mmio_asqaddr_hi(data, n->bar.asq);
         break;
-    case 0x30:
+    case 0x30:  /* ACQ */
+        trace_nvme_mmio_acqaddr(data);
         n->bar.acq = data;
         break;
-    case 0x34:
+    case 0x34:  /* ACQ hi */
         n->bar.acq |= data << 32;
+        trace_nvme_mmio_acqaddr_hi(data, n->bar.acq);
         break;
+    case 0x38:  /* CMBLOC */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_cmbloc_reserved,
+                       "invalid write to reserved CMBLOC"
+                       " when CMBSZ is zero, ignored");
+        return;
+    case 0x3C:  /* CMBSZ */
+        NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
+                       "invalid write to read only CMBSZ, ignored");
+        return;
     default:
+        NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
+                       "invalid MMIO write,"
+                       " offset=0x%"PRIx64", data=%"PRIx64"",
+                       offset, data);
         break;
     }
 }
@@ -XXX,XX +XXX,XX @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
     uint8_t *ptr = (uint8_t *)&n->bar;
     uint64_t val = 0;
 
+    if (unlikely(addr & (sizeof(uint32_t) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_mmiord_misaligned32,
+                       "MMIO read not 32-bit aligned,"
+                       " offset=0x%"PRIx64"", addr);
+        /* should RAZ, fall through for now */
+    } else if (unlikely(size < sizeof(uint32_t))) {
+        NVME_GUEST_ERR(nvme_ub_mmiord_toosmall,
+                       "MMIO read smaller than 32-bits,"
+                       " offset=0x%"PRIx64"", addr);
+        /* should RAZ, fall through for now */
+    }
+
     if (addr < sizeof(n->bar)) {
         memcpy(&val, ptr + addr, size);
+    } else {
+        NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
+                       "MMIO read beyond last register,"
+                       " offset=0x%"PRIx64", returning 0", addr);
     }
+
     return val;
 }
 
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
 {
     uint32_t qid;
 
-    if (addr & ((1 << 2) - 1)) {
+    if (unlikely(addr & ((1 << 2) - 1))) {
+        NVME_GUEST_ERR(nvme_ub_db_wr_misaligned,
+                       "doorbell write not 32-bit aligned,"
+                       " offset=0x%"PRIx64", ignoring", addr);
         return;
     }
 
     if (((addr - 0x1000) >> 2) & 1) {
+        /* Completion queue doorbell write */
+
         uint16_t new_head = val & 0xffff;
         int start_sqs;
         NvmeCQueue *cq;
 
         qid = (addr - (0x1000 + (1 << 2))) >> 3;
-        if (nvme_check_cqid(n, qid)) {
+        if (unlikely(nvme_check_cqid(n, qid))) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cq,
+                           "completion queue doorbell write"
+                           " for nonexistent queue,"
+                           " sqid=%"PRIu32", ignoring", qid);
             return;
         }
 
         cq = n->cq[qid];
-        if (new_head >= cq->size) {
+        if (unlikely(new_head >= cq->size)) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cqhead,
+                           "completion queue doorbell write value"
+                           " beyond queue size, sqid=%"PRIu32","
+                           " new_head=%"PRIu16", ignoring",
+                           qid, new_head);
             return;
         }
 
@@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
             nvme_isr_notify(n, cq);
         }
     } else {
+        /* Submission queue doorbell write */
+
         uint16_t new_tail = val & 0xffff;
         NvmeSQueue *sq;
 
         qid = (addr - 0x1000) >> 3;
-        if (nvme_check_sqid(n, qid)) {
+        if (unlikely(nvme_check_sqid(n, qid))) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sq,
+                           "submission queue doorbell write"
+                           " for nonexistent queue,"
+                           " sqid=%"PRIu32", ignoring", qid);
             return;
         }
 
         sq = n->sq[qid];
-        if (new_tail >= sq->size) {
+        if (unlikely(new_tail >= sq->size)) {
+            NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sqtail,
+                           "submission queue doorbell write value"
+                           " beyond queue size, sqid=%"PRIu32","
+                           " new_tail=%"PRIu16", ignoring",
+                           qid, new_tail);
             return;
         }
 
diff --git a/hw/block/trace-events b/hw/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -XXX,XX +XXX,XX @@ virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint6
 hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
 hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int trans) "blk %p CHS %u %u %u trans %d"
 
+# hw/block/nvme.c
+# nvme traces for successful events
+nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
+nvme_irq_pin(void) "pulsing IRQ pin"
+nvme_irq_masked(void) "IRQ is masked"
+nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
+nvme_rw(char const *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
+nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
+nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
+nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
+nvme_del_cq(uint16_t cqid) "deleted completion queue, sqid=%"PRIu16""
+nvme_identify_ctrl(void) "identify controller"
+nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
+nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16""
+nvme_getfeat_vwcache(char const* result) "get feature volatile write cache, result=%s"
+nvme_getfeat_numq(int result) "get feature number of queues, result=%d"
+nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
+nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
+nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
+nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64""
+nvme_mmio_aqattr(uint64_t data) "wrote MMIO, admin queue attributes=0x%"PRIx64""
+nvme_mmio_asqaddr(uint64_t data) "wrote MMIO, admin submission queue address=0x%"PRIx64""
+nvme_mmio_acqaddr(uint64_t data) "wrote MMIO, admin completion queue address=0x%"PRIx64""
+nvme_mmio_asqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin submission queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
+nvme_mmio_acqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin completion queue high half=0x%"PRIx64", new_address=0x%"PRIx64""
+nvme_mmio_start_success(void) "setting controller enable bit succeeded"
+nvme_mmio_stopped(void) "cleared controller enable bit"
+nvme_mmio_shutdown_set(void) "shutdown bit set"
+nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
+
+# nvme traces for error conditions
+nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
+nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
+nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
+nvme_err_invalid_prp2_missing(void) "PRP2 is null and more data to be transferred"
+nvme_err_invalid_field(void) "invalid field"
+nvme_err_invalid_prp(void) "invalid PRP"
+nvme_err_invalid_sgl(void) "invalid SGL"
+nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not within 1-%u"
+nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
+nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
+nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
+nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16""
+nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
+nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16""
+nvme_err_invalid_create_sq_size(uint16_t qsize) "failed creating submission queue, invalid qsize=%"PRIu16""
+nvme_err_invalid_create_sq_addr(uint64_t addr) "failed creating submission queue, addr=0x%"PRIx64""
+nvme_err_invalid_create_sq_qflags(uint16_t qflags) "failed creating submission queue, qflags=%"PRIu16""
+nvme_err_invalid_del_cq_cqid(uint16_t cqid) "failed deleting completion queue, cqid=%"PRIu16""
+nvme_err_invalid_del_cq_notempty(uint16_t cqid) "failed deleting completion queue, it is not empty, cqid=%"PRIu16""
+nvme_err_invalid_create_cq_cqid(uint16_t cqid) "failed creating completion queue, cqid=%"PRIu16""
+nvme_err_invalid_create_cq_size(uint16_t size) "failed creating completion queue, size=%"PRIu16""
+nvme_err_invalid_create_cq_addr(uint64_t addr) "failed creating completion queue, addr=0x%"PRIx64""
+nvme_err_invalid_create_cq_vector(uint16_t vector) "failed creating completion queue, vector=%"PRIu16""
+nvme_err_invalid_create_cq_qflags(uint16_t qflags) "failed creating completion queue, qflags=%"PRIu16""
+nvme_err_invalid_identify_cns(uint16_t cns) "identify, invalid cns=0x%"PRIx16""
+nvme_err_invalid_getfeat(int dw10) "invalid get features, dw10=0x%"PRIx32""
+nvme_err_invalid_setfeat(uint32_t dw10) "invalid set features, dw10=0x%"PRIx32""
+nvme_err_startfail_cq(void) "nvme_start_ctrl failed because there are non-admin completion queues"
+nvme_err_startfail_sq(void) "nvme_start_ctrl failed because there are non-admin submission queues"
+nvme_err_startfail_nbarasq(void) "nvme_start_ctrl failed because the admin submission queue address is null"
+nvme_err_startfail_nbaracq(void) "nvme_start_ctrl failed because the admin completion queue address is null"
+nvme_err_startfail_asq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin submission queue address is misaligned: 0x%"PRIx64""
+nvme_err_startfail_acq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin completion queue address is misaligned: 0x%"PRIx64""
+nvme_err_startfail_page_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too small: log2size=%u, min=%u"
+nvme_err_startfail_page_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too large: log2size=%u, max=%u"
+nvme_err_startfail_cqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too small: log2size=%u, min=%u"
+nvme_err_startfail_cqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too large: log2size=%u, max=%u"
+nvme_err_startfail_sqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too small: log2size=%u, min=%u"
+nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too large: log2size=%u, max=%u"
+nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero"
+nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero"
+nvme_err_startfail(void) "setting controller enable bit failed"
+
+# Traces for undefined behavior
+nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64""
+nvme_ub_mmiowr_toosmall(uint64_t offset, unsigned size) "MMIO write smaller than 32 bits, offset=0x%"PRIx64", size=%u"
+nvme_ub_mmiowr_intmask_with_msix(void) "undefined access to interrupt mask set when MSI-X is enabled"
+nvme_ub_mmiowr_ro_csts(void) "attempted to set a read only bit of controller status"
+nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CAP.NSSRS is zero (not supported)"
+nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)"
+nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored"
+nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored"
+nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64""
+nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64""
+nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64""
+nvme_ub_mmiord_invalid_ofs(uint64_t offset) "MMIO read beyond last register, offset=0x%"PRIx64", returning 0"
+nvme_ub_db_wr_misaligned(uint64_t offset) "doorbell write not 32-bit aligned, offset=0x%"PRIx64", ignoring"
+nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for nonexistent queue, cqid=%"PRIu32", ignoring"
+nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring"
778
+nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring"
779
+nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_head=%"PRIu16", ignoring"
780
+
# hw/block/xen_disk.c
xen_disk_alloc(char *name) "%s"
xen_disk_init(char *name) "%s"
--
2.13.6
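
(Usage sketch, not part of the series: with a trace backend compiled in,
the new events can be enabled by glob pattern on the command line, e.g.

  $ qemu-system-x86_64 -trace 'nvme_err_*' ...

or toggled at runtime with the HMP command "trace-event <name> on".)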
From: Fam Zheng <famz@redhat.com>

Management tools create overlays of running guests with qemu-img:

  $ qemu-img create -b /image/in/use.qcow2 -f qcow2 /overlay/image.qcow2

but this doesn't work anymore due to image locking:

  qemu-img: /overlay/image.qcow2: Failed to get shared "write" lock
  Is another process using the image?
  Could not open backing image to determine size.

Use the force share option to allow this use case again.
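
(Usage note, not part of the patch: qemu-img also exposes this mechanism
directly as the -U/--force-share flag, e.g.

  $ qemu-img info -U /image/in/use.qcow2

to inspect an image that a running guest holds locked.)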

Cc: qemu-stable@nongnu.org
Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ void bdrv_img_create(const char *filename, const char *fmt,
         back_flags = flags;
         back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

+        backing_options = qdict_new();
         if (backing_fmt) {
-            backing_options = qdict_new();
             qdict_put_str(backing_options, "driver", backing_fmt);
         }
+        qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);

         bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
                        &local_err);
--
2.13.6

From: Vladimir Sementsov-Ogievskiy <vsementsov@openvz.org>

At the end we ignore failure of bdrv_merge_dirty_bitmap() and report
success, yet we still set errp. That's wrong.

Signed-off-by: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
Reviewed-by: Nikita Lapshin <nikita.lapshin@virtuozzo.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-Id: <20220517111206.23585-2-v.sementsov-og@mail.ru>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/monitor/bitmap-qmp-cmds.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/block/monitor/bitmap-qmp-cmds.c b/block/monitor/bitmap-qmp-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/block/monitor/bitmap-qmp-cmds.c
+++ b/block/monitor/bitmap-qmp-cmds.c
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
         }
     }

     /* Merge into dst; dst is unchanged on failure. */
-    bdrv_merge_dirty_bitmap(dst, anon, backup, errp);
+    if (!bdrv_merge_dirty_bitmap(dst, anon, backup, errp)) {
+        dst = NULL;
+        goto out;
+    }

 out:
     bdrv_release_dirty_bitmap(anon);
--
2.35.3

From: Thomas Huth <thuth@redhat.com>

It's not working anymore since QEMU v1.3.0 - time to remove it now.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c    | 11 -----------
 qemu-doc.texi |  6 ------
 2 files changed, 17 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index XXXXXXX..XXXXXXX 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -XXX,XX +XXX,XX @@ QemuOptsList qemu_legacy_drive_opts = {
             .type = QEMU_OPT_STRING,
             .help = "chs translation (auto, lba, none)",
         },{
-            .name = "boot",
-            .type = QEMU_OPT_BOOL,
-            .help = "(deprecated, ignored)",
-        },{
             .name = "addr",
             .type = QEMU_OPT_STRING,
             .help = "pci address (virtio only)",
@@ -XXX,XX +XXX,XX @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type)
             goto fail;
         }
     }

-    /* Deprecated option boot=[on|off] */
-    if (qemu_opt_get(legacy_opts, "boot") != NULL) {
-        fprintf(stderr, "qemu-kvm: boot=on|off is deprecated and will be "
-                "ignored. Future versions will reject this parameter. Please "
-                "update your scripts.\n");
-    }
-
     /* Other deprecated options */
     if (!qtest_enabled()) {
         for (i = 0; i < ARRAY_SIZE(deprecated); i++) {
diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ deprecated.

 @section System emulator command line arguments

-@subsection -drive boot=on|off (since 1.3.0)
-
-The ``boot=on|off'' option to the ``-drive'' argument is
-ignored. Applications should use the ``bootindex=N'' parameter
-to set an absolute ordering between devices instead.
-
 @subsection -tdf (since 1.3.0)

 The ``-tdf'' argument is ignored. The behaviour implemented
--
2.13.6
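
(A hedged sketch of the Error API convention that the bitmap fix above
restores; do_thing() is illustrative, error_setg() is the real helper:

    static bool do_thing(int arg, Error **errp)
    {
        if (arg < 0) {
            error_setg(errp, "invalid argument: %d", arg);
            return false;    /* errp is set exactly when we return false */
        }
        return true;         /* success leaves errp untouched */
    }

Reporting success while *errp has been set breaks this contract, which is
what the fixed code used to do.)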

From: Thomas Huth <thuth@redhat.com>

It's been marked as deprecated since QEMU v2.10.0, and so far nobody has
complained that we should keep it, so let's remove this legacy option
now to simplify the code quite a bit.
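
(For reference, a sketch of the replacement syntax named in the
deprecation message; the drive and geometry values are illustrative:

  $ qemu-system-x86_64 \
        -drive if=none,id=disk0,file=disk.img,format=raw \
        -device ide-hd,drive=disk0,cyls=1024,heads=16,secs=63,trans=lba

Each disk can get its own settings this way.)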

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 vl.c            | 86 ++-------------------------------------------------------
 qemu-doc.texi   |  8 ------
 qemu-options.hx | 19 ++-----------
 3 files changed, 4 insertions(+), 109 deletions(-)

diff --git a/vl.c b/vl.c
index XXXXXXX..XXXXXXX 100644
--- a/vl.c
+++ b/vl.c
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
     const char *boot_order = NULL;
     const char *boot_once = NULL;
     DisplayState *ds;
-    int cyls, heads, secs, translation;
     QemuOpts *opts, *machine_opts;
-    QemuOpts *hda_opts = NULL, *icount_opts = NULL, *accel_opts = NULL;
+    QemuOpts *icount_opts = NULL, *accel_opts = NULL;
     QemuOptsList *olist;
     int optind;
     const char *optarg;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)

     cpu_model = NULL;
     snapshot = 0;
-    cyls = heads = secs = 0;
-    translation = BIOS_ATA_TRANSLATION_AUTO;

     nb_nics = 0;

@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
         if (optind >= argc)
             break;
         if (argv[optind][0] != '-') {
-            hda_opts = drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
+            drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
         } else {
             const QEMUOption *popt;

@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
                 cpu_model = optarg;
                 break;
             case QEMU_OPTION_hda:
-                {
-                    char buf[256];
-                    if (cyls == 0)
-                        snprintf(buf, sizeof(buf), "%s", HD_OPTS);
-                    else
-                        snprintf(buf, sizeof(buf),
-                                 "%s,cyls=%d,heads=%d,secs=%d%s",
-                                 HD_OPTS, cyls, heads, secs,
-                                 translation == BIOS_ATA_TRANSLATION_LBA ?
-                                 ",trans=lba" :
-                                 translation == BIOS_ATA_TRANSLATION_NONE ?
-                                 ",trans=none" : "");
-                    drive_add(IF_DEFAULT, 0, optarg, buf);
-                    break;
-                }
             case QEMU_OPTION_hdb:
             case QEMU_OPTION_hdc:
             case QEMU_OPTION_hdd:
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp)
             case QEMU_OPTION_snapshot:
                 snapshot = 1;
                 break;
-            case QEMU_OPTION_hdachs:
-                {
-                    const char *p;
-                    p = optarg;
-                    cyls = strtol(p, (char **)&p, 0);
-                    if (cyls < 1 || cyls > 16383)
-                        goto chs_fail;
-                    if (*p != ',')
-                        goto chs_fail;
-                    p++;
-                    heads = strtol(p, (char **)&p, 0);
-                    if (heads < 1 || heads > 16)
-                        goto chs_fail;
-                    if (*p != ',')
-                        goto chs_fail;
-                    p++;
-                    secs = strtol(p, (char **)&p, 0);
-                    if (secs < 1 || secs > 63)
-                        goto chs_fail;
-                    if (*p == ',') {
-                        p++;
-                        if (!strcmp(p, "large")) {
-                            translation = BIOS_ATA_TRANSLATION_LARGE;
-                        } else if (!strcmp(p, "rechs")) {
-                            translation = BIOS_ATA_TRANSLATION_RECHS;
-                        } else if (!strcmp(p, "none")) {
-                            translation = BIOS_ATA_TRANSLATION_NONE;
-                        } else if (!strcmp(p, "lba")) {
-                            translation = BIOS_ATA_TRANSLATION_LBA;
-                        } else if (!strcmp(p, "auto")) {
-                            translation = BIOS_ATA_TRANSLATION_AUTO;
-                        } else {
-                            goto chs_fail;
-                        }
-                    } else if (*p != '\0') {
-                    chs_fail:
-                        error_report("invalid physical CHS format");
-                        exit(1);
-                    }
-                    if (hda_opts != NULL) {
-                        qemu_opt_set_number(hda_opts, "cyls", cyls,
-                                            &error_abort);
-                        qemu_opt_set_number(hda_opts, "heads", heads,
-                                            &error_abort);
-                        qemu_opt_set_number(hda_opts, "secs", secs,
-                                            &error_abort);
-                        if (translation == BIOS_ATA_TRANSLATION_LARGE) {
-                            qemu_opt_set(hda_opts, "trans", "large",
-                                         &error_abort);
-                        } else if (translation == BIOS_ATA_TRANSLATION_RECHS) {
-                            qemu_opt_set(hda_opts, "trans", "rechs",
-                                         &error_abort);
-                        } else if (translation == BIOS_ATA_TRANSLATION_LBA) {
-                            qemu_opt_set(hda_opts, "trans", "lba",
-                                         &error_abort);
-                        } else if (translation == BIOS_ATA_TRANSLATION_NONE) {
-                            qemu_opt_set(hda_opts, "trans", "none",
-                                         &error_abort);
-                        }
-                    }
-                }
-                error_report("'-hdachs' is deprecated, please use '-device"
-                             " ide-hd,cyls=c,heads=h,secs=s,...' instead");
-                break;
             case QEMU_OPTION_numa:
                 opts = qemu_opts_parse_noisily(qemu_find_opts("numa"),
                                                optarg, true);
diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ The ``--net dump'' argument is now replaced with the
 ``-object filter-dump'' argument which works in combination
 with the modern ``-netdev`` backends instead.

-@subsection -hdachs (since 2.10.0)
-
-The ``-hdachs'' argument is now a synonym for setting
-the ``cyls'', ``heads'', ``secs'', and ``trans'' properties
-on the ``ide-hd'' device using the ``-device'' argument.
-The new syntax allows different settings to be provided
-per disk.
-
 @subsection -usbdevice (since 2.10.0)

 The ``-usbdevice DEV'' argument is now a synonym for setting
diff --git a/qemu-options.hx b/qemu-options.hx
index XXXXXXX..XXXXXXX 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -XXX,XX +XXX,XX @@ of available connectors of a given interface type.
 @item media=@var{media}
 This option defines the type of the media: disk or cdrom.
 @item cyls=@var{c},heads=@var{h},secs=@var{s}[,trans=@var{t}]
-These options have the same definition as they have in @option{-hdachs}.
-These parameters are deprecated, use the corresponding parameters
+Force disk physical geometry and the optional BIOS translation (trans=none or
+lba). These parameters are deprecated, use the corresponding parameters
 of @code{-device} instead.
 @item snapshot=@var{snapshot}
 @var{snapshot} is "on" or "off" and controls snapshot mode for the given drive
@@ -XXX,XX +XXX,XX @@ the raw disk image you use is not written back. You can however force
 the write back by pressing @key{C-a s} (@pxref{disk_images}).
 ETEXI

-DEF("hdachs", HAS_ARG, QEMU_OPTION_hdachs, \
-    "-hdachs c,h,s[,t]\n" \
-    "                force hard disk 0 physical geometry and the optional BIOS\n" \
-    "                translation (t=none or lba) (usually QEMU can guess them)\n",
-    QEMU_ARCH_ALL)
-STEXI
-@item -hdachs @var{c},@var{h},@var{s},[,@var{t}]
-@findex -hdachs
-Force hard disk 0 physical geometry (1 <= @var{c} <= 16383, 1 <=
-@var{h} <= 16, 1 <= @var{s} <= 63) and optionally force the BIOS
-translation mode (@var{t}=none, lba or auto). Usually QEMU can guess
-all those parameters. This option is deprecated, please use
-@code{-device ide-hd,cyls=c,heads=h,secs=s,...} instead.
-ETEXI
-
 DEF("fsdev", HAS_ARG, QEMU_OPTION_fsdev,
     "-fsdev fsdriver,id=id[,path=path,][security_model={mapped-xattr|mapped-file|passthrough|none}]\n"
     "       [,writeout=immediate][,readonly][,socket=socket|sock_fd=sock_fd][,fmode=fmode][,dmode=dmode]\n"
--
2.13.6

From: Thomas Huth <thuth@redhat.com>

Looks like we forgot to announce the deprecation of these options in
the corresponding chapter of the qemu-doc text, so let's do that now.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-doc.texi | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/qemu-doc.texi b/qemu-doc.texi
index XXXXXXX..XXXXXXX 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -XXX,XX +XXX,XX @@ longer be directly supported in QEMU.
 The ``-drive if=scsi'' argument is replaced by the
 ``-device BUS-TYPE'' argument combined with ``-drive if=none''.

+@subsection -drive cyls=...,heads=...,secs=...,trans=... (since 2.10.0)
+
+The drive geometry arguments are replaced by the geometry arguments
+that can be specified with the ``-device'' parameter.
+
+@subsection -drive serial=... (since 2.10.0)
+
+The drive serial argument is replaced by the serial argument
+that can be specified with the ``-device'' parameter.
+
+@subsection -drive addr=... (since 2.10.0)
+
+The drive addr argument is replaced by the addr argument
+that can be specified with the ``-device'' parameter.
+
 @subsection -net dump (since 2.10.0)

 The ``--net dump'' argument is now replaced with the
--
2.13.6

From: Stefan Hajnoczi <stefanha@redhat.com>

bdrv_co_drain() has not been used since commit 9a0cec664eef ("mirror:
use bdrv_drained_begin/bdrv_drained_end") in 2016. Remove it so there
are fewer drain scenarios to worry about.

Use bdrv_drained_begin()/bdrv_drained_end() instead. They are "mixed"
functions that can be called from coroutine context. Unlike
bdrv_co_drain(), these functions provide control of the length of the
drained section, which is usually the right thing.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20220521122714.3837731-1-stefanha@redhat.com>
Reviewed-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Reviewed-by: Alberto Faria <afaria@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block-io.h |  1 -
 block/io.c               | 15 ---------------
 2 files changed, 16 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter);
                    cond); })

 void bdrv_drain(BlockDriverState *bs);
-void coroutine_fn bdrv_co_drain(BlockDriverState *bs);

 int generated_co_wrapper
 bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
     BDRV_POLL_WHILE(child->bs, qatomic_read(&drained_end_counter) > 0);
 }

-/*
- * Wait for pending requests to complete on a single BlockDriverState subtree,
- * and suspend block driver's internal I/O until next request arrives.
- *
- * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
- * AioContext.
- */
-void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
-{
-    IO_OR_GS_CODE();
-    assert(qemu_in_coroutine());
-    bdrv_drained_begin(bs);
-    bdrv_drained_end(bs);
-}
-
 void bdrv_drain(BlockDriverState *bs)
 {
     IO_OR_GS_CODE();
--
2.35.3

From: Fam Zheng <famz@redhat.com>

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block_int.h |  1 -
 block/io.c                | 18 ------------------
 2 files changed, 19 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ bool blk_dev_is_tray_open(BlockBackend *blk);
 bool blk_dev_is_medium_locked(BlockBackend *blk);

 void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
-bool bdrv_requests_pending(BlockDriverState *bs);

 void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
 void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
     assert(old >= 1);
 }

-/* Check if any requests are in-flight (including throttled requests) */
-bool bdrv_requests_pending(BlockDriverState *bs)
-{
-    BdrvChild *child;
-
-    if (atomic_read(&bs->in_flight)) {
-        return true;
-    }
-
-    QLIST_FOREACH(child, &bs->children, next) {
-        if (bdrv_requests_pending(child->bs)) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
 typedef struct {
     Coroutine *co;
     BlockDriverState *bs;
--
2.13.6
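
(A hedged illustration of the drained-section pattern the first patch
above points to; the APIs are real, the surrounding function is made up:

    static void example_while_quiesced(BlockDriverState *bs)
    {
        bdrv_drained_begin(bs);  /* waits for in-flight requests */
        /* ... work that must not race with new I/O ... */
        bdrv_drained_end(bs);    /* resumes request processing */
    }

Unlike the removed bdrv_co_drain(), the caller controls how long the
node stays drained.)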

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 block/io.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
     BdrvNextIterator it;
     GSList *aio_ctxs = NULL, *ctx;

+    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
+     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
+     * nodes in several different AioContexts, so make sure we're in the main
+     * context. */
+    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+
     block_job_pause_all();

     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
--
2.13.6
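
(Background sketch for the assertion above; simplified, not the actual
macro expansion: BDRV_POLL_WHILE(bs, cond) essentially boils down to

    while (cond) {
        aio_poll(bdrv_get_aio_context(bs), true);
    }

and polling a node's AioContext is only safe from that context's own
thread or from the main loop, so draining all nodes at once has to run
in the main AioContext.)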
1
From: Vladimir Sementsov-Ogievskiy <vsementsov@openvz.org>
1
bdrv_drained_begin() doesn't increase bs->quiesce_counter recursively
2
and also doesn't notify other parent nodes of children, which both mean
3
that the child nodes are not actually drained, and bdrv_drained_begin()
4
is providing useful functionality only on a single node.
2
5
3
We have too much logic to simply check that bitmaps are of the same
6
To keep things consistent, we also shouldn't call the block driver
4
size. Let's just define that hbitmap_merge() and
7
callbacks recursively.
5
bdrv_dirty_bitmap_merge_internal() require their argument bitmaps to be of
6
the same size; this simplifies things.
7
8
8
Let's look through the callers:
9
A proper recursive drain version that provides an actually working
10
drained section for child nodes will be introduced later.
9
11
10
For backup_init_bcs_bitmap() we already assert that merge can't fail.
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
Reviewed-by: Fam Zheng <famz@redhat.com>
14
---
15
block/io.c | 16 +++++++++-------
16
1 file changed, 9 insertions(+), 7 deletions(-)
11
17
12
In bdrv_reclaim_dirty_bitmap_locked() we gracefully handle the error
18
diff --git a/block/io.c b/block/io.c
13
that can't happen: the successor always has the same size as its parent, so drop
14
this logic.
15
16
In bdrv_merge_dirty_bitmap() we already have an assertion and a separate
17
check. Make the check explicit and improve the error message.
18
19
Signed-off-by: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
20
Reviewed-by: Nikita Lapshin <nikita.lapshin@virtuozzo.com>
21
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
22
Message-Id: <20220517111206.23585-4-v.sementsov-og@mail.ru>
23
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
24
---
25
include/block/block_int-io.h | 2 +-
26
include/qemu/hbitmap.h | 15 ++-------------
27
block/backup.c | 6 ++----
28
block/dirty-bitmap.c | 26 +++++++++++---------------
29
util/hbitmap.c | 25 +++++++------------------
30
5 files changed, 23 insertions(+), 51 deletions(-)
31
32
diff --git a/include/block/block_int-io.h b/include/block/block_int-io.h
33
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
34
--- a/include/block/block_int-io.h
20
--- a/block/io.c
35
+++ b/include/block/block_int-io.h
21
+++ b/block/io.c
36
@@ -XXX,XX +XXX,XX @@ bool blk_dev_is_tray_open(BlockBackend *blk);
22
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
37
void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
23
}
38
24
39
void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
25
/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
40
-bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
26
-static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
41
+void bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
27
+static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
42
const BdrvDirtyBitmap *src,
43
HBitmap **backup, bool lock);
44
45
diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
46
index XXXXXXX..XXXXXXX 100644
47
--- a/include/qemu/hbitmap.h
48
+++ b/include/qemu/hbitmap.h
49
@@ -XXX,XX +XXX,XX @@ void hbitmap_truncate(HBitmap *hb, uint64_t size);
50
*
51
* Store result of merging @a and @b into @result.
52
* @result is allowed to be equal to @a or @b.
53
- *
54
- * Return true if the merge was successful,
55
- * false if it was not attempted.
56
- */
57
-bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result);
58
-
59
-/**
60
- * hbitmap_can_merge:
61
- *
62
- * hbitmap_can_merge(a, b) && hbitmap_can_merge(a, result) is sufficient and
63
- * necessary for hbitmap_merge will not fail.
64
- *
65
+ * All bitmaps must have same size.
66
*/
67
-bool hbitmap_can_merge(const HBitmap *a, const HBitmap *b);
68
+void hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result);
69
70
/**
71
* hbitmap_empty:
72
diff --git a/block/backup.c b/block/backup.c
73
index XXXXXXX..XXXXXXX 100644
74
--- a/block/backup.c
75
+++ b/block/backup.c
76
@@ -XXX,XX +XXX,XX @@ out:
77
78
static void backup_init_bcs_bitmap(BackupBlockJob *job)
79
{
28
{
80
- bool ret;
29
BdrvChild *child, *tmp;
81
uint64_t estimate;
30
BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
82
BdrvDirtyBitmap *bcs_bitmap = block_copy_dirty_bitmap(job->bcs);
31
@@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
83
32
bdrv_coroutine_enter(bs, data.co);
84
if (job->sync_mode == MIRROR_SYNC_MODE_BITMAP) {
33
BDRV_POLL_WHILE(bs, !data.done);
85
bdrv_clear_dirty_bitmap(bcs_bitmap, NULL);
34
86
- ret = bdrv_dirty_bitmap_merge_internal(bcs_bitmap, job->sync_bitmap,
35
- QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
87
- NULL, true);
36
- bdrv_drain_invoke(child->bs, begin);
88
- assert(ret);
37
+ if (recursive) {
89
+ bdrv_dirty_bitmap_merge_internal(bcs_bitmap, job->sync_bitmap, NULL,
38
+ QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
90
+ true);
39
+ bdrv_drain_invoke(child->bs, begin, true);
91
} else if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
40
+ }
92
/*
93
* We can't hog the coroutine to initialize this thoroughly.
94
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
95
index XXXXXXX..XXXXXXX 100644
96
--- a/block/dirty-bitmap.c
97
+++ b/block/dirty-bitmap.c
98
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap_locked(BdrvDirtyBitmap *parent,
99
return NULL;
100
}
101
102
- if (!hbitmap_merge(parent->bitmap, successor->bitmap, parent->bitmap)) {
103
- error_setg(errp, "Merging of parent and successor bitmap failed");
104
- return NULL;
105
- }
106
+ hbitmap_merge(parent->bitmap, successor->bitmap, parent->bitmap);
107
108
parent->disabled = successor->disabled;
109
parent->busy = false;
110
@@ -XXX,XX +XXX,XX @@ bool bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src,
111
goto out;
112
}
113
114
- if (!hbitmap_can_merge(dest->bitmap, src->bitmap)) {
115
- error_setg(errp, "Bitmaps are incompatible and can't be merged");
116
+ if (bdrv_dirty_bitmap_size(src) != bdrv_dirty_bitmap_size(dest)) {
117
+ error_setg(errp, "Bitmaps are of different sizes (destination size is %"
118
+ PRId64 ", source size is %" PRId64 ") and can't be merged",
119
+ bdrv_dirty_bitmap_size(dest), bdrv_dirty_bitmap_size(src));
120
goto out;
121
}
122
123
- ret = bdrv_dirty_bitmap_merge_internal(dest, src, backup, false);
124
- assert(ret);
125
+ bdrv_dirty_bitmap_merge_internal(dest, src, backup, false);
126
+ ret = true;
127
128
out:
129
bdrv_dirty_bitmaps_unlock(dest->bs);
130
@@ -XXX,XX +XXX,XX @@ out:
131
/**
132
* bdrv_dirty_bitmap_merge_internal: merge src into dest.
133
* Does NOT check bitmap permissions; not suitable for use as public API.
134
+ * @dest, @src and @backup (if not NULL) must have same size.
135
*
136
* @backup: If provided, make a copy of dest here prior to merge.
137
* @lock: If true, lock and unlock bitmaps on the way in/out.
138
- * returns true if the merge succeeded; false if unattempted.
139
*/
140
-bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
141
+void bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
142
const BdrvDirtyBitmap *src,
143
HBitmap **backup,
144
bool lock)
145
{
146
- bool ret;
147
IO_CODE();
148
149
assert(!bdrv_dirty_bitmap_readonly(dest));
150
@@ -XXX,XX +XXX,XX @@ bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
151
if (backup) {
152
*backup = dest->bitmap;
153
dest->bitmap = hbitmap_alloc(dest->size, hbitmap_granularity(*backup));
154
- ret = hbitmap_merge(*backup, src->bitmap, dest->bitmap);
155
+ hbitmap_merge(*backup, src->bitmap, dest->bitmap);
156
} else {
157
- ret = hbitmap_merge(dest->bitmap, src->bitmap, dest->bitmap);
158
+ hbitmap_merge(dest->bitmap, src->bitmap, dest->bitmap);
159
}
160
161
if (lock) {
162
@@ -XXX,XX +XXX,XX @@ bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
163
bdrv_dirty_bitmaps_unlock(src->bs);
164
}
165
}
166
-
167
- return ret;
168
}
169
diff --git a/util/hbitmap.c b/util/hbitmap.c
170
index XXXXXXX..XXXXXXX 100644
171
--- a/util/hbitmap.c
172
+++ b/util/hbitmap.c
173
@@ -XXX,XX +XXX,XX @@ void hbitmap_truncate(HBitmap *hb, uint64_t size)
174
}
41
}
175
}
42
}
176
43
177
-bool hbitmap_can_merge(const HBitmap *a, const HBitmap *b)
44
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)
178
-{
45
bdrv_parent_drained_begin(bs);
179
- return (a->orig_size == b->orig_size);
180
-}
181
-
182
/**
183
* hbitmap_sparse_merge: performs dst = dst | src
184
* works with differing granularities.
185
@@ -XXX,XX +XXX,XX @@ static void hbitmap_sparse_merge(HBitmap *dst, const HBitmap *src)
186
* Given HBitmaps A and B, let R := A (BITOR) B.
187
* Bitmaps A and B will not be modified,
188
* except when bitmap R is an alias of A or B.
189
- *
190
- * @return true if the merge was successful,
191
- * false if it was not attempted.
192
+ * Bitmaps must have same size.
193
*/
194
-bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result)
195
+void hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result)
196
{
197
int i;
198
uint64_t j;
199
200
- if (!hbitmap_can_merge(a, b) || !hbitmap_can_merge(a, result)) {
201
- return false;
202
- }
203
- assert(hbitmap_can_merge(b, result));
204
+ assert(a->orig_size == result->orig_size);
205
+ assert(b->orig_size == result->orig_size);
206
207
if ((!hbitmap_count(a) && result == b) ||
208
(!hbitmap_count(b) && result == a)) {
209
- return true;
210
+ return;
211
}
46
}
212
47
213
if (!hbitmap_count(a) && !hbitmap_count(b)) {
48
- bdrv_drain_invoke(bs, true);
214
hbitmap_reset_all(result);
49
+ bdrv_drain_invoke(bs, true, false);
215
- return true;
50
bdrv_drain_recurse(bs);
216
+ return;
51
}
52
53
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
217
}
54
}
218
55
219
if (a->granularity != b->granularity) {
56
/* Re-enable things in child-to-parent order */
220
@@ -XXX,XX +XXX,XX @@ bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result)
57
- bdrv_drain_invoke(bs, false);
221
if (b != result) {
58
+ bdrv_drain_invoke(bs, false, false);
222
hbitmap_sparse_merge(result, b);
59
bdrv_parent_drained_end(bs);
223
}
60
aio_enable_external(bdrv_get_aio_context(bs));
224
- return true;
225
+ return;
226
}
227
228
/* This merge is O(size), as BITS_PER_LONG and HBITMAP_LEVELS are constant.
229
@@ -XXX,XX +XXX,XX @@ bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result)
230
231
/* Recompute the dirty count */
232
result->count = hb_count_between(result, 0, result->size - 1);
233
-
234
- return true;
235
}
61
}
236
62
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
237
char *hbitmap_sha256(const HBitmap *bitmap, Error **errp)
63
aio_context_acquire(aio_context);
64
aio_disable_external(aio_context);
65
bdrv_parent_drained_begin(bs);
66
- bdrv_drain_invoke(bs, true);
67
+ bdrv_drain_invoke(bs, true, true);
68
aio_context_release(aio_context);
69
70
if (!g_slist_find(aio_ctxs, aio_context)) {
71
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
72
73
/* Re-enable things in child-to-parent order */
74
aio_context_acquire(aio_context);
75
- bdrv_drain_invoke(bs, false);
76
+ bdrv_drain_invoke(bs, false, true);
77
bdrv_parent_drained_end(bs);
78
aio_enable_external(aio_context);
79
aio_context_release(aio_context);
238
--
2.35.3
--
2.13.6
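
(A note on the hbitmap_merge() contract change in the bitmap patch
above, sketched with illustrative sizes:

    HBitmap *a = hbitmap_alloc(1 << 20, 0);
    HBitmap *b = hbitmap_alloc(1 << 20, 0);

    hbitmap_merge(a, b, a);    /* OK: equal sizes, result may alias a */

Merging bitmaps of different size is now a programming error caught by
an assertion inside hbitmap_merge(); the only reportable failure left is
the explicit size check, with its improved message, in
bdrv_merge_dirty_bitmap().)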
1
From: Xie Yongji <xieyongji@bytedance.com>
1
The existing test is for bdrv_drain_all_begin/end() only. Generalise the
2
test case so that it can be run for the other variants as well. At the
3
moment this is only bdrv_drain_begin/end(), but in a while, we'll add
4
another one.
2
5
3
Abstract the common logic of virtio-blk I/O processing into a function
6
Also, add a backing file to the test node to test whether the operations
4
named virtio_blk_process_req(). It's needed for the following commit.
7
work recursively.
5
8
6
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
7
Message-Id: <20220523084611.91-4-xieyongji@bytedance.com>
8
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
10
---
10
---
11
block/export/virtio-blk-handler.h | 37 ++++
11
tests/test-bdrv-drain.c | 69 ++++++++++++++++++++++++++++++++++++++++++++-----
12
block/export/vhost-user-blk-server.c | 259 +++------------------------
12
1 file changed, 62 insertions(+), 7 deletions(-)
13
block/export/virtio-blk-handler.c | 240 +++++++++++++++++++++++++
14
MAINTAINERS | 2 +
15
block/export/meson.build | 2 +-
16
5 files changed, 301 insertions(+), 239 deletions(-)
17
create mode 100644 block/export/virtio-blk-handler.h
18
create mode 100644 block/export/virtio-blk-handler.c
19
13
20
diff --git a/block/export/virtio-blk-handler.h b/block/export/virtio-blk-handler.h
14
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
21
new file mode 100644
15
index XXXXXXX..XXXXXXX 100644
22
index XXXXXXX..XXXXXXX
16
--- a/tests/test-bdrv-drain.c
23
--- /dev/null
17
+++ b/tests/test-bdrv-drain.c
24
+++ b/block/export/virtio-blk-handler.h
18
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_test = {
25
@@ -XXX,XX +XXX,XX @@
19
26
+/*
20
.bdrv_co_drain_begin = bdrv_test_co_drain_begin,
27
+ * Handler for virtio-blk I/O
21
.bdrv_co_drain_end = bdrv_test_co_drain_end,
28
+ *
29
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
30
+ *
31
+ * Author:
32
+ * Xie Yongji <xieyongji@bytedance.com>
33
+ *
34
+ * This work is licensed under the terms of the GNU GPL, version 2 or
35
+ * later. See the COPYING file in the top-level directory.
36
+ */
37
+
22
+
38
+#ifndef VIRTIO_BLK_HANDLER_H
23
+ .bdrv_child_perm = bdrv_format_default_perms,
39
+#define VIRTIO_BLK_HANDLER_H
40
+
41
+#include "sysemu/block-backend.h"
42
+
43
+#define VIRTIO_BLK_SECTOR_BITS 9
44
+#define VIRTIO_BLK_SECTOR_SIZE (1ULL << VIRTIO_BLK_SECTOR_BITS)
45
+
46
+#define VIRTIO_BLK_MAX_DISCARD_SECTORS 32768
47
+#define VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS 32768
48
+
49
+typedef struct {
50
+ BlockBackend *blk;
51
+ const char *serial;
52
+ uint32_t logical_block_size;
53
+ bool writable;
54
+} VirtioBlkHandler;
55
+
56
+int coroutine_fn virtio_blk_process_req(VirtioBlkHandler *handler,
57
+ struct iovec *in_iov,
58
+ struct iovec *out_iov,
59
+ unsigned int in_num,
60
+ unsigned int out_num);
61
+
62
+#endif /* VIRTIO_BLK_HANDLER_H */
63
diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
64
index XXXXXXX..XXXXXXX 100644
65
--- a/block/export/vhost-user-blk-server.c
66
+++ b/block/export/vhost-user-blk-server.c
67
@@ -XXX,XX +XXX,XX @@
68
#include "vhost-user-blk-server.h"
69
#include "qapi/error.h"
70
#include "qom/object_interfaces.h"
71
-#include "sysemu/block-backend.h"
72
#include "util/block-helpers.h"
73
-
74
-/*
75
- * Sector units are 512 bytes regardless of the
76
- * virtio_blk_config->blk_size value.
77
- */
78
-#define VIRTIO_BLK_SECTOR_BITS 9
79
-#define VIRTIO_BLK_SECTOR_SIZE (1ull << VIRTIO_BLK_SECTOR_BITS)
80
+#include "virtio-blk-handler.h"
81
82
enum {
83
VHOST_USER_BLK_NUM_QUEUES_DEFAULT = 1,
84
- VHOST_USER_BLK_MAX_DISCARD_SECTORS = 32768,
85
- VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS = 32768,
86
-};
87
-struct virtio_blk_inhdr {
88
- unsigned char status;
89
};
24
};
90
25
91
typedef struct VuBlkReq {
26
static void aio_ret_cb(void *opaque, int ret)
92
VuVirtqElement elem;
27
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
93
- int64_t sector_num;
28
*aio_ret = ret;
94
- size_t size;
95
- struct virtio_blk_inhdr *in;
96
- struct virtio_blk_outhdr out;
97
VuServer *server;
98
struct VuVirtq *vq;
99
} VuBlkReq;
100
@@ -XXX,XX +XXX,XX @@ typedef struct VuBlkReq {
101
typedef struct {
102
BlockExport export;
103
VuServer vu_server;
104
- uint32_t blk_size;
105
+ VirtioBlkHandler handler;
106
QIOChannelSocket *sioc;
107
struct virtio_blk_config blkcfg;
108
- bool writable;
109
} VuBlkExport;
110
111
-static void vu_blk_req_complete(VuBlkReq *req)
112
+static void vu_blk_req_complete(VuBlkReq *req, size_t in_len)
113
{
114
VuDev *vu_dev = &req->server->vu_dev;
115
116
- vu_queue_push(vu_dev, req->vq, &req->elem, req->size);
117
+ vu_queue_push(vu_dev, req->vq, &req->elem, in_len);
118
vu_queue_notify(vu_dev, req->vq);
119
120
free(req);
121
}
29
}
122
30
123
-static bool vu_blk_sect_range_ok(VuBlkExport *vexp, uint64_t sector,
31
-static void test_drv_cb_drain_all(void)
124
- size_t size)
32
+enum drain_type {
125
-{
33
+ BDRV_DRAIN_ALL,
126
- uint64_t nb_sectors;
34
+ BDRV_DRAIN,
127
- uint64_t total_sectors;
128
-
129
- if (size % VIRTIO_BLK_SECTOR_SIZE) {
130
- return false;
131
- }
132
-
133
- nb_sectors = size >> VIRTIO_BLK_SECTOR_BITS;
134
-
135
- QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != VIRTIO_BLK_SECTOR_SIZE);
136
- if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
137
- return false;
138
- }
139
- if ((sector << VIRTIO_BLK_SECTOR_BITS) % vexp->blk_size) {
140
- return false;
141
- }
142
- blk_get_geometry(vexp->export.blk, &total_sectors);
143
- if (sector > total_sectors || nb_sectors > total_sectors - sector) {
144
- return false;
145
- }
146
- return true;
147
-}
148
-
149
-static int coroutine_fn
150
-vu_blk_discard_write_zeroes(VuBlkExport *vexp, struct iovec *iov,
151
- uint32_t iovcnt, uint32_t type)
152
-{
153
- BlockBackend *blk = vexp->export.blk;
154
- struct virtio_blk_discard_write_zeroes desc;
155
- ssize_t size;
156
- uint64_t sector;
157
- uint32_t num_sectors;
158
- uint32_t max_sectors;
159
- uint32_t flags;
160
- int bytes;
161
-
162
- /* Only one desc is currently supported */
163
- if (unlikely(iov_size(iov, iovcnt) > sizeof(desc))) {
164
- return VIRTIO_BLK_S_UNSUPP;
165
- }
166
-
167
- size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
168
- if (unlikely(size != sizeof(desc))) {
169
- error_report("Invalid size %zd, expected %zu", size, sizeof(desc));
170
- return VIRTIO_BLK_S_IOERR;
171
- }
172
-
173
- sector = le64_to_cpu(desc.sector);
174
- num_sectors = le32_to_cpu(desc.num_sectors);
175
- flags = le32_to_cpu(desc.flags);
176
- max_sectors = (type == VIRTIO_BLK_T_WRITE_ZEROES) ?
177
- VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS :
178
- VHOST_USER_BLK_MAX_DISCARD_SECTORS;
179
-
180
- /* This check ensures that 'bytes' fits in an int */
181
- if (unlikely(num_sectors > max_sectors)) {
182
- return VIRTIO_BLK_S_IOERR;
183
- }
184
-
185
- bytes = num_sectors << VIRTIO_BLK_SECTOR_BITS;
186
-
187
- if (unlikely(!vu_blk_sect_range_ok(vexp, sector, bytes))) {
188
- return VIRTIO_BLK_S_IOERR;
189
- }
190
-
191
- /*
192
- * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard
193
- * and write zeroes commands if any unknown flag is set.
194
- */
195
- if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
196
- return VIRTIO_BLK_S_UNSUPP;
197
- }
198
-
199
- if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
200
- int blk_flags = 0;
201
-
202
- if (flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
203
- blk_flags |= BDRV_REQ_MAY_UNMAP;
204
- }
205
-
206
- if (blk_co_pwrite_zeroes(blk, sector << VIRTIO_BLK_SECTOR_BITS,
207
- bytes, blk_flags) == 0) {
208
- return VIRTIO_BLK_S_OK;
209
- }
210
- } else if (type == VIRTIO_BLK_T_DISCARD) {
211
- /*
212
- * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for
213
- * discard commands if the unmap flag is set.
214
- */
215
- if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
216
- return VIRTIO_BLK_S_UNSUPP;
217
- }
218
-
219
- if (blk_co_pdiscard(blk, sector << VIRTIO_BLK_SECTOR_BITS,
220
- bytes) == 0) {
221
- return VIRTIO_BLK_S_OK;
222
- }
223
- }
224
-
225
- return VIRTIO_BLK_S_IOERR;
226
-}
227
-
228
/* Called with server refcount increased, must decrease before returning */
229
static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
230
{
231
VuBlkReq *req = opaque;
232
VuServer *server = req->server;
233
VuVirtqElement *elem = &req->elem;
234
- uint32_t type;
235
-
236
VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
237
- BlockBackend *blk = vexp->export.blk;
238
-
239
+ VirtioBlkHandler *handler = &vexp->handler;
240
struct iovec *in_iov = elem->in_sg;
241
struct iovec *out_iov = elem->out_sg;
242
unsigned in_num = elem->in_num;
243
unsigned out_num = elem->out_num;
244
-
245
- /* refer to hw/block/virtio_blk.c */
246
- if (elem->out_num < 1 || elem->in_num < 1) {
247
- error_report("virtio-blk request missing headers");
248
- goto err;
249
- }
250
-
251
- if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
252
- sizeof(req->out)) != sizeof(req->out))) {
253
- error_report("virtio-blk request outhdr too short");
254
- goto err;
255
- }
256
-
257
- iov_discard_front(&out_iov, &out_num, sizeof(req->out));
258
-
259
- if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
260
- error_report("virtio-blk request inhdr too short");
261
- goto err;
262
- }
263
-
264
- req->size = iov_size(in_iov, in_num);
265
- /* We always touch the last byte, so just see how big in_iov is. */
266
- req->in = (void *)in_iov[in_num - 1].iov_base
267
- + in_iov[in_num - 1].iov_len
268
- - sizeof(struct virtio_blk_inhdr);
269
- iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
270
-
271
- type = le32_to_cpu(req->out.type);
272
- switch (type & ~VIRTIO_BLK_T_BARRIER) {
273
- case VIRTIO_BLK_T_IN:
274
- case VIRTIO_BLK_T_OUT: {
275
- QEMUIOVector qiov;
276
- int64_t offset;
277
- ssize_t ret = 0;
278
- bool is_write = type & VIRTIO_BLK_T_OUT;
279
- req->sector_num = le64_to_cpu(req->out.sector);
280
-
281
- if (is_write && !vexp->writable) {
282
- req->in->status = VIRTIO_BLK_S_IOERR;
283
- break;
284
- }
285
-
286
- if (is_write) {
287
- qemu_iovec_init_external(&qiov, out_iov, out_num);
288
- } else {
289
- qemu_iovec_init_external(&qiov, in_iov, in_num);
290
- }
291
-
292
- if (unlikely(!vu_blk_sect_range_ok(vexp,
293
- req->sector_num,
294
- qiov.size))) {
295
- req->in->status = VIRTIO_BLK_S_IOERR;
296
- break;
297
- }
298
-
299
- offset = req->sector_num << VIRTIO_BLK_SECTOR_BITS;
300
-
301
- if (is_write) {
302
- ret = blk_co_pwritev(blk, offset, qiov.size, &qiov, 0);
303
- } else {
304
- ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0);
305
- }
306
- if (ret >= 0) {
307
- req->in->status = VIRTIO_BLK_S_OK;
308
- } else {
309
- req->in->status = VIRTIO_BLK_S_IOERR;
310
- }
311
- break;
312
- }
313
- case VIRTIO_BLK_T_FLUSH:
314
- if (blk_co_flush(blk) == 0) {
315
- req->in->status = VIRTIO_BLK_S_OK;
316
- } else {
317
- req->in->status = VIRTIO_BLK_S_IOERR;
318
- }
319
- break;
320
- case VIRTIO_BLK_T_GET_ID: {
321
- size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
322
- VIRTIO_BLK_ID_BYTES);
323
- snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
324
- req->in->status = VIRTIO_BLK_S_OK;
325
- break;
326
+ int in_len;
327
+
328
+ in_len = virtio_blk_process_req(handler, in_iov, out_iov,
329
+ in_num, out_num);
330
+ if (in_len < 0) {
331
+ free(req);
332
+ vhost_user_server_unref(server);
333
+ return;
334
}
335
- case VIRTIO_BLK_T_DISCARD:
336
- case VIRTIO_BLK_T_WRITE_ZEROES: {
337
- if (!vexp->writable) {
338
- req->in->status = VIRTIO_BLK_S_IOERR;
339
- break;
340
- }
341
-
342
- req->in->status = vu_blk_discard_write_zeroes(vexp, out_iov, out_num,
343
- type);
344
- break;
345
- }
346
- default:
347
- req->in->status = VIRTIO_BLK_S_UNSUPP;
348
- break;
349
- }
350
-
351
- vu_blk_req_complete(req);
352
- vhost_user_server_unref(server);
353
- return;
354
355
-err:
356
- free(req);
357
+ vu_blk_req_complete(req, in_len);
358
vhost_user_server_unref(server);
359
}
360
361
@@ -XXX,XX +XXX,XX @@ static uint64_t vu_blk_get_features(VuDev *dev)
362
1ull << VIRTIO_RING_F_EVENT_IDX |
363
1ull << VHOST_USER_F_PROTOCOL_FEATURES;
364
365
- if (!vexp->writable) {
366
+ if (!vexp->handler.writable) {
367
features |= 1ull << VIRTIO_BLK_F_RO;
368
}
369
370
@@ -XXX,XX +XXX,XX @@ vu_blk_initialize_config(BlockDriverState *bs,
371
config->opt_io_size = cpu_to_le32(1);
372
config->num_queues = cpu_to_le16(num_queues);
373
config->max_discard_sectors =
374
- cpu_to_le32(VHOST_USER_BLK_MAX_DISCARD_SECTORS);
375
+ cpu_to_le32(VIRTIO_BLK_MAX_DISCARD_SECTORS);
376
config->max_discard_seg = cpu_to_le32(1);
377
config->discard_sector_alignment =
378
cpu_to_le32(blk_size >> VIRTIO_BLK_SECTOR_BITS);
379
config->max_write_zeroes_sectors
380
- = cpu_to_le32(VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS);
381
+ = cpu_to_le32(VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS);
382
config->max_write_zeroes_seg = cpu_to_le32(1);
383
}
384
385
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
386
uint64_t logical_block_size;
387
uint16_t num_queues = VHOST_USER_BLK_NUM_QUEUES_DEFAULT;
388
389
- vexp->writable = opts->writable;
390
vexp->blkcfg.wce = 0;
391
392
if (vu_opts->has_logical_block_size) {
393
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
394
error_propagate(errp, local_err);
395
return -EINVAL;
396
}
397
- vexp->blk_size = logical_block_size;
398
399
if (vu_opts->has_num_queues) {
400
num_queues = vu_opts->num_queues;
401
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
402
error_setg(errp, "num-queues must be greater than 0");
403
return -EINVAL;
404
}
405
+ vexp->handler.blk = exp->blk;
406
+ vexp->handler.serial = "vhost_user_blk";
407
+ vexp->handler.logical_block_size = logical_block_size;
408
+ vexp->handler.writable = opts->writable;
409
410
vu_blk_initialize_config(blk_bs(exp->blk), &vexp->blkcfg,
411
logical_block_size, num_queues);
412
diff --git a/block/export/virtio-blk-handler.c b/block/export/virtio-blk-handler.c
413
new file mode 100644
414
index XXXXXXX..XXXXXXX
415
--- /dev/null
416
+++ b/block/export/virtio-blk-handler.c
417
@@ -XXX,XX +XXX,XX @@
418
+/*
419
+ * Handler for virtio-blk I/O
420
+ *
421
+ * Copyright (c) 2020 Red Hat, Inc.
422
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
423
+ *
424
+ * Author:
425
+ * Coiby Xu <coiby.xu@gmail.com>
426
+ * Xie Yongji <xieyongji@bytedance.com>
427
+ *
428
+ * This work is licensed under the terms of the GNU GPL, version 2 or
429
+ * later. See the COPYING file in the top-level directory.
430
+ */
431
+
432
+#include "qemu/osdep.h"
433
+#include "qemu/error-report.h"
434
+#include "virtio-blk-handler.h"
435
+
436
+#include "standard-headers/linux/virtio_blk.h"
437
+
438
+struct virtio_blk_inhdr {
439
+ unsigned char status;
440
+};
35
+};
441
+
36
+
442
+static bool virtio_blk_sect_range_ok(BlockBackend *blk, uint32_t block_size,
37
+static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
443
+ uint64_t sector, size_t size)
444
+{
38
+{
445
+ uint64_t nb_sectors;
39
+ switch (drain_type) {
446
+ uint64_t total_sectors;
40
+ case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break;
447
+
41
+ case BDRV_DRAIN: bdrv_drained_begin(bs); break;
448
+ if (size % VIRTIO_BLK_SECTOR_SIZE) {
42
+ default: g_assert_not_reached();
449
+ return false;
450
+ }
43
+ }
451
+
452
+ nb_sectors = size >> VIRTIO_BLK_SECTOR_BITS;
453
+
454
+ QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != VIRTIO_BLK_SECTOR_SIZE);
455
+ if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
456
+ return false;
457
+ }
458
+ if ((sector << VIRTIO_BLK_SECTOR_BITS) % block_size) {
459
+ return false;
460
+ }
461
+ blk_get_geometry(blk, &total_sectors);
462
+ if (sector > total_sectors || nb_sectors > total_sectors - sector) {
463
+ return false;
464
+ }
465
+ return true;
466
+}
44
+}
467
+
45
+
468
+static int coroutine_fn
46
+static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
469
+virtio_blk_discard_write_zeroes(VirtioBlkHandler *handler, struct iovec *iov,
470
+ uint32_t iovcnt, uint32_t type)
471
+{
47
+{
472
+ BlockBackend *blk = handler->blk;
48
+ switch (drain_type) {
473
+ struct virtio_blk_discard_write_zeroes desc;
49
+ case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break;
474
+ ssize_t size;
50
+ case BDRV_DRAIN: bdrv_drained_end(bs); break;
475
+ uint64_t sector;
51
+ default: g_assert_not_reached();
476
+ uint32_t num_sectors;
477
+ uint32_t max_sectors;
478
+ uint32_t flags;
479
+ int bytes;
480
+
481
+ /* Only one desc is currently supported */
482
+ if (unlikely(iov_size(iov, iovcnt) > sizeof(desc))) {
483
+ return VIRTIO_BLK_S_UNSUPP;
484
+ }
52
+ }
485
+
486
+ size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
487
+ if (unlikely(size != sizeof(desc))) {
488
+ error_report("Invalid size %zd, expected %zu", size, sizeof(desc));
489
+ return VIRTIO_BLK_S_IOERR;
490
+ }
491
+
492
+ sector = le64_to_cpu(desc.sector);
493
+ num_sectors = le32_to_cpu(desc.num_sectors);
494
+ flags = le32_to_cpu(desc.flags);
495
+ max_sectors = (type == VIRTIO_BLK_T_WRITE_ZEROES) ?
496
+ VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS :
497
+ VIRTIO_BLK_MAX_DISCARD_SECTORS;
498
+
499
+ /* This check ensures that 'bytes' fits in an int */
500
+ if (unlikely(num_sectors > max_sectors)) {
501
+ return VIRTIO_BLK_S_IOERR;
502
+ }
503
+
504
+ bytes = num_sectors << VIRTIO_BLK_SECTOR_BITS;
505
+
506
+ if (unlikely(!virtio_blk_sect_range_ok(blk, handler->logical_block_size,
507
+ sector, bytes))) {
508
+ return VIRTIO_BLK_S_IOERR;
509
+ }
510
+
511
+ /*
512
+ * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard
513
+ * and write zeroes commands if any unknown flag is set.
514
+ */
515
+ if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
516
+ return VIRTIO_BLK_S_UNSUPP;
517
+ }
518
+
519
+ if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
520
+ int blk_flags = 0;
521
+
522
+ if (flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
523
+ blk_flags |= BDRV_REQ_MAY_UNMAP;
524
+ }
525
+
526
+ if (blk_co_pwrite_zeroes(blk, sector << VIRTIO_BLK_SECTOR_BITS,
527
+ bytes, blk_flags) == 0) {
528
+ return VIRTIO_BLK_S_OK;
529
+ }
530
+ } else if (type == VIRTIO_BLK_T_DISCARD) {
531
+ /*
532
+ * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for
533
+ * discard commands if the unmap flag is set.
534
+ */
535
+ if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
536
+ return VIRTIO_BLK_S_UNSUPP;
537
+ }
538
+
539
+ if (blk_co_pdiscard(blk, sector << VIRTIO_BLK_SECTOR_BITS,
540
+ bytes) == 0) {
541
+ return VIRTIO_BLK_S_OK;
542
+ }
543
+ }
544
+
545
+ return VIRTIO_BLK_S_IOERR;
546
+}
53
+}
547
+
54
+
548
+int coroutine_fn virtio_blk_process_req(VirtioBlkHandler *handler,
55
+static void test_drv_cb_common(enum drain_type drain_type, bool recursive)
549
+ struct iovec *in_iov,
56
{
550
+ struct iovec *out_iov,
57
BlockBackend *blk;
551
+ unsigned int in_num,
58
- BlockDriverState *bs;
552
+ unsigned int out_num)
59
- BDRVTestState *s;
60
+ BlockDriverState *bs, *backing;
61
+ BDRVTestState *s, *backing_s;
62
BlockAIOCB *acb;
63
int aio_ret;
64
65
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
66
s = bs->opaque;
67
blk_insert_bs(blk, bs, &error_abort);
68
69
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
70
+ backing_s = backing->opaque;
71
+ bdrv_set_backing_hd(bs, backing, &error_abort);
72
+
73
/* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
74
g_assert_cmpint(s->drain_count, ==, 0);
75
- bdrv_drain_all_begin();
76
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
77
+
78
+ do_drain_begin(drain_type, bs);
79
+
80
g_assert_cmpint(s->drain_count, ==, 1);
81
- bdrv_drain_all_end();
82
+ g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
+
+ do_drain_end(drain_type, bs);
+
g_assert_cmpint(s->drain_count, ==, 0);
+ g_assert_cmpint(backing_s->drain_count, ==, 0);

/* Now do the same while a request is pending */
aio_ret = -EINPROGRESS;
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void)
g_assert_cmpint(aio_ret, ==, -EINPROGRESS);

g_assert_cmpint(s->drain_count, ==, 0);
- bdrv_drain_all_begin();
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+ do_drain_begin(drain_type, bs);
+
g_assert_cmpint(aio_ret, ==, 0);
g_assert_cmpint(s->drain_count, ==, 1);
- bdrv_drain_all_end();
+ g_assert_cmpint(backing_s->drain_count, ==, !!recursive);
+
+ do_drain_end(drain_type, bs);
+
g_assert_cmpint(s->drain_count, ==, 0);
+ g_assert_cmpint(backing_s->drain_count, ==, 0);

+ bdrv_unref(backing);
bdrv_unref(bs);
blk_unref(blk);
}

+static void test_drv_cb_drain_all(void)
+{
+ test_drv_cb_common(BDRV_DRAIN_ALL, true);
+}
+
+static void test_drv_cb_drain(void)
+{
+ test_drv_cb_common(BDRV_DRAIN, false);
+}
+
int main(int argc, char **argv)
{
bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
g_test_init(&argc, &argv, NULL);

g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
+ g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);

return g_test_run();
}
--
2.13.6

+{
+ BlockBackend *blk = handler->blk;
+ struct virtio_blk_inhdr *in;
+ struct virtio_blk_outhdr out;
+ uint32_t type;
+ int in_len;
+
+ if (out_num < 1 || in_num < 1) {
+ error_report("virtio-blk request missing headers");
+ return -EINVAL;
+ }
+
+ if (unlikely(iov_to_buf(out_iov, out_num, 0, &out,
+ sizeof(out)) != sizeof(out))) {
+ error_report("virtio-blk request outhdr too short");
+ return -EINVAL;
+ }
+
+ iov_discard_front(&out_iov, &out_num, sizeof(out));
+
+ if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
+ error_report("virtio-blk request inhdr too short");
+ return -EINVAL;
+ }
+
+ /* We always touch the last byte, so just see how big in_iov is. */
+ in_len = iov_size(in_iov, in_num);
+ in = (void *)in_iov[in_num - 1].iov_base
+ + in_iov[in_num - 1].iov_len
+ - sizeof(struct virtio_blk_inhdr);
+ iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
+
+ type = le32_to_cpu(out.type);
+ switch (type & ~VIRTIO_BLK_T_BARRIER) {
+ case VIRTIO_BLK_T_IN:
+ case VIRTIO_BLK_T_OUT: {
+ QEMUIOVector qiov;
+ int64_t offset;
+ ssize_t ret = 0;
+ bool is_write = type & VIRTIO_BLK_T_OUT;
+ int64_t sector_num = le64_to_cpu(out.sector);
+
+ if (is_write && !handler->writable) {
+ in->status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+ if (is_write) {
+ qemu_iovec_init_external(&qiov, out_iov, out_num);
+ } else {
+ qemu_iovec_init_external(&qiov, in_iov, in_num);
+ }
+
+ if (unlikely(!virtio_blk_sect_range_ok(blk,
+ handler->logical_block_size,
+ sector_num, qiov.size))) {
+ in->status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+ offset = sector_num << VIRTIO_BLK_SECTOR_BITS;
+
+ if (is_write) {
+ ret = blk_co_pwritev(blk, offset, qiov.size, &qiov, 0);
+ } else {
+ ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0);
+ }
+ if (ret >= 0) {
+ in->status = VIRTIO_BLK_S_OK;
+ } else {
+ in->status = VIRTIO_BLK_S_IOERR;
+ }
+ break;
+ }
+ case VIRTIO_BLK_T_FLUSH:
+ if (blk_co_flush(blk) == 0) {
+ in->status = VIRTIO_BLK_S_OK;
+ } else {
+ in->status = VIRTIO_BLK_S_IOERR;
+ }
+ break;
+ case VIRTIO_BLK_T_GET_ID: {
+ size_t size = MIN(strlen(handler->serial) + 1,
+ MIN(iov_size(in_iov, in_num),
+ VIRTIO_BLK_ID_BYTES));
+ iov_from_buf(in_iov, in_num, 0, handler->serial, size);
+ in->status = VIRTIO_BLK_S_OK;
+ break;
+ }
+ case VIRTIO_BLK_T_DISCARD:
+ case VIRTIO_BLK_T_WRITE_ZEROES:
+ if (!handler->writable) {
+ in->status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+ in->status = virtio_blk_discard_write_zeroes(handler, out_iov,
+ out_num, type);
+ break;
+ default:
+ in->status = VIRTIO_BLK_S_UNSUPP;
+ break;
+ }
+
+ return in_len;
+}
diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ M: Coiby Xu <Coiby.Xu@gmail.com>
S: Maintained
F: block/export/vhost-user-blk-server.c
F: block/export/vhost-user-blk-server.h
+F: block/export/virtio-blk-handler.c
+F: block/export/virtio-blk-handler.h
F: include/qemu/vhost-user-server.h
F: tests/qtest/libqos/vhost-user-blk.c
F: tests/qtest/libqos/vhost-user-blk.h
diff --git a/block/export/meson.build b/block/export/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/block/export/meson.build
+++ b/block/export/meson.build
@@ -XXX,XX +XXX,XX @@
blockdev_ss.add(files('export.c'))

if have_vhost_user_blk_server
- blockdev_ss.add(files('vhost-user-blk-server.c'))
+ blockdev_ss.add(files('vhost-user-blk-server.c', 'virtio-blk-handler.c'))
endif

blockdev_ss.add(when: fuse, if_true: files('fuse.c'))
--
2.35.3
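As an aside, the virtio-blk handler above locates the request status byte at the tail of the guest-supplied in iovec before trimming the inhdr off with iov_discard_back(). A minimal sketch of that pointer arithmetic, with the QEMU iovec and virtio types reduced to illustrative stand-ins (iovec_stub/inhdr_stub are not names from the patch):

    #include <stddef.h>
    #include <stdint.h>

    struct iovec_stub { void *iov_base; size_t iov_len; };
    struct inhdr_stub { uint8_t status; };

    /*
     * The status byte sits in the last sizeof(inhdr) bytes of the final
     * in_iov element, so it remains addressable after the inhdr has been
     * trimmed off the iovec.
     */
    static struct inhdr_stub *locate_inhdr(struct iovec_stub *in_iov,
                                           unsigned int in_num)
    {
        return (struct inhdr_stub *)((uint8_t *)in_iov[in_num - 1].iov_base
                                     + in_iov[in_num - 1].iov_len
                                     - sizeof(struct inhdr_stub));
    }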
From: Eric Blake <eblake@redhat.com>

CID 1488362 points out that the second 'rc >= 0' check is now dead
code.

Reported-by: Peter Maydell <peter.maydell@linaro.org>
Fixes: 172f5f1a40(nbd: remove peppering of nbd_client_connected)
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20220516210519.76135-1-eblake@redhat.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/nbd.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nbd_co_send_request(BlockDriverState *bs,
if (qiov) {
qio_channel_set_cork(s->ioc, true);
rc = nbd_send_request(s->ioc, request);
- if (rc >= 0) {
- if (qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov,
- NULL) < 0) {
- rc = -EIO;
- }
- } else if (rc >= 0) {
+ if (rc >= 0 && qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov,
+ NULL) < 0) {
rc = -EIO;
}
qio_channel_set_cork(s->ioc, false);
--
2.35.3

This is currently only working correctly for bdrv_drain(), not for
bdrv_drain_all(). Leave a comment for the drain_all case, we'll address
it later.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
tests/test-bdrv-drain.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 45 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
test_drv_cb_common(BDRV_DRAIN, false);
}

+static void test_quiesce_common(enum drain_type drain_type, bool recursive)
+{
+ BlockBackend *blk;
+ BlockDriverState *bs, *backing;
+
+ blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+ bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+ &error_abort);
+ blk_insert_bs(blk, bs, &error_abort);
+
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+ bdrv_set_backing_hd(bs, backing, &error_abort);
+
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
+
+ do_drain_begin(drain_type, bs);
+
+ g_assert_cmpint(bs->quiesce_counter, ==, 1);
+ g_assert_cmpint(backing->quiesce_counter, ==, !!recursive);
+
+ do_drain_end(drain_type, bs);
+
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
+
+ bdrv_unref(backing);
+ bdrv_unref(bs);
+ blk_unref(blk);
+}
+
+static void test_quiesce_drain_all(void)
+{
+ // XXX drain_all doesn't quiesce
+ //test_quiesce_common(BDRV_DRAIN_ALL, true);
+}
+
+static void test_quiesce_drain(void)
+{
+ test_quiesce_common(BDRV_DRAIN, false);
+}
+
int main(int argc, char **argv)
{
bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);

+ g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
+ g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
+
return g_test_run();
}
--
2.13.6
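For illustration, the shape of the simplification Coverity prompted in the nbd patch above, with qio_channel_writev_all() replaced by a hypothetical send_payload() stand-in (a sketch, not the actual QEMU code):

    #include <errno.h>

    static int send_payload(void) { return 0; }   /* stand-in helper */

    int before(int rc)
    {
        if (rc >= 0) {
            if (send_payload() < 0) {
                rc = -EIO;
            }
        } else if (rc >= 0) {   /* dead: rc >= 0 is already false here */
            rc = -EIO;
        }
        return rc;
    }

    int after(int rc)
    {
        /* One combined condition, no unreachable branch */
        if (rc >= 0 && send_payload() < 0) {
            rc = -EIO;
        }
        return rc;
    }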
From: Xie Yongji <xieyongji@bytedance.com>

Add a 'serial' option to allow the user to specify this value
explicitly. The default value is changed to an empty string,
matching what we do in "hw/block/virtio-blk.c".

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Message-Id: <20220614051532.92-6-xieyongji@bytedance.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
qapi/block-export.json | 4 +++-
docs/tools/qemu-storage-daemon.rst | 2 +-
block/export/virtio-blk-handler.h | 2 +-
block/export/vduse-blk.c | 20 ++++++++++++++------
block/export/vhost-user-blk-server.c | 4 +++-
storage-daemon/qemu-storage-daemon.c | 1 +
6 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/qapi/block-export.json b/qapi/block-export.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-export.json
+++ b/qapi/block-export.json
@@ -XXX,XX +XXX,XX @@
# @queue-size: the size of virtqueue. Defaults to 256.
# @logical-block-size: Logical block size in bytes. Range [512, PAGE_SIZE]
# and must be power of 2. Defaults to 512 bytes.
+# @serial: the serial number of virtio block device. Defaults to empty string.
#
# Since: 7.1
##
{ 'struct': 'BlockExportOptionsVduseBlk',
'data': { '*num-queues': 'uint16',
'*queue-size': 'uint16',
- '*logical-block-size': 'size'} }
+ '*logical-block-size': 'size',
+ '*serial': 'str' } }

##
# @NbdServerAddOptions:
diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
index XXXXXXX..XXXXXXX 100644
--- a/docs/tools/qemu-storage-daemon.rst
+++ b/docs/tools/qemu-storage-daemon.rst
@@ -XXX,XX +XXX,XX @@ Standard options:
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
--export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
- --export [type=]vduse-blk,id=<id>,node-name=<node-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>]
+ --export [type=]vduse-blk,id=<id>,node-name=<node-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]

is a block export definition. ``node-name`` is the block node that should be
exported. ``writable`` determines whether or not the export allows write
diff --git a/block/export/virtio-blk-handler.h b/block/export/virtio-blk-handler.h
index XXXXXXX..XXXXXXX 100644
--- a/block/export/virtio-blk-handler.h
+++ b/block/export/virtio-blk-handler.h
@@ -XXX,XX +XXX,XX @@

typedef struct {
BlockBackend *blk;
- const char *serial;
+ char *serial;
uint32_t logical_block_size;
bool writable;
} VirtioBlkHandler;
diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/block/export/vduse-blk.c
+++ b/block/export/vduse-blk.c
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
Error *local_err = NULL;
struct virtio_blk_config config = { 0 };
uint64_t features;
- int i;
+ int i, ret;

if (vblk_opts->has_num_queues) {
num_queues = vblk_opts->num_queues;
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
}
vblk_exp->num_queues = num_queues;
vblk_exp->handler.blk = exp->blk;
- vblk_exp->handler.serial = exp->id;
+ vblk_exp->handler.serial = g_strdup(vblk_opts->has_serial ?
+ vblk_opts->serial : "");
vblk_exp->handler.logical_block_size = logical_block_size;
vblk_exp->handler.writable = opts->writable;

@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
vblk_exp);
if (!vblk_exp->dev) {
error_setg(errp, "failed to create vduse device");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto err_dev;
}

vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s",
g_get_tmp_dir(), exp->id);
if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) {
error_setg(errp, "failed to set reconnect log file");
- vduse_dev_destroy(vblk_exp->dev);
- g_free(vblk_exp->recon_file);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}

for (i = 0; i < num_queues; i++) {
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
blk_set_dev_ops(exp->blk, &vduse_block_ops, exp);

return 0;
+err:
+ vduse_dev_destroy(vblk_exp->dev);
+ g_free(vblk_exp->recon_file);
+err_dev:
+ g_free(vblk_exp->handler.serial);
+ return ret;
}

static void vduse_blk_exp_delete(BlockExport *exp)
@@ -XXX,XX +XXX,XX @@ static void vduse_blk_exp_delete(BlockExport *exp)
unlink(vblk_exp->recon_file);
}
g_free(vblk_exp->recon_file);
+ g_free(vblk_exp->handler.serial);
}

static void vduse_blk_exp_request_shutdown(BlockExport *exp)
diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
index XXXXXXX..XXXXXXX 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
return -EINVAL;
}
vexp->handler.blk = exp->blk;
- vexp->handler.serial = "vhost_user_blk";
+ vexp->handler.serial = g_strdup("vhost_user_blk");
vexp->handler.logical_block_size = logical_block_size;
vexp->handler.writable = opts->writable;

@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
num_queues, &vu_blk_iface, errp)) {
blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
blk_aio_detach, vexp);
+ g_free(vexp->handler.serial);
return -EADDRNOTAVAIL;
}

@@ -XXX,XX +XXX,XX @@ static void vu_blk_exp_delete(BlockExport *exp)

blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
vexp);
+ g_free(vexp->handler.serial);
}

const BlockExportDriver blk_exp_vhost_user_blk = {
diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
index XXXXXXX..XXXXXXX 100644
--- a/storage-daemon/qemu-storage-daemon.c
+++ b/storage-daemon/qemu-storage-daemon.c
@@ -XXX,XX +XXX,XX @@ static void help(void)
" [,writable=on|off][,num-queues=<num-queues>]\n"
" [,queue-size=<queue-size>]\n"
" [,logical-block-size=<logical-block-size>]\n"
+" [,serial=<serial-number>]\n"
" export the specified block node as a vduse-blk\n"
" device using the id as the VDUSE device name\n"
"\n"
--
2.35.3

Block jobs already paused themselves when their main BlockBackend
entered a drained section. This is not good enough: We also want to
pause a block job and may not submit new requests if, for example, the
mirror target node should be drained.

This implements .drained_begin/end callbacks in child_job in order to
consider all block nodes related to the job, and removes the
BlockBackend callbacks which are unnecessary now because the root of the
job main BlockBackend is always referenced with a child_job, too.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
blockjob.c | 22 +++++++++-------------
1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index XXXXXXX..XXXXXXX 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -XXX,XX +XXX,XX @@ static char *child_job_get_parent_desc(BdrvChild *c)
job->id);
}

-static const BdrvChildRole child_job = {
- .get_parent_desc = child_job_get_parent_desc,
- .stay_at_node = true,
-};
-
-static void block_job_drained_begin(void *opaque)
+static void child_job_drained_begin(BdrvChild *c)
{
- BlockJob *job = opaque;
+ BlockJob *job = c->opaque;
block_job_pause(job);
}

-static void block_job_drained_end(void *opaque)
+static void child_job_drained_end(BdrvChild *c)
{
- BlockJob *job = opaque;
+ BlockJob *job = c->opaque;
block_job_resume(job);
}

-static const BlockDevOps block_job_dev_ops = {
- .drained_begin = block_job_drained_begin,
- .drained_end = block_job_drained_end,
+static const BdrvChildRole child_job = {
+ .get_parent_desc = child_job_get_parent_desc,
+ .drained_begin = child_job_drained_begin,
+ .drained_end = child_job_drained_end,
+ .stay_at_node = true,
};

void block_job_remove_all_bdrv(BlockJob *job)
@@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort);
bs->job = job;

- blk_set_dev_ops(blk, &block_job_dev_ops, job);
bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);

QLIST_INSERT_HEAD(&block_jobs, job, job_list);
--
2.13.6
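The blockjob patch above moves the pause/resume hooks from BlockDevOps to the child role so that draining any node the job is attached to pauses it. A minimal sketch of that callback pattern, with the QEMU types reduced to illustrative stand-ins (Job, Child, ChildRole are not the real QEMU structs):

    #include <stdio.h>

    typedef struct Job { int pause_count; } Job;

    typedef struct Child {
        void *opaque;                      /* points at the owning Job */
    } Child;

    typedef struct ChildRole {
        void (*drained_begin)(struct Child *c);
        void (*drained_end)(struct Child *c);
    } ChildRole;

    static void job_drained_begin(Child *c)
    {
        Job *job = c->opaque;
        job->pause_count++;                /* block_job_pause() in the patch */
    }

    static void job_drained_end(Child *c)
    {
        Job *job = c->opaque;
        job->pause_count--;                /* block_job_resume() in the patch */
    }

    static const ChildRole child_job_role = {
        .drained_begin = job_drained_begin,
        .drained_end   = job_drained_end,
    };

    int main(void)
    {
        Job job = {0};
        Child main_node = { .opaque = &job };
        Child target_node = { .opaque = &job };

        /* Draining either node pauses the job once per drained child */
        child_job_role.drained_begin(&main_node);
        child_job_role.drained_begin(&target_node);
        printf("pause_count=%d\n", job.pause_count);   /* prints 2 */
        child_job_role.drained_end(&target_node);
        child_job_role.drained_end(&main_node);
        return 0;
    }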
From: Xie Yongji <xieyongji@bytedance.com>

To support reconnecting after restart or crash, the VDUSE backend
might need to resubmit inflight I/Os. This stores metadata such as
the indices of inflight I/O descriptors in a shm file so that the
VDUSE backend can restore them while reconnecting.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Message-Id: <20220523084611.91-9-xieyongji@bytedance.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
subprojects/libvduse/libvduse.h | 12 ++
block/export/vduse-blk.c | 19 ++-
subprojects/libvduse/libvduse.c | 235 +++++++++++++++++++++++++++++++-
3 files changed, 260 insertions(+), 6 deletions(-)

diff --git a/subprojects/libvduse/libvduse.h b/subprojects/libvduse/libvduse.h
index XXXXXXX..XXXXXXX 100644
--- a/subprojects/libvduse/libvduse.h
+++ b/subprojects/libvduse/libvduse.h
@@ -XXX,XX +XXX,XX @@ int vduse_dev_update_config(VduseDev *dev, uint32_t size,
*/
int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size);

+/**
+ * vduse_set_reconnect_log_file:
+ * @dev: VDUSE device
+ * @file: filename of reconnect log
+ *
+ * Specify the file to store log for reconnecting. It should
+ * be called before vduse_dev_setup_queue().
+ *
+ * Returns: 0 on success, -errno on failure.
+ */
+int vduse_set_reconnect_log_file(VduseDev *dev, const char *filename);
+
/**
* vduse_dev_create_by_fd:
* @fd: passed file descriptor
diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/block/export/vduse-blk.c
+++ b/block/export/vduse-blk.c
@@ -XXX,XX +XXX,XX @@ typedef struct VduseBlkExport {
VirtioBlkHandler handler;
VduseDev *dev;
uint16_t num_queues;
+ char *recon_file;
unsigned int inflight;
} VduseBlkExport;

@@ -XXX,XX +XXX,XX @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)

aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
true, on_vduse_vq_kick, NULL, NULL, NULL, vq);
+ /* Make sure we don't miss any kick after reconnecting */
+ eventfd_write(vduse_queue_get_fd(vq), 1);
}

static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
return -ENOMEM;
}

+ vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s",
+ g_get_tmp_dir(), exp->id);
+ if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) {
+ error_setg(errp, "failed to set reconnect log file");
+ vduse_dev_destroy(vblk_exp->dev);
+ g_free(vblk_exp->recon_file);
+ return -EINVAL;
+ }
+
for (i = 0; i < num_queues; i++) {
vduse_dev_setup_queue(vblk_exp->dev, i, queue_size);
}
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
static void vduse_blk_exp_delete(BlockExport *exp)
{
VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+ int ret;

blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
vblk_exp);
blk_set_dev_ops(exp->blk, NULL, NULL);
- vduse_dev_destroy(vblk_exp->dev);
+ ret = vduse_dev_destroy(vblk_exp->dev);
+ if (ret != -EBUSY) {
+ unlink(vblk_exp->recon_file);
+ }
+ g_free(vblk_exp->recon_file);
}

static void vduse_blk_exp_request_shutdown(BlockExport *exp)
diff --git a/subprojects/libvduse/libvduse.c b/subprojects/libvduse/libvduse.c
index XXXXXXX..XXXXXXX 100644
--- a/subprojects/libvduse/libvduse.c
+++ b/subprojects/libvduse/libvduse.c
@@ -XXX,XX +XXX,XX @@
#define VDUSE_VQ_ALIGN 4096
#define MAX_IOVA_REGIONS 256

+#define LOG_ALIGNMENT 64
+
/* Round number down to multiple */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

@@ -XXX,XX +XXX,XX @@
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif

+typedef struct VduseDescStateSplit {
+ uint8_t inflight;
+ uint8_t padding[5];
+ uint16_t next;
+ uint64_t counter;
+} VduseDescStateSplit;
+
+typedef struct VduseVirtqLogInflight {
+ uint64_t features;
+ uint16_t version;
+ uint16_t desc_num;
+ uint16_t last_batch_head;
+ uint16_t used_idx;
+ VduseDescStateSplit desc[];
+} VduseVirtqLogInflight;
+
+typedef struct VduseVirtqLog {
+ VduseVirtqLogInflight inflight;
+} VduseVirtqLog;
+
+typedef struct VduseVirtqInflightDesc {
+ uint16_t index;
+ uint64_t counter;
+} VduseVirtqInflightDesc;
+
typedef struct VduseRing {
unsigned int num;
uint64_t desc_addr;
@@ -XXX,XX +XXX,XX @@ struct VduseVirtq {
bool ready;
int fd;
VduseDev *dev;
+ VduseVirtqInflightDesc *resubmit_list;
+ uint16_t resubmit_num;
+ uint64_t counter;
+ VduseVirtqLog *log;
};

typedef struct VduseIovaRegion {
@@ -XXX,XX +XXX,XX @@ struct VduseDev {
int fd;
int ctrl_fd;
void *priv;
+ void *log;
};

+static inline size_t vduse_vq_log_size(uint16_t queue_size)
+{
+ return ALIGN_UP(sizeof(VduseDescStateSplit) * queue_size +
+ sizeof(VduseVirtqLogInflight), LOG_ALIGNMENT);
+}
+
+static void *vduse_log_get(const char *filename, size_t size)
+{
+ void *ptr = MAP_FAILED;
+ int fd;
+
+ fd = open(filename, O_RDWR | O_CREAT, 0600);
+ if (fd == -1) {
+ return MAP_FAILED;
+ }
+
+ if (ftruncate(fd, size) == -1) {
+ goto out;
+ }
+
+ ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+out:
+ close(fd);
+ return ptr;
+}
+
static inline bool has_feature(uint64_t features, unsigned int fbit)
{
assert(fbit < 64);
@@ -XXX,XX +XXX,XX @@ static int vduse_inject_irq(VduseDev *dev, int index)
return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
}

+static int inflight_desc_compare(const void *a, const void *b)
+{
+ VduseVirtqInflightDesc *desc0 = (VduseVirtqInflightDesc *)a,
+ *desc1 = (VduseVirtqInflightDesc *)b;
+
+ if (desc1->counter > desc0->counter &&
+ (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
+ return 1;
+ }
+
+ return -1;
+}
+
+static int vduse_queue_check_inflights(VduseVirtq *vq)
+{
+ int i = 0;
+ VduseDev *dev = vq->dev;
+
+ vq->used_idx = le16toh(vq->vring.used->idx);
+ vq->resubmit_num = 0;
+ vq->resubmit_list = NULL;
+ vq->counter = 0;
+
+ if (unlikely(vq->log->inflight.used_idx != vq->used_idx)) {
+ if (vq->log->inflight.last_batch_head > VIRTQUEUE_MAX_SIZE) {
+ return -1;
+ }
+
+ vq->log->inflight.desc[vq->log->inflight.last_batch_head].inflight = 0;
+
+ barrier();
+
+ vq->log->inflight.used_idx = vq->used_idx;
+ }
+
+ for (i = 0; i < vq->log->inflight.desc_num; i++) {
+ if (vq->log->inflight.desc[i].inflight == 1) {
+ vq->inuse++;
+ }
+ }
+
+ vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
+
+ if (vq->inuse) {
+ vq->resubmit_list = calloc(vq->inuse, sizeof(VduseVirtqInflightDesc));
+ if (!vq->resubmit_list) {
+ return -1;
+ }
+
+ for (i = 0; i < vq->log->inflight.desc_num; i++) {
+ if (vq->log->inflight.desc[i].inflight) {
+ vq->resubmit_list[vq->resubmit_num].index = i;
+ vq->resubmit_list[vq->resubmit_num].counter =
+ vq->log->inflight.desc[i].counter;
+ vq->resubmit_num++;
+ }
+ }
+
+ if (vq->resubmit_num > 1) {
+ qsort(vq->resubmit_list, vq->resubmit_num,
+ sizeof(VduseVirtqInflightDesc), inflight_desc_compare);
+ }
+ vq->counter = vq->resubmit_list[0].counter + 1;
+ }
+
+ vduse_inject_irq(dev, vq->index);
+
+ return 0;
+}
+
+static int vduse_queue_inflight_get(VduseVirtq *vq, int desc_idx)
+{
+ vq->log->inflight.desc[desc_idx].counter = vq->counter++;
+
+ barrier();
+
+ vq->log->inflight.desc[desc_idx].inflight = 1;
+
+ return 0;
+}
+
+static int vduse_queue_inflight_pre_put(VduseVirtq *vq, int desc_idx)
+{
+ vq->log->inflight.last_batch_head = desc_idx;
+
+ return 0;
+}
+
+static int vduse_queue_inflight_post_put(VduseVirtq *vq, int desc_idx)
+{
+ vq->log->inflight.desc[desc_idx].inflight = 0;
+
+ barrier();
+
+ vq->log->inflight.used_idx = vq->used_idx;
+
+ return 0;
+}
+
static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
uint64_t last)
{
@@ -XXX,XX +XXX,XX @@ void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
unsigned int head;
VduseVirtqElement *elem;
VduseDev *dev = vq->dev;
+ int i;

if (unlikely(!vq->vring.avail)) {
return NULL;
}

+ if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
+ i = (--vq->resubmit_num);
+ elem = vduse_queue_map_desc(vq, vq->resubmit_list[i].index, sz);
+
+ if (!vq->resubmit_num) {
+ free(vq->resubmit_list);
+ vq->resubmit_list = NULL;
+ }
+
+ return elem;
+ }
+
if (vduse_queue_empty(vq)) {
return NULL;
}
@@ -XXX,XX +XXX,XX @@ void *vduse_queue_pop(VduseVirtq *vq, size_t sz)

vq->inuse++;

+ vduse_queue_inflight_get(vq, head);
+
return elem;
}

@@ -XXX,XX +XXX,XX @@ void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
unsigned int len)
{
vduse_queue_fill(vq, elem, len, 0);
+ vduse_queue_inflight_pre_put(vq, elem->index);
vduse_queue_flush(vq, 1);
+ vduse_queue_inflight_post_put(vq, elem->index);
}

static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
@@ -XXX,XX +XXX,XX @@ static void vduse_queue_enable(VduseVirtq *vq)
}

vq->fd = fd;
- vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index;
- vq->inuse = 0;
- vq->used_idx = 0;
vq->signalled_used_valid = false;
vq->ready = true;

+ if (vduse_queue_check_inflights(vq)) {
+ fprintf(stderr, "Failed to check inflights for vq[%d]\n", vq->index);
+ close(fd);
+ return;
+ }
+
dev->ops->enable_queue(dev, vq);
}

@@ -XXX,XX +XXX,XX @@ static void vduse_dev_start_dataplane(VduseDev *dev)

static void vduse_dev_stop_dataplane(VduseDev *dev)
{
+ size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
int i;

for (i = 0; i < dev->num_queues; i++) {
vduse_queue_disable(&dev->vqs[i]);
}
+ if (dev->log) {
+ memset(dev->log, 0, log_size);
+ }
dev->features = 0;
vduse_iova_remove_region(dev, 0, ULONG_MAX);
}
@@ -XXX,XX +XXX,XX @@ int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
return -errno;
}

+ vduse_queue_enable(vq);
+
+ return 0;
+}
+
+int vduse_set_reconnect_log_file(VduseDev *dev, const char *filename)
+{
+
+ size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
+ void *log;
+ int i;
+
+ dev->log = log = vduse_log_get(filename, log_size);
+ if (log == MAP_FAILED) {
+ fprintf(stderr, "Failed to get vduse log\n");
+ return -EINVAL;
+ }
+
+ for (i = 0; i < dev->num_queues; i++) {
+ dev->vqs[i].log = log;
+ dev->vqs[i].log->inflight.desc_num = VIRTQUEUE_MAX_SIZE;
+ log = (void *)((char *)log + vduse_vq_log_size(VIRTQUEUE_MAX_SIZE));
+ }
+
return 0;
}

@@ -XXX,XX +XXX,XX @@ static int vduse_dev_init(VduseDev *dev, const char *name,
return -errno;
}

+ if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
+ fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
+ close(fd);
+ return -errno;
+ }
+
dev_name = strdup(name);
if (!dev_name) {
close(fd);
@@ -XXX,XX +XXX,XX @@ VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
return NULL;
}

+ if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
+ fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
+ free(dev);
+ return NULL;
+ }
+
ret = vduse_dev_init_vqs(dev, num_queues);
if (ret) {
fprintf(stderr, "Failed to init vqs\n");
@@ -XXX,XX +XXX,XX @@ VduseDev *vduse_dev_create(const char *name, uint32_t device_id,

ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
free(dev_config);
- if (ret < 0) {
+ if (ret && errno != EEXIST) {
fprintf(stderr, "Failed to create vduse device %s: %s\n",
name, strerror(errno));
goto err_dev;
@@ -XXX,XX +XXX,XX @@ err_ctrl:

int vduse_dev_destroy(VduseDev *dev)
{
- int ret = 0;
+ size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
+ int i, ret = 0;

+ if (dev->log) {
+ munmap(dev->log, log_size);
+ }
+ for (i = 0; i < dev->num_queues; i++) {
+ free(dev->vqs[i].resubmit_list);
+ }
free(dev->vqs);
if (dev->fd >= 0) {
close(dev->fd);
--
2.35.3

Block jobs must be paused if any of the involved nodes are drained.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
tests/test-bdrv-drain.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 121 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@

#include "qemu/osdep.h"
#include "block/block.h"
+#include "block/blockjob_int.h"
#include "sysemu/block-backend.h"
#include "qapi/error.h"

@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
test_quiesce_common(BDRV_DRAIN, false);
}

+
+typedef struct TestBlockJob {
+ BlockJob common;
+ bool should_complete;
+} TestBlockJob;
+
+static void test_job_completed(BlockJob *job, void *opaque)
+{
+ block_job_completed(job, 0);
+}
+
+static void coroutine_fn test_job_start(void *opaque)
+{
+ TestBlockJob *s = opaque;
+
+ while (!s->should_complete) {
+ block_job_sleep_ns(&s->common, 100000);
+ }
+
+ block_job_defer_to_main_loop(&s->common, test_job_completed, NULL);
+}
+
+static void test_job_complete(BlockJob *job, Error **errp)
+{
+ TestBlockJob *s = container_of(job, TestBlockJob, common);
+ s->should_complete = true;
+}
+
+BlockJobDriver test_job_driver = {
+ .instance_size = sizeof(TestBlockJob),
+ .start = test_job_start,
+ .complete = test_job_complete,
+};
+
+static void test_blockjob_common(enum drain_type drain_type)
+{
+ BlockBackend *blk_src, *blk_target;
+ BlockDriverState *src, *target;
+ BlockJob *job;
+ int ret;
+
+ src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR,
+ &error_abort);
+ blk_src = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+ blk_insert_bs(blk_src, src, &error_abort);
+
+ target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR,
+ &error_abort);
+ blk_target = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+ blk_insert_bs(blk_target, target, &error_abort);
+
+ job = block_job_create("job0", &test_job_driver, src, 0, BLK_PERM_ALL, 0,
+ 0, NULL, NULL, &error_abort);
+ block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort);
+ block_job_start(job);
+
+ g_assert_cmpint(job->pause_count, ==, 0);
+ g_assert_false(job->paused);
+ g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+ do_drain_begin(drain_type, src);
+
+ if (drain_type == BDRV_DRAIN_ALL) {
+ /* bdrv_drain_all() drains both src and target, and involves an
+ * additional block_job_pause_all() */
+ g_assert_cmpint(job->pause_count, ==, 3);
+ } else {
+ g_assert_cmpint(job->pause_count, ==, 1);
+ }
+ /* XXX We don't wait until the job is actually paused. Is this okay? */
+ /* g_assert_true(job->paused); */
+ g_assert_false(job->busy); /* The job is paused */
+
+ do_drain_end(drain_type, src);
+
+ g_assert_cmpint(job->pause_count, ==, 0);
+ g_assert_false(job->paused);
+ g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+ do_drain_begin(drain_type, target);
+
+ if (drain_type == BDRV_DRAIN_ALL) {
+ /* bdrv_drain_all() drains both src and target, and involves an
+ * additional block_job_pause_all() */
+ g_assert_cmpint(job->pause_count, ==, 3);
+ } else {
+ g_assert_cmpint(job->pause_count, ==, 1);
+ }
+ /* XXX We don't wait until the job is actually paused. Is this okay? */
+ /* g_assert_true(job->paused); */
+ g_assert_false(job->busy); /* The job is paused */
+
+ do_drain_end(drain_type, target);
+
+ g_assert_cmpint(job->pause_count, ==, 0);
+ g_assert_false(job->paused);
+ g_assert_false(job->busy); /* We're in block_job_sleep_ns() */
+
+ ret = block_job_complete_sync(job, &error_abort);
+ g_assert_cmpint(ret, ==, 0);
+
+ blk_unref(blk_src);
+ blk_unref(blk_target);
+ bdrv_unref(src);
+ bdrv_unref(target);
+}
+
+static void test_blockjob_drain_all(void)
+{
+ test_blockjob_common(BDRV_DRAIN_ALL);
+}
+
+static void test_blockjob_drain(void)
+{
+ test_blockjob_common(BDRV_DRAIN);
+}
+
int main(int argc, char **argv)
{
bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);

+ g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
+ g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
+
return g_test_run();
}
--
2.13.6
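The libvduse reconnect patch above relies on a strict write ordering into the shared-memory inflight log so that a crashed backend can tell which descriptors were still in flight. A condensed sketch of that ordering, with the libvduse structs simplified to illustrative stand-ins (DescState/InflightLog are not the real type names):

    #include <stdint.h>

    #define barrier() __asm__ __volatile__("" ::: "memory")

    typedef struct DescState {
        uint8_t inflight;
        uint64_t counter;   /* global submit order, used to sort on resubmit */
    } DescState;

    typedef struct InflightLog {
        uint16_t last_batch_head;
        uint16_t used_idx;
        DescState desc[256];
    } InflightLog;

    /* On submit: record the counter first, then mark the slot inflight */
    static void log_get(InflightLog *log, uint64_t *counter, int idx)
    {
        log->desc[idx].counter = (*counter)++;
        barrier();
        log->desc[idx].inflight = 1;
    }

    /* On completion: remember which slot is being retired before the used
     * ring is flushed, clear it afterwards, then publish the new used_idx */
    static void log_put(InflightLog *log, int idx, uint16_t new_used_idx)
    {
        log->last_batch_head = idx;   /* pre_put, before the used-ring flush */
        /* ... the used element would be flushed to the guest here ... */
        log->desc[idx].inflight = 0;  /* post_put */
        barrier();
        log->used_idx = new_used_idx;
    }

If a crash lands between the two halves of log_put(), vduse_queue_check_inflights() sees used_idx disagree with the log and uses last_batch_head to retire the half-completed descriptor, which is exactly the recovery case handled in the patch.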
From: Stefan Hajnoczi <stefanha@redhat.com>

Document vduse-blk exports in qemu-storage-daemon --help and the
qemu-storage-daemon(1) man page.

Based-on: <20220523084611.91-1-xieyongji@bytedance.com>
Cc: Xie Yongji <xieyongji@bytedance.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20220525121947.859820-1-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
docs/tools/qemu-storage-daemon.rst | 21 +++++++++++++++++++++
storage-daemon/qemu-storage-daemon.c | 9 +++++++++
2 files changed, 30 insertions(+)

diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
index XXXXXXX..XXXXXXX 100644
--- a/docs/tools/qemu-storage-daemon.rst
+++ b/docs/tools/qemu-storage-daemon.rst
@@ -XXX,XX +XXX,XX @@ Standard options:
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
--export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
+ --export [type=]vduse-blk,id=<id>,node-name=<node-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>]

is a block export definition. ``node-name`` is the block node that should be
exported. ``writable`` determines whether or not the export allows write
@@ -XXX,XX +XXX,XX @@ Standard options:
``allow-other`` to auto (the default) will try enabling this option, and on
error fall back to disabling it.

+ The ``vduse-blk`` export type uses the ``id`` as the VDUSE device name.
+ ``num-queues`` sets the number of virtqueues (the default is 1).
+ ``queue-size`` sets the virtqueue descriptor table size (the default is 256).
+
+ The instantiated VDUSE device must then be added to the vDPA bus using the
+ vdpa(8) command from the iproute2 project::
+
+ # vdpa dev add name <id> mgmtdev vduse
+
+ The device can be removed from the vDPA bus later as follows::
+
+ # vdpa dev del <id>
+
+ For more information about attaching vDPA devices to the host with
+ virtio_vdpa.ko or attaching them to guests with vhost_vdpa.ko, see
+ https://vdpa-dev.gitlab.io/.
+
+ For more information about VDUSE, see
+ https://docs.kernel.org/userspace-api/vduse.html.
+
.. option:: --monitor MONITORDEF

is a QMP monitor definition. See the :manpage:`qemu(1)` manual page for
diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
index XXXXXXX..XXXXXXX 100644
--- a/storage-daemon/qemu-storage-daemon.c
+++ b/storage-daemon/qemu-storage-daemon.c
@@ -XXX,XX +XXX,XX @@ static void help(void)
" vhost-user-blk device over file descriptor\n"
"\n"
#endif /* CONFIG_VHOST_USER_BLK_SERVER */
+#ifdef CONFIG_VDUSE_BLK_EXPORT
+" --export [type=]vduse-blk,id=<id>,node-name=<node-name>\n"
+" [,writable=on|off][,num-queues=<num-queues>]\n"
+" [,queue-size=<queue-size>]\n"
+" [,logical-block-size=<logical-block-size>]\n"
+" export the specified block node as a vduse-blk\n"
+" device using the id as the VDUSE device name\n"
+"\n"
+#endif /* CONFIG_VDUSE_BLK_EXPORT */
" --monitor [chardev=]name[,mode=control][,pretty[=on|off]]\n"
" configure a QMP monitor\n"
"\n"
--
2.35.3

Block jobs are already paused using the BdrvChildRole drain callbacks,
so we don't need an additional block_job_pause_all() call.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/io.c | 4 ----
tests/test-bdrv-drain.c | 10 ++++------
2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
* context. */
assert(qemu_get_current_aio_context() == qemu_get_aio_context());

- block_job_pause_all();
-
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
AioContext *aio_context = bdrv_get_aio_context(bs);

@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
aio_enable_external(aio_context);
aio_context_release(aio_context);
}
-
- block_job_resume_all();
}

void bdrv_drain_all(void)
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
do_drain_begin(drain_type, src);

if (drain_type == BDRV_DRAIN_ALL) {
- /* bdrv_drain_all() drains both src and target, and involves an
- * additional block_job_pause_all() */
- g_assert_cmpint(job->pause_count, ==, 3);
+ /* bdrv_drain_all() drains both src and target */
+ g_assert_cmpint(job->pause_count, ==, 2);
} else {
g_assert_cmpint(job->pause_count, ==, 1);
}
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type)
do_drain_begin(drain_type, target);

if (drain_type == BDRV_DRAIN_ALL) {
- /* bdrv_drain_all() drains both src and target, and involves an
- * additional block_job_pause_all() */
- g_assert_cmpint(job->pause_count, ==, 3);
+ /* bdrv_drain_all() drains both src and target */
+ g_assert_cmpint(job->pause_count, ==, 2);
} else {
g_assert_cmpint(job->pause_count, ==, 1);
}
--
2.13.6
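Putting the options documented in the qsd patch above together, a plausible end-to-end session might look like the following (the image path and the disk0/vduse0 names are placeholders, not taken from the patch)::

 # qemu-storage-daemon \
     --blockdev driver=qcow2,node-name=disk0,file.driver=file,file.filename=/path/to/image.qcow2 \
     --export type=vduse-blk,id=vduse0,node-name=disk0,writable=on,num-queues=4,queue-size=256
 # vdpa dev add name vduse0 mgmtdev vduse

From there the device can be bound to virtio_vdpa.ko on the host or vhost_vdpa.ko for a guest as the documentation describes, and torn down again with::

 # vdpa dev del vduse0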
From: Xie Yongji <xieyongji@bytedance.com>

This adds the vduse header to the Linux headers so that the
relevant VDUSE API can be used in subsequent patches.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20220523084611.91-5-xieyongji@bytedance.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
linux-headers/linux/vduse.h | 306 ++++++++++++++++++++++++++++++++
scripts/update-linux-headers.sh | 2 +-
2 files changed, 307 insertions(+), 1 deletion(-)
create mode 100644 linux-headers/linux/vduse.h

diff --git a/linux-headers/linux/vduse.h b/linux-headers/linux/vduse.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/linux-headers/linux/vduse.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _VDUSE_H_
+#define _VDUSE_H_
+
+#include <linux/types.h>
+
+#define VDUSE_BASE    0x81
+
+/* The ioctls for control device (/dev/vduse/control) */
+
+#define VDUSE_API_VERSION    0
+
+/*
+ * Get the version of VDUSE API that kernel supported (VDUSE_API_VERSION).
+ * This is used for future extension.
+ */
+#define VDUSE_GET_API_VERSION    _IOR(VDUSE_BASE, 0x00, __u64)
+
+/* Set the version of VDUSE API that userspace supported. */
+#define VDUSE_SET_API_VERSION    _IOW(VDUSE_BASE, 0x01, __u64)
+
+/**
+ * struct vduse_dev_config - basic configuration of a VDUSE device
+ * @name: VDUSE device name, needs to be NUL terminated
+ * @vendor_id: virtio vendor id
+ * @device_id: virtio device id
+ * @features: virtio features
+ * @vq_num: the number of virtqueues
+ * @vq_align: the allocation alignment of virtqueue's metadata
+ * @reserved: for future use, needs to be initialized to zero
+ * @config_size: the size of the configuration space
+ * @config: the buffer of the configuration space
+ *
+ * Structure used by VDUSE_CREATE_DEV ioctl to create VDUSE device.
+ */
+struct vduse_dev_config {
+#define VDUSE_NAME_MAX    256
+    char name[VDUSE_NAME_MAX];
+    __u32 vendor_id;
+    __u32 device_id;
+    __u64 features;
+    __u32 vq_num;
+    __u32 vq_align;
+    __u32 reserved[13];
+    __u32 config_size;
+    __u8 config[];
+};
+
+/* Create a VDUSE device which is represented by a char device (/dev/vduse/$NAME) */
+#define VDUSE_CREATE_DEV    _IOW(VDUSE_BASE, 0x02, struct vduse_dev_config)
+
+/*
+ * Destroy a VDUSE device. Make sure there are no more references
+ * to the char device (/dev/vduse/$NAME).
+ */
+#define VDUSE_DESTROY_DEV    _IOW(VDUSE_BASE, 0x03, char[VDUSE_NAME_MAX])
+
+/* The ioctls for VDUSE device (/dev/vduse/$NAME) */
+
+/**
+ * struct vduse_iotlb_entry - entry of IOTLB to describe one IOVA region [start, last]
+ * @offset: the mmap offset on returned file descriptor
+ * @start: start of the IOVA region
+ * @last: last of the IOVA region
+ * @perm: access permission of the IOVA region
+ *
+ * Structure used by VDUSE_IOTLB_GET_FD ioctl to find an overlapped IOVA region.
+ */
+struct vduse_iotlb_entry {
+    __u64 offset;
+    __u64 start;
+    __u64 last;
+#define VDUSE_ACCESS_RO 0x1
+#define VDUSE_ACCESS_WO 0x2
+#define VDUSE_ACCESS_RW 0x3
+    __u8 perm;
+};
+
+/*
+ * Find the first IOVA region that overlaps with the range [start, last]
+ * and return the corresponding file descriptor. Return -EINVAL means the
+ * IOVA region doesn't exist. Caller should set start and last fields.
+ */
+#define VDUSE_IOTLB_GET_FD    _IOWR(VDUSE_BASE, 0x10, struct vduse_iotlb_entry)
+
+/*
+ * Get the negotiated virtio features. It's a subset of the features in
+ * struct vduse_dev_config which can be accepted by virtio driver. It's
+ * only valid after FEATURES_OK status bit is set.
+ */
+#define VDUSE_DEV_GET_FEATURES    _IOR(VDUSE_BASE, 0x11, __u64)
+
+/**
+ * struct vduse_config_data - data used to update configuration space
+ * @offset: the offset from the beginning of configuration space
+ * @length: the length to write to configuration space
+ * @buffer: the buffer used to write from
+ *
+ * Structure used by VDUSE_DEV_SET_CONFIG ioctl to update device
+ * configuration space.
+ */
+struct vduse_config_data {
+    __u32 offset;
+    __u32 length;
+    __u8 buffer[];
+};
+
+/* Set device configuration space */
+#define VDUSE_DEV_SET_CONFIG    _IOW(VDUSE_BASE, 0x12, struct vduse_config_data)
+
+/*
+ * Inject a config interrupt. It's usually used to notify virtio driver
+ * that device configuration space has changed.
+ */
+#define VDUSE_DEV_INJECT_CONFIG_IRQ    _IO(VDUSE_BASE, 0x13)
+
+/**
+ * struct vduse_vq_config - basic configuration of a virtqueue
+ * @index: virtqueue index
+ * @max_size: the max size of virtqueue
+ * @reserved: for future use, needs to be initialized to zero
+ *
+ * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue.
+ */
+struct vduse_vq_config {
+    __u32 index;
+    __u16 max_size;
+    __u16 reserved[13];
+};
+
+/*
+ * Setup the specified virtqueue. Make sure all virtqueues have been
+ * configured before the device is attached to vDPA bus.
+ */
+#define VDUSE_VQ_SETUP        _IOW(VDUSE_BASE, 0x14, struct vduse_vq_config)
+
+/**
+ * struct vduse_vq_state_split - split virtqueue state
+ * @avail_index: available index
+ */
+struct vduse_vq_state_split {
+    __u16 avail_index;
+};
+
+/**
+ * struct vduse_vq_state_packed - packed virtqueue state
+ * @last_avail_counter: last driver ring wrap counter observed by device
+ * @last_avail_idx: device available index
+ * @last_used_counter: device ring wrap counter
+ * @last_used_idx: used index
+ */
+struct vduse_vq_state_packed {
+    __u16 last_avail_counter;
+    __u16 last_avail_idx;
+    __u16 last_used_counter;
+    __u16 last_used_idx;
+};
+
+/**
+ * struct vduse_vq_info - information of a virtqueue
+ * @index: virtqueue index
+ * @num: the size of virtqueue
+ * @desc_addr: address of desc area
+ * @driver_addr: address of driver area
+ * @device_addr: address of device area
+ * @split: split virtqueue state
+ * @packed: packed virtqueue state
+ * @ready: ready status of virtqueue
+ *
+ * Structure used by VDUSE_VQ_GET_INFO ioctl to get virtqueue's information.
+ */
+struct vduse_vq_info {
+    __u32 index;
+    __u32 num;
+    __u64 desc_addr;
+    __u64 driver_addr;
+    __u64 device_addr;
+    union {
+        struct vduse_vq_state_split split;
+        struct vduse_vq_state_packed packed;
+    };
+    __u8 ready;
+};
+
+/* Get the specified virtqueue's information. Caller should set index field. */
+#define VDUSE_VQ_GET_INFO    _IOWR(VDUSE_BASE, 0x15, struct vduse_vq_info)
+
+/**
+ * struct vduse_vq_eventfd - eventfd configuration for a virtqueue
+ * @index: virtqueue index
+ * @fd: eventfd, -1 means de-assigning the eventfd
+ *
+ * Structure used by VDUSE_VQ_SETUP_KICKFD ioctl to setup kick eventfd.
+ */
+struct vduse_vq_eventfd {
+    __u32 index;
+#define VDUSE_EVENTFD_DEASSIGN -1
+    int fd;
+};
+
+/*
+ * Setup kick eventfd for specified virtqueue. The kick eventfd is used
+ * by VDUSE kernel module to notify userspace to consume the avail vring.
+ */
+#define VDUSE_VQ_SETUP_KICKFD    _IOW(VDUSE_BASE, 0x16, struct vduse_vq_eventfd)
+
+/*
+ * Inject an interrupt for specific virtqueue. It's used to notify virtio driver
+ * to consume the used vring.
+ */
+#define VDUSE_VQ_INJECT_IRQ    _IOW(VDUSE_BASE, 0x17, __u32)
+
+/* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */
+
+/**
+ * enum vduse_req_type - request type
+ * @VDUSE_GET_VQ_STATE: get the state for specified virtqueue from userspace
+ * @VDUSE_SET_STATUS: set the device status
+ * @VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for
+ * specified IOVA range via VDUSE_IOTLB_GET_FD ioctl
+ */
+enum vduse_req_type {
+    VDUSE_GET_VQ_STATE,
+    VDUSE_SET_STATUS,
+    VDUSE_UPDATE_IOTLB,
+};
+
+/**
+ * struct vduse_vq_state - virtqueue state
+ * @index: virtqueue index
+ * @split: split virtqueue state
+ * @packed: packed virtqueue state
+ */
+struct vduse_vq_state {
+    __u32 index;
+    union {
+        struct vduse_vq_state_split split;
+        struct vduse_vq_state_packed packed;
+    };
+};
+
+/**
+ * struct vduse_dev_status - device status
+ * @status: device status
+ */
+struct vduse_dev_status {
+    __u8 status;
+};
+
+/**
+ * struct vduse_iova_range - IOVA range [start, last]
+ * @start: start of the IOVA range
+ * @last: last of the IOVA range
+ */
+struct vduse_iova_range {
+    __u64 start;
+    __u64 last;
+};
+
+/**
+ * struct vduse_dev_request - control request
+ * @type: request type
+ * @request_id: request id
+ * @reserved: for future use
+ * @vq_state: virtqueue state, only index field is available
+ * @s: device status
+ * @iova: IOVA range for updating
+ * @padding: padding
+ *
+ * Structure used by read(2) on /dev/vduse/$NAME.
+ */
+struct vduse_dev_request {
+    __u32 type;
+    __u32 request_id;
+    __u32 reserved[4];
+    union {
+        struct vduse_vq_state vq_state;
+        struct vduse_dev_status s;
+        struct vduse_iova_range iova;
+        __u32 padding[32];
+    };
+};
+
+/**
+ * struct vduse_dev_response - response to control request
+ * @request_id: corresponding request id
+ * @result: the result of request
+ * @reserved: for future use, needs to be initialized to zero
+ * @vq_state: virtqueue state
+ * @padding: padding
+ *
+ * Structure used by write(2) on /dev/vduse/$NAME.
+ */
+struct vduse_dev_response {
+    __u32 request_id;
+#define VDUSE_REQ_RESULT_OK    0x00
+#define VDUSE_REQ_RESULT_FAILED    0x01
+    __u32 result;
+    __u32 reserved[4];
+    union {
+        struct vduse_vq_state vq_state;
+        __u32 padding[32];
+    };
+};
+
+#endif /* _VDUSE_H_ */
diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index XXXXXXX..XXXXXXX 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -XXX,XX +XXX,XX @@ done
rm -rf "$output/linux-headers/linux"
mkdir -p "$output/linux-headers/linux"
for header in kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h \
- psci.h psp-sev.h userfaultfd.h mman.h; do
+ psci.h psp-sev.h userfaultfd.h mman.h vduse.h; do
cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
done
--
2.35.3

bdrv_do_drained_begin() restricts the call of parent callbacks and
aio_disable_external() to the outermost drain section, but the block
driver callbacks are always called. bdrv_do_drained_end() must match
this behaviour, otherwise nodes stay drained even if begin/end calls
were balanced.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/io.c | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs)

void bdrv_drained_end(BlockDriverState *bs)
{
+ int old_quiesce_counter;
+
if (qemu_in_coroutine()) {
bdrv_co_yield_to_drain(bs, false);
return;
}
assert(bs->quiesce_counter > 0);
- if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
- return;
- }
+ old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

/* Re-enable things in child-to-parent order */
bdrv_drain_invoke(bs, false, false);
- bdrv_parent_drained_end(bs);
- aio_enable_external(bdrv_get_aio_context(bs));
+ if (old_quiesce_counter == 1) {
+ bdrv_parent_drained_end(bs);
+ aio_enable_external(bdrv_get_aio_context(bs));
+ }
}

/*
--
2.13.6
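For orientation, a minimal sketch of the userspace handshake the vduse.h header above defines: open the control device, agree on an API version, then create a device. Error handling is elided, and the name, device_id and feature bits are illustrative values rather than anything mandated by the patch:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/vduse.h>

    int create_vduse_device(void)
    {
        uint64_t version = VDUSE_API_VERSION;
        struct vduse_dev_config *cfg;
        int ret;
        int ctrl_fd = open("/dev/vduse/control", O_RDWR);

        /* Tell the kernel which API revision we speak */
        ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version);

        /* config_size is 0 here, so no config space payload follows */
        cfg = calloc(1, sizeof(*cfg));
        strcpy(cfg->name, "vduse-demo");
        cfg->device_id = 2;                 /* virtio-blk; illustrative */
        cfg->features = 1ULL << 32;         /* VIRTIO_F_VERSION_1; illustrative */
        cfg->vq_num = 1;
        cfg->vq_align = 4096;

        /* Creates /dev/vduse/vduse-demo on success */
        ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, cfg);
        free(cfg);
        close(ctrl_fd);
        return ret;
    }

The per-device char device is then driven with the VDUSE_VQ_* ioctls and the read(2)/write(2) request protocol described by struct vduse_dev_request and struct vduse_dev_response.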
From: Xie Yongji <xieyongji@bytedance.com>

To support block resize, this uses vduse_dev_update_config()
to update the capacity field in the configuration space and
injects a config interrupt from the block resize callback.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20220523084611.91-8-xieyongji@bytedance.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/export/vduse-blk.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)

diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/block/export/vduse-blk.c
+++ b/block/export/vduse-blk.c
@@ -XXX,XX +XXX,XX @@ static void blk_aio_detach(void *opaque)
vblk_exp->export.ctx = NULL;
}

+static void vduse_blk_resize(void *opaque)
+{
+ BlockExport *exp = opaque;
+ VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+ struct virtio_blk_config config;
+
+ config.capacity =
+ cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
+ vduse_dev_update_config(vblk_exp->dev, sizeof(config.capacity),
+ offsetof(struct virtio_blk_config, capacity),
+ (char *)&config.capacity);
+}
+
+static const BlockDevOps vduse_block_ops = {
+ .resize_cb = vduse_blk_resize,
+};
+
static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
Error **errp)
{
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
vblk_exp);

+ blk_set_dev_ops(exp->blk, &vduse_block_ops, exp);
+
return 0;
}

@@ -XXX,XX +XXX,XX @@ static void vduse_blk_exp_delete(BlockExport *exp)

blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
vblk_exp);
+ blk_set_dev_ops(exp->blk, NULL, NULL);
vduse_dev_destroy(vblk_exp->dev);
}

--
2.35.3

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
tests/test-bdrv-drain.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 57 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
enum drain_type {
BDRV_DRAIN_ALL,
BDRV_DRAIN,
+ DRAIN_TYPE_MAX,
};

static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
test_quiesce_common(BDRV_DRAIN, false);
}

+static void test_nested(void)
+{
+ BlockBackend *blk;
+ BlockDriverState *bs, *backing;
+ BDRVTestState *s, *backing_s;
+ enum drain_type outer, inner;
+
+ blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+ bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR,
+ &error_abort);
+ s = bs->opaque;
+ blk_insert_bs(blk, bs, &error_abort);
+
+ backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+ backing_s = backing->opaque;
+ bdrv_set_backing_hd(bs, backing, &error_abort);
+
+ for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) {
+ for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) {
+ /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
+ int bs_quiesce = (outer != BDRV_DRAIN_ALL) +
+ (inner != BDRV_DRAIN_ALL);
+ int backing_quiesce = 0;
+ int backing_cb_cnt = (outer != BDRV_DRAIN) +
+ (inner != BDRV_DRAIN);
+
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
+ g_assert_cmpint(s->drain_count, ==, 0);
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+ do_drain_begin(outer, bs);
+ do_drain_begin(inner, bs);
+
+ g_assert_cmpint(bs->quiesce_counter, ==, bs_quiesce);
+ g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce);
+ g_assert_cmpint(s->drain_count, ==, 2);
+ g_assert_cmpint(backing_s->drain_count, ==, backing_cb_cnt);
+
+ do_drain_end(inner, bs);
+ do_drain_end(outer, bs);
+
+ g_assert_cmpint(bs->quiesce_counter, ==, 0);
+ g_assert_cmpint(backing->quiesce_counter, ==, 0);
+ g_assert_cmpint(s->drain_count, ==, 0);
+ g_assert_cmpint(backing_s->drain_count, ==, 0);
+ }
+ }
+
+ bdrv_unref(backing);
+ bdrv_unref(bs);
+ blk_unref(blk);
+}
+

typedef struct TestBlockJob {
BlockJob common;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);

+ g_test_add_func("/bdrv-drain/nested", test_nested);
+
g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
--
2.13.6
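A quick worked example of the capacity encoding the resize patch above uses: virtio-blk always reports capacity in 512-byte sectors, independent of the export's logical block size. The 8 GiB figure below is illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    #define VIRTIO_BLK_SECTOR_BITS 9   /* 512-byte sectors */

    int main(void)
    {
        int64_t new_length = 8LL * 1024 * 1024 * 1024;   /* 8 GiB image */
        uint64_t capacity = (uint64_t)new_length >> VIRTIO_BLK_SECTOR_BITS;

        /* The patch stores cpu_to_le64(capacity) into the capacity field
         * and lets vduse_dev_update_config() raise a config interrupt so
         * the guest re-reads the new size. */
        printf("capacity = %llu sectors\n", (unsigned long long)capacity);
        return 0;
    }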
This is in preparation for subtree drains, i.e. drained sections that
affect not only a single node, but recursively all child nodes, too.

Calling the parent callbacks for drain is pointless when we just came
from that parent node recursively and leads to multiple increases of
bs->quiesce_counter in a single drain call. Don't do it.

In order for this to work correctly, the parent callback must be called
for every bdrv_drain_begin/end() call, not only for the outermost one:

If we have a node N with two parents A and B, recursive draining of A
should cause the quiesce_counter of B to increase because its child N is
drained independently of B. If now B is recursively drained, too, A must
increase its quiesce_counter because N is drained independently of A
only now, even if N is going from quiesce_counter 1 to 2.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/block/block.h | 4 ++--
block.c | 13 +++++++++----
block/io.c | 47 ++++++++++++++++++++++++++++++++-----------
3 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ void bdrv_io_unplug(BlockDriverState *bs);
* Begin a quiesced section of all users of @bs. This is part of
* bdrv_drained_begin.
*/
-void bdrv_parent_drained_begin(BlockDriverState *bs);
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore);

/**
* bdrv_parent_drained_end:
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_begin(BlockDriverState *bs);
* End a quiesced section of all users of @bs. This is part of
* bdrv_drained_end.
*/
-void bdrv_parent_drained_end(BlockDriverState *bs);
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);

/**
* bdrv_drained_begin:
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
BlockDriverState *new_bs)
{
BlockDriverState *old_bs = child->bs;

From: Emanuele Giuseppe Esposito <eesposit@redhat.com>

It seems that aio_wait_kick always required a memory barrier
or atomic operation in the caller, but nobody actually
took care of doing it.

Let's put the barrier in the function instead, and pair it
with another one in AIO_WAIT_WHILE. Read aio_wait_kick()
comment for further explanation.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Message-Id: <20220524173054.12651-1-eesposit@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/block/aio-wait.h | 2 ++
util/aio-wait.c | 16 +++++++++++++++-
2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio-wait.h
+++ b/include/block/aio-wait.h
@@ -XXX,XX +XXX,XX @@ extern AioWait global_aio_wait;
AioContext *ctx_ = (ctx); \
/* Increment wait_->num_waiters before evaluating cond. */ \
qatomic_inc(&wait_->num_waiters); \
+ /* Paired with smp_mb in aio_wait_kick(). */ \
+ smp_mb(); \
if (ctx_ && in_aio_context_home_thread(ctx_)) { \
while ((cond)) { \
aio_poll(ctx_, true); \
diff --git a/util/aio-wait.c b/util/aio-wait.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-wait.c
+++ b/util/aio-wait.c
@@ -XXX,XX +XXX,XX @@ static void dummy_bh_cb(void *opaque)

void aio_wait_kick(void)
{
- /* The barrier (or an atomic op) is in the caller. */
54
+ int i;
44
+ /*
55
45
+ * Paired with smp_mb in AIO_WAIT_WHILE. Here we have:
56
if (old_bs && new_bs) {
46
+ * write(condition);
57
assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
47
+ * aio_wait_kick() {
58
}
48
+ * smp_mb();
59
if (old_bs) {
49
+ * read(num_waiters);
60
if (old_bs->quiesce_counter && child->role->drained_end) {
50
+ * }
61
- child->role->drained_end(child);
51
+ *
62
+ for (i = 0; i < old_bs->quiesce_counter; i++) {
52
+ * And in AIO_WAIT_WHILE:
63
+ child->role->drained_end(child);
53
+ * write(num_waiters);
64
+ }
54
+ * smp_mb();
65
}
55
+ * read(condition);
66
if (child->role->detach) {
56
+ */
67
child->role->detach(child);
57
+ smp_mb();
68
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
69
if (new_bs) {
70
QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
71
if (new_bs->quiesce_counter && child->role->drained_begin) {
72
- child->role->drained_begin(child);
73
+ for (i = 0; i < new_bs->quiesce_counter; i++) {
74
+ child->role->drained_begin(child);
75
+ }
76
}
77
78
if (child->role->attach) {
79
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
80
AioContext *ctx = bdrv_get_aio_context(bs);
81
82
aio_disable_external(ctx);
83
- bdrv_parent_drained_begin(bs);
84
+ bdrv_parent_drained_begin(bs, NULL);
85
bdrv_drain(bs); /* ensure there are no in-flight requests */
86
87
while (aio_poll(ctx, false)) {
88
@@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
89
*/
90
aio_context_acquire(new_context);
91
bdrv_attach_aio_context(bs, new_context);
92
- bdrv_parent_drained_end(bs);
93
+ bdrv_parent_drained_end(bs, NULL);
94
aio_enable_external(ctx);
95
aio_context_release(new_context);
96
}
97
diff --git a/block/io.c b/block/io.c
98
index XXXXXXX..XXXXXXX 100644
99
--- a/block/io.c
100
+++ b/block/io.c
101
@@ -XXX,XX +XXX,XX @@
102
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
103
int64_t offset, int bytes, BdrvRequestFlags flags);
104
105
-void bdrv_parent_drained_begin(BlockDriverState *bs)
106
+void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
107
{
108
BdrvChild *c, *next;
109
110
QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
111
+ if (c == ignore) {
112
+ continue;
113
+ }
114
if (c->role->drained_begin) {
115
c->role->drained_begin(c);
116
}
117
}
118
}
119
120
-void bdrv_parent_drained_end(BlockDriverState *bs)
121
+void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
122
{
123
BdrvChild *c, *next;
124
125
QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
126
+ if (c == ignore) {
127
+ continue;
128
+ }
129
if (c->role->drained_end) {
130
c->role->drained_end(c);
131
}
132
@@ -XXX,XX +XXX,XX @@ typedef struct {
133
BlockDriverState *bs;
134
bool done;
135
bool begin;
136
+ BdrvChild *parent;
137
} BdrvCoDrainData;
138
139
static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
140
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
141
return waited;
142
}
143
144
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
145
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
58
+
146
+
59
if (qatomic_read(&global_aio_wait.num_waiters)) {
147
static void bdrv_co_drain_bh_cb(void *opaque)
60
aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
148
{
149
BdrvCoDrainData *data = opaque;
150
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
151
152
bdrv_dec_in_flight(bs);
153
if (data->begin) {
154
- bdrv_drained_begin(bs);
155
+ bdrv_do_drained_begin(bs, data->parent);
156
} else {
157
- bdrv_drained_end(bs);
158
+ bdrv_do_drained_end(bs, data->parent);
159
}
160
161
data->done = true;
162
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
163
}
164
165
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
166
- bool begin)
167
+ bool begin, BdrvChild *parent)
168
{
169
BdrvCoDrainData data;
170
171
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
172
.bs = bs,
173
.done = false,
174
.begin = begin,
175
+ .parent = parent,
176
};
177
bdrv_inc_in_flight(bs);
178
aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
179
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
180
assert(data.done);
181
}
182
183
-void bdrv_drained_begin(BlockDriverState *bs)
184
+static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
185
{
186
if (qemu_in_coroutine()) {
187
- bdrv_co_yield_to_drain(bs, true);
188
+ bdrv_co_yield_to_drain(bs, true, parent);
189
return;
190
}
191
192
/* Stop things in parent-to-child order */
193
if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
194
aio_disable_external(bdrv_get_aio_context(bs));
195
- bdrv_parent_drained_begin(bs);
196
}
197
198
+ bdrv_parent_drained_begin(bs, parent);
199
bdrv_drain_invoke(bs, true, false);
200
bdrv_drain_recurse(bs);
201
}
202
203
-void bdrv_drained_end(BlockDriverState *bs)
204
+void bdrv_drained_begin(BlockDriverState *bs)
205
+{
206
+ bdrv_do_drained_begin(bs, NULL);
207
+}
208
+
209
+static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
210
{
211
int old_quiesce_counter;
212
213
if (qemu_in_coroutine()) {
214
- bdrv_co_yield_to_drain(bs, false);
215
+ bdrv_co_yield_to_drain(bs, false, parent);
216
return;
217
}
218
assert(bs->quiesce_counter > 0);
219
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs)
220
221
/* Re-enable things in child-to-parent order */
222
bdrv_drain_invoke(bs, false, false);
223
+ bdrv_parent_drained_end(bs, parent);
224
if (old_quiesce_counter == 1) {
225
- bdrv_parent_drained_end(bs);
226
aio_enable_external(bdrv_get_aio_context(bs));
227
}
228
}
229
230
+void bdrv_drained_end(BlockDriverState *bs)
231
+{
232
+ bdrv_do_drained_end(bs, NULL);
233
+}
234
+
235
/*
236
* Wait for pending requests to complete on a single BlockDriverState subtree,
237
* and suspend block driver's internal I/O until next request arrives.
238
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void)
239
/* Stop things in parent-to-child order */
240
aio_context_acquire(aio_context);
241
aio_disable_external(aio_context);
242
- bdrv_parent_drained_begin(bs);
243
+ bdrv_parent_drained_begin(bs, NULL);
244
bdrv_drain_invoke(bs, true, true);
245
aio_context_release(aio_context);
246
247
@@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void)
248
/* Re-enable things in child-to-parent order */
249
aio_context_acquire(aio_context);
250
bdrv_drain_invoke(bs, false, true);
251
- bdrv_parent_drained_end(bs);
252
+ bdrv_parent_drained_end(bs, NULL);
253
aio_enable_external(aio_context);
254
aio_context_release(aio_context);
61
}
255
}
62
--
256
--
63
2.35.3
257
2.13.6
258
259
diff view generated by jsdifflib
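
To see why the new ignore argument matters, consider the two-parent
scenario from the commit message written out as calls. This is a
non-compilable sketch; the node and child names (n, child_a, child_b)
are illustrative and not from the patch:

    /* node n has parents a and b; child_a/child_b are the BdrvChild links */
    bdrv_do_drained_begin(n, child_a);   /* via a: n 0 -> 1, notifies b only */
    bdrv_do_drained_begin(n, child_b);   /* via b: n 1 -> 2, notifies a only */

Each parent's quiesce state thus ends up reflecting that n is also
drained independently of it, while no parent is ever re-notified by a
drain that it initiated itself.
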
From: Stefano Garzarella <sgarzare@redhat.com>

If the namespace does not exist, rbd_create() fails with -ENOENT and
QEMU reports a generic "error rbd create: No such file or directory":

    $ qemu-img create rbd:rbd/namespace/image 1M
    Formatting 'rbd:rbd/namespace/image', fmt=raw size=1048576
    qemu-img: rbd:rbd/namespace/image: error rbd create: No such file or directory

Unfortunately rados_ioctx_set_namespace() does not fail if the namespace
does not exist, so let's use rbd_namespace_exists() in qemu_rbd_connect()
to check if the namespace exists, reporting a more understandable error:

    $ qemu-img create rbd:rbd/namespace/image 1M
    Formatting 'rbd:rbd/namespace/image', fmt=raw size=1048576
    qemu-img: rbd:rbd/namespace/image: namespace 'namespace' does not exist

Reported-by: Tingting Mao <timao@redhat.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-Id: <20220517071012.6120-1-sgarzare@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/rbd.c | 24 ++++++++++++++++++++++++
 meson.build |  6 ++++++
 2 files changed, 30 insertions(+)

diff --git a/block/rbd.c b/block/rbd.c
index XXXXXXX..XXXXXXX 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -XXX,XX +XXX,XX @@ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
         error_setg_errno(errp, -r, "error opening pool %s", opts->pool);
         goto failed_shutdown;
     }
+
+#ifdef HAVE_RBD_NAMESPACE_EXISTS
+    if (opts->has_q_namespace && strlen(opts->q_namespace) > 0) {
+        bool exists;
+
+        r = rbd_namespace_exists(*io_ctx, opts->q_namespace, &exists);
+        if (r < 0) {
+            error_setg_errno(errp, -r, "error checking namespace");
+            goto failed_ioctx_destroy;
+        }
+
+        if (!exists) {
+            error_setg(errp, "namespace '%s' does not exist",
+                       opts->q_namespace);
+            r = -ENOENT;
+            goto failed_ioctx_destroy;
+        }
+    }
+#endif
+
     /*
      * Set the namespace after opening the io context on the pool,
      * if nspace == NULL or if nspace == "", it is just as we did nothing
@@ -XXX,XX +XXX,XX @@ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
     r = 0;
     goto out;
 
+#ifdef HAVE_RBD_NAMESPACE_EXISTS
+failed_ioctx_destroy:
+    rados_ioctx_destroy(*io_ctx);
+#endif
 failed_shutdown:
     rados_shutdown(*cluster);
 out:
diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ config_host_data.set('HAVE_GETIFADDRS', cc.has_function('getifaddrs'))
 config_host_data.set('HAVE_OPENPTY', cc.has_function('openpty', dependencies: util))
 config_host_data.set('HAVE_STRCHRNUL', cc.has_function('strchrnul'))
 config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: '#include <stdlib.h>'))
+if rbd.found()
+  config_host_data.set('HAVE_RBD_NAMESPACE_EXISTS',
+                       cc.has_function('rbd_namespace_exists',
+                                       dependencies: rbd,
+                                       prefix: '#include <rbd/librbd.h>'))
+endif
 if rdma.found()
   config_host_data.set('HAVE_IBV_ADVISE_MR',
                        cc.has_function('ibv_advise_mr',
--
2.35.3

bdrv_drained_begin() waits for the completion of requests in the whole
subtree, but it only actually keeps its immediate bs parameter quiesced
until bdrv_drained_end().

Add a version that keeps the whole subtree drained. As of this commit,
graph changes cannot be allowed during a subtree drained section, but
this will be fixed soon.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h | 13 +++++++++++++
 block/io.c            | 54 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore);
 void bdrv_drained_begin(BlockDriverState *bs);
 
 /**
+ * Like bdrv_drained_begin, but recursively begins a quiesced section for
+ * exclusive access to all child nodes as well.
+ *
+ * Graph changes are not allowed during a subtree drain section.
+ */
+void bdrv_subtree_drained_begin(BlockDriverState *bs);
+
+/**
  * bdrv_drained_end:
  *
  * End a quiescent section started by bdrv_drained_begin().
  */
 void bdrv_drained_end(BlockDriverState *bs);
 
+/**
+ * End a quiescent section started by bdrv_subtree_drained_begin().
+ */
+void bdrv_subtree_drained_end(BlockDriverState *bs);
+
 void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child,
                     Error **errp);
 void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
     BlockDriverState *bs;
     bool done;
     bool begin;
+    bool recursive;
     BdrvChild *parent;
 } BdrvCoDrainData;
 
@@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
     return waited;
 }
 
-static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent);
-static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);
+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
+                                  BdrvChild *parent);
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
+                                BdrvChild *parent);
 
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 
     bdrv_dec_in_flight(bs);
     if (data->begin) {
-        bdrv_do_drained_begin(bs, data->parent);
+        bdrv_do_drained_begin(bs, data->recursive, data->parent);
     } else {
-        bdrv_do_drained_end(bs, data->parent);
+        bdrv_do_drained_end(bs, data->recursive, data->parent);
     }
 
     data->done = true;
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
 }
 
 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
-                                                bool begin, BdrvChild *parent)
+                                                bool begin, bool recursive,
+                                                BdrvChild *parent)
 {
     BdrvCoDrainData data;
 
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
         .bs = bs,
         .done = false,
         .begin = begin,
+        .recursive = recursive,
         .parent = parent,
     };
     bdrv_inc_in_flight(bs);
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
     assert(data.done);
 }
 
-static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
+static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
+                                  BdrvChild *parent)
 {
+    BdrvChild *child, *next;
+
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, true, parent);
+        bdrv_co_yield_to_drain(bs, true, recursive, parent);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent)
     bdrv_parent_drained_begin(bs, parent);
     bdrv_drain_invoke(bs, true, false);
     bdrv_drain_recurse(bs);
+
+    if (recursive) {
+        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
+            bdrv_do_drained_begin(child->bs, true, child);
+        }
+    }
 }
 
 void bdrv_drained_begin(BlockDriverState *bs)
 {
-    bdrv_do_drained_begin(bs, NULL);
+    bdrv_do_drained_begin(bs, false, NULL);
+}
+
+void bdrv_subtree_drained_begin(BlockDriverState *bs)
+{
+    bdrv_do_drained_begin(bs, true, NULL);
 }
 
-static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
+static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
+                                BdrvChild *parent)
 {
+    BdrvChild *child, *next;
     int old_quiesce_counter;
 
     if (qemu_in_coroutine()) {
-        bdrv_co_yield_to_drain(bs, false, parent);
+        bdrv_co_yield_to_drain(bs, false, recursive, parent);
         return;
     }
     assert(bs->quiesce_counter > 0);
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
     if (old_quiesce_counter == 1) {
         aio_enable_external(bdrv_get_aio_context(bs));
     }
+
+    if (recursive) {
+        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
+            bdrv_do_drained_end(child->bs, true, child);
+        }
+    }
 }
 
 void bdrv_drained_end(BlockDriverState *bs)
 {
-    bdrv_do_drained_end(bs, NULL);
+    bdrv_do_drained_end(bs, false, NULL);
+}
+
+void bdrv_subtree_drained_end(BlockDriverState *bs)
+{
+    bdrv_do_drained_end(bs, true, NULL);
 }
 
 /*
--
2.13.6
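
Usage-wise, the new pair brackets an operation exactly like
bdrv_drained_begin/end(), but with the whole subtree quiescent. A
minimal sketch (bs is any node):

    bdrv_subtree_drained_begin(bs);
    /* bs and, recursively, all of its children are quiesced here;
     * note that graph changes are still forbidden at this point in
     * the series */
    bdrv_subtree_drained_end(bs);
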
From: Xie Yongji <xieyongji@bytedance.com>

This supports passing NULL ops to blk_set_dev_ops()
so that we can remove stale ops in some cases.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20220523084611.91-2-xieyongji@bytedance.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/block-backend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
     blk->dev_opaque = opaque;
 
     /* Are we currently quiesced? Should we enforce this right now? */
-    if (blk->quiesce_counter && ops->drained_begin) {
+    if (blk->quiesce_counter && ops && ops->drained_begin) {
         ops->drained_begin(opaque);
     }
 }
--
2.35.3

Add a subtree drain version to the existing test cases.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
 enum drain_type {
     BDRV_DRAIN_ALL,
     BDRV_DRAIN,
+    BDRV_SUBTREE_DRAIN,
     DRAIN_TYPE_MAX,
 };
 
@@ -XXX,XX +XXX,XX @@ static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs)
     switch (drain_type) {
     case BDRV_DRAIN_ALL:        bdrv_drain_all_begin(); break;
     case BDRV_DRAIN:            bdrv_drained_begin(bs); break;
+    case BDRV_SUBTREE_DRAIN:    bdrv_subtree_drained_begin(bs); break;
     default:                    g_assert_not_reached();
     }
 }
@@ -XXX,XX +XXX,XX @@ static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs)
     switch (drain_type) {
     case BDRV_DRAIN_ALL:        bdrv_drain_all_end(); break;
     case BDRV_DRAIN:            bdrv_drained_end(bs); break;
+    case BDRV_SUBTREE_DRAIN:    bdrv_subtree_drained_end(bs); break;
     default:                    g_assert_not_reached();
     }
 }
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void)
     test_drv_cb_common(BDRV_DRAIN, false);
 }
 
+static void test_drv_cb_drain_subtree(void)
+{
+    test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
+}
+
 static void test_quiesce_common(enum drain_type drain_type, bool recursive)
 {
     BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void)
     test_quiesce_common(BDRV_DRAIN, false);
 }
 
+static void test_quiesce_drain_subtree(void)
+{
+    test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
+}
+
 static void test_nested(void)
 {
     BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
             /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */
             int bs_quiesce      = (outer != BDRV_DRAIN_ALL) +
                                   (inner != BDRV_DRAIN_ALL);
-            int backing_quiesce = 0;
+            int backing_quiesce = (outer == BDRV_SUBTREE_DRAIN) +
+                                  (inner == BDRV_SUBTREE_DRAIN);
             int backing_cb_cnt  = (outer != BDRV_DRAIN) +
                                   (inner != BDRV_DRAIN);
 
@@ -XXX,XX +XXX,XX @@ static void test_blockjob_drain(void)
     test_blockjob_common(BDRV_DRAIN);
 }
 
+static void test_blockjob_drain_subtree(void)
+{
+    test_blockjob_common(BDRV_SUBTREE_DRAIN);
+}
+
 int main(int argc, char **argv)
 {
     bdrv_init();
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
 
     g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all);
     g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain);
+    g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
+                    test_drv_cb_drain_subtree);
 
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
+    g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
+                    test_quiesce_drain_subtree);
 
     g_test_add_func("/bdrv-drain/nested", test_nested);
 
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
+    g_test_add_func("/bdrv-drain/blockjob/drain_subtree",
+                    test_blockjob_drain_subtree);
 
     return g_test_run();
 }
--
2.13.6
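
With NULL ops accepted by blk_set_dev_ops(), as in the block-backend
change above, device code can use a symmetric attach/detach pattern.
A sketch, where my_dev_ops is a hypothetical name:

    blk_set_dev_ops(blk, &my_dev_ops, opaque);   /* attach callbacks */
    /* ... device or export lifetime ... */
    blk_set_dev_ops(blk, NULL, NULL);            /* detach, leaving no
                                                    stale ops behind */
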
From: Xie Yongji <xieyongji@bytedance.com>

This implements a VDUSE block backend based on
the libvduse library. We can use it to export the BDSs
for both VM and container (host) usage.

The new command-line syntax is:

    $ qemu-storage-daemon \
        --blockdev file,node-name=drive0,filename=test.img \
        --export vduse-blk,node-name=drive0,id=vduse-export0,writable=on

After the qemu-storage-daemon has started, we need to use
the "vdpa" command to attach the device to the vDPA bus:

    $ vdpa dev add name vduse-export0 mgmtdev vduse

Also the device must be removed via the "vdpa" command
before we stop the qemu-storage-daemon.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20220523084611.91-7-xieyongji@bytedance.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qapi/block-export.json        |  28 ++-
 meson_options.txt             |   2 +
 block/export/vduse-blk.h      |  20 +++
 block/export/export.c         |   6 +
 block/export/vduse-blk.c      | 329 ++++++++++++++++++++++++++++++++++
 MAINTAINERS                   |   4 +-
 block/export/meson.build      |   5 +
 meson.build                   |  13 ++
 scripts/meson-buildoptions.sh |   4 +
 9 files changed, 407 insertions(+), 4 deletions(-)
 create mode 100644 block/export/vduse-blk.h
 create mode 100644 block/export/vduse-blk.c

diff --git a/qapi/block-export.json b/qapi/block-export.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-export.json
+++ b/qapi/block-export.json
@@ -XXX,XX +XXX,XX @@
           '*allow-other': 'FuseExportAllowOther' },
   'if': 'CONFIG_FUSE' }
 
+##
+# @BlockExportOptionsVduseBlk:
+#
+# A vduse-blk block export.
+#
+# @num-queues: the number of virtqueues. Defaults to 1.
+# @queue-size: the size of virtqueue. Defaults to 256.
+# @logical-block-size: Logical block size in bytes. Range [512, PAGE_SIZE]
+#                      and must be power of 2. Defaults to 512 bytes.
+#
+# Since: 7.1
+##
+{ 'struct': 'BlockExportOptionsVduseBlk',
+  'data': { '*num-queues': 'uint16',
+            '*queue-size': 'uint16',
+            '*logical-block-size': 'size'} }
+
 ##
 # @NbdServerAddOptions:
 #
@@ -XXX,XX +XXX,XX @@
 # @nbd: NBD export
 # @vhost-user-blk: vhost-user-blk export (since 5.2)
 # @fuse: FUSE export (since: 6.0)
+# @vduse-blk: vduse-blk export (since 7.1)
 #
 # Since: 4.2
 ##
@@ -XXX,XX +XXX,XX @@
   'data': [ 'nbd',
             { 'name': 'vhost-user-blk',
               'if': 'CONFIG_VHOST_USER_BLK_SERVER' },
-            { 'name': 'fuse', 'if': 'CONFIG_FUSE' } ] }
+            { 'name': 'fuse', 'if': 'CONFIG_FUSE' },
+            { 'name': 'vduse-blk', 'if': 'CONFIG_VDUSE_BLK_EXPORT' } ] }
 
 ##
 # @BlockExportOptions:
@@ -XXX,XX +XXX,XX @@
 # Describes a block export, i.e. how single node should be exported on an
 # external interface.
 #
-# @id: A unique identifier for the block export (across all export types)
+# @id: A unique identifier for the block export (across the host for vduse-blk
+#      export type or across all export types for other types)
 #
 # @node-name: The node name of the block node to be exported (since: 5.2)
 #
@@ -XXX,XX +XXX,XX @@
   'vhost-user-blk': { 'type': 'BlockExportOptionsVhostUserBlk',
                       'if': 'CONFIG_VHOST_USER_BLK_SERVER' },
   'fuse': { 'type': 'BlockExportOptionsFuse',
-            'if': 'CONFIG_FUSE' }
+            'if': 'CONFIG_FUSE' },
+  'vduse-blk': { 'type': 'BlockExportOptionsVduseBlk',
+                 'if': 'CONFIG_VDUSE_BLK_EXPORT' }
   } }
 
 ##
diff --git a/meson_options.txt b/meson_options.txt
index XXXXXXX..XXXXXXX 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -XXX,XX +XXX,XX @@ option('virtiofsd', type: 'feature', value: 'auto',
        description: 'build virtiofs daemon (virtiofsd)')
 option('libvduse', type: 'feature', value: 'auto',
        description: 'build VDUSE Library')
+option('vduse_blk_export', type: 'feature', value: 'auto',
+       description: 'VDUSE block export support')
 
 option('capstone', type: 'feature', value: 'auto',
        description: 'Whether and how to find the capstone library')
diff --git a/block/export/vduse-blk.h b/block/export/vduse-blk.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/block/export/vduse-blk.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * Export QEMU block device via VDUSE
+ *
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author:
+ *   Xie Yongji <xieyongji@bytedance.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef VDUSE_BLK_H
+#define VDUSE_BLK_H
+
+#include "block/export.h"
+
+extern const BlockExportDriver blk_exp_vduse_blk;
+
+#endif /* VDUSE_BLK_H */
diff --git a/block/export/export.c b/block/export/export.c
index XXXXXXX..XXXXXXX 100644
--- a/block/export/export.c
+++ b/block/export/export.c
@@ -XXX,XX +XXX,XX @@
 #ifdef CONFIG_VHOST_USER_BLK_SERVER
 #include "vhost-user-blk-server.h"
 #endif
+#ifdef CONFIG_VDUSE_BLK_EXPORT
+#include "vduse-blk.h"
+#endif
 
 static const BlockExportDriver *blk_exp_drivers[] = {
     &blk_exp_nbd,
@@ -XXX,XX +XXX,XX @@ static const BlockExportDriver *blk_exp_drivers[] = {
 #ifdef CONFIG_FUSE
     &blk_exp_fuse,
 #endif
+#ifdef CONFIG_VDUSE_BLK_EXPORT
+    &blk_exp_vduse_blk,
+#endif
 };
 
 /* Only accessed from the main thread */
diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/block/export/vduse-blk.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Export QEMU block device via VDUSE
+ *
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author:
+ *   Xie Yongji <xieyongji@bytedance.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include <sys/eventfd.h>
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "block/export.h"
+#include "qemu/error-report.h"
+#include "util/block-helpers.h"
+#include "subprojects/libvduse/libvduse.h"
+#include "virtio-blk-handler.h"
+
+#include "standard-headers/linux/virtio_blk.h"
+
+#define VDUSE_DEFAULT_NUM_QUEUE 1
+#define VDUSE_DEFAULT_QUEUE_SIZE 256
+
+typedef struct VduseBlkExport {
+    BlockExport export;
+    VirtioBlkHandler handler;
+    VduseDev *dev;
+    uint16_t num_queues;
+    unsigned int inflight;
+} VduseBlkExport;
+
+typedef struct VduseBlkReq {
+    VduseVirtqElement elem;
+    VduseVirtq *vq;
+} VduseBlkReq;
+
+static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
+{
+    vblk_exp->inflight++;
+}
+
+static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp)
+{
+    if (--vblk_exp->inflight == 0) {
+        aio_wait_kick();
+    }
+}
+
+static void vduse_blk_req_complete(VduseBlkReq *req, size_t in_len)
+{
+    vduse_queue_push(req->vq, &req->elem, in_len);
+    vduse_queue_notify(req->vq);
+
+    free(req);
+}
+
+static void coroutine_fn vduse_blk_virtio_process_req(void *opaque)
+{
+    VduseBlkReq *req = opaque;
+    VduseVirtq *vq = req->vq;
+    VduseDev *dev = vduse_queue_get_dev(vq);
+    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
+    VirtioBlkHandler *handler = &vblk_exp->handler;
+    VduseVirtqElement *elem = &req->elem;
+    struct iovec *in_iov = elem->in_sg;
+    struct iovec *out_iov = elem->out_sg;
+    unsigned in_num = elem->in_num;
+    unsigned out_num = elem->out_num;
+    int in_len;
+
+    in_len = virtio_blk_process_req(handler, in_iov,
+                                    out_iov, in_num, out_num);
+    if (in_len < 0) {
+        free(req);
+        return;
+    }
+
+    vduse_blk_req_complete(req, in_len);
+    vduse_blk_inflight_dec(vblk_exp);
+}
+
+static void vduse_blk_vq_handler(VduseDev *dev, VduseVirtq *vq)
+{
+    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
+
+    while (1) {
+        VduseBlkReq *req;
+
+        req = vduse_queue_pop(vq, sizeof(VduseBlkReq));
+        if (!req) {
+            break;
+        }
+        req->vq = vq;
+
+        Coroutine *co =
+            qemu_coroutine_create(vduse_blk_virtio_process_req, req);
+
+        vduse_blk_inflight_inc(vblk_exp);
+        qemu_coroutine_enter(co);
+    }
+}
+
+static void on_vduse_vq_kick(void *opaque)
+{
+    VduseVirtq *vq = opaque;
+    VduseDev *dev = vduse_queue_get_dev(vq);
+    int fd = vduse_queue_get_fd(vq);
+    eventfd_t kick_data;
+
+    if (eventfd_read(fd, &kick_data) == -1) {
+        error_report("failed to read data from eventfd");
+        return;
+    }
+
+    vduse_blk_vq_handler(dev, vq);
+}
+
+static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
+{
+    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
+
+    aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
+                       true, on_vduse_vq_kick, NULL, NULL, NULL, vq);
+}
+
+static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
+{
+    VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
+
+    aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
+                       true, NULL, NULL, NULL, NULL, NULL);
+}
+
+static const VduseOps vduse_blk_ops = {
+    .enable_queue = vduse_blk_enable_queue,
+    .disable_queue = vduse_blk_disable_queue,
+};
+
+static void on_vduse_dev_kick(void *opaque)
+{
+    VduseDev *dev = opaque;
+
+    vduse_dev_handler(dev);
+}
+
+static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx)
+{
+    int i;
+
+    aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
+                       true, on_vduse_dev_kick, NULL, NULL, NULL,
+                       vblk_exp->dev);
+
+    for (i = 0; i < vblk_exp->num_queues; i++) {
+        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
+        int fd = vduse_queue_get_fd(vq);
+
+        if (fd < 0) {
+            continue;
+        }
+        aio_set_fd_handler(vblk_exp->export.ctx, fd, true,
+                           on_vduse_vq_kick, NULL, NULL, NULL, vq);
+    }
+}
+
+static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp)
+{
+    int i;
+
+    for (i = 0; i < vblk_exp->num_queues; i++) {
+        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
+        int fd = vduse_queue_get_fd(vq);
+
+        if (fd < 0) {
+            continue;
+        }
+        aio_set_fd_handler(vblk_exp->export.ctx, fd,
+                           true, NULL, NULL, NULL, NULL, NULL);
+    }
+    aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
+                       true, NULL, NULL, NULL, NULL, NULL);
+
+    AIO_WAIT_WHILE(vblk_exp->export.ctx, vblk_exp->inflight > 0);
+}
+
+static void blk_aio_attached(AioContext *ctx, void *opaque)
+{
+    VduseBlkExport *vblk_exp = opaque;
+
+    vblk_exp->export.ctx = ctx;
+    vduse_blk_attach_ctx(vblk_exp, ctx);
+}
+
+static void blk_aio_detach(void *opaque)
+{
+    VduseBlkExport *vblk_exp = opaque;
+
+    vduse_blk_detach_ctx(vblk_exp);
+    vblk_exp->export.ctx = NULL;
+}
+
+static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
+                                Error **errp)
+{
+    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+    BlockExportOptionsVduseBlk *vblk_opts = &opts->u.vduse_blk;
+    uint64_t logical_block_size = VIRTIO_BLK_SECTOR_SIZE;
+    uint16_t num_queues = VDUSE_DEFAULT_NUM_QUEUE;
+    uint16_t queue_size = VDUSE_DEFAULT_QUEUE_SIZE;
+    Error *local_err = NULL;
+    struct virtio_blk_config config = { 0 };
+    uint64_t features;
+    int i;
+
+    if (vblk_opts->has_num_queues) {
+        num_queues = vblk_opts->num_queues;
+        if (num_queues == 0) {
+            error_setg(errp, "num-queues must be greater than 0");
+            return -EINVAL;
+        }
+    }
+
+    if (vblk_opts->has_queue_size) {
+        queue_size = vblk_opts->queue_size;
+        if (queue_size <= 2 || !is_power_of_2(queue_size) ||
+            queue_size > VIRTQUEUE_MAX_SIZE) {
+            error_setg(errp, "queue-size is invalid");
+            return -EINVAL;
+        }
+    }
+
+    if (vblk_opts->has_logical_block_size) {
+        logical_block_size = vblk_opts->logical_block_size;
+        check_block_size(exp->id, "logical-block-size", logical_block_size,
+                         &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return -EINVAL;
+        }
+    }
+    vblk_exp->num_queues = num_queues;
+    vblk_exp->handler.blk = exp->blk;
+    vblk_exp->handler.serial = exp->id;
+    vblk_exp->handler.logical_block_size = logical_block_size;
+    vblk_exp->handler.writable = opts->writable;
+
+    config.capacity =
+        cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
+    config.seg_max = cpu_to_le32(queue_size - 2);
+    config.min_io_size = cpu_to_le16(1);
+    config.opt_io_size = cpu_to_le32(1);
+    config.num_queues = cpu_to_le16(num_queues);
+    config.blk_size = cpu_to_le32(logical_block_size);
+    config.max_discard_sectors = cpu_to_le32(VIRTIO_BLK_MAX_DISCARD_SECTORS);
+    config.max_discard_seg = cpu_to_le32(1);
+    config.discard_sector_alignment =
+        cpu_to_le32(logical_block_size >> VIRTIO_BLK_SECTOR_BITS);
+    config.max_write_zeroes_sectors =
+        cpu_to_le32(VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS);
+    config.max_write_zeroes_seg = cpu_to_le32(1);
+
+    features = vduse_get_virtio_features() |
+               (1ULL << VIRTIO_BLK_F_SEG_MAX) |
+               (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
+               (1ULL << VIRTIO_BLK_F_BLK_SIZE) |
+               (1ULL << VIRTIO_BLK_F_FLUSH) |
+               (1ULL << VIRTIO_BLK_F_DISCARD) |
+               (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
+
+    if (num_queues > 1) {
+        features |= 1ULL << VIRTIO_BLK_F_MQ;
+    }
+    if (!opts->writable) {
+        features |= 1ULL << VIRTIO_BLK_F_RO;
+    }
+
+    vblk_exp->dev = vduse_dev_create(exp->id, VIRTIO_ID_BLOCK, 0,
+                                     features, num_queues,
+                                     sizeof(struct virtio_blk_config),
+                                     (char *)&config, &vduse_blk_ops,
+                                     vblk_exp);
+    if (!vblk_exp->dev) {
+        error_setg(errp, "failed to create vduse device");
+        return -ENOMEM;
+    }
+
+    for (i = 0; i < num_queues; i++) {
+        vduse_dev_setup_queue(vblk_exp->dev, i, queue_size);
+    }
+
+    aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), true,
+                       on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev);
+
+    blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
+                                 vblk_exp);
+
+    return 0;
+}
+
+static void vduse_blk_exp_delete(BlockExport *exp)
+{
+    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+
+    blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
+                                    vblk_exp);
+    vduse_dev_destroy(vblk_exp->dev);
+}
+
+static void vduse_blk_exp_request_shutdown(BlockExport *exp)
+{
+    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+
+    aio_context_acquire(vblk_exp->export.ctx);
+    vduse_blk_detach_ctx(vblk_exp);
+    aio_context_release(vblk_exp->export.ctx);
+}
+
+const BlockExportDriver blk_exp_vduse_blk = {
+    .type = BLOCK_EXPORT_TYPE_VDUSE_BLK,
+    .instance_size = sizeof(VduseBlkExport),
+    .create = vduse_blk_exp_create,
+    .delete = vduse_blk_exp_delete,
+    .request_shutdown = vduse_blk_exp_request_shutdown,
+};
diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org
 S: Supported
 F: block/export/fuse.c
 
-VDUSE library
+VDUSE library and block device exports
 M: Xie Yongji <xieyongji@bytedance.com>
 S: Maintained
 F: subprojects/libvduse/
+F: block/export/vduse-blk.c
+F: block/export/vduse-blk.h
 
 Replication
 M: Wen Congyang <wencongyang2@huawei.com>
diff --git a/block/export/meson.build b/block/export/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/block/export/meson.build
+++ b/block/export/meson.build
@@ -XXX,XX +XXX,XX @@ if have_vhost_user_blk_server
 endif
 
 blockdev_ss.add(when: fuse, if_true: files('fuse.c'))
+
+if have_vduse_blk_export
+  blockdev_ss.add(files('vduse-blk.c', 'virtio-blk-handler.c'))
+  blockdev_ss.add(libvduse)
+endif
diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ elif get_option('libvduse').disabled()
   have_libvduse = false
 endif
 
+have_vduse_blk_export = (have_libvduse and targetos == 'linux')
+if get_option('vduse_blk_export').enabled()
+  if targetos != 'linux'
+    error('vduse_blk_export requires linux')
+  elif not have_libvduse
+    error('vduse_blk_export requires libvduse support')
+  endif
+elif get_option('vduse_blk_export').disabled()
+  have_vduse_blk_export = false
+endif
+
 # libbpf
 libbpf = dependency('libbpf', required: get_option('bpf'), method: 'pkg-config')
 if libbpf.found() and not cc.links('''
@@ -XXX,XX +XXX,XX @@ config_host_data.set('CONFIG_VHOST_CRYPTO', have_vhost_user_crypto)
 config_host_data.set('CONFIG_VHOST_VDPA', have_vhost_vdpa)
 config_host_data.set('CONFIG_VMNET', vmnet.found())
 config_host_data.set('CONFIG_VHOST_USER_BLK_SERVER', have_vhost_user_blk_server)
+config_host_data.set('CONFIG_VDUSE_BLK_EXPORT', have_vduse_blk_export)
 config_host_data.set('CONFIG_PNG', png.found())
 config_host_data.set('CONFIG_VNC', vnc.found())
 config_host_data.set('CONFIG_VNC_JPEG', jpeg.found())
@@ -XXX,XX +XXX,XX @@ if have_block
   summary_info += {'qed support':       get_option('qed').allowed()}
   summary_info += {'parallels support': get_option('parallels').allowed()}
   summary_info += {'FUSE exports':      fuse}
+  summary_info += {'VDUSE block exports': have_vduse_blk_export}
 endif
 summary(summary_info, bool_yn: true, section: 'Block layer support')
 
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index XXXXXXX..XXXXXXX 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -XXX,XX +XXX,XX @@ meson_options_help() {
   printf "%s\n" '  vhost-user      vhost-user backend support'
   printf "%s\n" '  vhost-user-blk-server'
   printf "%s\n" '                  build vhost-user-blk server'
+  printf "%s\n" '  vduse-blk-export'
+  printf "%s\n" '                  VDUSE block export support'
   printf "%s\n" '  vhost-vdpa      vhost-vdpa kernel backend support'
   printf "%s\n" '  virglrenderer   virgl rendering support'
   printf "%s\n" '  virtfs          virtio-9p support'
@@ -XXX,XX +XXX,XX @@ _meson_option_parse() {
     --disable-vhost-user) printf "%s" -Dvhost_user=disabled ;;
     --enable-vhost-user-blk-server) printf "%s" -Dvhost_user_blk_server=enabled ;;
     --disable-vhost-user-blk-server) printf "%s" -Dvhost_user_blk_server=disabled ;;
+    --enable-vduse-blk-export) printf "%s" -Dvduse_blk_export=enabled ;;
+    --disable-vduse-blk-export) printf "%s" -Dvduse_blk_export=disabled ;;
     --enable-vhost-vdpa) printf "%s" -Dvhost_vdpa=enabled ;;
     --disable-vhost-vdpa) printf "%s" -Dvhost_vdpa=disabled ;;
     --enable-virglrenderer) printf "%s" -Dvirglrenderer=enabled ;;
--
2.35.3

If bdrv_do_drained_begin/end() are called in coroutine context, they
first use a BH to get out of the coroutine context. Call some existing
tests again from a coroutine to cover this code path.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret)
     *aio_ret = ret;
 }
 
+typedef struct CallInCoroutineData {
+    void (*entry)(void);
+    bool done;
+} CallInCoroutineData;
+
+static coroutine_fn void call_in_coroutine_entry(void *opaque)
+{
+    CallInCoroutineData *data = opaque;
+
+    data->entry();
+    data->done = true;
+}
+
+static void call_in_coroutine(void (*entry)(void))
+{
+    Coroutine *co;
+    CallInCoroutineData data = {
+        .entry = entry,
+        .done = false,
+    };
+
+    co = qemu_coroutine_create(call_in_coroutine_entry, &data);
+    qemu_coroutine_enter(co);
+    while (!data.done) {
+        aio_poll(qemu_get_aio_context(), true);
+    }
+}
+
 enum drain_type {
     BDRV_DRAIN_ALL,
     BDRV_DRAIN,
@@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_subtree(void)
     test_drv_cb_common(BDRV_SUBTREE_DRAIN, true);
 }
 
+static void test_drv_cb_co_drain(void)
+{
+    call_in_coroutine(test_drv_cb_drain);
+}
+
+static void test_drv_cb_co_drain_subtree(void)
+{
+    call_in_coroutine(test_drv_cb_drain_subtree);
+}
+
 static void test_quiesce_common(enum drain_type drain_type, bool recursive)
 {
     BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain_subtree(void)
     test_quiesce_common(BDRV_SUBTREE_DRAIN, true);
 }
 
+static void test_quiesce_co_drain(void)
+{
+    call_in_coroutine(test_quiesce_drain);
+}
+
+static void test_quiesce_co_drain_subtree(void)
+{
+    call_in_coroutine(test_quiesce_drain_subtree);
+}
+
 static void test_nested(void)
 {
     BlockBackend *blk;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/bdrv-drain/driver-cb/drain_subtree",
                     test_drv_cb_drain_subtree);
 
+    // XXX bdrv_drain_all() doesn't work in coroutine context
+    g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain);
+    g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree",
+                    test_drv_cb_co_drain_subtree);
+
     g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all);
     g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain);
     g_test_add_func("/bdrv-drain/quiesce/drain_subtree",
                     test_quiesce_drain_subtree);
 
+    // XXX bdrv_drain_all() doesn't work in coroutine context
+    g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain);
+    g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree",
+                    test_quiesce_co_drain_subtree);
+
     g_test_add_func("/bdrv-drain/nested", test_nested);
 
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
--
2.13.6
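
A note on shutdown ordering in the vduse-blk export above: every queued
request bumps an inflight counter before its coroutine is entered, and
detaching from the AioContext waits for the counter to drain. Condensed
sketch of that protocol (all names are from the patch; this pairs with
the aio_wait_kick() barrier patch earlier in this pull):

    /* per request, in vduse_blk_vq_handler(): */
    vduse_blk_inflight_inc(vblk_exp);
    qemu_coroutine_enter(co);          /* runs vduse_blk_virtio_process_req */

    /* on completion, in vduse_blk_virtio_process_req(): */
    vduse_blk_inflight_dec(vblk_exp);  /* calls aio_wait_kick() at zero */

    /* on detach, in vduse_blk_detach_ctx(): */
    AIO_WAIT_WHILE(vblk_exp->export.ctx, vblk_exp->inflight > 0);
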
From: Stefan Hajnoczi <stefanha@redhat.com>

Commit 1b7fd729559c ("block: rename buffer_alignment to
guest_block_size") noted:

  At this point, the field is set by the device emulation, but completely
  ignored by the block layer.

The last time the value of buffer_alignment/guest_block_size was
actually used was before commit 339064d50639 ("block: Don't use guest
sector size for qemu_blockalign()").

This value has not been used since 2013. Get rid of it.

Cc: Xie Yongji <xieyongji@bytedance.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20220518130945.2657905-1-stefanha@redhat.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Alberto Faria <afaria@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/sysemu/block-backend-io.h    |  1 -
 block/block-backend.c                | 10 ----------
 block/export/vhost-user-blk-server.c |  1 -
 hw/block/virtio-blk.c                |  1 -
 hw/block/xen-block.c                 |  1 -
 hw/ide/core.c                        |  1 -
 hw/scsi/scsi-disk.c                  |  1 -
 hw/scsi/scsi-generic.c               |  1 -
 8 files changed, 17 deletions(-)

diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -XXX,XX +XXX,XX @@ void blk_error_action(BlockBackend *blk, BlockErrorAction action,
 void blk_iostatus_set_err(BlockBackend *blk, int error);
 int blk_get_max_iov(BlockBackend *blk);
 int blk_get_max_hw_iov(BlockBackend *blk);
-void blk_set_guest_block_size(BlockBackend *blk, int align);
 
 void blk_io_plug(BlockBackend *blk);
 void blk_io_unplug(BlockBackend *blk);
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ struct BlockBackend {
     const BlockDevOps *dev_ops;
     void *dev_opaque;
 
-    /* the block size for which the guest device expects atomicity */
-    int guest_block_size;
-
     /* If the BDS tree is removed, some of its options are stored here (which
      * can be used to restore those options in the new BDS on insert) */
     BlockBackendRootState root_state;
@@ -XXX,XX +XXX,XX @@ void blk_detach_dev(BlockBackend *blk, DeviceState *dev)
     blk->dev = NULL;
     blk->dev_ops = NULL;
     blk->dev_opaque = NULL;
-    blk->guest_block_size = 512;
     blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
     blk_unref(blk);
 }
@@ -XXX,XX +XXX,XX @@ int blk_get_max_iov(BlockBackend *blk)
     return blk->root->bs->bl.max_iov;
 }
 
-void blk_set_guest_block_size(BlockBackend *blk, int align)
-{
-    IO_CODE();
-    blk->guest_block_size = align;
-}
-
 void *blk_try_blockalign(BlockBackend *blk, size_t size)
 {
     IO_CODE();
diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
index XXXXXXX..XXXXXXX 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
         return -EINVAL;
     }
     vexp->blk_size = logical_block_size;
-    blk_set_guest_block_size(exp->blk, logical_block_size);
 
     if (vu_opts->has_num_queues) {
         num_queues = vu_opts->num_queues;
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
 
     s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
     blk_set_dev_ops(s->blk, &virtio_block_ops, s);
-    blk_set_guest_block_size(s->blk, s->conf.conf.logical_block_size);
 
     blk_iostatus_enable(s->blk);
 
diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/xen-block.c
+++ b/hw/block/xen-block.c
@@ -XXX,XX +XXX,XX @@ static void xen_block_realize(XenDevice *xendev, Error **errp)
     }
 
     blk_set_dev_ops(blk, &xen_block_dev_ops, blockdev);
-    blk_set_guest_block_size(blk, conf->logical_block_size);
 
     if (conf->discard_granularity == -1) {
         conf->discard_granularity = conf->physical_block_size;
diff --git a/hw/ide/core.c b/hw/ide/core.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -XXX,XX +XXX,XX @@ int ide_init_drive(IDEState *s, BlockBackend *blk, IDEDriveKind kind,
     s->smart_selftest_count = 0;
     if (kind == IDE_CD) {
         blk_set_dev_ops(blk, &ide_cd_block_ops, s);
-        blk_set_guest_block_size(blk, 2048);
     } else {
         if (!blk_is_inserted(s->blk)) {
             error_setg(errp, "Device needs media, but drive is empty");
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -XXX,XX +XXX,XX @@ static void scsi_realize(SCSIDevice *dev, Error **errp)
     } else {
         blk_set_dev_ops(s->qdev.conf.blk, &scsi_disk_block_ops, s);
     }
-    blk_set_guest_block_size(s->qdev.conf.blk, s->qdev.blocksize);
 
     blk_iostatus_enable(s->qdev.conf.blk);
 
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
         s->blocksize = ldl_be_p(&r->buf[8]);
         s->max_lba = ldq_be_p(&r->buf[0]);
     }
-    blk_set_guest_block_size(s->conf.blk, s->blocksize);
 
     /*
      * Patch MODE SENSE device specific parameters if the BDS is opened
--
2.35.3

Test that drain sections are correctly propagated through the graph.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_nested(void)
     blk_unref(blk);
 }
 
+static void test_multiparent(void)
+{
+    BlockBackend *blk_a, *blk_b;
+    BlockDriverState *bs_a, *bs_b, *backing;
+    BDRVTestState *a_s, *b_s, *backing_s;
+
+    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
+                                &error_abort);
+    a_s = bs_a->opaque;
+    blk_insert_bs(blk_a, bs_a, &error_abort);
+
+    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
+                                &error_abort);
+    b_s = bs_b->opaque;
+    blk_insert_bs(blk_b, bs_b, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs_a, backing, &error_abort);
+    bdrv_set_backing_hd(bs_b, backing, &error_abort);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(backing->quiesce_counter, ==, 1);
+    g_assert_cmpint(a_s->drain_count, ==, 1);
+    g_assert_cmpint(b_s->drain_count, ==, 1);
+    g_assert_cmpint(backing_s->drain_count, ==, 1);
+
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 2);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
+    g_assert_cmpint(backing->quiesce_counter, ==, 2);
+    g_assert_cmpint(a_s->drain_count, ==, 2);
+    g_assert_cmpint(b_s->drain_count, ==, 2);
+    g_assert_cmpint(backing_s->drain_count, ==, 2);
+
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 1);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 1);
+    g_assert_cmpint(backing->quiesce_counter, ==, 1);
+    g_assert_cmpint(a_s->drain_count, ==, 1);
+    g_assert_cmpint(b_s->drain_count, ==, 1);
+    g_assert_cmpint(backing_s->drain_count, ==, 1);
+
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    bdrv_unref(backing);
+    bdrv_unref(bs_a);
+    bdrv_unref(bs_b);
+    blk_unref(blk_a);
+    blk_unref(blk_b);
+}
+
 typedef struct TestBlockJob {
     BlockJob common;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
                     test_quiesce_co_drain_subtree);
 
     g_test_add_func("/bdrv-drain/nested", test_nested);
+    g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
 
     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
--
2.13.6
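
On why the guest_block_size removal two patches above is safe: since
commit 339064d50639, buffer alignment comes from the block layer itself
rather than from the guest device, so callers simply rely on the
blockalign helpers, which derive alignment from the BlockBackend's root
node. A minimal sketch:

    /* alignment comes from blk's root node, not from any guest block size */
    void *buf = blk_blockalign(blk, len);
    /* ... perform I/O with buf ... */
    qemu_vfree(buf);
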
We need to remember how many of the drain sections a node is in were
recursive (i.e. subtree drain rather than node drain), so that they
can be correctly applied when children are added or removed during the
drained section.

With this change, it is safe to modify the graph even inside a
bdrv_subtree_drained_begin/end() section.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block.h     |  2 --
 include/block/block_int.h |  5 +++++
 block.c                   | 32 +++++++++++++++++++++++++++++---
 block/io.c                | 28 ++++++++++++++++++++++++----
 4 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs);
 /**
  * Like bdrv_drained_begin, but recursively begins a quiesced section for
  * exclusive access to all child nodes as well.
- *
- * Graph changes are not allowed during a subtree drain section.
  */
 void bdrv_subtree_drained_begin(BlockDriverState *bs);

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {

     /* Accessed with atomic ops. */
     int quiesce_counter;
+    int recursive_quiesce_counter;
+
     unsigned int write_gen; /* Current data generation */

     /* Protected by reqs_lock. */
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags);

+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
+
 int get_tmp_filename(char *filename, int size);
 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
                             const char *filename);
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_child_cb_drained_end(BdrvChild *child)
     bdrv_drained_end(bs);
 }

+static void bdrv_child_cb_attach(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    bdrv_apply_subtree_drain(child, bs);
+}
+
+static void bdrv_child_cb_detach(BdrvChild *child)
+{
+    BlockDriverState *bs = child->opaque;
+    bdrv_unapply_subtree_drain(child, bs);
+}
+
 static int bdrv_child_cb_inactivate(BdrvChild *child)
 {
     BlockDriverState *bs = child->opaque;
@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_file = {
     .inherit_options = bdrv_inherited_options,
     .drained_begin = bdrv_child_cb_drained_begin,
     .drained_end = bdrv_child_cb_drained_end,
+    .attach = bdrv_child_cb_attach,
+    .detach = bdrv_child_cb_detach,
     .inactivate = bdrv_child_cb_inactivate,
 };

@@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_format = {
     .inherit_options = bdrv_inherited_fmt_options,
     .drained_begin = bdrv_child_cb_drained_begin,
     .drained_end = bdrv_child_cb_drained_end,
+    .attach = bdrv_child_cb_attach,
+    .detach = bdrv_child_cb_detach,
     .inactivate = bdrv_child_cb_inactivate,
 };

@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_attach(BdrvChild *c)
                     parent->backing_blocker);
     bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
                     parent->backing_blocker);
+
+    bdrv_child_cb_attach(c);
 }

 static void bdrv_backing_detach(BdrvChild *c)
@@ -XXX,XX +XXX,XX @@ static void bdrv_backing_detach(BdrvChild *c)
     bdrv_op_unblock_all(c->bs, parent->backing_blocker);
     error_free(parent->backing_blocker);
     parent->backing_blocker = NULL;
+
+    bdrv_child_cb_detach(c);
 }

 /*
@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
         assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
     }
     if (old_bs) {
+        /* Detach first so that the recursive drain sections coming from @child
+         * are already gone and we only end the drain sections that came from
+         * elsewhere. */
+        if (child->role->detach) {
+            child->role->detach(child);
+        }
         if (old_bs->quiesce_counter && child->role->drained_end) {
             for (i = 0; i < old_bs->quiesce_counter; i++) {
                 child->role->drained_end(child);
             }
         }
-        if (child->role->detach) {
-            child->role->detach(child);
-        }
         QLIST_REMOVE(child, next_parent);
     }

@@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child,
         }
     }

+    /* Attach only after starting new drained sections, so that recursive
+     * drain sections coming from @child don't get an extra .drained_begin
+     * callback. */
     if (child->role->attach) {
         child->role->attach(child);
     }
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
     assert(data.done);
 }

-static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
-                                  BdrvChild *parent)
+void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
+                           BdrvChild *parent)
 {
     BdrvChild *child, *next;

@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
     bdrv_drain_recurse(bs);

     if (recursive) {
+        bs->recursive_quiesce_counter++;
         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
             bdrv_do_drained_begin(child->bs, true, child);
         }
@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_begin(BlockDriverState *bs)
     bdrv_do_drained_begin(bs, true, NULL);
 }

-static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
-                                BdrvChild *parent)
+void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
+                         BdrvChild *parent)
 {
     BdrvChild *child, *next;
     int old_quiesce_counter;
@@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
     }

     if (recursive) {
+        bs->recursive_quiesce_counter--;
         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
             bdrv_do_drained_end(child->bs, true, child);
         }
@@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_end(BlockDriverState *bs)
     bdrv_do_drained_end(bs, true, NULL);
 }

+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
+{
+    int i;
+
+    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
+        bdrv_do_drained_begin(child->bs, true, child);
+    }
+}
+
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
+{
+    int i;
+
+    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
+        bdrv_do_drained_end(child->bs, true, child);
+    }
+}
+
 /*
  * Wait for pending requests to complete on a single BlockDriverState subtree,
  * and suspend block driver's internal I/O until next request arrives.
-- 
2.13.6

From: Xie Yongji <xieyongji@bytedance.com>

Currently we use the 'id' option as the name of the VDUSE device.
It's a bit confusing since we use one value for two different
purposes: the ID to identify the export within QEMU (must be
distinct from any other exports in the same QEMU process, but
can overlap with names used by other processes), and the VDUSE
name to uniquely identify the device on the host (must be distinct
from other VDUSE devices on the same host, but can overlap with
other export types like NBD in the same process). To make this
clear, this patch adds a separate 'name' option to specify the
VDUSE name for the vduse-blk export instead.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Message-Id: <20220614051532.92-7-xieyongji@bytedance.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qapi/block-export.json               | 7 ++++---
 docs/tools/qemu-storage-daemon.rst   | 5 +++--
 block/export/vduse-blk.c             | 4 ++--
 storage-daemon/qemu-storage-daemon.c | 8 ++++----
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/qapi/block-export.json b/qapi/block-export.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block-export.json
+++ b/qapi/block-export.json
@@ -XXX,XX +XXX,XX @@
 #
 # A vduse-blk block export.
 #
+# @name: the name of the VDUSE device (must be unique across the host).
 # @num-queues: the number of virtqueues. Defaults to 1.
 # @queue-size: the size of virtqueue. Defaults to 256.
 # @logical-block-size: Logical block size in bytes. Range [512, PAGE_SIZE]
@@ -XXX,XX +XXX,XX @@
 # Since: 7.1
 ##
 { 'struct': 'BlockExportOptionsVduseBlk',
-  'data': { '*num-queues': 'uint16',
+  'data': { 'name': 'str',
+            '*num-queues': 'uint16',
             '*queue-size': 'uint16',
             '*logical-block-size': 'size',
             '*serial': 'str' } }
@@ -XXX,XX +XXX,XX @@
 # Describes a block export, i.e. how single node should be exported on an
 # external interface.
 #
-# @id: A unique identifier for the block export (across the host for vduse-blk
-#      export type or across all export types for other types)
+# @id: A unique identifier for the block export (across all export types)
 #
 # @node-name: The node name of the block node to be exported (since: 5.2)
 #
diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
index XXXXXXX..XXXXXXX 100644
--- a/docs/tools/qemu-storage-daemon.rst
+++ b/docs/tools/qemu-storage-daemon.rst
@@ -XXX,XX +XXX,XX @@ Standard options:
  --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
  --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
  --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
- --export [type=]vduse-blk,id=<id>,node-name=<node-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]
+ --export [type=]vduse-blk,id=<id>,node-name=<node-name>,name=<vduse-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]

 is a block export definition. ``node-name`` is the block node that should be
 exported. ``writable`` determines whether or not the export allows write
@@ -XXX,XX +XXX,XX @@ Standard options:
 ``allow-other`` to auto (the default) will try enabling this option, and on
 error fall back to disabling it.

-The ``vduse-blk`` export type uses the ``id`` as the VDUSE device name.
+The ``vduse-blk`` export type takes a ``name`` (must be unique across the host)
+to create the VDUSE device.
 ``num-queues`` sets the number of virtqueues (the default is 1).
 ``queue-size`` sets the virtqueue descriptor table size (the default is 256).

diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/block/export/vduse-blk.c
+++ b/block/export/vduse-blk.c
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
         features |= 1ULL << VIRTIO_BLK_F_RO;
     }

-    vblk_exp->dev = vduse_dev_create(exp->id, VIRTIO_ID_BLOCK, 0,
+    vblk_exp->dev = vduse_dev_create(vblk_opts->name, VIRTIO_ID_BLOCK, 0,
                                      features, num_queues,
                                      sizeof(struct virtio_blk_config),
                                      (char *)&config, &vduse_blk_ops,
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
     }

     vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s",
-                                           g_get_tmp_dir(), exp->id);
+                                           g_get_tmp_dir(), vblk_opts->name);
     if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) {
         error_setg(errp, "failed to set reconnect log file");
         ret = -EINVAL;
diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
index XXXXXXX..XXXXXXX 100644
--- a/storage-daemon/qemu-storage-daemon.c
+++ b/storage-daemon/qemu-storage-daemon.c
@@ -XXX,XX +XXX,XX @@ static void help(void)
 #endif /* CONFIG_VHOST_USER_BLK_SERVER */
 #ifdef CONFIG_VDUSE_BLK_EXPORT
 "  --export [type=]vduse-blk,id=<id>,node-name=<node-name>\n"
-"           [,writable=on|off][,num-queues=<num-queues>]\n"
-"           [,queue-size=<queue-size>]\n"
+"           ,name=<vduse-name>[,writable=on|off]\n"
+"           [,num-queues=<num-queues>][,queue-size=<queue-size>]\n"
 "           [,logical-block-size=<logical-block-size>]\n"
 "           [,serial=<serial-number>]\n"
-"                         export the specified block node as a vduse-blk\n"
-"                         device using the id as the VDUSE device name\n"
+"                         export the specified block node as a\n"
+"                         vduse-blk device\n"
 "\n"
 #endif /* CONFIG_VDUSE_BLK_EXPORT */
 "  --monitor [chardev=]name[,mode=control][,pretty[=on|off]]\n"
-- 
2.35.3
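
As a usage illustration of the separated identifiers (not part of the
patch; the disk image path, node name, export ID and VDUSE device name
below are made up), an export definition now spells the QEMU-internal
``id`` and the host-visible VDUSE ``name`` independently:

    qemu-storage-daemon \
        --blockdev driver=file,node-name=disk0,filename=disk.img \
        --export type=vduse-blk,id=export0,node-name=disk0,name=vduse0,writable=on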
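A minimal sketch of what the subtree-drain patch above enables (this is
not code from the series; bs and new_backing are hypothetical nodes, and
the same sequence is exercised by the test added in the next patch):

    bdrv_subtree_drained_begin(bs);    /* quiesces bs and all children */
    /* The graph change is now safe: the .attach callback replays the
     * parent's recursive_quiesce_counter drain sections on the new
     * child, so the quiesce counters stay balanced. */
    bdrv_set_backing_hd(bs, new_backing, &error_abort);
    bdrv_subtree_drained_end(bs);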
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/test-bdrv-drain.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -XXX,XX +XXX,XX @@ static void test_multiparent(void)
     blk_unref(blk_b);
 }

+static void test_graph_change(void)
+{
+    BlockBackend *blk_a, *blk_b;
+    BlockDriverState *bs_a, *bs_b, *backing;
+    BDRVTestState *a_s, *b_s, *backing_s;
+
+    blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR,
+                                &error_abort);
+    a_s = bs_a->opaque;
+    blk_insert_bs(blk_a, bs_a, &error_abort);
+
+    blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR,
+                                &error_abort);
+    b_s = bs_b->opaque;
+    blk_insert_bs(blk_b, bs_b, &error_abort);
+
+    backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort);
+    backing_s = backing->opaque;
+    bdrv_set_backing_hd(bs_a, backing, &error_abort);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
+    do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b);
+
+    bdrv_set_backing_hd(bs_b, backing, &error_abort);
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
+    g_assert_cmpint(backing->quiesce_counter, ==, 5);
+    g_assert_cmpint(a_s->drain_count, ==, 5);
+    g_assert_cmpint(b_s->drain_count, ==, 5);
+    g_assert_cmpint(backing_s->drain_count, ==, 5);
+
+    bdrv_set_backing_hd(bs_b, NULL, &error_abort);
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 3);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 2);
+    g_assert_cmpint(backing->quiesce_counter, ==, 3);
+    g_assert_cmpint(a_s->drain_count, ==, 3);
+    g_assert_cmpint(b_s->drain_count, ==, 2);
+    g_assert_cmpint(backing_s->drain_count, ==, 3);
+
+    bdrv_set_backing_hd(bs_b, backing, &error_abort);
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 5);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 5);
+    g_assert_cmpint(backing->quiesce_counter, ==, 5);
+    g_assert_cmpint(a_s->drain_count, ==, 5);
+    g_assert_cmpint(b_s->drain_count, ==, 5);
+    g_assert_cmpint(backing_s->drain_count, ==, 5);
+
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_b);
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
+    do_drain_end(BDRV_SUBTREE_DRAIN, bs_a);
+
+    g_assert_cmpint(bs_a->quiesce_counter, ==, 0);
+    g_assert_cmpint(bs_b->quiesce_counter, ==, 0);
+    g_assert_cmpint(backing->quiesce_counter, ==, 0);
+    g_assert_cmpint(a_s->drain_count, ==, 0);
+    g_assert_cmpint(b_s->drain_count, ==, 0);
+    g_assert_cmpint(backing_s->drain_count, ==, 0);
+
+    bdrv_unref(backing);
+    bdrv_unref(bs_a);
+    bdrv_unref(bs_b);
+    blk_unref(blk_a);
+    blk_unref(blk_b);
+}
+

 typedef struct TestBlockJob {
     BlockJob common;
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)

     g_test_add_func("/bdrv-drain/nested", test_nested);
     g_test_add_func("/bdrv-drain/multiparent", test_multiparent);
+    g_test_add_func("/bdrv-drain/graph-change", test_graph_change);

     g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all);
     g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain);
-- 
2.13.6

From: Fabian Ebner <f.ebner@proxmox.com>

On 64-bit platforms, assigning SIZE_MAX to the int64_t max_pdiscard
results in a negative value, and the following assertion would trigger
down the line (it's not the same max_pdiscard, but computed from the
other one):
qemu-system-x86_64: ../block/io.c:3166: bdrv_co_pdiscard: Assertion
`max_pdiscard >= bs->bl.request_alignment' failed.

On 32-bit platforms, it's fine to keep using SIZE_MAX.

The assertion in qemu_gluster_co_pdiscard() is checking that the value
of 'bytes' can safely be passed to glfs_discard_async(), which takes a
size_t for the argument in question, so it is kept as is. And since
max_pdiscard is still <= SIZE_MAX, relying on max_pdiscard is still
fine.

Fixes: 0c8022876f ("block: use int64_t instead of int in driver discard handlers")
Cc: qemu-stable@nongnu.org
Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
Message-Id: <20220520075922.43972-1-f.ebner@proxmox.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/gluster.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/gluster.c b/block/gluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -XXX,XX +XXX,XX @@ out:
 static void qemu_gluster_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     bs->bl.max_transfer = GLUSTER_MAX_TRANSFER;
-    bs->bl.max_pdiscard = SIZE_MAX;
+    bs->bl.max_pdiscard = MIN(SIZE_MAX, INT64_MAX);
 }

 static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
-- 
2.35.3
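
A standalone illustration of the overflow this patch fixes (not from the
series; plain C, with a local MIN macro standing in for the one QEMU
gets from glib):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b) (((a) < (b)) ? (a) : (b))

    int main(void)
    {
        /* On an LP64 host, SIZE_MAX is 2^64 - 1, so converting it to
         * int64_t wraps to -1 (implementation-defined, but universal on
         * two's-complement hosts) and would later fail the
         * "max_pdiscard >= bs->bl.request_alignment" assertion. */
        int64_t bad = (int64_t)SIZE_MAX;          /* -1 on 64-bit hosts */
        int64_t good = MIN(SIZE_MAX, INT64_MAX);  /* INT64_MAX instead  */

        printf("bad=%" PRId64 " good=%" PRId64 "\n", bad, good);
        return 0;
    }

On 32-bit hosts, SIZE_MAX is smaller than INT64_MAX, so MIN() keeps the
old value there and the change is a no-op.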
Since commit bde70715, base is the only node that is reopened in
commit_start(). This means that the code, which still involves an
explicit BlockReopenQueue, can now be simplified by using bdrv_reopen().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 block/commit.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/block/commit.c b/block/commit.c
index XXXXXXX..XXXXXXX 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
                   const char *filter_node_name, Error **errp)
 {
     CommitBlockJob *s;
-    BlockReopenQueue *reopen_queue = NULL;
     int orig_base_flags;
     BlockDriverState *iter;
     BlockDriverState *commit_top_bs = NULL;
@@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs,
     /* convert base to r/w, if necessary */
     orig_base_flags = bdrv_get_flags(base);
     if (!(orig_base_flags & BDRV_O_RDWR)) {
-        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
-                                         orig_base_flags | BDRV_O_RDWR);
-    }
-
-    if (reopen_queue) {
-        bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
+        bdrv_reopen(base, orig_base_flags | BDRV_O_RDWR, &local_err);
         if (local_err != NULL) {
             error_propagate(errp, local_err);
             goto fail;
-- 
2.13.6
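In short, the queue-of-one dance collapses into the single-node wrapper;
a sketch of the equivalence, using the names from the diff above:

    /* Before: build a reopen queue containing only 'base'... */
    reopen_queue = bdrv_reopen_queue(NULL, base, NULL,
                                     orig_base_flags | BDRV_O_RDWR);
    bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);

    /* After: bdrv_reopen() queues and commits the single node itself. */
    bdrv_reopen(base, orig_base_flags | BDRV_O_RDWR, &local_err);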
The bdrv_reopen*() implementation doesn't like it if the graph is
changed between queuing nodes for reopen and actually reopening them
(one of the reasons is that queuing can be recursive).

So instead of draining the device only in bdrv_reopen_multiple(),
require that callers have already drained all affected nodes, and
assert this in bdrv_reopen_queue().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 block.c             | 23 ++++++++++++++++-------
 block/replication.c |  6 ++++++
 qemu-io-cmds.c      |  3 +++
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
  * returns a pointer to bs_queue, which is either the newly allocated
  * bs_queue, or the existing bs_queue being used.
  *
+ * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
  */
 static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
                                                  BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
     BdrvChild *child;
     QDict *old_options, *explicit_options;

+    /* Make sure that the caller remembered to use a drained section. This is
+     * important to avoid graph changes between the recursive queuing here and
+     * bdrv_reopen_multiple(). */
+    assert(bs->quiesce_counter > 0);
+
     if (bs_queue == NULL) {
         bs_queue = g_new0(BlockReopenQueue, 1);
         QSIMPLEQ_INIT(bs_queue);
@@ -XXX,XX +XXX,XX @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
  * If all devices prepare successfully, then the changes are committed
  * to all devices.
  *
+ * All affected nodes must be drained between bdrv_reopen_queue() and
+ * bdrv_reopen_multiple().
  */
 int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **er

     assert(bs_queue != NULL);

-    aio_context_release(ctx);
-    bdrv_drain_all_begin();
-    aio_context_acquire(ctx);
-
     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
+        assert(bs_entry->state.bs->quiesce_counter > 0);
         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
             error_propagate(errp, local_err);
             goto cleanup;
@@ -XXX,XX +XXX,XX @@ cleanup:
     }
     g_free(bs_queue);

-    bdrv_drain_all_end();
-
     return ret;
 }

@@ -XXX,XX +XXX,XX @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
 {
     int ret = -1;
     Error *local_err = NULL;
-    BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
+    BlockReopenQueue *queue;

+    bdrv_subtree_drained_begin(bs);
+
+    queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
     ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
     if (local_err != NULL) {
         error_propagate(errp, local_err);
     }
+
+    bdrv_subtree_drained_end(bs);
+
     return ret;
 }

diff --git a/block/replication.c b/block/replication.c
index XXXXXXX..XXXXXXX 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
         new_secondary_flags = s->orig_secondary_flags;
     }

+    bdrv_subtree_drained_begin(s->hidden_disk->bs);
+    bdrv_subtree_drained_begin(s->secondary_disk->bs);
+
     if (orig_hidden_flags != new_hidden_flags) {
         reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, NULL,
                                          new_hidden_flags);
@@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
                              reopen_queue, &local_err);
         error_propagate(errp, local_err);
     }
+
+    bdrv_subtree_drained_end(s->hidden_disk->bs);
+    bdrv_subtree_drained_end(s->secondary_disk->bs);
 }

 static void backup_job_cleanup(BlockDriverState *bs)
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -XXX,XX +XXX,XX @@ static int reopen_f(BlockBackend *blk, int argc, char **argv)
     opts = qopts ? qemu_opts_to_qdict(qopts, NULL) : NULL;
     qemu_opts_reset(&reopen_opts);

+    bdrv_subtree_drained_begin(bs);
     brq = bdrv_reopen_queue(NULL, bs, opts, flags);
     bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, &local_err);
+    bdrv_subtree_drained_end(bs);
+
     if (local_err) {
         error_report_err(local_err);
     } else {
-- 
2.13.6
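
The resulting calling convention, distilled from the qemu-io-cmds.c hunk
above (a sketch with error handling elided, not additional code from the
series):

    bdrv_subtree_drained_begin(bs);     /* keeps the graph stable from here */
    queue = bdrv_reopen_queue(NULL, bs, opts, flags);  /* asserts the drain */
    bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err);
    bdrv_subtree_drained_end(bs);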