1 | The following changes since commit 3a821c52e1a30ecd9a436f2c67cc66b5628c829f: | 1 | The following changes since commit 281f327487c9c9b1599f93c589a408bbf4a651b8: |
---|---|---|---|
2 | 2 | ||
3 | Merge tag 'nvme-next-pull-request' of git://git.infradead.org/qemu-nvme into staging (2022-06-23 14:52:30 -0700) | 3 | Merge remote-tracking branch 'remotes/vivier/tags/m68k-for-2.12-pull-request' into staging (2017-12-22 00:11:36 +0000) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the git repository at: |
6 | 6 | ||
7 | git://repo.or.cz/qemu/kevin.git tags/for-upstream | 7 | git://repo.or.cz/qemu/kevin.git tags/for-upstream |
8 | 8 | ||
9 | for you to fetch changes up to 779d82e1d305f2a9cbd7f48cf6555ad58145e04a: | 9 | for you to fetch changes up to 1a63a907507fbbcfaee3f622907ec244b7eabda8: |
10 | 10 | ||
11 | vduse-blk: Add name option (2022-06-24 17:07:06 +0200) | 11 | block: Keep nodes drained between reopen_queue/multiple (2017-12-22 15:05:32 +0100) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Block layer patches | 14 | Block layer patches |
15 | 15 | ||
16 | - Add vduse-blk export | 16 | ---------------------------------------------------------------- |
17 | - Dirty bitmaps: Fix and improve bitmap merge | 17 | Doug Gale (1): |
18 | - gluster: correctly set max_pdiscard | 18 | nvme: Add tracing |
19 | - rbd: report a better error when namespace does not exist | ||
20 | - aio_wait_kick: add missing memory barrier | ||
21 | - Code cleanups | ||
22 | 19 | ||
23 | ---------------------------------------------------------------- | 20 | Edgar Kaziakhmedov (1): |
24 | Emanuele Giuseppe Esposito (1): | 21 | qcow2: get rid of qcow2_backing_read1 routine |
25 | aio_wait_kick: add missing memory barrier | ||
26 | 22 | ||
27 | Eric Blake (1): | 23 | Fam Zheng (2): |
28 | nbd: Drop dead code spotted by Coverity | 24 | block: Open backing image in force share mode for size probe |
25 | block: Remove unused bdrv_requests_pending | ||
29 | 26 | ||
30 | Fabian Ebner (1): | 27 | John Snow (1): |
31 | block/gluster: correctly set max_pdiscard | 28 | iotests: fix 197 for vpc |
32 | 29 | ||
33 | Stefan Hajnoczi (3): | 30 | Kevin Wolf (27): |
34 | block: drop unused bdrv_co_drain() API | 31 | block: Formats don't need CONSISTENT_READ with NO_IO |
35 | block: get rid of blk->guest_block_size | 32 | block: Make bdrv_drain_invoke() recursive |
36 | qsd: document vduse-blk exports | 33 | block: Call .drain_begin only once in bdrv_drain_all_begin() |
34 | test-bdrv-drain: Test BlockDriver callbacks for drain | ||
35 | block: bdrv_drain_recurse(): Remove unused begin parameter | ||
36 | block: Don't wait for requests in bdrv_drain*_end() | ||
37 | block: Unify order in drain functions | ||
38 | block: Don't acquire AioContext in hmp_qemu_io() | ||
39 | block: Document that x-blockdev-change breaks quorum children list | ||
40 | block: Assert drain_all is only called from main AioContext | ||
41 | block: Make bdrv_drain() driver callbacks non-recursive | ||
42 | test-bdrv-drain: Test callback for bdrv_drain | ||
43 | test-bdrv-drain: Test bs->quiesce_counter | ||
44 | blockjob: Pause job on draining any job BDS | ||
45 | test-bdrv-drain: Test drain vs. block jobs | ||
46 | block: Don't block_job_pause_all() in bdrv_drain_all() | ||
47 | block: Nested drain_end must still call callbacks | ||
48 | test-bdrv-drain: Test nested drain sections | ||
49 | block: Don't notify parents in drain call chain | ||
50 | block: Add bdrv_subtree_drained_begin/end() | ||
51 | test-bdrv-drain: Tests for bdrv_subtree_drain | ||
52 | test-bdrv-drain: Test behaviour in coroutine context | ||
53 | test-bdrv-drain: Recursive draining with multiple parents | ||
54 | block: Allow graph changes in subtree drained section | ||
55 | test-bdrv-drain: Test graph changes in drained section | ||
56 | commit: Simplify reopen of base | ||
57 | block: Keep nodes drained between reopen_queue/multiple | ||
37 | 58 | ||
38 | Stefano Garzarella (1): | 59 | Thomas Huth (3): |
39 | block/rbd: report a better error when namespace does not exist | 60 | block: Remove the obsolete -drive boot=on|off parameter |
61 | block: Remove the deprecated -hdachs option | ||
62 | block: Mention -drive cyls/heads/secs/trans/serial/addr in deprecation chapter | ||
40 | 63 | ||
41 | Vladimir Sementsov-Ogievskiy (3): | 64 | qapi/block-core.json | 4 + |
42 | block: block_dirty_bitmap_merge(): fix error path | 65 | block/qcow2.h | 3 - |
43 | block: improve block_dirty_bitmap_merge(): don't allocate extra bitmap | 66 | include/block/block.h | 15 +- |
44 | block: simplify handling of try to merge different sized bitmaps | 67 | include/block/block_int.h | 6 +- |
68 | block.c | 75 ++++- | ||
69 | block/commit.c | 8 +- | ||
70 | block/io.c | 164 +++++++--- | ||
71 | block/qcow2.c | 51 +-- | ||
72 | block/replication.c | 6 + | ||
73 | blockdev.c | 11 - | ||
74 | blockjob.c | 22 +- | ||
75 | hmp.c | 6 - | ||
76 | hw/block/nvme.c | 349 +++++++++++++++++---- | ||
77 | qemu-io-cmds.c | 3 + | ||
78 | tests/test-bdrv-drain.c | 651 +++++++++++++++++++++++++++++++++++++++ | ||
79 | vl.c | 86 +----- | ||
80 | hw/block/trace-events | 93 ++++++ | ||
81 | qemu-doc.texi | 29 +- | ||
82 | qemu-options.hx | 19 +- | ||
83 | tests/Makefile.include | 2 + | ||
84 | tests/qemu-iotests/197 | 4 + | ||
85 | tests/qemu-iotests/common.filter | 3 +- | ||
86 | 22 files changed, 1294 insertions(+), 316 deletions(-) | ||
87 | create mode 100644 tests/test-bdrv-drain.c | ||
45 | 88 | ||
46 | Xie Yongji (10): | ||
47 | block: Support passing NULL ops to blk_set_dev_ops() | ||
48 | block/export: Fix incorrect length passed to vu_queue_push() | ||
49 | block/export: Abstract out the logic of virtio-blk I/O process | ||
50 | linux-headers: Add vduse.h | ||
51 | libvduse: Add VDUSE (vDPA Device in Userspace) library | ||
52 | vduse-blk: Implement vduse-blk export | ||
53 | vduse-blk: Add vduse-blk resize support | ||
54 | libvduse: Add support for reconnecting | ||
55 | vduse-blk: Add serial option | ||
56 | vduse-blk: Add name option | ||
57 | |||
58 | qapi/block-export.json | 29 +- | ||
59 | docs/tools/qemu-storage-daemon.rst | 22 + | ||
60 | meson_options.txt | 4 + | ||
61 | block/export/vduse-blk.h | 20 + | ||
62 | block/export/virtio-blk-handler.h | 37 + | ||
63 | include/block/aio-wait.h | 2 + | ||
64 | include/block/block-io.h | 1 - | ||
65 | include/block/block_int-io.h | 2 +- | ||
66 | include/qemu/hbitmap.h | 15 +- | ||
67 | include/sysemu/block-backend-io.h | 1 - | ||
68 | linux-headers/linux/vduse.h | 306 ++++++ | ||
69 | subprojects/libvduse/include/atomic.h | 1 + | ||
70 | subprojects/libvduse/include/compiler.h | 1 + | ||
71 | subprojects/libvduse/libvduse.h | 247 +++++ | ||
72 | block/backup.c | 6 +- | ||
73 | block/block-backend.c | 12 +- | ||
74 | block/dirty-bitmap.c | 26 +- | ||
75 | block/export/export.c | 6 + | ||
76 | block/export/vduse-blk.c | 374 ++++++++ | ||
77 | block/export/vhost-user-blk-server.c | 263 +---- | ||
78 | block/export/virtio-blk-handler.c | 240 +++++ | ||
79 | block/gluster.c | 2 +- | ||
80 | block/io.c | 15 - | ||
81 | block/monitor/bitmap-qmp-cmds.c | 40 +- | ||
82 | block/nbd.c | 8 +- | ||
83 | block/rbd.c | 24 + | ||
84 | hw/block/virtio-blk.c | 1 - | ||
85 | hw/block/xen-block.c | 1 - | ||
86 | hw/ide/core.c | 1 - | ||
87 | hw/scsi/scsi-disk.c | 1 - | ||
88 | hw/scsi/scsi-generic.c | 1 - | ||
89 | storage-daemon/qemu-storage-daemon.c | 10 + | ||
90 | subprojects/libvduse/libvduse.c | 1375 +++++++++++++++++++++++++++ | ||
91 | util/aio-wait.c | 16 +- | ||
92 | util/hbitmap.c | 25 +- | ||
93 | MAINTAINERS | 9 + | ||
94 | block/export/meson.build | 7 +- | ||
95 | meson.build | 34 + | ||
96 | scripts/meson-buildoptions.sh | 7 + | ||
97 | scripts/update-linux-headers.sh | 2 +- | ||
98 | subprojects/libvduse/linux-headers/linux | 1 + | ||
99 | subprojects/libvduse/meson.build | 10 + | ||
100 | subprojects/libvduse/standard-headers/linux | 1 + | ||
101 | 43 files changed, 2852 insertions(+), 354 deletions(-) | ||
102 | create mode 100644 block/export/vduse-blk.h | ||
103 | create mode 100644 block/export/virtio-blk-handler.h | ||
104 | create mode 100644 linux-headers/linux/vduse.h | ||
105 | create mode 120000 subprojects/libvduse/include/atomic.h | ||
106 | create mode 120000 subprojects/libvduse/include/compiler.h | ||
107 | create mode 100644 subprojects/libvduse/libvduse.h | ||
108 | create mode 100644 block/export/vduse-blk.c | ||
109 | create mode 100644 block/export/virtio-blk-handler.c | ||
110 | create mode 100644 subprojects/libvduse/libvduse.c | ||
111 | create mode 120000 subprojects/libvduse/linux-headers/linux | ||
112 | create mode 100644 subprojects/libvduse/meson.build | ||
113 | create mode 120000 subprojects/libvduse/standard-headers/linux | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Commit 1f4ad7d fixed 'qemu-img info' for raw images that are currently | ||
2 | in use as a mirror target. It is not enough for image formats, though, | ||
3 | as these still unconditionally request BLK_PERM_CONSISTENT_READ. | ||
1 | 4 | ||
5 | As this permission is geared towards whether the guest-visible data is | ||
6 | consistent, and has no impact on whether the metadata is sane, and | ||
7 | 'qemu-img info' does not read guest-visible data (except for the raw | ||
8 | format), it makes sense to not require BLK_PERM_CONSISTENT_READ if there | ||
9 | is not going to be any guest I/O performed, regardless of image format. | ||
10 | |||
11 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
12 | --- | ||
13 | block.c | 6 +++++- | ||
14 | 1 file changed, 5 insertions(+), 1 deletion(-) | ||
15 | |||
16 | diff --git a/block.c b/block.c | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/block.c | ||
19 | +++ b/block.c | ||
20 | @@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c, | ||
21 | assert(role == &child_backing || role == &child_file); | ||
22 | |||
23 | if (!backing) { | ||
24 | + int flags = bdrv_reopen_get_flags(reopen_queue, bs); | ||
25 | + | ||
26 | /* Apart from the modifications below, the same permissions are | ||
27 | * forwarded and left alone as for filters */ | ||
28 | bdrv_filter_default_perms(bs, c, role, reopen_queue, perm, shared, | ||
29 | @@ -XXX,XX +XXX,XX @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c, | ||
30 | |||
31 | /* bs->file always needs to be consistent because of the metadata. We | ||
32 | * can never allow other users to resize or write to it. */ | ||
33 | - perm |= BLK_PERM_CONSISTENT_READ; | ||
34 | + if (!(flags & BDRV_O_NO_IO)) { | ||
35 | + perm |= BLK_PERM_CONSISTENT_READ; | ||
36 | + } | ||
37 | shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); | ||
38 | } else { | ||
39 | /* We want consistent read from backing files if the parent needs it. | ||
40 | -- | ||
41 | 2.13.6 | ||
42 | |||
43 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | From: John Snow <jsnow@redhat.com> | ||
1 | 2 | ||
3 | VPC has some difficulty creating geometries of particular size. | ||
4 | However, we can indeed force it to use a literal one, so let's | ||
5 | do that for the sake of test 197, which is testing some specific | ||
6 | offsets. | ||
7 | |||
8 | Signed-off-by: John Snow <jsnow@redhat.com> | ||
9 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
10 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
11 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
12 | Reviewed-by: Lukáš Doktor <ldoktor@redhat.com> | ||
13 | --- | ||
14 | tests/qemu-iotests/197 | 4 ++++ | ||
15 | tests/qemu-iotests/common.filter | 3 ++- | ||
16 | 2 files changed, 6 insertions(+), 1 deletion(-) | ||
17 | |||
18 | diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197 | ||
19 | index XXXXXXX..XXXXXXX 100755 | ||
20 | --- a/tests/qemu-iotests/197 | ||
21 | +++ b/tests/qemu-iotests/197 | ||
22 | @@ -XXX,XX +XXX,XX @@ echo '=== Copy-on-read ===' | ||
23 | echo | ||
24 | |||
25 | # Prep the images | ||
26 | +# VPC rounds image sizes to a specific geometry, force a specific size. | ||
27 | +if [ "$IMGFMT" = "vpc" ]; then | ||
28 | + IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size") | ||
29 | +fi | ||
30 | _make_test_img 4G | ||
31 | $QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io | ||
32 | IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \ | ||
33 | diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter | ||
34 | index XXXXXXX..XXXXXXX 100644 | ||
35 | --- a/tests/qemu-iotests/common.filter | ||
36 | +++ b/tests/qemu-iotests/common.filter | ||
37 | @@ -XXX,XX +XXX,XX @@ _filter_img_create() | ||
38 | -e "s# log_size=[0-9]\\+##g" \ | ||
39 | -e "s# refcount_bits=[0-9]\\+##g" \ | ||
40 | -e "s# key-secret=[a-zA-Z0-9]\\+##g" \ | ||
41 | - -e "s# iter-time=[0-9]\\+##g" | ||
42 | + -e "s# iter-time=[0-9]\\+##g" \ | ||
43 | + -e "s# force_size=\\(on\\|off\\)##g" | ||
44 | } | ||
45 | |||
46 | _filter_img_info() | ||
47 | -- | ||
48 | 2.13.6 | ||
49 | |||
50 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | This change separates bdrv_drain_invoke(), which calls the BlockDriver | ||
2 | drain callbacks, from bdrv_drain_recurse(). Instead, the function | ||
3 | performs its own recursion now. | ||
1 | 4 | ||
5 | One reason for this is that bdrv_drain_recurse() can be called multiple | ||
6 | times by bdrv_drain_all_begin(), but the callbacks may only be called | ||
7 | once. The separation is necessary to fix this bug. | ||
8 | |||
9 | The other reason is that we intend to go to a model where we call all | ||
10 | driver callbacks first, and only then start polling. This is not fully | ||
11 | achieved yet with this patch, as bdrv_drain_invoke() contains a | ||
12 | BDRV_POLL_WHILE() loop for the block driver callbacks, which can still | ||
13 | call callbacks for any unrelated event. It's a step in this direction | ||
14 | anyway. | ||
15 | |||
16 | Cc: qemu-stable@nongnu.org | ||
17 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
18 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
19 | --- | ||
20 | block/io.c | 14 +++++++++++--- | ||
21 | 1 file changed, 11 insertions(+), 3 deletions(-) | ||
22 | |||
23 | diff --git a/block/io.c b/block/io.c | ||
24 | index XXXXXXX..XXXXXXX 100644 | ||
25 | --- a/block/io.c | ||
26 | +++ b/block/io.c | ||
27 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque) | ||
28 | bdrv_wakeup(bs); | ||
29 | } | ||
30 | |||
31 | +/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */ | ||
32 | static void bdrv_drain_invoke(BlockDriverState *bs, bool begin) | ||
33 | { | ||
34 | + BdrvChild *child, *tmp; | ||
35 | BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin}; | ||
36 | |||
37 | if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) || | ||
38 | @@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin) | ||
39 | data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data); | ||
40 | bdrv_coroutine_enter(bs, data.co); | ||
41 | BDRV_POLL_WHILE(bs, !data.done); | ||
42 | + | ||
43 | + QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) { | ||
44 | + bdrv_drain_invoke(child->bs, begin); | ||
45 | + } | ||
46 | } | ||
47 | |||
48 | static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin) | ||
49 | @@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin) | ||
50 | BdrvChild *child, *tmp; | ||
51 | bool waited; | ||
52 | |||
53 | - /* Ensure any pending metadata writes are submitted to bs->file. */ | ||
54 | - bdrv_drain_invoke(bs, begin); | ||
55 | - | ||
56 | /* Wait for drained requests to finish */ | ||
57 | waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0); | ||
58 | |||
59 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs) | ||
60 | bdrv_parent_drained_begin(bs); | ||
61 | } | ||
62 | |||
63 | + bdrv_drain_invoke(bs, true); | ||
64 | bdrv_drain_recurse(bs, true); | ||
65 | } | ||
66 | |||
67 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs) | ||
68 | } | ||
69 | |||
70 | bdrv_parent_drained_end(bs); | ||
71 | + bdrv_drain_invoke(bs, false); | ||
72 | bdrv_drain_recurse(bs, false); | ||
73 | aio_enable_external(bdrv_get_aio_context(bs)); | ||
74 | } | ||
75 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void) | ||
76 | aio_context_acquire(aio_context); | ||
77 | for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { | ||
78 | if (aio_context == bdrv_get_aio_context(bs)) { | ||
79 | + /* FIXME Calling this multiple times is wrong */ | ||
80 | + bdrv_drain_invoke(bs, true); | ||
81 | waited |= bdrv_drain_recurse(bs, true); | ||
82 | } | ||
83 | } | ||
84 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void) | ||
85 | aio_context_acquire(aio_context); | ||
86 | aio_enable_external(aio_context); | ||
87 | bdrv_parent_drained_end(bs); | ||
88 | + bdrv_drain_invoke(bs, false); | ||
89 | bdrv_drain_recurse(bs, false); | ||
90 | aio_context_release(aio_context); | ||
91 | } | ||
92 | -- | ||
93 | 2.13.6 | ||
94 | |||
95 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | bdrv_drain_all_begin() used to call the .bdrv_co_drain_begin() driver | ||
2 | callback inside its polling loop. This means that how many times it got | ||
3 | called for each node depended on how long it had to poll the event loop. | ||
1 | 4 | ||
5 | This is obviously not right and results in nodes that stay drained even | ||
6 | after bdrv_drain_all_end(), which calls .bdrv_co_drain_end() once per | ||
7 | node. | ||
8 | |||
9 | Fix bdrv_drain_all_begin() to call the callback only once, too. | ||
10 | |||
11 | Cc: qemu-stable@nongnu.org | ||
12 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
13 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
14 | --- | ||
15 | block/io.c | 3 +-- | ||
16 | 1 file changed, 1 insertion(+), 2 deletions(-) | ||
17 | |||
18 | diff --git a/block/io.c b/block/io.c | ||
19 | index XXXXXXX..XXXXXXX 100644 | ||
20 | --- a/block/io.c | ||
21 | +++ b/block/io.c | ||
22 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void) | ||
23 | aio_context_acquire(aio_context); | ||
24 | bdrv_parent_drained_begin(bs); | ||
25 | aio_disable_external(aio_context); | ||
26 | + bdrv_drain_invoke(bs, true); | ||
27 | aio_context_release(aio_context); | ||
28 | |||
29 | if (!g_slist_find(aio_ctxs, aio_context)) { | ||
30 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void) | ||
31 | aio_context_acquire(aio_context); | ||
32 | for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { | ||
33 | if (aio_context == bdrv_get_aio_context(bs)) { | ||
34 | - /* FIXME Calling this multiple times is wrong */ | ||
35 | - bdrv_drain_invoke(bs, true); | ||
36 | waited |= bdrv_drain_recurse(bs, true); | ||
37 | } | ||
38 | } | ||
39 | -- | ||
40 | 2.13.6 | ||
41 | |||
42 | diff view generated by jsdifflib |
1 | From: Xie Yongji <xieyongji@bytedance.com> | 1 | This adds a test case that the BlockDriver callbacks for drain are |
---|---|---|---|
2 | called in bdrv_drain_all_begin/end(), and that both of them are called | ||
3 | exactly once. | ||
2 | 4 | ||
3 | VDUSE [1] is a linux framework that makes it possible to implement | 5 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
4 | software-emulated vDPA devices in userspace. This adds a library | 6 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
5 | as a subproject to help implementing VDUSE backends in QEMU. | 7 | Reviewed-by: Eric Blake <eblake@redhat.com> |
8 | --- | ||
9 | tests/test-bdrv-drain.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++ | ||
10 | tests/Makefile.include | 2 + | ||
11 | 2 files changed, 139 insertions(+) | ||
12 | create mode 100644 tests/test-bdrv-drain.c | ||
6 | 13 | ||
7 | [1] https://www.kernel.org/doc/html/latest/userspace-api/vduse.html | 14 | diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c |
8 | |||
9 | Signed-off-by: Xie Yongji <xieyongji@bytedance.com> | ||
10 | Message-Id: <20220523084611.91-6-xieyongji@bytedance.com> | ||
11 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
13 | --- | ||
14 | meson_options.txt | 2 + | ||
15 | subprojects/libvduse/include/atomic.h | 1 + | ||
16 | subprojects/libvduse/include/compiler.h | 1 + | ||
17 | subprojects/libvduse/libvduse.h | 235 ++++ | ||
18 | subprojects/libvduse/libvduse.c | 1150 +++++++++++++++++++ | ||
19 | MAINTAINERS | 5 + | ||
20 | meson.build | 15 + | ||
21 | scripts/meson-buildoptions.sh | 3 + | ||
22 | subprojects/libvduse/linux-headers/linux | 1 + | ||
23 | subprojects/libvduse/meson.build | 10 + | ||
24 | subprojects/libvduse/standard-headers/linux | 1 + | ||
25 | 11 files changed, 1424 insertions(+) | ||
26 | create mode 120000 subprojects/libvduse/include/atomic.h | ||
27 | create mode 120000 subprojects/libvduse/include/compiler.h | ||
28 | create mode 100644 subprojects/libvduse/libvduse.h | ||
29 | create mode 100644 subprojects/libvduse/libvduse.c | ||
30 | create mode 120000 subprojects/libvduse/linux-headers/linux | ||
31 | create mode 100644 subprojects/libvduse/meson.build | ||
32 | create mode 120000 subprojects/libvduse/standard-headers/linux | ||
33 | |||
34 | diff --git a/meson_options.txt b/meson_options.txt | ||
35 | index XXXXXXX..XXXXXXX 100644 | ||
36 | --- a/meson_options.txt | ||
37 | +++ b/meson_options.txt | ||
38 | @@ -XXX,XX +XXX,XX @@ option('virtfs', type: 'feature', value: 'auto', | ||
39 | description: 'virtio-9p support') | ||
40 | option('virtiofsd', type: 'feature', value: 'auto', | ||
41 | description: 'build virtiofs daemon (virtiofsd)') | ||
42 | +option('libvduse', type: 'feature', value: 'auto', | ||
43 | + description: 'build VDUSE Library') | ||
44 | |||
45 | option('capstone', type: 'feature', value: 'auto', | ||
46 | description: 'Whether and how to find the capstone library') | ||
47 | diff --git a/subprojects/libvduse/include/atomic.h b/subprojects/libvduse/include/atomic.h | ||
48 | new file mode 120000 | ||
49 | index XXXXXXX..XXXXXXX | ||
50 | --- /dev/null | ||
51 | +++ b/subprojects/libvduse/include/atomic.h | ||
52 | @@ -0,0 +1 @@ | ||
53 | +../../../include/qemu/atomic.h | ||
54 | \ No newline at end of file | ||
55 | diff --git a/subprojects/libvduse/include/compiler.h b/subprojects/libvduse/include/compiler.h | ||
56 | new file mode 120000 | ||
57 | index XXXXXXX..XXXXXXX | ||
58 | --- /dev/null | ||
59 | +++ b/subprojects/libvduse/include/compiler.h | ||
60 | @@ -0,0 +1 @@ | ||
61 | +../../../include/qemu/compiler.h | ||
62 | \ No newline at end of file | ||
63 | diff --git a/subprojects/libvduse/libvduse.h b/subprojects/libvduse/libvduse.h | ||
64 | new file mode 100644 | 15 | new file mode 100644 |
65 | index XXXXXXX..XXXXXXX | 16 | index XXXXXXX..XXXXXXX |
66 | --- /dev/null | 17 | --- /dev/null |
67 | +++ b/subprojects/libvduse/libvduse.h | 18 | +++ b/tests/test-bdrv-drain.c |
68 | @@ -XXX,XX +XXX,XX @@ | 19 | @@ -XXX,XX +XXX,XX @@ |
69 | +/* | 20 | +/* |
70 | + * VDUSE (vDPA Device in Userspace) library | 21 | + * Block node draining tests |
71 | + * | 22 | + * |
72 | + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved. | 23 | + * Copyright (c) 2017 Kevin Wolf <kwolf@redhat.com> |
73 | + * | 24 | + * |
74 | + * Author: | 25 | + * Permission is hereby granted, free of charge, to any person obtaining a copy |
75 | + * Xie Yongji <xieyongji@bytedance.com> | 26 | + * of this software and associated documentation files (the "Software"), to deal |
27 | + * in the Software without restriction, including without limitation the rights | ||
28 | + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
29 | + * copies of the Software, and to permit persons to whom the Software is | ||
30 | + * furnished to do so, subject to the following conditions: | ||
76 | + * | 31 | + * |
77 | + * This work is licensed under the terms of the GNU GPL, version 2 or | 32 | + * The above copyright notice and this permission notice shall be included in |
78 | + * later. See the COPYING file in the top-level directory. | 33 | + * all copies or substantial portions of the Software. |
34 | + * | ||
35 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
36 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
37 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
38 | + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
39 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
40 | + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
41 | + * THE SOFTWARE. | ||
79 | + */ | 42 | + */ |
80 | + | 43 | + |
81 | +#ifndef LIBVDUSE_H | 44 | +#include "qemu/osdep.h" |
82 | +#define LIBVDUSE_H | 45 | +#include "block/block.h" |
46 | +#include "sysemu/block-backend.h" | ||
47 | +#include "qapi/error.h" | ||
83 | + | 48 | + |
84 | +#include <stdint.h> | 49 | +typedef struct BDRVTestState { |
85 | +#include <sys/uio.h> | 50 | + int drain_count; |
51 | +} BDRVTestState; | ||
86 | + | 52 | + |
87 | +#define VIRTQUEUE_MAX_SIZE 1024 | 53 | +static void coroutine_fn bdrv_test_co_drain_begin(BlockDriverState *bs) |
88 | + | ||
89 | +/* VDUSE device structure */ | ||
90 | +typedef struct VduseDev VduseDev; | ||
91 | + | ||
92 | +/* Virtqueue structure */ | ||
93 | +typedef struct VduseVirtq VduseVirtq; | ||
94 | + | ||
95 | +/* Some operation of VDUSE backend */ | ||
96 | +typedef struct VduseOps { | ||
97 | + /* Called when virtqueue can be processed */ | ||
98 | + void (*enable_queue)(VduseDev *dev, VduseVirtq *vq); | ||
99 | + /* Called when virtqueue processing should be stopped */ | ||
100 | + void (*disable_queue)(VduseDev *dev, VduseVirtq *vq); | ||
101 | +} VduseOps; | ||
102 | + | ||
103 | +/* Describing elements of the I/O buffer */ | ||
104 | +typedef struct VduseVirtqElement { | ||
105 | + /* Descriptor table index */ | ||
106 | + unsigned int index; | ||
107 | + /* Number of physically-contiguous device-readable descriptors */ | ||
108 | + unsigned int out_num; | ||
109 | + /* Number of physically-contiguous device-writable descriptors */ | ||
110 | + unsigned int in_num; | ||
111 | + /* Array to store physically-contiguous device-writable descriptors */ | ||
112 | + struct iovec *in_sg; | ||
113 | + /* Array to store physically-contiguous device-readable descriptors */ | ||
114 | + struct iovec *out_sg; | ||
115 | +} VduseVirtqElement; | ||
116 | + | ||
117 | + | ||
118 | +/** | ||
119 | + * vduse_get_virtio_features: | ||
120 | + * | ||
121 | + * Get supported virtio features | ||
122 | + * | ||
123 | + * Returns: supported feature bits | ||
124 | + */ | ||
125 | +uint64_t vduse_get_virtio_features(void); | ||
126 | + | ||
127 | +/** | ||
128 | + * vduse_queue_get_dev: | ||
129 | + * @vq: specified virtqueue | ||
130 | + * | ||
131 | + * Get corresponding VDUSE device from the virtqueue. | ||
132 | + * | ||
133 | + * Returns: a pointer to VDUSE device on success, NULL on failure. | ||
134 | + */ | ||
135 | +VduseDev *vduse_queue_get_dev(VduseVirtq *vq); | ||
136 | + | ||
137 | +/** | ||
138 | + * vduse_queue_get_fd: | ||
139 | + * @vq: specified virtqueue | ||
140 | + * | ||
141 | + * Get the kick fd for the virtqueue. | ||
142 | + * | ||
143 | + * Returns: file descriptor on success, -1 on failure. | ||
144 | + */ | ||
145 | +int vduse_queue_get_fd(VduseVirtq *vq); | ||
146 | + | ||
147 | +/** | ||
148 | + * vduse_queue_pop: | ||
149 | + * @vq: specified virtqueue | ||
150 | + * @sz: the size of struct to return (must be >= VduseVirtqElement) | ||
151 | + * | ||
152 | + * Pop an element from virtqueue available ring. | ||
153 | + * | ||
154 | + * Returns: a pointer to a structure containing VduseVirtqElement on success, | ||
155 | + * NULL on failure. | ||
156 | + */ | ||
157 | +void *vduse_queue_pop(VduseVirtq *vq, size_t sz); | ||
158 | + | ||
159 | +/** | ||
160 | + * vduse_queue_push: | ||
161 | + * @vq: specified virtqueue | ||
162 | + * @elem: pointer to VduseVirtqElement returned by vduse_queue_pop() | ||
163 | + * @len: length in bytes to write | ||
164 | + * | ||
165 | + * Push an element to virtqueue used ring. | ||
166 | + */ | ||
167 | +void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem, | ||
168 | + unsigned int len); | ||
169 | +/** | ||
170 | + * vduse_queue_notify: | ||
171 | + * @vq: specified virtqueue | ||
172 | + * | ||
173 | + * Request to notify the queue. | ||
174 | + */ | ||
175 | +void vduse_queue_notify(VduseVirtq *vq); | ||
176 | + | ||
177 | +/** | ||
178 | + * vduse_dev_get_priv: | ||
179 | + * @dev: VDUSE device | ||
180 | + * | ||
181 | + * Get the private pointer passed to vduse_dev_create(). | ||
182 | + * | ||
183 | + * Returns: private pointer on success, NULL on failure. | ||
184 | + */ | ||
185 | +void *vduse_dev_get_priv(VduseDev *dev); | ||
186 | + | ||
187 | +/** | ||
188 | + * vduse_dev_get_queue: | ||
189 | + * @dev: VDUSE device | ||
190 | + * @index: virtqueue index | ||
191 | + * | ||
192 | + * Get the specified virtqueue. | ||
193 | + * | ||
194 | + * Returns: a pointer to the virtqueue on success, NULL on failure. | ||
195 | + */ | ||
196 | +VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index); | ||
197 | + | ||
198 | +/** | ||
199 | + * vduse_dev_get_fd: | ||
200 | + * @dev: VDUSE device | ||
201 | + * | ||
202 | + * Get the control message fd for the VDUSE device. | ||
203 | + * | ||
204 | + * Returns: file descriptor on success, -1 on failure. | ||
205 | + */ | ||
206 | +int vduse_dev_get_fd(VduseDev *dev); | ||
207 | + | ||
208 | +/** | ||
209 | + * vduse_dev_handler: | ||
210 | + * @dev: VDUSE device | ||
211 | + * | ||
212 | + * Used to process the control message. | ||
213 | + * | ||
214 | + * Returns: file descriptor on success, -errno on failure. | ||
215 | + */ | ||
216 | +int vduse_dev_handler(VduseDev *dev); | ||
217 | + | ||
218 | +/** | ||
219 | + * vduse_dev_update_config: | ||
220 | + * @dev: VDUSE device | ||
221 | + * @size: the size to write to configuration space | ||
222 | + * @offset: the offset from the beginning of configuration space | ||
223 | + * @buffer: the buffer used to write from | ||
224 | + * | ||
225 | + * Update device configuration space and inject a config interrupt. | ||
226 | + * | ||
227 | + * Returns: 0 on success, -errno on failure. | ||
228 | + */ | ||
229 | +int vduse_dev_update_config(VduseDev *dev, uint32_t size, | ||
230 | + uint32_t offset, char *buffer); | ||
231 | + | ||
232 | +/** | ||
233 | + * vduse_dev_setup_queue: | ||
234 | + * @dev: VDUSE device | ||
235 | + * @index: virtqueue index | ||
236 | + * @max_size: the max size of virtqueue | ||
237 | + * | ||
238 | + * Setup the specified virtqueue. | ||
239 | + * | ||
240 | + * Returns: 0 on success, -errno on failure. | ||
241 | + */ | ||
242 | +int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size); | ||
243 | + | ||
244 | +/** | ||
245 | + * vduse_dev_create_by_fd: | ||
246 | + * @fd: passed file descriptor | ||
247 | + * @num_queues: the number of virtqueues | ||
248 | + * @ops: the operation of VDUSE backend | ||
249 | + * @priv: private pointer | ||
250 | + * | ||
251 | + * Create VDUSE device from a passed file descriptor. | ||
252 | + * | ||
253 | + * Returns: pointer to VDUSE device on success, NULL on failure. | ||
254 | + */ | ||
255 | +VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues, | ||
256 | + const VduseOps *ops, void *priv); | ||
257 | + | ||
258 | +/** | ||
259 | + * vduse_dev_create_by_name: | ||
260 | + * @name: VDUSE device name | ||
261 | + * @num_queues: the number of virtqueues | ||
262 | + * @ops: the operation of VDUSE backend | ||
263 | + * @priv: private pointer | ||
264 | + * | ||
265 | + * Create VDUSE device on /dev/vduse/$NAME. | ||
266 | + * | ||
267 | + * Returns: pointer to VDUSE device on success, NULL on failure. | ||
268 | + */ | ||
269 | +VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues, | ||
270 | + const VduseOps *ops, void *priv); | ||
271 | + | ||
272 | +/** | ||
273 | + * vduse_dev_create: | ||
274 | + * @name: VDUSE device name | ||
275 | + * @device_id: virtio device id | ||
276 | + * @vendor_id: virtio vendor id | ||
277 | + * @features: virtio features | ||
278 | + * @num_queues: the number of virtqueues | ||
279 | + * @config_size: the size of the configuration space | ||
280 | + * @config: the buffer of the configuration space | ||
281 | + * @ops: the operation of VDUSE backend | ||
282 | + * @priv: private pointer | ||
283 | + * | ||
284 | + * Create VDUSE device. | ||
285 | + * | ||
286 | + * Returns: pointer to VDUSE device on success, NULL on failure. | ||
287 | + */ | ||
288 | +VduseDev *vduse_dev_create(const char *name, uint32_t device_id, | ||
289 | + uint32_t vendor_id, uint64_t features, | ||
290 | + uint16_t num_queues, uint32_t config_size, | ||
291 | + char *config, const VduseOps *ops, void *priv); | ||
292 | + | ||
293 | +/** | ||
294 | + * vduse_dev_destroy: | ||
295 | + * @dev: VDUSE device | ||
296 | + * | ||
297 | + * Destroy the VDUSE device. | ||
298 | + * | ||
299 | + * Returns: 0 on success, -errno on failure. | ||
300 | + */ | ||
301 | +int vduse_dev_destroy(VduseDev *dev); | ||
302 | + | ||
303 | +#endif | ||
304 | diff --git a/subprojects/libvduse/libvduse.c b/subprojects/libvduse/libvduse.c | ||
305 | new file mode 100644 | ||
306 | index XXXXXXX..XXXXXXX | ||
307 | --- /dev/null | ||
308 | +++ b/subprojects/libvduse/libvduse.c | ||
309 | @@ -XXX,XX +XXX,XX @@ | ||
310 | +/* | ||
311 | + * VDUSE (vDPA Device in Userspace) library | ||
312 | + * | ||
313 | + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved. | ||
314 | + * Portions of codes and concepts borrowed from libvhost-user.c, so: | ||
315 | + * Copyright IBM, Corp. 2007 | ||
316 | + * Copyright (c) 2016 Red Hat, Inc. | ||
317 | + * | ||
318 | + * Author: | ||
319 | + * Xie Yongji <xieyongji@bytedance.com> | ||
320 | + * Anthony Liguori <aliguori@us.ibm.com> | ||
321 | + * Marc-André Lureau <mlureau@redhat.com> | ||
322 | + * Victor Kaplansky <victork@redhat.com> | ||
323 | + * | ||
324 | + * This work is licensed under the terms of the GNU GPL, version 2 or | ||
325 | + * later. See the COPYING file in the top-level directory. | ||
326 | + */ | ||
327 | + | ||
328 | +#include <stdlib.h> | ||
329 | +#include <stdio.h> | ||
330 | +#include <stdbool.h> | ||
331 | +#include <stddef.h> | ||
332 | +#include <errno.h> | ||
333 | +#include <string.h> | ||
334 | +#include <assert.h> | ||
335 | +#include <endian.h> | ||
336 | +#include <unistd.h> | ||
337 | +#include <limits.h> | ||
338 | +#include <fcntl.h> | ||
339 | +#include <inttypes.h> | ||
340 | + | ||
341 | +#include <sys/ioctl.h> | ||
342 | +#include <sys/eventfd.h> | ||
343 | +#include <sys/mman.h> | ||
344 | + | ||
345 | +#include "include/atomic.h" | ||
346 | +#include "linux-headers/linux/virtio_ring.h" | ||
347 | +#include "linux-headers/linux/virtio_config.h" | ||
348 | +#include "linux-headers/linux/vduse.h" | ||
349 | +#include "libvduse.h" | ||
350 | + | ||
351 | +#define VDUSE_VQ_ALIGN 4096 | ||
352 | +#define MAX_IOVA_REGIONS 256 | ||
353 | + | ||
354 | +/* Round number down to multiple */ | ||
355 | +#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) | ||
356 | + | ||
357 | +/* Round number up to multiple */ | ||
358 | +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) | ||
359 | + | ||
360 | +#ifndef unlikely | ||
361 | +#define unlikely(x) __builtin_expect(!!(x), 0) | ||
362 | +#endif | ||
363 | + | ||
364 | +typedef struct VduseRing { | ||
365 | + unsigned int num; | ||
366 | + uint64_t desc_addr; | ||
367 | + uint64_t avail_addr; | ||
368 | + uint64_t used_addr; | ||
369 | + struct vring_desc *desc; | ||
370 | + struct vring_avail *avail; | ||
371 | + struct vring_used *used; | ||
372 | +} VduseRing; | ||
373 | + | ||
374 | +struct VduseVirtq { | ||
375 | + VduseRing vring; | ||
376 | + uint16_t last_avail_idx; | ||
377 | + uint16_t shadow_avail_idx; | ||
378 | + uint16_t used_idx; | ||
379 | + uint16_t signalled_used; | ||
380 | + bool signalled_used_valid; | ||
381 | + int index; | ||
382 | + int inuse; | ||
383 | + bool ready; | ||
384 | + int fd; | ||
385 | + VduseDev *dev; | ||
386 | +}; | ||
387 | + | ||
388 | +typedef struct VduseIovaRegion { | ||
389 | + uint64_t iova; | ||
390 | + uint64_t size; | ||
391 | + uint64_t mmap_offset; | ||
392 | + uint64_t mmap_addr; | ||
393 | +} VduseIovaRegion; | ||
394 | + | ||
395 | +struct VduseDev { | ||
396 | + VduseVirtq *vqs; | ||
397 | + VduseIovaRegion regions[MAX_IOVA_REGIONS]; | ||
398 | + int num_regions; | ||
399 | + char *name; | ||
400 | + uint32_t device_id; | ||
401 | + uint32_t vendor_id; | ||
402 | + uint16_t num_queues; | ||
403 | + uint16_t queue_size; | ||
404 | + uint64_t features; | ||
405 | + const VduseOps *ops; | ||
406 | + int fd; | ||
407 | + int ctrl_fd; | ||
408 | + void *priv; | ||
409 | +}; | ||
410 | + | ||
411 | +static inline bool has_feature(uint64_t features, unsigned int fbit) | ||
412 | +{ | 54 | +{ |
413 | + assert(fbit < 64); | 55 | + BDRVTestState *s = bs->opaque; |
414 | + return !!(features & (1ULL << fbit)); | 56 | + s->drain_count++; |
415 | +} | 57 | +} |
416 | + | 58 | + |
417 | +static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit) | 59 | +static void coroutine_fn bdrv_test_co_drain_end(BlockDriverState *bs) |
418 | +{ | 60 | +{ |
419 | + return has_feature(dev->features, fbit); | 61 | + BDRVTestState *s = bs->opaque; |
62 | + s->drain_count--; | ||
420 | +} | 63 | +} |
421 | + | 64 | + |
422 | +uint64_t vduse_get_virtio_features(void) | 65 | +static void bdrv_test_close(BlockDriverState *bs) |
423 | +{ | 66 | +{ |
424 | + return (1ULL << VIRTIO_F_IOMMU_PLATFORM) | | 67 | + BDRVTestState *s = bs->opaque; |
425 | + (1ULL << VIRTIO_F_VERSION_1) | | 68 | + g_assert_cmpint(s->drain_count, >, 0); |
426 | + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | | ||
427 | + (1ULL << VIRTIO_RING_F_EVENT_IDX) | | ||
428 | + (1ULL << VIRTIO_RING_F_INDIRECT_DESC); | ||
429 | +} | 69 | +} |
430 | + | 70 | + |
431 | +VduseDev *vduse_queue_get_dev(VduseVirtq *vq) | 71 | +static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs, |
72 | + uint64_t offset, uint64_t bytes, | ||
73 | + QEMUIOVector *qiov, int flags) | ||
432 | +{ | 74 | +{ |
433 | + return vq->dev; | 75 | + /* We want this request to stay until the polling loop in drain waits for |
434 | +} | 76 | + * it to complete. We need to sleep a while as bdrv_drain_invoke() comes |
435 | + | 77 | + * first and polls its result, too, but it shouldn't accidentally complete |
436 | +int vduse_queue_get_fd(VduseVirtq *vq) | 78 | + * this request yet. */ |
437 | +{ | 79 | + qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000); |
438 | + return vq->fd; | ||
439 | +} | ||
440 | + | ||
441 | +void *vduse_dev_get_priv(VduseDev *dev) | ||
442 | +{ | ||
443 | + return dev->priv; | ||
444 | +} | ||
445 | + | ||
446 | +VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index) | ||
447 | +{ | ||
448 | + return &dev->vqs[index]; | ||
449 | +} | ||
450 | + | ||
451 | +int vduse_dev_get_fd(VduseDev *dev) | ||
452 | +{ | ||
453 | + return dev->fd; | ||
454 | +} | ||
455 | + | ||
456 | +static int vduse_inject_irq(VduseDev *dev, int index) | ||
457 | +{ | ||
458 | + return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index); | ||
459 | +} | ||
460 | + | ||
461 | +static void vduse_iova_remove_region(VduseDev *dev, uint64_t start, | ||
462 | + uint64_t last) | ||
463 | +{ | ||
464 | + int i; | ||
465 | + | ||
466 | + if (last == start) { | ||
467 | + return; | ||
468 | + } | ||
469 | + | ||
470 | + for (i = 0; i < MAX_IOVA_REGIONS; i++) { | ||
471 | + if (!dev->regions[i].mmap_addr) { | ||
472 | + continue; | ||
473 | + } | ||
474 | + | ||
475 | + if (start <= dev->regions[i].iova && | ||
476 | + last >= (dev->regions[i].iova + dev->regions[i].size - 1)) { | ||
477 | + munmap((void *)(uintptr_t)dev->regions[i].mmap_addr, | ||
478 | + dev->regions[i].mmap_offset + dev->regions[i].size); | ||
479 | + dev->regions[i].mmap_addr = 0; | ||
480 | + dev->num_regions--; | ||
481 | + } | ||
482 | + } | ||
483 | +} | ||
484 | + | ||
485 | +static int vduse_iova_add_region(VduseDev *dev, int fd, | ||
486 | + uint64_t offset, uint64_t start, | ||
487 | + uint64_t last, int prot) | ||
488 | +{ | ||
489 | + int i; | ||
490 | + uint64_t size = last - start + 1; | ||
491 | + void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0); | ||
492 | + | ||
493 | + if (mmap_addr == MAP_FAILED) { | ||
494 | + close(fd); | ||
495 | + return -EINVAL; | ||
496 | + } | ||
497 | + | ||
498 | + for (i = 0; i < MAX_IOVA_REGIONS; i++) { | ||
499 | + if (!dev->regions[i].mmap_addr) { | ||
500 | + dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr; | ||
501 | + dev->regions[i].mmap_offset = offset; | ||
502 | + dev->regions[i].iova = start; | ||
503 | + dev->regions[i].size = size; | ||
504 | + dev->num_regions++; | ||
505 | + break; | ||
506 | + } | ||
507 | + } | ||
508 | + assert(i < MAX_IOVA_REGIONS); | ||
509 | + close(fd); | ||
510 | + | 80 | + |
511 | + return 0; | 81 | + return 0; |
512 | +} | 82 | +} |
513 | + | 83 | + |
514 | +static int perm_to_prot(uint8_t perm) | 84 | +static BlockDriver bdrv_test = { |
85 | + .format_name = "test", | ||
86 | + .instance_size = sizeof(BDRVTestState), | ||
87 | + | ||
88 | + .bdrv_close = bdrv_test_close, | ||
89 | + .bdrv_co_preadv = bdrv_test_co_preadv, | ||
90 | + | ||
91 | + .bdrv_co_drain_begin = bdrv_test_co_drain_begin, | ||
92 | + .bdrv_co_drain_end = bdrv_test_co_drain_end, | ||
93 | +}; | ||
94 | + | ||
95 | +static void aio_ret_cb(void *opaque, int ret) | ||
515 | +{ | 96 | +{ |
516 | + int prot = 0; | 97 | + int *aio_ret = opaque; |
517 | + | 98 | + *aio_ret = ret; |
518 | + switch (perm) { | ||
519 | + case VDUSE_ACCESS_WO: | ||
520 | + prot |= PROT_WRITE; | ||
521 | + break; | ||
522 | + case VDUSE_ACCESS_RO: | ||
523 | + prot |= PROT_READ; | ||
524 | + break; | ||
525 | + case VDUSE_ACCESS_RW: | ||
526 | + prot |= PROT_READ | PROT_WRITE; | ||
527 | + break; | ||
528 | + default: | ||
529 | + break; | ||
530 | + } | ||
531 | + | ||
532 | + return prot; | ||
533 | +} | 99 | +} |
534 | + | 100 | + |
535 | +static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova) | 101 | +static void test_drv_cb_drain_all(void) |
536 | +{ | 102 | +{ |
537 | + int i, ret; | 103 | + BlockBackend *blk; |
538 | + struct vduse_iotlb_entry entry; | 104 | + BlockDriverState *bs; |
105 | + BDRVTestState *s; | ||
106 | + BlockAIOCB *acb; | ||
107 | + int aio_ret; | ||
539 | + | 108 | + |
540 | + for (i = 0; i < MAX_IOVA_REGIONS; i++) { | 109 | + QEMUIOVector qiov; |
541 | + VduseIovaRegion *r = &dev->regions[i]; | 110 | + struct iovec iov = { |
111 | + .iov_base = NULL, | ||
112 | + .iov_len = 0, | ||
113 | + }; | ||
114 | + qemu_iovec_init_external(&qiov, &iov, 1); | ||
542 | + | 115 | + |
543 | + if (!r->mmap_addr) { | 116 | + blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); |
544 | + continue; | 117 | + bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR, |
545 | + } | 118 | + &error_abort); |
119 | + s = bs->opaque; | ||
120 | + blk_insert_bs(blk, bs, &error_abort); | ||
546 | + | 121 | + |
547 | + if ((iova >= r->iova) && (iova < (r->iova + r->size))) { | 122 | + /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */ |
548 | + if ((iova + *plen) > (r->iova + r->size)) { | 123 | + g_assert_cmpint(s->drain_count, ==, 0); |
549 | + *plen = r->iova + r->size - iova; | 124 | + bdrv_drain_all_begin(); |
550 | + } | 125 | + g_assert_cmpint(s->drain_count, ==, 1); |
551 | + return (void *)(uintptr_t)(iova - r->iova + | 126 | + bdrv_drain_all_end(); |
552 | + r->mmap_addr + r->mmap_offset); | 127 | + g_assert_cmpint(s->drain_count, ==, 0); |
553 | + } | ||
554 | + } | ||
555 | + | 128 | + |
556 | + entry.start = iova; | 129 | + /* Now do the same while a request is pending */ |
557 | + entry.last = iova + 1; | 130 | + aio_ret = -EINPROGRESS; |
558 | + ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry); | 131 | + acb = blk_aio_preadv(blk, 0, &qiov, 0, aio_ret_cb, &aio_ret); |
559 | + if (ret < 0) { | 132 | + g_assert(acb != NULL); |
560 | + return NULL; | 133 | + g_assert_cmpint(aio_ret, ==, -EINPROGRESS); |
561 | + } | ||
562 | + | 134 | + |
563 | + if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start, | 135 | + g_assert_cmpint(s->drain_count, ==, 0); |
564 | + entry.last, perm_to_prot(entry.perm))) { | 136 | + bdrv_drain_all_begin(); |
565 | + return iova_to_va(dev, plen, iova); | 137 | + g_assert_cmpint(aio_ret, ==, 0); |
566 | + } | 138 | + g_assert_cmpint(s->drain_count, ==, 1); |
139 | + bdrv_drain_all_end(); | ||
140 | + g_assert_cmpint(s->drain_count, ==, 0); | ||
567 | + | 141 | + |
568 | + return NULL; | 142 | + bdrv_unref(bs); |
143 | + blk_unref(blk); | ||
569 | +} | 144 | +} |
570 | + | 145 | + |
571 | +static inline uint16_t vring_avail_flags(VduseVirtq *vq) | 146 | +int main(int argc, char **argv) |
572 | +{ | 147 | +{ |
573 | + return le16toh(vq->vring.avail->flags); | 148 | + bdrv_init(); |
149 | + qemu_init_main_loop(&error_abort); | ||
150 | + | ||
151 | + g_test_init(&argc, &argv, NULL); | ||
152 | + | ||
153 | + g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all); | ||
154 | + | ||
155 | + return g_test_run(); | ||
574 | +} | 156 | +} |
575 | + | 157 | diff --git a/tests/Makefile.include b/tests/Makefile.include |
576 | +static inline uint16_t vring_avail_idx(VduseVirtq *vq) | ||
577 | +{ | ||
578 | + vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); | ||
579 | + | ||
580 | + return vq->shadow_avail_idx; | ||
581 | +} | ||
582 | + | ||
583 | +static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i) | ||
584 | +{ | ||
585 | + return le16toh(vq->vring.avail->ring[i]); | ||
586 | +} | ||
587 | + | ||
588 | +static inline uint16_t vring_get_used_event(VduseVirtq *vq) | ||
589 | +{ | ||
590 | + return vring_avail_ring(vq, vq->vring.num); | ||
591 | +} | ||
592 | + | ||
593 | +static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx, | ||
594 | + unsigned int *head) | ||
595 | +{ | ||
596 | + /* | ||
597 | + * Grab the next descriptor number they're advertising, and increment | ||
598 | + * the index we've seen. | ||
599 | + */ | ||
600 | + *head = vring_avail_ring(vq, idx % vq->vring.num); | ||
601 | + | ||
602 | + /* If their number is silly, that's a fatal mistake. */ | ||
603 | + if (*head >= vq->vring.num) { | ||
604 | + fprintf(stderr, "Guest says index %u is available\n", *head); | ||
605 | + return false; | ||
606 | + } | ||
607 | + | ||
608 | + return true; | ||
609 | +} | ||
610 | + | ||
611 | +static int | ||
612 | +vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc, | ||
613 | + uint64_t addr, size_t len) | ||
614 | +{ | ||
615 | + struct vring_desc *ori_desc; | ||
616 | + uint64_t read_len; | ||
617 | + | ||
618 | + if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) { | ||
619 | + return -1; | ||
620 | + } | ||
621 | + | ||
622 | + if (len == 0) { | ||
623 | + return -1; | ||
624 | + } | ||
625 | + | ||
626 | + while (len) { | ||
627 | + read_len = len; | ||
628 | + ori_desc = iova_to_va(dev, &read_len, addr); | ||
629 | + if (!ori_desc) { | ||
630 | + return -1; | ||
631 | + } | ||
632 | + | ||
633 | + memcpy(desc, ori_desc, read_len); | ||
634 | + len -= read_len; | ||
635 | + addr += read_len; | ||
636 | + desc += read_len; | ||
637 | + } | ||
638 | + | ||
639 | + return 0; | ||
640 | +} | ||
641 | + | ||
642 | +enum { | ||
643 | + VIRTQUEUE_READ_DESC_ERROR = -1, | ||
644 | + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ | ||
645 | + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ | ||
646 | +}; | ||
647 | + | ||
648 | +static int vduse_queue_read_next_desc(struct vring_desc *desc, int i, | ||
649 | + unsigned int max, unsigned int *next) | ||
650 | +{ | ||
651 | + /* If this descriptor says it doesn't chain, we're done. */ | ||
652 | + if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) { | ||
653 | + return VIRTQUEUE_READ_DESC_DONE; | ||
654 | + } | ||
655 | + | ||
656 | + /* Check they're not leading us off end of descriptors. */ | ||
657 | + *next = desc[i].next; | ||
658 | + /* Make sure compiler knows to grab that: we don't want it changing! */ | ||
659 | + smp_wmb(); | ||
660 | + | ||
661 | + if (*next >= max) { | ||
662 | + fprintf(stderr, "Desc next is %u\n", *next); | ||
663 | + return VIRTQUEUE_READ_DESC_ERROR; | ||
664 | + } | ||
665 | + | ||
666 | + return VIRTQUEUE_READ_DESC_MORE; | ||
667 | +} | ||
668 | + | ||
669 | +/* | ||
670 | + * Fetch avail_idx from VQ memory only when we really need to know if | ||
671 | + * guest has added some buffers. | ||
672 | + */ | ||
673 | +static bool vduse_queue_empty(VduseVirtq *vq) | ||
674 | +{ | ||
675 | + if (unlikely(!vq->vring.avail)) { | ||
676 | + return true; | ||
677 | + } | ||
678 | + | ||
679 | + if (vq->shadow_avail_idx != vq->last_avail_idx) { | ||
680 | + return false; | ||
681 | + } | ||
682 | + | ||
683 | + return vring_avail_idx(vq) == vq->last_avail_idx; | ||
684 | +} | ||
685 | + | ||
686 | +static bool vduse_queue_should_notify(VduseVirtq *vq) | ||
687 | +{ | ||
688 | + VduseDev *dev = vq->dev; | ||
689 | + uint16_t old, new; | ||
690 | + bool v; | ||
691 | + | ||
692 | + /* We need to expose used array entries before checking used event. */ | ||
693 | + smp_mb(); | ||
694 | + | ||
695 | + /* Always notify when queue is empty (when feature acknowledge) */ | ||
696 | + if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && | ||
697 | + !vq->inuse && vduse_queue_empty(vq)) { | ||
698 | + return true; | ||
699 | + } | ||
700 | + | ||
701 | + if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { | ||
702 | + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); | ||
703 | + } | ||
704 | + | ||
705 | + v = vq->signalled_used_valid; | ||
706 | + vq->signalled_used_valid = true; | ||
707 | + old = vq->signalled_used; | ||
708 | + new = vq->signalled_used = vq->used_idx; | ||
709 | + return !v || vring_need_event(vring_get_used_event(vq), new, old); | ||
710 | +} | ||
711 | + | ||
712 | +void vduse_queue_notify(VduseVirtq *vq) | ||
713 | +{ | ||
714 | + VduseDev *dev = vq->dev; | ||
715 | + | ||
716 | + if (unlikely(!vq->vring.avail)) { | ||
717 | + return; | ||
718 | + } | ||
719 | + | ||
720 | + if (!vduse_queue_should_notify(vq)) { | ||
721 | + return; | ||
722 | + } | ||
723 | + | ||
724 | + if (vduse_inject_irq(dev, vq->index) < 0) { | ||
725 | + fprintf(stderr, "Error inject irq for vq %d: %s\n", | ||
726 | + vq->index, strerror(errno)); | ||
727 | + } | ||
728 | +} | ||
729 | + | ||
730 | +static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val) | ||
731 | +{ | ||
732 | + *((uint16_t *)&vq->vring.used->ring[vq->vring.num]) = htole16(val); | ||
733 | +} | ||
734 | + | ||
735 | +static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg, | ||
736 | + struct iovec *iov, unsigned int max_num_sg, | ||
737 | + bool is_write, uint64_t pa, size_t sz) | ||
738 | +{ | ||
739 | + unsigned num_sg = *p_num_sg; | ||
740 | + VduseDev *dev = vq->dev; | ||
741 | + | ||
742 | + assert(num_sg <= max_num_sg); | ||
743 | + | ||
744 | + if (!sz) { | ||
745 | + fprintf(stderr, "virtio: zero sized buffers are not allowed\n"); | ||
746 | + return false; | ||
747 | + } | ||
748 | + | ||
749 | + while (sz) { | ||
750 | + uint64_t len = sz; | ||
751 | + | ||
752 | + if (num_sg == max_num_sg) { | ||
753 | + fprintf(stderr, | ||
754 | + "virtio: too many descriptors in indirect table\n"); | ||
755 | + return false; | ||
756 | + } | ||
757 | + | ||
758 | + iov[num_sg].iov_base = iova_to_va(dev, &len, pa); | ||
759 | + if (iov[num_sg].iov_base == NULL) { | ||
760 | + fprintf(stderr, "virtio: invalid address for buffers\n"); | ||
761 | + return false; | ||
762 | + } | ||
763 | + iov[num_sg++].iov_len = len; | ||
764 | + sz -= len; | ||
765 | + pa += len; | ||
766 | + } | ||
767 | + | ||
768 | + *p_num_sg = num_sg; | ||
769 | + return true; | ||
770 | +} | ||
771 | + | ||
772 | +static void *vduse_queue_alloc_element(size_t sz, unsigned out_num, | ||
773 | + unsigned in_num) | ||
774 | +{ | ||
775 | + VduseVirtqElement *elem; | ||
776 | + size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0])); | ||
777 | + size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]); | ||
778 | + size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]); | ||
779 | + | ||
780 | + assert(sz >= sizeof(VduseVirtqElement)); | ||
781 | + elem = malloc(out_sg_end); | ||
782 | + if (!elem) { | ||
783 | + return NULL; | ||
784 | + } | ||
785 | + elem->out_num = out_num; | ||
786 | + elem->in_num = in_num; | ||
787 | + elem->in_sg = (void *)elem + in_sg_ofs; | ||
788 | + elem->out_sg = (void *)elem + out_sg_ofs; | ||
789 | + return elem; | ||
790 | +} | ||
791 | + | ||
792 | +static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz) | ||
793 | +{ | ||
794 | + struct vring_desc *desc = vq->vring.desc; | ||
795 | + VduseDev *dev = vq->dev; | ||
796 | + uint64_t desc_addr, read_len; | ||
797 | + unsigned int desc_len; | ||
798 | + unsigned int max = vq->vring.num; | ||
799 | + unsigned int i = idx; | ||
800 | + VduseVirtqElement *elem; | ||
801 | + struct iovec iov[VIRTQUEUE_MAX_SIZE]; | ||
802 | + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; | ||
803 | + unsigned int out_num = 0, in_num = 0; | ||
804 | + int rc; | ||
805 | + | ||
806 | + if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { | ||
807 | + if (le32toh(desc[i].len) % sizeof(struct vring_desc)) { | ||
808 | + fprintf(stderr, "Invalid size for indirect buffer table\n"); | ||
809 | + return NULL; | ||
810 | + } | ||
811 | + | ||
812 | + /* loop over the indirect descriptor table */ | ||
813 | + desc_addr = le64toh(desc[i].addr); | ||
814 | + desc_len = le32toh(desc[i].len); | ||
815 | + max = desc_len / sizeof(struct vring_desc); | ||
816 | + read_len = desc_len; | ||
817 | + desc = iova_to_va(dev, &read_len, desc_addr); | ||
818 | + if (unlikely(desc && read_len != desc_len)) { | ||
819 | + /* Failed to use zero copy */ | ||
820 | + desc = NULL; | ||
821 | + if (!vduse_queue_read_indirect_desc(dev, desc_buf, | ||
822 | + desc_addr, | ||
823 | + desc_len)) { | ||
824 | + desc = desc_buf; | ||
825 | + } | ||
826 | + } | ||
827 | + if (!desc) { | ||
828 | + fprintf(stderr, "Invalid indirect buffer table\n"); | ||
829 | + return NULL; | ||
830 | + } | ||
831 | + i = 0; | ||
832 | + } | ||
833 | + | ||
834 | + /* Collect all the descriptors */ | ||
835 | + do { | ||
836 | + if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { | ||
837 | + if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num, | ||
838 | + VIRTQUEUE_MAX_SIZE - out_num, | ||
839 | + true, le64toh(desc[i].addr), | ||
840 | + le32toh(desc[i].len))) { | ||
841 | + return NULL; | ||
842 | + } | ||
843 | + } else { | ||
844 | + if (in_num) { | ||
845 | + fprintf(stderr, "Incorrect order for descriptors\n"); | ||
846 | + return NULL; | ||
847 | + } | ||
848 | + if (!vduse_queue_map_single_desc(vq, &out_num, iov, | ||
849 | + VIRTQUEUE_MAX_SIZE, false, | ||
850 | + le64toh(desc[i].addr), | ||
851 | + le32toh(desc[i].len))) { | ||
852 | + return NULL; | ||
853 | + } | ||
854 | + } | ||
855 | + | ||
856 | + /* If we've got too many, that implies a descriptor loop. */ | ||
857 | + if ((in_num + out_num) > max) { | ||
858 | + fprintf(stderr, "Looped descriptor\n"); | ||
859 | + return NULL; | ||
860 | + } | ||
861 | + rc = vduse_queue_read_next_desc(desc, i, max, &i); | ||
862 | + } while (rc == VIRTQUEUE_READ_DESC_MORE); | ||
863 | + | ||
864 | + if (rc == VIRTQUEUE_READ_DESC_ERROR) { | ||
865 | + fprintf(stderr, "read descriptor error\n"); | ||
866 | + return NULL; | ||
867 | + } | ||
868 | + | ||
869 | + /* Now copy what we have collected and mapped */ | ||
870 | + elem = vduse_queue_alloc_element(sz, out_num, in_num); | ||
871 | + if (!elem) { | ||
872 | + fprintf(stderr, "read descriptor error\n"); | ||
873 | + return NULL; | ||
874 | + } | ||
875 | + elem->index = idx; | ||
876 | + for (i = 0; i < out_num; i++) { | ||
877 | + elem->out_sg[i] = iov[i]; | ||
878 | + } | ||
879 | + for (i = 0; i < in_num; i++) { | ||
880 | + elem->in_sg[i] = iov[out_num + i]; | ||
881 | + } | ||
882 | + | ||
883 | + return elem; | ||
884 | +} | ||
885 | + | ||
886 | +void *vduse_queue_pop(VduseVirtq *vq, size_t sz) | ||
887 | +{ | ||
888 | + unsigned int head; | ||
889 | + VduseVirtqElement *elem; | ||
890 | + VduseDev *dev = vq->dev; | ||
891 | + | ||
892 | + if (unlikely(!vq->vring.avail)) { | ||
893 | + return NULL; | ||
894 | + } | ||
895 | + | ||
896 | + if (vduse_queue_empty(vq)) { | ||
897 | + return NULL; | ||
898 | + } | ||
899 | + /* Needed after virtio_queue_empty() */ | ||
900 | + smp_rmb(); | ||
901 | + | ||
902 | + if (vq->inuse >= vq->vring.num) { | ||
903 | + fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse); | ||
904 | + return NULL; | ||
905 | + } | ||
906 | + | ||
907 | + if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) { | ||
908 | + return NULL; | ||
909 | + } | ||
910 | + | ||
911 | + if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { | ||
912 | + vring_set_avail_event(vq, vq->last_avail_idx); | ||
913 | + } | ||
914 | + | ||
915 | + elem = vduse_queue_map_desc(vq, head, sz); | ||
916 | + | ||
917 | + if (!elem) { | ||
918 | + return NULL; | ||
919 | + } | ||
920 | + | ||
921 | + vq->inuse++; | ||
922 | + | ||
923 | + return elem; | ||
924 | +} | ||
925 | + | ||
926 | +static inline void vring_used_write(VduseVirtq *vq, | ||
927 | + struct vring_used_elem *uelem, int i) | ||
928 | +{ | ||
929 | + struct vring_used *used = vq->vring.used; | ||
930 | + | ||
931 | + used->ring[i] = *uelem; | ||
932 | +} | ||
933 | + | ||
934 | +static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem, | ||
935 | + unsigned int len, unsigned int idx) | ||
936 | +{ | ||
937 | + struct vring_used_elem uelem; | ||
938 | + | ||
939 | + if (unlikely(!vq->vring.used)) { | ||
940 | + return; | ||
941 | + } | ||
942 | + | ||
943 | + idx = (idx + vq->used_idx) % vq->vring.num; | ||
944 | + | ||
945 | + uelem.id = htole32(elem->index); | ||
946 | + uelem.len = htole32(len); | ||
947 | + vring_used_write(vq, &uelem, idx); | ||
948 | +} | ||
949 | + | ||
950 | +static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val) | ||
951 | +{ | ||
952 | + vq->vring.used->idx = htole16(val); | ||
953 | + vq->used_idx = val; | ||
954 | +} | ||
955 | + | ||
956 | +static void vduse_queue_flush(VduseVirtq *vq, unsigned int count) | ||
957 | +{ | ||
958 | + uint16_t old, new; | ||
959 | + | ||
960 | + if (unlikely(!vq->vring.used)) { | ||
961 | + return; | ||
962 | + } | ||
963 | + | ||
964 | + /* Make sure buffer is written before we update index. */ | ||
965 | + smp_wmb(); | ||
966 | + | ||
967 | + old = vq->used_idx; | ||
968 | + new = old + count; | ||
969 | + vring_used_idx_set(vq, new); | ||
970 | + vq->inuse -= count; | ||
971 | + if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) { | ||
972 | + vq->signalled_used_valid = false; | ||
973 | + } | ||
974 | +} | ||
975 | + | ||
976 | +void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem, | ||
977 | + unsigned int len) | ||
978 | +{ | ||
979 | + vduse_queue_fill(vq, elem, len, 0); | ||
980 | + vduse_queue_flush(vq, 1); | ||
981 | +} | ||
982 | + | ||
983 | +static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr, | ||
984 | + uint64_t avail_addr, uint64_t used_addr) | ||
985 | +{ | ||
986 | + struct VduseDev *dev = vq->dev; | ||
987 | + uint64_t len; | ||
988 | + | ||
989 | + len = sizeof(struct vring_desc); | ||
990 | + vq->vring.desc = iova_to_va(dev, &len, desc_addr); | ||
991 | + if (len != sizeof(struct vring_desc)) { | ||
992 | + return -EINVAL; | ||
993 | + } | ||
994 | + | ||
995 | + len = sizeof(struct vring_avail); | ||
996 | + vq->vring.avail = iova_to_va(dev, &len, avail_addr); | ||
997 | + if (len != sizeof(struct vring_avail)) { | ||
998 | + return -EINVAL; | ||
999 | + } | ||
1000 | + | ||
1001 | + len = sizeof(struct vring_used); | ||
1002 | + vq->vring.used = iova_to_va(dev, &len, used_addr); | ||
1003 | + if (len != sizeof(struct vring_used)) { | ||
1004 | + return -EINVAL; | ||
1005 | + } | ||
1006 | + | ||
1007 | + if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) { | ||
1008 | + fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index); | ||
1009 | + return -EINVAL; | ||
1010 | + } | ||
1011 | + | ||
1012 | + return 0; | ||
1013 | +} | ||
1014 | + | ||
1015 | +static void vduse_queue_enable(VduseVirtq *vq) | ||
1016 | +{ | ||
1017 | + struct VduseDev *dev = vq->dev; | ||
1018 | + struct vduse_vq_info vq_info; | ||
1019 | + struct vduse_vq_eventfd vq_eventfd; | ||
1020 | + int fd; | ||
1021 | + | ||
1022 | + vq_info.index = vq->index; | ||
1023 | + if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) { | ||
1024 | + fprintf(stderr, "Failed to get vq[%d] info: %s\n", | ||
1025 | + vq->index, strerror(errno)); | ||
1026 | + return; | ||
1027 | + } | ||
1028 | + | ||
1029 | + if (!vq_info.ready) { | ||
1030 | + return; | ||
1031 | + } | ||
1032 | + | ||
1033 | + vq->vring.num = vq_info.num; | ||
1034 | + vq->vring.desc_addr = vq_info.desc_addr; | ||
1035 | + vq->vring.avail_addr = vq_info.driver_addr; | ||
1036 | + vq->vring.used_addr = vq_info.device_addr; | ||
1037 | + | ||
1038 | + if (vduse_queue_update_vring(vq, vq_info.desc_addr, | ||
1039 | + vq_info.driver_addr, vq_info.device_addr)) { | ||
1040 | + fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index); | ||
1041 | + return; | ||
1042 | + } | ||
1043 | + | ||
1044 | + fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); | ||
1045 | + if (fd < 0) { | ||
1046 | + fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index); | ||
1047 | + return; | ||
1048 | + } | ||
1049 | + | ||
1050 | + vq_eventfd.index = vq->index; | ||
1051 | + vq_eventfd.fd = fd; | ||
1052 | + if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) { | ||
1053 | + fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index); | ||
1054 | + close(fd); | ||
1055 | + return; | ||
1056 | + } | ||
1057 | + | ||
1058 | + vq->fd = fd; | ||
1059 | + vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index; | ||
1060 | + vq->inuse = 0; | ||
1061 | + vq->used_idx = 0; | ||
1062 | + vq->signalled_used_valid = false; | ||
1063 | + vq->ready = true; | ||
1064 | + | ||
1065 | + dev->ops->enable_queue(dev, vq); | ||
1066 | +} | ||
1067 | + | ||
1068 | +static void vduse_queue_disable(VduseVirtq *vq) | ||
1069 | +{ | ||
1070 | + struct VduseDev *dev = vq->dev; | ||
1071 | + struct vduse_vq_eventfd eventfd; | ||
1072 | + | ||
1073 | + if (!vq->ready) { | ||
1074 | + return; | ||
1075 | + } | ||
1076 | + | ||
1077 | + dev->ops->disable_queue(dev, vq); | ||
1078 | + | ||
1079 | + eventfd.index = vq->index; | ||
1080 | + eventfd.fd = VDUSE_EVENTFD_DEASSIGN; | ||
1081 | + ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd); | ||
1082 | + close(vq->fd); | ||
1083 | + | ||
1084 | + assert(vq->inuse == 0); | ||
1085 | + | ||
1086 | + vq->vring.num = 0; | ||
1087 | + vq->vring.desc_addr = 0; | ||
1088 | + vq->vring.avail_addr = 0; | ||
1089 | + vq->vring.used_addr = 0; | ||
1090 | + vq->vring.desc = 0; | ||
1091 | + vq->vring.avail = 0; | ||
1092 | + vq->vring.used = 0; | ||
1093 | + vq->ready = false; | ||
1094 | + vq->fd = -1; | ||
1095 | +} | ||
1096 | + | ||
1097 | +static void vduse_dev_start_dataplane(VduseDev *dev) | ||
1098 | +{ | ||
1099 | + int i; | ||
1100 | + | ||
1101 | + if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) { | ||
1102 | + fprintf(stderr, "Failed to get features: %s\n", strerror(errno)); | ||
1103 | + return; | ||
1104 | + } | ||
1105 | + assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1)); | ||
1106 | + | ||
1107 | + for (i = 0; i < dev->num_queues; i++) { | ||
1108 | + vduse_queue_enable(&dev->vqs[i]); | ||
1109 | + } | ||
1110 | +} | ||
1111 | + | ||
1112 | +static void vduse_dev_stop_dataplane(VduseDev *dev) | ||
1113 | +{ | ||
1114 | + int i; | ||
1115 | + | ||
1116 | + for (i = 0; i < dev->num_queues; i++) { | ||
1117 | + vduse_queue_disable(&dev->vqs[i]); | ||
1118 | + } | ||
1119 | + dev->features = 0; | ||
1120 | + vduse_iova_remove_region(dev, 0, ULONG_MAX); | ||
1121 | +} | ||
1122 | + | ||
1123 | +int vduse_dev_handler(VduseDev *dev) | ||
1124 | +{ | ||
1125 | + struct vduse_dev_request req; | ||
1126 | + struct vduse_dev_response resp = { 0 }; | ||
1127 | + VduseVirtq *vq; | ||
1128 | + int i, ret; | ||
1129 | + | ||
1130 | + ret = read(dev->fd, &req, sizeof(req)); | ||
1131 | + if (ret != sizeof(req)) { | ||
1132 | + fprintf(stderr, "Read request error [%d]: %s\n", | ||
1133 | + ret, strerror(errno)); | ||
1134 | + return -errno; | ||
1135 | + } | ||
1136 | + resp.request_id = req.request_id; | ||
1137 | + | ||
1138 | + switch (req.type) { | ||
1139 | + case VDUSE_GET_VQ_STATE: | ||
1140 | + vq = &dev->vqs[req.vq_state.index]; | ||
1141 | + resp.vq_state.split.avail_index = vq->last_avail_idx; | ||
1142 | + resp.result = VDUSE_REQ_RESULT_OK; | ||
1143 | + break; | ||
1144 | + case VDUSE_SET_STATUS: | ||
1145 | + if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) { | ||
1146 | + vduse_dev_start_dataplane(dev); | ||
1147 | + } else if (req.s.status == 0) { | ||
1148 | + vduse_dev_stop_dataplane(dev); | ||
1149 | + } | ||
1150 | + resp.result = VDUSE_REQ_RESULT_OK; | ||
1151 | + break; | ||
1152 | + case VDUSE_UPDATE_IOTLB: | ||
1153 | + /* The iova will be updated by iova_to_va() later, so just remove it */ | ||
1154 | + vduse_iova_remove_region(dev, req.iova.start, req.iova.last); | ||
1155 | + for (i = 0; i < dev->num_queues; i++) { | ||
1156 | + VduseVirtq *vq = &dev->vqs[i]; | ||
1157 | + if (vq->ready) { | ||
1158 | + if (vduse_queue_update_vring(vq, vq->vring.desc_addr, | ||
1159 | + vq->vring.avail_addr, | ||
1160 | + vq->vring.used_addr)) { | ||
1161 | + fprintf(stderr, "Failed to update vring for vq[%d]\n", | ||
1162 | + vq->index); | ||
1163 | + } | ||
1164 | + } | ||
1165 | + } | ||
1166 | + resp.result = VDUSE_REQ_RESULT_OK; | ||
1167 | + break; | ||
1168 | + default: | ||
1169 | + resp.result = VDUSE_REQ_RESULT_FAILED; | ||
1170 | + break; | ||
1171 | + } | ||
1172 | + | ||
1173 | + ret = write(dev->fd, &resp, sizeof(resp)); | ||
1174 | + if (ret != sizeof(resp)) { | ||
1175 | + fprintf(stderr, "Write request %d error [%d]: %s\n", | ||
1176 | + req.type, ret, strerror(errno)); | ||
1177 | + return -errno; | ||
1178 | + } | ||
1179 | + return 0; | ||
1180 | +} | ||
1181 | + | ||
1182 | +int vduse_dev_update_config(VduseDev *dev, uint32_t size, | ||
1183 | + uint32_t offset, char *buffer) | ||
1184 | +{ | ||
1185 | + int ret; | ||
1186 | + struct vduse_config_data *data; | ||
1187 | + | ||
1188 | + data = malloc(offsetof(struct vduse_config_data, buffer) + size); | ||
1189 | + if (!data) { | ||
1190 | + return -ENOMEM; | ||
1191 | + } | ||
1192 | + | ||
1193 | + data->offset = offset; | ||
1194 | + data->length = size; | ||
1195 | + memcpy(data->buffer, buffer, size); | ||
1196 | + | ||
1197 | + ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data); | ||
1198 | + free(data); | ||
1199 | + | ||
1200 | + if (ret) { | ||
1201 | + return -errno; | ||
1202 | + } | ||
1203 | + | ||
1204 | + if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) { | ||
1205 | + return -errno; | ||
1206 | + } | ||
1207 | + | ||
1208 | + return 0; | ||
1209 | +} | ||
1210 | + | ||
1211 | +int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size) | ||
1212 | +{ | ||
1213 | + VduseVirtq *vq = &dev->vqs[index]; | ||
1214 | + struct vduse_vq_config vq_config = { 0 }; | ||
1215 | + | ||
1216 | + if (max_size > VIRTQUEUE_MAX_SIZE) { | ||
1217 | + return -EINVAL; | ||
1218 | + } | ||
1219 | + | ||
1220 | + vq_config.index = vq->index; | ||
1221 | + vq_config.max_size = max_size; | ||
1222 | + | ||
1223 | + if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) { | ||
1224 | + return -errno; | ||
1225 | + } | ||
1226 | + | ||
1227 | + return 0; | ||
1228 | +} | ||
1229 | + | ||
1230 | +static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues) | ||
1231 | +{ | ||
1232 | + VduseVirtq *vqs; | ||
1233 | + int i; | ||
1234 | + | ||
1235 | + vqs = calloc(sizeof(VduseVirtq), num_queues); | ||
1236 | + if (!vqs) { | ||
1237 | + return -ENOMEM; | ||
1238 | + } | ||
1239 | + | ||
1240 | + for (i = 0; i < num_queues; i++) { | ||
1241 | + vqs[i].index = i; | ||
1242 | + vqs[i].dev = dev; | ||
1243 | + vqs[i].fd = -1; | ||
1244 | + } | ||
1245 | + dev->vqs = vqs; | ||
1246 | + | ||
1247 | + return 0; | ||
1248 | +} | ||
1249 | + | ||
1250 | +static int vduse_dev_init(VduseDev *dev, const char *name, | ||
1251 | + uint16_t num_queues, const VduseOps *ops, | ||
1252 | + void *priv) | ||
1253 | +{ | ||
1254 | + char *dev_path, *dev_name; | ||
1255 | + int ret, fd; | ||
1256 | + | ||
1257 | + dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1); | ||
1258 | + if (!dev_path) { | ||
1259 | + return -ENOMEM; | ||
1260 | + } | ||
1261 | + sprintf(dev_path, "/dev/vduse/%s", name); | ||
1262 | + | ||
1263 | + fd = open(dev_path, O_RDWR); | ||
1264 | + free(dev_path); | ||
1265 | + if (fd < 0) { | ||
1266 | + fprintf(stderr, "Failed to open vduse dev %s: %s\n", | ||
1267 | + name, strerror(errno)); | ||
1268 | + return -errno; | ||
1269 | + } | ||
1270 | + | ||
1271 | + dev_name = strdup(name); | ||
1272 | + if (!dev_name) { | ||
1273 | + close(fd); | ||
1274 | + return -ENOMEM; | ||
1275 | + } | ||
1276 | + | ||
1277 | + ret = vduse_dev_init_vqs(dev, num_queues); | ||
1278 | + if (ret) { | ||
1279 | + free(dev_name); | ||
1280 | + close(fd); | ||
1281 | + return ret; | ||
1282 | + } | ||
1283 | + | ||
1284 | + dev->name = dev_name; | ||
1285 | + dev->num_queues = num_queues; | ||
1286 | + dev->fd = fd; | ||
1287 | + dev->ops = ops; | ||
1288 | + dev->priv = priv; | ||
1289 | + | ||
1290 | + return 0; | ||
1291 | +} | ||
1292 | + | ||
1293 | +static inline bool vduse_name_is_valid(const char *name) | ||
1294 | +{ | ||
1295 | + return strlen(name) >= VDUSE_NAME_MAX || strstr(name, ".."); | ||
1296 | +} | ||
1297 | + | ||
1298 | +VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues, | ||
1299 | + const VduseOps *ops, void *priv) | ||
1300 | +{ | ||
1301 | + VduseDev *dev; | ||
1302 | + int ret; | ||
1303 | + | ||
1304 | + if (!ops || !ops->enable_queue || !ops->disable_queue) { | ||
1305 | + fprintf(stderr, "Invalid parameter for vduse\n"); | ||
1306 | + return NULL; | ||
1307 | + } | ||
1308 | + | ||
1309 | + dev = calloc(sizeof(VduseDev), 1); | ||
1310 | + if (!dev) { | ||
1311 | + fprintf(stderr, "Failed to allocate vduse device\n"); | ||
1312 | + return NULL; | ||
1313 | + } | ||
1314 | + | ||
1315 | + ret = vduse_dev_init_vqs(dev, num_queues); | ||
1316 | + if (ret) { | ||
1317 | + fprintf(stderr, "Failed to init vqs\n"); | ||
1318 | + free(dev); | ||
1319 | + return NULL; | ||
1320 | + } | ||
1321 | + | ||
1322 | + dev->num_queues = num_queues; | ||
1323 | + dev->fd = fd; | ||
1324 | + dev->ops = ops; | ||
1325 | + dev->priv = priv; | ||
1326 | + | ||
1327 | + return dev; | ||
1328 | +} | ||
1329 | + | ||
1330 | +VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues, | ||
1331 | + const VduseOps *ops, void *priv) | ||
1332 | +{ | ||
1333 | + VduseDev *dev; | ||
1334 | + int ret; | ||
1335 | + | ||
1336 | + if (!name || vduse_name_is_valid(name) || !ops || | ||
1337 | + !ops->enable_queue || !ops->disable_queue) { | ||
1338 | + fprintf(stderr, "Invalid parameter for vduse\n"); | ||
1339 | + return NULL; | ||
1340 | + } | ||
1341 | + | ||
1342 | + dev = calloc(sizeof(VduseDev), 1); | ||
1343 | + if (!dev) { | ||
1344 | + fprintf(stderr, "Failed to allocate vduse device\n"); | ||
1345 | + return NULL; | ||
1346 | + } | ||
1347 | + | ||
1348 | + ret = vduse_dev_init(dev, name, num_queues, ops, priv); | ||
1349 | + if (ret < 0) { | ||
1350 | + fprintf(stderr, "Failed to init vduse device %s: %s\n", | ||
1351 | + name, strerror(ret)); | ||
1352 | + free(dev); | ||
1353 | + return NULL; | ||
1354 | + } | ||
1355 | + | ||
1356 | + return dev; | ||
1357 | +} | ||
1358 | + | ||
1359 | +VduseDev *vduse_dev_create(const char *name, uint32_t device_id, | ||
1360 | + uint32_t vendor_id, uint64_t features, | ||
1361 | + uint16_t num_queues, uint32_t config_size, | ||
1362 | + char *config, const VduseOps *ops, void *priv) | ||
1363 | +{ | ||
1364 | + VduseDev *dev; | ||
1365 | + int ret, ctrl_fd; | ||
1366 | + uint64_t version; | ||
1367 | + struct vduse_dev_config *dev_config; | ||
1368 | + size_t size = offsetof(struct vduse_dev_config, config); | ||
1369 | + | ||
1370 | + if (!name || vduse_name_is_valid(name) || | ||
1371 | + !has_feature(features, VIRTIO_F_VERSION_1) || !config || | ||
1372 | + !config_size || !ops || !ops->enable_queue || !ops->disable_queue) { | ||
1373 | + fprintf(stderr, "Invalid parameter for vduse\n"); | ||
1374 | + return NULL; | ||
1375 | + } | ||
1376 | + | ||
1377 | + dev = calloc(sizeof(VduseDev), 1); | ||
1378 | + if (!dev) { | ||
1379 | + fprintf(stderr, "Failed to allocate vduse device\n"); | ||
1380 | + return NULL; | ||
1381 | + } | ||
1382 | + | ||
1383 | + ctrl_fd = open("/dev/vduse/control", O_RDWR); | ||
1384 | + if (ctrl_fd < 0) { | ||
1385 | + fprintf(stderr, "Failed to open /dev/vduse/control: %s\n", | ||
1386 | + strerror(errno)); | ||
1387 | + goto err_ctrl; | ||
1388 | + } | ||
1389 | + | ||
1390 | + version = VDUSE_API_VERSION; | ||
1391 | + if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) { | ||
1392 | + fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n", | ||
1393 | + version, strerror(errno)); | ||
1394 | + goto err_dev; | ||
1395 | + } | ||
1396 | + | ||
1397 | + dev_config = calloc(size + config_size, 1); | ||
1398 | + if (!dev_config) { | ||
1399 | + fprintf(stderr, "Failed to allocate config space\n"); | ||
1400 | + goto err_dev; | ||
1401 | + } | ||
1402 | + | ||
1403 | + strcpy(dev_config->name, name); | ||
1404 | + dev_config->device_id = device_id; | ||
1405 | + dev_config->vendor_id = vendor_id; | ||
1406 | + dev_config->features = features; | ||
1407 | + dev_config->vq_num = num_queues; | ||
1408 | + dev_config->vq_align = VDUSE_VQ_ALIGN; | ||
1409 | + dev_config->config_size = config_size; | ||
1410 | + memcpy(dev_config->config, config, config_size); | ||
1411 | + | ||
1412 | + ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config); | ||
1413 | + free(dev_config); | ||
1414 | + if (ret < 0) { | ||
1415 | + fprintf(stderr, "Failed to create vduse device %s: %s\n", | ||
1416 | + name, strerror(errno)); | ||
1417 | + goto err_dev; | ||
1418 | + } | ||
1419 | + dev->ctrl_fd = ctrl_fd; | ||
1420 | + | ||
1421 | + ret = vduse_dev_init(dev, name, num_queues, ops, priv); | ||
1422 | + if (ret < 0) { | ||
1423 | + fprintf(stderr, "Failed to init vduse device %s: %s\n", | ||
1424 | + name, strerror(ret)); | ||
1425 | + goto err; | ||
1426 | + } | ||
1427 | + | ||
1428 | + return dev; | ||
1429 | +err: | ||
1430 | + ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name); | ||
1431 | +err_dev: | ||
1432 | + close(ctrl_fd); | ||
1433 | +err_ctrl: | ||
1434 | + free(dev); | ||
1435 | + | ||
1436 | + return NULL; | ||
1437 | +} | ||
1438 | + | ||
1439 | +int vduse_dev_destroy(VduseDev *dev) | ||
1440 | +{ | ||
1441 | + int ret = 0; | ||
1442 | + | ||
1443 | + free(dev->vqs); | ||
1444 | + if (dev->fd >= 0) { | ||
1445 | + close(dev->fd); | ||
1446 | + dev->fd = -1; | ||
1447 | + } | ||
1448 | + if (dev->ctrl_fd >= 0) { | ||
1449 | + if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) { | ||
1450 | + ret = -errno; | ||
1451 | + } | ||
1452 | + close(dev->ctrl_fd); | ||
1453 | + dev->ctrl_fd = -1; | ||
1454 | + } | ||
1455 | + free(dev->name); | ||
1456 | + free(dev); | ||
1457 | + | ||
1458 | + return ret; | ||
1459 | +} | ||
1460 | diff --git a/MAINTAINERS b/MAINTAINERS | ||
1461 | index XXXXXXX..XXXXXXX 100644 | 158 | index XXXXXXX..XXXXXXX 100644 |
1462 | --- a/MAINTAINERS | 159 | --- a/tests/Makefile.include |
1463 | +++ b/MAINTAINERS | 160 | +++ b/tests/Makefile.include |
1464 | @@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org | 161 | @@ -XXX,XX +XXX,XX @@ gcov-files-test-thread-pool-y = thread-pool.c |
1465 | S: Supported | 162 | gcov-files-test-hbitmap-y = util/hbitmap.c |
1466 | F: block/export/fuse.c | 163 | check-unit-y += tests/test-hbitmap$(EXESUF) |
1467 | 164 | gcov-files-test-hbitmap-y = blockjob.c | |
1468 | +VDUSE library | 165 | +check-unit-y += tests/test-bdrv-drain$(EXESUF) |
1469 | +M: Xie Yongji <xieyongji@bytedance.com> | 166 | check-unit-y += tests/test-blockjob$(EXESUF) |
1470 | +S: Maintained | 167 | check-unit-y += tests/test-blockjob-txn$(EXESUF) |
1471 | +F: subprojects/libvduse/ | 168 | check-unit-y += tests/test-x86-cpuid$(EXESUF) |
1472 | + | 169 | @@ -XXX,XX +XXX,XX @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y) |
1473 | Replication | 170 | tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y) |
1474 | M: Wen Congyang <wencongyang2@huawei.com> | 171 | tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y) |
1475 | M: Xie Changlong <xiechanglong.d@gmail.com> | 172 | tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y) |
1476 | diff --git a/meson.build b/meson.build | 173 | +tests/test-bdrv-drain$(EXESUF): tests/test-bdrv-drain.o $(test-block-obj-y) $(test-util-obj-y) |
1477 | index XXXXXXX..XXXXXXX 100644 | 174 | tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y) |
1478 | --- a/meson.build | 175 | tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y) |
1479 | +++ b/meson.build | 176 | tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y) |
1480 | @@ -XXX,XX +XXX,XX @@ if get_option('fuse_lseek').allowed() | ||
1481 | endif | ||
1482 | endif | ||
1483 | |||
1484 | +have_libvduse = (targetos == 'linux') | ||
1485 | +if get_option('libvduse').enabled() | ||
1486 | + if targetos != 'linux' | ||
1487 | + error('libvduse requires linux') | ||
1488 | + endif | ||
1489 | +elif get_option('libvduse').disabled() | ||
1490 | + have_libvduse = false | ||
1491 | +endif | ||
1492 | + | ||
1493 | # libbpf | ||
1494 | libbpf = dependency('libbpf', required: get_option('bpf'), method: 'pkg-config') | ||
1495 | if libbpf.found() and not cc.links(''' | ||
1496 | @@ -XXX,XX +XXX,XX @@ if targetos == 'linux' and have_vhost_user | ||
1497 | vhost_user = libvhost_user.get_variable('vhost_user_dep') | ||
1498 | endif | ||
1499 | |||
1500 | +libvduse = not_found | ||
1501 | +if have_libvduse | ||
1502 | + libvduse_proj = subproject('libvduse') | ||
1503 | + libvduse = libvduse_proj.get_variable('libvduse_dep') | ||
1504 | +endif | ||
1505 | + | ||
1506 | # NOTE: the trace/ subdirectory needs the qapi_trace_events variable | ||
1507 | # that is filled in by qapi/. | ||
1508 | subdir('qapi') | ||
1509 | diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh | ||
1510 | index XXXXXXX..XXXXXXX 100644 | ||
1511 | --- a/scripts/meson-buildoptions.sh | ||
1512 | +++ b/scripts/meson-buildoptions.sh | ||
1513 | @@ -XXX,XX +XXX,XX @@ meson_options_help() { | ||
1514 | printf "%s\n" ' libssh ssh block device support' | ||
1515 | printf "%s\n" ' libudev Use libudev to enumerate host devices' | ||
1516 | printf "%s\n" ' libusb libusb support for USB passthrough' | ||
1517 | + printf "%s\n" ' libvduse build VDUSE Library' | ||
1518 | printf "%s\n" ' linux-aio Linux AIO support' | ||
1519 | printf "%s\n" ' linux-io-uring Linux io_uring support' | ||
1520 | printf "%s\n" ' live-block-migration' | ||
1521 | @@ -XXX,XX +XXX,XX @@ _meson_option_parse() { | ||
1522 | --disable-libudev) printf "%s" -Dlibudev=disabled ;; | ||
1523 | --enable-libusb) printf "%s" -Dlibusb=enabled ;; | ||
1524 | --disable-libusb) printf "%s" -Dlibusb=disabled ;; | ||
1525 | + --enable-libvduse) printf "%s" -Dlibvduse=enabled ;; | ||
1526 | + --disable-libvduse) printf "%s" -Dlibvduse=disabled ;; | ||
1527 | --enable-linux-aio) printf "%s" -Dlinux_aio=enabled ;; | ||
1528 | --disable-linux-aio) printf "%s" -Dlinux_aio=disabled ;; | ||
1529 | --enable-linux-io-uring) printf "%s" -Dlinux_io_uring=enabled ;; | ||
1530 | diff --git a/subprojects/libvduse/linux-headers/linux b/subprojects/libvduse/linux-headers/linux | ||
1531 | new file mode 120000 | ||
1532 | index XXXXXXX..XXXXXXX | ||
1533 | --- /dev/null | ||
1534 | +++ b/subprojects/libvduse/linux-headers/linux | ||
1535 | @@ -0,0 +1 @@ | ||
1536 | +../../../linux-headers/linux/ | ||
1537 | \ No newline at end of file | ||
1538 | diff --git a/subprojects/libvduse/meson.build b/subprojects/libvduse/meson.build | ||
1539 | new file mode 100644 | ||
1540 | index XXXXXXX..XXXXXXX | ||
1541 | --- /dev/null | ||
1542 | +++ b/subprojects/libvduse/meson.build | ||
1543 | @@ -XXX,XX +XXX,XX @@ | ||
1544 | +project('libvduse', 'c', | ||
1545 | + license: 'GPL-2.0-or-later', | ||
1546 | + default_options: ['c_std=gnu99']) | ||
1547 | + | ||
1548 | +libvduse = static_library('vduse', | ||
1549 | + files('libvduse.c'), | ||
1550 | + c_args: '-D_GNU_SOURCE') | ||
1551 | + | ||
1552 | +libvduse_dep = declare_dependency(link_with: libvduse, | ||
1553 | + include_directories: include_directories('.')) | ||
1554 | diff --git a/subprojects/libvduse/standard-headers/linux b/subprojects/libvduse/standard-headers/linux | ||
1555 | new file mode 120000 | ||
1556 | index XXXXXXX..XXXXXXX | ||
1557 | --- /dev/null | ||
1558 | +++ b/subprojects/libvduse/standard-headers/linux | ||
1559 | @@ -0,0 +1 @@ | ||
1560 | +../../../include/standard-headers/linux/ | ||
1561 | \ No newline at end of file | ||
1562 | -- | 177 | -- |
1563 | 2.35.3 | 178 | 2.13.6 |
1564 | 179 | ||
1565 | 180 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Now that the bdrv_drain_invoke() calls are pulled up to the callers of | ||
2 | bdrv_drain_recurse(), the 'begin' parameter isn't needed any more. | ||
1 | 3 | ||
4 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
5 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
6 | --- | ||
7 | block/io.c | 12 ++++++------ | ||
8 | 1 file changed, 6 insertions(+), 6 deletions(-) | ||
9 | |||
10 | diff --git a/block/io.c b/block/io.c | ||
11 | index XXXXXXX..XXXXXXX 100644 | ||
12 | --- a/block/io.c | ||
13 | +++ b/block/io.c | ||
14 | @@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin) | ||
15 | } | ||
16 | } | ||
17 | |||
18 | -static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin) | ||
19 | +static bool bdrv_drain_recurse(BlockDriverState *bs) | ||
20 | { | ||
21 | BdrvChild *child, *tmp; | ||
22 | bool waited; | ||
23 | @@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin) | ||
24 | */ | ||
25 | bdrv_ref(bs); | ||
26 | } | ||
27 | - waited |= bdrv_drain_recurse(bs, begin); | ||
28 | + waited |= bdrv_drain_recurse(bs); | ||
29 | if (in_main_loop) { | ||
30 | bdrv_unref(bs); | ||
31 | } | ||
32 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs) | ||
33 | } | ||
34 | |||
35 | bdrv_drain_invoke(bs, true); | ||
36 | - bdrv_drain_recurse(bs, true); | ||
37 | + bdrv_drain_recurse(bs); | ||
38 | } | ||
39 | |||
40 | void bdrv_drained_end(BlockDriverState *bs) | ||
41 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs) | ||
42 | |||
43 | bdrv_parent_drained_end(bs); | ||
44 | bdrv_drain_invoke(bs, false); | ||
45 | - bdrv_drain_recurse(bs, false); | ||
46 | + bdrv_drain_recurse(bs); | ||
47 | aio_enable_external(bdrv_get_aio_context(bs)); | ||
48 | } | ||
49 | |||
50 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void) | ||
51 | aio_context_acquire(aio_context); | ||
52 | for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { | ||
53 | if (aio_context == bdrv_get_aio_context(bs)) { | ||
54 | - waited |= bdrv_drain_recurse(bs, true); | ||
55 | + waited |= bdrv_drain_recurse(bs); | ||
56 | } | ||
57 | } | ||
58 | aio_context_release(aio_context); | ||
59 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void) | ||
60 | aio_enable_external(aio_context); | ||
61 | bdrv_parent_drained_end(bs); | ||
62 | bdrv_drain_invoke(bs, false); | ||
63 | - bdrv_drain_recurse(bs, false); | ||
64 | + bdrv_drain_recurse(bs); | ||
65 | aio_context_release(aio_context); | ||
66 | } | ||
67 | |||
68 | -- | ||
69 | 2.13.6 | ||
70 | |||
71 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | The device is drained, so there is no point in waiting for requests at | ||
2 | the end of the drained section. Remove the bdrv_drain_recurse() calls | ||
3 | there. | ||
1 | 4 | ||
5 | The bdrv_drain_recurse() calls were introduced in commit 481cad48e5e | ||
6 | in order to call the .bdrv_co_drain_end() driver callback. This is now | ||
7 | done by a separate bdrv_drain_invoke() call. | ||
8 | |||
9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
10 | Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> | ||
11 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | --- | ||
13 | block/io.c | 2 -- | ||
14 | 1 file changed, 2 deletions(-) | ||
15 | |||
16 | diff --git a/block/io.c b/block/io.c | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/block/io.c | ||
19 | +++ b/block/io.c | ||
20 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs) | ||
21 | |||
22 | bdrv_parent_drained_end(bs); | ||
23 | bdrv_drain_invoke(bs, false); | ||
24 | - bdrv_drain_recurse(bs); | ||
25 | aio_enable_external(bdrv_get_aio_context(bs)); | ||
26 | } | ||
27 | |||
28 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void) | ||
29 | aio_enable_external(aio_context); | ||
30 | bdrv_parent_drained_end(bs); | ||
31 | bdrv_drain_invoke(bs, false); | ||
32 | - bdrv_drain_recurse(bs); | ||
33 | aio_context_release(aio_context); | ||
34 | } | ||
35 | |||
36 | -- | ||
37 | 2.13.6 | ||
38 | |||
39 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Drain requests are propagated to child nodes, parent nodes and directly | ||
2 | to the AioContext. The order in which this happened was different | ||
3 | between all combinations of drain/drain_all and begin/end. | ||
1 | 4 | ||
5 | The correct order is to keep children only drained when their parents | ||
6 | are also drained. This means that at the start of a drained section, the | ||
7 | AioContext needs to be drained first, the parents second and only then | ||
8 | the children. The correct order for the end of a drained section is the | ||
9 | opposite. | ||
10 | |||
11 | This patch changes the three other functions to follow the example of | ||
12 | bdrv_drained_begin(), which is the only one that got it right. | ||
13 | |||
14 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
15 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
16 | --- | ||
17 | block/io.c | 12 ++++++++---- | ||
18 | 1 file changed, 8 insertions(+), 4 deletions(-) | ||
19 | |||
20 | diff --git a/block/io.c b/block/io.c | ||
21 | index XXXXXXX..XXXXXXX 100644 | ||
22 | --- a/block/io.c | ||
23 | +++ b/block/io.c | ||
24 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs) | ||
25 | return; | ||
26 | } | ||
27 | |||
28 | + /* Stop things in parent-to-child order */ | ||
29 | if (atomic_fetch_inc(&bs->quiesce_counter) == 0) { | ||
30 | aio_disable_external(bdrv_get_aio_context(bs)); | ||
31 | bdrv_parent_drained_begin(bs); | ||
32 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs) | ||
33 | return; | ||
34 | } | ||
35 | |||
36 | - bdrv_parent_drained_end(bs); | ||
37 | + /* Re-enable things in child-to-parent order */ | ||
38 | bdrv_drain_invoke(bs, false); | ||
39 | + bdrv_parent_drained_end(bs); | ||
40 | aio_enable_external(bdrv_get_aio_context(bs)); | ||
41 | } | ||
42 | |||
43 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void) | ||
44 | for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { | ||
45 | AioContext *aio_context = bdrv_get_aio_context(bs); | ||
46 | |||
47 | + /* Stop things in parent-to-child order */ | ||
48 | aio_context_acquire(aio_context); | ||
49 | - bdrv_parent_drained_begin(bs); | ||
50 | aio_disable_external(aio_context); | ||
51 | + bdrv_parent_drained_begin(bs); | ||
52 | bdrv_drain_invoke(bs, true); | ||
53 | aio_context_release(aio_context); | ||
54 | |||
55 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void) | ||
56 | for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { | ||
57 | AioContext *aio_context = bdrv_get_aio_context(bs); | ||
58 | |||
59 | + /* Re-enable things in child-to-parent order */ | ||
60 | aio_context_acquire(aio_context); | ||
61 | - aio_enable_external(aio_context); | ||
62 | - bdrv_parent_drained_end(bs); | ||
63 | bdrv_drain_invoke(bs, false); | ||
64 | + bdrv_parent_drained_end(bs); | ||
65 | + aio_enable_external(aio_context); | ||
66 | aio_context_release(aio_context); | ||
67 | } | ||
68 | |||
69 | -- | ||
70 | 2.13.6 | ||
71 | |||
72 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Commit 15afd94a047 added code to acquire and release the AioContext in | ||
2 | qemuio_command(). This means that the lock is taken twice now in the | ||
3 | call path from hmp_qemu_io(). This causes BDRV_POLL_WHILE() to hang for | ||
4 | any requests issued to nodes in a non-mainloop AioContext. | ||
1 | 5 | ||
6 | Dropping the first locking from hmp_qemu_io() fixes the problem. | ||
7 | |||
8 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
9 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
10 | --- | ||
11 | hmp.c | 6 ------ | ||
12 | 1 file changed, 6 deletions(-) | ||
13 | |||
14 | diff --git a/hmp.c b/hmp.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/hmp.c | ||
17 | +++ b/hmp.c | ||
18 | @@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) | ||
19 | { | ||
20 | BlockBackend *blk; | ||
21 | BlockBackend *local_blk = NULL; | ||
22 | - AioContext *aio_context; | ||
23 | const char* device = qdict_get_str(qdict, "device"); | ||
24 | const char* command = qdict_get_str(qdict, "command"); | ||
25 | Error *err = NULL; | ||
26 | @@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) | ||
27 | } | ||
28 | } | ||
29 | |||
30 | - aio_context = blk_get_aio_context(blk); | ||
31 | - aio_context_acquire(aio_context); | ||
32 | - | ||
33 | /* | ||
34 | * Notably absent: Proper permission management. This is sad, but it seems | ||
35 | * almost impossible to achieve without changing the semantics and thereby | ||
36 | @@ -XXX,XX +XXX,XX @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) | ||
37 | */ | ||
38 | qemuio_command(blk, command); | ||
39 | |||
40 | - aio_context_release(aio_context); | ||
41 | - | ||
42 | fail: | ||
43 | blk_unref(local_blk); | ||
44 | hmp_handle_error(mon, &err); | ||
45 | -- | ||
46 | 2.13.6 | ||
47 | |||
48 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@openvz.org> | 1 | From: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com> |
---|---|---|---|
2 | 2 | ||
3 | We don't need extra bitmap. All we need is to backup the original | 3 | Since bdrv_co_preadv does all necessary checks including |
4 | bitmap when we do first merge. So, drop extra temporary bitmap and work | 4 | reading after the end of the backing file, avoid duplication |
5 | directly with target and backup. | 5 | of verification before bdrv_co_preadv call. |
6 | 6 | ||
7 | Still to keep old semantics, that on failure target is unchanged and | 7 | Signed-off-by: Edgar Kaziakhmedov <edgar.kaziakhmedov@virtuozzo.com> |
8 | user doesn't need to restore, we need a local_backup variable and do | 8 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> |
9 | restore ourselves on failure path. | ||
10 | |||
11 | Signed-off-by: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru> | ||
12 | Message-Id: <20220517111206.23585-3-v.sementsov-og@mail.ru> | ||
13 | Reviewed-by: Eric Blake <eblake@redhat.com> | 9 | Reviewed-by: Eric Blake <eblake@redhat.com> |
14 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
15 | --- | 11 | --- |
16 | block/monitor/bitmap-qmp-cmds.c | 41 +++++++++++++++++---------------- | 12 | block/qcow2.h | 3 --- |
17 | 1 file changed, 21 insertions(+), 20 deletions(-) | 13 | block/qcow2.c | 51 ++++++++------------------------------------------- |
14 | 2 files changed, 8 insertions(+), 46 deletions(-) | ||
18 | 15 | ||
19 | diff --git a/block/monitor/bitmap-qmp-cmds.c b/block/monitor/bitmap-qmp-cmds.c | 16 | diff --git a/block/qcow2.h b/block/qcow2.h |
20 | index XXXXXXX..XXXXXXX 100644 | 17 | index XXXXXXX..XXXXXXX 100644 |
21 | --- a/block/monitor/bitmap-qmp-cmds.c | 18 | --- a/block/qcow2.h |
22 | +++ b/block/monitor/bitmap-qmp-cmds.c | 19 | +++ b/block/qcow2.h |
23 | @@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, | 20 | @@ -XXX,XX +XXX,XX @@ uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset) |
24 | HBitmap **backup, Error **errp) | 21 | } |
25 | { | 22 | |
26 | BlockDriverState *bs; | 23 | /* qcow2.c functions */ |
27 | - BdrvDirtyBitmap *dst, *src, *anon; | 24 | -int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, |
28 | + BdrvDirtyBitmap *dst, *src; | 25 | - int64_t sector_num, int nb_sectors); |
29 | BlockDirtyBitmapOrStrList *lst; | 26 | - |
30 | + HBitmap *local_backup = NULL; | 27 | int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size, |
31 | 28 | int refcount_order, bool generous_increase, | |
32 | GLOBAL_STATE_CODE(); | 29 | uint64_t *refblock_count); |
33 | 30 | diff --git a/block/qcow2.c b/block/qcow2.c | |
34 | @@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, | 31 | index XXXXXXX..XXXXXXX 100644 |
35 | return NULL; | 32 | --- a/block/qcow2.c |
36 | } | 33 | +++ b/block/qcow2.c |
37 | 34 | @@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, | |
38 | - anon = bdrv_create_dirty_bitmap(bs, bdrv_dirty_bitmap_granularity(dst), | 35 | return status; |
39 | - NULL, errp); | 36 | } |
40 | - if (!anon) { | 37 | |
41 | - return NULL; | 38 | -/* handle reading after the end of the backing file */ |
39 | -int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, | ||
40 | - int64_t offset, int bytes) | ||
41 | -{ | ||
42 | - uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE; | ||
43 | - int n1; | ||
44 | - | ||
45 | - if ((offset + bytes) <= bs_size) { | ||
46 | - return bytes; | ||
42 | - } | 47 | - } |
43 | - | 48 | - |
44 | for (lst = bms; lst; lst = lst->next) { | 49 | - if (offset >= bs_size) { |
45 | switch (lst->value->type) { | 50 | - n1 = 0; |
46 | const char *name, *node; | 51 | - } else { |
47 | @@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, | 52 | - n1 = bs_size - offset; |
48 | src = bdrv_find_dirty_bitmap(bs, name); | 53 | - } |
49 | if (!src) { | 54 | - |
50 | error_setg(errp, "Dirty bitmap '%s' not found", name); | 55 | - qemu_iovec_memset(qiov, n1, 0, bytes - n1); |
51 | - dst = NULL; | 56 | - |
52 | - goto out; | 57 | - return n1; |
53 | + goto fail; | 58 | -} |
54 | } | 59 | - |
55 | break; | 60 | static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, |
56 | case QTYPE_QDICT: | 61 | uint64_t bytes, QEMUIOVector *qiov, |
57 | @@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, | 62 | int flags) |
58 | name = lst->value->u.external.name; | 63 | { |
59 | src = block_dirty_bitmap_lookup(node, name, NULL, errp); | 64 | BDRVQcow2State *s = bs->opaque; |
60 | if (!src) { | 65 | - int offset_in_cluster, n1; |
61 | - dst = NULL; | 66 | + int offset_in_cluster; |
62 | - goto out; | 67 | int ret; |
63 | + goto fail; | 68 | unsigned int cur_bytes; /* number of bytes in current iteration */ |
64 | } | 69 | uint64_t cluster_offset = 0; |
65 | break; | 70 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, |
66 | default: | 71 | case QCOW2_CLUSTER_UNALLOCATED: |
67 | abort(); | 72 | |
68 | } | 73 | if (bs->backing) { |
69 | 74 | - /* read from the base image */ | |
70 | - if (!bdrv_merge_dirty_bitmap(anon, src, NULL, errp)) { | 75 | - n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov, |
71 | - dst = NULL; | 76 | - offset, cur_bytes); |
72 | - goto out; | 77 | - if (n1 > 0) { |
73 | + /* We do backup only for first merge operation */ | 78 | - QEMUIOVector local_qiov; |
74 | + if (!bdrv_merge_dirty_bitmap(dst, src, | 79 | - |
75 | + local_backup ? NULL : &local_backup, | 80 | - qemu_iovec_init(&local_qiov, hd_qiov.niov); |
76 | + errp)) | 81 | - qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1); |
77 | + { | 82 | - |
78 | + goto fail; | 83 | - BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); |
79 | } | 84 | - qemu_co_mutex_unlock(&s->lock); |
80 | } | 85 | - ret = bdrv_co_preadv(bs->backing, offset, n1, |
81 | 86 | - &local_qiov, 0); | |
82 | - /* Merge into dst; dst is unchanged on failure. */ | 87 | - qemu_co_mutex_lock(&s->lock); |
83 | - if (!bdrv_merge_dirty_bitmap(dst, anon, backup, errp)) { | 88 | - |
84 | - dst = NULL; | 89 | - qemu_iovec_destroy(&local_qiov); |
85 | - goto out; | 90 | - |
86 | + if (backup) { | 91 | - if (ret < 0) { |
87 | + *backup = local_backup; | 92 | - goto fail; |
88 | + } else { | 93 | - } |
89 | + hbitmap_free(local_backup); | 94 | + BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); |
90 | } | 95 | + qemu_co_mutex_unlock(&s->lock); |
91 | 96 | + ret = bdrv_co_preadv(bs->backing, offset, cur_bytes, | |
92 | - out: | 97 | + &hd_qiov, 0); |
93 | - bdrv_release_dirty_bitmap(anon); | 98 | + qemu_co_mutex_lock(&s->lock); |
94 | return dst; | 99 | + if (ret < 0) { |
95 | + | 100 | + goto fail; |
96 | +fail: | 101 | } |
97 | + if (local_backup) { | 102 | } else { |
98 | + bdrv_restore_dirty_bitmap(dst, local_backup); | 103 | /* Note: in this case, no need to wait */ |
99 | + } | ||
100 | + | ||
101 | + return NULL; | ||
102 | } | ||
103 | |||
104 | void qmp_block_dirty_bitmap_merge(const char *node, const char *target, | ||
105 | -- | 104 | -- |
106 | 2.35.3 | 105 | 2.13.6 |
106 | |||
107 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Removing a quorum child node with x-blockdev-change results in a quorum | ||
2 | driver state that cannot be recreated with create options because it | ||
3 | would require a list with gaps. This causes trouble in at least | ||
4 | .bdrv_refresh_filename(). | ||
1 | 5 | ||
6 | Document this problem so that we won't accidentally mark the command | ||
7 | stable without having addressed it. | ||
8 | |||
9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
10 | Reviewed-by: Alberto Garcia <berto@igalia.com> | ||
11 | --- | ||
12 | qapi/block-core.json | 4 ++++ | ||
13 | 1 file changed, 4 insertions(+) | ||
14 | |||
15 | diff --git a/qapi/block-core.json b/qapi/block-core.json | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/qapi/block-core.json | ||
18 | +++ b/qapi/block-core.json | ||
19 | @@ -XXX,XX +XXX,XX @@ | ||
20 | # does not support all kinds of operations, all kinds of children, nor | ||
21 | # all block drivers. | ||
22 | # | ||
23 | +# FIXME Removing children from a quorum node means introducing gaps in the | ||
24 | +# child indices. This cannot be represented in the 'children' list of | ||
25 | +# BlockdevOptionsQuorum, as returned by .bdrv_refresh_filename(). | ||
26 | +# | ||
27 | # Warning: The data in a new quorum child MUST be consistent with that of | ||
28 | # the rest of the array. | ||
29 | # | ||
30 | -- | ||
31 | 2.13.6 | ||
32 | |||
33 | diff view generated by jsdifflib |
1 | From: Xie Yongji <xieyongji@bytedance.com> | 1 | From: Doug Gale <doug16k@gmail.com> |
---|---|---|---|
2 | 2 | ||
3 | Currently, req->size is set to the correct value only | 3 | Add trace output for commands, errors, and undefined behavior. |
4 | when handling a VIRTIO_BLK_T_GET_ID request. This patch | 4 | Add guest error log output for undefined behavior. |
5 | fixes it. | 5 | Report invalid or undefined accesses to MMIO. |
6 | Annotate unlikely error checks with unlikely(). | ||
6 | 7 | ||
7 | Signed-off-by: Xie Yongji <xieyongji@bytedance.com> | 8 | Signed-off-by: Doug Gale <doug16k@gmail.com> |
8 | Message-Id: <20220523084611.91-3-xieyongji@bytedance.com> | 9 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> |
9 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 10 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 11 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 12 | --- |
12 | block/export/vhost-user-blk-server.c | 5 ++--- | 13 | hw/block/nvme.c | 349 ++++++++++++++++++++++++++++++++++++++++++-------- |
13 | 1 file changed, 2 insertions(+), 3 deletions(-) | 14 | hw/block/trace-events | 93 ++++++++++++++ |
15 | 2 files changed, 390 insertions(+), 52 deletions(-) | ||
14 | 16 | ||
15 | diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c | 17 | diff --git a/hw/block/nvme.c b/hw/block/nvme.c |
16 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/export/vhost-user-blk-server.c | 19 | --- a/hw/block/nvme.c |
18 | +++ b/block/export/vhost-user-blk-server.c | 20 | +++ b/hw/block/nvme.c |
19 | @@ -XXX,XX +XXX,XX @@ static void vu_blk_req_complete(VuBlkReq *req) | 21 | @@ -XXX,XX +XXX,XX @@ |
22 | #include "qapi/visitor.h" | ||
23 | #include "sysemu/block-backend.h" | ||
24 | |||
25 | +#include "qemu/log.h" | ||
26 | +#include "trace.h" | ||
27 | #include "nvme.h" | ||
28 | |||
29 | +#define NVME_GUEST_ERR(trace, fmt, ...) \ | ||
30 | + do { \ | ||
31 | + (trace_##trace)(__VA_ARGS__); \ | ||
32 | + qemu_log_mask(LOG_GUEST_ERROR, #trace \ | ||
33 | + " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \ | ||
34 | + } while (0) | ||
35 | + | ||
36 | static void nvme_process_sq(void *opaque); | ||
37 | |||
38 | static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) | ||
39 | @@ -XXX,XX +XXX,XX @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq) | ||
20 | { | 40 | { |
21 | VuDev *vu_dev = &req->server->vu_dev; | 41 | if (cq->irq_enabled) { |
22 | 42 | if (msix_enabled(&(n->parent_obj))) { | |
23 | - /* IO size with 1 extra status byte */ | 43 | + trace_nvme_irq_msix(cq->vector); |
24 | - vu_queue_push(vu_dev, req->vq, &req->elem, req->size + 1); | 44 | msix_notify(&(n->parent_obj), cq->vector); |
25 | + vu_queue_push(vu_dev, req->vq, &req->elem, req->size); | 45 | } else { |
26 | vu_queue_notify(vu_dev, req->vq); | 46 | + trace_nvme_irq_pin(); |
27 | 47 | pci_irq_pulse(&n->parent_obj); | |
28 | free(req); | 48 | } |
29 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_blk_virtio_process_req(void *opaque) | 49 | + } else { |
30 | goto err; | 50 | + trace_nvme_irq_masked(); |
31 | } | 51 | } |
32 | 52 | } | |
33 | + req->size = iov_size(in_iov, in_num); | 53 | |
34 | /* We always touch the last byte, so just see how big in_iov is. */ | 54 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, |
35 | req->in = (void *)in_iov[in_num - 1].iov_base | 55 | trans_len = MIN(len, trans_len); |
36 | + in_iov[in_num - 1].iov_len | 56 | int num_prps = (len >> n->page_bits) + 1; |
37 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_blk_virtio_process_req(void *opaque) | 57 | |
38 | VIRTIO_BLK_ID_BYTES); | 58 | - if (!prp1) { |
39 | snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk"); | 59 | + if (unlikely(!prp1)) { |
40 | req->in->status = VIRTIO_BLK_S_OK; | 60 | + trace_nvme_err_invalid_prp(); |
41 | - req->size = elem->in_sg[0].iov_len; | 61 | return NVME_INVALID_FIELD | NVME_DNR; |
42 | break; | 62 | } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr && |
43 | } | 63 | prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) { |
44 | case VIRTIO_BLK_T_DISCARD: | 64 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, |
65 | } | ||
66 | len -= trans_len; | ||
67 | if (len) { | ||
68 | - if (!prp2) { | ||
69 | + if (unlikely(!prp2)) { | ||
70 | + trace_nvme_err_invalid_prp2_missing(); | ||
71 | goto unmap; | ||
72 | } | ||
73 | if (len > n->page_size) { | ||
74 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, | ||
75 | uint64_t prp_ent = le64_to_cpu(prp_list[i]); | ||
76 | |||
77 | if (i == n->max_prp_ents - 1 && len > n->page_size) { | ||
78 | - if (!prp_ent || prp_ent & (n->page_size - 1)) { | ||
79 | + if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) { | ||
80 | + trace_nvme_err_invalid_prplist_ent(prp_ent); | ||
81 | goto unmap; | ||
82 | } | ||
83 | |||
84 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, | ||
85 | prp_ent = le64_to_cpu(prp_list[i]); | ||
86 | } | ||
87 | |||
88 | - if (!prp_ent || prp_ent & (n->page_size - 1)) { | ||
89 | + if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) { | ||
90 | + trace_nvme_err_invalid_prplist_ent(prp_ent); | ||
91 | goto unmap; | ||
92 | } | ||
93 | |||
94 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, | ||
95 | i++; | ||
96 | } | ||
97 | } else { | ||
98 | - if (prp2 & (n->page_size - 1)) { | ||
99 | + if (unlikely(prp2 & (n->page_size - 1))) { | ||
100 | + trace_nvme_err_invalid_prp2_align(prp2); | ||
101 | goto unmap; | ||
102 | } | ||
103 | if (qsg->nsg) { | ||
104 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, | ||
105 | QEMUIOVector iov; | ||
106 | uint16_t status = NVME_SUCCESS; | ||
107 | |||
108 | + trace_nvme_dma_read(prp1, prp2); | ||
109 | + | ||
110 | if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) { | ||
111 | return NVME_INVALID_FIELD | NVME_DNR; | ||
112 | } | ||
113 | if (qsg.nsg > 0) { | ||
114 | - if (dma_buf_read(ptr, len, &qsg)) { | ||
115 | + if (unlikely(dma_buf_read(ptr, len, &qsg))) { | ||
116 | + trace_nvme_err_invalid_dma(); | ||
117 | status = NVME_INVALID_FIELD | NVME_DNR; | ||
118 | } | ||
119 | qemu_sglist_destroy(&qsg); | ||
120 | } else { | ||
121 | - if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) { | ||
122 | + if (unlikely(qemu_iovec_to_buf(&iov, 0, ptr, len) != len)) { | ||
123 | + trace_nvme_err_invalid_dma(); | ||
124 | status = NVME_INVALID_FIELD | NVME_DNR; | ||
125 | } | ||
126 | qemu_iovec_destroy(&iov); | ||
127 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, | ||
128 | uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS); | ||
129 | uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS); | ||
130 | |||
131 | - if (slba + nlb > ns->id_ns.nsze) { | ||
132 | + if (unlikely(slba + nlb > ns->id_ns.nsze)) { | ||
133 | + trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); | ||
134 | return NVME_LBA_RANGE | NVME_DNR; | ||
135 | } | ||
136 | |||
137 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, | ||
138 | int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0; | ||
139 | enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ; | ||
140 | |||
141 | - if ((slba + nlb) > ns->id_ns.nsze) { | ||
142 | + trace_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba); | ||
143 | + | ||
144 | + if (unlikely((slba + nlb) > ns->id_ns.nsze)) { | ||
145 | block_acct_invalid(blk_get_stats(n->conf.blk), acct); | ||
146 | + trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); | ||
147 | return NVME_LBA_RANGE | NVME_DNR; | ||
148 | } | ||
149 | |||
150 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) | ||
151 | NvmeNamespace *ns; | ||
152 | uint32_t nsid = le32_to_cpu(cmd->nsid); | ||
153 | |||
154 | - if (nsid == 0 || nsid > n->num_namespaces) { | ||
155 | + if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { | ||
156 | + trace_nvme_err_invalid_ns(nsid, n->num_namespaces); | ||
157 | return NVME_INVALID_NSID | NVME_DNR; | ||
158 | } | ||
159 | |||
160 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) | ||
161 | case NVME_CMD_READ: | ||
162 | return nvme_rw(n, ns, cmd, req); | ||
163 | default: | ||
164 | + trace_nvme_err_invalid_opc(cmd->opcode); | ||
165 | return NVME_INVALID_OPCODE | NVME_DNR; | ||
166 | } | ||
167 | } | ||
168 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd) | ||
169 | NvmeCQueue *cq; | ||
170 | uint16_t qid = le16_to_cpu(c->qid); | ||
171 | |||
172 | - if (!qid || nvme_check_sqid(n, qid)) { | ||
173 | + if (unlikely(!qid || nvme_check_sqid(n, qid))) { | ||
174 | + trace_nvme_err_invalid_del_sq(qid); | ||
175 | return NVME_INVALID_QID | NVME_DNR; | ||
176 | } | ||
177 | |||
178 | + trace_nvme_del_sq(qid); | ||
179 | + | ||
180 | sq = n->sq[qid]; | ||
181 | while (!QTAILQ_EMPTY(&sq->out_req_list)) { | ||
182 | req = QTAILQ_FIRST(&sq->out_req_list); | ||
183 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd) | ||
184 | uint16_t qflags = le16_to_cpu(c->sq_flags); | ||
185 | uint64_t prp1 = le64_to_cpu(c->prp1); | ||
186 | |||
187 | - if (!cqid || nvme_check_cqid(n, cqid)) { | ||
188 | + trace_nvme_create_sq(prp1, sqid, cqid, qsize, qflags); | ||
189 | + | ||
190 | + if (unlikely(!cqid || nvme_check_cqid(n, cqid))) { | ||
191 | + trace_nvme_err_invalid_create_sq_cqid(cqid); | ||
192 | return NVME_INVALID_CQID | NVME_DNR; | ||
193 | } | ||
194 | - if (!sqid || !nvme_check_sqid(n, sqid)) { | ||
195 | + if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) { | ||
196 | + trace_nvme_err_invalid_create_sq_sqid(sqid); | ||
197 | return NVME_INVALID_QID | NVME_DNR; | ||
198 | } | ||
199 | - if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) { | ||
200 | + if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) { | ||
201 | + trace_nvme_err_invalid_create_sq_size(qsize); | ||
202 | return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; | ||
203 | } | ||
204 | - if (!prp1 || prp1 & (n->page_size - 1)) { | ||
205 | + if (unlikely(!prp1 || prp1 & (n->page_size - 1))) { | ||
206 | + trace_nvme_err_invalid_create_sq_addr(prp1); | ||
207 | return NVME_INVALID_FIELD | NVME_DNR; | ||
208 | } | ||
209 | - if (!(NVME_SQ_FLAGS_PC(qflags))) { | ||
210 | + if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) { | ||
211 | + trace_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags)); | ||
212 | return NVME_INVALID_FIELD | NVME_DNR; | ||
213 | } | ||
214 | sq = g_malloc0(sizeof(*sq)); | ||
215 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd) | ||
216 | NvmeCQueue *cq; | ||
217 | uint16_t qid = le16_to_cpu(c->qid); | ||
218 | |||
219 | - if (!qid || nvme_check_cqid(n, qid)) { | ||
220 | + if (unlikely(!qid || nvme_check_cqid(n, qid))) { | ||
221 | + trace_nvme_err_invalid_del_cq_cqid(qid); | ||
222 | return NVME_INVALID_CQID | NVME_DNR; | ||
223 | } | ||
224 | |||
225 | cq = n->cq[qid]; | ||
226 | - if (!QTAILQ_EMPTY(&cq->sq_list)) { | ||
227 | + if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) { | ||
228 | + trace_nvme_err_invalid_del_cq_notempty(qid); | ||
229 | return NVME_INVALID_QUEUE_DEL; | ||
230 | } | ||
231 | + trace_nvme_del_cq(qid); | ||
232 | nvme_free_cq(cq, n); | ||
233 | return NVME_SUCCESS; | ||
234 | } | ||
235 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd) | ||
236 | uint16_t qflags = le16_to_cpu(c->cq_flags); | ||
237 | uint64_t prp1 = le64_to_cpu(c->prp1); | ||
238 | |||
239 | - if (!cqid || !nvme_check_cqid(n, cqid)) { | ||
240 | + trace_nvme_create_cq(prp1, cqid, vector, qsize, qflags, | ||
241 | + NVME_CQ_FLAGS_IEN(qflags) != 0); | ||
242 | + | ||
243 | + if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) { | ||
244 | + trace_nvme_err_invalid_create_cq_cqid(cqid); | ||
245 | return NVME_INVALID_CQID | NVME_DNR; | ||
246 | } | ||
247 | - if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) { | ||
248 | + if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) { | ||
249 | + trace_nvme_err_invalid_create_cq_size(qsize); | ||
250 | return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; | ||
251 | } | ||
252 | - if (!prp1) { | ||
253 | + if (unlikely(!prp1)) { | ||
254 | + trace_nvme_err_invalid_create_cq_addr(prp1); | ||
255 | return NVME_INVALID_FIELD | NVME_DNR; | ||
256 | } | ||
257 | - if (vector > n->num_queues) { | ||
258 | + if (unlikely(vector > n->num_queues)) { | ||
259 | + trace_nvme_err_invalid_create_cq_vector(vector); | ||
260 | return NVME_INVALID_IRQ_VECTOR | NVME_DNR; | ||
261 | } | ||
262 | - if (!(NVME_CQ_FLAGS_PC(qflags))) { | ||
263 | + if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) { | ||
264 | + trace_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags)); | ||
265 | return NVME_INVALID_FIELD | NVME_DNR; | ||
266 | } | ||
267 | |||
268 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c) | ||
269 | uint64_t prp1 = le64_to_cpu(c->prp1); | ||
270 | uint64_t prp2 = le64_to_cpu(c->prp2); | ||
271 | |||
272 | + trace_nvme_identify_ctrl(); | ||
273 | + | ||
274 | return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), | ||
275 | prp1, prp2); | ||
276 | } | ||
277 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c) | ||
278 | uint64_t prp1 = le64_to_cpu(c->prp1); | ||
279 | uint64_t prp2 = le64_to_cpu(c->prp2); | ||
280 | |||
281 | - if (nsid == 0 || nsid > n->num_namespaces) { | ||
282 | + trace_nvme_identify_ns(nsid); | ||
283 | + | ||
284 | + if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { | ||
285 | + trace_nvme_err_invalid_ns(nsid, n->num_namespaces); | ||
286 | return NVME_INVALID_NSID | NVME_DNR; | ||
287 | } | ||
288 | |||
289 | ns = &n->namespaces[nsid - 1]; | ||
290 | + | ||
291 | return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), | ||
292 | prp1, prp2); | ||
293 | } | ||
294 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c) | ||
295 | uint16_t ret; | ||
296 | int i, j = 0; | ||
297 | |||
298 | + trace_nvme_identify_nslist(min_nsid); | ||
299 | + | ||
300 | list = g_malloc0(data_len); | ||
301 | for (i = 0; i < n->num_namespaces; i++) { | ||
302 | if (i < min_nsid) { | ||
303 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd) | ||
304 | case 0x02: | ||
305 | return nvme_identify_nslist(n, c); | ||
306 | default: | ||
307 | + trace_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns)); | ||
308 | return NVME_INVALID_FIELD | NVME_DNR; | ||
309 | } | ||
310 | } | ||
311 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) | ||
312 | switch (dw10) { | ||
313 | case NVME_VOLATILE_WRITE_CACHE: | ||
314 | result = blk_enable_write_cache(n->conf.blk); | ||
315 | + trace_nvme_getfeat_vwcache(result ? "enabled" : "disabled"); | ||
316 | break; | ||
317 | case NVME_NUMBER_OF_QUEUES: | ||
318 | result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16)); | ||
319 | + trace_nvme_getfeat_numq(result); | ||
320 | break; | ||
321 | default: | ||
322 | + trace_nvme_err_invalid_getfeat(dw10); | ||
323 | return NVME_INVALID_FIELD | NVME_DNR; | ||
324 | } | ||
325 | |||
326 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) | ||
327 | blk_set_enable_write_cache(n->conf.blk, dw11 & 1); | ||
328 | break; | ||
329 | case NVME_NUMBER_OF_QUEUES: | ||
330 | + trace_nvme_setfeat_numq((dw11 & 0xFFFF) + 1, | ||
331 | + ((dw11 >> 16) & 0xFFFF) + 1, | ||
332 | + n->num_queues - 1, n->num_queues - 1); | ||
333 | req->cqe.result = | ||
334 | cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16)); | ||
335 | break; | ||
336 | default: | ||
337 | + trace_nvme_err_invalid_setfeat(dw10); | ||
338 | return NVME_INVALID_FIELD | NVME_DNR; | ||
339 | } | ||
340 | return NVME_SUCCESS; | ||
341 | @@ -XXX,XX +XXX,XX @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) | ||
342 | case NVME_ADM_CMD_GET_FEATURES: | ||
343 | return nvme_get_feature(n, cmd, req); | ||
344 | default: | ||
345 | + trace_nvme_err_invalid_admin_opc(cmd->opcode); | ||
346 | return NVME_INVALID_OPCODE | NVME_DNR; | ||
347 | } | ||
348 | } | ||
349 | @@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n) | ||
350 | uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12; | ||
351 | uint32_t page_size = 1 << page_bits; | ||
352 | |||
353 | - if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq || | ||
354 | - n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) || | ||
355 | - NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) || | ||
356 | - NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) || | ||
357 | - NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) || | ||
358 | - NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) || | ||
359 | - NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) || | ||
360 | - NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) || | ||
361 | - !NVME_AQA_ASQS(n->bar.aqa) || !NVME_AQA_ACQS(n->bar.aqa)) { | ||
362 | + if (unlikely(n->cq[0])) { | ||
363 | + trace_nvme_err_startfail_cq(); | ||
364 | + return -1; | ||
365 | + } | ||
366 | + if (unlikely(n->sq[0])) { | ||
367 | + trace_nvme_err_startfail_sq(); | ||
368 | + return -1; | ||
369 | + } | ||
370 | + if (unlikely(!n->bar.asq)) { | ||
371 | + trace_nvme_err_startfail_nbarasq(); | ||
372 | + return -1; | ||
373 | + } | ||
374 | + if (unlikely(!n->bar.acq)) { | ||
375 | + trace_nvme_err_startfail_nbaracq(); | ||
376 | + return -1; | ||
377 | + } | ||
378 | + if (unlikely(n->bar.asq & (page_size - 1))) { | ||
379 | + trace_nvme_err_startfail_asq_misaligned(n->bar.asq); | ||
380 | + return -1; | ||
381 | + } | ||
382 | + if (unlikely(n->bar.acq & (page_size - 1))) { | ||
383 | + trace_nvme_err_startfail_acq_misaligned(n->bar.acq); | ||
384 | + return -1; | ||
385 | + } | ||
386 | + if (unlikely(NVME_CC_MPS(n->bar.cc) < | ||
387 | + NVME_CAP_MPSMIN(n->bar.cap))) { | ||
388 | + trace_nvme_err_startfail_page_too_small( | ||
389 | + NVME_CC_MPS(n->bar.cc), | ||
390 | + NVME_CAP_MPSMIN(n->bar.cap)); | ||
391 | + return -1; | ||
392 | + } | ||
393 | + if (unlikely(NVME_CC_MPS(n->bar.cc) > | ||
394 | + NVME_CAP_MPSMAX(n->bar.cap))) { | ||
395 | + trace_nvme_err_startfail_page_too_large( | ||
396 | + NVME_CC_MPS(n->bar.cc), | ||
397 | + NVME_CAP_MPSMAX(n->bar.cap)); | ||
398 | + return -1; | ||
399 | + } | ||
400 | + if (unlikely(NVME_CC_IOCQES(n->bar.cc) < | ||
401 | + NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) { | ||
402 | + trace_nvme_err_startfail_cqent_too_small( | ||
403 | + NVME_CC_IOCQES(n->bar.cc), | ||
404 | + NVME_CTRL_CQES_MIN(n->bar.cap)); | ||
405 | + return -1; | ||
406 | + } | ||
407 | + if (unlikely(NVME_CC_IOCQES(n->bar.cc) > | ||
408 | + NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) { | ||
409 | + trace_nvme_err_startfail_cqent_too_large( | ||
410 | + NVME_CC_IOCQES(n->bar.cc), | ||
411 | + NVME_CTRL_CQES_MAX(n->bar.cap)); | ||
412 | + return -1; | ||
413 | + } | ||
414 | + if (unlikely(NVME_CC_IOSQES(n->bar.cc) < | ||
415 | + NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) { | ||
416 | + trace_nvme_err_startfail_sqent_too_small( | ||
417 | + NVME_CC_IOSQES(n->bar.cc), | ||
418 | + NVME_CTRL_SQES_MIN(n->bar.cap)); | ||
419 | + return -1; | ||
420 | + } | ||
421 | + if (unlikely(NVME_CC_IOSQES(n->bar.cc) > | ||
422 | + NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) { | ||
423 | + trace_nvme_err_startfail_sqent_too_large( | ||
424 | + NVME_CC_IOSQES(n->bar.cc), | ||
425 | + NVME_CTRL_SQES_MAX(n->bar.cap)); | ||
426 | + return -1; | ||
427 | + } | ||
428 | + if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) { | ||
429 | + trace_nvme_err_startfail_asqent_sz_zero(); | ||
430 | + return -1; | ||
431 | + } | ||
432 | + if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) { | ||
433 | + trace_nvme_err_startfail_acqent_sz_zero(); | ||
434 | return -1; | ||
435 | } | ||
436 | |||
437 | @@ -XXX,XX +XXX,XX @@ static int nvme_start_ctrl(NvmeCtrl *n) | ||
438 | static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, | ||
439 | unsigned size) | ||
440 | { | ||
441 | + if (unlikely(offset & (sizeof(uint32_t) - 1))) { | ||
442 | + NVME_GUEST_ERR(nvme_ub_mmiowr_misaligned32, | ||
443 | + "MMIO write not 32-bit aligned," | ||
444 | + " offset=0x%"PRIx64"", offset); | ||
445 | + /* should be ignored, fall through for now */ | ||
446 | + } | ||
447 | + | ||
448 | + if (unlikely(size < sizeof(uint32_t))) { | ||
449 | + NVME_GUEST_ERR(nvme_ub_mmiowr_toosmall, | ||
450 | + "MMIO write smaller than 32-bits," | ||
451 | + " offset=0x%"PRIx64", size=%u", | ||
452 | + offset, size); | ||
453 | + /* should be ignored, fall through for now */ | ||
454 | + } | ||
455 | + | ||
456 | switch (offset) { | ||
457 | - case 0xc: | ||
458 | + case 0xc: /* INTMS */ | ||
459 | + if (unlikely(msix_enabled(&(n->parent_obj)))) { | ||
460 | + NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix, | ||
461 | + "undefined access to interrupt mask set" | ||
462 | + " when MSI-X is enabled"); | ||
463 | + /* should be ignored, fall through for now */ | ||
464 | + } | ||
465 | n->bar.intms |= data & 0xffffffff; | ||
466 | n->bar.intmc = n->bar.intms; | ||
467 | + trace_nvme_mmio_intm_set(data & 0xffffffff, | ||
468 | + n->bar.intmc); | ||
469 | break; | ||
470 | - case 0x10: | ||
471 | + case 0x10: /* INTMC */ | ||
472 | + if (unlikely(msix_enabled(&(n->parent_obj)))) { | ||
473 | + NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix, | ||
474 | + "undefined access to interrupt mask clr" | ||
475 | + " when MSI-X is enabled"); | ||
476 | + /* should be ignored, fall through for now */ | ||
477 | + } | ||
478 | n->bar.intms &= ~(data & 0xffffffff); | ||
479 | n->bar.intmc = n->bar.intms; | ||
480 | + trace_nvme_mmio_intm_clr(data & 0xffffffff, | ||
481 | + n->bar.intmc); | ||
482 | break; | ||
483 | - case 0x14: | ||
484 | + case 0x14: /* CC */ | ||
485 | + trace_nvme_mmio_cfg(data & 0xffffffff); | ||
486 | /* Windows first sends data, then sends enable bit */ | ||
487 | if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) && | ||
488 | !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc)) | ||
489 | @@ -XXX,XX +XXX,XX @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, | ||
490 | |||
491 | if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) { | ||
492 | n->bar.cc = data; | ||
493 | - if (nvme_start_ctrl(n)) { | ||
494 | + if (unlikely(nvme_start_ctrl(n))) { | ||
495 | + trace_nvme_err_startfail(); | ||
496 | n->bar.csts = NVME_CSTS_FAILED; | ||
497 | } else { | ||
498 | + trace_nvme_mmio_start_success(); | ||
499 | n->bar.csts = NVME_CSTS_READY; | ||
500 | } | ||
501 | } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) { | ||
502 | + trace_nvme_mmio_stopped(); | ||
503 | nvme_clear_ctrl(n); | ||
504 | n->bar.csts &= ~NVME_CSTS_READY; | ||
505 | } | ||
506 | if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) { | ||
507 | - nvme_clear_ctrl(n); | ||
508 | - n->bar.cc = data; | ||
509 | - n->bar.csts |= NVME_CSTS_SHST_COMPLETE; | ||
510 | + trace_nvme_mmio_shutdown_set(); | ||
511 | + nvme_clear_ctrl(n); | ||
512 | + n->bar.cc = data; | ||
513 | + n->bar.csts |= NVME_CSTS_SHST_COMPLETE; | ||
514 | } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) { | ||
515 | - n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE; | ||
516 | - n->bar.cc = data; | ||
517 | + trace_nvme_mmio_shutdown_cleared(); | ||
518 | + n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE; | ||
519 | + n->bar.cc = data; | ||
520 | + } | ||
521 | + break; | ||
522 | + case 0x1C: /* CSTS */ | ||
523 | + if (data & (1 << 4)) { | ||
524 | + NVME_GUEST_ERR(nvme_ub_mmiowr_ssreset_w1c_unsupported, | ||
525 | + "attempted to W1C CSTS.NSSRO" | ||
526 | + " but CAP.NSSRS is zero (not supported)"); | ||
527 | + } else if (data != 0) { | ||
528 | + NVME_GUEST_ERR(nvme_ub_mmiowr_ro_csts, | ||
529 | + "attempted to set a read only bit" | ||
530 | + " of controller status"); | ||
531 | + } | ||
532 | + break; | ||
533 | + case 0x20: /* NSSR */ | ||
534 | + if (data == 0x4E564D65) { | ||
535 | + trace_nvme_ub_mmiowr_ssreset_unsupported(); | ||
536 | + } else { | ||
537 | + /* The spec says that writes of other values have no effect */ | ||
538 | + return; | ||
539 | } | ||
540 | break; | ||
541 | - case 0x24: | ||
542 | + case 0x24: /* AQA */ | ||
543 | n->bar.aqa = data & 0xffffffff; | ||
544 | + trace_nvme_mmio_aqattr(data & 0xffffffff); | ||
545 | break; | ||
546 | - case 0x28: | ||
547 | + case 0x28: /* ASQ */ | ||
548 | n->bar.asq = data; | ||
549 | + trace_nvme_mmio_asqaddr(data); | ||
550 | break; | ||
551 | - case 0x2c: | ||
552 | + case 0x2c: /* ASQ hi */ | ||
553 | n->bar.asq |= data << 32; | ||
554 | + trace_nvme_mmio_asqaddr_hi(data, n->bar.asq); | ||
555 | break; | ||
556 | - case 0x30: | ||
557 | + case 0x30: /* ACQ */ | ||
558 | + trace_nvme_mmio_acqaddr(data); | ||
559 | n->bar.acq = data; | ||
560 | break; | ||
561 | - case 0x34: | ||
562 | + case 0x34: /* ACQ hi */ | ||
563 | n->bar.acq |= data << 32; | ||
564 | + trace_nvme_mmio_acqaddr_hi(data, n->bar.acq); | ||
565 | break; | ||
566 | + case 0x38: /* CMBLOC */ | ||
567 | + NVME_GUEST_ERR(nvme_ub_mmiowr_cmbloc_reserved, | ||
568 | + "invalid write to reserved CMBLOC" | ||
569 | + " when CMBSZ is zero, ignored"); | ||
570 | + return; | ||
571 | + case 0x3C: /* CMBSZ */ | ||
572 | + NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly, | ||
573 | + "invalid write to read only CMBSZ, ignored"); | ||
574 | + return; | ||
575 | default: | ||
576 | + NVME_GUEST_ERR(nvme_ub_mmiowr_invalid, | ||
577 | + "invalid MMIO write," | ||
578 | + " offset=0x%"PRIx64", data=%"PRIx64"", | ||
579 | + offset, data); | ||
580 | break; | ||
581 | } | ||
582 | } | ||
583 | @@ -XXX,XX +XXX,XX @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) | ||
584 | uint8_t *ptr = (uint8_t *)&n->bar; | ||
585 | uint64_t val = 0; | ||
586 | |||
587 | + if (unlikely(addr & (sizeof(uint32_t) - 1))) { | ||
588 | + NVME_GUEST_ERR(nvme_ub_mmiord_misaligned32, | ||
589 | + "MMIO read not 32-bit aligned," | ||
590 | + " offset=0x%"PRIx64"", addr); | ||
591 | + /* should RAZ, fall through for now */ | ||
592 | + } else if (unlikely(size < sizeof(uint32_t))) { | ||
593 | + NVME_GUEST_ERR(nvme_ub_mmiord_toosmall, | ||
594 | + "MMIO read smaller than 32-bits," | ||
595 | + " offset=0x%"PRIx64"", addr); | ||
596 | + /* should RAZ, fall through for now */ | ||
597 | + } | ||
598 | + | ||
599 | if (addr < sizeof(n->bar)) { | ||
600 | memcpy(&val, ptr + addr, size); | ||
601 | + } else { | ||
602 | + NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs, | ||
603 | + "MMIO read beyond last register," | ||
604 | + " offset=0x%"PRIx64", returning 0", addr); | ||
605 | } | ||
606 | + | ||
607 | return val; | ||
608 | } | ||
609 | |||
610 | @@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) | ||
611 | { | ||
612 | uint32_t qid; | ||
613 | |||
614 | - if (addr & ((1 << 2) - 1)) { | ||
615 | + if (unlikely(addr & ((1 << 2) - 1))) { | ||
616 | + NVME_GUEST_ERR(nvme_ub_db_wr_misaligned, | ||
617 | + "doorbell write not 32-bit aligned," | ||
618 | + " offset=0x%"PRIx64", ignoring", addr); | ||
619 | return; | ||
620 | } | ||
621 | |||
622 | if (((addr - 0x1000) >> 2) & 1) { | ||
623 | + /* Completion queue doorbell write */ | ||
624 | + | ||
625 | uint16_t new_head = val & 0xffff; | ||
626 | int start_sqs; | ||
627 | NvmeCQueue *cq; | ||
628 | |||
629 | qid = (addr - (0x1000 + (1 << 2))) >> 3; | ||
630 | - if (nvme_check_cqid(n, qid)) { | ||
631 | + if (unlikely(nvme_check_cqid(n, qid))) { | ||
632 | + NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cq, | ||
633 | + "completion queue doorbell write" | ||
634 | + " for nonexistent queue," | ||
635 | + " cqid=%"PRIu32", ignoring", qid); | ||
636 | return; | ||
637 | } | ||
638 | |||
639 | cq = n->cq[qid]; | ||
640 | - if (new_head >= cq->size) { | ||
641 | + if (unlikely(new_head >= cq->size)) { | ||
642 | + NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cqhead, | ||
643 | + "completion queue doorbell write value" | ||
644 | + " beyond queue size, cqid=%"PRIu32"," | ||
645 | + " new_head=%"PRIu16", ignoring", | ||
646 | + qid, new_head); | ||
647 | return; | ||
648 | } | ||
649 | |||
650 | @@ -XXX,XX +XXX,XX @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) | ||
651 | nvme_isr_notify(n, cq); | ||
652 | } | ||
653 | } else { | ||
654 | + /* Submission queue doorbell write */ | ||
655 | + | ||
656 | uint16_t new_tail = val & 0xffff; | ||
657 | NvmeSQueue *sq; | ||
658 | |||
659 | qid = (addr - 0x1000) >> 3; | ||
660 | - if (nvme_check_sqid(n, qid)) { | ||
661 | + if (unlikely(nvme_check_sqid(n, qid))) { | ||
662 | + NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sq, | ||
663 | + "submission queue doorbell write" | ||
664 | + " for nonexistent queue," | ||
665 | + " sqid=%"PRIu32", ignoring", qid); | ||
666 | return; | ||
667 | } | ||
668 | |||
669 | sq = n->sq[qid]; | ||
670 | - if (new_tail >= sq->size) { | ||
671 | + if (unlikely(new_tail >= sq->size)) { | ||
672 | + NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sqtail, | ||
673 | + "submission queue doorbell write value" | ||
674 | + " beyond queue size, sqid=%"PRIu32"," | ||
675 | + " new_tail=%"PRIu16", ignoring", | ||
676 | + qid, new_tail); | ||
677 | return; | ||
678 | } | ||
679 | |||
680 | diff --git a/hw/block/trace-events b/hw/block/trace-events | ||
681 | index XXXXXXX..XXXXXXX 100644 | ||
682 | --- a/hw/block/trace-events | ||
683 | +++ b/hw/block/trace-events | ||
684 | @@ -XXX,XX +XXX,XX @@ virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint6 | ||
685 | hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d" | ||
686 | hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int trans) "blk %p CHS %u %u %u trans %d" | ||
687 | |||
688 | +# hw/block/nvme.c | ||
689 | +# nvme traces for successful events | ||
690 | +nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u" | ||
691 | +nvme_irq_pin(void) "pulsing IRQ pin" | ||
692 | +nvme_irq_masked(void) "IRQ is masked" | ||
693 | +nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64"" | ||
694 | +nvme_rw(char const *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64"" | ||
695 | +nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" | ||
696 | +nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" | ||
697 | +nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" | ||
698 | +nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16"" | ||
699 | +nvme_identify_ctrl(void) "identify controller" | ||
700 | +nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16"" | ||
701 | +nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16"" | ||
702 | +nvme_getfeat_vwcache(char const* result) "get feature volatile write cache, result=%s" | ||
703 | +nvme_getfeat_numq(int result) "get feature number of queues, result=%d" | ||
704 | +nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d" | ||
705 | +nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64"" | ||
706 | +nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64"" | ||
707 | +nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64"" | ||
708 | +nvme_mmio_aqattr(uint64_t data) "wrote MMIO, admin queue attributes=0x%"PRIx64"" | ||
709 | +nvme_mmio_asqaddr(uint64_t data) "wrote MMIO, admin submission queue address=0x%"PRIx64"" | ||
710 | +nvme_mmio_acqaddr(uint64_t data) "wrote MMIO, admin completion queue address=0x%"PRIx64"" | ||
711 | +nvme_mmio_asqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin submission queue high half=0x%"PRIx64", new_address=0x%"PRIx64"" | ||
712 | +nvme_mmio_acqaddr_hi(uint64_t data, uint64_t new_addr) "wrote MMIO, admin completion queue high half=0x%"PRIx64", new_address=0x%"PRIx64"" | ||
713 | +nvme_mmio_start_success(void) "setting controller enable bit succeeded" | ||
714 | +nvme_mmio_stopped(void) "cleared controller enable bit" | ||
715 | +nvme_mmio_shutdown_set(void) "shutdown bit set" | ||
716 | +nvme_mmio_shutdown_cleared(void) "shutdown bit cleared" | ||
717 | + | ||
718 | +# nvme traces for error conditions | ||
719 | +nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" | ||
720 | +nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64"" | ||
721 | +nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64"" | ||
722 | +nvme_err_invalid_prp2_missing(void) "PRP2 is null and more data to be transferred" | ||
723 | +nvme_err_invalid_field(void) "invalid field" | ||
724 | +nvme_err_invalid_prp(void) "invalid PRP" | ||
725 | +nvme_err_invalid_sgl(void) "invalid SGL" | ||
726 | +nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not within 1-%u" | ||
727 | +nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" | ||
728 | +nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" | ||
729 | +nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" | ||
730 | +nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sqid=%"PRIu16"" | ||
731 | +nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16"" | ||
732 | +nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16"" | ||
733 | +nvme_err_invalid_create_sq_size(uint16_t qsize) "failed creating submission queue, invalid qsize=%"PRIu16"" | ||
734 | +nvme_err_invalid_create_sq_addr(uint64_t addr) "failed creating submission queue, addr=0x%"PRIx64"" | ||
735 | +nvme_err_invalid_create_sq_qflags(uint16_t qflags) "failed creating submission queue, qflags=%"PRIu16"" | ||
736 | +nvme_err_invalid_del_cq_cqid(uint16_t cqid) "failed deleting completion queue, cqid=%"PRIu16"" | ||
737 | +nvme_err_invalid_del_cq_notempty(uint16_t cqid) "failed deleting completion queue, it is not empty, cqid=%"PRIu16"" | ||
738 | +nvme_err_invalid_create_cq_cqid(uint16_t cqid) "failed creating completion queue, cqid=%"PRIu16"" | ||
739 | +nvme_err_invalid_create_cq_size(uint16_t size) "failed creating completion queue, size=%"PRIu16"" | ||
740 | +nvme_err_invalid_create_cq_addr(uint64_t addr) "failed creating completion queue, addr=0x%"PRIx64"" | ||
741 | +nvme_err_invalid_create_cq_vector(uint16_t vector) "failed creating completion queue, vector=%"PRIu16"" | ||
742 | +nvme_err_invalid_create_cq_qflags(uint16_t qflags) "failed creating completion queue, qflags=%"PRIu16"" | ||
743 | +nvme_err_invalid_identify_cns(uint16_t cns) "identify, invalid cns=0x%"PRIx16"" | ||
744 | +nvme_err_invalid_getfeat(int dw10) "invalid get features, dw10=0x%"PRIx32"" | ||
745 | +nvme_err_invalid_setfeat(uint32_t dw10) "invalid set features, dw10=0x%"PRIx32"" | ||
746 | +nvme_err_startfail_cq(void) "nvme_start_ctrl failed because there are non-admin completion queues" | ||
747 | +nvme_err_startfail_sq(void) "nvme_start_ctrl failed because there are non-admin submission queues" | ||
748 | +nvme_err_startfail_nbarasq(void) "nvme_start_ctrl failed because the admin submission queue address is null" | ||
749 | +nvme_err_startfail_nbaracq(void) "nvme_start_ctrl failed because the admin completion queue address is null" | ||
750 | +nvme_err_startfail_asq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin submission queue address is misaligned: 0x%"PRIx64"" | ||
751 | +nvme_err_startfail_acq_misaligned(uint64_t addr) "nvme_start_ctrl failed because the admin completion queue address is misaligned: 0x%"PRIx64"" | ||
752 | +nvme_err_startfail_page_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too small: log2size=%u, min=%u" | ||
753 | +nvme_err_startfail_page_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the page size is too large: log2size=%u, max=%u" | ||
754 | +nvme_err_startfail_cqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too small: log2size=%u, min=%u" | ||
755 | +nvme_err_startfail_cqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too large: log2size=%u, max=%u" | ||
756 | +nvme_err_startfail_sqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too small: log2size=%u, min=%u" | ||
757 | +nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too large: log2size=%u, max=%u" | ||
758 | +nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero" | ||
759 | +nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero" | ||
760 | +nvme_err_startfail(void) "setting controller enable bit failed" | ||
761 | + | ||
762 | +# Traces for undefined behavior | ||
763 | +nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64"" | ||
764 | +nvme_ub_mmiowr_toosmall(uint64_t offset, unsigned size) "MMIO write smaller than 32 bits, offset=0x%"PRIx64", size=%u" | ||
765 | +nvme_ub_mmiowr_intmask_with_msix(void) "undefined access to interrupt mask set when MSI-X is enabled" | ||
766 | +nvme_ub_mmiowr_ro_csts(void) "attempted to set a read only bit of controller status" | ||
767 | +nvme_ub_mmiowr_ssreset_w1c_unsupported(void) "attempted to W1C CSTS.NSSRO but CAP.NSSRS is zero (not supported)" | ||
768 | +nvme_ub_mmiowr_ssreset_unsupported(void) "attempted NVM subsystem reset but CAP.NSSRS is zero (not supported)" | ||
769 | +nvme_ub_mmiowr_cmbloc_reserved(void) "invalid write to reserved CMBLOC when CMBSZ is zero, ignored" | ||
770 | +nvme_ub_mmiowr_cmbsz_readonly(void) "invalid write to read only CMBSZ, ignored" | ||
771 | +nvme_ub_mmiowr_invalid(uint64_t offset, uint64_t data) "invalid MMIO write, offset=0x%"PRIx64", data=0x%"PRIx64"" | ||
772 | +nvme_ub_mmiord_misaligned32(uint64_t offset) "MMIO read not 32-bit aligned, offset=0x%"PRIx64"" | ||
773 | +nvme_ub_mmiord_toosmall(uint64_t offset) "MMIO read smaller than 32-bits, offset=0x%"PRIx64"" | ||
774 | +nvme_ub_mmiord_invalid_ofs(uint64_t offset) "MMIO read beyond last register, offset=0x%"PRIx64", returning 0" | ||
775 | +nvme_ub_db_wr_misaligned(uint64_t offset) "doorbell write not 32-bit aligned, offset=0x%"PRIx64", ignoring" | ||
776 | +nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for nonexistent queue, cqid=%"PRIu32", ignoring" | ||
777 | +nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring" | ||
778 | +nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring" | ||
779 | +nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_tail=%"PRIu16", ignoring" | ||
780 | + | ||
781 | # hw/block/xen_disk.c | ||
782 | xen_disk_alloc(char *name) "%s" | ||
783 | xen_disk_init(char *name) "%s" | ||
45 | -- | 784 | -- |
46 | 2.35.3 | 785 | 2.13.6 |
786 | |||
787 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | From: Fam Zheng <famz@redhat.com> | ||
1 | 2 | ||
3 | Management tools create overlays of running guests with qemu-img: | ||
4 | |||
5 | $ qemu-img create -b /image/in/use.qcow2 -f qcow2 /overlay/image.qcow2 | ||
6 | |||
7 | but this doesn't work anymore due to image locking: | ||
8 | |||
9 | qemu-img: /overlay/image.qcow2: Failed to get shared "write" lock | ||
10 | Is another process using the image? | ||
11 | Could not open backing image to determine size. | ||
12 | Use the force share option to allow this use case again. | ||
13 | |||
14 | Cc: qemu-stable@nongnu.org | ||
15 | Signed-off-by: Fam Zheng <famz@redhat.com> | ||
16 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
17 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
18 | --- | ||
19 | block.c | 3 ++- | ||
20 | 1 file changed, 2 insertions(+), 1 deletion(-) | ||
21 | |||
22 | diff --git a/block.c b/block.c | ||
23 | index XXXXXXX..XXXXXXX 100644 | ||
24 | --- a/block.c | ||
25 | +++ b/block.c | ||
26 | @@ -XXX,XX +XXX,XX @@ void bdrv_img_create(const char *filename, const char *fmt, | ||
27 | back_flags = flags; | ||
28 | back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); | ||
29 | |||
30 | + backing_options = qdict_new(); | ||
31 | if (backing_fmt) { | ||
32 | - backing_options = qdict_new(); | ||
33 | qdict_put_str(backing_options, "driver", backing_fmt); | ||
34 | } | ||
35 | + qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true); | ||
36 | |||
37 | bs = bdrv_open(full_backing, NULL, backing_options, back_flags, | ||
38 | &local_err); | ||
39 | -- | ||
40 | 2.13.6 | ||
41 | |||
42 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@openvz.org> | 1 | From: Thomas Huth <thuth@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | At the end we ignore failure of bdrv_merge_dirty_bitmap() and report | 3 | It's not working anymore since QEMU v1.3.0 - time to remove it now. |
4 | success. And still set errp. That's wrong. | ||
5 | 4 | ||
6 | Signed-off-by: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru> | 5 | Signed-off-by: Thomas Huth <thuth@redhat.com> |
7 | Reviewed-by: Nikita Lapshin <nikita.lapshin@virtuozzo.com> | 6 | Reviewed-by: John Snow <jsnow@redhat.com> |
8 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 7 | Reviewed-by: Markus Armbruster <armbru@redhat.com> |
9 | Message-Id: <20220517111206.23585-2-v.sementsov-og@mail.ru> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 8 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 9 | --- |
12 | block/monitor/bitmap-qmp-cmds.c | 5 ++++- | 10 | blockdev.c | 11 ----------- |
13 | 1 file changed, 4 insertions(+), 1 deletion(-) | 11 | qemu-doc.texi | 6 ------ |
12 | 2 files changed, 17 deletions(-) | ||
14 | 13 | ||
15 | diff --git a/block/monitor/bitmap-qmp-cmds.c b/block/monitor/bitmap-qmp-cmds.c | 14 | diff --git a/blockdev.c b/blockdev.c |
16 | index XXXXXXX..XXXXXXX 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/monitor/bitmap-qmp-cmds.c | 16 | --- a/blockdev.c |
18 | +++ b/block/monitor/bitmap-qmp-cmds.c | 17 | +++ b/blockdev.c |
19 | @@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target, | 18 | @@ -XXX,XX +XXX,XX @@ QemuOptsList qemu_legacy_drive_opts = { |
19 | .type = QEMU_OPT_STRING, | ||
20 | .help = "chs translation (auto, lba, none)", | ||
21 | },{ | ||
22 | - .name = "boot", | ||
23 | - .type = QEMU_OPT_BOOL, | ||
24 | - .help = "(deprecated, ignored)", | ||
25 | - },{ | ||
26 | .name = "addr", | ||
27 | .type = QEMU_OPT_STRING, | ||
28 | .help = "pci address (virtio only)", | ||
29 | @@ -XXX,XX +XXX,XX @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type) | ||
30 | goto fail; | ||
20 | } | 31 | } |
21 | 32 | ||
22 | /* Merge into dst; dst is unchanged on failure. */ | 33 | - /* Deprecated option boot=[on|off] */ |
23 | - bdrv_merge_dirty_bitmap(dst, anon, backup, errp); | 34 | - if (qemu_opt_get(legacy_opts, "boot") != NULL) { |
24 | + if (!bdrv_merge_dirty_bitmap(dst, anon, backup, errp)) { | 35 | - fprintf(stderr, "qemu-kvm: boot=on|off is deprecated and will be " |
25 | + dst = NULL; | 36 | - "ignored. Future versions will reject this parameter. Please " |
26 | + goto out; | 37 | - "update your scripts.\n"); |
27 | + } | 38 | - } |
28 | 39 | - | |
29 | out: | 40 | /* Other deprecated options */ |
30 | bdrv_release_dirty_bitmap(anon); | 41 | if (!qtest_enabled()) { |
42 | for (i = 0; i < ARRAY_SIZE(deprecated); i++) { | ||
43 | diff --git a/qemu-doc.texi b/qemu-doc.texi | ||
44 | index XXXXXXX..XXXXXXX 100644 | ||
45 | --- a/qemu-doc.texi | ||
46 | +++ b/qemu-doc.texi | ||
47 | @@ -XXX,XX +XXX,XX @@ deprecated. | ||
48 | |||
49 | @section System emulator command line arguments | ||
50 | |||
51 | -@subsection -drive boot=on|off (since 1.3.0) | ||
52 | - | ||
53 | -The ``boot=on|off'' option to the ``-drive'' argument is | ||
54 | -ignored. Applications should use the ``bootindex=N'' parameter | ||
55 | -to set an absolute ordering between devices instead. | ||
56 | - | ||
57 | @subsection -tdf (since 1.3.0) | ||
58 | |||
59 | The ``-tdf'' argument is ignored. The behaviour implemented | ||
31 | -- | 60 | -- |
32 | 2.35.3 | 61 | 2.13.6 |
62 | |||
63 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | 1 | From: Thomas Huth <thuth@redhat.com> | |
2 | |||
3 | It's been marked as deprecated since QEMU v2.10.0, and so far nobody | ||
4 | complained that we should keep it, so let's remove this legacy option | ||
5 | now to simplify the code quite a bit. | ||
6 | |||
7 | Signed-off-by: Thomas Huth <thuth@redhat.com> | ||
8 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
9 | Reviewed-by: Markus Armbruster <armbru@redhat.com> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
11 | --- | ||
12 | vl.c | 86 ++------------------------------------------------------- | ||
13 | qemu-doc.texi | 8 ------ | ||
14 | qemu-options.hx | 19 ++----------- | ||
15 | 3 files changed, 4 insertions(+), 109 deletions(-) | ||
16 | |||
17 | diff --git a/vl.c b/vl.c | ||
18 | index XXXXXXX..XXXXXXX 100644 | ||
19 | --- a/vl.c | ||
20 | +++ b/vl.c | ||
21 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp) | ||
22 | const char *boot_order = NULL; | ||
23 | const char *boot_once = NULL; | ||
24 | DisplayState *ds; | ||
25 | - int cyls, heads, secs, translation; | ||
26 | QemuOpts *opts, *machine_opts; | ||
27 | - QemuOpts *hda_opts = NULL, *icount_opts = NULL, *accel_opts = NULL; | ||
28 | + QemuOpts *icount_opts = NULL, *accel_opts = NULL; | ||
29 | QemuOptsList *olist; | ||
30 | int optind; | ||
31 | const char *optarg; | ||
32 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp) | ||
33 | |||
34 | cpu_model = NULL; | ||
35 | snapshot = 0; | ||
36 | - cyls = heads = secs = 0; | ||
37 | - translation = BIOS_ATA_TRANSLATION_AUTO; | ||
38 | |||
39 | nb_nics = 0; | ||
40 | |||
41 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp) | ||
42 | if (optind >= argc) | ||
43 | break; | ||
44 | if (argv[optind][0] != '-') { | ||
45 | - hda_opts = drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS); | ||
46 | + drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS); | ||
47 | } else { | ||
48 | const QEMUOption *popt; | ||
49 | |||
50 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp) | ||
51 | cpu_model = optarg; | ||
52 | break; | ||
53 | case QEMU_OPTION_hda: | ||
54 | - { | ||
55 | - char buf[256]; | ||
56 | - if (cyls == 0) | ||
57 | - snprintf(buf, sizeof(buf), "%s", HD_OPTS); | ||
58 | - else | ||
59 | - snprintf(buf, sizeof(buf), | ||
60 | - "%s,cyls=%d,heads=%d,secs=%d%s", | ||
61 | - HD_OPTS , cyls, heads, secs, | ||
62 | - translation == BIOS_ATA_TRANSLATION_LBA ? | ||
63 | - ",trans=lba" : | ||
64 | - translation == BIOS_ATA_TRANSLATION_NONE ? | ||
65 | - ",trans=none" : ""); | ||
66 | - drive_add(IF_DEFAULT, 0, optarg, buf); | ||
67 | - break; | ||
68 | - } | ||
69 | case QEMU_OPTION_hdb: | ||
70 | case QEMU_OPTION_hdc: | ||
71 | case QEMU_OPTION_hdd: | ||
72 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv, char **envp) | ||
73 | case QEMU_OPTION_snapshot: | ||
74 | snapshot = 1; | ||
75 | break; | ||
76 | - case QEMU_OPTION_hdachs: | ||
77 | - { | ||
78 | - const char *p; | ||
79 | - p = optarg; | ||
80 | - cyls = strtol(p, (char **)&p, 0); | ||
81 | - if (cyls < 1 || cyls > 16383) | ||
82 | - goto chs_fail; | ||
83 | - if (*p != ',') | ||
84 | - goto chs_fail; | ||
85 | - p++; | ||
86 | - heads = strtol(p, (char **)&p, 0); | ||
87 | - if (heads < 1 || heads > 16) | ||
88 | - goto chs_fail; | ||
89 | - if (*p != ',') | ||
90 | - goto chs_fail; | ||
91 | - p++; | ||
92 | - secs = strtol(p, (char **)&p, 0); | ||
93 | - if (secs < 1 || secs > 63) | ||
94 | - goto chs_fail; | ||
95 | - if (*p == ',') { | ||
96 | - p++; | ||
97 | - if (!strcmp(p, "large")) { | ||
98 | - translation = BIOS_ATA_TRANSLATION_LARGE; | ||
99 | - } else if (!strcmp(p, "rechs")) { | ||
100 | - translation = BIOS_ATA_TRANSLATION_RECHS; | ||
101 | - } else if (!strcmp(p, "none")) { | ||
102 | - translation = BIOS_ATA_TRANSLATION_NONE; | ||
103 | - } else if (!strcmp(p, "lba")) { | ||
104 | - translation = BIOS_ATA_TRANSLATION_LBA; | ||
105 | - } else if (!strcmp(p, "auto")) { | ||
106 | - translation = BIOS_ATA_TRANSLATION_AUTO; | ||
107 | - } else { | ||
108 | - goto chs_fail; | ||
109 | - } | ||
110 | - } else if (*p != '\0') { | ||
111 | - chs_fail: | ||
112 | - error_report("invalid physical CHS format"); | ||
113 | - exit(1); | ||
114 | - } | ||
115 | - if (hda_opts != NULL) { | ||
116 | - qemu_opt_set_number(hda_opts, "cyls", cyls, | ||
117 | - &error_abort); | ||
118 | - qemu_opt_set_number(hda_opts, "heads", heads, | ||
119 | - &error_abort); | ||
120 | - qemu_opt_set_number(hda_opts, "secs", secs, | ||
121 | - &error_abort); | ||
122 | - if (translation == BIOS_ATA_TRANSLATION_LARGE) { | ||
123 | - qemu_opt_set(hda_opts, "trans", "large", | ||
124 | - &error_abort); | ||
125 | - } else if (translation == BIOS_ATA_TRANSLATION_RECHS) { | ||
126 | - qemu_opt_set(hda_opts, "trans", "rechs", | ||
127 | - &error_abort); | ||
128 | - } else if (translation == BIOS_ATA_TRANSLATION_LBA) { | ||
129 | - qemu_opt_set(hda_opts, "trans", "lba", | ||
130 | - &error_abort); | ||
131 | - } else if (translation == BIOS_ATA_TRANSLATION_NONE) { | ||
132 | - qemu_opt_set(hda_opts, "trans", "none", | ||
133 | - &error_abort); | ||
134 | - } | ||
135 | - } | ||
136 | - } | ||
137 | - error_report("'-hdachs' is deprecated, please use '-device" | ||
138 | - " ide-hd,cyls=c,heads=h,secs=s,...' instead"); | ||
139 | - break; | ||
140 | case QEMU_OPTION_numa: | ||
141 | opts = qemu_opts_parse_noisily(qemu_find_opts("numa"), | ||
142 | optarg, true); | ||
143 | diff --git a/qemu-doc.texi b/qemu-doc.texi | ||
144 | index XXXXXXX..XXXXXXX 100644 | ||
145 | --- a/qemu-doc.texi | ||
146 | +++ b/qemu-doc.texi | ||
147 | @@ -XXX,XX +XXX,XX @@ The ``--net dump'' argument is now replaced with the | ||
148 | ``-object filter-dump'' argument which works in combination | ||
149 | with the modern ``-netdev`` backends instead. | ||
150 | |||
151 | -@subsection -hdachs (since 2.10.0) | ||
152 | - | ||
153 | -The ``-hdachs'' argument is now a synonym for setting | ||
154 | -the ``cyls'', ``heads'', ``secs'', and ``trans'' properties | ||
155 | -on the ``ide-hd'' device using the ``-device'' argument. | ||
156 | -The new syntax allows different settings to be provided | ||
157 | -per disk. | ||
158 | - | ||
159 | @subsection -usbdevice (since 2.10.0) | ||
160 | |||
161 | The ``-usbdevice DEV'' argument is now a synonym for setting | ||
162 | diff --git a/qemu-options.hx b/qemu-options.hx | ||
163 | index XXXXXXX..XXXXXXX 100644 | ||
164 | --- a/qemu-options.hx | ||
165 | +++ b/qemu-options.hx | ||
166 | @@ -XXX,XX +XXX,XX @@ of available connectors of a given interface type. | ||
167 | @item media=@var{media} | ||
168 | This option defines the type of the media: disk or cdrom. | ||
169 | @item cyls=@var{c},heads=@var{h},secs=@var{s}[,trans=@var{t}] | ||
170 | -These options have the same definition as they have in @option{-hdachs}. | ||
171 | -These parameters are deprecated, use the corresponding parameters | ||
172 | +Force disk physical geometry and the optional BIOS translation (trans=none or | ||
173 | +lba). These parameters are deprecated, use the corresponding parameters | ||
174 | of @code{-device} instead. | ||
175 | @item snapshot=@var{snapshot} | ||
176 | @var{snapshot} is "on" or "off" and controls snapshot mode for the given drive | ||
177 | @@ -XXX,XX +XXX,XX @@ the raw disk image you use is not written back. You can however force | ||
178 | the write back by pressing @key{C-a s} (@pxref{disk_images}). | ||
179 | ETEXI | ||
180 | |||
181 | -DEF("hdachs", HAS_ARG, QEMU_OPTION_hdachs, \ | ||
182 | - "-hdachs c,h,s[,t]\n" \ | ||
183 | - " force hard disk 0 physical geometry and the optional BIOS\n" \ | ||
184 | - " translation (t=none or lba) (usually QEMU can guess them)\n", | ||
185 | - QEMU_ARCH_ALL) | ||
186 | -STEXI | ||
187 | -@item -hdachs @var{c},@var{h},@var{s},[,@var{t}] | ||
188 | -@findex -hdachs | ||
189 | -Force hard disk 0 physical geometry (1 <= @var{c} <= 16383, 1 <= | ||
190 | -@var{h} <= 16, 1 <= @var{s} <= 63) and optionally force the BIOS | ||
191 | -translation mode (@var{t}=none, lba or auto). Usually QEMU can guess | ||
192 | -all those parameters. This option is deprecated, please use | ||
193 | -@code{-device ide-hd,cyls=c,heads=h,secs=s,...} instead. | ||
194 | -ETEXI | ||
195 | - | ||
196 | DEF("fsdev", HAS_ARG, QEMU_OPTION_fsdev, | ||
197 | "-fsdev fsdriver,id=id[,path=path,][security_model={mapped-xattr|mapped-file|passthrough|none}]\n" | ||
198 | " [,writeout=immediate][,readonly][,socket=socket|sock_fd=sock_fd][,fmode=fmode][,dmode=dmode]\n" | ||
199 | -- | ||
200 | 2.13.6 | ||
201 | |||
202 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | From: Thomas Huth <thuth@redhat.com> | ||
1 | 2 | ||
3 | Looks like we forgot to announce the deprecation of these options in | ||
4 | the corresponding chapter of the qemu-doc text, so let's do that now. | ||
5 | |||
6 | Signed-off-by: Thomas Huth <thuth@redhat.com> | ||
7 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
8 | Reviewed-by: Markus Armbruster <armbru@redhat.com> | ||
9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
10 | --- | ||
11 | qemu-doc.texi | 15 +++++++++++++++ | ||
12 | 1 file changed, 15 insertions(+) | ||
13 | |||
14 | diff --git a/qemu-doc.texi b/qemu-doc.texi | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/qemu-doc.texi | ||
17 | +++ b/qemu-doc.texi | ||
18 | @@ -XXX,XX +XXX,XX @@ longer be directly supported in QEMU. | ||
19 | The ``-drive if=scsi'' argument is replaced by the | ||
20 | ``-device BUS-TYPE'' argument combined with ``-drive if=none''. | ||
21 | |||
22 | +@subsection -drive cyls=...,heads=...,secs=...,trans=... (since 2.10.0) | ||
23 | + | ||
24 | +The drive geometry arguments are replaced by the geometry arguments | ||
25 | +that can be specified with the ``-device'' parameter. | ||
26 | + | ||
27 | +@subsection -drive serial=... (since 2.10.0) | ||
28 | + | ||
29 | +The drive serial argument is replaced by the serial argument | ||
30 | +that can be specified with the ``-device'' parameter. | ||
31 | + | ||
32 | +@subsection -drive addr=... (since 2.10.0) | ||
33 | + | ||
34 | +The drive addr argument is replaced by the addr argument | ||
35 | +that can be specified with the ``-device'' parameter. | ||
36 | + | ||
37 | @subsection -net dump (since 2.10.0) | ||
38 | |||
39 | The ``--net dump'' argument is now replaced with the | ||
40 | -- | ||
41 | 2.13.6 | ||
42 | |||
43 | diff view generated by jsdifflib |
1 | From: Stefan Hajnoczi <stefanha@redhat.com> | 1 | From: Fam Zheng <famz@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | bdrv_co_drain() has not been used since commit 9a0cec664eef ("mirror: | 3 | Signed-off-by: Fam Zheng <famz@redhat.com> |
4 | use bdrv_drained_begin/bdrv_drained_end") in 2016. Remove it so there | ||
5 | are fewer drain scenarios to worry about. | ||
6 | |||
7 | Use bdrv_drained_begin()/bdrv_drained_end() instead. They are "mixed" | ||
8 | functions that can be called from coroutine context. Unlike | ||
9 | bdrv_co_drain(), these functions provide control of the length of the | ||
10 | drained section, which is usually the right thing. | ||
11 | |||
12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
13 | Message-Id: <20220521122714.3837731-1-stefanha@redhat.com> | ||
14 | Reviewed-by: Emanuele Giuseppe Esposito <eesposit@redhat.com> | ||
15 | Reviewed-by: Alberto Faria <afaria@redhat.com> | ||
16 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 4 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
17 | --- | 5 | --- |
18 | include/block/block-io.h | 1 - | 6 | include/block/block_int.h | 1 - |
19 | block/io.c | 15 --------------- | 7 | block/io.c | 18 ------------------ |
20 | 2 files changed, 16 deletions(-) | 8 | 2 files changed, 19 deletions(-) |
21 | 9 | ||
22 | diff --git a/include/block/block-io.h b/include/block/block-io.h | 10 | diff --git a/include/block/block_int.h b/include/block/block_int.h |
23 | index XXXXXXX..XXXXXXX 100644 | 11 | index XXXXXXX..XXXXXXX 100644 |
24 | --- a/include/block/block-io.h | 12 | --- a/include/block/block_int.h |
25 | +++ b/include/block/block-io.h | 13 | +++ b/include/block/block_int.h |
26 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter); | 14 | @@ -XXX,XX +XXX,XX @@ bool blk_dev_is_tray_open(BlockBackend *blk); |
27 | cond); }) | 15 | bool blk_dev_is_medium_locked(BlockBackend *blk); |
28 | 16 | ||
29 | void bdrv_drain(BlockDriverState *bs); | 17 | void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes); |
30 | -void coroutine_fn bdrv_co_drain(BlockDriverState *bs); | 18 | -bool bdrv_requests_pending(BlockDriverState *bs); |
31 | 19 | ||
32 | int generated_co_wrapper | 20 | void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out); |
33 | bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, | 21 | void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in); |
34 | diff --git a/block/io.c b/block/io.c | 22 | diff --git a/block/io.c b/block/io.c |
35 | index XXXXXXX..XXXXXXX 100644 | 23 | index XXXXXXX..XXXXXXX 100644 |
36 | --- a/block/io.c | 24 | --- a/block/io.c |
37 | +++ b/block/io.c | 25 | +++ b/block/io.c |
38 | @@ -XXX,XX +XXX,XX @@ void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent) | 26 | @@ -XXX,XX +XXX,XX @@ void bdrv_disable_copy_on_read(BlockDriverState *bs) |
39 | BDRV_POLL_WHILE(child->bs, qatomic_read(&drained_end_counter) > 0); | 27 | assert(old >= 1); |
40 | } | 28 | } |
41 | 29 | ||
42 | -/* | 30 | -/* Check if any requests are in-flight (including throttled requests) */ |
43 | - * Wait for pending requests to complete on a single BlockDriverState subtree, | 31 | -bool bdrv_requests_pending(BlockDriverState *bs) |
44 | - * and suspend block driver's internal I/O until next request arrives. | ||
45 | - * | ||
46 | - * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState | ||
47 | - * AioContext. | ||
48 | - */ | ||
49 | -void coroutine_fn bdrv_co_drain(BlockDriverState *bs) | ||
50 | -{ | 32 | -{ |
51 | - IO_OR_GS_CODE(); | 33 | - BdrvChild *child; |
52 | - assert(qemu_in_coroutine()); | 34 | - |
53 | - bdrv_drained_begin(bs); | 35 | - if (atomic_read(&bs->in_flight)) { |
54 | - bdrv_drained_end(bs); | 36 | - return true; |
37 | - } | ||
38 | - | ||
39 | - QLIST_FOREACH(child, &bs->children, next) { | ||
40 | - if (bdrv_requests_pending(child->bs)) { | ||
41 | - return true; | ||
42 | - } | ||
43 | - } | ||
44 | - | ||
45 | - return false; | ||
55 | -} | 46 | -} |
56 | - | 47 | - |
57 | void bdrv_drain(BlockDriverState *bs) | 48 | typedef struct { |
58 | { | 49 | Coroutine *co; |
59 | IO_OR_GS_CODE(); | 50 | BlockDriverState *bs; |
60 | -- | 51 | -- |
61 | 2.35.3 | 52 | 2.13.6 |
53 | |||
54 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
2 | Reviewed-by: Fam Zheng <famz@redhat.com> | ||
3 | --- | ||
4 | block/io.c | 6 ++++++ | ||
5 | 1 file changed, 6 insertions(+) | ||
1 | 6 | ||
7 | diff --git a/block/io.c b/block/io.c | ||
8 | index XXXXXXX..XXXXXXX 100644 | ||
9 | --- a/block/io.c | ||
10 | +++ b/block/io.c | ||
11 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void) | ||
12 | BdrvNextIterator it; | ||
13 | GSList *aio_ctxs = NULL, *ctx; | ||
14 | |||
15 | + /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread | ||
16 | + * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on | ||
17 | + * nodes in several different AioContexts, so make sure we're in the main | ||
18 | + * context. */ | ||
19 | + assert(qemu_get_current_aio_context() == qemu_get_aio_context()); | ||
20 | + | ||
21 | block_job_pause_all(); | ||
22 | |||
23 | for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { | ||
24 | -- | ||
25 | 2.13.6 | ||
26 | |||
27 | diff view generated by jsdifflib |
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@openvz.org> | 1 | bdrv_drained_begin() doesn't increase bs->quiesce_counter recursively |
---|---|---|---|
2 | and also doesn't notify other parent nodes of children; both of these mean | ||
3 | that the child nodes are not actually drained, and bdrv_drained_begin() | ||
4 | is providing useful functionality only on a single node. | ||
2 | 5 | ||
3 | We have too much logic to simply check that bitmaps are of the same | 6 | To keep things consistent, we also shouldn't call the block driver |
4 | size. Let's just define that hbitmap_merge() and | 7 | callbacks recursively. |
5 | bdrv_dirty_bitmap_merge_internal() require their argument bitmaps to be of | ||
6 | the same size; this simplifies things. | ||
7 | 8 | ||
8 | Let's look through the callers: | 9 | A proper recursive drain version that provides an actually working |
10 | drained section for child nodes will be introduced later. | ||
9 | 11 | ||
10 | For backup_init_bcs_bitmap() we already assert that merge can't fail. | 12 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
13 | Reviewed-by: Fam Zheng <famz@redhat.com> | ||
14 | --- | ||
15 | block/io.c | 16 +++++++++------- | ||
16 | 1 file changed, 9 insertions(+), 7 deletions(-) | ||
11 | 17 | ||
12 | In bdrv_reclaim_dirty_bitmap_locked() we gracefully handle the error | 18 | diff --git a/block/io.c b/block/io.c |
13 | that can't happen: successor always has same size as its parent, drop | ||
14 | this logic. | ||
15 | |||
16 | In bdrv_merge_dirty_bitmap() we already have an assertion and a separate | ||
17 | check. Make the check explicit and improve the error message. | ||
18 | |||
19 | Signed-off-by: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru> | ||
20 | Reviewed-by: Nikita Lapshin <nikita.lapshin@virtuozzo.com> | ||
21 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | ||
22 | Message-Id: <20220517111206.23585-4-v.sementsov-og@mail.ru> | ||
23 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
24 | --- | ||
25 | include/block/block_int-io.h | 2 +- | ||
26 | include/qemu/hbitmap.h | 15 ++------------- | ||
27 | block/backup.c | 6 ++---- | ||
28 | block/dirty-bitmap.c | 26 +++++++++++--------------- | ||
29 | util/hbitmap.c | 25 +++++++------------------ | ||
30 | 5 files changed, 23 insertions(+), 51 deletions(-) | ||
31 | |||
32 | diff --git a/include/block/block_int-io.h b/include/block/block_int-io.h | ||
33 | index XXXXXXX..XXXXXXX 100644 | 19 | index XXXXXXX..XXXXXXX 100644 |
34 | --- a/include/block/block_int-io.h | 20 | --- a/block/io.c |
35 | +++ b/include/block/block_int-io.h | 21 | +++ b/block/io.c |
36 | @@ -XXX,XX +XXX,XX @@ bool blk_dev_is_tray_open(BlockBackend *blk); | 22 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque) |
37 | void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes); | 23 | } |
38 | 24 | ||
39 | void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out); | 25 | /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */ |
40 | -bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest, | 26 | -static void bdrv_drain_invoke(BlockDriverState *bs, bool begin) |
41 | +void bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest, | 27 | +static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive) |
42 | const BdrvDirtyBitmap *src, | ||
43 | HBitmap **backup, bool lock); | ||
44 | |||
45 | diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h | ||
46 | index XXXXXXX..XXXXXXX 100644 | ||
47 | --- a/include/qemu/hbitmap.h | ||
48 | +++ b/include/qemu/hbitmap.h | ||
49 | @@ -XXX,XX +XXX,XX @@ void hbitmap_truncate(HBitmap *hb, uint64_t size); | ||
50 | * | ||
51 | * Store result of merging @a and @b into @result. | ||
52 | * @result is allowed to be equal to @a or @b. | ||
53 | - * | ||
54 | - * Return true if the merge was successful, | ||
55 | - * false if it was not attempted. | ||
56 | - */ | ||
57 | -bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result); | ||
58 | - | ||
59 | -/** | ||
60 | - * hbitmap_can_merge: | ||
61 | - * | ||
62 | - * hbitmap_can_merge(a, b) && hbitmap_can_merge(a, result) is sufficient and | ||
63 | - * necessary for hbitmap_merge will not fail. | ||
64 | - * | ||
65 | + * All bitmaps must have same size. | ||
66 | */ | ||
67 | -bool hbitmap_can_merge(const HBitmap *a, const HBitmap *b); | ||
68 | +void hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result); | ||
69 | |||
70 | /** | ||
71 | * hbitmap_empty: | ||
72 | diff --git a/block/backup.c b/block/backup.c | ||
73 | index XXXXXXX..XXXXXXX 100644 | ||
74 | --- a/block/backup.c | ||
75 | +++ b/block/backup.c | ||
76 | @@ -XXX,XX +XXX,XX @@ out: | ||
77 | |||
78 | static void backup_init_bcs_bitmap(BackupBlockJob *job) | ||
79 | { | 28 | { |
80 | - bool ret; | 29 | BdrvChild *child, *tmp; |
81 | uint64_t estimate; | 30 | BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin}; |
82 | BdrvDirtyBitmap *bcs_bitmap = block_copy_dirty_bitmap(job->bcs); | 31 | @@ -XXX,XX +XXX,XX @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin) |
83 | 32 | bdrv_coroutine_enter(bs, data.co); | |
84 | if (job->sync_mode == MIRROR_SYNC_MODE_BITMAP) { | 33 | BDRV_POLL_WHILE(bs, !data.done); |
85 | bdrv_clear_dirty_bitmap(bcs_bitmap, NULL); | 34 | |
86 | - ret = bdrv_dirty_bitmap_merge_internal(bcs_bitmap, job->sync_bitmap, | 35 | - QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) { |
87 | - NULL, true); | 36 | - bdrv_drain_invoke(child->bs, begin); |
88 | - assert(ret); | 37 | + if (recursive) { |
89 | + bdrv_dirty_bitmap_merge_internal(bcs_bitmap, job->sync_bitmap, NULL, | 38 | + QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) { |
90 | + true); | 39 | + bdrv_drain_invoke(child->bs, begin, true); |
91 | } else if (job->sync_mode == MIRROR_SYNC_MODE_TOP) { | 40 | + } |
92 | /* | ||
93 | * We can't hog the coroutine to initialize this thoroughly. | ||
94 | diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c | ||
95 | index XXXXXXX..XXXXXXX 100644 | ||
96 | --- a/block/dirty-bitmap.c | ||
97 | +++ b/block/dirty-bitmap.c | ||
98 | @@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap_locked(BdrvDirtyBitmap *parent, | ||
99 | return NULL; | ||
100 | } | ||
101 | |||
102 | - if (!hbitmap_merge(parent->bitmap, successor->bitmap, parent->bitmap)) { | ||
103 | - error_setg(errp, "Merging of parent and successor bitmap failed"); | ||
104 | - return NULL; | ||
105 | - } | ||
106 | + hbitmap_merge(parent->bitmap, successor->bitmap, parent->bitmap); | ||
107 | |||
108 | parent->disabled = successor->disabled; | ||
109 | parent->busy = false; | ||
110 | @@ -XXX,XX +XXX,XX @@ bool bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src, | ||
111 | goto out; | ||
112 | } | ||
113 | |||
114 | - if (!hbitmap_can_merge(dest->bitmap, src->bitmap)) { | ||
115 | - error_setg(errp, "Bitmaps are incompatible and can't be merged"); | ||
116 | + if (bdrv_dirty_bitmap_size(src) != bdrv_dirty_bitmap_size(dest)) { | ||
117 | + error_setg(errp, "Bitmaps are of different sizes (destination size is %" | ||
118 | + PRId64 ", source size is %" PRId64 ") and can't be merged", | ||
119 | + bdrv_dirty_bitmap_size(dest), bdrv_dirty_bitmap_size(src)); | ||
120 | goto out; | ||
121 | } | ||
122 | |||
123 | - ret = bdrv_dirty_bitmap_merge_internal(dest, src, backup, false); | ||
124 | - assert(ret); | ||
125 | + bdrv_dirty_bitmap_merge_internal(dest, src, backup, false); | ||
126 | + ret = true; | ||
127 | |||
128 | out: | ||
129 | bdrv_dirty_bitmaps_unlock(dest->bs); | ||
130 | @@ -XXX,XX +XXX,XX @@ out: | ||
131 | /** | ||
132 | * bdrv_dirty_bitmap_merge_internal: merge src into dest. | ||
133 | * Does NOT check bitmap permissions; not suitable for use as public API. | ||
134 | + * @dest, @src and @backup (if not NULL) must have same size. | ||
135 | * | ||
136 | * @backup: If provided, make a copy of dest here prior to merge. | ||
137 | * @lock: If true, lock and unlock bitmaps on the way in/out. | ||
138 | - * returns true if the merge succeeded; false if unattempted. | ||
139 | */ | ||
140 | -bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest, | ||
141 | +void bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest, | ||
142 | const BdrvDirtyBitmap *src, | ||
143 | HBitmap **backup, | ||
144 | bool lock) | ||
145 | { | ||
146 | - bool ret; | ||
147 | IO_CODE(); | ||
148 | |||
149 | assert(!bdrv_dirty_bitmap_readonly(dest)); | ||
150 | @@ -XXX,XX +XXX,XX @@ bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest, | ||
151 | if (backup) { | ||
152 | *backup = dest->bitmap; | ||
153 | dest->bitmap = hbitmap_alloc(dest->size, hbitmap_granularity(*backup)); | ||
154 | - ret = hbitmap_merge(*backup, src->bitmap, dest->bitmap); | ||
155 | + hbitmap_merge(*backup, src->bitmap, dest->bitmap); | ||
156 | } else { | ||
157 | - ret = hbitmap_merge(dest->bitmap, src->bitmap, dest->bitmap); | ||
158 | + hbitmap_merge(dest->bitmap, src->bitmap, dest->bitmap); | ||
159 | } | ||
160 | |||
161 | if (lock) { | ||
162 | @@ -XXX,XX +XXX,XX @@ bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest, | ||
163 | bdrv_dirty_bitmaps_unlock(src->bs); | ||
164 | } | ||
165 | } | ||
166 | - | ||
167 | - return ret; | ||
168 | } | ||
169 | diff --git a/util/hbitmap.c b/util/hbitmap.c | ||
170 | index XXXXXXX..XXXXXXX 100644 | ||
171 | --- a/util/hbitmap.c | ||
172 | +++ b/util/hbitmap.c | ||
173 | @@ -XXX,XX +XXX,XX @@ void hbitmap_truncate(HBitmap *hb, uint64_t size) | ||
174 | } | 41 | } |
175 | } | 42 | } |
176 | 43 | ||
177 | -bool hbitmap_can_merge(const HBitmap *a, const HBitmap *b) | 44 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs) |
178 | -{ | 45 | bdrv_parent_drained_begin(bs); |
179 | - return (a->orig_size == b->orig_size); | ||
180 | -} | ||
181 | - | ||
182 | /** | ||
183 | * hbitmap_sparse_merge: performs dst = dst | src | ||
184 | * works with differing granularities. | ||
185 | @@ -XXX,XX +XXX,XX @@ static void hbitmap_sparse_merge(HBitmap *dst, const HBitmap *src) | ||
186 | * Given HBitmaps A and B, let R := A (BITOR) B. | ||
187 | * Bitmaps A and B will not be modified, | ||
188 | * except when bitmap R is an alias of A or B. | ||
189 | - * | ||
190 | - * @return true if the merge was successful, | ||
191 | - * false if it was not attempted. | ||
192 | + * Bitmaps must have same size. | ||
193 | */ | ||
194 | -bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result) | ||
195 | +void hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result) | ||
196 | { | ||
197 | int i; | ||
198 | uint64_t j; | ||
199 | |||
200 | - if (!hbitmap_can_merge(a, b) || !hbitmap_can_merge(a, result)) { | ||
201 | - return false; | ||
202 | - } | ||
203 | - assert(hbitmap_can_merge(b, result)); | ||
204 | + assert(a->orig_size == result->orig_size); | ||
205 | + assert(b->orig_size == result->orig_size); | ||
206 | |||
207 | if ((!hbitmap_count(a) && result == b) || | ||
208 | (!hbitmap_count(b) && result == a)) { | ||
209 | - return true; | ||
210 | + return; | ||
211 | } | 46 | } |
212 | 47 | ||
213 | if (!hbitmap_count(a) && !hbitmap_count(b)) { | 48 | - bdrv_drain_invoke(bs, true); |
214 | hbitmap_reset_all(result); | 49 | + bdrv_drain_invoke(bs, true, false); |
215 | - return true; | 50 | bdrv_drain_recurse(bs); |
216 | + return; | 51 | } |
52 | |||
53 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs) | ||
217 | } | 54 | } |
218 | 55 | ||
219 | if (a->granularity != b->granularity) { | 56 | /* Re-enable things in child-to-parent order */ |
220 | @@ -XXX,XX +XXX,XX @@ bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result) | 57 | - bdrv_drain_invoke(bs, false); |
221 | if (b != result) { | 58 | + bdrv_drain_invoke(bs, false, false); |
222 | hbitmap_sparse_merge(result, b); | 59 | bdrv_parent_drained_end(bs); |
223 | } | 60 | aio_enable_external(bdrv_get_aio_context(bs)); |
224 | - return true; | ||
225 | + return; | ||
226 | } | ||
227 | |||
228 | /* This merge is O(size), as BITS_PER_LONG and HBITMAP_LEVELS are constant. | ||
229 | @@ -XXX,XX +XXX,XX @@ bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result) | ||
230 | |||
231 | /* Recompute the dirty count */ | ||
232 | result->count = hb_count_between(result, 0, result->size - 1); | ||
233 | - | ||
234 | - return true; | ||
235 | } | 61 | } |
236 | 62 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void) | |
237 | char *hbitmap_sha256(const HBitmap *bitmap, Error **errp) | 63 | aio_context_acquire(aio_context); |
64 | aio_disable_external(aio_context); | ||
65 | bdrv_parent_drained_begin(bs); | ||
66 | - bdrv_drain_invoke(bs, true); | ||
67 | + bdrv_drain_invoke(bs, true, true); | ||
68 | aio_context_release(aio_context); | ||
69 | |||
70 | if (!g_slist_find(aio_ctxs, aio_context)) { | ||
71 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void) | ||
72 | |||
73 | /* Re-enable things in child-to-parent order */ | ||
74 | aio_context_acquire(aio_context); | ||
75 | - bdrv_drain_invoke(bs, false); | ||
76 | + bdrv_drain_invoke(bs, false, true); | ||
77 | bdrv_parent_drained_end(bs); | ||
78 | aio_enable_external(aio_context); | ||
79 | aio_context_release(aio_context); | ||
238 | -- | 80 | -- |
239 | 2.35.3 | 81 | 2.13.6 |
82 | |||
83 | diff view generated by jsdifflib |
1 | From: Xie Yongji <xieyongji@bytedance.com> | 1 | The existing test is for bdrv_drain_all_begin/end() only. Generalise the |
---|---|---|---|
2 | test case so that it can be run for the other variants as well. At the | ||
3 | moment this is only bdrv_drained_begin/end(), but in a while, we'll add | ||
4 | another one. | ||
2 | 5 | ||
3 | Abstract the common logic of virtio-blk I/O process to a function | 6 | Also, add a backing file to the test node to test whether the operations |
4 | named virtio_blk_process_req(). It's needed for the following commit. | 7 | work recursively. |
5 | 8 | ||
6 | Signed-off-by: Xie Yongji <xieyongji@bytedance.com> | ||
7 | Message-Id: <20220523084611.91-4-xieyongji@bytedance.com> | ||
8 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
10 | --- | 10 | --- |
11 | block/export/virtio-blk-handler.h | 37 ++++ | 11 | tests/test-bdrv-drain.c | 69 ++++++++++++++++++++++++++++++++++++++++++++----- |
12 | block/export/vhost-user-blk-server.c | 259 +++------------------------ | 12 | 1 file changed, 62 insertions(+), 7 deletions(-) |
13 | block/export/virtio-blk-handler.c | 240 +++++++++++++++++++++++++ | ||
14 | MAINTAINERS | 2 + | ||
15 | block/export/meson.build | 2 +- | ||
16 | 5 files changed, 301 insertions(+), 239 deletions(-) | ||
17 | create mode 100644 block/export/virtio-blk-handler.h | ||
18 | create mode 100644 block/export/virtio-blk-handler.c | ||
19 | 13 | ||
20 | diff --git a/block/export/virtio-blk-handler.h b/block/export/virtio-blk-handler.h | 14 | diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c |
21 | new file mode 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
22 | index XXXXXXX..XXXXXXX | 16 | --- a/tests/test-bdrv-drain.c |
23 | --- /dev/null | 17 | +++ b/tests/test-bdrv-drain.c |
24 | +++ b/block/export/virtio-blk-handler.h | 18 | @@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_test = { |
25 | @@ -XXX,XX +XXX,XX @@ | 19 | |
26 | +/* | 20 | .bdrv_co_drain_begin = bdrv_test_co_drain_begin, |
27 | + * Handler for virtio-blk I/O | 21 | .bdrv_co_drain_end = bdrv_test_co_drain_end, |
28 | + * | ||
29 | + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved. | ||
30 | + * | ||
31 | + * Author: | ||
32 | + * Xie Yongji <xieyongji@bytedance.com> | ||
33 | + * | ||
34 | + * This work is licensed under the terms of the GNU GPL, version 2 or | ||
35 | + * later. See the COPYING file in the top-level directory. | ||
36 | + */ | ||
37 | + | 22 | + |
38 | +#ifndef VIRTIO_BLK_HANDLER_H | 23 | + .bdrv_child_perm = bdrv_format_default_perms, |
39 | +#define VIRTIO_BLK_HANDLER_H | ||
40 | + | ||
41 | +#include "sysemu/block-backend.h" | ||
42 | + | ||
43 | +#define VIRTIO_BLK_SECTOR_BITS 9 | ||
44 | +#define VIRTIO_BLK_SECTOR_SIZE (1ULL << VIRTIO_BLK_SECTOR_BITS) | ||
45 | + | ||
46 | +#define VIRTIO_BLK_MAX_DISCARD_SECTORS 32768 | ||
47 | +#define VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS 32768 | ||
48 | + | ||
49 | +typedef struct { | ||
50 | + BlockBackend *blk; | ||
51 | + const char *serial; | ||
52 | + uint32_t logical_block_size; | ||
53 | + bool writable; | ||
54 | +} VirtioBlkHandler; | ||
55 | + | ||
56 | +int coroutine_fn virtio_blk_process_req(VirtioBlkHandler *handler, | ||
57 | + struct iovec *in_iov, | ||
58 | + struct iovec *out_iov, | ||
59 | + unsigned int in_num, | ||
60 | + unsigned int out_num); | ||
61 | + | ||
62 | +#endif /* VIRTIO_BLK_HANDLER_H */ | ||
63 | diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c | ||
64 | index XXXXXXX..XXXXXXX 100644 | ||
65 | --- a/block/export/vhost-user-blk-server.c | ||
66 | +++ b/block/export/vhost-user-blk-server.c | ||
67 | @@ -XXX,XX +XXX,XX @@ | ||
68 | #include "vhost-user-blk-server.h" | ||
69 | #include "qapi/error.h" | ||
70 | #include "qom/object_interfaces.h" | ||
71 | -#include "sysemu/block-backend.h" | ||
72 | #include "util/block-helpers.h" | ||
73 | - | ||
74 | -/* | ||
75 | - * Sector units are 512 bytes regardless of the | ||
76 | - * virtio_blk_config->blk_size value. | ||
77 | - */ | ||
78 | -#define VIRTIO_BLK_SECTOR_BITS 9 | ||
79 | -#define VIRTIO_BLK_SECTOR_SIZE (1ull << VIRTIO_BLK_SECTOR_BITS) | ||
80 | +#include "virtio-blk-handler.h" | ||
81 | |||
82 | enum { | ||
83 | VHOST_USER_BLK_NUM_QUEUES_DEFAULT = 1, | ||
84 | - VHOST_USER_BLK_MAX_DISCARD_SECTORS = 32768, | ||
85 | - VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS = 32768, | ||
86 | -}; | ||
87 | -struct virtio_blk_inhdr { | ||
88 | - unsigned char status; | ||
89 | }; | 24 | }; |
90 | 25 | ||
91 | typedef struct VuBlkReq { | 26 | static void aio_ret_cb(void *opaque, int ret) |
92 | VuVirtqElement elem; | 27 | @@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret) |
93 | - int64_t sector_num; | 28 | *aio_ret = ret; |
94 | - size_t size; | ||
95 | - struct virtio_blk_inhdr *in; | ||
96 | - struct virtio_blk_outhdr out; | ||
97 | VuServer *server; | ||
98 | struct VuVirtq *vq; | ||
99 | } VuBlkReq; | ||
100 | @@ -XXX,XX +XXX,XX @@ typedef struct VuBlkReq { | ||
101 | typedef struct { | ||
102 | BlockExport export; | ||
103 | VuServer vu_server; | ||
104 | - uint32_t blk_size; | ||
105 | + VirtioBlkHandler handler; | ||
106 | QIOChannelSocket *sioc; | ||
107 | struct virtio_blk_config blkcfg; | ||
108 | - bool writable; | ||
109 | } VuBlkExport; | ||
110 | |||
111 | -static void vu_blk_req_complete(VuBlkReq *req) | ||
112 | +static void vu_blk_req_complete(VuBlkReq *req, size_t in_len) | ||
113 | { | ||
114 | VuDev *vu_dev = &req->server->vu_dev; | ||
115 | |||
116 | - vu_queue_push(vu_dev, req->vq, &req->elem, req->size); | ||
117 | + vu_queue_push(vu_dev, req->vq, &req->elem, in_len); | ||
118 | vu_queue_notify(vu_dev, req->vq); | ||
119 | |||
120 | free(req); | ||
121 | } | 29 | } |
122 | 30 | ||
123 | -static bool vu_blk_sect_range_ok(VuBlkExport *vexp, uint64_t sector, | 31 | -static void test_drv_cb_drain_all(void) |
124 | - size_t size) | 32 | +enum drain_type { |
125 | -{ | 33 | + BDRV_DRAIN_ALL, |
126 | - uint64_t nb_sectors; | 34 | + BDRV_DRAIN, |
127 | - uint64_t total_sectors; | ||
128 | - | ||
129 | - if (size % VIRTIO_BLK_SECTOR_SIZE) { | ||
130 | - return false; | ||
131 | - } | ||
132 | - | ||
133 | - nb_sectors = size >> VIRTIO_BLK_SECTOR_BITS; | ||
134 | - | ||
135 | - QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != VIRTIO_BLK_SECTOR_SIZE); | ||
136 | - if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) { | ||
137 | - return false; | ||
138 | - } | ||
139 | - if ((sector << VIRTIO_BLK_SECTOR_BITS) % vexp->blk_size) { | ||
140 | - return false; | ||
141 | - } | ||
142 | - blk_get_geometry(vexp->export.blk, &total_sectors); | ||
143 | - if (sector > total_sectors || nb_sectors > total_sectors - sector) { | ||
144 | - return false; | ||
145 | - } | ||
146 | - return true; | ||
147 | -} | ||
148 | - | ||
149 | -static int coroutine_fn | ||
150 | -vu_blk_discard_write_zeroes(VuBlkExport *vexp, struct iovec *iov, | ||
151 | - uint32_t iovcnt, uint32_t type) | ||
152 | -{ | ||
153 | - BlockBackend *blk = vexp->export.blk; | ||
154 | - struct virtio_blk_discard_write_zeroes desc; | ||
155 | - ssize_t size; | ||
156 | - uint64_t sector; | ||
157 | - uint32_t num_sectors; | ||
158 | - uint32_t max_sectors; | ||
159 | - uint32_t flags; | ||
160 | - int bytes; | ||
161 | - | ||
162 | - /* Only one desc is currently supported */ | ||
163 | - if (unlikely(iov_size(iov, iovcnt) > sizeof(desc))) { | ||
164 | - return VIRTIO_BLK_S_UNSUPP; | ||
165 | - } | ||
166 | - | ||
167 | - size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc)); | ||
168 | - if (unlikely(size != sizeof(desc))) { | ||
169 | - error_report("Invalid size %zd, expected %zu", size, sizeof(desc)); | ||
170 | - return VIRTIO_BLK_S_IOERR; | ||
171 | - } | ||
172 | - | ||
173 | - sector = le64_to_cpu(desc.sector); | ||
174 | - num_sectors = le32_to_cpu(desc.num_sectors); | ||
175 | - flags = le32_to_cpu(desc.flags); | ||
176 | - max_sectors = (type == VIRTIO_BLK_T_WRITE_ZEROES) ? | ||
177 | - VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS : | ||
178 | - VHOST_USER_BLK_MAX_DISCARD_SECTORS; | ||
179 | - | ||
180 | - /* This check ensures that 'bytes' fits in an int */ | ||
181 | - if (unlikely(num_sectors > max_sectors)) { | ||
182 | - return VIRTIO_BLK_S_IOERR; | ||
183 | - } | ||
184 | - | ||
185 | - bytes = num_sectors << VIRTIO_BLK_SECTOR_BITS; | ||
186 | - | ||
187 | - if (unlikely(!vu_blk_sect_range_ok(vexp, sector, bytes))) { | ||
188 | - return VIRTIO_BLK_S_IOERR; | ||
189 | - } | ||
190 | - | ||
191 | - /* | ||
192 | - * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard | ||
193 | - * and write zeroes commands if any unknown flag is set. | ||
194 | - */ | ||
195 | - if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) { | ||
196 | - return VIRTIO_BLK_S_UNSUPP; | ||
197 | - } | ||
198 | - | ||
199 | - if (type == VIRTIO_BLK_T_WRITE_ZEROES) { | ||
200 | - int blk_flags = 0; | ||
201 | - | ||
202 | - if (flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { | ||
203 | - blk_flags |= BDRV_REQ_MAY_UNMAP; | ||
204 | - } | ||
205 | - | ||
206 | - if (blk_co_pwrite_zeroes(blk, sector << VIRTIO_BLK_SECTOR_BITS, | ||
207 | - bytes, blk_flags) == 0) { | ||
208 | - return VIRTIO_BLK_S_OK; | ||
209 | - } | ||
210 | - } else if (type == VIRTIO_BLK_T_DISCARD) { | ||
211 | - /* | ||
212 | - * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for | ||
213 | - * discard commands if the unmap flag is set. | ||
214 | - */ | ||
215 | - if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) { | ||
216 | - return VIRTIO_BLK_S_UNSUPP; | ||
217 | - } | ||
218 | - | ||
219 | - if (blk_co_pdiscard(blk, sector << VIRTIO_BLK_SECTOR_BITS, | ||
220 | - bytes) == 0) { | ||
221 | - return VIRTIO_BLK_S_OK; | ||
222 | - } | ||
223 | - } | ||
224 | - | ||
225 | - return VIRTIO_BLK_S_IOERR; | ||
226 | -} | ||
227 | - | ||
228 | /* Called with server refcount increased, must decrease before returning */ | ||
229 | static void coroutine_fn vu_blk_virtio_process_req(void *opaque) | ||
230 | { | ||
231 | VuBlkReq *req = opaque; | ||
232 | VuServer *server = req->server; | ||
233 | VuVirtqElement *elem = &req->elem; | ||
234 | - uint32_t type; | ||
235 | - | ||
236 | VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server); | ||
237 | - BlockBackend *blk = vexp->export.blk; | ||
238 | - | ||
239 | + VirtioBlkHandler *handler = &vexp->handler; | ||
240 | struct iovec *in_iov = elem->in_sg; | ||
241 | struct iovec *out_iov = elem->out_sg; | ||
242 | unsigned in_num = elem->in_num; | ||
243 | unsigned out_num = elem->out_num; | ||
244 | - | ||
245 | - /* refer to hw/block/virtio_blk.c */ | ||
246 | - if (elem->out_num < 1 || elem->in_num < 1) { | ||
247 | - error_report("virtio-blk request missing headers"); | ||
248 | - goto err; | ||
249 | - } | ||
250 | - | ||
251 | - if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out, | ||
252 | - sizeof(req->out)) != sizeof(req->out))) { | ||
253 | - error_report("virtio-blk request outhdr too short"); | ||
254 | - goto err; | ||
255 | - } | ||
256 | - | ||
257 | - iov_discard_front(&out_iov, &out_num, sizeof(req->out)); | ||
258 | - | ||
259 | - if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) { | ||
260 | - error_report("virtio-blk request inhdr too short"); | ||
261 | - goto err; | ||
262 | - } | ||
263 | - | ||
264 | - req->size = iov_size(in_iov, in_num); | ||
265 | - /* We always touch the last byte, so just see how big in_iov is. */ | ||
266 | - req->in = (void *)in_iov[in_num - 1].iov_base | ||
267 | - + in_iov[in_num - 1].iov_len | ||
268 | - - sizeof(struct virtio_blk_inhdr); | ||
269 | - iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr)); | ||
270 | - | ||
271 | - type = le32_to_cpu(req->out.type); | ||
272 | - switch (type & ~VIRTIO_BLK_T_BARRIER) { | ||
273 | - case VIRTIO_BLK_T_IN: | ||
274 | - case VIRTIO_BLK_T_OUT: { | ||
275 | - QEMUIOVector qiov; | ||
276 | - int64_t offset; | ||
277 | - ssize_t ret = 0; | ||
278 | - bool is_write = type & VIRTIO_BLK_T_OUT; | ||
279 | - req->sector_num = le64_to_cpu(req->out.sector); | ||
280 | - | ||
281 | - if (is_write && !vexp->writable) { | ||
282 | - req->in->status = VIRTIO_BLK_S_IOERR; | ||
283 | - break; | ||
284 | - } | ||
285 | - | ||
286 | - if (is_write) { | ||
287 | - qemu_iovec_init_external(&qiov, out_iov, out_num); | ||
288 | - } else { | ||
289 | - qemu_iovec_init_external(&qiov, in_iov, in_num); | ||
290 | - } | ||
291 | - | ||
292 | - if (unlikely(!vu_blk_sect_range_ok(vexp, | ||
293 | - req->sector_num, | ||
294 | - qiov.size))) { | ||
295 | - req->in->status = VIRTIO_BLK_S_IOERR; | ||
296 | - break; | ||
297 | - } | ||
298 | - | ||
299 | - offset = req->sector_num << VIRTIO_BLK_SECTOR_BITS; | ||
300 | - | ||
301 | - if (is_write) { | ||
302 | - ret = blk_co_pwritev(blk, offset, qiov.size, &qiov, 0); | ||
303 | - } else { | ||
304 | - ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0); | ||
305 | - } | ||
306 | - if (ret >= 0) { | ||
307 | - req->in->status = VIRTIO_BLK_S_OK; | ||
308 | - } else { | ||
309 | - req->in->status = VIRTIO_BLK_S_IOERR; | ||
310 | - } | ||
311 | - break; | ||
312 | - } | ||
313 | - case VIRTIO_BLK_T_FLUSH: | ||
314 | - if (blk_co_flush(blk) == 0) { | ||
315 | - req->in->status = VIRTIO_BLK_S_OK; | ||
316 | - } else { | ||
317 | - req->in->status = VIRTIO_BLK_S_IOERR; | ||
318 | - } | ||
319 | - break; | ||
320 | - case VIRTIO_BLK_T_GET_ID: { | ||
321 | - size_t size = MIN(iov_size(&elem->in_sg[0], in_num), | ||
322 | - VIRTIO_BLK_ID_BYTES); | ||
323 | - snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk"); | ||
324 | - req->in->status = VIRTIO_BLK_S_OK; | ||
325 | - break; | ||
326 | + int in_len; | ||
327 | + | ||
328 | + in_len = virtio_blk_process_req(handler, in_iov, out_iov, | ||
329 | + in_num, out_num); | ||
330 | + if (in_len < 0) { | ||
331 | + free(req); | ||
332 | + vhost_user_server_unref(server); | ||
333 | + return; | ||
334 | } | ||
335 | - case VIRTIO_BLK_T_DISCARD: | ||
336 | - case VIRTIO_BLK_T_WRITE_ZEROES: { | ||
337 | - if (!vexp->writable) { | ||
338 | - req->in->status = VIRTIO_BLK_S_IOERR; | ||
339 | - break; | ||
340 | - } | ||
341 | - | ||
342 | - req->in->status = vu_blk_discard_write_zeroes(vexp, out_iov, out_num, | ||
343 | - type); | ||
344 | - break; | ||
345 | - } | ||
346 | - default: | ||
347 | - req->in->status = VIRTIO_BLK_S_UNSUPP; | ||
348 | - break; | ||
349 | - } | ||
350 | - | ||
351 | - vu_blk_req_complete(req); | ||
352 | - vhost_user_server_unref(server); | ||
353 | - return; | ||
354 | |||
355 | -err: | ||
356 | - free(req); | ||
357 | + vu_blk_req_complete(req, in_len); | ||
358 | vhost_user_server_unref(server); | ||
359 | } | ||
360 | |||
361 | @@ -XXX,XX +XXX,XX @@ static uint64_t vu_blk_get_features(VuDev *dev) | ||
362 | 1ull << VIRTIO_RING_F_EVENT_IDX | | ||
363 | 1ull << VHOST_USER_F_PROTOCOL_FEATURES; | ||
364 | |||
365 | - if (!vexp->writable) { | ||
366 | + if (!vexp->handler.writable) { | ||
367 | features |= 1ull << VIRTIO_BLK_F_RO; | ||
368 | } | ||
369 | |||
370 | @@ -XXX,XX +XXX,XX @@ vu_blk_initialize_config(BlockDriverState *bs, | ||
371 | config->opt_io_size = cpu_to_le32(1); | ||
372 | config->num_queues = cpu_to_le16(num_queues); | ||
373 | config->max_discard_sectors = | ||
374 | - cpu_to_le32(VHOST_USER_BLK_MAX_DISCARD_SECTORS); | ||
375 | + cpu_to_le32(VIRTIO_BLK_MAX_DISCARD_SECTORS); | ||
376 | config->max_discard_seg = cpu_to_le32(1); | ||
377 | config->discard_sector_alignment = | ||
378 | cpu_to_le32(blk_size >> VIRTIO_BLK_SECTOR_BITS); | ||
379 | config->max_write_zeroes_sectors | ||
380 | - = cpu_to_le32(VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS); | ||
381 | + = cpu_to_le32(VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS); | ||
382 | config->max_write_zeroes_seg = cpu_to_le32(1); | ||
383 | } | ||
384 | |||
385 | @@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | ||
386 | uint64_t logical_block_size; | ||
387 | uint16_t num_queues = VHOST_USER_BLK_NUM_QUEUES_DEFAULT; | ||
388 | |||
389 | - vexp->writable = opts->writable; | ||
390 | vexp->blkcfg.wce = 0; | ||
391 | |||
392 | if (vu_opts->has_logical_block_size) { | ||
393 | @@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | ||
394 | error_propagate(errp, local_err); | ||
395 | return -EINVAL; | ||
396 | } | ||
397 | - vexp->blk_size = logical_block_size; | ||
398 | |||
399 | if (vu_opts->has_num_queues) { | ||
400 | num_queues = vu_opts->num_queues; | ||
401 | @@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | ||
402 | error_setg(errp, "num-queues must be greater than 0"); | ||
403 | return -EINVAL; | ||
404 | } | ||
405 | + vexp->handler.blk = exp->blk; | ||
406 | + vexp->handler.serial = "vhost_user_blk"; | ||
407 | + vexp->handler.logical_block_size = logical_block_size; | ||
408 | + vexp->handler.writable = opts->writable; | ||
409 | |||
410 | vu_blk_initialize_config(blk_bs(exp->blk), &vexp->blkcfg, | ||
411 | logical_block_size, num_queues); | ||
412 | diff --git a/block/export/virtio-blk-handler.c b/block/export/virtio-blk-handler.c | ||
413 | new file mode 100644 | ||
414 | index XXXXXXX..XXXXXXX | ||
415 | --- /dev/null | ||
416 | +++ b/block/export/virtio-blk-handler.c | ||
417 | @@ -XXX,XX +XXX,XX @@ | ||
418 | +/* | ||
419 | + * Handler for virtio-blk I/O | ||
420 | + * | ||
421 | + * Copyright (c) 2020 Red Hat, Inc. | ||
422 | + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved. | ||
423 | + * | ||
424 | + * Author: | ||
425 | + * Coiby Xu <coiby.xu@gmail.com> | ||
426 | + * Xie Yongji <xieyongji@bytedance.com> | ||
427 | + * | ||
428 | + * This work is licensed under the terms of the GNU GPL, version 2 or | ||
429 | + * later. See the COPYING file in the top-level directory. | ||
430 | + */ | ||
431 | + | ||
432 | +#include "qemu/osdep.h" | ||
433 | +#include "qemu/error-report.h" | ||
434 | +#include "virtio-blk-handler.h" | ||
435 | + | ||
436 | +#include "standard-headers/linux/virtio_blk.h" | ||
437 | + | ||
438 | +struct virtio_blk_inhdr { | ||
439 | + unsigned char status; | ||
440 | +}; | 35 | +}; |
441 | + | 36 | + |
442 | +static bool virtio_blk_sect_range_ok(BlockBackend *blk, uint32_t block_size, | 37 | +static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs) |
443 | + uint64_t sector, size_t size) | ||
444 | +{ | 38 | +{ |
445 | + uint64_t nb_sectors; | 39 | + switch (drain_type) { |
446 | + uint64_t total_sectors; | 40 | + case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break; |
447 | + | 41 | + case BDRV_DRAIN: bdrv_drained_begin(bs); break; |
448 | + if (size % VIRTIO_BLK_SECTOR_SIZE) { | 42 | + default: g_assert_not_reached(); |
449 | + return false; | ||
450 | + } | 43 | + } |
451 | + | ||
452 | + nb_sectors = size >> VIRTIO_BLK_SECTOR_BITS; | ||
453 | + | ||
454 | + QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != VIRTIO_BLK_SECTOR_SIZE); | ||
455 | + if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) { | ||
456 | + return false; | ||
457 | + } | ||
458 | + if ((sector << VIRTIO_BLK_SECTOR_BITS) % block_size) { | ||
459 | + return false; | ||
460 | + } | ||
461 | + blk_get_geometry(blk, &total_sectors); | ||
462 | + if (sector > total_sectors || nb_sectors > total_sectors - sector) { | ||
463 | + return false; | ||
464 | + } | ||
465 | + return true; | ||
466 | +} | 44 | +} |
467 | + | 45 | + |
468 | +static int coroutine_fn | 46 | +static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs) |
469 | +virtio_blk_discard_write_zeroes(VirtioBlkHandler *handler, struct iovec *iov, | ||
470 | + uint32_t iovcnt, uint32_t type) | ||
471 | +{ | 47 | +{ |
472 | + BlockBackend *blk = handler->blk; | 48 | + switch (drain_type) { |
473 | + struct virtio_blk_discard_write_zeroes desc; | 49 | + case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break; |
474 | + ssize_t size; | 50 | + case BDRV_DRAIN: bdrv_drained_end(bs); break; |
475 | + uint64_t sector; | 51 | + default: g_assert_not_reached(); |
476 | + uint32_t num_sectors; | ||
477 | + uint32_t max_sectors; | ||
478 | + uint32_t flags; | ||
479 | + int bytes; | ||
480 | + | ||
481 | + /* Only one desc is currently supported */ | ||
482 | + if (unlikely(iov_size(iov, iovcnt) > sizeof(desc))) { | ||
483 | + return VIRTIO_BLK_S_UNSUPP; | ||
484 | + } | 52 | + } |
485 | + | ||
486 | + size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc)); | ||
487 | + if (unlikely(size != sizeof(desc))) { | ||
488 | + error_report("Invalid size %zd, expected %zu", size, sizeof(desc)); | ||
489 | + return VIRTIO_BLK_S_IOERR; | ||
490 | + } | ||
491 | + | ||
492 | + sector = le64_to_cpu(desc.sector); | ||
493 | + num_sectors = le32_to_cpu(desc.num_sectors); | ||
494 | + flags = le32_to_cpu(desc.flags); | ||
495 | + max_sectors = (type == VIRTIO_BLK_T_WRITE_ZEROES) ? | ||
496 | + VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS : | ||
497 | + VIRTIO_BLK_MAX_DISCARD_SECTORS; | ||
498 | + | ||
499 | + /* This check ensures that 'bytes' fits in an int */ | ||
500 | + if (unlikely(num_sectors > max_sectors)) { | ||
501 | + return VIRTIO_BLK_S_IOERR; | ||
502 | + } | ||
503 | + | ||
504 | + bytes = num_sectors << VIRTIO_BLK_SECTOR_BITS; | ||
505 | + | ||
506 | + if (unlikely(!virtio_blk_sect_range_ok(blk, handler->logical_block_size, | ||
507 | + sector, bytes))) { | ||
508 | + return VIRTIO_BLK_S_IOERR; | ||
509 | + } | ||
510 | + | ||
511 | + /* | ||
512 | + * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard | ||
513 | + * and write zeroes commands if any unknown flag is set. | ||
514 | + */ | ||
515 | + if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) { | ||
516 | + return VIRTIO_BLK_S_UNSUPP; | ||
517 | + } | ||
518 | + | ||
519 | + if (type == VIRTIO_BLK_T_WRITE_ZEROES) { | ||
520 | + int blk_flags = 0; | ||
521 | + | ||
522 | + if (flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { | ||
523 | + blk_flags |= BDRV_REQ_MAY_UNMAP; | ||
524 | + } | ||
525 | + | ||
526 | + if (blk_co_pwrite_zeroes(blk, sector << VIRTIO_BLK_SECTOR_BITS, | ||
527 | + bytes, blk_flags) == 0) { | ||
528 | + return VIRTIO_BLK_S_OK; | ||
529 | + } | ||
530 | + } else if (type == VIRTIO_BLK_T_DISCARD) { | ||
531 | + /* | ||
532 | + * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for | ||
533 | + * discard commands if the unmap flag is set. | ||
534 | + */ | ||
535 | + if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) { | ||
536 | + return VIRTIO_BLK_S_UNSUPP; | ||
537 | + } | ||
538 | + | ||
539 | + if (blk_co_pdiscard(blk, sector << VIRTIO_BLK_SECTOR_BITS, | ||
540 | + bytes) == 0) { | ||
541 | + return VIRTIO_BLK_S_OK; | ||
542 | + } | ||
543 | + } | ||
544 | + | ||
545 | + return VIRTIO_BLK_S_IOERR; | ||
546 | +} | 53 | +} |
547 | + | 54 | + |
548 | +int coroutine_fn virtio_blk_process_req(VirtioBlkHandler *handler, | 55 | +static void test_drv_cb_common(enum drain_type drain_type, bool recursive) |
549 | + struct iovec *in_iov, | 56 | { |
550 | + struct iovec *out_iov, | 57 | BlockBackend *blk; |
551 | + unsigned int in_num, | 58 | - BlockDriverState *bs; |
552 | + unsigned int out_num) | 59 | - BDRVTestState *s; |
60 | + BlockDriverState *bs, *backing; | ||
61 | + BDRVTestState *s, *backing_s; | ||
62 | BlockAIOCB *acb; | ||
63 | int aio_ret; | ||
64 | |||
65 | @@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void) | ||
66 | s = bs->opaque; | ||
67 | blk_insert_bs(blk, bs, &error_abort); | ||
68 | |||
69 | + backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort); | ||
70 | + backing_s = backing->opaque; | ||
71 | + bdrv_set_backing_hd(bs, backing, &error_abort); | ||
72 | + | ||
73 | /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */ | ||
74 | g_assert_cmpint(s->drain_count, ==, 0); | ||
75 | - bdrv_drain_all_begin(); | ||
76 | + g_assert_cmpint(backing_s->drain_count, ==, 0); | ||
77 | + | ||
78 | + do_drain_begin(drain_type, bs); | ||
79 | + | ||
80 | g_assert_cmpint(s->drain_count, ==, 1); | ||
81 | - bdrv_drain_all_end(); | ||
82 | + g_assert_cmpint(backing_s->drain_count, ==, !!recursive); | ||
83 | + | ||
84 | + do_drain_end(drain_type, bs); | ||
85 | + | ||
86 | g_assert_cmpint(s->drain_count, ==, 0); | ||
87 | + g_assert_cmpint(backing_s->drain_count, ==, 0); | ||
88 | |||
89 | /* Now do the same while a request is pending */ | ||
90 | aio_ret = -EINPROGRESS; | ||
91 | @@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_all(void) | ||
92 | g_assert_cmpint(aio_ret, ==, -EINPROGRESS); | ||
93 | |||
94 | g_assert_cmpint(s->drain_count, ==, 0); | ||
95 | - bdrv_drain_all_begin(); | ||
96 | + g_assert_cmpint(backing_s->drain_count, ==, 0); | ||
97 | + | ||
98 | + do_drain_begin(drain_type, bs); | ||
99 | + | ||
100 | g_assert_cmpint(aio_ret, ==, 0); | ||
101 | g_assert_cmpint(s->drain_count, ==, 1); | ||
102 | - bdrv_drain_all_end(); | ||
103 | + g_assert_cmpint(backing_s->drain_count, ==, !!recursive); | ||
104 | + | ||
105 | + do_drain_end(drain_type, bs); | ||
106 | + | ||
107 | g_assert_cmpint(s->drain_count, ==, 0); | ||
108 | + g_assert_cmpint(backing_s->drain_count, ==, 0); | ||
109 | |||
110 | + bdrv_unref(backing); | ||
111 | bdrv_unref(bs); | ||
112 | blk_unref(blk); | ||
113 | } | ||
114 | |||
115 | +static void test_drv_cb_drain_all(void) | ||
553 | +{ | 116 | +{ |
554 | + BlockBackend *blk = handler->blk; | 117 | + test_drv_cb_common(BDRV_DRAIN_ALL, true); |
555 | + struct virtio_blk_inhdr *in; | 118 | +} |
556 | + struct virtio_blk_outhdr out; | ||
557 | + uint32_t type; | ||
558 | + int in_len; | ||
559 | + | 119 | + |
560 | + if (out_num < 1 || in_num < 1) { | 120 | +static void test_drv_cb_drain(void) |
561 | + error_report("virtio-blk request missing headers"); | 121 | +{ |
562 | + return -EINVAL; | 122 | + test_drv_cb_common(BDRV_DRAIN, false); |
563 | + } | 123 | +} |
564 | + | 124 | + |
565 | + if (unlikely(iov_to_buf(out_iov, out_num, 0, &out, | 125 | int main(int argc, char **argv) |
566 | + sizeof(out)) != sizeof(out))) { | 126 | { |
567 | + error_report("virtio-blk request outhdr too short"); | 127 | bdrv_init(); |
568 | + return -EINVAL; | 128 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) |
569 | + } | 129 | g_test_init(&argc, &argv, NULL); |
570 | + | 130 | |
571 | + iov_discard_front(&out_iov, &out_num, sizeof(out)); | 131 | g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all); |
572 | + | 132 | + g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain); |
573 | + if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) { | 133 | |
574 | + error_report("virtio-blk request inhdr too short"); | 134 | return g_test_run(); |
575 | + return -EINVAL; | 135 | } |
576 | + } | ||
577 | + | ||
578 | + /* We always touch the last byte, so just see how big in_iov is. */ | ||
579 | + in_len = iov_size(in_iov, in_num); | ||
580 | + in = (void *)in_iov[in_num - 1].iov_base | ||
581 | + + in_iov[in_num - 1].iov_len | ||
582 | + - sizeof(struct virtio_blk_inhdr); | ||
583 | + iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr)); | ||
584 | + | ||
585 | + type = le32_to_cpu(out.type); | ||
586 | + switch (type & ~VIRTIO_BLK_T_BARRIER) { | ||
587 | + case VIRTIO_BLK_T_IN: | ||
588 | + case VIRTIO_BLK_T_OUT: { | ||
589 | + QEMUIOVector qiov; | ||
590 | + int64_t offset; | ||
591 | + ssize_t ret = 0; | ||
592 | + bool is_write = type & VIRTIO_BLK_T_OUT; | ||
593 | + int64_t sector_num = le64_to_cpu(out.sector); | ||
594 | + | ||
595 | + if (is_write && !handler->writable) { | ||
596 | + in->status = VIRTIO_BLK_S_IOERR; | ||
597 | + break; | ||
598 | + } | ||
599 | + | ||
600 | + if (is_write) { | ||
601 | + qemu_iovec_init_external(&qiov, out_iov, out_num); | ||
602 | + } else { | ||
603 | + qemu_iovec_init_external(&qiov, in_iov, in_num); | ||
604 | + } | ||
605 | + | ||
606 | + if (unlikely(!virtio_blk_sect_range_ok(blk, | ||
607 | + handler->logical_block_size, | ||
608 | + sector_num, qiov.size))) { | ||
609 | + in->status = VIRTIO_BLK_S_IOERR; | ||
610 | + break; | ||
611 | + } | ||
612 | + | ||
613 | + offset = sector_num << VIRTIO_BLK_SECTOR_BITS; | ||
614 | + | ||
615 | + if (is_write) { | ||
616 | + ret = blk_co_pwritev(blk, offset, qiov.size, &qiov, 0); | ||
617 | + } else { | ||
618 | + ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0); | ||
619 | + } | ||
620 | + if (ret >= 0) { | ||
621 | + in->status = VIRTIO_BLK_S_OK; | ||
622 | + } else { | ||
623 | + in->status = VIRTIO_BLK_S_IOERR; | ||
624 | + } | ||
625 | + break; | ||
626 | + } | ||
627 | + case VIRTIO_BLK_T_FLUSH: | ||
628 | + if (blk_co_flush(blk) == 0) { | ||
629 | + in->status = VIRTIO_BLK_S_OK; | ||
630 | + } else { | ||
631 | + in->status = VIRTIO_BLK_S_IOERR; | ||
632 | + } | ||
633 | + break; | ||
634 | + case VIRTIO_BLK_T_GET_ID: { | ||
635 | + size_t size = MIN(strlen(handler->serial) + 1, | ||
636 | + MIN(iov_size(in_iov, in_num), | ||
637 | + VIRTIO_BLK_ID_BYTES)); | ||
638 | + iov_from_buf(in_iov, in_num, 0, handler->serial, size); | ||
639 | + in->status = VIRTIO_BLK_S_OK; | ||
640 | + break; | ||
641 | + } | ||
642 | + case VIRTIO_BLK_T_DISCARD: | ||
643 | + case VIRTIO_BLK_T_WRITE_ZEROES: | ||
644 | + if (!handler->writable) { | ||
645 | + in->status = VIRTIO_BLK_S_IOERR; | ||
646 | + break; | ||
647 | + } | ||
648 | + in->status = virtio_blk_discard_write_zeroes(handler, out_iov, | ||
649 | + out_num, type); | ||
650 | + break; | ||
651 | + default: | ||
652 | + in->status = VIRTIO_BLK_S_UNSUPP; | ||
653 | + break; | ||
654 | + } | ||
655 | + | ||
656 | + return in_len; | ||
657 | +} | ||
658 | diff --git a/MAINTAINERS b/MAINTAINERS | ||
659 | index XXXXXXX..XXXXXXX 100644 | ||
660 | --- a/MAINTAINERS | ||
661 | +++ b/MAINTAINERS | ||
662 | @@ -XXX,XX +XXX,XX @@ M: Coiby Xu <Coiby.Xu@gmail.com> | ||
663 | S: Maintained | ||
664 | F: block/export/vhost-user-blk-server.c | ||
665 | F: block/export/vhost-user-blk-server.h | ||
666 | +F: block/export/virtio-blk-handler.c | ||
667 | +F: block/export/virtio-blk-handler.h | ||
668 | F: include/qemu/vhost-user-server.h | ||
669 | F: tests/qtest/libqos/vhost-user-blk.c | ||
670 | F: tests/qtest/libqos/vhost-user-blk.h | ||
671 | diff --git a/block/export/meson.build b/block/export/meson.build | ||
672 | index XXXXXXX..XXXXXXX 100644 | ||
673 | --- a/block/export/meson.build | ||
674 | +++ b/block/export/meson.build | ||
675 | @@ -XXX,XX +XXX,XX @@ | ||
676 | blockdev_ss.add(files('export.c')) | ||
677 | |||
678 | if have_vhost_user_blk_server | ||
679 | - blockdev_ss.add(files('vhost-user-blk-server.c')) | ||
680 | + blockdev_ss.add(files('vhost-user-blk-server.c', 'virtio-blk-handler.c')) | ||
681 | endif | ||
682 | |||
683 | blockdev_ss.add(when: fuse, if_true: files('fuse.c')) | ||
684 | -- | 136 | -- |
685 | 2.35.3 | 137 | 2.13.6 |
138 | |||
139 | diff view generated by jsdifflib |
1 | From: Eric Blake <eblake@redhat.com> | 1 | This is currently only working correctly for bdrv_drain(), not for |
---|---|---|---|
2 | bdrv_drain_all(). Leave a comment for the drain_all case, we'll address | ||
3 | it later. | ||
2 | 4 | ||
3 | CID 1488362 points out that the second 'rc >= 0' check is now dead | ||
4 | code. | ||
5 | |||
6 | Reported-by: Peter Maydell <peter.maydell@linaro.org> | ||
7 | Fixes: 172f5f1a40(nbd: remove peppering of nbd_client_connected) | ||
8 | Signed-off-by: Eric Blake <eblake@redhat.com> | ||
9 | Message-Id: <20220516210519.76135-1-eblake@redhat.com> | ||
10 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | ||
11 | Reviewed-by: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru> | ||
12 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 5 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
13 | --- | 6 | --- |
14 | block/nbd.c | 8 ++------ | 7 | tests/test-bdrv-drain.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ |
15 | 1 file changed, 2 insertions(+), 6 deletions(-) | 8 | 1 file changed, 45 insertions(+) |
16 | 9 | ||
17 | diff --git a/block/nbd.c b/block/nbd.c | 10 | diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c |
18 | index XXXXXXX..XXXXXXX 100644 | 11 | index XXXXXXX..XXXXXXX 100644 |
19 | --- a/block/nbd.c | 12 | --- a/tests/test-bdrv-drain.c |
20 | +++ b/block/nbd.c | 13 | +++ b/tests/test-bdrv-drain.c |
21 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn nbd_co_send_request(BlockDriverState *bs, | 14 | @@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void) |
22 | if (qiov) { | 15 | test_drv_cb_common(BDRV_DRAIN, false); |
23 | qio_channel_set_cork(s->ioc, true); | 16 | } |
24 | rc = nbd_send_request(s->ioc, request); | 17 | |
25 | - if (rc >= 0) { | 18 | +static void test_quiesce_common(enum drain_type drain_type, bool recursive) |
26 | - if (qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov, | 19 | +{ |
27 | - NULL) < 0) { | 20 | + BlockBackend *blk; |
28 | - rc = -EIO; | 21 | + BlockDriverState *bs, *backing; |
29 | - } | 22 | + |
30 | - } else if (rc >= 0) { | 23 | + blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); |
31 | + if (rc >= 0 && qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov, | 24 | + bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR, |
32 | + NULL) < 0) { | 25 | + &error_abort); |
33 | rc = -EIO; | 26 | + blk_insert_bs(blk, bs, &error_abort); |
34 | } | 27 | + |
35 | qio_channel_set_cork(s->ioc, false); | 28 | + backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort); |
29 | + bdrv_set_backing_hd(bs, backing, &error_abort); | ||
30 | + | ||
31 | + g_assert_cmpint(bs->quiesce_counter, ==, 0); | ||
32 | + g_assert_cmpint(backing->quiesce_counter, ==, 0); | ||
33 | + | ||
34 | + do_drain_begin(drain_type, bs); | ||
35 | + | ||
36 | + g_assert_cmpint(bs->quiesce_counter, ==, 1); | ||
37 | + g_assert_cmpint(backing->quiesce_counter, ==, !!recursive); | ||
38 | + | ||
39 | + do_drain_end(drain_type, bs); | ||
40 | + | ||
41 | + g_assert_cmpint(bs->quiesce_counter, ==, 0); | ||
42 | + g_assert_cmpint(backing->quiesce_counter, ==, 0); | ||
43 | + | ||
44 | + bdrv_unref(backing); | ||
45 | + bdrv_unref(bs); | ||
46 | + blk_unref(blk); | ||
47 | +} | ||
48 | + | ||
49 | +static void test_quiesce_drain_all(void) | ||
50 | +{ | ||
51 | + // XXX drain_all doesn't quiesce | ||
52 | + //test_quiesce_common(BDRV_DRAIN_ALL, true); | ||
53 | +} | ||
54 | + | ||
55 | +static void test_quiesce_drain(void) | ||
56 | +{ | ||
57 | + test_quiesce_common(BDRV_DRAIN, false); | ||
58 | +} | ||
59 | + | ||
60 | int main(int argc, char **argv) | ||
61 | { | ||
62 | bdrv_init(); | ||
63 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) | ||
64 | g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all); | ||
65 | g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain); | ||
66 | |||
67 | + g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all); | ||
68 | + g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain); | ||
69 | + | ||
70 | return g_test_run(); | ||
71 | } | ||
36 | -- | 72 | -- |
37 | 2.35.3 | 73 | 2.13.6 |
74 | |||
75 | diff view generated by jsdifflib |
1 | From: Xie Yongji <xieyongji@bytedance.com> | 1 | Block jobs already paused themselves when their main BlockBackend |
---|---|---|---|
2 | entered a drained section. This is not good enough: We also want to | |
3 | pause a block job, so that it does not submit new requests, if, for example, the | |
4 | mirror target node should be drained. | |
2 | 5 | ||
3 | Add a 'serial' option to allow the user to specify this value | 6 | This implements .drained_begin/end callbacks in child_job in order to |
4 | explicitly. The default value is also changed to an empty | 7 | consider all block nodes related to the job, and removes the |
5 | string, as is done in "hw/block/virtio-blk.c". | 8 | BlockBackend callbacks which are unnecessary now because the root of the |
9 | job main BlockBackend is always referenced with a child_job, too. | ||
6 | 10 | ||
7 | Signed-off-by: Xie Yongji <xieyongji@bytedance.com> | ||
8 | Message-Id: <20220614051532.92-6-xieyongji@bytedance.com> | ||
9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 11 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
10 | --- | 12 | --- |
11 | qapi/block-export.json | 4 +++- | 13 | blockjob.c | 22 +++++++++------------- |
12 | docs/tools/qemu-storage-daemon.rst | 2 +- | 14 | 1 file changed, 9 insertions(+), 13 deletions(-) |
13 | block/export/virtio-blk-handler.h | 2 +- | ||
14 | block/export/vduse-blk.c | 20 ++++++++++++++------ | ||
15 | block/export/vhost-user-blk-server.c | 4 +++- | ||
16 | storage-daemon/qemu-storage-daemon.c | 1 + | ||
17 | 6 files changed, 23 insertions(+), 10 deletions(-) | ||
18 | 15 | ||
19 | diff --git a/qapi/block-export.json b/qapi/block-export.json | 16 | diff --git a/blockjob.c b/blockjob.c |
20 | index XXXXXXX..XXXXXXX 100644 | 17 | index XXXXXXX..XXXXXXX 100644 |
21 | --- a/qapi/block-export.json | 18 | --- a/blockjob.c |
22 | +++ b/qapi/block-export.json | 19 | +++ b/blockjob.c |
23 | @@ -XXX,XX +XXX,XX @@ | 20 | @@ -XXX,XX +XXX,XX @@ static char *child_job_get_parent_desc(BdrvChild *c) |
24 | # @queue-size: the size of virtqueue. Defaults to 256. | 21 | job->id); |
25 | # @logical-block-size: Logical block size in bytes. Range [512, PAGE_SIZE] | ||
26 | # and must be power of 2. Defaults to 512 bytes. | ||
27 | +# @serial: the serial number of virtio block device. Defaults to empty string. | ||
28 | # | ||
29 | # Since: 7.1 | ||
30 | ## | ||
31 | { 'struct': 'BlockExportOptionsVduseBlk', | ||
32 | 'data': { '*num-queues': 'uint16', | ||
33 | '*queue-size': 'uint16', | ||
34 | - '*logical-block-size': 'size'} } | ||
35 | + '*logical-block-size': 'size', | ||
36 | + '*serial': 'str' } } | ||
37 | |||
38 | ## | ||
39 | # @NbdServerAddOptions: | ||
40 | diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst | ||
41 | index XXXXXXX..XXXXXXX 100644 | ||
42 | --- a/docs/tools/qemu-storage-daemon.rst | ||
43 | +++ b/docs/tools/qemu-storage-daemon.rst | ||
44 | @@ -XXX,XX +XXX,XX @@ Standard options: | ||
45 | --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>] | ||
46 | --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>] | ||
47 | --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto] | ||
48 | - --export [type=]vduse-blk,id=<id>,node-name=<node-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>] | ||
49 | + --export [type=]vduse-blk,id=<id>,node-name=<node-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>] | ||
50 | |||
51 | is a block export definition. ``node-name`` is the block node that should be | ||
52 | exported. ``writable`` determines whether or not the export allows write | ||
53 | diff --git a/block/export/virtio-blk-handler.h b/block/export/virtio-blk-handler.h | ||
54 | index XXXXXXX..XXXXXXX 100644 | ||
55 | --- a/block/export/virtio-blk-handler.h | ||
56 | +++ b/block/export/virtio-blk-handler.h | ||
57 | @@ -XXX,XX +XXX,XX @@ | ||
58 | |||
59 | typedef struct { | ||
60 | BlockBackend *blk; | ||
61 | - const char *serial; | ||
62 | + char *serial; | ||
63 | uint32_t logical_block_size; | ||
64 | bool writable; | ||
65 | } VirtioBlkHandler; | ||
66 | diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c | ||
67 | index XXXXXXX..XXXXXXX 100644 | ||
68 | --- a/block/export/vduse-blk.c | ||
69 | +++ b/block/export/vduse-blk.c | ||
70 | @@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | ||
71 | Error *local_err = NULL; | ||
72 | struct virtio_blk_config config = { 0 }; | ||
73 | uint64_t features; | ||
74 | - int i; | ||
75 | + int i, ret; | ||
76 | |||
77 | if (vblk_opts->has_num_queues) { | ||
78 | num_queues = vblk_opts->num_queues; | ||
79 | @@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | ||
80 | } | ||
81 | vblk_exp->num_queues = num_queues; | ||
82 | vblk_exp->handler.blk = exp->blk; | ||
83 | - vblk_exp->handler.serial = exp->id; | ||
84 | + vblk_exp->handler.serial = g_strdup(vblk_opts->has_serial ? | ||
85 | + vblk_opts->serial : ""); | ||
86 | vblk_exp->handler.logical_block_size = logical_block_size; | ||
87 | vblk_exp->handler.writable = opts->writable; | ||
88 | |||
89 | @@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | ||
90 | vblk_exp); | ||
91 | if (!vblk_exp->dev) { | ||
92 | error_setg(errp, "failed to create vduse device"); | ||
93 | - return -ENOMEM; | ||
94 | + ret = -ENOMEM; | ||
95 | + goto err_dev; | ||
96 | } | ||
97 | |||
98 | vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s", | ||
99 | g_get_tmp_dir(), exp->id); | ||
100 | if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) { | ||
101 | error_setg(errp, "failed to set reconnect log file"); | ||
102 | - vduse_dev_destroy(vblk_exp->dev); | ||
103 | - g_free(vblk_exp->recon_file); | ||
104 | - return -EINVAL; | ||
105 | + ret = -EINVAL; | ||
106 | + goto err; | ||
107 | } | ||
108 | |||
109 | for (i = 0; i < num_queues; i++) { | ||
110 | @@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | ||
111 | blk_set_dev_ops(exp->blk, &vduse_block_ops, exp); | ||
112 | |||
113 | return 0; | ||
114 | +err: | ||
115 | + vduse_dev_destroy(vblk_exp->dev); | ||
116 | + g_free(vblk_exp->recon_file); | ||
117 | +err_dev: | ||
118 | + g_free(vblk_exp->handler.serial); | ||
119 | + return ret; | ||
120 | } | 22 | } |
121 | 23 | ||
122 | static void vduse_blk_exp_delete(BlockExport *exp) | 24 | -static const BdrvChildRole child_job = { |
123 | @@ -XXX,XX +XXX,XX @@ static void vduse_blk_exp_delete(BlockExport *exp) | 25 | - .get_parent_desc = child_job_get_parent_desc, |
124 | unlink(vblk_exp->recon_file); | 26 | - .stay_at_node = true, |
125 | } | 27 | -}; |
126 | g_free(vblk_exp->recon_file); | 28 | - |
127 | + g_free(vblk_exp->handler.serial); | 29 | -static void block_job_drained_begin(void *opaque) |
30 | +static void child_job_drained_begin(BdrvChild *c) | ||
31 | { | ||
32 | - BlockJob *job = opaque; | ||
33 | + BlockJob *job = c->opaque; | ||
34 | block_job_pause(job); | ||
128 | } | 35 | } |
129 | 36 | ||
130 | static void vduse_blk_exp_request_shutdown(BlockExport *exp) | 37 | -static void block_job_drained_end(void *opaque) |
131 | diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c | 38 | +static void child_job_drained_end(BdrvChild *c) |
132 | index XXXXXXX..XXXXXXX 100644 | 39 | { |
133 | --- a/block/export/vhost-user-blk-server.c | 40 | - BlockJob *job = opaque; |
134 | +++ b/block/export/vhost-user-blk-server.c | 41 | + BlockJob *job = c->opaque; |
135 | @@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | 42 | block_job_resume(job); |
136 | return -EINVAL; | ||
137 | } | ||
138 | vexp->handler.blk = exp->blk; | ||
139 | - vexp->handler.serial = "vhost_user_blk"; | ||
140 | + vexp->handler.serial = g_strdup("vhost_user_blk"); | ||
141 | vexp->handler.logical_block_size = logical_block_size; | ||
142 | vexp->handler.writable = opts->writable; | ||
143 | |||
144 | @@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | ||
145 | num_queues, &vu_blk_iface, errp)) { | ||
146 | blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, | ||
147 | blk_aio_detach, vexp); | ||
148 | + g_free(vexp->handler.serial); | ||
149 | return -EADDRNOTAVAIL; | ||
150 | } | ||
151 | |||
152 | @@ -XXX,XX +XXX,XX @@ static void vu_blk_exp_delete(BlockExport *exp) | ||
153 | |||
154 | blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, | ||
155 | vexp); | ||
156 | + g_free(vexp->handler.serial); | ||
157 | } | 43 | } |
158 | 44 | ||
159 | const BlockExportDriver blk_exp_vhost_user_blk = { | 45 | -static const BlockDevOps block_job_dev_ops = { |
160 | diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c | 46 | - .drained_begin = block_job_drained_begin, |
161 | index XXXXXXX..XXXXXXX 100644 | 47 | - .drained_end = block_job_drained_end, |
162 | --- a/storage-daemon/qemu-storage-daemon.c | 48 | +static const BdrvChildRole child_job = { |
163 | +++ b/storage-daemon/qemu-storage-daemon.c | 49 | + .get_parent_desc = child_job_get_parent_desc, |
164 | @@ -XXX,XX +XXX,XX @@ static void help(void) | 50 | + .drained_begin = child_job_drained_begin, |
165 | " [,writable=on|off][,num-queues=<num-queues>]\n" | 51 | + .drained_end = child_job_drained_end, |
166 | " [,queue-size=<queue-size>]\n" | 52 | + .stay_at_node = true, |
167 | " [,logical-block-size=<logical-block-size>]\n" | 53 | }; |
168 | +" [,serial=<serial-number>]\n" | 54 | |
169 | " export the specified block node as a vduse-blk\n" | 55 | void block_job_remove_all_bdrv(BlockJob *job) |
170 | " device using the id as the VDUSE device name\n" | 56 | @@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, |
171 | "\n" | 57 | block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort); |
58 | bs->job = job; | ||
59 | |||
60 | - blk_set_dev_ops(blk, &block_job_dev_ops, job); | ||
61 | bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker); | ||
62 | |||
63 | QLIST_INSERT_HEAD(&block_jobs, job, job_list); | ||
172 | -- | 64 | -- |
173 | 2.35.3 | 65 | 2.13.6 |
66 | |||
67 | diff view generated by jsdifflib |
1 | From: Xie Yongji <xieyongji@bytedance.com> | 1 | Block jobs must be paused if any of the involved nodes are drained. |
---|---|---|---|
2 | 2 | ||
3 | To support reconnecting after a restart or crash, the VDUSE backend | |
4 | might need to resubmit inflight I/Os. This stores metadata, | |
5 | such as the indexes of inflight I/O descriptors, in a shm file so | |
6 | that the VDUSE backend can restore them when reconnecting. | |
7 | |||
8 | Signed-off-by: Xie Yongji <xieyongji@bytedance.com> | ||
9 | Message-Id: <20220523084611.91-9-xieyongji@bytedance.com> | ||
10 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
11 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 3 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
12 | --- | 4 | --- |
13 | subprojects/libvduse/libvduse.h | 12 ++ | 5 | tests/test-bdrv-drain.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++ |
14 | block/export/vduse-blk.c | 19 ++- | 6 | 1 file changed, 121 insertions(+) |
15 | subprojects/libvduse/libvduse.c | 235 +++++++++++++++++++++++++++++++- | ||
16 | 3 files changed, 260 insertions(+), 6 deletions(-) | ||
17 | 7 | ||
18 | diff --git a/subprojects/libvduse/libvduse.h b/subprojects/libvduse/libvduse.h | 8 | diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c |
19 | index XXXXXXX..XXXXXXX 100644 | 9 | index XXXXXXX..XXXXXXX 100644 |
20 | --- a/subprojects/libvduse/libvduse.h | 10 | --- a/tests/test-bdrv-drain.c |
21 | +++ b/subprojects/libvduse/libvduse.h | 11 | +++ b/tests/test-bdrv-drain.c |
22 | @@ -XXX,XX +XXX,XX @@ int vduse_dev_update_config(VduseDev *dev, uint32_t size, | 12 | @@ -XXX,XX +XXX,XX @@ |
23 | */ | 13 | |
24 | int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size); | 14 | #include "qemu/osdep.h" |
25 | 15 | #include "block/block.h" | |
26 | +/** | 16 | +#include "block/blockjob_int.h" |
27 | + * vduse_set_reconnect_log_file: | 17 | #include "sysemu/block-backend.h" |
28 | + * @dev: VDUSE device | 18 | #include "qapi/error.h" |
29 | + * @file: filename of reconnect log | 19 | |
30 | + * | 20 | @@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void) |
31 | + * Specify the file to store log for reconnecting. It should | 21 | test_quiesce_common(BDRV_DRAIN, false); |
32 | + * be called before vduse_dev_setup_queue(). | 22 | } |
33 | + * | 23 | |
34 | + * Returns: 0 on success, -errno on failure. | ||
35 | + */ | ||
36 | +int vduse_set_reconnect_log_file(VduseDev *dev, const char *filename); | ||
37 | + | 24 | + |
38 | /** | 25 | +typedef struct TestBlockJob { |
39 | * vduse_dev_create_by_fd: | 26 | + BlockJob common; |
40 | * @fd: passed file descriptor | 27 | + bool should_complete; |
41 | diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c | 28 | +} TestBlockJob; |
42 | index XXXXXXX..XXXXXXX 100644 | 29 | + |
43 | --- a/block/export/vduse-blk.c | 30 | +static void test_job_completed(BlockJob *job, void *opaque) |
44 | +++ b/block/export/vduse-blk.c | 31 | +{ |
45 | @@ -XXX,XX +XXX,XX @@ typedef struct VduseBlkExport { | 32 | + block_job_completed(job, 0); |
46 | VirtioBlkHandler handler; | 33 | +} |
47 | VduseDev *dev; | 34 | + |
48 | uint16_t num_queues; | 35 | +static void coroutine_fn test_job_start(void *opaque) |
49 | + char *recon_file; | 36 | +{ |
50 | unsigned int inflight; | 37 | + TestBlockJob *s = opaque; |
51 | } VduseBlkExport; | 38 | + |
52 | 39 | + while (!s->should_complete) { | |
53 | @@ -XXX,XX +XXX,XX @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq) | 40 | + block_job_sleep_ns(&s->common, 100000); |
54 | |||
55 | aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq), | ||
56 | true, on_vduse_vq_kick, NULL, NULL, NULL, vq); | ||
57 | + /* Make sure we don't miss any kick after reconnecting */ | ||
58 | + eventfd_write(vduse_queue_get_fd(vq), 1); | ||
59 | } | ||
60 | |||
61 | static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq) | ||
62 | @@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | ||
63 | return -ENOMEM; | ||
64 | } | ||
65 | |||
66 | + vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s", | ||
67 | + g_get_tmp_dir(), exp->id); | ||
68 | + if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) { | ||
69 | + error_setg(errp, "failed to set reconnect log file"); | ||
70 | + vduse_dev_destroy(vblk_exp->dev); | ||
71 | + g_free(vblk_exp->recon_file); | ||
72 | + return -EINVAL; | ||
73 | + } | 41 | + } |
74 | + | 42 | + |
75 | for (i = 0; i < num_queues; i++) { | 43 | + block_job_defer_to_main_loop(&s->common, test_job_completed, NULL); |
76 | vduse_dev_setup_queue(vblk_exp->dev, i, queue_size); | ||
77 | } | ||
78 | @@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | ||
79 | static void vduse_blk_exp_delete(BlockExport *exp) | ||
80 | { | ||
81 | VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); | ||
82 | + int ret; | ||
83 | |||
84 | blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, | ||
85 | vblk_exp); | ||
86 | blk_set_dev_ops(exp->blk, NULL, NULL); | ||
87 | - vduse_dev_destroy(vblk_exp->dev); | ||
88 | + ret = vduse_dev_destroy(vblk_exp->dev); | ||
89 | + if (ret != -EBUSY) { | ||
90 | + unlink(vblk_exp->recon_file); | ||
91 | + } | ||
92 | + g_free(vblk_exp->recon_file); | ||
93 | } | ||
94 | |||
95 | static void vduse_blk_exp_request_shutdown(BlockExport *exp) | ||
96 | diff --git a/subprojects/libvduse/libvduse.c b/subprojects/libvduse/libvduse.c | ||
97 | index XXXXXXX..XXXXXXX 100644 | ||
98 | --- a/subprojects/libvduse/libvduse.c | ||
99 | +++ b/subprojects/libvduse/libvduse.c | ||
100 | @@ -XXX,XX +XXX,XX @@ | ||
101 | #define VDUSE_VQ_ALIGN 4096 | ||
102 | #define MAX_IOVA_REGIONS 256 | ||
103 | |||
104 | +#define LOG_ALIGNMENT 64 | ||
105 | + | ||
106 | /* Round number down to multiple */ | ||
107 | #define ALIGN_DOWN(n, m) ((n) / (m) * (m)) | ||
108 | |||
109 | @@ -XXX,XX +XXX,XX @@ | ||
110 | #define unlikely(x) __builtin_expect(!!(x), 0) | ||
111 | #endif | ||
112 | |||
113 | +typedef struct VduseDescStateSplit { | ||
114 | + uint8_t inflight; | ||
115 | + uint8_t padding[5]; | ||
116 | + uint16_t next; | ||
117 | + uint64_t counter; | ||
118 | +} VduseDescStateSplit; | ||
119 | + | ||
120 | +typedef struct VduseVirtqLogInflight { | ||
121 | + uint64_t features; | ||
122 | + uint16_t version; | ||
123 | + uint16_t desc_num; | ||
124 | + uint16_t last_batch_head; | ||
125 | + uint16_t used_idx; | ||
126 | + VduseDescStateSplit desc[]; | ||
127 | +} VduseVirtqLogInflight; | ||
128 | + | ||
129 | +typedef struct VduseVirtqLog { | ||
130 | + VduseVirtqLogInflight inflight; | ||
131 | +} VduseVirtqLog; | ||
132 | + | ||
133 | +typedef struct VduseVirtqInflightDesc { | ||
134 | + uint16_t index; | ||
135 | + uint64_t counter; | ||
136 | +} VduseVirtqInflightDesc; | ||
137 | + | ||
138 | typedef struct VduseRing { | ||
139 | unsigned int num; | ||
140 | uint64_t desc_addr; | ||
141 | @@ -XXX,XX +XXX,XX @@ struct VduseVirtq { | ||
142 | bool ready; | ||
143 | int fd; | ||
144 | VduseDev *dev; | ||
145 | + VduseVirtqInflightDesc *resubmit_list; | ||
146 | + uint16_t resubmit_num; | ||
147 | + uint64_t counter; | ||
148 | + VduseVirtqLog *log; | ||
149 | }; | ||
150 | |||
151 | typedef struct VduseIovaRegion { | ||
152 | @@ -XXX,XX +XXX,XX @@ struct VduseDev { | ||
153 | int fd; | ||
154 | int ctrl_fd; | ||
155 | void *priv; | ||
156 | + void *log; | ||
157 | }; | ||
158 | |||
159 | +static inline size_t vduse_vq_log_size(uint16_t queue_size) | ||
160 | +{ | ||
161 | + return ALIGN_UP(sizeof(VduseDescStateSplit) * queue_size + | ||
162 | + sizeof(VduseVirtqLogInflight), LOG_ALIGNMENT); | ||
163 | +} | 44 | +} |
164 | + | 45 | + |
165 | +static void *vduse_log_get(const char *filename, size_t size) | 46 | +static void test_job_complete(BlockJob *job, Error **errp) |
166 | +{ | 47 | +{ |
167 | + void *ptr = MAP_FAILED; | 48 | + TestBlockJob *s = container_of(job, TestBlockJob, common); |
168 | + int fd; | 49 | + s->should_complete = true; |
169 | + | ||
170 | + fd = open(filename, O_RDWR | O_CREAT, 0600); | ||
171 | + if (fd == -1) { | ||
172 | + return MAP_FAILED; | ||
173 | + } | ||
174 | + | ||
175 | + if (ftruncate(fd, size) == -1) { | ||
176 | + goto out; | ||
177 | + } | ||
178 | + | ||
179 | + ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); | ||
180 | + | ||
181 | +out: | ||
182 | + close(fd); | ||
183 | + return ptr; | ||
184 | +} | 50 | +} |
185 | + | 51 | + |
186 | static inline bool has_feature(uint64_t features, unsigned int fbit) | 52 | +BlockJobDriver test_job_driver = { |
187 | { | 53 | + .instance_size = sizeof(TestBlockJob), |
188 | assert(fbit < 64); | 54 | + .start = test_job_start, |
189 | @@ -XXX,XX +XXX,XX @@ static int vduse_inject_irq(VduseDev *dev, int index) | 55 | + .complete = test_job_complete, |
190 | return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index); | 56 | +}; |
191 | } | 57 | + |
192 | 58 | +static void test_blockjob_common(enum drain_type drain_type) | |
193 | +static int inflight_desc_compare(const void *a, const void *b) | ||
194 | +{ | 59 | +{ |
195 | + VduseVirtqInflightDesc *desc0 = (VduseVirtqInflightDesc *)a, | 60 | + BlockBackend *blk_src, *blk_target; |
196 | + *desc1 = (VduseVirtqInflightDesc *)b; | 61 | + BlockDriverState *src, *target; |
62 | + BlockJob *job; | ||
63 | + int ret; | ||
197 | + | 64 | + |
198 | + if (desc1->counter > desc0->counter && | 65 | + src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR, |
199 | + (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) { | 66 | + &error_abort); |
200 | + return 1; | 67 | + blk_src = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); |
68 | + blk_insert_bs(blk_src, src, &error_abort); | ||
69 | + | ||
70 | + target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR, | ||
71 | + &error_abort); | ||
72 | + blk_target = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); | ||
73 | + blk_insert_bs(blk_target, target, &error_abort); | ||
74 | + | ||
75 | + job = block_job_create("job0", &test_job_driver, src, 0, BLK_PERM_ALL, 0, | ||
76 | + 0, NULL, NULL, &error_abort); | ||
77 | + block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort); | ||
78 | + block_job_start(job); | ||
79 | + | ||
80 | + g_assert_cmpint(job->pause_count, ==, 0); | ||
81 | + g_assert_false(job->paused); | ||
82 | + g_assert_false(job->busy); /* We're in block_job_sleep_ns() */ | ||
83 | + | ||
84 | + do_drain_begin(drain_type, src); | ||
85 | + | ||
86 | + if (drain_type == BDRV_DRAIN_ALL) { | ||
87 | + /* bdrv_drain_all() drains both src and target, and involves an | ||
88 | + * additional block_job_pause_all() */ | ||
89 | + g_assert_cmpint(job->pause_count, ==, 3); | ||
90 | + } else { | ||
91 | + g_assert_cmpint(job->pause_count, ==, 1); | ||
201 | + } | 92 | + } |
93 | + /* XXX We don't wait until the job is actually paused. Is this okay? */ | ||
94 | + /* g_assert_true(job->paused); */ | ||
95 | + g_assert_false(job->busy); /* The job is paused */ | ||
202 | + | 96 | + |
203 | + return -1; | 97 | + do_drain_end(drain_type, src); |
98 | + | ||
99 | + g_assert_cmpint(job->pause_count, ==, 0); | ||
100 | + g_assert_false(job->paused); | ||
101 | + g_assert_false(job->busy); /* We're in block_job_sleep_ns() */ | ||
102 | + | ||
103 | + do_drain_begin(drain_type, target); | ||
104 | + | ||
105 | + if (drain_type == BDRV_DRAIN_ALL) { | ||
106 | + /* bdrv_drain_all() drains both src and target, and involves an | ||
107 | + * additional block_job_pause_all() */ | ||
108 | + g_assert_cmpint(job->pause_count, ==, 3); | ||
109 | + } else { | ||
110 | + g_assert_cmpint(job->pause_count, ==, 1); | ||
111 | + } | ||
112 | + /* XXX We don't wait until the job is actually paused. Is this okay? */ | ||
113 | + /* g_assert_true(job->paused); */ | ||
114 | + g_assert_false(job->busy); /* The job is paused */ | ||
115 | + | ||
116 | + do_drain_end(drain_type, target); | ||
117 | + | ||
118 | + g_assert_cmpint(job->pause_count, ==, 0); | ||
119 | + g_assert_false(job->paused); | ||
120 | + g_assert_false(job->busy); /* We're in block_job_sleep_ns() */ | ||
121 | + | ||
122 | + ret = block_job_complete_sync(job, &error_abort); | ||
123 | + g_assert_cmpint(ret, ==, 0); | ||
124 | + | ||
125 | + blk_unref(blk_src); | ||
126 | + blk_unref(blk_target); | ||
127 | + bdrv_unref(src); | ||
128 | + bdrv_unref(target); | ||
204 | +} | 129 | +} |
205 | + | 130 | + |
206 | +static int vduse_queue_check_inflights(VduseVirtq *vq) | 131 | +static void test_blockjob_drain_all(void) |
207 | +{ | 132 | +{ |
208 | + int i = 0; | 133 | + test_blockjob_common(BDRV_DRAIN_ALL); |
209 | + VduseDev *dev = vq->dev; | ||
210 | + | ||
211 | + vq->used_idx = le16toh(vq->vring.used->idx); | ||
212 | + vq->resubmit_num = 0; | ||
213 | + vq->resubmit_list = NULL; | ||
214 | + vq->counter = 0; | ||
215 | + | ||
216 | + if (unlikely(vq->log->inflight.used_idx != vq->used_idx)) { | ||
217 | + if (vq->log->inflight.last_batch_head > VIRTQUEUE_MAX_SIZE) { | ||
218 | + return -1; | ||
219 | + } | ||
220 | + | ||
221 | + vq->log->inflight.desc[vq->log->inflight.last_batch_head].inflight = 0; | ||
222 | + | ||
223 | + barrier(); | ||
224 | + | ||
225 | + vq->log->inflight.used_idx = vq->used_idx; | ||
226 | + } | ||
227 | + | ||
228 | + for (i = 0; i < vq->log->inflight.desc_num; i++) { | ||
229 | + if (vq->log->inflight.desc[i].inflight == 1) { | ||
230 | + vq->inuse++; | ||
231 | + } | ||
232 | + } | ||
233 | + | ||
234 | + vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx; | ||
235 | + | ||
236 | + if (vq->inuse) { | ||
237 | + vq->resubmit_list = calloc(vq->inuse, sizeof(VduseVirtqInflightDesc)); | ||
238 | + if (!vq->resubmit_list) { | ||
239 | + return -1; | ||
240 | + } | ||
241 | + | ||
242 | + for (i = 0; i < vq->log->inflight.desc_num; i++) { | ||
243 | + if (vq->log->inflight.desc[i].inflight) { | ||
244 | + vq->resubmit_list[vq->resubmit_num].index = i; | ||
245 | + vq->resubmit_list[vq->resubmit_num].counter = | ||
246 | + vq->log->inflight.desc[i].counter; | ||
247 | + vq->resubmit_num++; | ||
248 | + } | ||
249 | + } | ||
250 | + | ||
251 | + if (vq->resubmit_num > 1) { | ||
252 | + qsort(vq->resubmit_list, vq->resubmit_num, | ||
253 | + sizeof(VduseVirtqInflightDesc), inflight_desc_compare); | ||
254 | + } | ||
255 | + vq->counter = vq->resubmit_list[0].counter + 1; | ||
256 | + } | ||
257 | + | ||
258 | + vduse_inject_irq(dev, vq->index); | ||
259 | + | ||
260 | + return 0; | ||
261 | +} | 134 | +} |
262 | + | 135 | + |
263 | +static int vduse_queue_inflight_get(VduseVirtq *vq, int desc_idx) | 136 | +static void test_blockjob_drain(void) |
264 | +{ | 137 | +{ |
265 | + vq->log->inflight.desc[desc_idx].counter = vq->counter++; | 138 | + test_blockjob_common(BDRV_DRAIN); |
266 | + | ||
267 | + barrier(); | ||
268 | + | ||
269 | + vq->log->inflight.desc[desc_idx].inflight = 1; | ||
270 | + | ||
271 | + return 0; | ||
272 | +} | 139 | +} |
273 | + | 140 | + |
274 | +static int vduse_queue_inflight_pre_put(VduseVirtq *vq, int desc_idx) | 141 | int main(int argc, char **argv) |
275 | +{ | 142 | { |
276 | + vq->log->inflight.last_batch_head = desc_idx; | 143 | bdrv_init(); |
144 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) | ||
145 | g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all); | ||
146 | g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain); | ||
147 | |||
148 | + g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all); | ||
149 | + g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain); | ||
277 | + | 150 | + |
278 | + return 0; | 151 | return g_test_run(); |
279 | +} | ||
280 | + | ||
281 | +static int vduse_queue_inflight_post_put(VduseVirtq *vq, int desc_idx) | ||
282 | +{ | ||
283 | + vq->log->inflight.desc[desc_idx].inflight = 0; | ||
284 | + | ||
285 | + barrier(); | ||
286 | + | ||
287 | + vq->log->inflight.used_idx = vq->used_idx; | ||
288 | + | ||
289 | + return 0; | ||
290 | +} | ||
291 | + | ||
292 | static void vduse_iova_remove_region(VduseDev *dev, uint64_t start, | ||
293 | uint64_t last) | ||
294 | { | ||
295 | @@ -XXX,XX +XXX,XX @@ void *vduse_queue_pop(VduseVirtq *vq, size_t sz) | ||
296 | unsigned int head; | ||
297 | VduseVirtqElement *elem; | ||
298 | VduseDev *dev = vq->dev; | ||
299 | + int i; | ||
300 | |||
301 | if (unlikely(!vq->vring.avail)) { | ||
302 | return NULL; | ||
303 | } | ||
304 | |||
305 | + if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) { | ||
306 | + i = (--vq->resubmit_num); | ||
307 | + elem = vduse_queue_map_desc(vq, vq->resubmit_list[i].index, sz); | ||
308 | + | ||
309 | + if (!vq->resubmit_num) { | ||
310 | + free(vq->resubmit_list); | ||
311 | + vq->resubmit_list = NULL; | ||
312 | + } | ||
313 | + | ||
314 | + return elem; | ||
315 | + } | ||
316 | + | ||
317 | if (vduse_queue_empty(vq)) { | ||
318 | return NULL; | ||
319 | } | ||
320 | @@ -XXX,XX +XXX,XX @@ void *vduse_queue_pop(VduseVirtq *vq, size_t sz) | ||
321 | |||
322 | vq->inuse++; | ||
323 | |||
324 | + vduse_queue_inflight_get(vq, head); | ||
325 | + | ||
326 | return elem; | ||
327 | } | 152 | } |
328 | |||
329 | @@ -XXX,XX +XXX,XX @@ void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem, | ||
330 | unsigned int len) | ||
331 | { | ||
332 | vduse_queue_fill(vq, elem, len, 0); | ||
333 | + vduse_queue_inflight_pre_put(vq, elem->index); | ||
334 | vduse_queue_flush(vq, 1); | ||
335 | + vduse_queue_inflight_post_put(vq, elem->index); | ||
336 | } | ||
337 | |||
338 | static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr, | ||
339 | @@ -XXX,XX +XXX,XX @@ static void vduse_queue_enable(VduseVirtq *vq) | ||
340 | } | ||
341 | |||
342 | vq->fd = fd; | ||
343 | - vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index; | ||
344 | - vq->inuse = 0; | ||
345 | - vq->used_idx = 0; | ||
346 | vq->signalled_used_valid = false; | ||
347 | vq->ready = true; | ||
348 | |||
349 | + if (vduse_queue_check_inflights(vq)) { | ||
350 | + fprintf(stderr, "Failed to check inflights for vq[%d]\n", vq->index); | ||
351 | + close(fd); | ||
352 | + return; | ||
353 | + } | ||
354 | + | ||
355 | dev->ops->enable_queue(dev, vq); | ||
356 | } | ||
357 | |||
358 | @@ -XXX,XX +XXX,XX @@ static void vduse_dev_start_dataplane(VduseDev *dev) | ||
359 | |||
360 | static void vduse_dev_stop_dataplane(VduseDev *dev) | ||
361 | { | ||
362 | + size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE); | ||
363 | int i; | ||
364 | |||
365 | for (i = 0; i < dev->num_queues; i++) { | ||
366 | vduse_queue_disable(&dev->vqs[i]); | ||
367 | } | ||
368 | + if (dev->log) { | ||
369 | + memset(dev->log, 0, log_size); | ||
370 | + } | ||
371 | dev->features = 0; | ||
372 | vduse_iova_remove_region(dev, 0, ULONG_MAX); | ||
373 | } | ||
374 | @@ -XXX,XX +XXX,XX @@ int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size) | ||
375 | return -errno; | ||
376 | } | ||
377 | |||
378 | + vduse_queue_enable(vq); | ||
379 | + | ||
380 | + return 0; | ||
381 | +} | ||
382 | + | ||
383 | +int vduse_set_reconnect_log_file(VduseDev *dev, const char *filename) | ||
384 | +{ | ||
385 | + | ||
386 | + size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE); | ||
387 | + void *log; | ||
388 | + int i; | ||
389 | + | ||
390 | + dev->log = log = vduse_log_get(filename, log_size); | ||
391 | + if (log == MAP_FAILED) { | ||
392 | + fprintf(stderr, "Failed to get vduse log\n"); | ||
393 | + return -EINVAL; | ||
394 | + } | ||
395 | + | ||
396 | + for (i = 0; i < dev->num_queues; i++) { | ||
397 | + dev->vqs[i].log = log; | ||
398 | + dev->vqs[i].log->inflight.desc_num = VIRTQUEUE_MAX_SIZE; | ||
399 | + log = (void *)((char *)log + vduse_vq_log_size(VIRTQUEUE_MAX_SIZE)); | ||
400 | + } | ||
401 | + | ||
402 | return 0; | ||
403 | } | ||
404 | |||
405 | @@ -XXX,XX +XXX,XX @@ static int vduse_dev_init(VduseDev *dev, const char *name, | ||
406 | return -errno; | ||
407 | } | ||
408 | |||
409 | + if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) { | ||
410 | + fprintf(stderr, "Failed to get features: %s\n", strerror(errno)); | ||
411 | + close(fd); | ||
412 | + return -errno; | ||
413 | + } | ||
414 | + | ||
415 | dev_name = strdup(name); | ||
416 | if (!dev_name) { | ||
417 | close(fd); | ||
418 | @@ -XXX,XX +XXX,XX @@ VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues, | ||
419 | return NULL; | ||
420 | } | ||
421 | |||
422 | + if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) { | ||
423 | + fprintf(stderr, "Failed to get features: %s\n", strerror(errno)); | ||
424 | + free(dev); | ||
425 | + return NULL; | ||
426 | + } | ||
427 | + | ||
428 | ret = vduse_dev_init_vqs(dev, num_queues); | ||
429 | if (ret) { | ||
430 | fprintf(stderr, "Failed to init vqs\n"); | ||
431 | @@ -XXX,XX +XXX,XX @@ VduseDev *vduse_dev_create(const char *name, uint32_t device_id, | ||
432 | |||
433 | ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config); | ||
434 | free(dev_config); | ||
435 | - if (ret < 0) { | ||
436 | + if (ret && errno != EEXIST) { | ||
437 | fprintf(stderr, "Failed to create vduse device %s: %s\n", | ||
438 | name, strerror(errno)); | ||
439 | goto err_dev; | ||
440 | @@ -XXX,XX +XXX,XX @@ err_ctrl: | ||
441 | |||
442 | int vduse_dev_destroy(VduseDev *dev) | ||
443 | { | ||
444 | - int ret = 0; | ||
445 | + size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE); | ||
446 | + int i, ret = 0; | ||
447 | |||
448 | + if (dev->log) { | ||
449 | + munmap(dev->log, log_size); | ||
450 | + } | ||
451 | + for (i = 0; i < dev->num_queues; i++) { | ||
452 | + free(dev->vqs[i].resubmit_list); | ||
453 | + } | ||
454 | free(dev->vqs); | ||
455 | if (dev->fd >= 0) { | ||
456 | close(dev->fd); | ||
457 | -- | 153 | -- |
458 | 2.35.3 | 154 | 2.13.6 |
155 | |||
156 | diff view generated by jsdifflib |
1 | From: Stefan Hajnoczi <stefanha@redhat.com> | 1 | Block jobs are already paused using the BdrvChildRole drain callbacks, |
---|---|---|---|
2 | so we don't need an additional block_job_pause_all() call. | ||
2 | 3 | ||
3 | Document vduse-blk exports in qemu-storage-daemon --help and the | ||
4 | qemu-storage-daemon(1) man page. | ||
5 | |||
6 | Based-on: <20220523084611.91-1-xieyongji@bytedance.com> | ||
7 | Cc: Xie Yongji <xieyongji@bytedance.com> | ||
8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | Message-Id: <20220525121947.859820-1-stefanha@redhat.com> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 4 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 5 | --- |
12 | docs/tools/qemu-storage-daemon.rst | 21 +++++++++++++++++++++ | 6 | block/io.c | 4 ---- |
13 | storage-daemon/qemu-storage-daemon.c | 9 +++++++++ | 7 | tests/test-bdrv-drain.c | 10 ++++------ |
14 | 2 files changed, 30 insertions(+) | 8 | 2 files changed, 4 insertions(+), 10 deletions(-) |
15 | 9 | ||
16 | diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst | 10 | diff --git a/block/io.c b/block/io.c |
17 | index XXXXXXX..XXXXXXX 100644 | 11 | index XXXXXXX..XXXXXXX 100644 |
18 | --- a/docs/tools/qemu-storage-daemon.rst | 12 | --- a/block/io.c |
19 | +++ b/docs/tools/qemu-storage-daemon.rst | 13 | +++ b/block/io.c |
20 | @@ -XXX,XX +XXX,XX @@ Standard options: | 14 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void) |
21 | --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>] | 15 | * context. */ |
22 | --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>] | 16 | assert(qemu_get_current_aio_context() == qemu_get_aio_context()); |
23 | --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto] | 17 | |
24 | + --export [type=]vduse-blk,id=<id>,node-name=<node-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>] | 18 | - block_job_pause_all(); |
25 | 19 | - | |
26 | is a block export definition. ``node-name`` is the block node that should be | 20 | for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { |
27 | exported. ``writable`` determines whether or not the export allows write | 21 | AioContext *aio_context = bdrv_get_aio_context(bs); |
28 | @@ -XXX,XX +XXX,XX @@ Standard options: | 22 | |
29 | ``allow-other`` to auto (the default) will try enabling this option, and on | 23 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void) |
30 | error fall back to disabling it. | 24 | aio_enable_external(aio_context); |
31 | 25 | aio_context_release(aio_context); | |
32 | + The ``vduse-blk`` export type uses the ``id`` as the VDUSE device name. | 26 | } |
33 | + ``num-queues`` sets the number of virtqueues (the default is 1). | 27 | - |
34 | + ``queue-size`` sets the virtqueue descriptor table size (the default is 256). | 28 | - block_job_resume_all(); |
35 | + | 29 | } |
36 | + The instantiated VDUSE device must then be added to the vDPA bus using the | 30 | |
37 | + vdpa(8) command from the iproute2 project:: | 31 | void bdrv_drain_all(void) |
38 | + | 32 | diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c |
39 | + # vdpa dev add name <id> mgmtdev vduse | ||
40 | + | ||
41 | + The device can be removed from the vDPA bus later as follows:: | ||
42 | + | ||
43 | + # vdpa dev del <id> | ||
44 | + | ||
45 | + For more information about attaching vDPA devices to the host with | ||
46 | + virtio_vdpa.ko or attaching them to guests with vhost_vdpa.ko, see | ||
47 | + https://vdpa-dev.gitlab.io/. | ||
48 | + | ||
49 | + For more information about VDUSE, see | ||
50 | + https://docs.kernel.org/userspace-api/vduse.html. | ||
51 | + | ||
52 | .. option:: --monitor MONITORDEF | ||
53 | |||
54 | is a QMP monitor definition. See the :manpage:`qemu(1)` manual page for | ||
55 | diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c | ||
56 | index XXXXXXX..XXXXXXX 100644 | 33 | index XXXXXXX..XXXXXXX 100644 |
57 | --- a/storage-daemon/qemu-storage-daemon.c | 34 | --- a/tests/test-bdrv-drain.c |
58 | +++ b/storage-daemon/qemu-storage-daemon.c | 35 | +++ b/tests/test-bdrv-drain.c |
59 | @@ -XXX,XX +XXX,XX @@ static void help(void) | 36 | @@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type) |
60 | " vhost-user-blk device over file descriptor\n" | 37 | do_drain_begin(drain_type, src); |
61 | "\n" | 38 | |
62 | #endif /* CONFIG_VHOST_USER_BLK_SERVER */ | 39 | if (drain_type == BDRV_DRAIN_ALL) { |
63 | +#ifdef CONFIG_VDUSE_BLK_EXPORT | 40 | - /* bdrv_drain_all() drains both src and target, and involves an |
64 | +" --export [type=]vduse-blk,id=<id>,node-name=<node-name>\n" | 41 | - * additional block_job_pause_all() */ |
65 | +" [,writable=on|off][,num-queues=<num-queues>]\n" | 42 | - g_assert_cmpint(job->pause_count, ==, 3); |
66 | +" [,queue-size=<queue-size>]\n" | 43 | + /* bdrv_drain_all() drains both src and target */ |
67 | +" [,logical-block-size=<logical-block-size>]\n" | 44 | + g_assert_cmpint(job->pause_count, ==, 2); |
68 | +" export the specified block node as a vduse-blk\n" | 45 | } else { |
69 | +" device using the id as the VDUSE device name\n" | 46 | g_assert_cmpint(job->pause_count, ==, 1); |
70 | +"\n" | 47 | } |
71 | +#endif /* CONFIG_VDUSE_BLK_EXPORT */ | 48 | @@ -XXX,XX +XXX,XX @@ static void test_blockjob_common(enum drain_type drain_type) |
72 | " --monitor [chardev=]name[,mode=control][,pretty[=on|off]]\n" | 49 | do_drain_begin(drain_type, target); |
73 | " configure a QMP monitor\n" | 50 | |
74 | "\n" | 51 | if (drain_type == BDRV_DRAIN_ALL) { |
52 | - /* bdrv_drain_all() drains both src and target, and involves an | ||
53 | - * additional block_job_pause_all() */ | ||
54 | - g_assert_cmpint(job->pause_count, ==, 3); | ||
55 | + /* bdrv_drain_all() drains both src and target */ | ||
56 | + g_assert_cmpint(job->pause_count, ==, 2); | ||
57 | } else { | ||
58 | g_assert_cmpint(job->pause_count, ==, 1); | ||
59 | } | ||
75 | -- | 60 | -- |
76 | 2.35.3 | 61 | 2.13.6 |
62 | |||
63 | diff view generated by jsdifflib |
1 | From: Xie Yongji <xieyongji@bytedance.com> | 1 | bdrv_do_drained_begin() restricts the call of parent callbacks and |
---|---|---|---|
2 | aio_disable_external() to the outermost drain section, but the block | ||
3 | driver callbacks are always called. bdrv_do_drained_end() must match | ||
4 | this behaviour, otherwise nodes stay drained even if begin/end calls | ||
5 | were balanced. | ||
2 | 6 | ||
3 | This adds vduse header to linux headers so that the | ||
4 | relevant VDUSE API can be used in subsequent patches. | ||
5 | |||
6 | Signed-off-by: Xie Yongji <xieyongji@bytedance.com> | ||
7 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
8 | Message-Id: <20220523084611.91-5-xieyongji@bytedance.com> | ||
9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 7 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
10 | --- | 8 | --- |
11 | linux-headers/linux/vduse.h | 306 ++++++++++++++++++++++++++++++++ | 9 | block/io.c | 12 +++++++----- |
12 | scripts/update-linux-headers.sh | 2 +- | 10 | 1 file changed, 7 insertions(+), 5 deletions(-) |
13 | 2 files changed, 307 insertions(+), 1 deletion(-) | ||
14 | create mode 100644 linux-headers/linux/vduse.h | ||
15 | 11 | ||
16 | diff --git a/linux-headers/linux/vduse.h b/linux-headers/linux/vduse.h | 12 | diff --git a/block/io.c b/block/io.c |
17 | new file mode 100644 | 13 | index XXXXXXX..XXXXXXX 100644 |
18 | index XXXXXXX..XXXXXXX | 14 | --- a/block/io.c |
19 | --- /dev/null | 15 | +++ b/block/io.c |
20 | +++ b/linux-headers/linux/vduse.h | 16 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs) |
21 | @@ -XXX,XX +XXX,XX @@ | 17 | |
22 | +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ | 18 | void bdrv_drained_end(BlockDriverState *bs) |
23 | +#ifndef _VDUSE_H_ | 19 | { |
24 | +#define _VDUSE_H_ | 20 | + int old_quiesce_counter; |
25 | + | 21 | + |
26 | +#include <linux/types.h> | 22 | if (qemu_in_coroutine()) { |
27 | + | 23 | bdrv_co_yield_to_drain(bs, false); |
28 | +#define VDUSE_BASE 0x81 | 24 | return; |
29 | + | 25 | } |
30 | +/* The ioctls for control device (/dev/vduse/control) */ | 26 | assert(bs->quiesce_counter > 0); |
31 | + | 27 | - if (atomic_fetch_dec(&bs->quiesce_counter) > 1) { |
32 | +#define VDUSE_API_VERSION 0 | 28 | - return; |
33 | + | 29 | - } |
34 | +/* | 30 | + old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter); |
35 | + * Get the version of VDUSE API that kernel supported (VDUSE_API_VERSION). | 31 | |
36 | + * This is used for future extension. | 32 | /* Re-enable things in child-to-parent order */ |
37 | + */ | 33 | bdrv_drain_invoke(bs, false, false); |
38 | +#define VDUSE_GET_API_VERSION _IOR(VDUSE_BASE, 0x00, __u64) | 34 | - bdrv_parent_drained_end(bs); |
39 | + | 35 | - aio_enable_external(bdrv_get_aio_context(bs)); |
40 | +/* Set the version of VDUSE API that userspace supported. */ | 36 | + if (old_quiesce_counter == 1) { |
41 | +#define VDUSE_SET_API_VERSION _IOW(VDUSE_BASE, 0x01, __u64) | 37 | + bdrv_parent_drained_end(bs); |
42 | + | 38 | + aio_enable_external(bdrv_get_aio_context(bs)); |
43 | +/** | 39 | + } |
44 | + * struct vduse_dev_config - basic configuration of a VDUSE device | 40 | } |
45 | + * @name: VDUSE device name, needs to be NUL terminated | 41 | |
46 | + * @vendor_id: virtio vendor id | 42 | /* |
47 | + * @device_id: virtio device id | ||
48 | + * @features: virtio features | ||
49 | + * @vq_num: the number of virtqueues | ||
50 | + * @vq_align: the allocation alignment of virtqueue's metadata | ||
51 | + * @reserved: for future use, needs to be initialized to zero | ||
52 | + * @config_size: the size of the configuration space | ||
53 | + * @config: the buffer of the configuration space | ||
54 | + * | ||
55 | + * Structure used by VDUSE_CREATE_DEV ioctl to create VDUSE device. | ||
56 | + */ | ||
57 | +struct vduse_dev_config { | ||
58 | +#define VDUSE_NAME_MAX 256 | ||
59 | + char name[VDUSE_NAME_MAX]; | ||
60 | + __u32 vendor_id; | ||
61 | + __u32 device_id; | ||
62 | + __u64 features; | ||
63 | + __u32 vq_num; | ||
64 | + __u32 vq_align; | ||
65 | + __u32 reserved[13]; | ||
66 | + __u32 config_size; | ||
67 | + __u8 config[]; | ||
68 | +}; | ||
69 | + | ||
70 | +/* Create a VDUSE device which is represented by a char device (/dev/vduse/$NAME) */ | ||
71 | +#define VDUSE_CREATE_DEV _IOW(VDUSE_BASE, 0x02, struct vduse_dev_config) | ||
72 | + | ||
73 | +/* | ||
74 | + * Destroy a VDUSE device. Make sure there are no more references | ||
75 | + * to the char device (/dev/vduse/$NAME). | ||
76 | + */ | ||
77 | +#define VDUSE_DESTROY_DEV _IOW(VDUSE_BASE, 0x03, char[VDUSE_NAME_MAX]) | ||
78 | + | ||
79 | +/* The ioctls for VDUSE device (/dev/vduse/$NAME) */ | ||
80 | + | ||
81 | +/** | ||
82 | + * struct vduse_iotlb_entry - entry of IOTLB to describe one IOVA region [start, last] | ||
83 | + * @offset: the mmap offset on returned file descriptor | ||
84 | + * @start: start of the IOVA region | ||
85 | + * @last: last of the IOVA region | ||
86 | + * @perm: access permission of the IOVA region | ||
87 | + * | ||
88 | + * Structure used by VDUSE_IOTLB_GET_FD ioctl to find an overlapped IOVA region. | ||
89 | + */ | ||
90 | +struct vduse_iotlb_entry { | ||
91 | + __u64 offset; | ||
92 | + __u64 start; | ||
93 | + __u64 last; | ||
94 | +#define VDUSE_ACCESS_RO 0x1 | ||
95 | +#define VDUSE_ACCESS_WO 0x2 | ||
96 | +#define VDUSE_ACCESS_RW 0x3 | ||
97 | + __u8 perm; | ||
98 | +}; | ||
99 | + | ||
100 | +/* | ||
101 | + * Find the first IOVA region that overlaps with the range [start, last] | ||
102 | + * and return the corresponding file descriptor. Return -EINVAL means the | ||
103 | + * IOVA region doesn't exist. Caller should set start and last fields. | ||
104 | + */ | ||
105 | +#define VDUSE_IOTLB_GET_FD _IOWR(VDUSE_BASE, 0x10, struct vduse_iotlb_entry) | ||
106 | + | ||
107 | +/* | ||
108 | + * Get the negotiated virtio features. It's a subset of the features in | ||
109 | + * struct vduse_dev_config which can be accepted by virtio driver. It's | ||
110 | + * only valid after FEATURES_OK status bit is set. | ||
111 | + */ | ||
112 | +#define VDUSE_DEV_GET_FEATURES _IOR(VDUSE_BASE, 0x11, __u64) | ||
113 | + | ||
114 | +/** | ||
115 | + * struct vduse_config_data - data used to update configuration space | ||
116 | + * @offset: the offset from the beginning of configuration space | ||
117 | + * @length: the length to write to configuration space | ||
118 | + * @buffer: the buffer used to write from | ||
119 | + * | ||
120 | + * Structure used by VDUSE_DEV_SET_CONFIG ioctl to update device | ||
121 | + * configuration space. | ||
122 | + */ | ||
123 | +struct vduse_config_data { | ||
124 | + __u32 offset; | ||
125 | + __u32 length; | ||
126 | + __u8 buffer[]; | ||
127 | +}; | ||
128 | + | ||
129 | +/* Set device configuration space */ | ||
130 | +#define VDUSE_DEV_SET_CONFIG _IOW(VDUSE_BASE, 0x12, struct vduse_config_data) | ||
131 | + | ||
132 | +/* | ||
133 | + * Inject a config interrupt. It's usually used to notify virtio driver | ||
134 | + * that device configuration space has changed. | ||
135 | + */ | ||
136 | +#define VDUSE_DEV_INJECT_CONFIG_IRQ _IO(VDUSE_BASE, 0x13) | ||
137 | + | ||
138 | +/** | ||
139 | + * struct vduse_vq_config - basic configuration of a virtqueue | ||
140 | + * @index: virtqueue index | ||
141 | + * @max_size: the max size of virtqueue | ||
142 | + * @reserved: for future use, needs to be initialized to zero | ||
143 | + * | ||
144 | + * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue. | ||
145 | + */ | ||
146 | +struct vduse_vq_config { | ||
147 | + __u32 index; | ||
148 | + __u16 max_size; | ||
149 | + __u16 reserved[13]; | ||
150 | +}; | ||
151 | + | ||
152 | +/* | ||
153 | + * Set up the specified virtqueue. Make sure all virtqueues have been | ||
154 | + * configured before the device is attached to the vDPA bus. | ||
155 | + */ | ||
156 | +#define VDUSE_VQ_SETUP _IOW(VDUSE_BASE, 0x14, struct vduse_vq_config) | ||
157 | + | ||
158 | +/** | ||
159 | + * struct vduse_vq_state_split - split virtqueue state | ||
160 | + * @avail_index: available index | ||
161 | + */ | ||
162 | +struct vduse_vq_state_split { | ||
163 | + __u16 avail_index; | ||
164 | +}; | ||
165 | + | ||
166 | +/** | ||
167 | + * struct vduse_vq_state_packed - packed virtqueue state | ||
168 | + * @last_avail_counter: last driver ring wrap counter observed by device | ||
169 | + * @last_avail_idx: device available index | ||
170 | + * @last_used_counter: device ring wrap counter | ||
171 | + * @last_used_idx: used index | ||
172 | + */ | ||
173 | +struct vduse_vq_state_packed { | ||
174 | + __u16 last_avail_counter; | ||
175 | + __u16 last_avail_idx; | ||
176 | + __u16 last_used_counter; | ||
177 | + __u16 last_used_idx; | ||
178 | +}; | ||
179 | + | ||
180 | +/** | ||
181 | + * struct vduse_vq_info - information of a virtqueue | ||
182 | + * @index: virtqueue index | ||
183 | + * @num: the size of virtqueue | ||
184 | + * @desc_addr: address of desc area | ||
185 | + * @driver_addr: address of driver area | ||
186 | + * @device_addr: address of device area | ||
187 | + * @split: split virtqueue state | ||
188 | + * @packed: packed virtqueue state | ||
189 | + * @ready: ready status of virtqueue | ||
190 | + * | ||
191 | + * Structure used by VDUSE_VQ_GET_INFO ioctl to get virtqueue's information. | ||
192 | + */ | ||
193 | +struct vduse_vq_info { | ||
194 | + __u32 index; | ||
195 | + __u32 num; | ||
196 | + __u64 desc_addr; | ||
197 | + __u64 driver_addr; | ||
198 | + __u64 device_addr; | ||
199 | + union { | ||
200 | + struct vduse_vq_state_split split; | ||
201 | + struct vduse_vq_state_packed packed; | ||
202 | + }; | ||
203 | + __u8 ready; | ||
204 | +}; | ||
205 | + | ||
206 | +/* Get the specified virtqueue's information. Caller should set index field. */ | ||
207 | +#define VDUSE_VQ_GET_INFO _IOWR(VDUSE_BASE, 0x15, struct vduse_vq_info) | ||
208 | + | ||
209 | +/** | ||
210 | + * struct vduse_vq_eventfd - eventfd configuration for a virtqueue | ||
211 | + * @index: virtqueue index | ||
212 | + * @fd: eventfd, -1 means de-assigning the eventfd | ||
213 | + * | ||
214 | + * Structure used by VDUSE_VQ_SETUP_KICKFD ioctl to setup kick eventfd. | ||
215 | + */ | ||
216 | +struct vduse_vq_eventfd { | ||
217 | + __u32 index; | ||
218 | +#define VDUSE_EVENTFD_DEASSIGN -1 | ||
219 | + int fd; | ||
220 | +}; | ||
221 | + | ||
222 | +/* | ||
223 | + * Set up the kick eventfd for the specified virtqueue. It is used by the | ||
224 | + * VDUSE kernel module to notify userspace to consume the avail vring. | ||
225 | + */ | ||
226 | +#define VDUSE_VQ_SETUP_KICKFD _IOW(VDUSE_BASE, 0x16, struct vduse_vq_eventfd) | ||
227 | + | ||
228 | +/* | ||
229 | + * Inject an interrupt for the specified virtqueue. It's used to notify the | ||
230 | + * virtio driver to consume the used vring. | ||
231 | + */ | ||
232 | +#define VDUSE_VQ_INJECT_IRQ _IOW(VDUSE_BASE, 0x17, __u32) | ||
233 | + | ||
234 | +/* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */ | ||
235 | + | ||
236 | +/** | ||
237 | + * enum vduse_req_type - request type | ||
238 | + * @VDUSE_GET_VQ_STATE: get the state for specified virtqueue from userspace | ||
239 | + * @VDUSE_SET_STATUS: set the device status | ||
240 | + * @VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for | ||
241 | + * specified IOVA range via VDUSE_IOTLB_GET_FD ioctl | ||
242 | + */ | ||
243 | +enum vduse_req_type { | ||
244 | + VDUSE_GET_VQ_STATE, | ||
245 | + VDUSE_SET_STATUS, | ||
246 | + VDUSE_UPDATE_IOTLB, | ||
247 | +}; | ||
248 | + | ||
249 | +/** | ||
250 | + * struct vduse_vq_state - virtqueue state | ||
251 | + * @index: virtqueue index | ||
252 | + * @split: split virtqueue state | ||
253 | + * @packed: packed virtqueue state | ||
254 | + */ | ||
255 | +struct vduse_vq_state { | ||
256 | + __u32 index; | ||
257 | + union { | ||
258 | + struct vduse_vq_state_split split; | ||
259 | + struct vduse_vq_state_packed packed; | ||
260 | + }; | ||
261 | +}; | ||
262 | + | ||
263 | +/** | ||
264 | + * struct vduse_dev_status - device status | ||
265 | + * @status: device status | ||
266 | + */ | ||
267 | +struct vduse_dev_status { | ||
268 | + __u8 status; | ||
269 | +}; | ||
270 | + | ||
271 | +/** | ||
272 | + * struct vduse_iova_range - IOVA range [start, last] | ||
273 | + * @start: start of the IOVA range | ||
274 | + * @last: last of the IOVA range | ||
275 | + */ | ||
276 | +struct vduse_iova_range { | ||
277 | + __u64 start; | ||
278 | + __u64 last; | ||
279 | +}; | ||
280 | + | ||
281 | +/** | ||
282 | + * struct vduse_dev_request - control request | ||
283 | + * @type: request type | ||
284 | + * @request_id: request id | ||
285 | + * @reserved: for future use | ||
286 | + * @vq_state: virtqueue state, only index field is available | ||
287 | + * @s: device status | ||
288 | + * @iova: IOVA range for updating | ||
289 | + * @padding: padding | ||
290 | + * | ||
291 | + * Structure used by read(2) on /dev/vduse/$NAME. | ||
292 | + */ | ||
293 | +struct vduse_dev_request { | ||
294 | + __u32 type; | ||
295 | + __u32 request_id; | ||
296 | + __u32 reserved[4]; | ||
297 | + union { | ||
298 | + struct vduse_vq_state vq_state; | ||
299 | + struct vduse_dev_status s; | ||
300 | + struct vduse_iova_range iova; | ||
301 | + __u32 padding[32]; | ||
302 | + }; | ||
303 | +}; | ||
304 | + | ||
305 | +/** | ||
306 | + * struct vduse_dev_response - response to control request | ||
307 | + * @request_id: corresponding request id | ||
308 | + * @result: the result of request | ||
309 | + * @reserved: for future use, needs to be initialized to zero | ||
310 | + * @vq_state: virtqueue state | ||
311 | + * @padding: padding | ||
312 | + * | ||
313 | + * Structure used by write(2) on /dev/vduse/$NAME. | ||
314 | + */ | ||
315 | +struct vduse_dev_response { | ||
316 | + __u32 request_id; | ||
317 | +#define VDUSE_REQ_RESULT_OK 0x00 | ||
318 | +#define VDUSE_REQ_RESULT_FAILED 0x01 | ||
319 | + __u32 result; | ||
320 | + __u32 reserved[4]; | ||
321 | + union { | ||
322 | + struct vduse_vq_state vq_state; | ||
323 | + __u32 padding[32]; | ||
324 | + }; | ||
325 | +}; | ||
326 | + | ||
327 | +#endif /* _VDUSE_H_ */ | ||
328 | diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh | ||
329 | index XXXXXXX..XXXXXXX 100755 | ||
330 | --- a/scripts/update-linux-headers.sh | ||
331 | +++ b/scripts/update-linux-headers.sh | ||
332 | @@ -XXX,XX +XXX,XX @@ done | ||
333 | rm -rf "$output/linux-headers/linux" | ||
334 | mkdir -p "$output/linux-headers/linux" | ||
335 | for header in kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h \ | ||
336 | - psci.h psp-sev.h userfaultfd.h mman.h; do | ||
337 | + psci.h psp-sev.h userfaultfd.h mman.h vduse.h; do | ||
338 | cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux" | ||
339 | done | ||
340 | |||
341 | -- | 43 | -- |
342 | 2.35.3 | 44 | 2.13.6 |
45 | |||
46 | diff view generated by jsdifflib |
1 | From: Xie Yongji <xieyongji@bytedance.com> | ||
---|---|---|---|
2 | |||
3 | To support block resize, this uses vduse_dev_update_config() | ||
4 | to update the capacity field in configuration space and inject | ||
5 | a config interrupt in the block resize callback. | ||
6 | |||
7 | Signed-off-by: Xie Yongji <xieyongji@bytedance.com> | ||
8 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | Message-Id: <20220523084611.91-8-xieyongji@bytedance.com> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 1 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 2 | --- |
12 | block/export/vduse-blk.c | 20 ++++++++++++++++++++ | 3 | tests/test-bdrv-drain.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ |
13 | 1 file changed, 20 insertions(+) | 4 | 1 file changed, 57 insertions(+) |
14 | 5 | ||
15 | diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c | 6 | diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c |
16 | index XXXXXXX..XXXXXXX 100644 | 7 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/export/vduse-blk.c | 8 | --- a/tests/test-bdrv-drain.c |
18 | +++ b/block/export/vduse-blk.c | 9 | +++ b/tests/test-bdrv-drain.c |
19 | @@ -XXX,XX +XXX,XX @@ static void blk_aio_detach(void *opaque) | 10 | @@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret) |
20 | vblk_exp->export.ctx = NULL; | 11 | enum drain_type { |
12 | BDRV_DRAIN_ALL, | ||
13 | BDRV_DRAIN, | ||
14 | + DRAIN_TYPE_MAX, | ||
15 | }; | ||
16 | |||
17 | static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs) | ||
18 | @@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void) | ||
19 | test_quiesce_common(BDRV_DRAIN, false); | ||
21 | } | 20 | } |
22 | 21 | ||
23 | +static void vduse_blk_resize(void *opaque) | 22 | +static void test_nested(void) |
24 | +{ | 23 | +{ |
25 | + BlockExport *exp = opaque; | 24 | + BlockBackend *blk; |
26 | + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); | 25 | + BlockDriverState *bs, *backing; |
27 | + struct virtio_blk_config config; | 26 | + BDRVTestState *s, *backing_s; |
27 | + enum drain_type outer, inner; | ||
28 | + | 28 | + |
29 | + config.capacity = | 29 | + blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); |
30 | + cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS); | 30 | + bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR, |
31 | + vduse_dev_update_config(vblk_exp->dev, sizeof(config.capacity), | 31 | + &error_abort); |
32 | + offsetof(struct virtio_blk_config, capacity), | 32 | + s = bs->opaque; |
33 | + (char *)&config.capacity); | 33 | + blk_insert_bs(blk, bs, &error_abort); |
34 | + | ||
35 | + backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort); | ||
36 | + backing_s = backing->opaque; | ||
37 | + bdrv_set_backing_hd(bs, backing, &error_abort); | ||
38 | + | ||
39 | + for (outer = 0; outer < DRAIN_TYPE_MAX; outer++) { | ||
40 | + for (inner = 0; inner < DRAIN_TYPE_MAX; inner++) { | ||
41 | + /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */ | ||
42 | + int bs_quiesce = (outer != BDRV_DRAIN_ALL) + | ||
43 | + (inner != BDRV_DRAIN_ALL); | ||
44 | + int backing_quiesce = 0; | ||
45 | + int backing_cb_cnt = (outer != BDRV_DRAIN) + | ||
46 | + (inner != BDRV_DRAIN); | ||
47 | + | ||
48 | + g_assert_cmpint(bs->quiesce_counter, ==, 0); | ||
49 | + g_assert_cmpint(backing->quiesce_counter, ==, 0); | ||
50 | + g_assert_cmpint(s->drain_count, ==, 0); | ||
51 | + g_assert_cmpint(backing_s->drain_count, ==, 0); | ||
52 | + | ||
53 | + do_drain_begin(outer, bs); | ||
54 | + do_drain_begin(inner, bs); | ||
55 | + | ||
56 | + g_assert_cmpint(bs->quiesce_counter, ==, bs_quiesce); | ||
57 | + g_assert_cmpint(backing->quiesce_counter, ==, backing_quiesce); | ||
58 | + g_assert_cmpint(s->drain_count, ==, 2); | ||
59 | + g_assert_cmpint(backing_s->drain_count, ==, backing_cb_cnt); | ||
60 | + | ||
61 | + do_drain_end(inner, bs); | ||
62 | + do_drain_end(outer, bs); | ||
63 | + | ||
64 | + g_assert_cmpint(bs->quiesce_counter, ==, 0); | ||
65 | + g_assert_cmpint(backing->quiesce_counter, ==, 0); | ||
66 | + g_assert_cmpint(s->drain_count, ==, 0); | ||
67 | + g_assert_cmpint(backing_s->drain_count, ==, 0); | ||
68 | + } | ||
69 | + } | ||
70 | + | ||
71 | + bdrv_unref(backing); | ||
72 | + bdrv_unref(bs); | ||
73 | + blk_unref(blk); | ||
34 | +} | 74 | +} |
35 | + | 75 | + |
36 | +static const BlockDevOps vduse_block_ops = { | 76 | |
37 | + .resize_cb = vduse_blk_resize, | 77 | typedef struct TestBlockJob { |
38 | +}; | 78 | BlockJob common; |
79 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) | ||
80 | g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all); | ||
81 | g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain); | ||
82 | |||
83 | + g_test_add_func("/bdrv-drain/nested", test_nested); | ||
39 | + | 84 | + |
40 | static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | 85 | g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all); |
41 | Error **errp) | 86 | g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain); |
42 | { | ||
43 | @@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | ||
44 | blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, | ||
45 | vblk_exp); | ||
46 | |||
47 | + blk_set_dev_ops(exp->blk, &vduse_block_ops, exp); | ||
48 | + | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | @@ -XXX,XX +XXX,XX @@ static void vduse_blk_exp_delete(BlockExport *exp) | ||
53 | |||
54 | blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, | ||
55 | vblk_exp); | ||
56 | + blk_set_dev_ops(exp->blk, NULL, NULL); | ||
57 | vduse_dev_destroy(vblk_exp->dev); | ||
58 | } | ||
59 | 87 | ||
60 | -- | 88 | -- |
61 | 2.35.3 | 89 | 2.13.6 |
90 | |||
91 | diff view generated by jsdifflib |
1 | From: Emanuele Giuseppe Esposito <eesposit@redhat.com> | 1 | This is in preparation for subtree drains, i.e. drained sections that |
---|---|---|---|
2 | 2 | affect not only a single node, but recursively all child nodes, too. | |
3 | It seems that aio_wait_kick always required a memory barrier | 3 | |
4 | or atomic operation in the caller, but nobody actually | 4 | Calling the parent callbacks for drain is pointless when we just came |
5 | took care of doing it. | 5 | from that parent node recursively and leads to multiple increases of |
6 | 6 | bs->quiesce_counter in a single drain call. Don't do it. | |
7 | Let's put the barrier in the function instead, and pair it | 7 | |
8 | with another one in AIO_WAIT_WHILE. Read aio_wait_kick() | 8 | In order for this to work correctly, the parent callback must be called |
9 | comment for further explanation. | 9 | for every bdrv_drain_begin/end() call, not only for the outermost one: |
10 | 10 | ||
11 | Suggested-by: Paolo Bonzini <pbonzini@redhat.com> | 11 | If we have a node N with two parents A and B, recursive draining of A |
12 | Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com> | 12 | should cause the quiesce_counter of B to increase because its child N is |
13 | Message-Id: <20220524173054.12651-1-eesposit@redhat.com> | 13 | drained independently of B. If now B is recursively drained, too, A must |
14 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> | 14 | increase its quiesce_counter because N is drained independently of A |
15 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 15 | only now, even if N is going from quiesce_counter 1 to 2. |
16 | |||
16 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 17 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
17 | --- | 18 | --- |
18 | include/block/aio-wait.h | 2 ++ | 19 | include/block/block.h | 4 ++-- |
19 | util/aio-wait.c | 16 +++++++++++++++- | 20 | block.c | 13 +++++++++---- |
20 | 2 files changed, 17 insertions(+), 1 deletion(-) | 21 | block/io.c | 47 ++++++++++++++++++++++++++++++++++------------- |
21 | 22 | 3 files changed, 45 insertions(+), 19 deletions(-) | |
22 | diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h | 23 | |
24 | diff --git a/include/block/block.h b/include/block/block.h | ||
23 | index XXXXXXX..XXXXXXX 100644 | 25 | index XXXXXXX..XXXXXXX 100644 |
24 | --- a/include/block/aio-wait.h | 26 | --- a/include/block/block.h |
25 | +++ b/include/block/aio-wait.h | 27 | +++ b/include/block/block.h |
26 | @@ -XXX,XX +XXX,XX @@ extern AioWait global_aio_wait; | 28 | @@ -XXX,XX +XXX,XX @@ void bdrv_io_unplug(BlockDriverState *bs); |
27 | AioContext *ctx_ = (ctx); \ | 29 | * Begin a quiesced section of all users of @bs. This is part of |
28 | /* Increment wait_->num_waiters before evaluating cond. */ \ | 30 | * bdrv_drained_begin. |
29 | qatomic_inc(&wait_->num_waiters); \ | 31 | */ |
30 | + /* Paired with smp_mb in aio_wait_kick(). */ \ | 32 | -void bdrv_parent_drained_begin(BlockDriverState *bs); |
31 | + smp_mb(); \ | 33 | +void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore); |
32 | if (ctx_ && in_aio_context_home_thread(ctx_)) { \ | 34 | |
33 | while ((cond)) { \ | 35 | /** |
34 | aio_poll(ctx_, true); \ | 36 | * bdrv_parent_drained_end: |
35 | diff --git a/util/aio-wait.c b/util/aio-wait.c | 37 | @@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_begin(BlockDriverState *bs); |
38 | * End a quiesced section of all users of @bs. This is part of | ||
39 | * bdrv_drained_end. | ||
40 | */ | ||
41 | -void bdrv_parent_drained_end(BlockDriverState *bs); | ||
42 | +void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore); | ||
43 | |||
44 | /** | ||
45 | * bdrv_drained_begin: | ||
46 | diff --git a/block.c b/block.c | ||
36 | index XXXXXXX..XXXXXXX 100644 | 47 | index XXXXXXX..XXXXXXX 100644 |
37 | --- a/util/aio-wait.c | 48 | --- a/block.c |
38 | +++ b/util/aio-wait.c | 49 | +++ b/block.c |
39 | @@ -XXX,XX +XXX,XX @@ static void dummy_bh_cb(void *opaque) | 50 | @@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child, |
40 | 51 | BlockDriverState *new_bs) | |
41 | void aio_wait_kick(void) | 52 | { |
42 | { | 53 | BlockDriverState *old_bs = child->bs; |
43 | - /* The barrier (or an atomic op) is in the caller. */ | 54 | + int i; |
44 | + /* | 55 | |
45 | + * Paired with smp_mb in AIO_WAIT_WHILE. Here we have: | 56 | if (old_bs && new_bs) { |
46 | + * write(condition); | 57 | assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs)); |
47 | + * aio_wait_kick() { | 58 | } |
48 | + * smp_mb(); | 59 | if (old_bs) { |
49 | + * read(num_waiters); | 60 | if (old_bs->quiesce_counter && child->role->drained_end) { |
50 | + * } | 61 | - child->role->drained_end(child); |
51 | + * | 62 | + for (i = 0; i < old_bs->quiesce_counter; i++) { |
52 | + * And in AIO_WAIT_WHILE: | 63 | + child->role->drained_end(child); |
53 | + * write(num_waiters); | 64 | + } |
54 | + * smp_mb(); | 65 | } |
55 | + * read(condition); | 66 | if (child->role->detach) { |
56 | + */ | 67 | child->role->detach(child); |
57 | + smp_mb(); | 68 | @@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child, |
69 | if (new_bs) { | ||
70 | QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent); | ||
71 | if (new_bs->quiesce_counter && child->role->drained_begin) { | ||
72 | - child->role->drained_begin(child); | ||
73 | + for (i = 0; i < new_bs->quiesce_counter; i++) { | ||
74 | + child->role->drained_begin(child); | ||
75 | + } | ||
76 | } | ||
77 | |||
78 | if (child->role->attach) { | ||
79 | @@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context) | ||
80 | AioContext *ctx = bdrv_get_aio_context(bs); | ||
81 | |||
82 | aio_disable_external(ctx); | ||
83 | - bdrv_parent_drained_begin(bs); | ||
84 | + bdrv_parent_drained_begin(bs, NULL); | ||
85 | bdrv_drain(bs); /* ensure there are no in-flight requests */ | ||
86 | |||
87 | while (aio_poll(ctx, false)) { | ||
88 | @@ -XXX,XX +XXX,XX @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context) | ||
89 | */ | ||
90 | aio_context_acquire(new_context); | ||
91 | bdrv_attach_aio_context(bs, new_context); | ||
92 | - bdrv_parent_drained_end(bs); | ||
93 | + bdrv_parent_drained_end(bs, NULL); | ||
94 | aio_enable_external(ctx); | ||
95 | aio_context_release(new_context); | ||
96 | } | ||
97 | diff --git a/block/io.c b/block/io.c | ||
98 | index XXXXXXX..XXXXXXX 100644 | ||
99 | --- a/block/io.c | ||
100 | +++ b/block/io.c | ||
101 | @@ -XXX,XX +XXX,XX @@ | ||
102 | static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, | ||
103 | int64_t offset, int bytes, BdrvRequestFlags flags); | ||
104 | |||
105 | -void bdrv_parent_drained_begin(BlockDriverState *bs) | ||
106 | +void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore) | ||
107 | { | ||
108 | BdrvChild *c, *next; | ||
109 | |||
110 | QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { | ||
111 | + if (c == ignore) { | ||
112 | + continue; | ||
113 | + } | ||
114 | if (c->role->drained_begin) { | ||
115 | c->role->drained_begin(c); | ||
116 | } | ||
117 | } | ||
118 | } | ||
119 | |||
120 | -void bdrv_parent_drained_end(BlockDriverState *bs) | ||
121 | +void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore) | ||
122 | { | ||
123 | BdrvChild *c, *next; | ||
124 | |||
125 | QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { | ||
126 | + if (c == ignore) { | ||
127 | + continue; | ||
128 | + } | ||
129 | if (c->role->drained_end) { | ||
130 | c->role->drained_end(c); | ||
131 | } | ||
132 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
133 | BlockDriverState *bs; | ||
134 | bool done; | ||
135 | bool begin; | ||
136 | + BdrvChild *parent; | ||
137 | } BdrvCoDrainData; | ||
138 | |||
139 | static void coroutine_fn bdrv_drain_invoke_entry(void *opaque) | ||
140 | @@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs) | ||
141 | return waited; | ||
142 | } | ||
143 | |||
144 | +static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent); | ||
145 | +static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent); | ||
58 | + | 146 | + |
59 | if (qatomic_read(&global_aio_wait.num_waiters)) { | 147 | static void bdrv_co_drain_bh_cb(void *opaque) |
60 | aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL); | 148 | { |
149 | BdrvCoDrainData *data = opaque; | ||
150 | @@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque) | ||
151 | |||
152 | bdrv_dec_in_flight(bs); | ||
153 | if (data->begin) { | ||
154 | - bdrv_drained_begin(bs); | ||
155 | + bdrv_do_drained_begin(bs, data->parent); | ||
156 | } else { | ||
157 | - bdrv_drained_end(bs); | ||
158 | + bdrv_do_drained_end(bs, data->parent); | ||
159 | } | ||
160 | |||
161 | data->done = true; | ||
162 | @@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque) | ||
163 | } | ||
164 | |||
165 | static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, | ||
166 | - bool begin) | ||
167 | + bool begin, BdrvChild *parent) | ||
168 | { | ||
169 | BdrvCoDrainData data; | ||
170 | |||
171 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, | ||
172 | .bs = bs, | ||
173 | .done = false, | ||
174 | .begin = begin, | ||
175 | + .parent = parent, | ||
176 | }; | ||
177 | bdrv_inc_in_flight(bs); | ||
178 | aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), | ||
179 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, | ||
180 | assert(data.done); | ||
181 | } | ||
182 | |||
183 | -void bdrv_drained_begin(BlockDriverState *bs) | ||
184 | +static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent) | ||
185 | { | ||
186 | if (qemu_in_coroutine()) { | ||
187 | - bdrv_co_yield_to_drain(bs, true); | ||
188 | + bdrv_co_yield_to_drain(bs, true, parent); | ||
189 | return; | ||
190 | } | ||
191 | |||
192 | /* Stop things in parent-to-child order */ | ||
193 | if (atomic_fetch_inc(&bs->quiesce_counter) == 0) { | ||
194 | aio_disable_external(bdrv_get_aio_context(bs)); | ||
195 | - bdrv_parent_drained_begin(bs); | ||
196 | } | ||
197 | |||
198 | + bdrv_parent_drained_begin(bs, parent); | ||
199 | bdrv_drain_invoke(bs, true, false); | ||
200 | bdrv_drain_recurse(bs); | ||
201 | } | ||
202 | |||
203 | -void bdrv_drained_end(BlockDriverState *bs) | ||
204 | +void bdrv_drained_begin(BlockDriverState *bs) | ||
205 | +{ | ||
206 | + bdrv_do_drained_begin(bs, NULL); | ||
207 | +} | ||
208 | + | ||
209 | +static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent) | ||
210 | { | ||
211 | int old_quiesce_counter; | ||
212 | |||
213 | if (qemu_in_coroutine()) { | ||
214 | - bdrv_co_yield_to_drain(bs, false); | ||
215 | + bdrv_co_yield_to_drain(bs, false, parent); | ||
216 | return; | ||
217 | } | ||
218 | assert(bs->quiesce_counter > 0); | ||
219 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_end(BlockDriverState *bs) | ||
220 | |||
221 | /* Re-enable things in child-to-parent order */ | ||
222 | bdrv_drain_invoke(bs, false, false); | ||
223 | + bdrv_parent_drained_end(bs, parent); | ||
224 | if (old_quiesce_counter == 1) { | ||
225 | - bdrv_parent_drained_end(bs); | ||
226 | aio_enable_external(bdrv_get_aio_context(bs)); | ||
227 | } | ||
228 | } | ||
229 | |||
230 | +void bdrv_drained_end(BlockDriverState *bs) | ||
231 | +{ | ||
232 | + bdrv_do_drained_end(bs, NULL); | ||
233 | +} | ||
234 | + | ||
235 | /* | ||
236 | * Wait for pending requests to complete on a single BlockDriverState subtree, | ||
237 | * and suspend block driver's internal I/O until next request arrives. | ||
238 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_begin(void) | ||
239 | /* Stop things in parent-to-child order */ | ||
240 | aio_context_acquire(aio_context); | ||
241 | aio_disable_external(aio_context); | ||
242 | - bdrv_parent_drained_begin(bs); | ||
243 | + bdrv_parent_drained_begin(bs, NULL); | ||
244 | bdrv_drain_invoke(bs, true, true); | ||
245 | aio_context_release(aio_context); | ||
246 | |||
247 | @@ -XXX,XX +XXX,XX @@ void bdrv_drain_all_end(void) | ||
248 | /* Re-enable things in child-to-parent order */ | ||
249 | aio_context_acquire(aio_context); | ||
250 | bdrv_drain_invoke(bs, false, true); | ||
251 | - bdrv_parent_drained_end(bs); | ||
252 | + bdrv_parent_drained_end(bs, NULL); | ||
253 | aio_enable_external(aio_context); | ||
254 | aio_context_release(aio_context); | ||
61 | } | 255 | } |
62 | -- | 256 | -- |
63 | 2.35.3 | 257 | 2.13.6 |
258 | |||
259 | diff view generated by jsdifflib |
1 | From: Stefano Garzarella <sgarzare@redhat.com> | 1 | bdrv_drained_begin() waits for the completion of requests in the whole |
---|---|---|---|
2 | subtree, but it only actually keeps its immediate bs parameter quiesced | ||
3 | until bdrv_drained_end(). | ||
2 | 4 | ||
3 | If the namespace does not exist, rbd_create() fails with -ENOENT and | 5 | Add a version that keeps the whole subtree drained. As of this commit, |
4 | QEMU reports a generic "error rbd create: No such file or directory": | 6 | graph changes cannot be allowed during a subtree drained section, but |
7 | this will be fixed soon. | ||
5 | 8 | ||
6 | $ qemu-img create rbd:rbd/namespace/image 1M | ||
7 | Formatting 'rbd:rbd/namespace/image', fmt=raw size=1048576 | ||
8 | qemu-img: rbd:rbd/namespace/image: error rbd create: No such file or directory | ||
9 | |||
10 | Unfortunately rados_ioctx_set_namespace() does not fail if the namespace | ||
11 | does not exist, so let's use rbd_namespace_exists() in qemu_rbd_connect() | ||
12 | to check if the namespace exists, reporting a more understandable error: | ||
13 | |||
14 | $ qemu-img create rbd:rbd/namespace/image 1M | ||
15 | Formatting 'rbd:rbd/namespace/image', fmt=raw size=1048576 | ||
16 | qemu-img: rbd:rbd/namespace/image: namespace 'namespace' does not exist | ||
17 | |||
18 | Reported-by: Tingting Mao <timao@redhat.com> | ||
19 | Reviewed-by: Ilya Dryomov <idryomov@gmail.com> | ||
20 | Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> | ||
21 | Message-Id: <20220517071012.6120-1-sgarzare@redhat.com> | ||
22 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
23 | --- | 10 | --- |
24 | block/rbd.c | 24 ++++++++++++++++++++++++ | 11 | include/block/block.h | 13 +++++++++++++ |
25 | meson.build | 6 ++++++ | 12 | block/io.c | 54 ++++++++++++++++++++++++++++++++++++++++----------- |
26 | 2 files changed, 30 insertions(+) | 13 | 2 files changed, 56 insertions(+), 11 deletions(-) |
27 | 14 | ||
28 | diff --git a/block/rbd.c b/block/rbd.c | 15 | diff --git a/include/block/block.h b/include/block/block.h |
29 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
30 | --- a/block/rbd.c | 17 | --- a/include/block/block.h |
31 | +++ b/block/rbd.c | 18 | +++ b/include/block/block.h |
32 | @@ -XXX,XX +XXX,XX @@ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx, | 19 | @@ -XXX,XX +XXX,XX @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore); |
33 | error_setg_errno(errp, -r, "error opening pool %s", opts->pool); | 20 | void bdrv_drained_begin(BlockDriverState *bs); |
34 | goto failed_shutdown; | 21 | |
22 | /** | ||
23 | + * Like bdrv_drained_begin, but recursively begins a quiesced section for | ||
24 | + * exclusive access to all child nodes as well. | ||
25 | + * | ||
26 | + * Graph changes are not allowed during a subtree drain section. | ||
27 | + */ | ||
28 | +void bdrv_subtree_drained_begin(BlockDriverState *bs); | ||
29 | + | ||
30 | +/** | ||
31 | * bdrv_drained_end: | ||
32 | * | ||
33 | * End a quiescent section started by bdrv_drained_begin(). | ||
34 | */ | ||
35 | void bdrv_drained_end(BlockDriverState *bs); | ||
36 | |||
37 | +/** | ||
38 | + * End a quiescent section started by bdrv_subtree_drained_begin(). | ||
39 | + */ | ||
40 | +void bdrv_subtree_drained_end(BlockDriverState *bs); | ||
41 | + | ||
42 | void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child, | ||
43 | Error **errp); | ||
44 | void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp); | ||
45 | diff --git a/block/io.c b/block/io.c | ||
46 | index XXXXXXX..XXXXXXX 100644 | ||
47 | --- a/block/io.c | ||
48 | +++ b/block/io.c | ||
49 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
50 | BlockDriverState *bs; | ||
51 | bool done; | ||
52 | bool begin; | ||
53 | + bool recursive; | ||
54 | BdrvChild *parent; | ||
55 | } BdrvCoDrainData; | ||
56 | |||
57 | @@ -XXX,XX +XXX,XX @@ static bool bdrv_drain_recurse(BlockDriverState *bs) | ||
58 | return waited; | ||
59 | } | ||
60 | |||
61 | -static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent); | ||
62 | -static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent); | ||
63 | +static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, | ||
64 | + BdrvChild *parent); | ||
65 | +static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, | ||
66 | + BdrvChild *parent); | ||
67 | |||
68 | static void bdrv_co_drain_bh_cb(void *opaque) | ||
69 | { | ||
70 | @@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque) | ||
71 | |||
72 | bdrv_dec_in_flight(bs); | ||
73 | if (data->begin) { | ||
74 | - bdrv_do_drained_begin(bs, data->parent); | ||
75 | + bdrv_do_drained_begin(bs, data->recursive, data->parent); | ||
76 | } else { | ||
77 | - bdrv_do_drained_end(bs, data->parent); | ||
78 | + bdrv_do_drained_end(bs, data->recursive, data->parent); | ||
79 | } | ||
80 | |||
81 | data->done = true; | ||
82 | @@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque) | ||
83 | } | ||
84 | |||
85 | static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, | ||
86 | - bool begin, BdrvChild *parent) | ||
87 | + bool begin, bool recursive, | ||
88 | + BdrvChild *parent) | ||
89 | { | ||
90 | BdrvCoDrainData data; | ||
91 | |||
92 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, | ||
93 | .bs = bs, | ||
94 | .done = false, | ||
95 | .begin = begin, | ||
96 | + .recursive = recursive, | ||
97 | .parent = parent, | ||
98 | }; | ||
99 | bdrv_inc_in_flight(bs); | ||
100 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, | ||
101 | assert(data.done); | ||
102 | } | ||
103 | |||
104 | -static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent) | ||
105 | +static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, | ||
106 | + BdrvChild *parent) | ||
107 | { | ||
108 | + BdrvChild *child, *next; | ||
109 | + | ||
110 | if (qemu_in_coroutine()) { | ||
111 | - bdrv_co_yield_to_drain(bs, true, parent); | ||
112 | + bdrv_co_yield_to_drain(bs, true, recursive, parent); | ||
113 | return; | ||
114 | } | ||
115 | |||
116 | @@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent) | ||
117 | bdrv_parent_drained_begin(bs, parent); | ||
118 | bdrv_drain_invoke(bs, true, false); | ||
119 | bdrv_drain_recurse(bs); | ||
120 | + | ||
121 | + if (recursive) { | ||
122 | + QLIST_FOREACH_SAFE(child, &bs->children, next, next) { | ||
123 | + bdrv_do_drained_begin(child->bs, true, child); | ||
124 | + } | ||
125 | + } | ||
126 | } | ||
127 | |||
128 | void bdrv_drained_begin(BlockDriverState *bs) | ||
129 | { | ||
130 | - bdrv_do_drained_begin(bs, NULL); | ||
131 | + bdrv_do_drained_begin(bs, false, NULL); | ||
132 | +} | ||
133 | + | ||
134 | +void bdrv_subtree_drained_begin(BlockDriverState *bs) | ||
135 | +{ | ||
136 | + bdrv_do_drained_begin(bs, true, NULL); | ||
137 | } | ||
138 | |||
139 | -static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent) | ||
140 | +static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, | ||
141 | + BdrvChild *parent) | ||
142 | { | ||
143 | + BdrvChild *child, *next; | ||
144 | int old_quiesce_counter; | ||
145 | |||
146 | if (qemu_in_coroutine()) { | ||
147 | - bdrv_co_yield_to_drain(bs, false, parent); | ||
148 | + bdrv_co_yield_to_drain(bs, false, recursive, parent); | ||
149 | return; | ||
150 | } | ||
151 | assert(bs->quiesce_counter > 0); | ||
152 | @@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent) | ||
153 | if (old_quiesce_counter == 1) { | ||
154 | aio_enable_external(bdrv_get_aio_context(bs)); | ||
35 | } | 155 | } |
36 | + | 156 | + |
37 | +#ifdef HAVE_RBD_NAMESPACE_EXISTS | 157 | + if (recursive) { |
38 | + if (opts->has_q_namespace && strlen(opts->q_namespace) > 0) { | 158 | + QLIST_FOREACH_SAFE(child, &bs->children, next, next) { |
39 | + bool exists; | 159 | + bdrv_do_drained_end(child->bs, true, child); |
40 | + | ||
41 | + r = rbd_namespace_exists(*io_ctx, opts->q_namespace, &exists); | ||
42 | + if (r < 0) { | ||
43 | + error_setg_errno(errp, -r, "error checking namespace"); | ||
44 | + goto failed_ioctx_destroy; | ||
45 | + } | ||
46 | + | ||
47 | + if (!exists) { | ||
48 | + error_setg(errp, "namespace '%s' does not exist", | ||
49 | + opts->q_namespace); | ||
50 | + r = -ENOENT; | ||
51 | + goto failed_ioctx_destroy; | ||
52 | + } | 160 | + } |
53 | + } | 161 | + } |
54 | +#endif | 162 | } |
163 | |||
164 | void bdrv_drained_end(BlockDriverState *bs) | ||
165 | { | ||
166 | - bdrv_do_drained_end(bs, NULL); | ||
167 | + bdrv_do_drained_end(bs, false, NULL); | ||
168 | +} | ||
55 | + | 169 | + |
56 | /* | 170 | +void bdrv_subtree_drained_end(BlockDriverState *bs) |
57 | * Set the namespace after opening the io context on the pool, | 171 | +{ |
58 | * if nspace == NULL or if nspace == "", it is just as we did nothing | 172 | + bdrv_do_drained_end(bs, true, NULL); |
59 | @@ -XXX,XX +XXX,XX @@ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx, | 173 | } |
60 | r = 0; | 174 | |
61 | goto out; | 175 | /* |
62 | |||
63 | +#ifdef HAVE_RBD_NAMESPACE_EXISTS | ||
64 | +failed_ioctx_destroy: | ||
65 | + rados_ioctx_destroy(*io_ctx); | ||
66 | +#endif | ||
67 | failed_shutdown: | ||
68 | rados_shutdown(*cluster); | ||
69 | out: | ||
70 | diff --git a/meson.build b/meson.build | ||
71 | index XXXXXXX..XXXXXXX 100644 | ||
72 | --- a/meson.build | ||
73 | +++ b/meson.build | ||
74 | @@ -XXX,XX +XXX,XX @@ config_host_data.set('HAVE_GETIFADDRS', cc.has_function('getifaddrs')) | ||
75 | config_host_data.set('HAVE_OPENPTY', cc.has_function('openpty', dependencies: util)) | ||
76 | config_host_data.set('HAVE_STRCHRNUL', cc.has_function('strchrnul')) | ||
77 | config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: '#include <stdlib.h>')) | ||
78 | +if rbd.found() | ||
79 | + config_host_data.set('HAVE_RBD_NAMESPACE_EXISTS', | ||
80 | + cc.has_function('rbd_namespace_exists', | ||
81 | + dependencies: rbd, | ||
82 | + prefix: '#include <rbd/librbd.h>')) | ||
83 | +endif | ||
84 | if rdma.found() | ||
85 | config_host_data.set('HAVE_IBV_ADVISE_MR', | ||
86 | cc.has_function('ibv_advise_mr', | ||
87 | -- | 176 | -- |
88 | 2.35.3 | 177 | 2.13.6 |
178 | |||
179 | diff view generated by jsdifflib |
1 | From: Xie Yongji <xieyongji@bytedance.com> | 1 | Add a subtree drain version to the existing test cases. |
---|---|---|---|
2 | 2 | ||
3 | This supports passing NULL ops to blk_set_dev_ops() | ||
4 | so that we can remove stale ops in some cases. | ||
5 | |||
6 | Signed-off-by: Xie Yongji <xieyongji@bytedance.com> | ||
7 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
8 | Message-Id: <20220523084611.91-2-xieyongji@bytedance.com> | ||
9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 3 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
10 | --- | 4 | --- |
11 | block/block-backend.c | 2 +- | 5 | tests/test-bdrv-drain.c | 27 ++++++++++++++++++++++++++- |
12 | 1 file changed, 1 insertion(+), 1 deletion(-) | 6 | 1 file changed, 26 insertions(+), 1 deletion(-) |
13 | 7 | ||
14 | diff --git a/block/block-backend.c b/block/block-backend.c | 8 | diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c |
15 | index XXXXXXX..XXXXXXX 100644 | 9 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/block/block-backend.c | 10 | --- a/tests/test-bdrv-drain.c |
17 | +++ b/block/block-backend.c | 11 | +++ b/tests/test-bdrv-drain.c |
18 | @@ -XXX,XX +XXX,XX @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, | 12 | @@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret) |
19 | blk->dev_opaque = opaque; | 13 | enum drain_type { |
20 | 14 | BDRV_DRAIN_ALL, | |
21 | /* Are we currently quiesced? Should we enforce this right now? */ | 15 | BDRV_DRAIN, |
22 | - if (blk->quiesce_counter && ops->drained_begin) { | 16 | + BDRV_SUBTREE_DRAIN, |
23 | + if (blk->quiesce_counter && ops && ops->drained_begin) { | 17 | DRAIN_TYPE_MAX, |
24 | ops->drained_begin(opaque); | 18 | }; |
19 | |||
20 | @@ -XXX,XX +XXX,XX @@ static void do_drain_begin(enum drain_type drain_type, BlockDriverState *bs) | ||
21 | switch (drain_type) { | ||
22 | case BDRV_DRAIN_ALL: bdrv_drain_all_begin(); break; | ||
23 | case BDRV_DRAIN: bdrv_drained_begin(bs); break; | ||
24 | + case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_begin(bs); break; | ||
25 | default: g_assert_not_reached(); | ||
25 | } | 26 | } |
26 | } | 27 | } |
28 | @@ -XXX,XX +XXX,XX @@ static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs) | ||
29 | switch (drain_type) { | ||
30 | case BDRV_DRAIN_ALL: bdrv_drain_all_end(); break; | ||
31 | case BDRV_DRAIN: bdrv_drained_end(bs); break; | ||
32 | + case BDRV_SUBTREE_DRAIN: bdrv_subtree_drained_end(bs); break; | ||
33 | default: g_assert_not_reached(); | ||
34 | } | ||
35 | } | ||
36 | @@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain(void) | ||
37 | test_drv_cb_common(BDRV_DRAIN, false); | ||
38 | } | ||
39 | |||
40 | +static void test_drv_cb_drain_subtree(void) | ||
41 | +{ | ||
42 | + test_drv_cb_common(BDRV_SUBTREE_DRAIN, true); | ||
43 | +} | ||
44 | + | ||
45 | static void test_quiesce_common(enum drain_type drain_type, bool recursive) | ||
46 | { | ||
47 | BlockBackend *blk; | ||
48 | @@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain(void) | ||
49 | test_quiesce_common(BDRV_DRAIN, false); | ||
50 | } | ||
51 | |||
52 | +static void test_quiesce_drain_subtree(void) | ||
53 | +{ | ||
54 | + test_quiesce_common(BDRV_SUBTREE_DRAIN, true); | ||
55 | +} | ||
56 | + | ||
57 | static void test_nested(void) | ||
58 | { | ||
59 | BlockBackend *blk; | ||
60 | @@ -XXX,XX +XXX,XX @@ static void test_nested(void) | ||
61 | /* XXX bdrv_drain_all() doesn't increase the quiesce_counter */ | ||
62 | int bs_quiesce = (outer != BDRV_DRAIN_ALL) + | ||
63 | (inner != BDRV_DRAIN_ALL); | ||
64 | - int backing_quiesce = 0; | ||
65 | + int backing_quiesce = (outer == BDRV_SUBTREE_DRAIN) + | ||
66 | + (inner == BDRV_SUBTREE_DRAIN); | ||
67 | int backing_cb_cnt = (outer != BDRV_DRAIN) + | ||
68 | (inner != BDRV_DRAIN); | ||
69 | |||
70 | @@ -XXX,XX +XXX,XX @@ static void test_blockjob_drain(void) | ||
71 | test_blockjob_common(BDRV_DRAIN); | ||
72 | } | ||
73 | |||
74 | +static void test_blockjob_drain_subtree(void) | ||
75 | +{ | ||
76 | + test_blockjob_common(BDRV_SUBTREE_DRAIN); | ||
77 | +} | ||
78 | + | ||
79 | int main(int argc, char **argv) | ||
80 | { | ||
81 | bdrv_init(); | ||
82 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) | ||
83 | |||
84 | g_test_add_func("/bdrv-drain/driver-cb/drain_all", test_drv_cb_drain_all); | ||
85 | g_test_add_func("/bdrv-drain/driver-cb/drain", test_drv_cb_drain); | ||
86 | + g_test_add_func("/bdrv-drain/driver-cb/drain_subtree", | ||
87 | + test_drv_cb_drain_subtree); | ||
88 | |||
89 | g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all); | ||
90 | g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain); | ||
91 | + g_test_add_func("/bdrv-drain/quiesce/drain_subtree", | ||
92 | + test_quiesce_drain_subtree); | ||
93 | |||
94 | g_test_add_func("/bdrv-drain/nested", test_nested); | ||
95 | |||
96 | g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all); | ||
97 | g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain); | ||
98 | + g_test_add_func("/bdrv-drain/blockjob/drain_subtree", | ||
99 | + test_blockjob_drain_subtree); | ||
100 | |||
101 | return g_test_run(); | ||
102 | } | ||
27 | -- | 103 | -- |
28 | 2.35.3 | 104 | 2.13.6 |
105 | |||
106 | diff view generated by jsdifflib |
1 | From: Xie Yongji <xieyongji@bytedance.com> | 1 | If bdrv_do_drained_begin/end() are called in coroutine context, they |
---|---|---|---|
2 | first use a BH to get out of the coroutine context. Call some existing | ||
3 | tests again from a coroutine to cover this code path. | ||
2 | 4 | ||
3 | This implements a VDUSE block backends based on | ||
4 | the libvduse library. We can use it to export the BDSs | ||
5 | for both VM and container (host) usage. | ||
6 | |||
7 | The new command-line syntax is: | ||
8 | |||
9 | $ qemu-storage-daemon \ | ||
10 | --blockdev file,node-name=drive0,filename=test.img \ | ||
11 | --export vduse-blk,node-name=drive0,id=vduse-export0,writable=on | ||
12 | |||
13 | After the qemu-storage-daemon started, we need to use | ||
14 | the "vdpa" command to attach the device to vDPA bus: | ||
15 | |||
16 | $ vdpa dev add name vduse-export0 mgmtdev vduse | ||
17 | |||
18 | Also the device must be removed via the "vdpa" command | ||
19 | before we stop the qemu-storage-daemon. | ||
20 | |||
21 | Signed-off-by: Xie Yongji <xieyongji@bytedance.com> | ||
22 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
23 | Message-Id: <20220523084611.91-7-xieyongji@bytedance.com> | ||
24 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 5 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
25 | --- | 6 | --- |
26 | qapi/block-export.json | 28 ++- | 7 | tests/test-bdrv-drain.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++ |
27 | meson_options.txt | 2 + | 8 | 1 file changed, 59 insertions(+) |
28 | block/export/vduse-blk.h | 20 +++ | ||
29 | block/export/export.c | 6 + | ||
30 | block/export/vduse-blk.c | 329 ++++++++++++++++++++++++++++++++++ | ||
31 | MAINTAINERS | 4 +- | ||
32 | block/export/meson.build | 5 + | ||
33 | meson.build | 13 ++ | ||
34 | scripts/meson-buildoptions.sh | 4 + | ||
35 | 9 files changed, 407 insertions(+), 4 deletions(-) | ||
36 | create mode 100644 block/export/vduse-blk.h | ||
37 | create mode 100644 block/export/vduse-blk.c | ||
38 | 9 | ||
39 | diff --git a/qapi/block-export.json b/qapi/block-export.json | 10 | diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c |
40 | index XXXXXXX..XXXXXXX 100644 | 11 | index XXXXXXX..XXXXXXX 100644 |
41 | --- a/qapi/block-export.json | 12 | --- a/tests/test-bdrv-drain.c |
42 | +++ b/qapi/block-export.json | 13 | +++ b/tests/test-bdrv-drain.c |
43 | @@ -XXX,XX +XXX,XX @@ | 14 | @@ -XXX,XX +XXX,XX @@ static void aio_ret_cb(void *opaque, int ret) |
44 | '*allow-other': 'FuseExportAllowOther' }, | 15 | *aio_ret = ret; |
45 | 'if': 'CONFIG_FUSE' } | 16 | } |
46 | 17 | ||
47 | +## | 18 | +typedef struct CallInCoroutineData { |
48 | +# @BlockExportOptionsVduseBlk: | 19 | + void (*entry)(void); |
49 | +# | 20 | + bool done; |
50 | +# A vduse-blk block export. | 21 | +} CallInCoroutineData; |
51 | +# | ||
52 | +# @num-queues: the number of virtqueues. Defaults to 1. | ||
53 | +# @queue-size: the size of virtqueue. Defaults to 256. | ||
54 | +# @logical-block-size: Logical block size in bytes. Range [512, PAGE_SIZE] | ||
55 | +# and must be power of 2. Defaults to 512 bytes. | ||
56 | +# | ||
57 | +# Since: 7.1 | ||
58 | +## | ||
59 | +{ 'struct': 'BlockExportOptionsVduseBlk', | ||
60 | + 'data': { '*num-queues': 'uint16', | ||
61 | + '*queue-size': 'uint16', | ||
62 | + '*logical-block-size': 'size'} } | ||
63 | + | 22 | + |
64 | ## | 23 | +static coroutine_fn void call_in_coroutine_entry(void *opaque) |
65 | # @NbdServerAddOptions: | 24 | +{ |
66 | # | 25 | + CallInCoroutineData *data = opaque; |
67 | @@ -XXX,XX +XXX,XX @@ | ||
68 | # @nbd: NBD export | ||
69 | # @vhost-user-blk: vhost-user-blk export (since 5.2) | ||
70 | # @fuse: FUSE export (since: 6.0) | ||
71 | +# @vduse-blk: vduse-blk export (since 7.1) | ||
72 | # | ||
73 | # Since: 4.2 | ||
74 | ## | ||
75 | @@ -XXX,XX +XXX,XX @@ | ||
76 | 'data': [ 'nbd', | ||
77 | { 'name': 'vhost-user-blk', | ||
78 | 'if': 'CONFIG_VHOST_USER_BLK_SERVER' }, | ||
79 | - { 'name': 'fuse', 'if': 'CONFIG_FUSE' } ] } | ||
80 | + { 'name': 'fuse', 'if': 'CONFIG_FUSE' }, | ||
81 | + { 'name': 'vduse-blk', 'if': 'CONFIG_VDUSE_BLK_EXPORT' } ] } | ||
82 | |||
83 | ## | ||
84 | # @BlockExportOptions: | ||
85 | @@ -XXX,XX +XXX,XX @@ | ||
86 | # Describes a block export, i.e. how single node should be exported on an | ||
87 | # external interface. | ||
88 | # | ||
89 | -# @id: A unique identifier for the block export (across all export types) | ||
90 | +# @id: A unique identifier for the block export (across the host for vduse-blk | ||
91 | +# export type or across all export types for other types) | ||
92 | # | ||
93 | # @node-name: The node name of the block node to be exported (since: 5.2) | ||
94 | # | ||
95 | @@ -XXX,XX +XXX,XX @@ | ||
96 | 'vhost-user-blk': { 'type': 'BlockExportOptionsVhostUserBlk', | ||
97 | 'if': 'CONFIG_VHOST_USER_BLK_SERVER' }, | ||
98 | 'fuse': { 'type': 'BlockExportOptionsFuse', | ||
99 | - 'if': 'CONFIG_FUSE' } | ||
100 | + 'if': 'CONFIG_FUSE' }, | ||
101 | + 'vduse-blk': { 'type': 'BlockExportOptionsVduseBlk', | ||
102 | + 'if': 'CONFIG_VDUSE_BLK_EXPORT' } | ||
103 | } } | ||
104 | |||
105 | ## | ||
106 | diff --git a/meson_options.txt b/meson_options.txt | ||
107 | index XXXXXXX..XXXXXXX 100644 | ||
108 | --- a/meson_options.txt | ||
109 | +++ b/meson_options.txt | ||
110 | @@ -XXX,XX +XXX,XX @@ option('virtiofsd', type: 'feature', value: 'auto', | ||
111 | description: 'build virtiofs daemon (virtiofsd)') | ||
112 | option('libvduse', type: 'feature', value: 'auto', | ||
113 | description: 'build VDUSE Library') | ||
114 | +option('vduse_blk_export', type: 'feature', value: 'auto', | ||
115 | + description: 'VDUSE block export support') | ||
116 | |||
117 | option('capstone', type: 'feature', value: 'auto', | ||
118 | description: 'Whether and how to find the capstone library') | ||
119 | diff --git a/block/export/vduse-blk.h b/block/export/vduse-blk.h | ||
120 | new file mode 100644 | ||
121 | index XXXXXXX..XXXXXXX | ||
122 | --- /dev/null | ||
123 | +++ b/block/export/vduse-blk.h | ||
124 | @@ -XXX,XX +XXX,XX @@ | ||
125 | +/* | ||
126 | + * Export QEMU block device via VDUSE | ||
127 | + * | ||
128 | + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved. | ||
129 | + * | ||
130 | + * Author: | ||
131 | + * Xie Yongji <xieyongji@bytedance.com> | ||
132 | + * | ||
133 | + * This work is licensed under the terms of the GNU GPL, version 2 or | ||
134 | + * later. See the COPYING file in the top-level directory. | ||
135 | + */ | ||
136 | + | 26 | + |
137 | +#ifndef VDUSE_BLK_H | 27 | + data->entry(); |
138 | +#define VDUSE_BLK_H | 28 | + data->done = true; |
139 | + | ||
140 | +#include "block/export.h" | ||
141 | + | ||
142 | +extern const BlockExportDriver blk_exp_vduse_blk; | ||
143 | + | ||
144 | +#endif /* VDUSE_BLK_H */ | ||
145 | diff --git a/block/export/export.c b/block/export/export.c | ||
146 | index XXXXXXX..XXXXXXX 100644 | ||
147 | --- a/block/export/export.c | ||
148 | +++ b/block/export/export.c | ||
149 | @@ -XXX,XX +XXX,XX @@ | ||
150 | #ifdef CONFIG_VHOST_USER_BLK_SERVER | ||
151 | #include "vhost-user-blk-server.h" | ||
152 | #endif | ||
153 | +#ifdef CONFIG_VDUSE_BLK_EXPORT | ||
154 | +#include "vduse-blk.h" | ||
155 | +#endif | ||
156 | |||
157 | static const BlockExportDriver *blk_exp_drivers[] = { | ||
158 | &blk_exp_nbd, | ||
159 | @@ -XXX,XX +XXX,XX @@ static const BlockExportDriver *blk_exp_drivers[] = { | ||
160 | #ifdef CONFIG_FUSE | ||
161 | &blk_exp_fuse, | ||
162 | #endif | ||
163 | +#ifdef CONFIG_VDUSE_BLK_EXPORT | ||
164 | + &blk_exp_vduse_blk, | ||
165 | +#endif | ||
166 | }; | ||
167 | |||
168 | /* Only accessed from the main thread */ | ||
169 | diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c | ||
170 | new file mode 100644 | ||
171 | index XXXXXXX..XXXXXXX | ||
172 | --- /dev/null | ||
173 | +++ b/block/export/vduse-blk.c | ||
174 | @@ -XXX,XX +XXX,XX @@ | ||
175 | +/* | ||
176 | + * Export QEMU block device via VDUSE | ||
177 | + * | ||
178 | + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved. | ||
179 | + * | ||
180 | + * Author: | ||
181 | + * Xie Yongji <xieyongji@bytedance.com> | ||
182 | + * | ||
183 | + * This work is licensed under the terms of the GNU GPL, version 2 or | ||
184 | + * later. See the COPYING file in the top-level directory. | ||
185 | + */ | ||
186 | + | ||
187 | +#include <sys/eventfd.h> | ||
188 | + | ||
189 | +#include "qemu/osdep.h" | ||
190 | +#include "qapi/error.h" | ||
191 | +#include "block/export.h" | ||
192 | +#include "qemu/error-report.h" | ||
193 | +#include "util/block-helpers.h" | ||
194 | +#include "subprojects/libvduse/libvduse.h" | ||
195 | +#include "virtio-blk-handler.h" | ||
196 | + | ||
197 | +#include "standard-headers/linux/virtio_blk.h" | ||
198 | + | ||
199 | +#define VDUSE_DEFAULT_NUM_QUEUE 1 | ||
200 | +#define VDUSE_DEFAULT_QUEUE_SIZE 256 | ||
201 | + | ||
202 | +typedef struct VduseBlkExport { | ||
203 | + BlockExport export; | ||
204 | + VirtioBlkHandler handler; | ||
205 | + VduseDev *dev; | ||
206 | + uint16_t num_queues; | ||
207 | + unsigned int inflight; | ||
208 | +} VduseBlkExport; | ||
209 | + | ||
210 | +typedef struct VduseBlkReq { | ||
211 | + VduseVirtqElement elem; | ||
212 | + VduseVirtq *vq; | ||
213 | +} VduseBlkReq; | ||
214 | + | ||
215 | +static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp) | ||
216 | +{ | ||
217 | + vblk_exp->inflight++; | ||
218 | +} | 29 | +} |
219 | + | 30 | + |
220 | +static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp) | 31 | +static void call_in_coroutine(void (*entry)(void)) |
221 | +{ | 32 | +{ |
222 | + if (--vblk_exp->inflight == 0) { | 33 | + Coroutine *co; |
223 | + aio_wait_kick(); | 34 | + CallInCoroutineData data = { |
35 | + .entry = entry, | ||
36 | + .done = false, | ||
37 | + }; | ||
38 | + | ||
39 | + co = qemu_coroutine_create(call_in_coroutine_entry, &data); | ||
40 | + qemu_coroutine_enter(co); | ||
41 | + while (!data.done) { | ||
42 | + aio_poll(qemu_get_aio_context(), true); | ||
224 | + } | 43 | + } |
225 | +} | 44 | +} |
226 | + | 45 | + |
227 | +static void vduse_blk_req_complete(VduseBlkReq *req, size_t in_len) | 46 | enum drain_type { |
47 | BDRV_DRAIN_ALL, | ||
48 | BDRV_DRAIN, | ||
49 | @@ -XXX,XX +XXX,XX @@ static void test_drv_cb_drain_subtree(void) | ||
50 | test_drv_cb_common(BDRV_SUBTREE_DRAIN, true); | ||
51 | } | ||
52 | |||
53 | +static void test_drv_cb_co_drain(void) | ||
228 | +{ | 54 | +{ |
229 | + vduse_queue_push(req->vq, &req->elem, in_len); | 55 | + call_in_coroutine(test_drv_cb_drain); |
230 | + vduse_queue_notify(req->vq); | ||
231 | + | ||
232 | + free(req); | ||
233 | +} | 56 | +} |
234 | + | 57 | + |
235 | +static void coroutine_fn vduse_blk_virtio_process_req(void *opaque) | 58 | +static void test_drv_cb_co_drain_subtree(void) |
236 | +{ | 59 | +{ |
237 | + VduseBlkReq *req = opaque; | 60 | + call_in_coroutine(test_drv_cb_drain_subtree); |
238 | + VduseVirtq *vq = req->vq; | ||
239 | + VduseDev *dev = vduse_queue_get_dev(vq); | ||
240 | + VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); | ||
241 | + VirtioBlkHandler *handler = &vblk_exp->handler; | ||
242 | + VduseVirtqElement *elem = &req->elem; | ||
243 | + struct iovec *in_iov = elem->in_sg; | ||
244 | + struct iovec *out_iov = elem->out_sg; | ||
245 | + unsigned in_num = elem->in_num; | ||
246 | + unsigned out_num = elem->out_num; | ||
247 | + int in_len; | ||
248 | + | ||
249 | + in_len = virtio_blk_process_req(handler, in_iov, | ||
250 | + out_iov, in_num, out_num); | ||
251 | + if (in_len < 0) { | ||
252 | + free(req); | ||
253 | + return; | ||
254 | + } | ||
255 | + | ||
256 | + vduse_blk_req_complete(req, in_len); | ||
257 | + vduse_blk_inflight_dec(vblk_exp); | ||
258 | +} | 61 | +} |
259 | + | 62 | + |
260 | +static void vduse_blk_vq_handler(VduseDev *dev, VduseVirtq *vq) | 63 | static void test_quiesce_common(enum drain_type drain_type, bool recursive) |
64 | { | ||
65 | BlockBackend *blk; | ||
66 | @@ -XXX,XX +XXX,XX @@ static void test_quiesce_drain_subtree(void) | ||
67 | test_quiesce_common(BDRV_SUBTREE_DRAIN, true); | ||
68 | } | ||
69 | |||
70 | +static void test_quiesce_co_drain(void) | ||
261 | +{ | 71 | +{ |
262 | + VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); | 72 | + call_in_coroutine(test_quiesce_drain); |
263 | + | ||
264 | + while (1) { | ||
265 | + VduseBlkReq *req; | ||
266 | + | ||
267 | + req = vduse_queue_pop(vq, sizeof(VduseBlkReq)); | ||
268 | + if (!req) { | ||
269 | + break; | ||
270 | + } | ||
271 | + req->vq = vq; | ||
272 | + | ||
273 | + Coroutine *co = | ||
274 | + qemu_coroutine_create(vduse_blk_virtio_process_req, req); | ||
275 | + | ||
276 | + vduse_blk_inflight_inc(vblk_exp); | ||
277 | + qemu_coroutine_enter(co); | ||
278 | + } | ||
279 | +} | 73 | +} |
280 | + | 74 | + |
281 | +static void on_vduse_vq_kick(void *opaque) | 75 | +static void test_quiesce_co_drain_subtree(void) |
282 | +{ | 76 | +{ |
283 | + VduseVirtq *vq = opaque; | 77 | + call_in_coroutine(test_quiesce_drain_subtree); |
284 | + VduseDev *dev = vduse_queue_get_dev(vq); | ||
285 | + int fd = vduse_queue_get_fd(vq); | ||
286 | + eventfd_t kick_data; | ||
287 | + | ||
288 | + if (eventfd_read(fd, &kick_data) == -1) { | ||
289 | + error_report("failed to read data from eventfd"); | ||
290 | + return; | ||
291 | + } | ||
292 | + | ||
293 | + vduse_blk_vq_handler(dev, vq); | ||
294 | +} | 78 | +} |
295 | + | 79 | + |
296 | +static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq) | 80 | static void test_nested(void) |
297 | +{ | 81 | { |
298 | + VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); | 82 | BlockBackend *blk; |
299 | + | 83 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) |
300 | + aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq), | 84 | g_test_add_func("/bdrv-drain/driver-cb/drain_subtree", |
301 | + true, on_vduse_vq_kick, NULL, NULL, NULL, vq); | 85 | test_drv_cb_drain_subtree); |
302 | +} | 86 | |
303 | + | 87 | + // XXX bdrv_drain_all() doesn't work in coroutine context |
304 | +static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq) | 88 | + g_test_add_func("/bdrv-drain/driver-cb/co/drain", test_drv_cb_co_drain); |
305 | +{ | 89 | + g_test_add_func("/bdrv-drain/driver-cb/co/drain_subtree", |
306 | + VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev); | 90 | + test_drv_cb_co_drain_subtree); |
307 | + | ||
308 | + aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq), | ||
309 | + true, NULL, NULL, NULL, NULL, NULL); | ||
310 | +} | ||
311 | + | ||
312 | +static const VduseOps vduse_blk_ops = { | ||
313 | + .enable_queue = vduse_blk_enable_queue, | ||
314 | + .disable_queue = vduse_blk_disable_queue, | ||
315 | +}; | ||
316 | + | ||
317 | +static void on_vduse_dev_kick(void *opaque) | ||
318 | +{ | ||
319 | + VduseDev *dev = opaque; | ||
320 | + | ||
321 | + vduse_dev_handler(dev); | ||
322 | +} | ||
323 | + | ||
324 | +static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx) | ||
325 | +{ | ||
326 | + int i; | ||
327 | + | ||
328 | + aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev), | ||
329 | + true, on_vduse_dev_kick, NULL, NULL, NULL, | ||
330 | + vblk_exp->dev); | ||
331 | + | ||
332 | + for (i = 0; i < vblk_exp->num_queues; i++) { | ||
333 | + VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); | ||
334 | + int fd = vduse_queue_get_fd(vq); | ||
335 | + | ||
336 | + if (fd < 0) { | ||
337 | + continue; | ||
338 | + } | ||
339 | + aio_set_fd_handler(vblk_exp->export.ctx, fd, true, | ||
340 | + on_vduse_vq_kick, NULL, NULL, NULL, vq); | ||
341 | + } | ||
342 | +} | ||
343 | + | ||
344 | +static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp) | ||
345 | +{ | ||
346 | + int i; | ||
347 | + | ||
348 | + for (i = 0; i < vblk_exp->num_queues; i++) { | ||
349 | + VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i); | ||
350 | + int fd = vduse_queue_get_fd(vq); | ||
351 | + | ||
352 | + if (fd < 0) { | ||
353 | + continue; | ||
354 | + } | ||
355 | + aio_set_fd_handler(vblk_exp->export.ctx, fd, | ||
356 | + true, NULL, NULL, NULL, NULL, NULL); | ||
357 | + } | ||
358 | + aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev), | ||
359 | + true, NULL, NULL, NULL, NULL, NULL); | ||
360 | + | ||
361 | + AIO_WAIT_WHILE(vblk_exp->export.ctx, vblk_exp->inflight > 0); | ||
362 | +} | ||
363 | + | 91 | + |
364 | + | 92 | + |
365 | +static void blk_aio_attached(AioContext *ctx, void *opaque) | 93 | g_test_add_func("/bdrv-drain/quiesce/drain_all", test_quiesce_drain_all); |
366 | +{ | 94 | g_test_add_func("/bdrv-drain/quiesce/drain", test_quiesce_drain); |
367 | + VduseBlkExport *vblk_exp = opaque; | 95 | g_test_add_func("/bdrv-drain/quiesce/drain_subtree", |
96 | test_quiesce_drain_subtree); | ||
97 | |||
98 | + // XXX bdrv_drain_all() doesn't work in coroutine context | ||
99 | + g_test_add_func("/bdrv-drain/quiesce/co/drain", test_quiesce_co_drain); | ||
100 | + g_test_add_func("/bdrv-drain/quiesce/co/drain_subtree", | ||
101 | + test_quiesce_co_drain_subtree); | ||
368 | + | 102 | + |
369 | + vblk_exp->export.ctx = ctx; | 103 | g_test_add_func("/bdrv-drain/nested", test_nested); |
370 | + vduse_blk_attach_ctx(vblk_exp, ctx); | 104 | |
371 | +} | 105 | g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all); |
372 | + | ||
373 | +static void blk_aio_detach(void *opaque) | ||
374 | +{ | ||
375 | + VduseBlkExport *vblk_exp = opaque; | ||
376 | + | ||
377 | + vduse_blk_detach_ctx(vblk_exp); | ||
378 | + vblk_exp->export.ctx = NULL; | ||
379 | +} | ||
380 | + | ||
381 | +static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | ||
382 | + Error **errp) | ||
383 | +{ | ||
384 | + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); | ||
385 | + BlockExportOptionsVduseBlk *vblk_opts = &opts->u.vduse_blk; | ||
386 | + uint64_t logical_block_size = VIRTIO_BLK_SECTOR_SIZE; | ||
387 | + uint16_t num_queues = VDUSE_DEFAULT_NUM_QUEUE; | ||
388 | + uint16_t queue_size = VDUSE_DEFAULT_QUEUE_SIZE; | ||
389 | + Error *local_err = NULL; | ||
390 | + struct virtio_blk_config config = { 0 }; | ||
391 | + uint64_t features; | ||
392 | + int i; | ||
393 | + | ||
394 | + if (vblk_opts->has_num_queues) { | ||
395 | + num_queues = vblk_opts->num_queues; | ||
396 | + if (num_queues == 0) { | ||
397 | + error_setg(errp, "num-queues must be greater than 0"); | ||
398 | + return -EINVAL; | ||
399 | + } | ||
400 | + } | ||
401 | + | ||
402 | + if (vblk_opts->has_queue_size) { | ||
403 | + queue_size = vblk_opts->queue_size; | ||
404 | + if (queue_size <= 2 || !is_power_of_2(queue_size) || | ||
405 | + queue_size > VIRTQUEUE_MAX_SIZE) { | ||
406 | + error_setg(errp, "queue-size is invalid"); | ||
407 | + return -EINVAL; | ||
408 | + } | ||
409 | + } | ||
410 | + | ||
411 | + if (vblk_opts->has_logical_block_size) { | ||
412 | + logical_block_size = vblk_opts->logical_block_size; | ||
413 | + check_block_size(exp->id, "logical-block-size", logical_block_size, | ||
414 | + &local_err); | ||
415 | + if (local_err) { | ||
416 | + error_propagate(errp, local_err); | ||
417 | + return -EINVAL; | ||
418 | + } | ||
419 | + } | ||
420 | + vblk_exp->num_queues = num_queues; | ||
421 | + vblk_exp->handler.blk = exp->blk; | ||
422 | + vblk_exp->handler.serial = exp->id; | ||
423 | + vblk_exp->handler.logical_block_size = logical_block_size; | ||
424 | + vblk_exp->handler.writable = opts->writable; | ||
425 | + | ||
426 | + config.capacity = | ||
427 | + cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS); | ||
428 | + config.seg_max = cpu_to_le32(queue_size - 2); | ||
429 | + config.min_io_size = cpu_to_le16(1); | ||
430 | + config.opt_io_size = cpu_to_le32(1); | ||
431 | + config.num_queues = cpu_to_le16(num_queues); | ||
432 | + config.blk_size = cpu_to_le32(logical_block_size); | ||
433 | + config.max_discard_sectors = cpu_to_le32(VIRTIO_BLK_MAX_DISCARD_SECTORS); | ||
434 | + config.max_discard_seg = cpu_to_le32(1); | ||
435 | + config.discard_sector_alignment = | ||
436 | + cpu_to_le32(logical_block_size >> VIRTIO_BLK_SECTOR_BITS); | ||
437 | + config.max_write_zeroes_sectors = | ||
438 | + cpu_to_le32(VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS); | ||
439 | + config.max_write_zeroes_seg = cpu_to_le32(1); | ||
440 | + | ||
441 | + features = vduse_get_virtio_features() | | ||
442 | + (1ULL << VIRTIO_BLK_F_SEG_MAX) | | ||
443 | + (1ULL << VIRTIO_BLK_F_TOPOLOGY) | | ||
444 | + (1ULL << VIRTIO_BLK_F_BLK_SIZE) | | ||
445 | + (1ULL << VIRTIO_BLK_F_FLUSH) | | ||
446 | + (1ULL << VIRTIO_BLK_F_DISCARD) | | ||
447 | + (1ULL << VIRTIO_BLK_F_WRITE_ZEROES); | ||
448 | + | ||
449 | + if (num_queues > 1) { | ||
450 | + features |= 1ULL << VIRTIO_BLK_F_MQ; | ||
451 | + } | ||
452 | + if (!opts->writable) { | ||
453 | + features |= 1ULL << VIRTIO_BLK_F_RO; | ||
454 | + } | ||
455 | + | ||
456 | + vblk_exp->dev = vduse_dev_create(exp->id, VIRTIO_ID_BLOCK, 0, | ||
457 | + features, num_queues, | ||
458 | + sizeof(struct virtio_blk_config), | ||
459 | + (char *)&config, &vduse_blk_ops, | ||
460 | + vblk_exp); | ||
461 | + if (!vblk_exp->dev) { | ||
462 | + error_setg(errp, "failed to create vduse device"); | ||
463 | + return -ENOMEM; | ||
464 | + } | ||
465 | + | ||
466 | + for (i = 0; i < num_queues; i++) { | ||
467 | + vduse_dev_setup_queue(vblk_exp->dev, i, queue_size); | ||
468 | + } | ||
469 | + | ||
470 | + aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), true, | ||
471 | + on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev); | ||
472 | + | ||
473 | + blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, | ||
474 | + vblk_exp); | ||
475 | + | ||
476 | + return 0; | ||
477 | +} | ||
478 | + | ||
479 | +static void vduse_blk_exp_delete(BlockExport *exp) | ||
480 | +{ | ||
481 | + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); | ||
482 | + | ||
483 | + blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, | ||
484 | + vblk_exp); | ||
485 | + vduse_dev_destroy(vblk_exp->dev); | ||
486 | +} | ||
487 | + | ||
488 | +static void vduse_blk_exp_request_shutdown(BlockExport *exp) | ||
489 | +{ | ||
490 | + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export); | ||
491 | + | ||
492 | + aio_context_acquire(vblk_exp->export.ctx); | ||
493 | + vduse_blk_detach_ctx(vblk_exp); | ||
494 | + aio_context_release(vblk_exp->export.ctx); | |
495 | +} | ||
496 | + | ||
497 | +const BlockExportDriver blk_exp_vduse_blk = { | ||
498 | + .type = BLOCK_EXPORT_TYPE_VDUSE_BLK, | ||
499 | + .instance_size = sizeof(VduseBlkExport), | ||
500 | + .create = vduse_blk_exp_create, | ||
501 | + .delete = vduse_blk_exp_delete, | ||
502 | + .request_shutdown = vduse_blk_exp_request_shutdown, | ||
503 | +}; | ||
504 | diff --git a/MAINTAINERS b/MAINTAINERS | ||
505 | index XXXXXXX..XXXXXXX 100644 | ||
506 | --- a/MAINTAINERS | ||
507 | +++ b/MAINTAINERS | ||
508 | @@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org | ||
509 | S: Supported | ||
510 | F: block/export/fuse.c | ||
511 | |||
512 | -VDUSE library | ||
513 | +VDUSE library and block device exports | ||
514 | M: Xie Yongji <xieyongji@bytedance.com> | ||
515 | S: Maintained | ||
516 | F: subprojects/libvduse/ | ||
517 | +F: block/export/vduse-blk.c | ||
518 | +F: block/export/vduse-blk.h | ||
519 | |||
520 | Replication | ||
521 | M: Wen Congyang <wencongyang2@huawei.com> | ||
522 | diff --git a/block/export/meson.build b/block/export/meson.build | ||
523 | index XXXXXXX..XXXXXXX 100644 | ||
524 | --- a/block/export/meson.build | ||
525 | +++ b/block/export/meson.build | ||
526 | @@ -XXX,XX +XXX,XX @@ if have_vhost_user_blk_server | ||
527 | endif | ||
528 | |||
529 | blockdev_ss.add(when: fuse, if_true: files('fuse.c')) | ||
530 | + | ||
531 | +if have_vduse_blk_export | ||
532 | + blockdev_ss.add(files('vduse-blk.c', 'virtio-blk-handler.c')) | ||
533 | + blockdev_ss.add(libvduse) | ||
534 | +endif | ||
535 | diff --git a/meson.build b/meson.build | ||
536 | index XXXXXXX..XXXXXXX 100644 | ||
537 | --- a/meson.build | ||
538 | +++ b/meson.build | ||
539 | @@ -XXX,XX +XXX,XX @@ elif get_option('libvduse').disabled() | ||
540 | have_libvduse = false | ||
541 | endif | ||
542 | |||
543 | +have_vduse_blk_export = (have_libvduse and targetos == 'linux') | ||
544 | +if get_option('vduse_blk_export').enabled() | ||
545 | + if targetos != 'linux' | ||
546 | + error('vduse_blk_export requires linux') | ||
547 | + elif not have_libvduse | ||
548 | + error('vduse_blk_export requires libvduse support') | ||
549 | + endif | ||
550 | +elif get_option('vduse_blk_export').disabled() | ||
551 | + have_vduse_blk_export = false | ||
552 | +endif | ||
553 | + | ||
554 | # libbpf | ||
555 | libbpf = dependency('libbpf', required: get_option('bpf'), method: 'pkg-config') | ||
556 | if libbpf.found() and not cc.links(''' | ||
557 | @@ -XXX,XX +XXX,XX @@ config_host_data.set('CONFIG_VHOST_CRYPTO', have_vhost_user_crypto) | ||
558 | config_host_data.set('CONFIG_VHOST_VDPA', have_vhost_vdpa) | ||
559 | config_host_data.set('CONFIG_VMNET', vmnet.found()) | ||
560 | config_host_data.set('CONFIG_VHOST_USER_BLK_SERVER', have_vhost_user_blk_server) | ||
561 | +config_host_data.set('CONFIG_VDUSE_BLK_EXPORT', have_vduse_blk_export) | ||
562 | config_host_data.set('CONFIG_PNG', png.found()) | ||
563 | config_host_data.set('CONFIG_VNC', vnc.found()) | ||
564 | config_host_data.set('CONFIG_VNC_JPEG', jpeg.found()) | ||
565 | @@ -XXX,XX +XXX,XX @@ if have_block | ||
566 | summary_info += {'qed support': get_option('qed').allowed()} | ||
567 | summary_info += {'parallels support': get_option('parallels').allowed()} | ||
568 | summary_info += {'FUSE exports': fuse} | ||
569 | + summary_info += {'VDUSE block exports': have_vduse_blk_export} | ||
570 | endif | ||
571 | summary(summary_info, bool_yn: true, section: 'Block layer support') | ||
572 | |||
573 | diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh | ||
574 | index XXXXXXX..XXXXXXX 100644 | ||
575 | --- a/scripts/meson-buildoptions.sh | ||
576 | +++ b/scripts/meson-buildoptions.sh | ||
577 | @@ -XXX,XX +XXX,XX @@ meson_options_help() { | ||
578 | printf "%s\n" ' vhost-user vhost-user backend support' | ||
579 | printf "%s\n" ' vhost-user-blk-server' | ||
580 | printf "%s\n" ' build vhost-user-blk server' | ||
581 | + printf "%s\n" ' vduse-blk-export' | ||
582 | + printf "%s\n" ' VDUSE block export support' | ||
583 | printf "%s\n" ' vhost-vdpa vhost-vdpa kernel backend support' | ||
584 | printf "%s\n" ' virglrenderer virgl rendering support' | ||
585 | printf "%s\n" ' virtfs virtio-9p support' | ||
586 | @@ -XXX,XX +XXX,XX @@ _meson_option_parse() { | ||
587 | --disable-vhost-user) printf "%s" -Dvhost_user=disabled ;; | ||
588 | --enable-vhost-user-blk-server) printf "%s" -Dvhost_user_blk_server=enabled ;; | ||
589 | --disable-vhost-user-blk-server) printf "%s" -Dvhost_user_blk_server=disabled ;; | ||
590 | + --enable-vduse-blk-export) printf "%s" -Dvduse_blk_export=enabled ;; | ||
591 | + --disable-vduse-blk-export) printf "%s" -Dvduse_blk_export=disabled ;; | ||
592 | --enable-vhost-vdpa) printf "%s" -Dvhost_vdpa=enabled ;; | ||
593 | --disable-vhost-vdpa) printf "%s" -Dvhost_vdpa=disabled ;; | ||
594 | --enable-virglrenderer) printf "%s" -Dvirglrenderer=enabled ;; | ||
595 | -- | 106 | -- |
596 | 2.35.3 | 107 | 2.13.6 |
108 | |||
109 | diff view generated by jsdifflib |
1 | From: Stefan Hajnoczi <stefanha@redhat.com> | 1 | Test that drain sections are correctly propagated through the graph. |
---|---|---|---|
2 | 2 | ||
3 | Commit 1b7fd729559c ("block: rename buffer_alignment to | ||
4 | guest_block_size") noted: | ||
5 | |||
6 | At this point, the field is set by the device emulation, but completely | ||
7 | ignored by the block layer. | ||
8 | |||
9 | The last time the value of buffer_alignment/guest_block_size was | ||
10 | actually used was before commit 339064d50639 ("block: Don't use guest | ||
11 | sector size for qemu_blockalign()"). | ||
12 | |||
13 | This value has not been used since 2013. Get rid of it. | ||
14 | |||
15 | Cc: Xie Yongji <xieyongji@bytedance.com> | ||
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
17 | Message-Id: <20220518130945.2657905-1-stefanha@redhat.com> | ||
18 | Reviewed-by: Paul Durrant <paul@xen.org> | ||
19 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
20 | Reviewed-by: Alberto Faria <afaria@redhat.com> | ||
21 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 3 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
22 | --- | 4 | --- |
23 | include/sysemu/block-backend-io.h | 1 - | 5 | tests/test-bdrv-drain.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++ |
24 | block/block-backend.c | 10 ---------- | 6 | 1 file changed, 74 insertions(+) |
25 | block/export/vhost-user-blk-server.c | 1 - | ||
26 | hw/block/virtio-blk.c | 1 - | ||
27 | hw/block/xen-block.c | 1 - | ||
28 | hw/ide/core.c | 1 - | ||
29 | hw/scsi/scsi-disk.c | 1 - | ||
30 | hw/scsi/scsi-generic.c | 1 - | ||
31 | 8 files changed, 17 deletions(-) | ||
32 | 7 | ||
33 | diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h | 8 | diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c |
34 | index XXXXXXX..XXXXXXX 100644 | 9 | index XXXXXXX..XXXXXXX 100644 |
35 | --- a/include/sysemu/block-backend-io.h | 10 | --- a/tests/test-bdrv-drain.c |
36 | +++ b/include/sysemu/block-backend-io.h | 11 | +++ b/tests/test-bdrv-drain.c |
37 | @@ -XXX,XX +XXX,XX @@ void blk_error_action(BlockBackend *blk, BlockErrorAction action, | 12 | @@ -XXX,XX +XXX,XX @@ static void test_nested(void) |
38 | void blk_iostatus_set_err(BlockBackend *blk, int error); | ||
39 | int blk_get_max_iov(BlockBackend *blk); | ||
40 | int blk_get_max_hw_iov(BlockBackend *blk); | ||
41 | -void blk_set_guest_block_size(BlockBackend *blk, int align); | ||
42 | |||
43 | void blk_io_plug(BlockBackend *blk); | ||
44 | void blk_io_unplug(BlockBackend *blk); | ||
45 | diff --git a/block/block-backend.c b/block/block-backend.c | ||
46 | index XXXXXXX..XXXXXXX 100644 | ||
47 | --- a/block/block-backend.c | ||
48 | +++ b/block/block-backend.c | ||
49 | @@ -XXX,XX +XXX,XX @@ struct BlockBackend { | ||
50 | const BlockDevOps *dev_ops; | ||
51 | void *dev_opaque; | ||
52 | |||
53 | - /* the block size for which the guest device expects atomicity */ | ||
54 | - int guest_block_size; | ||
55 | - | ||
56 | /* If the BDS tree is removed, some of its options are stored here (which | ||
57 | * can be used to restore those options in the new BDS on insert) */ | ||
58 | BlockBackendRootState root_state; | ||
59 | @@ -XXX,XX +XXX,XX @@ void blk_detach_dev(BlockBackend *blk, DeviceState *dev) | ||
60 | blk->dev = NULL; | ||
61 | blk->dev_ops = NULL; | ||
62 | blk->dev_opaque = NULL; | ||
63 | - blk->guest_block_size = 512; | ||
64 | blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort); | ||
65 | blk_unref(blk); | 13 | blk_unref(blk); |
66 | } | 14 | } |
67 | @@ -XXX,XX +XXX,XX @@ int blk_get_max_iov(BlockBackend *blk) | 15 | |
68 | return blk->root->bs->bl.max_iov; | 16 | +static void test_multiparent(void) |
69 | } | 17 | +{ |
70 | 18 | + BlockBackend *blk_a, *blk_b; | |
71 | -void blk_set_guest_block_size(BlockBackend *blk, int align) | 19 | + BlockDriverState *bs_a, *bs_b, *backing; |
72 | -{ | 20 | + BDRVTestState *a_s, *b_s, *backing_s; |
73 | - IO_CODE(); | 21 | + |
74 | - blk->guest_block_size = align; | 22 | + blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); |
75 | -} | 23 | + bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR, |
76 | - | 24 | + &error_abort); |
77 | void *blk_try_blockalign(BlockBackend *blk, size_t size) | 25 | + a_s = bs_a->opaque; |
78 | { | 26 | + blk_insert_bs(blk_a, bs_a, &error_abort); |
79 | IO_CODE(); | 27 | + |
80 | diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c | 28 | + blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); |
81 | index XXXXXXX..XXXXXXX 100644 | 29 | + bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR, |
82 | --- a/block/export/vhost-user-blk-server.c | 30 | + &error_abort); |
83 | +++ b/block/export/vhost-user-blk-server.c | 31 | + b_s = bs_b->opaque; |
84 | @@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | 32 | + blk_insert_bs(blk_b, bs_b, &error_abort); |
85 | return -EINVAL; | 33 | + |
86 | } | 34 | + backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort); |
87 | vexp->blk_size = logical_block_size; | 35 | + backing_s = backing->opaque; |
88 | - blk_set_guest_block_size(exp->blk, logical_block_size); | 36 | + bdrv_set_backing_hd(bs_a, backing, &error_abort); |
89 | 37 | + bdrv_set_backing_hd(bs_b, backing, &error_abort); | |
90 | if (vu_opts->has_num_queues) { | 38 | + |
91 | num_queues = vu_opts->num_queues; | 39 | + g_assert_cmpint(bs_a->quiesce_counter, ==, 0); |
92 | diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c | 40 | + g_assert_cmpint(bs_b->quiesce_counter, ==, 0); |
93 | index XXXXXXX..XXXXXXX 100644 | 41 | + g_assert_cmpint(backing->quiesce_counter, ==, 0); |
94 | --- a/hw/block/virtio-blk.c | 42 | + g_assert_cmpint(a_s->drain_count, ==, 0); |
95 | +++ b/hw/block/virtio-blk.c | 43 | + g_assert_cmpint(b_s->drain_count, ==, 0); |
96 | @@ -XXX,XX +XXX,XX @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) | 44 | + g_assert_cmpint(backing_s->drain_count, ==, 0); |
97 | 45 | + | |
98 | s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s); | 46 | + do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a); |
99 | blk_set_dev_ops(s->blk, &virtio_block_ops, s); | 47 | + |
100 | - blk_set_guest_block_size(s->blk, s->conf.conf.logical_block_size); | 48 | + g_assert_cmpint(bs_a->quiesce_counter, ==, 1); |
101 | 49 | + g_assert_cmpint(bs_b->quiesce_counter, ==, 1); | |
102 | blk_iostatus_enable(s->blk); | 50 | + g_assert_cmpint(backing->quiesce_counter, ==, 1); |
103 | 51 | + g_assert_cmpint(a_s->drain_count, ==, 1); | |
104 | diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c | 52 | + g_assert_cmpint(b_s->drain_count, ==, 1); |
105 | index XXXXXXX..XXXXXXX 100644 | 53 | + g_assert_cmpint(backing_s->drain_count, ==, 1); |
106 | --- a/hw/block/xen-block.c | 54 | + |
107 | +++ b/hw/block/xen-block.c | 55 | + do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b); |
108 | @@ -XXX,XX +XXX,XX @@ static void xen_block_realize(XenDevice *xendev, Error **errp) | 56 | + |
109 | } | 57 | + g_assert_cmpint(bs_a->quiesce_counter, ==, 2); |
110 | 58 | + g_assert_cmpint(bs_b->quiesce_counter, ==, 2); | |
111 | blk_set_dev_ops(blk, &xen_block_dev_ops, blockdev); | 59 | + g_assert_cmpint(backing->quiesce_counter, ==, 2); |
112 | - blk_set_guest_block_size(blk, conf->logical_block_size); | 60 | + g_assert_cmpint(a_s->drain_count, ==, 2); |
113 | 61 | + g_assert_cmpint(b_s->drain_count, ==, 2); | |
114 | if (conf->discard_granularity == -1) { | 62 | + g_assert_cmpint(backing_s->drain_count, ==, 2); |
115 | conf->discard_granularity = conf->physical_block_size; | 63 | + |
116 | diff --git a/hw/ide/core.c b/hw/ide/core.c | 64 | + do_drain_end(BDRV_SUBTREE_DRAIN, bs_b); |
117 | index XXXXXXX..XXXXXXX 100644 | 65 | + |
118 | --- a/hw/ide/core.c | 66 | + g_assert_cmpint(bs_a->quiesce_counter, ==, 1); |
119 | +++ b/hw/ide/core.c | 67 | + g_assert_cmpint(bs_b->quiesce_counter, ==, 1); |
120 | @@ -XXX,XX +XXX,XX @@ int ide_init_drive(IDEState *s, BlockBackend *blk, IDEDriveKind kind, | 68 | + g_assert_cmpint(backing->quiesce_counter, ==, 1); |
121 | s->smart_selftest_count = 0; | 69 | + g_assert_cmpint(a_s->drain_count, ==, 1); |
122 | if (kind == IDE_CD) { | 70 | + g_assert_cmpint(b_s->drain_count, ==, 1); |
123 | blk_set_dev_ops(blk, &ide_cd_block_ops, s); | 71 | + g_assert_cmpint(backing_s->drain_count, ==, 1); |
124 | - blk_set_guest_block_size(blk, 2048); | 72 | + |
125 | } else { | 73 | + do_drain_end(BDRV_SUBTREE_DRAIN, bs_a); |
126 | if (!blk_is_inserted(s->blk)) { | 74 | + |
127 | error_setg(errp, "Device needs media, but drive is empty"); | 75 | + g_assert_cmpint(bs_a->quiesce_counter, ==, 0); |
128 | diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c | 76 | + g_assert_cmpint(bs_b->quiesce_counter, ==, 0); |
129 | index XXXXXXX..XXXXXXX 100644 | 77 | + g_assert_cmpint(backing->quiesce_counter, ==, 0); |
130 | --- a/hw/scsi/scsi-disk.c | 78 | + g_assert_cmpint(a_s->drain_count, ==, 0); |
131 | +++ b/hw/scsi/scsi-disk.c | 79 | + g_assert_cmpint(b_s->drain_count, ==, 0); |
132 | @@ -XXX,XX +XXX,XX @@ static void scsi_realize(SCSIDevice *dev, Error **errp) | 80 | + g_assert_cmpint(backing_s->drain_count, ==, 0); |
133 | } else { | 81 | + |
134 | blk_set_dev_ops(s->qdev.conf.blk, &scsi_disk_block_ops, s); | 82 | + bdrv_unref(backing); |
135 | } | 83 | + bdrv_unref(bs_a); |
136 | - blk_set_guest_block_size(s->qdev.conf.blk, s->qdev.blocksize); | 84 | + bdrv_unref(bs_b); |
137 | 85 | + blk_unref(blk_a); | |
138 | blk_iostatus_enable(s->qdev.conf.blk); | 86 | + blk_unref(blk_b); |
139 | 87 | +} | |
140 | diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c | 88 | + |
141 | index XXXXXXX..XXXXXXX 100644 | 89 | |
142 | --- a/hw/scsi/scsi-generic.c | 90 | typedef struct TestBlockJob { |
143 | +++ b/hw/scsi/scsi-generic.c | 91 | BlockJob common; |
144 | @@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret) | 92 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) |
145 | s->blocksize = ldl_be_p(&r->buf[8]); | 93 | test_quiesce_co_drain_subtree); |
146 | s->max_lba = ldq_be_p(&r->buf[0]); | 94 | |
147 | } | 95 | g_test_add_func("/bdrv-drain/nested", test_nested); |
148 | - blk_set_guest_block_size(s->conf.blk, s->blocksize); | 96 | + g_test_add_func("/bdrv-drain/multiparent", test_multiparent); |
149 | 97 | ||
150 | /* | 98 | g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all); |
151 | * Patch MODE SENSE device specific parameters if the BDS is opened | 99 | g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain); |
152 | -- | 100 | -- |
153 | 2.35.3 | 101 | 2.13.6 |
102 | |||
103 | diff view generated by jsdifflib |
1 | From: Xie Yongji <xieyongji@bytedance.com> | 1 | We need to remember how many of the drain sections in which a node is |
---|---|---|---|
2 | 2 | were recursive (i.e. subtree drain rather than node drain), so that they | |
3 | Currently we use 'id' option as the name of VDUSE device. | 3 | can be correctly applied when children are added or removed during the |
4 | It's a bit confusing since we use one value for two different | 4 | drained section. |
5 | purposes: the ID to identify the export within QEMU (must be | 5 | |
6 | distinct from any other exports in the same QEMU process, but | 6 | With this change, it is safe to modify the graph even inside a |
7 | can overlap with names used by other processes), and the VDUSE | 7 | bdrv_subtree_drained_begin/end() section. |
8 | name to uniquely identify it on the host (must be distinct from | 8 | |
9 | other VDUSE devices on the same host, but can overlap with other | ||
10 | export types like NBD in the same process). To make it clear, | ||
11 | this patch adds a separate 'name' option to specify the VDUSE | ||
12 | name for the vduse-blk export instead. | ||
13 | |||
14 | Signed-off-by: Xie Yongji <xieyongji@bytedance.com> | ||
15 | Message-Id: <20220614051532.92-7-xieyongji@bytedance.com> | ||
16 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
17 | --- | 10 | --- |
18 | qapi/block-export.json | 7 ++++--- | 11 | include/block/block.h | 2 -- |
19 | docs/tools/qemu-storage-daemon.rst | 5 +++-- | 12 | include/block/block_int.h | 5 +++++ |
20 | block/export/vduse-blk.c | 4 ++-- | 13 | block.c | 32 +++++++++++++++++++++++++++++--- |
21 | storage-daemon/qemu-storage-daemon.c | 8 ++++---- | 14 | block/io.c | 28 ++++++++++++++++++++++++---- |
22 | 4 files changed, 13 insertions(+), 11 deletions(-) | 15 | 4 files changed, 58 insertions(+), 9 deletions(-) |
23 | 16 | ||
24 | diff --git a/qapi/block-export.json b/qapi/block-export.json | 17 | diff --git a/include/block/block.h b/include/block/block.h |
25 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/qapi/block-export.json | 19 | --- a/include/block/block.h |
27 | +++ b/qapi/block-export.json | 20 | +++ b/include/block/block.h |
28 | @@ -XXX,XX +XXX,XX @@ | 21 | @@ -XXX,XX +XXX,XX @@ void bdrv_drained_begin(BlockDriverState *bs); |
29 | # | 22 | /** |
30 | # A vduse-blk block export. | 23 | * Like bdrv_drained_begin, but recursively begins a quiesced section for |
31 | # | 24 | * exclusive access to all child nodes as well. |
32 | +# @name: the name of VDUSE device (must be unique across the host). | 25 | - * |
33 | # @num-queues: the number of virtqueues. Defaults to 1. | 26 | - * Graph changes are not allowed during a subtree drain section. |
34 | # @queue-size: the size of virtqueue. Defaults to 256. | 27 | */ |
35 | # @logical-block-size: Logical block size in bytes. Range [512, PAGE_SIZE] | 28 | void bdrv_subtree_drained_begin(BlockDriverState *bs); |
36 | @@ -XXX,XX +XXX,XX @@ | 29 | |
37 | # Since: 7.1 | 30 | diff --git a/include/block/block_int.h b/include/block/block_int.h |
38 | ## | 31 | index XXXXXXX..XXXXXXX 100644 |
39 | { 'struct': 'BlockExportOptionsVduseBlk', | 32 | --- a/include/block/block_int.h |
40 | - 'data': { '*num-queues': 'uint16', | 33 | +++ b/include/block/block_int.h |
41 | + 'data': { 'name': 'str', | 34 | @@ -XXX,XX +XXX,XX @@ struct BlockDriverState { |
42 | + '*num-queues': 'uint16', | 35 | |
43 | '*queue-size': 'uint16', | 36 | /* Accessed with atomic ops. */ |
44 | '*logical-block-size': 'size', | 37 | int quiesce_counter; |
45 | '*serial': 'str' } } | 38 | + int recursive_quiesce_counter; |
46 | @@ -XXX,XX +XXX,XX @@ | 39 | + |
47 | # Describes a block export, i.e. how single node should be exported on an | 40 | unsigned int write_gen; /* Current data generation */ |
48 | # external interface. | 41 | |
49 | # | 42 | /* Protected by reqs_lock. */ |
50 | -# @id: A unique identifier for the block export (across the host for vduse-blk | 43 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child, |
51 | -# export type or across all export types for other types) | 44 | int64_t offset, unsigned int bytes, QEMUIOVector *qiov, |
52 | +# @id: A unique identifier for the block export (across all export types) | 45 | BdrvRequestFlags flags); |
53 | # | 46 | |
54 | # @node-name: The node name of the block node to be exported (since: 5.2) | 47 | +void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent); |
55 | # | 48 | +void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent); |
56 | diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst | 49 | + |
57 | index XXXXXXX..XXXXXXX 100644 | 50 | int get_tmp_filename(char *filename, int size); |
58 | --- a/docs/tools/qemu-storage-daemon.rst | 51 | BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size, |
59 | +++ b/docs/tools/qemu-storage-daemon.rst | 52 | const char *filename); |
60 | @@ -XXX,XX +XXX,XX @@ Standard options: | 53 | diff --git a/block.c b/block.c |
61 | --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>] | 54 | index XXXXXXX..XXXXXXX 100644 |
62 | --export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>] | 55 | --- a/block.c |
63 | --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto] | 56 | +++ b/block.c |
64 | - --export [type=]vduse-blk,id=<id>,node-name=<node-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>] | 57 | @@ -XXX,XX +XXX,XX @@ static void bdrv_child_cb_drained_end(BdrvChild *child) |
65 | + --export [type=]vduse-blk,id=<id>,node-name=<node-name>,name=<vduse-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>] | 58 | bdrv_drained_end(bs); |
66 | 59 | } | |
67 | is a block export definition. ``node-name`` is the block node that should be | 60 | |
68 | exported. ``writable`` determines whether or not the export allows write | 61 | +static void bdrv_child_cb_attach(BdrvChild *child) |
69 | @@ -XXX,XX +XXX,XX @@ Standard options: | 62 | +{ |
70 | ``allow-other`` to auto (the default) will try enabling this option, and on | 63 | + BlockDriverState *bs = child->opaque; |
71 | error fall back to disabling it. | 64 | + bdrv_apply_subtree_drain(child, bs); |
72 | 65 | +} | |
73 | - The ``vduse-blk`` export type uses the ``id`` as the VDUSE device name. | 66 | + |
74 | + The ``vduse-blk`` export type takes a ``name`` (must be unique across the host) | 67 | +static void bdrv_child_cb_detach(BdrvChild *child) |
75 | + to create the VDUSE device. | 68 | +{ |
76 | ``num-queues`` sets the number of virtqueues (the default is 1). | 69 | + BlockDriverState *bs = child->opaque; |
77 | ``queue-size`` sets the virtqueue descriptor table size (the default is 256). | 70 | + bdrv_unapply_subtree_drain(child, bs); |
78 | 71 | +} | |
79 | diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c | 72 | + |
80 | index XXXXXXX..XXXXXXX 100644 | 73 | static int bdrv_child_cb_inactivate(BdrvChild *child) |
81 | --- a/block/export/vduse-blk.c | 74 | { |
82 | +++ b/block/export/vduse-blk.c | 75 | BlockDriverState *bs = child->opaque; |
83 | @@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | 76 | @@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_file = { |
84 | features |= 1ULL << VIRTIO_BLK_F_RO; | 77 | .inherit_options = bdrv_inherited_options, |
78 | .drained_begin = bdrv_child_cb_drained_begin, | ||
79 | .drained_end = bdrv_child_cb_drained_end, | ||
80 | + .attach = bdrv_child_cb_attach, | ||
81 | + .detach = bdrv_child_cb_detach, | ||
82 | .inactivate = bdrv_child_cb_inactivate, | ||
83 | }; | ||
84 | |||
85 | @@ -XXX,XX +XXX,XX @@ const BdrvChildRole child_format = { | ||
86 | .inherit_options = bdrv_inherited_fmt_options, | ||
87 | .drained_begin = bdrv_child_cb_drained_begin, | ||
88 | .drained_end = bdrv_child_cb_drained_end, | ||
89 | + .attach = bdrv_child_cb_attach, | ||
90 | + .detach = bdrv_child_cb_detach, | ||
91 | .inactivate = bdrv_child_cb_inactivate, | ||
92 | }; | ||
93 | |||
94 | @@ -XXX,XX +XXX,XX @@ static void bdrv_backing_attach(BdrvChild *c) | ||
95 | parent->backing_blocker); | ||
96 | bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET, | ||
97 | parent->backing_blocker); | ||
98 | + | ||
99 | + bdrv_child_cb_attach(c); | ||
100 | } | ||
101 | |||
102 | static void bdrv_backing_detach(BdrvChild *c) | ||
103 | @@ -XXX,XX +XXX,XX @@ static void bdrv_backing_detach(BdrvChild *c) | ||
104 | bdrv_op_unblock_all(c->bs, parent->backing_blocker); | ||
105 | error_free(parent->backing_blocker); | ||
106 | parent->backing_blocker = NULL; | ||
107 | + | ||
108 | + bdrv_child_cb_detach(c); | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | @@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child, | ||
113 | assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs)); | ||
85 | } | 114 | } |
86 | 115 | if (old_bs) { | |
87 | - vblk_exp->dev = vduse_dev_create(exp->id, VIRTIO_ID_BLOCK, 0, | 116 | + /* Detach first so that the recursive drain sections coming from @child |
88 | + vblk_exp->dev = vduse_dev_create(vblk_opts->name, VIRTIO_ID_BLOCK, 0, | 117 | + * are already gone and we only end the drain sections that came from |
89 | features, num_queues, | 118 | + * elsewhere. */ |
90 | sizeof(struct virtio_blk_config), | 119 | + if (child->role->detach) { |
91 | (char *)&config, &vduse_blk_ops, | 120 | + child->role->detach(child); |
92 | @@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, | 121 | + } |
122 | if (old_bs->quiesce_counter && child->role->drained_end) { | ||
123 | for (i = 0; i < old_bs->quiesce_counter; i++) { | ||
124 | child->role->drained_end(child); | ||
125 | } | ||
126 | } | ||
127 | - if (child->role->detach) { | ||
128 | - child->role->detach(child); | ||
129 | - } | ||
130 | QLIST_REMOVE(child, next_parent); | ||
93 | } | 131 | } |
94 | 132 | ||
95 | vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s", | 133 | @@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child, |
96 | - g_get_tmp_dir(), exp->id); | 134 | } |
97 | + g_get_tmp_dir(), vblk_opts->name); | 135 | } |
98 | if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) { | 136 | |
99 | error_setg(errp, "failed to set reconnect log file"); | 137 | + /* Attach only after starting new drained sections, so that recursive |
100 | ret = -EINVAL; | 138 | + * drain sections coming from @child don't get an extra .drained_begin |
101 | diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c | 139 | + * callback. */ |
102 | index XXXXXXX..XXXXXXX 100644 | 140 | if (child->role->attach) { |
103 | --- a/storage-daemon/qemu-storage-daemon.c | 141 | child->role->attach(child); |
104 | +++ b/storage-daemon/qemu-storage-daemon.c | 142 | } |
105 | @@ -XXX,XX +XXX,XX @@ static void help(void) | 143 | diff --git a/block/io.c b/block/io.c |
106 | #endif /* CONFIG_VHOST_USER_BLK_SERVER */ | 144 | index XXXXXXX..XXXXXXX 100644 |
107 | #ifdef CONFIG_VDUSE_BLK_EXPORT | 145 | --- a/block/io.c |
108 | " --export [type=]vduse-blk,id=<id>,node-name=<node-name>\n" | 146 | +++ b/block/io.c |
109 | -" [,writable=on|off][,num-queues=<num-queues>]\n" | 147 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs, |
110 | -" [,queue-size=<queue-size>]\n" | 148 | assert(data.done); |
111 | +" ,name=<vduse-name>[,writable=on|off]\n" | 149 | } |
112 | +" [,num-queues=<num-queues>][,queue-size=<queue-size>]\n" | 150 | |
113 | " [,logical-block-size=<logical-block-size>]\n" | 151 | -static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, |
114 | " [,serial=<serial-number>]\n" | 152 | - BdrvChild *parent) |
115 | -" export the specified block node as a vduse-blk\n" | 153 | +void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, |
116 | -" device using the id as the VDUSE device name\n" | 154 | + BdrvChild *parent) |
117 | +" export the specified block node as a\n" | 155 | { |
118 | +" vduse-blk device\n" | 156 | BdrvChild *child, *next; |
119 | "\n" | 157 | |
120 | #endif /* CONFIG_VDUSE_BLK_EXPORT */ | 158 | @@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive, |
121 | " --monitor [chardev=]name[,mode=control][,pretty[=on|off]]\n" | 159 | bdrv_drain_recurse(bs); |
160 | |||
161 | if (recursive) { | ||
162 | + bs->recursive_quiesce_counter++; | ||
163 | QLIST_FOREACH_SAFE(child, &bs->children, next, next) { | ||
164 | bdrv_do_drained_begin(child->bs, true, child); | ||
165 | } | ||
166 | @@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_begin(BlockDriverState *bs) | ||
167 | bdrv_do_drained_begin(bs, true, NULL); | ||
168 | } | ||
169 | |||
170 | -static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, | ||
171 | - BdrvChild *parent) | ||
172 | +void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, | ||
173 | + BdrvChild *parent) | ||
174 | { | ||
175 | BdrvChild *child, *next; | ||
176 | int old_quiesce_counter; | ||
177 | @@ -XXX,XX +XXX,XX @@ static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive, | ||
178 | } | ||
179 | |||
180 | if (recursive) { | ||
181 | + bs->recursive_quiesce_counter--; | ||
182 | QLIST_FOREACH_SAFE(child, &bs->children, next, next) { | ||
183 | bdrv_do_drained_end(child->bs, true, child); | ||
184 | } | ||
185 | @@ -XXX,XX +XXX,XX @@ void bdrv_subtree_drained_end(BlockDriverState *bs) | ||
186 | bdrv_do_drained_end(bs, true, NULL); | ||
187 | } | ||
188 | |||
189 | +void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent) | ||
190 | +{ | ||
191 | + int i; | ||
192 | + | ||
193 | + for (i = 0; i < new_parent->recursive_quiesce_counter; i++) { | ||
194 | + bdrv_do_drained_begin(child->bs, true, child); | ||
195 | + } | ||
196 | +} | ||
197 | + | ||
198 | +void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent) | ||
199 | +{ | ||
200 | + int i; | ||
201 | + | ||
202 | + for (i = 0; i < old_parent->recursive_quiesce_counter; i++) { | ||
203 | + bdrv_do_drained_end(child->bs, true, child); | ||
204 | + } | ||
205 | +} | ||
206 | + | ||
207 | /* | ||
208 | * Wait for pending requests to complete on a single BlockDriverState subtree, | ||
209 | * and suspend block driver's internal I/O until next request arrives. | ||
122 | -- | 210 | -- |
123 | 2.35.3 | 211 | 2.13.6 |
212 | |||
213 | diff view generated by jsdifflib |
1 | From: Fabian Ebner <f.ebner@proxmox.com> | ||
---|---|---|---|
2 | |||
3 | On 64-bit platforms, assigning SIZE_MAX to the int64_t max_pdiscard | ||
4 | results in a negative value, and the following assertion would trigger | ||
5 | down the line (it's not the same max_pdiscard, but computed from the | ||
6 | other one): | ||
7 | qemu-system-x86_64: ../block/io.c:3166: bdrv_co_pdiscard: Assertion | ||
8 | `max_pdiscard >= bs->bl.request_alignment' failed. | ||
9 | |||
10 | On 32-bit platforms, it's fine to keep using SIZE_MAX. | ||
11 | |||
12 | The assertion in qemu_gluster_co_pdiscard() is checking that the value | ||
13 | of 'bytes' can safely be passed to glfs_discard_async(), which takes a | ||
14 | size_t for the argument in question, so it is kept as is. And since | ||
15 | max_pdiscard is still <= SIZE_MAX, relying on max_pdiscard is still | ||
16 | fine. | ||
17 | |||
18 | Fixes: 0c8022876f ("block: use int64_t instead of int in driver discard handlers") | ||
19 | Cc: qemu-stable@nongnu.org | ||
20 | Signed-off-by: Fabian Ebner <f.ebner@proxmox.com> | ||
21 | Message-Id: <20220520075922.43972-1-f.ebner@proxmox.com> | ||
22 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
23 | Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> | ||
24 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> | ||
25 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 1 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
26 | --- | 2 | --- |
27 | block/gluster.c | 2 +- | 3 | tests/test-bdrv-drain.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++ |
28 | 1 file changed, 1 insertion(+), 1 deletion(-) | 4 | 1 file changed, 80 insertions(+) |
29 | 5 | ||
30 | diff --git a/block/gluster.c b/block/gluster.c | 6 | diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c |
31 | index XXXXXXX..XXXXXXX 100644 | 7 | index XXXXXXX..XXXXXXX 100644 |
32 | --- a/block/gluster.c | 8 | --- a/tests/test-bdrv-drain.c |
33 | +++ b/block/gluster.c | 9 | +++ b/tests/test-bdrv-drain.c |
34 | @@ -XXX,XX +XXX,XX @@ out: | 10 | @@ -XXX,XX +XXX,XX @@ static void test_multiparent(void) |
35 | static void qemu_gluster_refresh_limits(BlockDriverState *bs, Error **errp) | 11 | blk_unref(blk_b); |
36 | { | ||
37 | bs->bl.max_transfer = GLUSTER_MAX_TRANSFER; | ||
38 | - bs->bl.max_pdiscard = SIZE_MAX; | ||
39 | + bs->bl.max_pdiscard = MIN(SIZE_MAX, INT64_MAX); | ||
40 | } | 12 | } |
41 | 13 | ||
42 | static int qemu_gluster_reopen_prepare(BDRVReopenState *state, | 14 | +static void test_graph_change(void) |
15 | +{ | ||
16 | + BlockBackend *blk_a, *blk_b; | ||
17 | + BlockDriverState *bs_a, *bs_b, *backing; | ||
18 | + BDRVTestState *a_s, *b_s, *backing_s; | ||
19 | + | ||
20 | + blk_a = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); | ||
21 | + bs_a = bdrv_new_open_driver(&bdrv_test, "test-node-a", BDRV_O_RDWR, | ||
22 | + &error_abort); | ||
23 | + a_s = bs_a->opaque; | ||
24 | + blk_insert_bs(blk_a, bs_a, &error_abort); | ||
25 | + | ||
26 | + blk_b = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); | ||
27 | + bs_b = bdrv_new_open_driver(&bdrv_test, "test-node-b", BDRV_O_RDWR, | ||
28 | + &error_abort); | ||
29 | + b_s = bs_b->opaque; | ||
30 | + blk_insert_bs(blk_b, bs_b, &error_abort); | ||
31 | + | ||
32 | + backing = bdrv_new_open_driver(&bdrv_test, "backing", 0, &error_abort); | ||
33 | + backing_s = backing->opaque; | ||
34 | + bdrv_set_backing_hd(bs_a, backing, &error_abort); | ||
35 | + | ||
36 | + g_assert_cmpint(bs_a->quiesce_counter, ==, 0); | ||
37 | + g_assert_cmpint(bs_b->quiesce_counter, ==, 0); | ||
38 | + g_assert_cmpint(backing->quiesce_counter, ==, 0); | ||
39 | + g_assert_cmpint(a_s->drain_count, ==, 0); | ||
40 | + g_assert_cmpint(b_s->drain_count, ==, 0); | ||
41 | + g_assert_cmpint(backing_s->drain_count, ==, 0); | ||
42 | + | ||
43 | + do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a); | ||
44 | + do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a); | ||
45 | + do_drain_begin(BDRV_SUBTREE_DRAIN, bs_a); | ||
46 | + do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b); | ||
47 | + do_drain_begin(BDRV_SUBTREE_DRAIN, bs_b); | ||
48 | + | ||
49 | + bdrv_set_backing_hd(bs_b, backing, &error_abort); | ||
50 | + g_assert_cmpint(bs_a->quiesce_counter, ==, 5); | ||
51 | + g_assert_cmpint(bs_b->quiesce_counter, ==, 5); | ||
52 | + g_assert_cmpint(backing->quiesce_counter, ==, 5); | ||
53 | + g_assert_cmpint(a_s->drain_count, ==, 5); | ||
54 | + g_assert_cmpint(b_s->drain_count, ==, 5); | ||
55 | + g_assert_cmpint(backing_s->drain_count, ==, 5); | ||
56 | + | ||
57 | + bdrv_set_backing_hd(bs_b, NULL, &error_abort); | ||
58 | + g_assert_cmpint(bs_a->quiesce_counter, ==, 3); | ||
59 | + g_assert_cmpint(bs_b->quiesce_counter, ==, 2); | ||
60 | + g_assert_cmpint(backing->quiesce_counter, ==, 3); | ||
61 | + g_assert_cmpint(a_s->drain_count, ==, 3); | ||
62 | + g_assert_cmpint(b_s->drain_count, ==, 2); | ||
63 | + g_assert_cmpint(backing_s->drain_count, ==, 3); | ||
64 | + | ||
65 | + bdrv_set_backing_hd(bs_b, backing, &error_abort); | ||
66 | + g_assert_cmpint(bs_a->quiesce_counter, ==, 5); | ||
67 | + g_assert_cmpint(bs_b->quiesce_counter, ==, 5); | ||
68 | + g_assert_cmpint(backing->quiesce_counter, ==, 5); | ||
69 | + g_assert_cmpint(a_s->drain_count, ==, 5); | ||
70 | + g_assert_cmpint(b_s->drain_count, ==, 5); | ||
71 | + g_assert_cmpint(backing_s->drain_count, ==, 5); | ||
72 | + | ||
73 | + do_drain_end(BDRV_SUBTREE_DRAIN, bs_b); | ||
74 | + do_drain_end(BDRV_SUBTREE_DRAIN, bs_b); | ||
75 | + do_drain_end(BDRV_SUBTREE_DRAIN, bs_a); | ||
76 | + do_drain_end(BDRV_SUBTREE_DRAIN, bs_a); | ||
77 | + do_drain_end(BDRV_SUBTREE_DRAIN, bs_a); | ||
78 | + | ||
79 | + g_assert_cmpint(bs_a->quiesce_counter, ==, 0); | ||
80 | + g_assert_cmpint(bs_b->quiesce_counter, ==, 0); | ||
81 | + g_assert_cmpint(backing->quiesce_counter, ==, 0); | ||
82 | + g_assert_cmpint(a_s->drain_count, ==, 0); | ||
83 | + g_assert_cmpint(b_s->drain_count, ==, 0); | ||
84 | + g_assert_cmpint(backing_s->drain_count, ==, 0); | ||
85 | + | ||
86 | + bdrv_unref(backing); | ||
87 | + bdrv_unref(bs_a); | ||
88 | + bdrv_unref(bs_b); | ||
89 | + blk_unref(blk_a); | ||
90 | + blk_unref(blk_b); | ||
91 | +} | ||
92 | + | ||
93 | |||
94 | typedef struct TestBlockJob { | ||
95 | BlockJob common; | ||
96 | @@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv) | ||
97 | |||
98 | g_test_add_func("/bdrv-drain/nested", test_nested); | ||
99 | g_test_add_func("/bdrv-drain/multiparent", test_multiparent); | ||
100 | + g_test_add_func("/bdrv-drain/graph-change", test_graph_change); | ||
101 | |||
102 | g_test_add_func("/bdrv-drain/blockjob/drain_all", test_blockjob_drain_all); | ||
103 | g_test_add_func("/bdrv-drain/blockjob/drain", test_blockjob_drain); | ||
43 | -- | 104 | -- |
44 | 2.35.3 | 105 | 2.13.6 |
106 | |||
107 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Since commit bde70715, base is the only node that is reopened in | ||
2 | commit_start(). This means that the code, which still involves an | ||
3 | explicit BlockReopenQueue, can now be simplified by using bdrv_reopen(). | ||
1 | 4 | ||
5 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
6 | Reviewed-by: Fam Zheng <famz@redhat.com> | ||
7 | --- | ||
8 | block/commit.c | 8 +------- | ||
9 | 1 file changed, 1 insertion(+), 7 deletions(-) | ||
10 | |||
11 | diff --git a/block/commit.c b/block/commit.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/block/commit.c | ||
14 | +++ b/block/commit.c | ||
15 | @@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs, | ||
16 | const char *filter_node_name, Error **errp) | ||
17 | { | ||
18 | CommitBlockJob *s; | ||
19 | - BlockReopenQueue *reopen_queue = NULL; | ||
20 | int orig_base_flags; | ||
21 | BlockDriverState *iter; | ||
22 | BlockDriverState *commit_top_bs = NULL; | ||
23 | @@ -XXX,XX +XXX,XX @@ void commit_start(const char *job_id, BlockDriverState *bs, | ||
24 | /* convert base to r/w, if necessary */ | ||
25 | orig_base_flags = bdrv_get_flags(base); | ||
26 | if (!(orig_base_flags & BDRV_O_RDWR)) { | ||
27 | - reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL, | ||
28 | - orig_base_flags | BDRV_O_RDWR); | ||
29 | - } | ||
30 | - | ||
31 | - if (reopen_queue) { | ||
32 | - bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err); | ||
33 | + bdrv_reopen(base, orig_base_flags | BDRV_O_RDWR, &local_err); | ||
34 | if (local_err != NULL) { | ||
35 | error_propagate(errp, local_err); | ||
36 | goto fail; | ||
37 | -- | ||
38 | 2.13.6 | ||
39 | |||
40 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | The bdrv_reopen*() implementation doesn't like it if the graph is | ||
2 | changed between queuing nodes for reopen and actually reopening them | ||
3 | (one of the reasons is that queuing can be recursive). | ||
1 | 4 | ||
5 | So instead of draining the device only in bdrv_reopen_multiple(), | ||
6 | require that callers already drained all affected nodes, and assert this | ||
7 | in bdrv_reopen_queue(). | ||
8 | |||
9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
10 | Reviewed-by: Fam Zheng <famz@redhat.com> | ||
11 | --- | ||
12 | block.c | 23 ++++++++++++++++------- | ||
13 | block/replication.c | 6 ++++++ | ||
14 | qemu-io-cmds.c | 3 +++ | ||
15 | 3 files changed, 25 insertions(+), 7 deletions(-) | ||
16 | |||
17 | diff --git a/block.c b/block.c | ||
18 | index XXXXXXX..XXXXXXX 100644 | ||
19 | --- a/block.c | ||
20 | +++ b/block.c | ||
21 | @@ -XXX,XX +XXX,XX @@ BlockDriverState *bdrv_open(const char *filename, const char *reference, | ||
22 | * returns a pointer to bs_queue, which is either the newly allocated | ||
23 | * bs_queue, or the existing bs_queue being used. | ||
24 | * | ||
25 | + * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple(). | ||
26 | */ | ||
27 | static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, | ||
28 | BlockDriverState *bs, | ||
29 | @@ -XXX,XX +XXX,XX @@ static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, | ||
30 | BdrvChild *child; | ||
31 | QDict *old_options, *explicit_options; | ||
32 | |||
33 | + /* Make sure that the caller remembered to use a drained section. This is | ||
34 | + * important to avoid graph changes between the recursive queuing here and | ||
35 | + * bdrv_reopen_multiple(). */ | ||
36 | + assert(bs->quiesce_counter > 0); | ||
37 | + | ||
38 | if (bs_queue == NULL) { | ||
39 | bs_queue = g_new0(BlockReopenQueue, 1); | ||
40 | QSIMPLEQ_INIT(bs_queue); | ||
41 | @@ -XXX,XX +XXX,XX @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, | ||
42 | * If all devices prepare successfully, then the changes are committed | ||
43 | * to all devices. | ||
44 | * | ||
45 | + * All affected nodes must be drained between bdrv_reopen_queue() and | ||
46 | + * bdrv_reopen_multiple(). | ||
47 | */ | ||
48 | int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp) | ||
49 | { | ||
50 | @@ -XXX,XX +XXX,XX @@ int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **er | ||
51 | |||
52 | assert(bs_queue != NULL); | ||
53 | |||
54 | - aio_context_release(ctx); | ||
55 | - bdrv_drain_all_begin(); | ||
56 | - aio_context_acquire(ctx); | ||
57 | - | ||
58 | QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) { | ||
59 | + assert(bs_entry->state.bs->quiesce_counter > 0); | ||
60 | if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) { | ||
61 | error_propagate(errp, local_err); | ||
62 | goto cleanup; | ||
63 | @@ -XXX,XX +XXX,XX @@ cleanup: | ||
64 | } | ||
65 | g_free(bs_queue); | ||
66 | |||
67 | - bdrv_drain_all_end(); | ||
68 | - | ||
69 | return ret; | ||
70 | } | ||
71 | |||
72 | @@ -XXX,XX +XXX,XX @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp) | ||
73 | { | ||
74 | int ret = -1; | ||
75 | Error *local_err = NULL; | ||
76 | - BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags); | ||
77 | + BlockReopenQueue *queue; | ||
78 | |||
79 | + bdrv_subtree_drained_begin(bs); | ||
80 | + | ||
81 | + queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags); | ||
82 | ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err); | ||
83 | if (local_err != NULL) { | ||
84 | error_propagate(errp, local_err); | ||
85 | } | ||
86 | + | ||
87 | + bdrv_subtree_drained_end(bs); | ||
88 | + | ||
89 | return ret; | ||
90 | } | ||
91 | |||
92 | diff --git a/block/replication.c b/block/replication.c | ||
93 | index XXXXXXX..XXXXXXX 100644 | ||
94 | --- a/block/replication.c | ||
95 | +++ b/block/replication.c | ||
96 | @@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable, | ||
97 | new_secondary_flags = s->orig_secondary_flags; | ||
98 | } | ||
99 | |||
100 | + bdrv_subtree_drained_begin(s->hidden_disk->bs); | ||
101 | + bdrv_subtree_drained_begin(s->secondary_disk->bs); | ||
102 | + | ||
103 | if (orig_hidden_flags != new_hidden_flags) { | ||
104 | reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs, NULL, | ||
105 | new_hidden_flags); | ||
106 | @@ -XXX,XX +XXX,XX @@ static void reopen_backing_file(BlockDriverState *bs, bool writable, | ||
107 | reopen_queue, &local_err); | ||
108 | error_propagate(errp, local_err); | ||
109 | } | ||
110 | + | ||
111 | + bdrv_subtree_drained_end(s->hidden_disk->bs); | ||
112 | + bdrv_subtree_drained_end(s->secondary_disk->bs); | ||
113 | } | ||
114 | |||
115 | static void backup_job_cleanup(BlockDriverState *bs) | ||
116 | diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c | ||
117 | index XXXXXXX..XXXXXXX 100644 | ||
118 | --- a/qemu-io-cmds.c | ||
119 | +++ b/qemu-io-cmds.c | ||
120 | @@ -XXX,XX +XXX,XX @@ static int reopen_f(BlockBackend *blk, int argc, char **argv) | ||
121 | opts = qopts ? qemu_opts_to_qdict(qopts, NULL) : NULL; | ||
122 | qemu_opts_reset(&reopen_opts); | ||
123 | |||
124 | + bdrv_subtree_drained_begin(bs); | ||
125 | brq = bdrv_reopen_queue(NULL, bs, opts, flags); | ||
126 | bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, &local_err); | ||
127 | + bdrv_subtree_drained_end(bs); | ||
128 | + | ||
129 | if (local_err) { | ||
130 | error_report_err(local_err); | ||
131 | } else { | ||
132 | -- | ||
133 | 2.13.6 | ||
134 | |||
135 | diff view generated by jsdifflib |