1 | The following changes since commit 52ed34cbddde1cb89b2ac263e758e349a77f21e1: | 1 | The following changes since commit 825b96dbcee23d134b691fc75618b59c5f53da32: |
---|---|---|---|
2 | 2 | ||
3 | Merge tag 'pull-request-2023-06-26' of https://gitlab.com/thuth/qemu into staging (2023-06-26 10:38:41 +0200) | 3 | Merge tag 'migration-20250310-pull-request' of https://gitlab.com/farosas/qemu into staging (2025-03-11 09:32:07 +0800) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the Git repository at: |
6 | 6 | ||
7 | https://repo.or.cz/qemu/kevin.git tags/for-upstream | 7 | https://repo.or.cz/qemu/kevin.git tags/for-upstream |
8 | 8 | ||
9 | for you to fetch changes up to 17362398ee1a7f04e8006a46333145d8b707fd35: | 9 | for you to fetch changes up to a93c04f3cbe690877b3297a9df4767aa811fcd97: |
10 | 10 | ||
11 | block: use bdrv_co_debug_event in coroutine context (2023-06-28 09:46:34 +0200) | 11 | virtio-scsi: only expose cmd vqs via iothread-vq-mapping (2025-03-11 15:49:22 +0100) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Block layer patches | 14 | Block layer patches |
15 | 15 | ||
16 | - Re-enable the graph lock | 16 | - virtio-scsi: add iothread-vq-mapping parameter |
17 | - More fixes to coroutine_fn marking | 17 | - Improve writethrough performance |
18 | - Fix missing zero init in bdrv_snapshot_goto() | ||
19 | - Code cleanup and iotests fixes | ||
18 | 20 | ||
19 | ---------------------------------------------------------------- | 21 | ---------------------------------------------------------------- |
20 | Kevin Wolf (11): | 22 | Kevin Wolf (8): |
21 | iotests: Test active commit with iothread and background I/O | 23 | block: Remove unused blk_op_is_blocked() |
22 | qdev-properties-system: Lock AioContext for blk_insert_bs() | 24 | block: Zero block driver state before reopening |
23 | test-block-iothread: Lock AioContext for blk_insert_bs() | 25 | file-posix: Support FUA writes |
24 | block: Fix AioContext locking in bdrv_open_child() | 26 | block/io: Ignore FUA with cache.no-flush=on |
25 | block: Fix AioContext locking in bdrv_attach_child_common() | 27 | aio: Create AioPolledEvent |
26 | block: Fix AioContext locking in bdrv_reopen_parse_file_or_backing() | 28 | aio-posix: Factor out adjust_polling_time() |
27 | block: Fix AioContext locking in bdrv_open_inherit() | 29 | aio-posix: Separate AioPolledEvent per AioHandler |
28 | block: Fix AioContext locking in bdrv_open_backing_file() | 30 | aio-posix: Adjust polling time also for new handlers |
29 | blockjob: Fix AioContext locking in block_job_add_bdrv() | ||
30 | graph-lock: Unlock the AioContext while polling | ||
31 | Revert "graph-lock: Disable locking for now" | ||
32 | 31 | ||
33 | Paolo Bonzini (12): | 32 | Stefan Hajnoczi (13): |
34 | file-posix: remove incorrect coroutine_fn calls | 33 | scsi-disk: drop unused SCSIDiskState->bh field |
35 | qed: mark more functions as coroutine_fns and GRAPH_RDLOCK | 34 | dma: use current AioContext for dma_blk_io() |
36 | vpc: mark more functions as coroutine_fns and GRAPH_RDLOCK | 35 | scsi: track per-SCSIRequest AioContext |
37 | bochs: mark more functions as coroutine_fns and GRAPH_RDLOCK | 36 | scsi: introduce requests_lock |
38 | block: mark another function as coroutine_fns and GRAPH_UNLOCKED | 37 | virtio-scsi: introduce event and ctrl virtqueue locks |
39 | cloop: mark more functions as coroutine_fns and GRAPH_RDLOCK | 38 | virtio-scsi: protect events_dropped field |
40 | dmg: mark more functions as coroutine_fns and GRAPH_RDLOCK | 39 | virtio-scsi: perform TMFs in appropriate AioContexts |
41 | vmdk: mark more functions as coroutine_fns and GRAPH_RDLOCK | 40 | virtio-blk: extract cleanup_iothread_vq_mapping() function |
42 | vhdx: mark more functions as coroutine_fns and GRAPH_RDLOCK | 41 | virtio-blk: tidy up iothread_vq_mapping functions |
43 | qcow2: mark more functions as coroutine_fns and GRAPH_RDLOCK | 42 | virtio: extract iothread-vq-mapping.h API |
44 | block: use bdrv_co_getlength in coroutine context | 43 | virtio-scsi: add iothread-vq-mapping parameter |
45 | block: use bdrv_co_debug_event in coroutine context | 44 | virtio-scsi: handle ctrl virtqueue in main loop |
45 | virtio-scsi: only expose cmd vqs via iothread-vq-mapping | ||
46 | 46 | ||
47 | block/qcow2.h | 33 +++-- | 47 | Thomas Huth (1): |
48 | block/vhdx.h | 5 +- | 48 | iotests: Limit qsd-migrate to working formats |
49 | include/block/block-io.h | 7 ++ | 49 | |
50 | include/block/graph-lock.h | 6 +- | 50 | include/block/aio.h | 5 +- |
51 | block.c | 114 ++++++++++++++++-- | 51 | include/block/raw-aio.h | 8 +- |
52 | block/bochs.c | 7 +- | 52 | include/hw/scsi/scsi.h | 8 +- |
53 | block/cloop.c | 9 +- | 53 | include/hw/virtio/iothread-vq-mapping.h | 45 +++ |
54 | block/dmg.c | 21 ++-- | 54 | include/hw/virtio/virtio-scsi.h | 15 +- |
55 | block/file-posix.c | 29 +++-- | 55 | include/system/block-backend-global-state.h | 1 - |
56 | block/graph-lock.c | 43 +++---- | 56 | include/system/dma.h | 3 +- |
57 | block/io.c | 14 +-- | 57 | util/aio-posix.h | 1 + |
58 | block/parallels.c | 4 +- | 58 | block/block-backend.c | 12 - |
59 | block/qcow.c | 30 ++--- | 59 | block/file-posix.c | 26 +- |
60 | block/qcow2-bitmap.c | 26 ++-- | 60 | block/io.c | 4 + |
61 | block/qcow2-cluster.c | 24 ++-- | 61 | block/io_uring.c | 13 +- |
62 | block/qcow2-refcount.c | 134 +++++++++++---------- | 62 | block/linux-aio.c | 24 +- |
63 | block/qcow2.c | 20 +-- | 63 | block/snapshot.c | 1 + |
64 | block/qed-check.c | 5 +- | 64 | hw/block/virtio-blk.c | 132 +------- |
65 | block/qed-table.c | 6 +- | 65 | hw/ide/core.c | 3 +- |
66 | block/qed.c | 15 +-- | 66 | hw/ide/macio.c | 3 +- |
67 | block/raw-format.c | 4 +- | 67 | hw/scsi/scsi-bus.c | 121 +++++-- |
68 | block/vhdx-log.c | 36 +++--- | 68 | hw/scsi/scsi-disk.c | 24 +- |
69 | block/vhdx.c | 73 ++++++----- | 69 | hw/scsi/virtio-scsi-dataplane.c | 103 ++++-- |
70 | block/vmdk.c | 55 ++++----- | 70 | hw/scsi/virtio-scsi.c | 502 ++++++++++++++++------------ |
71 | block/vpc.c | 52 ++++---- | 71 | hw/virtio/iothread-vq-mapping.c | 131 ++++++++ |
72 | blockjob.c | 17 ++- | 72 | system/dma-helpers.c | 8 +- |
73 | hw/core/qdev-properties-system.c | 8 +- | 73 | util/aio-posix.c | 114 ++++--- |
74 | tests/unit/test-block-iothread.c | 7 +- | 74 | util/async.c | 1 - |
75 | tests/qemu-iotests/tests/iothreads-commit-active | 85 +++++++++++++ | 75 | hw/virtio/meson.build | 1 + |
76 | .../qemu-iotests/tests/iothreads-commit-active.out | 23 ++++ | 76 | meson.build | 4 + |
77 | 30 files changed, 573 insertions(+), 339 deletions(-) | 77 | tests/qemu-iotests/tests/qsd-migrate | 2 +- |
78 | create mode 100755 tests/qemu-iotests/tests/iothreads-commit-active | 78 | 28 files changed, 803 insertions(+), 512 deletions(-) |
79 | create mode 100644 tests/qemu-iotests/tests/iothreads-commit-active.out | 79 | create mode 100644 include/hw/virtio/iothread-vq-mapping.h |
80 | create mode 100644 hw/virtio/iothread-vq-mapping.c |
1 | bdrv_attach_child() requires that the caller holds the AioContext lock | 1 | Commit fc4e394b28 removed the last caller of blk_op_is_blocked(). Remove |
---|---|---|---|
2 | for the new child node. Take it in bdrv_open_child() and document that | 2 | the now unused function. |
3 | the caller must not hold any AioContext apart from the main AioContext. | ||
4 | 3 | ||
5 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 4 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
6 | Message-ID: <20230605085711.21261-5-kwolf@redhat.com> | 5 | Message-ID: <20250206165331.379033-1-kwolf@redhat.com> |
6 | Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> | ||
7 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 7 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
8 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 8 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
9 | --- | 9 | --- |
10 | block.c | 13 +++++++++++-- | 10 | include/system/block-backend-global-state.h | 1 - |
11 | 1 file changed, 11 insertions(+), 2 deletions(-) | 11 | block/block-backend.c | 12 ------------ |
12 | 2 files changed, 13 deletions(-) | ||
12 | 13 | ||
13 | diff --git a/block.c b/block.c | 14 | diff --git a/include/system/block-backend-global-state.h b/include/system/block-backend-global-state.h |
14 | index XXXXXXX..XXXXXXX 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/block.c | 16 | --- a/include/system/block-backend-global-state.h |
16 | +++ b/block.c | 17 | +++ b/include/system/block-backend-global-state.h |
17 | @@ -XXX,XX +XXX,XX @@ done: | 18 | @@ -XXX,XX +XXX,XX @@ bool blk_supports_write_perm(BlockBackend *blk); |
18 | * | 19 | bool blk_is_sg(BlockBackend *blk); |
19 | * The BlockdevRef will be removed from the options QDict. | 20 | void blk_set_enable_write_cache(BlockBackend *blk, bool wce); |
20 | * | 21 | int blk_get_flags(BlockBackend *blk); |
21 | + * The caller must hold the lock of the main AioContext and no other AioContext. | 22 | -bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp); |
22 | * @parent can move to a different AioContext in this function. Callers must | 23 | int blk_set_aio_context(BlockBackend *blk, AioContext *new_context, |
23 | * make sure that their AioContext locking is still correct after this. | 24 | Error **errp); |
24 | */ | 25 | void blk_add_aio_context_notifier(BlockBackend *blk, |
25 | @@ -XXX,XX +XXX,XX @@ BdrvChild *bdrv_open_child(const char *filename, | 26 | diff --git a/block/block-backend.c b/block/block-backend.c |
26 | bool allow_none, Error **errp) | 27 | index XXXXXXX..XXXXXXX 100644 |
27 | { | 28 | --- a/block/block-backend.c |
28 | BlockDriverState *bs; | 29 | +++ b/block/block-backend.c |
29 | + BdrvChild *child; | 30 | @@ -XXX,XX +XXX,XX @@ void *blk_blockalign(BlockBackend *blk, size_t size) |
30 | + AioContext *ctx; | 31 | return qemu_blockalign(blk ? blk_bs(blk) : NULL, size); |
31 | |||
32 | GLOBAL_STATE_CODE(); | ||
33 | |||
34 | @@ -XXX,XX +XXX,XX @@ BdrvChild *bdrv_open_child(const char *filename, | ||
35 | return NULL; | ||
36 | } | ||
37 | |||
38 | - return bdrv_attach_child(parent, bs, bdref_key, child_class, child_role, | ||
39 | - errp); | ||
40 | + ctx = bdrv_get_aio_context(bs); | ||
41 | + aio_context_acquire(ctx); | ||
42 | + child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role, | ||
43 | + errp); | ||
44 | + aio_context_release(ctx); | ||
45 | + | ||
46 | + return child; | ||
47 | } | 32 | } |
48 | 33 | ||
49 | /* | 34 | -bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp) |
50 | * Wrapper on bdrv_open_child() for most popular case: open primary child of bs. | 35 | -{ |
51 | * | 36 | - BlockDriverState *bs = blk_bs(blk); |
52 | + * The caller must hold the lock of the main AioContext and no other AioContext. | 37 | - GLOBAL_STATE_CODE(); |
53 | * @parent can move to a different AioContext in this function. Callers must | 38 | - GRAPH_RDLOCK_GUARD_MAINLOOP(); |
54 | * make sure that their AioContext locking is still correct after this. | 39 | - |
55 | */ | 40 | - if (!bs) { |
41 | - return false; | ||
42 | - } | ||
43 | - | ||
44 | - return bdrv_op_is_blocked(bs, op, errp); | ||
45 | -} | ||
46 | |||
47 | /** | ||
48 | * Return BB's current AioContext. Note that this context may change | ||
56 | -- | 49 | -- |
57 | 2.41.0 | 50 | 2.48.1 |
51 | |||
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | Block drivers assume in their .bdrv_open() implementation that their |
---|---|---|---|
2 | state in bs->opaque has been zeroed; it is initially allocated with | ||
3 | g_malloc0() in bdrv_open_driver(). | ||
2 | 4 | ||
3 | Mark functions as coroutine_fn when they are only called by other coroutine_fns | 5 | bdrv_snapshot_goto() needs to make sure that it is zeroed again before |
4 | and they can suspend. Change calls to co_wrappers to use the non-wrapped | 6 | calling drv->bdrv_open() to avoid that block drivers use stale values. |
5 | functions, which in turn requires adding GRAPH_RDLOCK annotations. | ||
6 | 7 | ||
7 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | 8 | One symptom of this bug is VMDK running into a double free when the user |
8 | Message-ID: <20230601115145.196465-9-pbonzini@redhat.com> | 9 | tries to apply an internal snapshot like 'qemu-img snapshot -a test |
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 10 | test.vmdk'. This should be a graceful error because VMDK doesn't support |
11 | internal snapshots. | ||
12 | |||
13 | ==25507== Invalid free() / delete / delete[] / realloc() | ||
14 | ==25507== at 0x484B347: realloc (vg_replace_malloc.c:1801) | ||
15 | ==25507== by 0x54B592A: g_realloc (gmem.c:171) | ||
16 | ==25507== by 0x1B221D: vmdk_add_extent (../block/vmdk.c:570) | ||
17 | ==25507== by 0x1B1084: vmdk_open_sparse (../block/vmdk.c:1059) | ||
18 | ==25507== by 0x1AF3D8: vmdk_open (../block/vmdk.c:1371) | ||
19 | ==25507== by 0x1A2AE0: bdrv_snapshot_goto (../block/snapshot.c:299) | ||
20 | ==25507== by 0x205C77: img_snapshot (../qemu-img.c:3500) | ||
21 | ==25507== by 0x58FA087: (below main) (libc_start_call_main.h:58) | ||
22 | ==25507== Address 0x832f3e0 is 0 bytes inside a block of size 272 free'd | ||
23 | ==25507== at 0x4846B83: free (vg_replace_malloc.c:989) | ||
24 | ==25507== by 0x54AEAC4: g_free (gmem.c:208) | ||
25 | ==25507== by 0x1AF629: vmdk_close (../block/vmdk.c:2889) | ||
26 | ==25507== by 0x1A2A9C: bdrv_snapshot_goto (../block/snapshot.c:290) | ||
27 | ==25507== by 0x205C77: img_snapshot (../qemu-img.c:3500) | ||
28 | ==25507== by 0x58FA087: (below main) (libc_start_call_main.h:58) | ||
29 | |||
30 | This error was discovered by fuzzing qemu-img. | ||
31 | |||
32 | Cc: qemu-stable@nongnu.org | ||
33 | Closes: https://gitlab.com/qemu-project/qemu/-/issues/2853 | ||
34 | Closes: https://gitlab.com/qemu-project/qemu/-/issues/2851 | ||
35 | Reported-by: Denis Rastyogin <gerben@altlinux.org> | ||
36 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
37 | Message-ID: <20250310104858.28221-1-kwolf@redhat.com> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 38 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 39 | --- |
12 | block/vmdk.c | 27 ++++++++++++++------------- | 40 | block/snapshot.c | 1 + |
13 | 1 file changed, 14 insertions(+), 13 deletions(-) | 41 | 1 file changed, 1 insertion(+) |
14 | 42 | ||
15 | diff --git a/block/vmdk.c b/block/vmdk.c | 43 | diff --git a/block/snapshot.c b/block/snapshot.c |
16 | index XXXXXXX..XXXXXXX 100644 | 44 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/vmdk.c | 45 | --- a/block/snapshot.c |
18 | +++ b/block/vmdk.c | 46 | +++ b/block/snapshot.c |
19 | @@ -XXX,XX +XXX,XX @@ out: | 47 | @@ -XXX,XX +XXX,XX @@ int bdrv_snapshot_goto(BlockDriverState *bs, |
20 | return ret; | 48 | bdrv_graph_wrunlock(); |
21 | } | 49 | |
22 | 50 | ret = bdrv_snapshot_goto(fallback_bs, snapshot_id, errp); | |
23 | -static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) | 51 | + memset(bs->opaque, 0, drv->instance_size); |
24 | +static int coroutine_fn GRAPH_RDLOCK | 52 | open_ret = drv->bdrv_open(bs, options, bs->open_flags, &local_err); |
25 | +vmdk_write_cid(BlockDriverState *bs, uint32_t cid) | 53 | qobject_unref(options); |
26 | { | 54 | if (open_ret < 0) { |
27 | char *desc, *tmp_desc; | ||
28 | char *p_name, *tmp_str; | ||
29 | @@ -XXX,XX +XXX,XX @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) | ||
30 | |||
31 | desc = g_malloc0(DESC_SIZE); | ||
32 | tmp_desc = g_malloc0(DESC_SIZE); | ||
33 | - ret = bdrv_pread(bs->file, s->desc_offset, DESC_SIZE, desc, 0); | ||
34 | + ret = bdrv_co_pread(bs->file, s->desc_offset, DESC_SIZE, desc, 0); | ||
35 | if (ret < 0) { | ||
36 | goto out; | ||
37 | } | ||
38 | @@ -XXX,XX +XXX,XX @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) | ||
39 | pstrcat(desc, DESC_SIZE, tmp_desc); | ||
40 | } | ||
41 | |||
42 | - ret = bdrv_pwrite_sync(bs->file, s->desc_offset, DESC_SIZE, desc, 0); | ||
43 | + ret = bdrv_co_pwrite_sync(bs->file, s->desc_offset, DESC_SIZE, desc, 0); | ||
44 | |||
45 | out: | ||
46 | g_free(desc); | ||
47 | @@ -XXX,XX +XXX,XX @@ vmdk_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes, | ||
48 | return ret; | ||
49 | } | ||
50 | |||
51 | -static int GRAPH_UNLOCKED | ||
52 | +static int coroutine_fn GRAPH_UNLOCKED | ||
53 | vmdk_init_extent(BlockBackend *blk, int64_t filesize, bool flat, bool compress, | ||
54 | bool zeroed_grain, Error **errp) | ||
55 | { | ||
56 | @@ -XXX,XX +XXX,XX @@ vmdk_init_extent(BlockBackend *blk, int64_t filesize, bool flat, bool compress, | ||
57 | int gd_buf_size; | ||
58 | |||
59 | if (flat) { | ||
60 | - ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, 0, errp); | ||
61 | + ret = blk_co_truncate(blk, filesize, false, PREALLOC_MODE_OFF, 0, errp); | ||
62 | goto exit; | ||
63 | } | ||
64 | magic = cpu_to_be32(VMDK4_MAGIC); | ||
65 | @@ -XXX,XX +XXX,XX @@ vmdk_init_extent(BlockBackend *blk, int64_t filesize, bool flat, bool compress, | ||
66 | header.check_bytes[3] = 0xa; | ||
67 | |||
68 | /* write all the data */ | ||
69 | - ret = blk_pwrite(blk, 0, sizeof(magic), &magic, 0); | ||
70 | + ret = blk_co_pwrite(blk, 0, sizeof(magic), &magic, 0); | ||
71 | if (ret < 0) { | ||
72 | error_setg(errp, QERR_IO_ERROR); | ||
73 | goto exit; | ||
74 | } | ||
75 | - ret = blk_pwrite(blk, sizeof(magic), sizeof(header), &header, 0); | ||
76 | + ret = blk_co_pwrite(blk, sizeof(magic), sizeof(header), &header, 0); | ||
77 | if (ret < 0) { | ||
78 | error_setg(errp, QERR_IO_ERROR); | ||
79 | goto exit; | ||
80 | } | ||
81 | |||
82 | - ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false, | ||
83 | - PREALLOC_MODE_OFF, 0, errp); | ||
84 | + ret = blk_co_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false, | ||
85 | + PREALLOC_MODE_OFF, 0, errp); | ||
86 | if (ret < 0) { | ||
87 | goto exit; | ||
88 | } | ||
89 | @@ -XXX,XX +XXX,XX @@ vmdk_init_extent(BlockBackend *blk, int64_t filesize, bool flat, bool compress, | ||
90 | i < gt_count; i++, tmp += gt_size) { | ||
91 | gd_buf[i] = cpu_to_le32(tmp); | ||
92 | } | ||
93 | - ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, | ||
94 | - gd_buf_size, gd_buf, 0); | ||
95 | + ret = blk_co_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, | ||
96 | + gd_buf_size, gd_buf, 0); | ||
97 | if (ret < 0) { | ||
98 | error_setg(errp, QERR_IO_ERROR); | ||
99 | goto exit; | ||
100 | @@ -XXX,XX +XXX,XX @@ vmdk_init_extent(BlockBackend *blk, int64_t filesize, bool flat, bool compress, | ||
101 | i < gt_count; i++, tmp += gt_size) { | ||
102 | gd_buf[i] = cpu_to_le32(tmp); | ||
103 | } | ||
104 | - ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, | ||
105 | - gd_buf_size, gd_buf, 0); | ||
106 | + ret = blk_co_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, | ||
107 | + gd_buf_size, gd_buf, 0); | ||
108 | if (ret < 0) { | ||
109 | error_setg(errp, QERR_IO_ERROR); | ||
110 | } | ||
111 | -- | 55 | -- |
112 | 2.41.0 | 56 | 2.48.1 |
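The bug fixed above follows a general C pattern: state that was zero-initialized by the first allocation (g_malloc0() in bdrv_open_driver()) is silently assumed to be zeroed again by every later reopen. A toy program (not QEMU code; the drv_* names are hypothetical) reproduces the hazard and the memset() fix:

```c
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct drv_state {
    char *buf;      /* open() assumes NULL here and allocates */
    size_t len;
};

static int drv_open(struct drv_state *s)
{
    /* realloc(NULL, n) behaves like malloc(n); with a stale non-NULL
     * pointer left behind by a previous close(), this would operate on
     * freed memory, just like vmdk_add_extent() in the valgrind trace. */
    s->buf = realloc(s->buf, 16);
    s->len = 16;
    return s->buf ? 0 : -1;
}

static void drv_close(struct drv_state *s)
{
    free(s->buf);   /* leaves s->buf dangling, like vmdk_close() */
}

int main(void)
{
    struct drv_state *s = calloc(1, sizeof(*s)); /* zeroed, like g_malloc0() */

    drv_open(s);
    drv_close(s);

    memset(s, 0, sizeof(*s));   /* the fix: re-zero before reopening */
    if (drv_open(s) == 0) {
        puts("reopen ok");
    }
    drv_close(s);
    free(s);
    return 0;
}
```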
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | Until now, FUA was always emulated with a separate flush after the write |
---|---|---|---|
2 | 2 | for file-posix. The overhead of processing a second request can reduce | |
3 | raw_co_getlength is called by handle_aiocb_write_zeroes, which is not a coroutine | 3 | performance significantly for a guest disk that has disabled the write |
4 | function. This is harmless because raw_co_getlength does not actually suspend, | 4 | cache, especially if the host disk is already write through, too, and |
5 | but in the interest of clarity make it a non-coroutine_fn that is just wrapped | 5 | the flush isn't actually doing anything. |
6 | by the coroutine_fn raw_co_getlength. Likewise, check_cache_dropped was only | 6 | |
7 | a coroutine_fn because it called raw_co_getlength, so it can be made non-coroutine | 7 | Advertise support for REQ_FUA in write requests and implement it for |
8 | as well. | 8 | Linux AIO and io_uring using the RWF_DSYNC flag for write requests. The |
9 | 9 | thread pool still performs a separate fdatasync() call. This can be | |
10 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | 10 | improved later by using the pwritev2() syscall if available. |
11 | Message-ID: <20230601115145.196465-2-pbonzini@redhat.com> | 11 | |
12 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 12 | As an example, this is how fio numbers can be improved in some scenarios |
13 | with this patch (all using virtio-blk with cache=directsync on an nvme | ||
14 | block device for the VM, fio with ioengine=libaio,direct=1,sync=1): | ||
15 | |||
16 | | old | with FUA support | ||
17 | ------------------------------+---------------+------------------- | ||
18 | bs=4k, iodepth=1, numjobs=1 | 45.6k iops | 56.1k iops | ||
19 | bs=4k, iodepth=1, numjobs=16 | 183.3k iops | 236.0k iops | ||
20 | bs=4k, iodepth=16, numjobs=1 | 258.4k iops | 311.1k iops | ||
21 | |||
22 | However, not all scenarios are clear wins. On another slower disk I saw | ||
23 | little to no improvment. In fact, in two corner case scenarios, I even | ||
24 | observed a regression, which I however consider acceptable: | ||
25 | |||
26 | 1. On slow host disks in a write through cache mode, when the guest is | ||
27 | using virtio-blk in a separate iothread so that polling can be | ||
28 | enabled, and each completion is quickly followed up with a new | ||
29 | request (so that polling gets it), it can happen that enabling FUA | ||
30 | makes things slower - the additional very fast no-op flush we used to | ||
31 | have gave the adaptive polling algorithm a success so that it kept | ||
32 | polling. Without it, we only have the slow write request, which | ||
33 | disables polling. This is a problem in the polling algorithm that | ||
34 | will be fixed later in this series. | ||
35 | |||
36 | 2. With a high queue depth, it can be beneficial to have flush requests | ||
37 | for another reason: The optimisation in bdrv_co_flush() that flushes | ||
38 | only once per write generation acts as a synchronisation mechanism | ||
39 | that lets all requests complete at the same time. This can result in | ||
40 | better batching and if the disk is very fast (I only saw this with a | ||
41 | null_blk backend), this can make up for the overhead of the flush and | ||
42 | improve throughput. In theory, we could optionally introduce a | ||
43 | similar artificial latency in the normal completion path to achieve | ||
44 | the same kind of completion batching. This is not implemented in this | ||
45 | series. | ||
46 | |||
47 | Compatibility is not a concern for io_uring, it has supported RWF_DSYNC | ||
48 | from the start. Linux AIO started supporting it in Linux 4.13 and libaio | ||
49 | 0.3.111. The kernel is not a problem for any supported build platform, | ||
50 | so it's not necessary to add runtime checks. However, openSUSE is still | ||
51 | stuck with an older libaio version that would break the build. We must | ||
52 | detect this at build time to avoid build failures. | ||
53 | |||
54 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
55 | Message-ID: <20250307221634.71951-2-kwolf@redhat.com> | ||
56 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
13 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 57 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
14 | --- | 58 | --- |
15 | block/file-posix.c | 29 +++++++++++++++++------------ | 59 | include/block/raw-aio.h | 8 ++++++-- |
16 | 1 file changed, 17 insertions(+), 12 deletions(-) | 60 | block/file-posix.c | 26 ++++++++++++++++++-------- |
17 | 61 | block/io_uring.c | 13 ++++++++----- | |
62 | block/linux-aio.c | 24 +++++++++++++++++++++--- | ||
63 | meson.build | 4 ++++ | ||
64 | 5 files changed, 57 insertions(+), 18 deletions(-) | ||
65 | |||
66 | diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h | ||
67 | index XXXXXXX..XXXXXXX 100644 | ||
68 | --- a/include/block/raw-aio.h | ||
69 | +++ b/include/block/raw-aio.h | ||
70 | @@ -XXX,XX +XXX,XX @@ | ||
71 | #define QEMU_RAW_AIO_H | ||
72 | |||
73 | #include "block/aio.h" | ||
74 | +#include "block/block-common.h" | ||
75 | #include "qemu/iov.h" | ||
76 | |||
77 | /* AIO request types */ | ||
78 | @@ -XXX,XX +XXX,XX @@ void laio_cleanup(LinuxAioState *s); | ||
79 | |||
80 | /* laio_co_submit: submit I/O requests in the thread's current AioContext. */ | ||
81 | int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov, | ||
82 | - int type, uint64_t dev_max_batch); | ||
83 | + int type, BdrvRequestFlags flags, | ||
84 | + uint64_t dev_max_batch); | ||
85 | |||
86 | bool laio_has_fdsync(int); | ||
87 | +bool laio_has_fua(void); | ||
88 | void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context); | ||
89 | void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context); | ||
90 | #endif | ||
91 | @@ -XXX,XX +XXX,XX @@ void luring_cleanup(LuringState *s); | ||
92 | |||
93 | /* luring_co_submit: submit I/O requests in the thread's current AioContext. */ | ||
94 | int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset, | ||
95 | - QEMUIOVector *qiov, int type); | ||
96 | + QEMUIOVector *qiov, int type, | ||
97 | + BdrvRequestFlags flags); | ||
98 | void luring_detach_aio_context(LuringState *s, AioContext *old_context); | ||
99 | void luring_attach_aio_context(LuringState *s, AioContext *new_context); | ||
100 | #endif | ||
18 | diff --git a/block/file-posix.c b/block/file-posix.c | 101 | diff --git a/block/file-posix.c b/block/file-posix.c |
19 | index XXXXXXX..XXXXXXX 100644 | 102 | index XXXXXXX..XXXXXXX 100644 |
20 | --- a/block/file-posix.c | 103 | --- a/block/file-posix.c |
21 | +++ b/block/file-posix.c | 104 | +++ b/block/file-posix.c |
22 | @@ -XXX,XX +XXX,XX @@ static int fd_open(BlockDriverState *bs) | 105 | @@ -XXX,XX +XXX,XX @@ static int fd_open(BlockDriverState *bs) |
23 | return -EIO; | 106 | } |
24 | } | 107 | |
25 | 108 | static int64_t raw_getlength(BlockDriverState *bs); | |
26 | -static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs); | 109 | +static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs); |
27 | +static int64_t raw_getlength(BlockDriverState *bs); | ||
28 | 110 | ||
29 | typedef struct RawPosixAIOData { | 111 | typedef struct RawPosixAIOData { |
30 | BlockDriverState *bs; | 112 | BlockDriverState *bs; |
31 | @@ -XXX,XX +XXX,XX @@ static int handle_aiocb_write_zeroes(void *opaque) | 113 | @@ -XXX,XX +XXX,XX @@ static int raw_open_common(BlockDriverState *bs, QDict *options, |
32 | #ifdef CONFIG_FALLOCATE | 114 | #endif |
33 | /* Last resort: we are trying to extend the file with zeroed data. This | 115 | s->needs_alignment = raw_needs_alignment(bs); |
34 | * can be done via fallocate(fd, 0) */ | 116 | |
35 | - len = raw_co_getlength(aiocb->bs); | 117 | + if (!s->use_linux_aio || laio_has_fua()) { |
36 | + len = raw_getlength(aiocb->bs); | 118 | + bs->supported_write_flags = BDRV_REQ_FUA; |
37 | if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) { | 119 | + } |
38 | int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); | 120 | + |
39 | if (ret == 0 || ret != -ENOTSUP) { | 121 | bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK; |
40 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, | 122 | if (S_ISREG(st.st_mode)) { |
123 | /* When extending regular files, we get zeros from the OS */ | ||
124 | @@ -XXX,XX +XXX,XX @@ static inline bool raw_check_linux_aio(BDRVRawState *s) | ||
125 | #endif | ||
126 | |||
127 | static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, | ||
128 | - uint64_t bytes, QEMUIOVector *qiov, int type) | ||
129 | + uint64_t bytes, QEMUIOVector *qiov, int type, | ||
130 | + int flags) | ||
131 | { | ||
132 | BDRVRawState *s = bs->opaque; | ||
133 | RawPosixAIOData acb; | ||
134 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, | ||
135 | #ifdef CONFIG_LINUX_IO_URING | ||
136 | } else if (raw_check_linux_io_uring(s)) { | ||
137 | assert(qiov->size == bytes); | ||
138 | - ret = luring_co_submit(bs, s->fd, offset, qiov, type); | ||
139 | + ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags); | ||
140 | goto out; | ||
141 | #endif | ||
142 | #ifdef CONFIG_LINUX_AIO | ||
143 | } else if (raw_check_linux_aio(s)) { | ||
144 | assert(qiov->size == bytes); | ||
145 | - ret = laio_co_submit(s->fd, offset, qiov, type, | ||
146 | + ret = laio_co_submit(s->fd, offset, qiov, type, flags, | ||
147 | s->aio_max_batch); | ||
148 | goto out; | ||
149 | #endif | ||
150 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, | ||
151 | |||
152 | assert(qiov->size == bytes); | ||
153 | ret = raw_thread_pool_submit(handle_aiocb_rw, &acb); | ||
154 | + if (ret == 0 && (flags & BDRV_REQ_FUA)) { | ||
155 | + /* TODO Use pwritev2() instead if it's available */ | ||
156 | + ret = raw_co_flush_to_disk(bs); | ||
157 | + } | ||
158 | goto out; /* Avoid the compiler err of unused label */ | ||
159 | |||
160 | out: | ||
161 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset, | ||
162 | int64_t bytes, QEMUIOVector *qiov, | ||
163 | BdrvRequestFlags flags) | ||
164 | { | ||
165 | - return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ); | ||
166 | + return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ, flags); | ||
167 | } | ||
168 | |||
169 | static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, | ||
170 | int64_t bytes, QEMUIOVector *qiov, | ||
171 | BdrvRequestFlags flags) | ||
172 | { | ||
173 | - return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE); | ||
174 | + return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE, flags); | ||
175 | } | ||
176 | |||
177 | static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs) | ||
178 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs) | ||
179 | |||
180 | #ifdef CONFIG_LINUX_IO_URING | ||
181 | if (raw_check_linux_io_uring(s)) { | ||
182 | - return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH); | ||
183 | + return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0); | ||
41 | } | 184 | } |
42 | 185 | #endif | |
43 | if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { | 186 | #ifdef CONFIG_LINUX_AIO |
44 | - int64_t cur_length = raw_co_getlength(bs); | 187 | if (s->has_laio_fdsync && raw_check_linux_aio(s)) { |
45 | + int64_t cur_length = raw_getlength(bs); | 188 | - return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0); |
46 | 189 | + return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0, 0); | |
47 | if (offset != cur_length && exact) { | 190 | } |
48 | error_setg(errp, "Cannot resize device files"); | 191 | #endif |
49 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, | 192 | return raw_thread_pool_submit(handle_aiocb_flush, &acb); |
50 | } | 193 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, |
51 | 194 | } | |
52 | #ifdef __OpenBSD__ | 195 | |
53 | -static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs) | 196 | trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS); |
54 | +static int64_t raw_getlength(BlockDriverState *bs) | 197 | - return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND); |
55 | { | 198 | + return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND, 0); |
56 | BDRVRawState *s = bs->opaque; | 199 | } |
57 | int fd = s->fd; | 200 | #endif |
58 | @@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs) | 201 | |
59 | return st.st_size; | 202 | diff --git a/block/io_uring.c b/block/io_uring.c |
60 | } | 203 | index XXXXXXX..XXXXXXX 100644 |
61 | #elif defined(__NetBSD__) | 204 | --- a/block/io_uring.c |
62 | -static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs) | 205 | +++ b/block/io_uring.c |
63 | +static int64_t raw_getlength(BlockDriverState *bs) | 206 | @@ -XXX,XX +XXX,XX @@ static void luring_deferred_fn(void *opaque) |
64 | { | 207 | * |
65 | BDRVRawState *s = bs->opaque; | 208 | */ |
66 | int fd = s->fd; | 209 | static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, |
67 | @@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs) | 210 | - uint64_t offset, int type) |
68 | return st.st_size; | 211 | + uint64_t offset, int type, BdrvRequestFlags flags) |
69 | } | 212 | { |
70 | #elif defined(__sun__) | ||
71 | -static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs) | ||
72 | +static int64_t raw_getlength(BlockDriverState *bs) | ||
73 | { | ||
74 | BDRVRawState *s = bs->opaque; | ||
75 | struct dk_minfo minfo; | ||
76 | @@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs) | ||
77 | return size; | ||
78 | } | ||
79 | #elif defined(CONFIG_BSD) | ||
80 | -static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs) | ||
81 | +static int64_t raw_getlength(BlockDriverState *bs) | ||
82 | { | ||
83 | BDRVRawState *s = bs->opaque; | ||
84 | int fd = s->fd; | ||
85 | @@ -XXX,XX +XXX,XX @@ again: | ||
86 | return size; | ||
87 | } | ||
88 | #else | ||
89 | -static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs) | ||
90 | +static int64_t raw_getlength(BlockDriverState *bs) | ||
91 | { | ||
92 | BDRVRawState *s = bs->opaque; | ||
93 | int ret; | 213 | int ret; |
94 | @@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs) | 214 | struct io_uring_sqe *sqes = &luringcb->sqeq; |
95 | } | 215 | + int luring_flags; |
96 | #endif | 216 | |
97 | 217 | switch (type) { | |
98 | +static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs) | 218 | case QEMU_AIO_WRITE: |
219 | - io_uring_prep_writev(sqes, fd, luringcb->qiov->iov, | ||
220 | - luringcb->qiov->niov, offset); | ||
221 | + luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0; | ||
222 | + io_uring_prep_writev2(sqes, fd, luringcb->qiov->iov, | ||
223 | + luringcb->qiov->niov, offset, luring_flags); | ||
224 | break; | ||
225 | case QEMU_AIO_ZONE_APPEND: | ||
226 | io_uring_prep_writev(sqes, fd, luringcb->qiov->iov, | ||
227 | @@ -XXX,XX +XXX,XX @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, | ||
228 | } | ||
229 | |||
230 | int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset, | ||
231 | - QEMUIOVector *qiov, int type) | ||
232 | + QEMUIOVector *qiov, int type, | ||
233 | + BdrvRequestFlags flags) | ||
234 | { | ||
235 | int ret; | ||
236 | AioContext *ctx = qemu_get_current_aio_context(); | ||
237 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset, | ||
238 | }; | ||
239 | trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0, | ||
240 | type); | ||
241 | - ret = luring_do_submit(fd, &luringcb, s, offset, type); | ||
242 | + ret = luring_do_submit(fd, &luringcb, s, offset, type, flags); | ||
243 | |||
244 | if (ret < 0) { | ||
245 | return ret; | ||
246 | diff --git a/block/linux-aio.c b/block/linux-aio.c | ||
247 | index XXXXXXX..XXXXXXX 100644 | ||
248 | --- a/block/linux-aio.c | ||
249 | +++ b/block/linux-aio.c | ||
250 | @@ -XXX,XX +XXX,XX @@ static void laio_deferred_fn(void *opaque) | ||
251 | } | ||
252 | |||
253 | static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, | ||
254 | - int type, uint64_t dev_max_batch) | ||
255 | + int type, BdrvRequestFlags flags, | ||
256 | + uint64_t dev_max_batch) | ||
257 | { | ||
258 | LinuxAioState *s = laiocb->ctx; | ||
259 | struct iocb *iocbs = &laiocb->iocb; | ||
260 | QEMUIOVector *qiov = laiocb->qiov; | ||
261 | + int laio_flags; | ||
262 | |||
263 | switch (type) { | ||
264 | case QEMU_AIO_WRITE: | ||
265 | +#ifdef HAVE_IO_PREP_PWRITEV2 | ||
266 | + laio_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0; | ||
267 | + io_prep_pwritev2(iocbs, fd, qiov->iov, qiov->niov, offset, laio_flags); | ||
268 | +#else | ||
269 | + assert(flags == 0); | ||
270 | io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); | ||
271 | +#endif | ||
272 | break; | ||
273 | case QEMU_AIO_ZONE_APPEND: | ||
274 | io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); | ||
275 | @@ -XXX,XX +XXX,XX @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, | ||
276 | } | ||
277 | |||
278 | int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov, | ||
279 | - int type, uint64_t dev_max_batch) | ||
280 | + int type, BdrvRequestFlags flags, | ||
281 | + uint64_t dev_max_batch) | ||
282 | { | ||
283 | int ret; | ||
284 | AioContext *ctx = qemu_get_current_aio_context(); | ||
285 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov, | ||
286 | .qiov = qiov, | ||
287 | }; | ||
288 | |||
289 | - ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch); | ||
290 | + ret = laio_do_submit(fd, &laiocb, offset, type, flags, dev_max_batch); | ||
291 | if (ret < 0) { | ||
292 | return ret; | ||
293 | } | ||
294 | @@ -XXX,XX +XXX,XX @@ bool laio_has_fdsync(int fd) | ||
295 | io_destroy(ctx); | ||
296 | return (ret == -EINVAL) ? false : true; | ||
297 | } | ||
298 | + | ||
299 | +bool laio_has_fua(void) | ||
99 | +{ | 300 | +{ |
100 | + return raw_getlength(bs); | 301 | +#ifdef HAVE_IO_PREP_PWRITEV2 |
302 | + return true; | ||
303 | +#else | ||
304 | + return false; | ||
305 | +#endif | ||
101 | +} | 306 | +} |
102 | + | 307 | diff --git a/meson.build b/meson.build |
103 | static int64_t coroutine_fn raw_co_get_allocated_file_size(BlockDriverState *bs) | 308 | index XXXXXXX..XXXXXXX 100644 |
104 | { | 309 | --- a/meson.build |
105 | struct stat st; | 310 | +++ b/meson.build |
106 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, | 311 | @@ -XXX,XX +XXX,XX @@ config_host_data.set('HAVE_OPTRESET', |
107 | * round up if necessary. | 312 | cc.has_header_symbol('getopt.h', 'optreset')) |
108 | */ | 313 | config_host_data.set('HAVE_IPPROTO_MPTCP', |
109 | if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) { | 314 | cc.has_header_symbol('netinet/in.h', 'IPPROTO_MPTCP')) |
110 | - int64_t file_length = raw_co_getlength(bs); | 315 | +if libaio.found() |
111 | + int64_t file_length = raw_getlength(bs); | 316 | + config_host_data.set('HAVE_IO_PREP_PWRITEV2', |
112 | if (file_length > 0) { | 317 | + cc.has_header_symbol('libaio.h', 'io_prep_pwritev2')) |
113 | /* Ignore errors, this is just a safeguard */ | 318 | +endif |
114 | assert(hole == file_length); | 319 | |
115 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, | 320 | # has_member |
116 | 321 | config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', | |
117 | #if defined(__linux__) | ||
118 | /* Verify that the file is not in the page cache */ | ||
119 | -static void coroutine_fn check_cache_dropped(BlockDriverState *bs, Error **errp) | ||
120 | +static void check_cache_dropped(BlockDriverState *bs, Error **errp) | ||
121 | { | ||
122 | const size_t window_size = 128 * 1024 * 1024; | ||
123 | BDRVRawState *s = bs->opaque; | ||
124 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn check_cache_dropped(BlockDriverState *bs, Error **errp) | ||
125 | page_size = sysconf(_SC_PAGESIZE); | ||
126 | vec = g_malloc(DIV_ROUND_UP(window_size, page_size)); | ||
127 | |||
128 | - end = raw_co_getlength(bs); | ||
129 | + end = raw_getlength(bs); | ||
130 | |||
131 | for (offset = 0; offset < end; offset += window_size) { | ||
132 | void *new_window; | ||
133 | @@ -XXX,XX +XXX,XX @@ static int cdrom_reopen(BlockDriverState *bs) | ||
134 | |||
135 | static bool coroutine_fn cdrom_co_is_inserted(BlockDriverState *bs) | ||
136 | { | ||
137 | - return raw_co_getlength(bs) > 0; | ||
138 | + return raw_getlength(bs) > 0; | ||
139 | } | ||
140 | |||
141 | static void coroutine_fn cdrom_co_eject(BlockDriverState *bs, bool eject_flag) | ||
142 | -- | 322 | -- |
143 | 2.41.0 | 323 | 2.48.1 |
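For reference, the kernel interface behind the patch above can be exercised outside QEMU. Below is a minimal standalone sketch (not QEMU code) of a FUA-style write using pwritev2() with RWF_DSYNC, the same mechanism the Linux AIO and io_uring backends now use and the one the commit message suggests for the thread pool later; it assumes Linux >= 4.13 and a glibc that exposes pwritev2():

```c
#define _GNU_SOURCE
#include <sys/uio.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char data[] = "payload that must be durable when the write completes\n";
    struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) - 1 };

    int fd = open("fua-test.bin", O_WRONLY | O_CREAT, 0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /* One syscall instead of pwritev() followed by fdatasync() */
    ssize_t n = pwritev2(fd, &iov, 1, 0, RWF_DSYNC);
    if (n < 0) {
        perror("pwritev2");
        return 1;
    }

    close(fd);
    return 0;
}
```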
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | For block drivers that don't advertise FUA support, we already call |
---|---|---|---|
2 | bdrv_co_flush(), which considers BDRV_O_NO_FLUSH. However, drivers that | ||
3 | do support FUA still see the FUA flag with BDRV_O_NO_FLUSH and get the | ||
4 | associated performance penalty that cache.no-flush=on was supposed to | ||
5 | avoid. | ||
2 | 6 | ||
3 | bdrv_co_getlength was recently introduced, with bdrv_getlength becoming | 7 | Clear FUA for write requests if BDRV_O_NO_FLUSH is set. |
4 | a wrapper for use in unknown context. Switch to bdrv_co_getlength when | ||
5 | possible. | ||
6 | 8 | ||
7 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | 9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
8 | Message-ID: <20230601115145.196465-12-pbonzini@redhat.com> | 10 | Message-ID: <20250307221634.71951-3-kwolf@redhat.com> |
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 11 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 12 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 13 | --- |
12 | block/io.c | 10 +++++----- | 14 | block/io.c | 4 ++++ |
13 | block/parallels.c | 4 ++-- | 15 | 1 file changed, 4 insertions(+) |
14 | block/qcow.c | 6 +++--- | ||
15 | block/vmdk.c | 4 ++-- | ||
16 | 4 files changed, 12 insertions(+), 12 deletions(-) | ||
17 | 16 | ||
18 | diff --git a/block/io.c b/block/io.c | 17 | diff --git a/block/io.c b/block/io.c |
19 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
20 | --- a/block/io.c | 19 | --- a/block/io.c |
21 | +++ b/block/io.c | 20 | +++ b/block/io.c |
22 | @@ -XXX,XX +XXX,XX @@ bdrv_aligned_preadv(BdrvChild *child, BdrvTrackedRequest *req, | 21 | @@ -XXX,XX +XXX,XX @@ bdrv_driver_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, |
22 | return -ENOMEDIUM; | ||
23 | } | 23 | } |
24 | 24 | ||
25 | /* Forward the request to the BlockDriver, possibly fragmenting it */ | 25 | + if (bs->open_flags & BDRV_O_NO_FLUSH) { |
26 | - total_bytes = bdrv_getlength(bs); | 26 | + flags &= ~BDRV_REQ_FUA; |
27 | + total_bytes = bdrv_co_getlength(bs); | 27 | + } |
28 | if (total_bytes < 0) { | 28 | + |
29 | ret = total_bytes; | 29 | if ((flags & BDRV_REQ_FUA) && |
30 | goto out; | 30 | (~bs->supported_write_flags & BDRV_REQ_FUA)) { |
31 | @@ -XXX,XX +XXX,XX @@ bdrv_co_block_status(BlockDriverState *bs, bool want_zero, | 31 | flags &= ~BDRV_REQ_FUA; |
32 | assert(pnum); | ||
33 | assert_bdrv_graph_readable(); | ||
34 | *pnum = 0; | ||
35 | - total_size = bdrv_getlength(bs); | ||
36 | + total_size = bdrv_co_getlength(bs); | ||
37 | if (total_size < 0) { | ||
38 | ret = total_size; | ||
39 | goto early_out; | ||
40 | @@ -XXX,XX +XXX,XX @@ bdrv_co_block_status(BlockDriverState *bs, bool want_zero, | ||
41 | bytes = n; | ||
42 | } | ||
43 | |||
44 | - /* Must be non-NULL or bdrv_getlength() would have failed */ | ||
45 | + /* Must be non-NULL or bdrv_co_getlength() would have failed */ | ||
46 | assert(bs->drv); | ||
47 | has_filtered_child = bdrv_filter_child(bs); | ||
48 | if (!bs->drv->bdrv_co_block_status && !has_filtered_child) { | ||
49 | @@ -XXX,XX +XXX,XX @@ bdrv_co_block_status(BlockDriverState *bs, bool want_zero, | ||
50 | if (!cow_bs) { | ||
51 | ret |= BDRV_BLOCK_ZERO; | ||
52 | } else if (want_zero) { | ||
53 | - int64_t size2 = bdrv_getlength(cow_bs); | ||
54 | + int64_t size2 = bdrv_co_getlength(cow_bs); | ||
55 | |||
56 | if (size2 >= 0 && offset >= size2) { | ||
57 | ret |= BDRV_BLOCK_ZERO; | ||
58 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, | ||
59 | return ret; | ||
60 | } | ||
61 | |||
62 | - old_size = bdrv_getlength(bs); | ||
63 | + old_size = bdrv_co_getlength(bs); | ||
64 | if (old_size < 0) { | ||
65 | error_setg_errno(errp, -old_size, "Failed to get old image size"); | ||
66 | return old_size; | ||
67 | diff --git a/block/parallels.c b/block/parallels.c | ||
68 | index XXXXXXX..XXXXXXX 100644 | ||
69 | --- a/block/parallels.c | ||
70 | +++ b/block/parallels.c | ||
71 | @@ -XXX,XX +XXX,XX @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num, | ||
72 | assert(idx < s->bat_size && idx + to_allocate <= s->bat_size); | ||
73 | |||
74 | space = to_allocate * s->tracks; | ||
75 | - len = bdrv_getlength(bs->file->bs); | ||
76 | + len = bdrv_co_getlength(bs->file->bs); | ||
77 | if (len < 0) { | ||
78 | return len; | ||
79 | } | ||
80 | @@ -XXX,XX +XXX,XX @@ parallels_check_outside_image(BlockDriverState *bs, BdrvCheckResult *res, | ||
81 | uint32_t i; | ||
82 | int64_t off, high_off, size; | ||
83 | |||
84 | - size = bdrv_getlength(bs->file->bs); | ||
85 | + size = bdrv_co_getlength(bs->file->bs); | ||
86 | if (size < 0) { | ||
87 | res->check_errors++; | ||
88 | return size; | ||
89 | diff --git a/block/qcow.c b/block/qcow.c | ||
90 | index XXXXXXX..XXXXXXX 100644 | ||
91 | --- a/block/qcow.c | ||
92 | +++ b/block/qcow.c | ||
93 | @@ -XXX,XX +XXX,XX @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate, | ||
94 | if (!allocate) | ||
95 | return 0; | ||
96 | /* allocate a new l2 entry */ | ||
97 | - l2_offset = bdrv_getlength(bs->file->bs); | ||
98 | + l2_offset = bdrv_co_getlength(bs->file->bs); | ||
99 | if (l2_offset < 0) { | ||
100 | return l2_offset; | ||
101 | } | ||
102 | @@ -XXX,XX +XXX,XX @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate, | ||
103 | if (decompress_cluster(bs, cluster_offset) < 0) { | ||
104 | return -EIO; | ||
105 | } | ||
106 | - cluster_offset = bdrv_getlength(bs->file->bs); | ||
107 | + cluster_offset = bdrv_co_getlength(bs->file->bs); | ||
108 | if ((int64_t) cluster_offset < 0) { | ||
109 | return cluster_offset; | ||
110 | } | ||
111 | @@ -XXX,XX +XXX,XX @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate, | ||
112 | return ret; | ||
113 | } | ||
114 | } else { | ||
115 | - cluster_offset = bdrv_getlength(bs->file->bs); | ||
116 | + cluster_offset = bdrv_co_getlength(bs->file->bs); | ||
117 | if ((int64_t) cluster_offset < 0) { | ||
118 | return cluster_offset; | ||
119 | } | ||
120 | diff --git a/block/vmdk.c b/block/vmdk.c | ||
121 | index XXXXXXX..XXXXXXX 100644 | ||
122 | --- a/block/vmdk.c | ||
123 | +++ b/block/vmdk.c | ||
124 | @@ -XXX,XX +XXX,XX @@ vmdk_co_pwritev_compressed(BlockDriverState *bs, int64_t offset, int64_t bytes, | ||
125 | int64_t length; | ||
126 | |||
127 | for (i = 0; i < s->num_extents; i++) { | ||
128 | - length = bdrv_getlength(s->extents[i].file->bs); | ||
129 | + length = bdrv_co_getlength(s->extents[i].file->bs); | ||
130 | if (length < 0) { | ||
131 | return length; | ||
132 | } | ||
133 | @@ -XXX,XX +XXX,XX @@ vmdk_co_check(BlockDriverState *bs, BdrvCheckResult *result, BdrvCheckMode fix) | ||
134 | break; | ||
135 | } | ||
136 | if (ret == VMDK_OK) { | ||
137 | - int64_t extent_len = bdrv_getlength(extent->file->bs); | ||
138 | + int64_t extent_len = bdrv_co_getlength(extent->file->bs); | ||
139 | if (extent_len < 0) { | ||
140 | fprintf(stderr, | ||
141 | "ERROR: could not get extent file length for sector %" | ||
142 | -- | 32 | -- |
143 | 2.41.0 | 33 | 2.48.1 |
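A small sketch of the resulting flag-resolution order may help: cache.no-flush strips FUA before the "driver lacks native FUA" fallback is considered, so no flush is issued in either form. The helper and type names below are hypothetical, not the QEMU functions:

```c
#include <stdbool.h>
#include <stdio.h>

enum { REQ_FUA = 1 << 0 };          /* stand-in for BDRV_REQ_FUA */
enum { O_NO_FLUSH_OPT = 1 << 0 };   /* stand-in for BDRV_O_NO_FLUSH */

struct bds {
    int open_flags;            /* e.g. O_NO_FLUSH_OPT */
    int supported_write_flags; /* REQ_FUA if the driver handles it natively */
};

/* Returns the flags to pass down; *emulate_fua tells the caller whether
 * it still needs an explicit flush after the write completes. */
static int resolve_write_flags(const struct bds *bs, int flags,
                               bool *emulate_fua)
{
    *emulate_fua = false;

    if (bs->open_flags & O_NO_FLUSH_OPT) {
        flags &= ~REQ_FUA;          /* user asked for no flushes at all */
    }
    if ((flags & REQ_FUA) && !(bs->supported_write_flags & REQ_FUA)) {
        flags &= ~REQ_FUA;          /* driver can't do it natively */
        *emulate_fua = true;        /* fall back to write + flush */
    }
    return flags;
}

int main(void)
{
    struct bds noflush = { .open_flags = O_NO_FLUSH_OPT,
                           .supported_write_flags = REQ_FUA };
    bool emulate;
    int f = resolve_write_flags(&noflush, REQ_FUA, &emulate);
    printf("flags=%d emulate_flush=%d\n", f, emulate); /* 0, 0: FUA dropped */
    return 0;
}
```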
1 | The function can move the child node to a different AioContext. In this | 1 | As a preparation for having multiple adaptive polling states per |
---|---|---|---|
2 | case, it also must take the AioContext lock for the new context before | 2 | AioContext, move the 'ns' field into a separate struct. |
3 | calling functions that require the caller to hold the AioContext for the | ||
4 | child node. | ||
5 | 3 | ||
6 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 4 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
7 | Message-ID: <20230605085711.21261-6-kwolf@redhat.com> | 5 | Message-ID: <20250307221634.71951-4-kwolf@redhat.com> |
8 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 6 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 7 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
10 | --- | 8 | --- |
11 | block.c | 21 ++++++++++++++++++++- | 9 | include/block/aio.h | 6 +++++- |
12 | 1 file changed, 20 insertions(+), 1 deletion(-) | 10 | util/aio-posix.c | 31 ++++++++++++++++--------------- |
11 | util/async.c | 3 ++- | ||
12 | 3 files changed, 23 insertions(+), 17 deletions(-) | ||
13 | 13 | ||
14 | diff --git a/block.c b/block.c | 14 | diff --git a/include/block/aio.h b/include/block/aio.h |
15 | index XXXXXXX..XXXXXXX 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/block.c | 16 | --- a/include/block/aio.h |
17 | +++ b/block.c | 17 | +++ b/include/block/aio.h |
18 | @@ -XXX,XX +XXX,XX @@ static TransactionActionDrv bdrv_attach_child_common_drv = { | 18 | @@ -XXX,XX +XXX,XX @@ struct BHListSlice { |
19 | * Function doesn't update permissions, caller is responsible for this. | 19 | |
20 | * | 20 | typedef QSLIST_HEAD(, AioHandler) AioHandlerSList; |
21 | * Returns new created child. | 21 | |
22 | + * | 22 | +typedef struct AioPolledEvent { |
23 | + * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and | 23 | + int64_t ns; /* current polling time in nanoseconds */ |
24 | + * @child_bs can move to a different AioContext in this function. Callers must | 24 | +} AioPolledEvent; |
25 | + * make sure that their AioContext locking is still correct after this. | 25 | + |
26 | */ | 26 | struct AioContext { |
27 | static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs, | 27 | GSource source; |
28 | const char *child_name, | 28 | |
29 | @@ -XXX,XX +XXX,XX @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs, | 29 | @@ -XXX,XX +XXX,XX @@ struct AioContext { |
30 | Transaction *tran, Error **errp) | 30 | int poll_disable_cnt; |
31 | { | 31 | |
32 | BdrvChild *new_child; | 32 | /* Polling mode parameters */ |
33 | - AioContext *parent_ctx; | 33 | - int64_t poll_ns; /* current polling time in nanoseconds */ |
34 | + AioContext *parent_ctx, *new_child_ctx; | 34 | + AioPolledEvent poll; |
35 | AioContext *child_ctx = bdrv_get_aio_context(child_bs); | 35 | int64_t poll_max_ns; /* maximum polling time in nanoseconds */ |
36 | 36 | int64_t poll_grow; /* polling time growth factor */ | |
37 | assert(child_class->get_parent_desc); | 37 | int64_t poll_shrink; /* polling time shrink factor */ |
38 | @@ -XXX,XX +XXX,XX @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs, | 38 | diff --git a/util/aio-posix.c b/util/aio-posix.c |
39 | index XXXXXXX..XXXXXXX 100644 | ||
40 | --- a/util/aio-posix.c | ||
41 | +++ b/util/aio-posix.c | ||
42 | @@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list, | ||
43 | return false; | ||
44 | } | ||
45 | |||
46 | - max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); | ||
47 | + max_ns = qemu_soonest_timeout(*timeout, ctx->poll.ns); | ||
48 | if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) { | ||
49 | /* | ||
50 | * Enable poll mode. It pairs with the poll_set_started() in | ||
51 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
52 | if (ctx->poll_max_ns) { | ||
53 | int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start; | ||
54 | |||
55 | - if (block_ns <= ctx->poll_ns) { | ||
56 | + if (block_ns <= ctx->poll.ns) { | ||
57 | /* This is the sweet spot, no adjustment needed */ | ||
58 | } else if (block_ns > ctx->poll_max_ns) { | ||
59 | /* We'd have to poll for too long, poll less */ | ||
60 | - int64_t old = ctx->poll_ns; | ||
61 | + int64_t old = ctx->poll.ns; | ||
62 | |||
63 | if (ctx->poll_shrink) { | ||
64 | - ctx->poll_ns /= ctx->poll_shrink; | ||
65 | + ctx->poll.ns /= ctx->poll_shrink; | ||
66 | } else { | ||
67 | - ctx->poll_ns = 0; | ||
68 | + ctx->poll.ns = 0; | ||
69 | } | ||
70 | |||
71 | - trace_poll_shrink(ctx, old, ctx->poll_ns); | ||
72 | - } else if (ctx->poll_ns < ctx->poll_max_ns && | ||
73 | + trace_poll_shrink(ctx, old, ctx->poll.ns); | ||
74 | + } else if (ctx->poll.ns < ctx->poll_max_ns && | ||
75 | block_ns < ctx->poll_max_ns) { | ||
76 | /* There is room to grow, poll longer */ | ||
77 | - int64_t old = ctx->poll_ns; | ||
78 | + int64_t old = ctx->poll.ns; | ||
79 | int64_t grow = ctx->poll_grow; | ||
80 | |||
81 | if (grow == 0) { | ||
82 | grow = 2; | ||
83 | } | ||
84 | |||
85 | - if (ctx->poll_ns) { | ||
86 | - ctx->poll_ns *= grow; | ||
87 | + if (ctx->poll.ns) { | ||
88 | + ctx->poll.ns *= grow; | ||
89 | } else { | ||
90 | - ctx->poll_ns = 4000; /* start polling at 4 microseconds */ | ||
91 | + ctx->poll.ns = 4000; /* start polling at 4 microseconds */ | ||
92 | } | ||
93 | |||
94 | - if (ctx->poll_ns > ctx->poll_max_ns) { | ||
95 | - ctx->poll_ns = ctx->poll_max_ns; | ||
96 | + if (ctx->poll.ns > ctx->poll_max_ns) { | ||
97 | + ctx->poll.ns = ctx->poll_max_ns; | ||
98 | } | ||
99 | |||
100 | - trace_poll_grow(ctx, old, ctx->poll_ns); | ||
101 | + trace_poll_grow(ctx, old, ctx->poll.ns); | ||
39 | } | 102 | } |
40 | } | 103 | } |
41 | 104 | ||
42 | + new_child_ctx = bdrv_get_aio_context(child_bs); | 105 | @@ -XXX,XX +XXX,XX @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, |
43 | + if (new_child_ctx != child_ctx) { | 106 | /* No thread synchronization here, it doesn't matter if an incorrect value |
44 | + aio_context_release(child_ctx); | 107 | * is used once. |
45 | + aio_context_acquire(new_child_ctx); | 108 | */ |
46 | + } | 109 | + ctx->poll.ns = 0; |
47 | + | 110 | + |
48 | bdrv_ref(child_bs); | 111 | ctx->poll_max_ns = max_ns; |
49 | /* | 112 | - ctx->poll_ns = 0; |
50 | * Let every new BdrvChild start with a drained parent. Inserting the child | 113 | ctx->poll_grow = grow; |
51 | @@ -XXX,XX +XXX,XX @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs, | 114 | ctx->poll_shrink = shrink; |
52 | }; | 115 | |
53 | tran_add(tran, &bdrv_attach_child_common_drv, s); | 116 | diff --git a/util/async.c b/util/async.c |
54 | 117 | index XXXXXXX..XXXXXXX 100644 | |
55 | + if (new_child_ctx != child_ctx) { | 118 | --- a/util/async.c |
56 | + aio_context_release(new_child_ctx); | 119 | +++ b/util/async.c |
57 | + aio_context_acquire(child_ctx); | 120 | @@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp) |
58 | + } | 121 | qemu_rec_mutex_init(&ctx->lock); |
122 | timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); | ||
123 | |||
124 | - ctx->poll_ns = 0; | ||
125 | + ctx->poll.ns = 0; | ||
59 | + | 126 | + |
60 | return new_child; | 127 | ctx->poll_max_ns = 0; |
61 | } | 128 | ctx->poll_grow = 0; |
62 | 129 | ctx->poll_shrink = 0; | |
63 | /* | ||
64 | * Function doesn't update permissions, caller is responsible for this. | ||
65 | + * | ||
66 | + * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and | ||
67 | + * @child_bs can move to a different AioContext in this function. Callers must | ||
68 | + * make sure that their AioContext locking is still correct after this. | ||
69 | */ | ||
70 | static BdrvChild *bdrv_attach_child_noperm(BlockDriverState *parent_bs, | ||
71 | BlockDriverState *child_bs, | ||
72 | -- | 130 | -- |
73 | 2.41.0 | 131 | 2.48.1 |
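The adaptive polling algorithm that this refactoring prepares to replicate per event source, and that the next patch factors out into adjust_polling_time(), can be summarized in a toy model (not QEMU code): each source keeps its own polling budget in nanoseconds, grown or shrunk based on how long the last wait actually blocked. The grow factor of 2 and the 4 microsecond starting budget mirror the defaults visible in the patches:

```c
#include <stdint.h>
#include <stdio.h>

typedef struct {
    int64_t ns;                 /* current polling time for one source */
} PolledEvent;

static void adjust(PolledEvent *ev, int64_t block_ns, int64_t max_ns)
{
    if (block_ns <= ev->ns) {
        /* sweet spot: polling already covers the observed wait */
    } else if (block_ns > max_ns) {
        ev->ns = 0;             /* too expensive to poll, stop polling */
    } else {
        ev->ns = ev->ns ? ev->ns * 2 : 4000;   /* grow, start at 4 us */
        if (ev->ns > max_ns) {
            ev->ns = max_ns;
        }
    }
}

int main(void)
{
    PolledEvent ev = { .ns = 0 };
    int64_t waits[] = { 10000, 10000, 10000, 1000000 };  /* ns */

    /* Budget grows 4000 -> 8000 -> 16000, then a long wait resets it */
    for (unsigned i = 0; i < 4; i++) {
        adjust(&ev, waits[i], 32000 /* poll-max-ns */);
        printf("after wait of %lld ns -> poll %lld ns\n",
               (long long)waits[i], (long long)ev.ns);
    }
    return 0;
}
```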
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
---|---|---|---|
2 | 2 | Message-ID: <20250307221634.71951-5-kwolf@redhat.com> | |
3 | Mark functions as coroutine_fn when they are only called by other coroutine_fns | 3 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
4 | and they can suspend. Change calls to co_wrappers to use the non-wrapped | ||
5 | functions, which in turn requires adding GRAPH_RDLOCK annotations. | ||
6 | |||
7 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | ||
8 | Message-ID: <20230601115145.196465-8-pbonzini@redhat.com> | ||
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 4 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 5 | --- |
12 | block/dmg.c | 21 +++++++++++---------- | 6 | util/aio-posix.c | 77 ++++++++++++++++++++++++++---------------------- |
13 | 1 file changed, 11 insertions(+), 10 deletions(-) | 7 | 1 file changed, 41 insertions(+), 36 deletions(-) |
14 | 8 | ||
15 | diff --git a/block/dmg.c b/block/dmg.c | 9 | diff --git a/util/aio-posix.c b/util/aio-posix.c |
16 | index XXXXXXX..XXXXXXX 100644 | 10 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/dmg.c | 11 | --- a/util/aio-posix.c |
18 | +++ b/block/dmg.c | 12 | +++ b/util/aio-posix.c |
19 | @@ -XXX,XX +XXX,XX @@ err: | 13 | @@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list, |
20 | return s->n_chunks; /* error */ | 14 | return false; |
21 | } | 15 | } |
22 | 16 | ||
23 | -static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) | 17 | +static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll, |
24 | +static int coroutine_fn GRAPH_RDLOCK | 18 | + int64_t block_ns) |
25 | +dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) | 19 | +{ |
20 | + if (block_ns <= poll->ns) { | ||
21 | + /* This is the sweet spot, no adjustment needed */ | ||
22 | + } else if (block_ns > ctx->poll_max_ns) { | ||
23 | + /* We'd have to poll for too long, poll less */ | ||
24 | + int64_t old = poll->ns; | ||
25 | + | ||
26 | + if (ctx->poll_shrink) { | ||
27 | + poll->ns /= ctx->poll_shrink; | ||
28 | + } else { | ||
29 | + poll->ns = 0; | ||
30 | + } | ||
31 | + | ||
32 | + trace_poll_shrink(ctx, old, poll->ns); | ||
33 | + } else if (poll->ns < ctx->poll_max_ns && | ||
34 | + block_ns < ctx->poll_max_ns) { | ||
35 | + /* There is room to grow, poll longer */ | ||
36 | + int64_t old = poll->ns; | ||
37 | + int64_t grow = ctx->poll_grow; | ||
38 | + | ||
39 | + if (grow == 0) { | ||
40 | + grow = 2; | ||
41 | + } | ||
42 | + | ||
43 | + if (poll->ns) { | ||
44 | + poll->ns *= grow; | ||
45 | + } else { | ||
46 | + poll->ns = 4000; /* start polling at 4 microseconds */ | ||
47 | + } | ||
48 | + | ||
49 | + if (poll->ns > ctx->poll_max_ns) { | ||
50 | + poll->ns = ctx->poll_max_ns; | ||
51 | + } | ||
52 | + | ||
53 | + trace_poll_grow(ctx, old, poll->ns); | ||
54 | + } | ||
55 | +} | ||
56 | + | ||
57 | bool aio_poll(AioContext *ctx, bool blocking) | ||
26 | { | 58 | { |
27 | BDRVDMGState *s = bs->opaque; | 59 | AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list); |
28 | 60 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | |
29 | @@ -XXX,XX +XXX,XX @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) | 61 | /* Adjust polling time */ |
30 | case UDZO: { /* zlib compressed */ | 62 | if (ctx->poll_max_ns) { |
31 | /* we need to buffer, because only the chunk as whole can be | 63 | int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start; |
32 | * inflated. */ | 64 | - |
33 | - ret = bdrv_pread(bs->file, s->offsets[chunk], s->lengths[chunk], | 65 | - if (block_ns <= ctx->poll.ns) { |
34 | - s->compressed_chunk, 0); | 66 | - /* This is the sweet spot, no adjustment needed */ |
35 | + ret = bdrv_co_pread(bs->file, s->offsets[chunk], s->lengths[chunk], | 67 | - } else if (block_ns > ctx->poll_max_ns) { |
36 | + s->compressed_chunk, 0); | 68 | - /* We'd have to poll for too long, poll less */ |
37 | if (ret < 0) { | 69 | - int64_t old = ctx->poll.ns; |
38 | return -1; | 70 | - |
39 | } | 71 | - if (ctx->poll_shrink) { |
40 | @@ -XXX,XX +XXX,XX @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) | 72 | - ctx->poll.ns /= ctx->poll_shrink; |
41 | } | 73 | - } else { |
42 | /* we need to buffer, because only the chunk as whole can be | 74 | - ctx->poll.ns = 0; |
43 | * inflated. */ | 75 | - } |
44 | - ret = bdrv_pread(bs->file, s->offsets[chunk], s->lengths[chunk], | 76 | - |
45 | - s->compressed_chunk, 0); | 77 | - trace_poll_shrink(ctx, old, ctx->poll.ns); |
46 | + ret = bdrv_co_pread(bs->file, s->offsets[chunk], s->lengths[chunk], | 78 | - } else if (ctx->poll.ns < ctx->poll_max_ns && |
47 | + s->compressed_chunk, 0); | 79 | - block_ns < ctx->poll_max_ns) { |
48 | if (ret < 0) { | 80 | - /* There is room to grow, poll longer */ |
49 | return -1; | 81 | - int64_t old = ctx->poll.ns; |
50 | } | 82 | - int64_t grow = ctx->poll_grow; |
51 | @@ -XXX,XX +XXX,XX @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) | 83 | - |
52 | } | 84 | - if (grow == 0) { |
53 | /* we need to buffer, because only the chunk as whole can be | 85 | - grow = 2; |
54 | * inflated. */ | 86 | - } |
55 | - ret = bdrv_pread(bs->file, s->offsets[chunk], s->lengths[chunk], | 87 | - |
56 | - s->compressed_chunk, 0); | 88 | - if (ctx->poll.ns) { |
57 | + ret = bdrv_co_pread(bs->file, s->offsets[chunk], s->lengths[chunk], | 89 | - ctx->poll.ns *= grow; |
58 | + s->compressed_chunk, 0); | 90 | - } else { |
59 | if (ret < 0) { | 91 | - ctx->poll.ns = 4000; /* start polling at 4 microseconds */ |
60 | return -1; | 92 | - } |
61 | } | 93 | - |
62 | @@ -XXX,XX +XXX,XX @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) | 94 | - if (ctx->poll.ns > ctx->poll_max_ns) { |
63 | } | 95 | - ctx->poll.ns = ctx->poll_max_ns; |
64 | break; | 96 | - } |
65 | case UDRW: /* copy */ | 97 | - |
66 | - ret = bdrv_pread(bs->file, s->offsets[chunk], s->lengths[chunk], | 98 | - trace_poll_grow(ctx, old, ctx->poll.ns); |
67 | - s->uncompressed_chunk, 0); | 99 | - } |
68 | + ret = bdrv_co_pread(bs->file, s->offsets[chunk], s->lengths[chunk], | 100 | + adjust_polling_time(ctx, &ctx->poll, block_ns); |
69 | + s->uncompressed_chunk, 0); | 101 | } |
70 | if (ret < 0) { | 102 | |
71 | return -1; | 103 | progress |= aio_bh_poll(ctx); |
72 | } | ||
73 | @@ -XXX,XX +XXX,XX @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) | ||
74 | return 0; | ||
75 | } | ||
76 | |||
77 | -static int coroutine_fn | ||
78 | +static int coroutine_fn GRAPH_RDLOCK | ||
79 | dmg_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, | ||
80 | QEMUIOVector *qiov, BdrvRequestFlags flags) | ||
81 | { | ||
82 | -- | 104 | -- |
83 | 2.41.0 | 105 | 2.48.1
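The grow/shrink policy factored out into adjust_polling_time() on the right is small enough to model in isolation. Below is a self-contained sketch of the same policy; Ctx and PolledEvent are reduced stand-ins for AioContext and AioPolledEvent, and main() just feeds in a few observed blocking times to show the window ramping up from 4 microseconds, doubling, collapsing on a long block, and recovering.

#include <stdint.h>
#include <stdio.h>

/* Reduced stand-ins for the AioContext/AioPolledEvent fields used above. */
typedef struct { int64_t ns; } PolledEvent;
typedef struct {
    int64_t poll_max_ns;   /* upper bound for the polling window */
    int64_t poll_grow;     /* growth factor, 0 selects the default of 2 */
    int64_t poll_shrink;   /* shrink divisor, 0 switches polling off instead */
} Ctx;

/* Same policy as the factored-out function: keep the window if it already
 * covers the observed blocking time, shrink (or disable) it when blocking
 * exceeded the maximum, otherwise grow it, starting at 4 microseconds and
 * capping at poll_max_ns. */
static void adjust_polling_time(Ctx *ctx, PolledEvent *poll, int64_t block_ns)
{
    if (block_ns <= poll->ns) {
        /* sweet spot, no adjustment needed */
    } else if (block_ns > ctx->poll_max_ns) {
        poll->ns = ctx->poll_shrink ? poll->ns / ctx->poll_shrink : 0;
    } else if (poll->ns < ctx->poll_max_ns && block_ns < ctx->poll_max_ns) {
        int64_t grow = ctx->poll_grow ? ctx->poll_grow : 2;

        poll->ns = poll->ns ? poll->ns * grow : 4000;
        if (poll->ns > ctx->poll_max_ns) {
            poll->ns = ctx->poll_max_ns;
        }
    }
}

int main(void)
{
    Ctx ctx = { .poll_max_ns = 32000 };
    PolledEvent ev = { 0 };
    int64_t samples[] = { 5000, 6000, 9000, 50000, 7000 };

    for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        adjust_polling_time(&ctx, &ev, samples[i]);
        printf("block_ns=%6lld -> poll.ns=%6lld\n",
               (long long)samples[i], (long long)ev.ns);
    }
    return 0;
}

With the default grow factor of 2 and no shrink divisor, a single block longer than poll_max_ns drops the window straight back to zero, which is exactly the behaviour the per-handler split in the next patch makes less costly.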
1 | bdrv_root_attach_child() requires callers to hold the AioContext lock | 1 | Adaptive polling has a big problem: It doesn't consider that an event |
---|---|---|---|
2 | for child_bs. Take it in block_job_add_bdrv() before calling the | 2 | loop can wait for many different events that may have very different |
3 | function. | 3 | typical latencies. |
4 | |||
5 | For example, think of a guest that tends to send a new I/O request soon | ||
6 | after the previous I/O request completes, but the storage on the host is | ||
7 | rather slow. In this case, getting the new request from the guest quickly
8 | means that polling is enabled, but the next thing is performing the I/O | ||
9 | request on the backend, which is slow and disables polling again for the | ||
10 | next guest request. This means that in such a scenario, polling could | ||
11 | help for every other event, but is only ever enabled when it can't | ||
12 | succeed. | ||
13 | |||
14 | In order to fix this, keep a separate AioPolledEvent for each | ||
15 | AioHandler. We will then know that the backend file descriptor always | ||
16 | has a high latency and isn't worth polling for, but we also know that | ||
17 | the guest is always fast and we should poll for it. This solves at least | ||
18 | half of the problem: we can now keep polling for those cases where it
19 | makes sense and get the improved performance from it. | ||
20 | |||
21 | Since the event loop doesn't know which event will be next, we still do | ||
22 | some unnecessary polling while we're waiting for the slow disk. I made | ||
23 | some attempts to be more clever than just randomly growing and shrinking | ||
24 | the polling time, and even to let callers be explicit about when they | ||
25 | expect a new event, but so far this hasn't resulted in improved
26 | performance, and has sometimes even caused regressions. For now, let's just
27 | fix the part that is easy enough to fix; we can revisit the rest later.
4 | 28 | ||
5 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 29 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
6 | Message-ID: <20230605085711.21261-10-kwolf@redhat.com> | 30 | Message-ID: <20250307221634.71951-6-kwolf@redhat.com> |
7 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 31 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
8 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 32 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
9 | --- | 33 | --- |
10 | blockjob.c | 17 ++++++++++++----- | 34 | include/block/aio.h | 1 - |
11 | 1 file changed, 12 insertions(+), 5 deletions(-) | 35 | util/aio-posix.h | 1 + |
36 | util/aio-posix.c | 26 ++++++++++++++++++++++---- | ||
37 | util/async.c | 2 -- | ||
38 | 4 files changed, 23 insertions(+), 7 deletions(-) | ||
12 | 39 | ||
13 | diff --git a/blockjob.c b/blockjob.c | 40 | diff --git a/include/block/aio.h b/include/block/aio.h |
14 | index XXXXXXX..XXXXXXX 100644 | 41 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/blockjob.c | 42 | --- a/include/block/aio.h |
16 | +++ b/blockjob.c | 43 | +++ b/include/block/aio.h |
17 | @@ -XXX,XX +XXX,XX @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs, | 44 | @@ -XXX,XX +XXX,XX @@ struct AioContext { |
18 | uint64_t perm, uint64_t shared_perm, Error **errp) | 45 | int poll_disable_cnt; |
46 | |||
47 | /* Polling mode parameters */ | ||
48 | - AioPolledEvent poll; | ||
49 | int64_t poll_max_ns; /* maximum polling time in nanoseconds */ | ||
50 | int64_t poll_grow; /* polling time growth factor */ | ||
51 | int64_t poll_shrink; /* polling time shrink factor */ | ||
52 | diff --git a/util/aio-posix.h b/util/aio-posix.h | ||
53 | index XXXXXXX..XXXXXXX 100644 | ||
54 | --- a/util/aio-posix.h | ||
55 | +++ b/util/aio-posix.h | ||
56 | @@ -XXX,XX +XXX,XX @@ struct AioHandler { | ||
57 | #endif | ||
58 | int64_t poll_idle_timeout; /* when to stop userspace polling */ | ||
59 | bool poll_ready; /* has polling detected an event? */ | ||
60 | + AioPolledEvent poll; | ||
61 | }; | ||
62 | |||
63 | /* Add a handler to a ready list */ | ||
64 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
65 | index XXXXXXX..XXXXXXX 100644 | ||
66 | --- a/util/aio-posix.c | ||
67 | +++ b/util/aio-posix.c | ||
68 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list, | ||
69 | static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list, | ||
70 | int64_t *timeout) | ||
19 | { | 71 | { |
20 | BdrvChild *c; | 72 | + AioHandler *node; |
21 | + AioContext *ctx = bdrv_get_aio_context(bs); | 73 | int64_t max_ns; |
22 | bool need_context_ops; | 74 | |
23 | GLOBAL_STATE_CODE(); | 75 | if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) { |
24 | 76 | return false; | |
25 | bdrv_ref(bs); | ||
26 | |||
27 | - need_context_ops = bdrv_get_aio_context(bs) != job->job.aio_context; | ||
28 | + need_context_ops = ctx != job->job.aio_context; | ||
29 | |||
30 | - if (need_context_ops && job->job.aio_context != qemu_get_aio_context()) { | ||
31 | - aio_context_release(job->job.aio_context); | ||
32 | + if (need_context_ops) { | ||
33 | + if (job->job.aio_context != qemu_get_aio_context()) { | ||
34 | + aio_context_release(job->job.aio_context); | ||
35 | + } | ||
36 | + aio_context_acquire(ctx); | ||
37 | } | 77 | } |
38 | c = bdrv_root_attach_child(bs, name, &child_job, 0, perm, shared_perm, job, | 78 | |
39 | errp); | 79 | - max_ns = qemu_soonest_timeout(*timeout, ctx->poll.ns); |
40 | - if (need_context_ops && job->job.aio_context != qemu_get_aio_context()) { | 80 | + max_ns = 0; |
41 | - aio_context_acquire(job->job.aio_context); | 81 | + QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) { |
42 | + if (need_context_ops) { | 82 | + max_ns = MAX(max_ns, node->poll.ns); |
43 | + aio_context_release(ctx); | 83 | + } |
44 | + if (job->job.aio_context != qemu_get_aio_context()) { | 84 | + max_ns = qemu_soonest_timeout(*timeout, max_ns); |
45 | + aio_context_acquire(job->job.aio_context); | 85 | + |
86 | if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) { | ||
87 | /* | ||
88 | * Enable poll mode. It pairs with the poll_set_started() in | ||
89 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
90 | |||
91 | /* Adjust polling time */ | ||
92 | if (ctx->poll_max_ns) { | ||
93 | + AioHandler *node; | ||
94 | int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start; | ||
95 | - adjust_polling_time(ctx, &ctx->poll, block_ns); | ||
96 | + | ||
97 | + QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) { | ||
98 | + if (QLIST_IS_INSERTED(node, node_ready)) { | ||
99 | + adjust_polling_time(ctx, &node->poll, block_ns); | ||
100 | + } | ||
46 | + } | 101 | + } |
47 | } | 102 | } |
48 | if (c == NULL) { | 103 | |
49 | return -EPERM; | 104 | progress |= aio_bh_poll(ctx); |
105 | @@ -XXX,XX +XXX,XX @@ void aio_context_use_g_source(AioContext *ctx) | ||
106 | void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, | ||
107 | int64_t grow, int64_t shrink, Error **errp) | ||
108 | { | ||
109 | + AioHandler *node; | ||
110 | + | ||
111 | + qemu_lockcnt_inc(&ctx->list_lock); | ||
112 | + QLIST_FOREACH(node, &ctx->aio_handlers, node) { | ||
113 | + node->poll.ns = 0; | ||
114 | + } | ||
115 | + qemu_lockcnt_dec(&ctx->list_lock); | ||
116 | + | ||
117 | /* No thread synchronization here, it doesn't matter if an incorrect value | ||
118 | * is used once. | ||
119 | */ | ||
120 | - ctx->poll.ns = 0; | ||
121 | - | ||
122 | ctx->poll_max_ns = max_ns; | ||
123 | ctx->poll_grow = grow; | ||
124 | ctx->poll_shrink = shrink; | ||
125 | diff --git a/util/async.c b/util/async.c | ||
126 | index XXXXXXX..XXXXXXX 100644 | ||
127 | --- a/util/async.c | ||
128 | +++ b/util/async.c | ||
129 | @@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp) | ||
130 | qemu_rec_mutex_init(&ctx->lock); | ||
131 | timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); | ||
132 | |||
133 | - ctx->poll.ns = 0; | ||
134 | - | ||
135 | ctx->poll_max_ns = 0; | ||
136 | ctx->poll_grow = 0; | ||
137 | ctx->poll_shrink = 0; | ||
50 | -- | 138 | -- |
51 | 2.41.0 | 139 | 2.48.1
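Why per-handler windows help is easiest to see with two handlers of very different latency feeding one loop. Here is a minimal sketch of the max-over-handlers selection that try_poll_mode() now performs, using a plain array instead of QEMU's QLIST; Handler is a made-up reduced type, not the real AioHandler.

#include <stdint.h>
#include <stdio.h>

typedef struct {
    const char *name;
    int64_t poll_ns;   /* this handler's learned polling window */
} Handler;

/* Mirrors the loop added to try_poll_mode(): the window for the next
 * iteration is the maximum over all handlers, so one slow handler
 * (poll_ns == 0) no longer disables polling for the fast ones. */
static int64_t poll_window(const Handler *handlers, int n)
{
    int64_t max_ns = 0;

    for (int i = 0; i < n; i++) {
        if (handlers[i].poll_ns > max_ns) {
            max_ns = handlers[i].poll_ns;
        }
    }
    return max_ns;
}

int main(void)
{
    Handler h[] = {
        { "virtqueue kick (fast guest)", 16000 },
        { "disk completion (slow host)", 0 },
    };

    printf("poll for %lld ns\n", (long long)poll_window(h, 2));
    return 0;
}

A handler whose window has decayed to zero simply stops contributing; in the old code its slow completions would have dragged the single shared ctx->poll.ns down for every handler in the context.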
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | aio_dispatch_handler() adds handlers to ctx->poll_aio_handlers if |
---|---|---|---|
2 | polling should be enabled. If we call adjust_polling_time() for all | ||
3 | polling handlers before this, new polling handlers are still left at | ||
4 | poll->ns = 0 and polling is only actually enabled after the next event. | ||
5 | Move the adjust_polling_time() call after aio_dispatch_handler(). | ||
2 | 6 | ||
3 | bdrv_co_debug_event was recently introduced, with bdrv_debug_event | 7 | This fixes test-nested-aio-poll, which expects that polling becomes |
4 | becoming a wrapper for use in unknown context. Because most of the | 8 | effective the first time around. |
5 | time bdrv_debug_event is used on a BdrvChild via the wrapper macro | ||
6 | BLKDBG_EVENT, introduce a similar macro BLKDBG_CO_EVENT that calls | ||
7 | bdrv_co_debug_event, and switch whenever possible. | ||
8 | 9 | ||
9 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | 10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
10 | Message-ID: <20230601115145.196465-13-pbonzini@redhat.com> | 11 | Message-ID: <20250311141912.135657-1-kwolf@redhat.com> |
11 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | ||
12 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 12 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
13 | --- | 13 | --- |
14 | include/block/block-io.h | 7 +++++++ | 14 | util/aio-posix.c | 28 +++++++++++++++++----------- |
15 | block/io.c | 4 ++-- | 15 | 1 file changed, 17 insertions(+), 11 deletions(-) |
16 | block/qcow.c | 24 ++++++++++++------------ | ||
17 | block/qcow2-cluster.c | 12 ++++++------ | ||
18 | block/qcow2-refcount.c | 4 ++-- | ||
19 | block/qcow2.c | 18 +++++++++--------- | ||
20 | block/qed-table.c | 6 +++--- | ||
21 | block/qed.c | 8 ++++---- | ||
22 | block/raw-format.c | 4 ++-- | ||
23 | block/vmdk.c | 24 ++++++++++++------------ | ||
24 | 10 files changed, 59 insertions(+), 52 deletions(-) | ||
25 | 16 | ||
26 | diff --git a/include/block/block-io.h b/include/block/block-io.h | 17 | diff --git a/util/aio-posix.c b/util/aio-posix.c |
27 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
28 | --- a/include/block/block-io.h | 19 | --- a/util/aio-posix.c |
29 | +++ b/include/block/block-io.h | 20 | +++ b/util/aio-posix.c |
30 | @@ -XXX,XX +XXX,XX @@ bdrv_co_debug_event(BlockDriverState *bs, BlkdebugEvent event); | 21 | @@ -XXX,XX +XXX,XX @@ |
31 | void co_wrapper_mixed_bdrv_rdlock | 22 | /* Stop userspace polling on a handler if it isn't active for some time */ |
32 | bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event); | 23 | #define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND) |
33 | 24 | ||
34 | +#define BLKDBG_CO_EVENT(child, evt) \ | 25 | +static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll, |
35 | + do { \ | 26 | + int64_t block_ns); |
36 | + if (child) { \ | ||
37 | + bdrv_co_debug_event(child->bs, evt); \ | ||
38 | + } \ | ||
39 | + } while (0) | ||
40 | + | 27 | + |
41 | #define BLKDBG_EVENT(child, evt) \ | 28 | bool aio_poll_disabled(AioContext *ctx) |
42 | do { \ | 29 | { |
43 | if (child) { \ | 30 | return qatomic_read(&ctx->poll_disable_cnt); |
44 | diff --git a/block/io.c b/block/io.c | 31 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node) |
45 | index XXXXXXX..XXXXXXX 100644 | 32 | * scanning all handlers with aio_dispatch_handlers(). |
46 | --- a/block/io.c | 33 | */ |
47 | +++ b/block/io.c | 34 | static bool aio_dispatch_ready_handlers(AioContext *ctx, |
48 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) | 35 | - AioHandlerList *ready_list) |
36 | + AioHandlerList *ready_list, | ||
37 | + int64_t block_ns) | ||
38 | { | ||
39 | bool progress = false; | ||
40 | AioHandler *node; | ||
41 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_ready_handlers(AioContext *ctx, | ||
42 | while ((node = QLIST_FIRST(ready_list))) { | ||
43 | QLIST_REMOVE(node, node_ready); | ||
44 | progress = aio_dispatch_handler(ctx, node) || progress; | ||
45 | + | ||
46 | + /* | ||
47 | + * Adjust polling time only after aio_dispatch_handler(), which can | ||
48 | + * add the handler to ctx->poll_aio_handlers. | ||
49 | + */ | ||
50 | + if (ctx->poll_max_ns && QLIST_IS_INSERTED(node, node_poll)) { | ||
51 | + adjust_polling_time(ctx, &node->poll, block_ns); | ||
52 | + } | ||
49 | } | 53 | } |
50 | 54 | ||
51 | /* Write back cached data to the OS even with cache=unsafe */ | 55 | return progress; |
52 | - BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS); | 56 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) |
53 | + BLKDBG_CO_EVENT(primary_child, BLKDBG_FLUSH_TO_OS); | 57 | bool use_notify_me; |
54 | if (bs->drv->bdrv_co_flush_to_os) { | 58 | int64_t timeout; |
55 | ret = bs->drv->bdrv_co_flush_to_os(bs); | 59 | int64_t start = 0; |
56 | if (ret < 0) { | 60 | + int64_t block_ns = 0; |
57 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) | 61 | |
58 | goto flush_children; | 62 | /* |
63 | * There cannot be two concurrent aio_poll calls for the same AioContext (or | ||
64 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
65 | |||
66 | aio_notify_accept(ctx); | ||
67 | |||
68 | - /* Adjust polling time */ | ||
69 | + /* Calculate blocked time for adaptive polling */ | ||
70 | if (ctx->poll_max_ns) { | ||
71 | - AioHandler *node; | ||
72 | - int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start; | ||
73 | - | ||
74 | - QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) { | ||
75 | - if (QLIST_IS_INSERTED(node, node_ready)) { | ||
76 | - adjust_polling_time(ctx, &node->poll, block_ns); | ||
77 | - } | ||
78 | - } | ||
79 | + block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start; | ||
59 | } | 80 | } |
60 | 81 | ||
61 | - BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK); | 82 | progress |= aio_bh_poll(ctx); |
62 | + BLKDBG_CO_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK); | 83 | - progress |= aio_dispatch_ready_handlers(ctx, &ready_list); |
63 | if (!bs->drv) { | 84 | + progress |= aio_dispatch_ready_handlers(ctx, &ready_list, block_ns); |
64 | /* bs->drv->bdrv_co_flush() might have ejected the BDS | 85 | |
65 | * (even in case of apparent success) */ | 86 | aio_free_deleted_handlers(ctx); |
66 | diff --git a/block/qcow.c b/block/qcow.c | ||
67 | index XXXXXXX..XXXXXXX 100644 | ||
68 | --- a/block/qcow.c | ||
69 | +++ b/block/qcow.c | ||
70 | @@ -XXX,XX +XXX,XX @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate, | ||
71 | /* update the L1 entry */ | ||
72 | s->l1_table[l1_index] = l2_offset; | ||
73 | tmp = cpu_to_be64(l2_offset); | ||
74 | - BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); | ||
75 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_UPDATE); | ||
76 | ret = bdrv_co_pwrite_sync(bs->file, | ||
77 | s->l1_table_offset + l1_index * sizeof(tmp), | ||
78 | sizeof(tmp), &tmp, 0); | ||
79 | @@ -XXX,XX +XXX,XX @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate, | ||
80 | } | ||
81 | } | ||
82 | l2_table = s->l2_cache + (min_index << s->l2_bits); | ||
83 | - BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); | ||
84 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_LOAD); | ||
85 | if (new_l2_table) { | ||
86 | memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); | ||
87 | ret = bdrv_co_pwrite_sync(bs->file, l2_offset, | ||
88 | @@ -XXX,XX +XXX,XX @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate, | ||
89 | ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) { | ||
90 | if (!allocate) | ||
91 | return 0; | ||
92 | - BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); | ||
93 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); | ||
94 | assert(QEMU_IS_ALIGNED(n_start | n_end, BDRV_SECTOR_SIZE)); | ||
95 | /* allocate a new cluster */ | ||
96 | if ((cluster_offset & QCOW_OFLAG_COMPRESSED) && | ||
97 | @@ -XXX,XX +XXX,XX @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate, | ||
98 | } | ||
99 | cluster_offset = QEMU_ALIGN_UP(cluster_offset, s->cluster_size); | ||
100 | /* write the cluster content */ | ||
101 | - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
102 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
103 | ret = bdrv_co_pwrite(bs->file, cluster_offset, s->cluster_size, | ||
104 | s->cluster_cache, 0); | ||
105 | if (ret < 0) { | ||
106 | @@ -XXX,XX +XXX,XX @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate, | ||
107 | NULL) < 0) { | ||
108 | return -EIO; | ||
109 | } | ||
110 | - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
111 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
112 | ret = bdrv_co_pwrite(bs->file, cluster_offset + i, | ||
113 | BDRV_SECTOR_SIZE, | ||
114 | s->cluster_data, 0); | ||
115 | @@ -XXX,XX +XXX,XX @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate, | ||
116 | tmp = cpu_to_be64(cluster_offset); | ||
117 | l2_table[l2_index] = tmp; | ||
118 | if (allocate == 2) { | ||
119 | - BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); | ||
120 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); | ||
121 | } else { | ||
122 | - BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); | ||
123 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_UPDATE); | ||
124 | } | ||
125 | ret = bdrv_co_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp), | ||
126 | sizeof(tmp), &tmp, 0); | ||
127 | @@ -XXX,XX +XXX,XX @@ decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) | ||
128 | if (s->cluster_cache_offset != coffset) { | ||
129 | csize = cluster_offset >> (63 - s->cluster_bits); | ||
130 | csize &= (s->cluster_size - 1); | ||
131 | - BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); | ||
132 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_COMPRESSED); | ||
133 | ret = bdrv_co_pread(bs->file, coffset, csize, s->cluster_data, 0); | ||
134 | if (ret < 0) | ||
135 | return -1; | ||
136 | @@ -XXX,XX +XXX,XX @@ qcow_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, | ||
137 | /* read from the base image */ | ||
138 | qemu_co_mutex_unlock(&s->lock); | ||
139 | /* qcow2 emits this on bs->file instead of bs->backing */ | ||
140 | - BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); | ||
141 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); | ||
142 | ret = bdrv_co_pread(bs->backing, offset, n, buf, 0); | ||
143 | qemu_co_mutex_lock(&s->lock); | ||
144 | if (ret < 0) { | ||
145 | @@ -XXX,XX +XXX,XX @@ qcow_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, | ||
146 | break; | ||
147 | } | ||
148 | qemu_co_mutex_unlock(&s->lock); | ||
149 | - BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); | ||
150 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO); | ||
151 | ret = bdrv_co_pread(bs->file, cluster_offset + offset_in_cluster, | ||
152 | n, buf, 0); | ||
153 | qemu_co_mutex_lock(&s->lock); | ||
154 | @@ -XXX,XX +XXX,XX @@ qcow_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, | ||
155 | } | ||
156 | |||
157 | qemu_co_mutex_unlock(&s->lock); | ||
158 | - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
159 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
160 | ret = bdrv_co_pwrite(bs->file, cluster_offset + offset_in_cluster, | ||
161 | n, buf, 0); | ||
162 | qemu_co_mutex_lock(&s->lock); | ||
163 | @@ -XXX,XX +XXX,XX @@ qcow_co_pwritev_compressed(BlockDriverState *bs, int64_t offset, int64_t bytes, | ||
164 | } | ||
165 | cluster_offset &= s->cluster_offset_mask; | ||
166 | |||
167 | - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); | ||
168 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); | ||
169 | ret = bdrv_co_pwrite(bs->file, cluster_offset, out_len, out_buf, 0); | ||
170 | if (ret < 0) { | ||
171 | goto fail; | ||
172 | diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c | ||
173 | index XXXXXXX..XXXXXXX 100644 | ||
174 | --- a/block/qcow2-cluster.c | ||
175 | +++ b/block/qcow2-cluster.c | ||
176 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn qcow2_shrink_l1_table(BlockDriverState *bs, | ||
177 | fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size); | ||
178 | #endif | ||
179 | |||
180 | - BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE); | ||
181 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE); | ||
182 | ret = bdrv_co_pwrite_zeroes(bs->file, | ||
183 | s->l1_table_offset + new_l1_size * L1E_SIZE, | ||
184 | (s->l1_size - new_l1_size) * L1E_SIZE, 0); | ||
185 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn qcow2_shrink_l1_table(BlockDriverState *bs, | ||
186 | goto fail; | ||
187 | } | ||
188 | |||
189 | - BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS); | ||
190 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS); | ||
191 | for (i = s->l1_size - 1; i > new_l1_size - 1; i--) { | ||
192 | if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) { | ||
193 | continue; | ||
194 | @@ -XXX,XX +XXX,XX @@ do_perform_cow_read(BlockDriverState *bs, uint64_t src_cluster_offset, | ||
195 | return 0; | ||
196 | } | ||
197 | |||
198 | - BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); | ||
199 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_COW_READ); | ||
200 | |||
201 | if (!bs->drv) { | ||
202 | return -ENOMEDIUM; | ||
203 | @@ -XXX,XX +XXX,XX @@ do_perform_cow_write(BlockDriverState *bs, uint64_t cluster_offset, | ||
204 | return ret; | ||
205 | } | ||
206 | |||
207 | - BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); | ||
208 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_COW_WRITE); | ||
209 | ret = bdrv_co_pwritev(s->data_file, cluster_offset + offset_in_cluster, | ||
210 | qiov->size, qiov, 0); | ||
211 | if (ret < 0) { | ||
212 | @@ -XXX,XX +XXX,XX @@ qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, | ||
213 | |||
214 | /* compressed clusters never have the copied flag */ | ||
215 | |||
216 | - BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); | ||
217 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); | ||
218 | qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); | ||
219 | set_l2_entry(s, l2_slice, l2_index, cluster_offset); | ||
220 | if (has_subclusters(s)) { | ||
221 | @@ -XXX,XX +XXX,XX @@ perform_cow(BlockDriverState *bs, QCowL2Meta *m) | ||
222 | /* NOTE: we have a write_aio blkdebug event here followed by | ||
223 | * a cow_write one in do_perform_cow_write(), but there's only | ||
224 | * one single I/O operation */ | ||
225 | - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
226 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
227 | ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov); | ||
228 | } else { | ||
229 | /* If there's no guest data then write both COW regions separately */ | ||
230 | diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c | ||
231 | index XXXXXXX..XXXXXXX 100644 | ||
232 | --- a/block/qcow2-refcount.c | ||
233 | +++ b/block/qcow2-refcount.c | ||
234 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn qcow2_refcount_init(BlockDriverState *bs) | ||
235 | ret = -ENOMEM; | ||
236 | goto fail; | ||
237 | } | ||
238 | - BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); | ||
239 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); | ||
240 | ret = bdrv_co_pread(bs->file, s->refcount_table_offset, | ||
241 | refcount_table_size2, s->refcount_table, 0); | ||
242 | if (ret < 0) { | ||
243 | @@ -XXX,XX +XXX,XX @@ int64_t coroutine_fn GRAPH_RDLOCK qcow2_alloc_bytes(BlockDriverState *bs, int si | ||
244 | size_t free_in_cluster; | ||
245 | int ret; | ||
246 | |||
247 | - BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); | ||
248 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); | ||
249 | assert(size > 0 && size <= s->cluster_size); | ||
250 | assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset)); | ||
251 | |||
252 | diff --git a/block/qcow2.c b/block/qcow2.c | ||
253 | index XXXXXXX..XXXXXXX 100644 | ||
254 | --- a/block/qcow2.c | ||
255 | +++ b/block/qcow2.c | ||
256 | @@ -XXX,XX +XXX,XX @@ qcow2_co_preadv_encrypted(BlockDriverState *bs, | ||
257 | return -ENOMEM; | ||
258 | } | ||
259 | |||
260 | - BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); | ||
261 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO); | ||
262 | ret = bdrv_co_pread(s->data_file, host_offset, bytes, buf, 0); | ||
263 | if (ret < 0) { | ||
264 | goto fail; | ||
265 | @@ -XXX,XX +XXX,XX @@ qcow2_co_preadv_task(BlockDriverState *bs, QCow2SubclusterType subc_type, | ||
266 | case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: | ||
267 | assert(bs->backing); /* otherwise handled in qcow2_co_preadv_part */ | ||
268 | |||
269 | - BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); | ||
270 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); | ||
271 | return bdrv_co_preadv_part(bs->backing, offset, bytes, | ||
272 | qiov, qiov_offset, 0); | ||
273 | |||
274 | @@ -XXX,XX +XXX,XX @@ qcow2_co_preadv_task(BlockDriverState *bs, QCow2SubclusterType subc_type, | ||
275 | offset, bytes, qiov, qiov_offset); | ||
276 | } | ||
277 | |||
278 | - BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); | ||
279 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO); | ||
280 | return bdrv_co_preadv_part(s->data_file, host_offset, | ||
281 | bytes, qiov, qiov_offset, 0); | ||
282 | |||
283 | @@ -XXX,XX +XXX,XX @@ handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta) | ||
284 | return ret; | ||
285 | } | ||
286 | |||
287 | - BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE); | ||
288 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE); | ||
289 | ret = bdrv_co_pwrite_zeroes(s->data_file, start_offset, nb_bytes, | ||
290 | BDRV_REQ_NO_FALLBACK); | ||
291 | if (ret < 0) { | ||
292 | @@ -XXX,XX +XXX,XX @@ int qcow2_co_pwritev_task(BlockDriverState *bs, uint64_t host_offset, | ||
293 | * guest data now. | ||
294 | */ | ||
295 | if (!merge_cow(offset, bytes, qiov, qiov_offset, l2meta)) { | ||
296 | - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
297 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
298 | trace_qcow2_writev_data(qemu_coroutine_self(), host_offset); | ||
299 | ret = bdrv_co_pwritev_part(s->data_file, host_offset, | ||
300 | bytes, qiov, qiov_offset, 0); | ||
301 | @@ -XXX,XX +XXX,XX @@ qcow2_co_pwritev_compressed_task(BlockDriverState *bs, | ||
302 | goto fail; | ||
303 | } | ||
304 | |||
305 | - BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED); | ||
306 | + BLKDBG_CO_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED); | ||
307 | ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0); | ||
308 | if (ret < 0) { | ||
309 | goto fail; | ||
310 | @@ -XXX,XX +XXX,XX @@ qcow2_co_preadv_compressed(BlockDriverState *bs, | ||
311 | |||
312 | out_buf = qemu_blockalign(bs, s->cluster_size); | ||
313 | |||
314 | - BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); | ||
315 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_COMPRESSED); | ||
316 | ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0); | ||
317 | if (ret < 0) { | ||
318 | goto fail; | ||
319 | @@ -XXX,XX +XXX,XX @@ qcow2_co_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) | ||
320 | return offset; | ||
321 | } | ||
322 | |||
323 | - BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); | ||
324 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); | ||
325 | return bs->drv->bdrv_co_pwritev_part(bs, offset, qiov->size, qiov, 0, 0); | ||
326 | } | ||
327 | |||
328 | @@ -XXX,XX +XXX,XX @@ qcow2_co_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) | ||
329 | return offset; | ||
330 | } | ||
331 | |||
332 | - BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); | ||
333 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); | ||
334 | return bs->drv->bdrv_co_preadv_part(bs, offset, qiov->size, qiov, 0, 0); | ||
335 | } | ||
336 | |||
337 | diff --git a/block/qed-table.c b/block/qed-table.c | ||
338 | index XXXXXXX..XXXXXXX 100644 | ||
339 | --- a/block/qed-table.c | ||
340 | +++ b/block/qed-table.c | ||
341 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn qed_read_l1_table_sync(BDRVQEDState *s) | ||
342 | int coroutine_fn qed_write_l1_table(BDRVQEDState *s, unsigned int index, | ||
343 | unsigned int n) | ||
344 | { | ||
345 | - BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE); | ||
346 | + BLKDBG_CO_EVENT(s->bs->file, BLKDBG_L1_UPDATE); | ||
347 | return qed_write_table(s, s->header.l1_table_offset, | ||
348 | s->l1_table, index, n, false); | ||
349 | } | ||
350 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, | ||
351 | request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); | ||
352 | request->l2_table->table = qed_alloc_table(s); | ||
353 | |||
354 | - BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD); | ||
355 | + BLKDBG_CO_EVENT(s->bs->file, BLKDBG_L2_LOAD); | ||
356 | ret = qed_read_table(s, offset, request->l2_table->table); | ||
357 | |||
358 | if (ret) { | ||
359 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, | ||
360 | unsigned int index, unsigned int n, | ||
361 | bool flush) | ||
362 | { | ||
363 | - BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE); | ||
364 | + BLKDBG_CO_EVENT(s->bs->file, BLKDBG_L2_UPDATE); | ||
365 | return qed_write_table(s, request->l2_table->offset, | ||
366 | request->l2_table->table, index, n, flush); | ||
367 | } | ||
368 | diff --git a/block/qed.c b/block/qed.c | ||
369 | index XXXXXXX..XXXXXXX 100644 | ||
370 | --- a/block/qed.c | ||
371 | +++ b/block/qed.c | ||
372 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn GRAPH_RDLOCK | ||
373 | qed_read_backing_file(BDRVQEDState *s, uint64_t pos, QEMUIOVector *qiov) | ||
374 | { | ||
375 | if (s->bs->backing) { | ||
376 | - BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); | ||
377 | + BLKDBG_CO_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); | ||
378 | return bdrv_co_preadv(s->bs->backing, pos, qiov->size, qiov, 0); | ||
379 | } | ||
380 | qemu_iovec_memset(qiov, 0, 0, qiov->size); | ||
381 | @@ -XXX,XX +XXX,XX @@ qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, uint64_t len, | ||
382 | goto out; | ||
383 | } | ||
384 | |||
385 | - BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); | ||
386 | + BLKDBG_CO_EVENT(s->bs->file, BLKDBG_COW_WRITE); | ||
387 | ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0); | ||
388 | if (ret < 0) { | ||
389 | goto out; | ||
390 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn GRAPH_RDLOCK qed_aio_write_main(QEDAIOCB *acb) | ||
391 | |||
392 | trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size); | ||
393 | |||
394 | - BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); | ||
395 | + BLKDBG_CO_EVENT(s->bs->file, BLKDBG_WRITE_AIO); | ||
396 | return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size, | ||
397 | &acb->cur_qiov, 0); | ||
398 | } | ||
399 | @@ -XXX,XX +XXX,XX @@ qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len) | ||
400 | } else if (ret != QED_CLUSTER_FOUND) { | ||
401 | r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov); | ||
402 | } else { | ||
403 | - BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); | ||
404 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO); | ||
405 | r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size, | ||
406 | &acb->cur_qiov, 0); | ||
407 | } | ||
408 | diff --git a/block/raw-format.c b/block/raw-format.c | ||
409 | index XXXXXXX..XXXXXXX 100644 | ||
410 | --- a/block/raw-format.c | ||
411 | +++ b/block/raw-format.c | ||
412 | @@ -XXX,XX +XXX,XX @@ raw_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, | ||
413 | return ret; | ||
414 | } | ||
415 | |||
416 | - BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); | ||
417 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO); | ||
418 | return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); | ||
419 | } | ||
420 | |||
421 | @@ -XXX,XX +XXX,XX @@ raw_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, | ||
422 | goto fail; | ||
423 | } | ||
424 | |||
425 | - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
426 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO); | ||
427 | ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); | ||
428 | |||
429 | fail: | ||
430 | diff --git a/block/vmdk.c b/block/vmdk.c | ||
431 | index XXXXXXX..XXXXXXX 100644 | ||
432 | --- a/block/vmdk.c | ||
433 | +++ b/block/vmdk.c | ||
434 | @@ -XXX,XX +XXX,XX @@ get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent, | ||
435 | if (skip_start_bytes > 0) { | ||
436 | if (copy_from_backing) { | ||
437 | /* qcow2 emits this on bs->file instead of bs->backing */ | ||
438 | - BLKDBG_EVENT(extent->file, BLKDBG_COW_READ); | ||
439 | + BLKDBG_CO_EVENT(extent->file, BLKDBG_COW_READ); | ||
440 | ret = bdrv_co_pread(bs->backing, offset, skip_start_bytes, | ||
441 | whole_grain, 0); | ||
442 | if (ret < 0) { | ||
443 | @@ -XXX,XX +XXX,XX @@ get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent, | ||
444 | goto exit; | ||
445 | } | ||
446 | } | ||
447 | - BLKDBG_EVENT(extent->file, BLKDBG_COW_WRITE); | ||
448 | + BLKDBG_CO_EVENT(extent->file, BLKDBG_COW_WRITE); | ||
449 | ret = bdrv_co_pwrite(extent->file, cluster_offset, skip_start_bytes, | ||
450 | whole_grain, 0); | ||
451 | if (ret < 0) { | ||
452 | @@ -XXX,XX +XXX,XX @@ get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent, | ||
453 | if (skip_end_bytes < cluster_bytes) { | ||
454 | if (copy_from_backing) { | ||
455 | /* qcow2 emits this on bs->file instead of bs->backing */ | ||
456 | - BLKDBG_EVENT(extent->file, BLKDBG_COW_READ); | ||
457 | + BLKDBG_CO_EVENT(extent->file, BLKDBG_COW_READ); | ||
458 | ret = bdrv_co_pread(bs->backing, offset + skip_end_bytes, | ||
459 | cluster_bytes - skip_end_bytes, | ||
460 | whole_grain + skip_end_bytes, 0); | ||
461 | @@ -XXX,XX +XXX,XX @@ get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent, | ||
462 | goto exit; | ||
463 | } | ||
464 | } | ||
465 | - BLKDBG_EVENT(extent->file, BLKDBG_COW_WRITE); | ||
466 | + BLKDBG_CO_EVENT(extent->file, BLKDBG_COW_WRITE); | ||
467 | ret = bdrv_co_pwrite(extent->file, cluster_offset + skip_end_bytes, | ||
468 | cluster_bytes - skip_end_bytes, | ||
469 | whole_grain + skip_end_bytes, 0); | ||
470 | @@ -XXX,XX +XXX,XX @@ vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, uint32_t offset) | ||
471 | { | ||
472 | offset = cpu_to_le32(offset); | ||
473 | /* update L2 table */ | ||
474 | - BLKDBG_EVENT(extent->file, BLKDBG_L2_UPDATE); | ||
475 | + BLKDBG_CO_EVENT(extent->file, BLKDBG_L2_UPDATE); | ||
476 | if (bdrv_co_pwrite(extent->file, | ||
477 | ((int64_t)m_data->l2_offset * 512) | ||
478 | + (m_data->l2_index * sizeof(offset)), | ||
479 | @@ -XXX,XX +XXX,XX @@ get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, | ||
480 | } | ||
481 | } | ||
482 | l2_table = (char *)extent->l2_cache + (min_index * l2_size_bytes); | ||
483 | - BLKDBG_EVENT(extent->file, BLKDBG_L2_LOAD); | ||
484 | + BLKDBG_CO_EVENT(extent->file, BLKDBG_L2_LOAD); | ||
485 | if (bdrv_co_pread(extent->file, | ||
486 | (int64_t)l2_offset * 512, | ||
487 | l2_size_bytes, | ||
488 | @@ -XXX,XX +XXX,XX @@ vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, | ||
489 | n_bytes = buf_len + sizeof(VmdkGrainMarker); | ||
490 | qemu_iovec_init_buf(&local_qiov, data, n_bytes); | ||
491 | |||
492 | - BLKDBG_EVENT(extent->file, BLKDBG_WRITE_COMPRESSED); | ||
493 | + BLKDBG_CO_EVENT(extent->file, BLKDBG_WRITE_COMPRESSED); | ||
494 | } else { | ||
495 | qemu_iovec_init(&local_qiov, qiov->niov); | ||
496 | qemu_iovec_concat(&local_qiov, qiov, qiov_offset, n_bytes); | ||
497 | |||
498 | - BLKDBG_EVENT(extent->file, BLKDBG_WRITE_AIO); | ||
499 | + BLKDBG_CO_EVENT(extent->file, BLKDBG_WRITE_AIO); | ||
500 | } | ||
501 | |||
502 | write_offset = cluster_offset + offset_in_cluster; | ||
503 | @@ -XXX,XX +XXX,XX @@ vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, | ||
504 | |||
505 | |||
506 | if (!extent->compressed) { | ||
507 | - BLKDBG_EVENT(extent->file, BLKDBG_READ_AIO); | ||
508 | + BLKDBG_CO_EVENT(extent->file, BLKDBG_READ_AIO); | ||
509 | ret = bdrv_co_preadv(extent->file, | ||
510 | cluster_offset + offset_in_cluster, bytes, | ||
511 | qiov, 0); | ||
512 | @@ -XXX,XX +XXX,XX @@ vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, | ||
513 | buf_bytes = cluster_bytes * 2; | ||
514 | cluster_buf = g_malloc(buf_bytes); | ||
515 | uncomp_buf = g_malloc(cluster_bytes); | ||
516 | - BLKDBG_EVENT(extent->file, BLKDBG_READ_COMPRESSED); | ||
517 | + BLKDBG_CO_EVENT(extent->file, BLKDBG_READ_COMPRESSED); | ||
518 | ret = bdrv_co_pread(extent->file, cluster_offset, buf_bytes, cluster_buf, | ||
519 | 0); | ||
520 | if (ret < 0) { | ||
521 | @@ -XXX,XX +XXX,XX @@ vmdk_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, | ||
522 | qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes); | ||
523 | |||
524 | /* qcow2 emits this on bs->file instead of bs->backing */ | ||
525 | - BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); | ||
526 | + BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); | ||
527 | ret = bdrv_co_preadv(bs->backing, offset, n_bytes, | ||
528 | &local_qiov, 0); | ||
529 | if (ret < 0) { | ||
530 | @@ -XXX,XX +XXX,XX @@ vmdk_co_check(BlockDriverState *bs, BdrvCheckResult *result, BdrvCheckMode fix) | ||
531 | BDRVVmdkState *s = bs->opaque; | ||
532 | VmdkExtent *extent = NULL; | ||
533 | int64_t sector_num = 0; | ||
534 | - int64_t total_sectors = bdrv_nb_sectors(bs); | ||
535 | + int64_t total_sectors = bdrv_co_nb_sectors(bs); | ||
536 | int ret; | ||
537 | uint64_t cluster_offset; | ||
538 | 87 | ||
539 | -- | 88 | -- |
540 | 2.41.0 | 89 | 2.48.1
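BLKDBG_CO_EVENT deliberately mirrors the existing BLKDBG_EVENT shape: a do/while (0) body so it behaves like a single statement, plus a NULL check so call sites stay unconditional even when a child is optional. A self-contained sketch of that macro pattern follows; BDS, Child and debug_event() are made-up stand-ins for BlockDriverState, BdrvChild and bdrv_co_debug_event().

#include <stdio.h>

/* Made-up stand-ins for BlockDriverState and BdrvChild. */
typedef struct { const char *node_name; } BDS;
typedef struct { BDS *bs; } Child;

static void debug_event(BDS *bs, const char *evt)
{
    printf("%s: %s\n", bs->node_name, evt);
}

/* Same shape as BLKDBG_CO_EVENT above: wrapped in do/while (0) so it can
 * be used like a statement, with a NULL check so callers never need to
 * guard against a missing child themselves. */
#define DBG_EVENT(child, evt)                    \
    do {                                         \
        if (child) {                             \
            debug_event((child)->bs, (evt));     \
        }                                        \
    } while (0)

int main(void)
{
    BDS file = { "file-node" };
    Child c = { &file };
    Child *missing = NULL;

    DBG_EVENT(&c, "read_aio");       /* fires */
    DBG_EVENT(missing, "read_aio");  /* safely skipped */
    return 0;
}

The only difference between the two real macros is which function they expand to: the coroutine variant calls bdrv_co_debug_event() directly, while BLKDBG_EVENT goes through the co_wrapper declaration of bdrv_debug_event() that is also usable outside coroutine context.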
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | From: Thomas Huth <thuth@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Mark functions as coroutine_fn when they are only called by other coroutine_fns | 3 | qsd-migrate currently only works for raw, qcow2 and qed.
4 | and they can suspend. Change calls to co_wrappers to use the non-wrapped | 4 | Other formats are failing, e.g. because they don't support migration. |
5 | functions, which in turn requires adding GRAPH_RDLOCK annotations. | 5 | Thus let's limit this test to the three usable formats now. |
6 | 6 | ||
7 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | 7 | Suggested-by: Kevin Wolf <kwolf@redhat.com> |
8 | Message-ID: <20230601115145.196465-11-pbonzini@redhat.com> | 8 | Signed-off-by: Thomas Huth <thuth@redhat.com> |
9 | Message-ID: <20250224214058.205889-1-thuth@redhat.com> | ||
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 10 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> |
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 11 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 12 | --- |
12 | block/qcow2.h | 33 +++++------ | 13 | tests/qemu-iotests/tests/qsd-migrate | 2 +- |
13 | block/qcow2-bitmap.c | 26 +++++---- | 14 | 1 file changed, 1 insertion(+), 1 deletion(-) |
14 | block/qcow2-cluster.c | 12 ++-- | ||
15 | block/qcow2-refcount.c | 130 +++++++++++++++++++++-------------------- | ||
16 | block/qcow2.c | 2 +- | ||
17 | 5 files changed, 105 insertions(+), 98 deletions(-) | ||
18 | 15 | ||
19 | diff --git a/block/qcow2.h b/block/qcow2.h | 16 | diff --git a/tests/qemu-iotests/tests/qsd-migrate b/tests/qemu-iotests/tests/qsd-migrate |
20 | index XXXXXXX..XXXXXXX 100644 | 17 | index XXXXXXX..XXXXXXX 100755 |
21 | --- a/block/qcow2.h | 18 | --- a/tests/qemu-iotests/tests/qsd-migrate |
22 | +++ b/block/qcow2.h | 19 | +++ b/tests/qemu-iotests/tests/qsd-migrate |
23 | @@ -XXX,XX +XXX,XX @@ int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size, | 20 | @@ -XXX,XX +XXX,XX @@ import iotests |
24 | 21 | ||
25 | int qcow2_mark_dirty(BlockDriverState *bs); | 22 | from iotests import filter_qemu_io, filter_qtest |
26 | int qcow2_mark_corrupt(BlockDriverState *bs); | 23 | |
27 | -int qcow2_mark_consistent(BlockDriverState *bs); | 24 | -iotests.script_initialize(supported_fmts=['generic'], |
28 | int qcow2_update_header(BlockDriverState *bs); | 25 | +iotests.script_initialize(supported_fmts=['qcow2', 'qed', 'raw'], |
29 | 26 | supported_protocols=['file'], | |
30 | void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, | 27 | supported_platforms=['linux']) |
31 | @@ -XXX,XX +XXX,XX @@ int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t offset, | ||
32 | int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); | ||
33 | int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, | ||
34 | int64_t nb_clusters); | ||
35 | -int64_t coroutine_fn qcow2_alloc_bytes(BlockDriverState *bs, int size); | ||
36 | +int64_t coroutine_fn GRAPH_RDLOCK qcow2_alloc_bytes(BlockDriverState *bs, int size); | ||
37 | void qcow2_free_clusters(BlockDriverState *bs, | ||
38 | int64_t offset, int64_t size, | ||
39 | enum qcow2_discard_type type); | ||
40 | @@ -XXX,XX +XXX,XX @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, | ||
41 | |||
42 | int qcow2_flush_caches(BlockDriverState *bs); | ||
43 | int qcow2_write_caches(BlockDriverState *bs); | ||
44 | -int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
45 | - BdrvCheckMode fix); | ||
46 | +int coroutine_fn qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
47 | + BdrvCheckMode fix); | ||
48 | |||
49 | void qcow2_process_discards(BlockDriverState *bs, int ret); | ||
50 | |||
51 | @@ -XXX,XX +XXX,XX @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, | ||
52 | int64_t size); | ||
53 | int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, | ||
54 | int64_t size, bool data_file); | ||
55 | -int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res, | ||
56 | - void **refcount_table, | ||
57 | - int64_t *refcount_table_size, | ||
58 | - int64_t offset, int64_t size); | ||
59 | +int coroutine_fn qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res, | ||
60 | + void **refcount_table, | ||
61 | + int64_t *refcount_table_size, | ||
62 | + int64_t offset, int64_t size); | ||
63 | |||
64 | int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, | ||
65 | BlockDriverAmendStatusCB *status_cb, | ||
66 | @@ -XXX,XX +XXX,XX @@ int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset, | ||
67 | int coroutine_fn qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset, | ||
68 | unsigned int *bytes, | ||
69 | uint64_t *host_offset, QCowL2Meta **m); | ||
70 | -int coroutine_fn qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, | ||
71 | - uint64_t offset, | ||
72 | - int compressed_size, | ||
73 | - uint64_t *host_offset); | ||
74 | +int coroutine_fn GRAPH_RDLOCK | ||
75 | +qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, | ||
76 | + int compressed_size, uint64_t *host_offset); | ||
77 | void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry, | ||
78 | uint64_t *coffset, int *csize); | ||
79 | |||
80 | @@ -XXX,XX +XXX,XX @@ void *qcow2_cache_is_table_offset(Qcow2Cache *c, uint64_t offset); | ||
81 | void qcow2_cache_discard(Qcow2Cache *c, void *table); | ||
82 | |||
83 | /* qcow2-bitmap.c functions */ | ||
84 | -int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
85 | - void **refcount_table, | ||
86 | - int64_t *refcount_table_size); | ||
87 | -bool coroutine_fn qcow2_load_dirty_bitmaps(BlockDriverState *bs, | ||
88 | - bool *header_updated, Error **errp); | ||
89 | +int coroutine_fn | ||
90 | +qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
91 | + void **refcount_table, | ||
92 | + int64_t *refcount_table_size); | ||
93 | +bool coroutine_fn GRAPH_RDLOCK | ||
94 | +qcow2_load_dirty_bitmaps(BlockDriverState *bs, bool *header_updated, Error **errp); | ||
95 | bool qcow2_get_bitmap_info_list(BlockDriverState *bs, | ||
96 | Qcow2BitmapInfoList **info_list, Error **errp); | ||
97 | int qcow2_reopen_bitmaps_rw(BlockDriverState *bs, Error **errp); | ||
98 | diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c | ||
99 | index XXXXXXX..XXXXXXX 100644 | ||
100 | --- a/block/qcow2-bitmap.c | ||
101 | +++ b/block/qcow2-bitmap.c | ||
102 | @@ -XXX,XX +XXX,XX @@ static int free_bitmap_clusters(BlockDriverState *bs, Qcow2BitmapTable *tb) | ||
103 | /* load_bitmap_data | ||
104 | * @bitmap_table entries must satisfy specification constraints. | ||
105 | * @bitmap must be cleared */ | ||
106 | -static int load_bitmap_data(BlockDriverState *bs, | ||
107 | - const uint64_t *bitmap_table, | ||
108 | - uint32_t bitmap_table_size, | ||
109 | - BdrvDirtyBitmap *bitmap) | ||
110 | +static int coroutine_fn GRAPH_RDLOCK | ||
111 | +load_bitmap_data(BlockDriverState *bs, const uint64_t *bitmap_table, | ||
112 | + uint32_t bitmap_table_size, BdrvDirtyBitmap *bitmap) | ||
113 | { | ||
114 | int ret = 0; | ||
115 | BDRVQcow2State *s = bs->opaque; | ||
116 | @@ -XXX,XX +XXX,XX @@ static int load_bitmap_data(BlockDriverState *bs, | ||
117 | * already cleared */ | ||
118 | } | ||
119 | } else { | ||
120 | - ret = bdrv_pread(bs->file, data_offset, s->cluster_size, buf, 0); | ||
121 | + ret = bdrv_co_pread(bs->file, data_offset, s->cluster_size, buf, 0); | ||
122 | if (ret < 0) { | ||
123 | goto finish; | ||
124 | } | ||
125 | @@ -XXX,XX +XXX,XX @@ finish: | ||
126 | return ret; | ||
127 | } | ||
128 | |||
129 | -static BdrvDirtyBitmap *load_bitmap(BlockDriverState *bs, | ||
130 | - Qcow2Bitmap *bm, Error **errp) | ||
131 | +static coroutine_fn GRAPH_RDLOCK | ||
132 | +BdrvDirtyBitmap *load_bitmap(BlockDriverState *bs, | ||
133 | + Qcow2Bitmap *bm, Error **errp) | ||
134 | { | ||
135 | int ret; | ||
136 | uint64_t *bitmap_table = NULL; | ||
137 | @@ -XXX,XX +XXX,XX @@ fail: | ||
138 | return NULL; | ||
139 | } | ||
140 | |||
141 | -int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
142 | - void **refcount_table, | ||
143 | - int64_t *refcount_table_size) | ||
144 | +int coroutine_fn | ||
145 | +qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
146 | + void **refcount_table, | ||
147 | + int64_t *refcount_table_size) | ||
148 | { | ||
149 | int ret; | ||
150 | BDRVQcow2State *s = bs->opaque; | ||
151 | @@ -XXX,XX +XXX,XX @@ static void set_readonly_helper(gpointer bitmap, gpointer value) | ||
152 | * If header_updated is not NULL then it is set appropriately regardless of | ||
153 | * the return value. | ||
154 | */ | ||
155 | -bool coroutine_fn qcow2_load_dirty_bitmaps(BlockDriverState *bs, | ||
156 | - bool *header_updated, Error **errp) | ||
157 | +bool coroutine_fn GRAPH_RDLOCK | ||
158 | +qcow2_load_dirty_bitmaps(BlockDriverState *bs, | ||
159 | + bool *header_updated, Error **errp) | ||
160 | { | ||
161 | BDRVQcow2State *s = bs->opaque; | ||
162 | Qcow2BitmapList *bm_list; | ||
163 | diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c | ||
164 | index XXXXXXX..XXXXXXX 100644 | ||
165 | --- a/block/qcow2-cluster.c | ||
166 | +++ b/block/qcow2-cluster.c | ||
167 | @@ -XXX,XX +XXX,XX @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset, | ||
168 | * | ||
169 | * Return 0 on success and -errno in error cases | ||
170 | */ | ||
171 | -int coroutine_fn qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, | ||
172 | - uint64_t offset, | ||
173 | - int compressed_size, | ||
174 | - uint64_t *host_offset) | ||
175 | +int coroutine_fn GRAPH_RDLOCK | ||
176 | +qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, | ||
177 | + int compressed_size, uint64_t *host_offset) | ||
178 | { | ||
179 | BDRVQcow2State *s = bs->opaque; | ||
180 | int l2_index, ret; | ||
181 | @@ -XXX,XX +XXX,XX @@ fail: | ||
182 | * all clusters in the same L2 slice) and returns the number of zeroed | ||
183 | * clusters. | ||
184 | */ | ||
185 | -static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset, | ||
186 | - uint64_t nb_clusters, int flags) | ||
187 | +static int coroutine_fn | ||
188 | +zero_in_l2_slice(BlockDriverState *bs, uint64_t offset, | ||
189 | + uint64_t nb_clusters, int flags) | ||
190 | { | ||
191 | BDRVQcow2State *s = bs->opaque; | ||
192 | uint64_t *l2_slice; | ||
193 | diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c | ||
194 | index XXXXXXX..XXXXXXX 100644 | ||
195 | --- a/block/qcow2-refcount.c | ||
196 | +++ b/block/qcow2-refcount.c | ||
197 | @@ -XXX,XX +XXX,XX @@ int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offs | ||
198 | |||
199 | /* only used to allocate compressed sectors. We try to allocate | ||
200 | contiguous sectors. size must be <= cluster_size */ | ||
201 | -int64_t coroutine_fn qcow2_alloc_bytes(BlockDriverState *bs, int size) | ||
202 | +int64_t coroutine_fn GRAPH_RDLOCK qcow2_alloc_bytes(BlockDriverState *bs, int size) | ||
203 | { | ||
204 | BDRVQcow2State *s = bs->opaque; | ||
205 | int64_t offset; | ||
206 | @@ -XXX,XX +XXX,XX @@ static int realloc_refcount_array(BDRVQcow2State *s, void **array, | ||
207 | * | ||
208 | * Modifies the number of errors in res. | ||
209 | */ | ||
210 | -int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res, | ||
211 | - void **refcount_table, | ||
212 | - int64_t *refcount_table_size, | ||
213 | - int64_t offset, int64_t size) | ||
214 | +int coroutine_fn GRAPH_RDLOCK | ||
215 | +qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res, | ||
216 | + void **refcount_table, | ||
217 | + int64_t *refcount_table_size, | ||
218 | + int64_t offset, int64_t size) | ||
219 | { | ||
220 | BDRVQcow2State *s = bs->opaque; | ||
221 | uint64_t start, last, cluster_offset, k, refcount; | ||
222 | @@ -XXX,XX +XXX,XX @@ int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res, | ||
223 | return 0; | ||
224 | } | ||
225 | |||
226 | - file_len = bdrv_getlength(bs->file->bs); | ||
227 | + file_len = bdrv_co_getlength(bs->file->bs); | ||
228 | if (file_len < 0) { | ||
229 | return file_len; | ||
230 | } | ||
231 | @@ -XXX,XX +XXX,XX @@ enum { | ||
232 | * | ||
233 | * On failure in-memory @l2_table may be modified. | ||
234 | */ | ||
235 | -static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res, | ||
236 | - uint64_t l2_offset, | ||
237 | - uint64_t *l2_table, int l2_index, bool active, | ||
238 | - bool *metadata_overlap) | ||
239 | +static int coroutine_fn GRAPH_RDLOCK | ||
240 | +fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res, | ||
241 | + uint64_t l2_offset, uint64_t *l2_table, | ||
242 | + int l2_index, bool active, | ||
243 | + bool *metadata_overlap) | ||
244 | { | ||
245 | BDRVQcow2State *s = bs->opaque; | ||
246 | int ret; | ||
247 | @@ -XXX,XX +XXX,XX @@ static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res, | ||
248 | goto fail; | ||
249 | } | ||
250 | |||
251 | - ret = bdrv_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s), | ||
252 | - &l2_table[idx], 0); | ||
253 | + ret = bdrv_co_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s), | ||
254 | + &l2_table[idx], 0); | ||
255 | if (ret < 0) { | ||
256 | fprintf(stderr, "ERROR: Failed to overwrite L2 " | ||
257 | "table entry: %s\n", strerror(-ret)); | ||
258 | @@ -XXX,XX +XXX,XX @@ fail: | ||
259 | * Returns the number of errors found by the checks or -errno if an internal | ||
260 | * error occurred. | ||
261 | */ | ||
262 | -static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
263 | - void **refcount_table, | ||
264 | - int64_t *refcount_table_size, int64_t l2_offset, | ||
265 | - int flags, BdrvCheckMode fix, bool active) | ||
266 | +static int coroutine_fn GRAPH_RDLOCK | ||
267 | +check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
268 | + void **refcount_table, | ||
269 | + int64_t *refcount_table_size, int64_t l2_offset, | ||
270 | + int flags, BdrvCheckMode fix, bool active) | ||
271 | { | ||
272 | BDRVQcow2State *s = bs->opaque; | ||
273 | uint64_t l2_entry, l2_bitmap; | ||
274 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
275 | bool metadata_overlap; | ||
276 | |||
277 | /* Read L2 table from disk */ | ||
278 | - ret = bdrv_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0); | ||
279 | + ret = bdrv_co_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0); | ||
280 | if (ret < 0) { | ||
281 | fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); | ||
282 | res->check_errors++; | ||
283 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, | ||
284 | * Returns the number of errors found by the checks or -errno if an internal | ||
285 | * error occurred. | ||
286 | */ | ||
287 | -static int check_refcounts_l1(BlockDriverState *bs, | ||
288 | - BdrvCheckResult *res, | ||
289 | - void **refcount_table, | ||
290 | - int64_t *refcount_table_size, | ||
291 | - int64_t l1_table_offset, int l1_size, | ||
292 | - int flags, BdrvCheckMode fix, bool active) | ||
293 | +static int coroutine_fn GRAPH_RDLOCK | ||
294 | +check_refcounts_l1(BlockDriverState *bs, BdrvCheckResult *res, | ||
295 | + void **refcount_table, int64_t *refcount_table_size, | ||
296 | + int64_t l1_table_offset, int l1_size, | ||
297 | + int flags, BdrvCheckMode fix, bool active) | ||
298 | { | ||
299 | BDRVQcow2State *s = bs->opaque; | ||
300 | size_t l1_size_bytes = l1_size * L1E_SIZE; | ||
301 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l1(BlockDriverState *bs, | ||
302 | } | ||
303 | |||
304 | /* Read L1 table entries from disk */ | ||
305 | - ret = bdrv_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0); | ||
306 | + ret = bdrv_co_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0); | ||
307 | if (ret < 0) { | ||
308 | fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); | ||
309 | res->check_errors++; | ||
310 | @@ -XXX,XX +XXX,XX @@ static int check_refcounts_l1(BlockDriverState *bs, | ||
311 | * have been already detected and sufficiently signaled by the calling function | ||
312 | * (qcow2_check_refcounts) by the time this function is called). | ||
313 | */ | ||
314 | -static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, | ||
315 | - BdrvCheckMode fix) | ||
316 | +static int coroutine_fn GRAPH_RDLOCK | ||
317 | +check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) | ||
318 | { | ||
319 | BDRVQcow2State *s = bs->opaque; | ||
320 | uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size); | ||
321 | @@ -XXX,XX +XXX,XX @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, | ||
322 | } | ||
323 | } | ||
324 | |||
325 | - ret = bdrv_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s), | ||
326 | - l2_table, 0); | ||
327 | + ret = bdrv_co_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s), | ||
328 | + l2_table, 0); | ||
329 | if (ret < 0) { | ||
330 | fprintf(stderr, "ERROR: Could not read L2 table: %s\n", | ||
331 | strerror(-ret)); | ||
332 | @@ -XXX,XX +XXX,XX @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, | ||
333 | goto fail; | ||
334 | } | ||
335 | |||
336 | - ret = bdrv_pwrite(bs->file, l2_offset, s->cluster_size, l2_table, | ||
337 | - 0); | ||
338 | + ret = bdrv_co_pwrite(bs->file, l2_offset, s->cluster_size, l2_table, 0); | ||
339 | if (ret < 0) { | ||
340 | fprintf(stderr, "ERROR: Could not write L2 table: %s\n", | ||
341 | strerror(-ret)); | ||
342 | @@ -XXX,XX +XXX,XX @@ fail: | ||
343 | * Checks consistency of refblocks and accounts for each refblock in | ||
344 | * *refcount_table. | ||
345 | */ | ||
346 | -static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, | ||
347 | - BdrvCheckMode fix, bool *rebuild, | ||
348 | - void **refcount_table, int64_t *nb_clusters) | ||
349 | +static int coroutine_fn GRAPH_RDLOCK | ||
350 | +check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, | ||
351 | + BdrvCheckMode fix, bool *rebuild, | ||
352 | + void **refcount_table, int64_t *nb_clusters) | ||
353 | { | ||
354 | BDRVQcow2State *s = bs->opaque; | ||
355 | int64_t i, size; | ||
356 | @@ -XXX,XX +XXX,XX @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, | ||
357 | goto resize_fail; | ||
358 | } | ||
359 | |||
360 | - ret = bdrv_truncate(bs->file, offset + s->cluster_size, false, | ||
361 | - PREALLOC_MODE_OFF, 0, &local_err); | ||
362 | + ret = bdrv_co_truncate(bs->file, offset + s->cluster_size, false, | ||
363 | + PREALLOC_MODE_OFF, 0, &local_err); | ||
364 | if (ret < 0) { | ||
365 | error_report_err(local_err); | ||
366 | goto resize_fail; | ||
367 | } | ||
368 | - size = bdrv_getlength(bs->file->bs); | ||
369 | + size = bdrv_co_getlength(bs->file->bs); | ||
370 | if (size < 0) { | ||
371 | ret = size; | ||
372 | goto resize_fail; | ||
373 | @@ -XXX,XX +XXX,XX @@ resize_fail: | ||
374 | /* | ||
375 | * Calculates an in-memory refcount table. | ||
376 | */ | ||
377 | -static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
378 | - BdrvCheckMode fix, bool *rebuild, | ||
379 | - void **refcount_table, int64_t *nb_clusters) | ||
380 | +static int coroutine_fn GRAPH_RDLOCK | ||
381 | +calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
382 | + BdrvCheckMode fix, bool *rebuild, | ||
383 | + void **refcount_table, int64_t *nb_clusters) | ||
384 | { | ||
385 | BDRVQcow2State *s = bs->opaque; | ||
386 | int64_t i; | ||
387 | @@ -XXX,XX +XXX,XX @@ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
388 | * Compares the actual reference count for each cluster in the image against the | ||
389 | * refcount as reported by the refcount structures on-disk. | ||
390 | */ | ||
391 | -static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
392 | - BdrvCheckMode fix, bool *rebuild, | ||
393 | - int64_t *highest_cluster, | ||
394 | - void *refcount_table, int64_t nb_clusters) | ||
395 | +static void coroutine_fn | ||
396 | +compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
397 | + BdrvCheckMode fix, bool *rebuild, | ||
398 | + int64_t *highest_cluster, | ||
399 | + void *refcount_table, int64_t nb_clusters) | ||
400 | { | ||
401 | BDRVQcow2State *s = bs->opaque; | ||
402 | int64_t i; | ||
403 | @@ -XXX,XX +XXX,XX @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs, | ||
404 | * Return whether the on-disk reftable array was resized (true/false), | ||
405 | * or -errno on error. | ||
406 | */ | ||
407 | -static int rebuild_refcounts_write_refblocks( | ||
408 | +static int coroutine_fn GRAPH_RDLOCK | ||
409 | +rebuild_refcounts_write_refblocks( | ||
410 | BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters, | ||
411 | int64_t first_cluster, int64_t end_cluster, | ||
412 | uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr, | ||
413 | @@ -XXX,XX +XXX,XX @@ static int rebuild_refcounts_write_refblocks( | ||
414 | on_disk_refblock = (void *)((char *) *refcount_table + | ||
415 | refblock_index * s->cluster_size); | ||
416 | |||
417 | - ret = bdrv_pwrite(bs->file, refblock_offset, s->cluster_size, | ||
418 | - on_disk_refblock, 0); | ||
419 | + ret = bdrv_co_pwrite(bs->file, refblock_offset, s->cluster_size, | ||
420 | + on_disk_refblock, 0); | ||
421 | if (ret < 0) { | ||
422 | error_setg_errno(errp, -ret, "ERROR writing refblock"); | ||
423 | return ret; | ||
424 | @@ -XXX,XX +XXX,XX @@ static int rebuild_refcounts_write_refblocks( | ||
425 | * On success, the old refcount structure is leaked (it will be covered by the | ||
426 | * new refcount structure). | ||
427 | */ | ||
428 | -static int rebuild_refcount_structure(BlockDriverState *bs, | ||
429 | - BdrvCheckResult *res, | ||
430 | - void **refcount_table, | ||
431 | - int64_t *nb_clusters, | ||
432 | - Error **errp) | ||
433 | +static int coroutine_fn GRAPH_RDLOCK | ||
434 | +rebuild_refcount_structure(BlockDriverState *bs, BdrvCheckResult *res, | ||
435 | + void **refcount_table, int64_t *nb_clusters, | ||
436 | + Error **errp) | ||
437 | { | ||
438 | BDRVQcow2State *s = bs->opaque; | ||
439 | int64_t reftable_offset = -1; | ||
440 | @@ -XXX,XX +XXX,XX @@ static int rebuild_refcount_structure(BlockDriverState *bs, | ||
441 | } | ||
442 | |||
443 | assert(reftable_length < INT_MAX); | ||
444 | - ret = bdrv_pwrite(bs->file, reftable_offset, reftable_length, | ||
445 | - on_disk_reftable, 0); | ||
446 | + ret = bdrv_co_pwrite(bs->file, reftable_offset, reftable_length, | ||
447 | + on_disk_reftable, 0); | ||
448 | if (ret < 0) { | ||
449 | error_setg_errno(errp, -ret, "ERROR writing reftable"); | ||
450 | goto fail; | ||
451 | @@ -XXX,XX +XXX,XX @@ static int rebuild_refcount_structure(BlockDriverState *bs, | ||
452 | reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset); | ||
453 | reftable_offset_and_clusters.reftable_clusters = | ||
454 | cpu_to_be32(reftable_clusters); | ||
455 | - ret = bdrv_pwrite_sync(bs->file, | ||
456 | - offsetof(QCowHeader, refcount_table_offset), | ||
457 | - sizeof(reftable_offset_and_clusters), | ||
458 | - &reftable_offset_and_clusters, 0); | ||
459 | + ret = bdrv_co_pwrite_sync(bs->file, | ||
460 | + offsetof(QCowHeader, refcount_table_offset), | ||
461 | + sizeof(reftable_offset_and_clusters), | ||
462 | + &reftable_offset_and_clusters, 0); | ||
463 | if (ret < 0) { | ||
464 | error_setg_errno(errp, -ret, "ERROR setting reftable"); | ||
465 | goto fail; | ||
466 | @@ -XXX,XX +XXX,XX @@ fail: | ||
467 | * Returns 0 if no errors are found, the number of errors in case the image is | ||
468 | * detected as corrupted, and -errno when an internal error occurred. | ||
469 | */ | ||
470 | -int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
471 | - BdrvCheckMode fix) | ||
472 | +int coroutine_fn GRAPH_RDLOCK | ||
473 | +qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) | ||
474 | { | ||
475 | BDRVQcow2State *s = bs->opaque; | ||
476 | BdrvCheckResult pre_compare_res; | ||
477 | @@ -XXX,XX +XXX,XX @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, | ||
478 | bool rebuild = false; | ||
479 | int ret; | ||
480 | |||
481 | - size = bdrv_getlength(bs->file->bs); | ||
482 | + size = bdrv_co_getlength(bs->file->bs); | ||
483 | if (size < 0) { | ||
484 | res->check_errors++; | ||
485 | return size; | ||
486 | @@ -XXX,XX +XXX,XX @@ done: | ||
487 | return ret; | ||
488 | } | ||
489 | |||
490 | -static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset) | ||
491 | +static int64_t coroutine_fn get_refblock_offset(BlockDriverState *bs, | ||
492 | + uint64_t offset) | ||
493 | { | ||
494 | BDRVQcow2State *s = bs->opaque; | ||
495 | uint32_t index = offset_to_reftable_index(s, offset); | ||
496 | @@ -XXX,XX +XXX,XX @@ int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size) | ||
497 | return -EIO; | ||
498 | } | ||
499 | |||
500 | -int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs) | ||
501 | +int coroutine_fn GRAPH_RDLOCK | ||
502 | +qcow2_detect_metadata_preallocation(BlockDriverState *bs) | ||
503 | { | ||
504 | BDRVQcow2State *s = bs->opaque; | ||
505 | int64_t i, end_cluster, cluster_count = 0, threshold; | ||
506 | diff --git a/block/qcow2.c b/block/qcow2.c | ||
507 | index XXXXXXX..XXXXXXX 100644 | ||
508 | --- a/block/qcow2.c | ||
509 | +++ b/block/qcow2.c | ||
510 | @@ -XXX,XX +XXX,XX @@ int qcow2_mark_corrupt(BlockDriverState *bs) | ||
511 | * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes | ||
512 | * before if necessary. | ||
513 | */ | ||
514 | -int qcow2_mark_consistent(BlockDriverState *bs) | ||
515 | +static int coroutine_fn qcow2_mark_consistent(BlockDriverState *bs) | ||
516 | { | ||
517 | BDRVQcow2State *s = bs->opaque; | ||
518 | 28 | ||
519 | -- | 29 | -- |
520 | 2.41.0 | 30 | 2.48.1
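
The qcow2 conversion above (old series, left column) follows one mechanical pattern: once a function is only reachable from coroutine context, annotate it coroutine_fn (plus GRAPH_RDLOCK where it dereferences bs->file) and replace the generated co_wrapper calls such as bdrv_pread(), bdrv_pwrite_sync() and bdrv_getlength() with their bdrv_co_*() counterparts. A minimal sketch of the pattern, assuming the QEMU tree's headers; the helper name is invented for illustration:

#include "qemu/osdep.h"
#include "block/block_int.h"

/*
 * Read an on-disk metadata table. Runs in coroutine context with the
 * graph read lock held, so it may call bdrv_co_pread() directly
 * instead of going through the bdrv_pread() co_wrapper.
 */
static int coroutine_fn GRAPH_RDLOCK
example_read_table(BlockDriverState *bs, int64_t offset, size_t size,
                   void *table)
{
    int ret = bdrv_co_pread(bs->file, offset, size, table, 0);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error reading metadata table\n");
    }
    return ret;
}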
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Mark the function as coroutine_fn because it is only called by other | 3 | Commit 71544d30a6f8 ("scsi: push request restart to SCSIDevice") removed
4 | coroutine_fns and it can suspend. Because it operates on a BlockBackend, | 4 | the only user of SCSIDiskState->bh.
5 | mark it GRAPH_UNLOCKED. | 5 |
6 | 5 | ||
7 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | 6 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
8 | Message-ID: <20230601115145.196465-6-pbonzini@redhat.com> | 7 | Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> |
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 8 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> |
9 | Message-ID: <20250311132616.1049687-2-stefanha@redhat.com> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 11 | --- |
12 | block.c | 11 ++++++----- | 12 | hw/scsi/scsi-disk.c | 1 - |
13 | 1 file changed, 6 insertions(+), 5 deletions(-) | 13 | 1 file changed, 1 deletion(-) |
14 | 14 | ||
15 | diff --git a/block.c b/block.c | 15 | diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c |
16 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block.c | 17 | --- a/hw/scsi/scsi-disk.c |
18 | +++ b/block.c | 18 | +++ b/hw/scsi/scsi-disk.c |
19 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_create(BlockDriver *drv, const char *filename, | 19 | @@ -XXX,XX +XXX,XX @@ struct SCSIDiskState { |
20 | * On success, return @blk's actual length. | 20 | uint64_t max_unmap_size; |
21 | * Otherwise, return -errno. | 21 | uint64_t max_io_size; |
22 | */ | 22 | uint32_t quirks; |
23 | -static int64_t create_file_fallback_truncate(BlockBackend *blk, | 23 | - QEMUBH *bh; |
24 | - int64_t minimum_size, Error **errp) | 24 | char *version; |
25 | +static int64_t coroutine_fn GRAPH_UNLOCKED | 25 | char *serial; |
26 | +create_file_fallback_truncate(BlockBackend *blk, int64_t minimum_size, | 26 | char *vendor; |
27 | + Error **errp) | ||
28 | { | ||
29 | Error *local_err = NULL; | ||
30 | int64_t size; | ||
31 | @@ -XXX,XX +XXX,XX @@ static int64_t create_file_fallback_truncate(BlockBackend *blk, | ||
32 | |||
33 | GLOBAL_STATE_CODE(); | ||
34 | |||
35 | - ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0, | ||
36 | - &local_err); | ||
37 | + ret = blk_co_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0, | ||
38 | + &local_err); | ||
39 | if (ret < 0 && ret != -ENOTSUP) { | ||
40 | error_propagate(errp, local_err); | ||
41 | return ret; | ||
42 | } | ||
43 | |||
44 | - size = blk_getlength(blk); | ||
45 | + size = blk_co_getlength(blk); | ||
46 | if (size < 0) { | ||
47 | error_free(local_err); | ||
48 | error_setg_errno(errp, -size, | ||
49 | -- | 27 | -- |
50 | 2.41.0 | 28 | 2.48.1 |
29 | |||
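The create_file_fallback_truncate() conversion (old series, left column above) applies the same idea one layer up, on BlockBackends: a coroutine_fn calls blk_co_truncate() and blk_co_getlength() directly instead of the blocking blk_truncate()/blk_getlength() wrappers, and GRAPH_UNLOCKED documents that it must not hold the graph lock. A hedged sketch with an invented helper name, assuming the QEMU tree's headers (sysemu/block-backend.h is spelled system/block-backend.h in newer trees):

#include "qemu/osdep.h"
#include "sysemu/block-backend.h"

/* Grow @blk to at least @minimum_size and return the resulting length. */
static int64_t coroutine_fn GRAPH_UNLOCKED
example_grow_to(BlockBackend *blk, int64_t minimum_size, Error **errp)
{
    int ret = blk_co_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF,
                              0, errp);
    if (ret < 0) {
        return ret;
    }
    /* The actual length may exceed @minimum_size due to alignment. */
    return blk_co_getlength(blk);
}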
1 | bdrv_set_backing() requires the caller to hold the AioContext lock for | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | @backing_hd. Take it in bdrv_open_backing_file() before calling the | ||
3 | function. | ||
4 | 2 | ||
5 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 3 | In the past a single AioContext was used for block I/O and it was |
6 | Message-ID: <20230605085711.21261-9-kwolf@redhat.com> | 4 | fetched using blk_get_aio_context(). Nowadays the block layer supports |
7 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 5 | running I/O from any AioContext and multiple AioContexts at the same |
6 | time. Remove the dma_blk_io() AioContext argument and use the current | ||
7 | AioContext instead. | ||
8 | |||
9 | This makes calling the function easier and enables multiple IOThreads to | ||
10 | use dma_blk_io() concurrently for the same block device. | ||
11 | |||
12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
13 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | ||
14 | Message-ID: <20250311132616.1049687-3-stefanha@redhat.com> | ||
8 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 15 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
9 | --- | 16 | --- |
10 | block.c | 5 +++++ | 17 | include/system/dma.h | 3 +-- |
11 | 1 file changed, 5 insertions(+) | 18 | hw/ide/core.c | 3 +-- |
19 | hw/ide/macio.c | 3 +-- | ||
20 | hw/scsi/scsi-disk.c | 6 ++---- | ||
21 | system/dma-helpers.c | 8 ++++---- | ||
22 | 5 files changed, 9 insertions(+), 14 deletions(-) | ||
12 | 23 | ||
13 | diff --git a/block.c b/block.c | 24 | diff --git a/include/system/dma.h b/include/system/dma.h |
14 | index XXXXXXX..XXXXXXX 100644 | 25 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/block.c | 26 | --- a/include/system/dma.h |
16 | +++ b/block.c | 27 | +++ b/include/system/dma.h |
17 | @@ -XXX,XX +XXX,XX @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, | 28 | @@ -XXX,XX +XXX,XX @@ typedef BlockAIOCB *DMAIOFunc(int64_t offset, QEMUIOVector *iov, |
18 | int ret = 0; | 29 | BlockCompletionFunc *cb, void *cb_opaque, |
19 | bool implicit_backing = false; | 30 | void *opaque); |
20 | BlockDriverState *backing_hd; | 31 | |
21 | + AioContext *backing_hd_ctx; | 32 | -BlockAIOCB *dma_blk_io(AioContext *ctx, |
22 | QDict *options; | 33 | - QEMUSGList *sg, uint64_t offset, uint32_t align, |
23 | QDict *tmp_parent_options = NULL; | 34 | +BlockAIOCB *dma_blk_io(QEMUSGList *sg, uint64_t offset, uint32_t align, |
24 | Error *local_err = NULL; | 35 | DMAIOFunc *io_func, void *io_func_opaque, |
25 | @@ -XXX,XX +XXX,XX @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, | 36 | BlockCompletionFunc *cb, void *opaque, DMADirection dir); |
26 | 37 | BlockAIOCB *dma_blk_read(BlockBackend *blk, | |
27 | /* Hook up the backing file link; drop our reference, bs owns the | 38 | diff --git a/hw/ide/core.c b/hw/ide/core.c |
28 | * backing_hd reference now */ | 39 | index XXXXXXX..XXXXXXX 100644 |
29 | + backing_hd_ctx = bdrv_get_aio_context(backing_hd); | 40 | --- a/hw/ide/core.c |
30 | + aio_context_acquire(backing_hd_ctx); | 41 | +++ b/hw/ide/core.c |
31 | ret = bdrv_set_backing_hd(bs, backing_hd, errp); | 42 | @@ -XXX,XX +XXX,XX @@ static void ide_dma_cb(void *opaque, int ret) |
32 | bdrv_unref(backing_hd); | 43 | BDRV_SECTOR_SIZE, ide_dma_cb, s); |
33 | + aio_context_release(backing_hd_ctx); | 44 | break; |
34 | + | 45 | case IDE_DMA_TRIM: |
35 | if (ret < 0) { | 46 | - s->bus->dma->aiocb = dma_blk_io(blk_get_aio_context(s->blk), |
36 | goto free_exit; | 47 | - &s->sg, offset, BDRV_SECTOR_SIZE, |
37 | } | 48 | + s->bus->dma->aiocb = dma_blk_io(&s->sg, offset, BDRV_SECTOR_SIZE, |
49 | ide_issue_trim, s, ide_dma_cb, s, | ||
50 | DMA_DIRECTION_TO_DEVICE); | ||
51 | break; | ||
52 | diff --git a/hw/ide/macio.c b/hw/ide/macio.c | ||
53 | index XXXXXXX..XXXXXXX 100644 | ||
54 | --- a/hw/ide/macio.c | ||
55 | +++ b/hw/ide/macio.c | ||
56 | @@ -XXX,XX +XXX,XX @@ static void pmac_ide_transfer_cb(void *opaque, int ret) | ||
57 | pmac_ide_transfer_cb, io); | ||
58 | break; | ||
59 | case IDE_DMA_TRIM: | ||
60 | - s->bus->dma->aiocb = dma_blk_io(blk_get_aio_context(s->blk), &s->sg, | ||
61 | - offset, 0x1, ide_issue_trim, s, | ||
62 | + s->bus->dma->aiocb = dma_blk_io(&s->sg, offset, 0x1, ide_issue_trim, s, | ||
63 | pmac_ide_transfer_cb, io, | ||
64 | DMA_DIRECTION_TO_DEVICE); | ||
65 | break; | ||
66 | diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c | ||
67 | index XXXXXXX..XXXXXXX 100644 | ||
68 | --- a/hw/scsi/scsi-disk.c | ||
69 | +++ b/hw/scsi/scsi-disk.c | ||
70 | @@ -XXX,XX +XXX,XX @@ static void scsi_do_read(SCSIDiskReq *r, int ret) | ||
71 | if (r->req.sg) { | ||
72 | dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_READ); | ||
73 | r->req.residual -= r->req.sg->size; | ||
74 | - r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk), | ||
75 | - r->req.sg, r->sector << BDRV_SECTOR_BITS, | ||
76 | + r->req.aiocb = dma_blk_io(r->req.sg, r->sector << BDRV_SECTOR_BITS, | ||
77 | BDRV_SECTOR_SIZE, | ||
78 | sdc->dma_readv, r, scsi_dma_complete, r, | ||
79 | DMA_DIRECTION_FROM_DEVICE); | ||
80 | @@ -XXX,XX +XXX,XX @@ static void scsi_write_data(SCSIRequest *req) | ||
81 | if (r->req.sg) { | ||
82 | dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_WRITE); | ||
83 | r->req.residual -= r->req.sg->size; | ||
84 | - r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk), | ||
85 | - r->req.sg, r->sector << BDRV_SECTOR_BITS, | ||
86 | + r->req.aiocb = dma_blk_io(r->req.sg, r->sector << BDRV_SECTOR_BITS, | ||
87 | BDRV_SECTOR_SIZE, | ||
88 | sdc->dma_writev, r, scsi_dma_complete, r, | ||
89 | DMA_DIRECTION_TO_DEVICE); | ||
90 | diff --git a/system/dma-helpers.c b/system/dma-helpers.c | ||
91 | index XXXXXXX..XXXXXXX 100644 | ||
92 | --- a/system/dma-helpers.c | ||
93 | +++ b/system/dma-helpers.c | ||
94 | @@ -XXX,XX +XXX,XX @@ static const AIOCBInfo dma_aiocb_info = { | ||
95 | .cancel_async = dma_aio_cancel, | ||
96 | }; | ||
97 | |||
98 | -BlockAIOCB *dma_blk_io(AioContext *ctx, | ||
99 | +BlockAIOCB *dma_blk_io( | ||
100 | QEMUSGList *sg, uint64_t offset, uint32_t align, | ||
101 | DMAIOFunc *io_func, void *io_func_opaque, | ||
102 | BlockCompletionFunc *cb, | ||
103 | @@ -XXX,XX +XXX,XX @@ BlockAIOCB *dma_blk_io(AioContext *ctx, | ||
104 | |||
105 | dbs->acb = NULL; | ||
106 | dbs->sg = sg; | ||
107 | - dbs->ctx = ctx; | ||
108 | + dbs->ctx = qemu_get_current_aio_context(); | ||
109 | dbs->offset = offset; | ||
110 | dbs->align = align; | ||
111 | dbs->sg_cur_index = 0; | ||
112 | @@ -XXX,XX +XXX,XX @@ BlockAIOCB *dma_blk_read(BlockBackend *blk, | ||
113 | QEMUSGList *sg, uint64_t offset, uint32_t align, | ||
114 | void (*cb)(void *opaque, int ret), void *opaque) | ||
115 | { | ||
116 | - return dma_blk_io(blk_get_aio_context(blk), sg, offset, align, | ||
117 | + return dma_blk_io(sg, offset, align, | ||
118 | dma_blk_read_io_func, blk, cb, opaque, | ||
119 | DMA_DIRECTION_FROM_DEVICE); | ||
120 | } | ||
121 | @@ -XXX,XX +XXX,XX @@ BlockAIOCB *dma_blk_write(BlockBackend *blk, | ||
122 | QEMUSGList *sg, uint64_t offset, uint32_t align, | ||
123 | void (*cb)(void *opaque, int ret), void *opaque) | ||
124 | { | ||
125 | - return dma_blk_io(blk_get_aio_context(blk), sg, offset, align, | ||
126 | + return dma_blk_io(sg, offset, align, | ||
127 | dma_blk_write_io_func, blk, cb, opaque, | ||
128 | DMA_DIRECTION_TO_DEVICE); | ||
129 | } | ||
38 | -- | 130 | -- |
39 | 2.41.0 | 131 | 2.48.1
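
Note the calling-convention change the dma_blk_io() patch (new series, right column above) implies for device emulation: the completion AioContext is no longer chosen by the caller, it is whatever AioContext the submitting thread runs in. A sketch of a call site under that convention, with a hypothetical device function and QEMU-tree headers assumed:

#include "qemu/osdep.h"
#include "system/dma.h"

static BlockAIOCB *example_submit(QEMUSGList *sg, int64_t sector,
                                  DMAIOFunc *io_func, void *io_opaque,
                                  BlockCompletionFunc *cb, void *cb_opaque)
{
    /*
     * No AioContext argument: dma_blk_io() records
     * qemu_get_current_aio_context() internally, so completion runs in
     * the AioContext of the thread that submitted the request.
     */
    return dma_blk_io(sg, sector << BDRV_SECTOR_BITS, BDRV_SECTOR_SIZE,
                      io_func, io_opaque, cb, cb_opaque,
                      DMA_DIRECTION_FROM_DEVICE);
}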
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Mark functions as coroutine_fn when they are only called by other coroutine_fns | 3 | Until now, a SCSIDevice's I/O requests have run in a single AioContext. |
4 | and they can suspend. Change calls to co_wrappers to use the non-wrapped | 4 | In order to support multiple IOThreads it will be necessary to move to |
5 | functions, which in turn requires adding GRAPH_RDLOCK annotations. | 5 | the concept of a per-SCSIRequest AioContext. |
6 | 6 | ||
7 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | 7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
8 | Message-ID: <20230601115145.196465-10-pbonzini@redhat.com> | ||
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 8 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> |
9 | Message-ID: <20250311132616.1049687-4-stefanha@redhat.com> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 11 | --- |
12 | block/vhdx.h | 5 ++-- | 12 | include/hw/scsi/scsi.h | 1 + |
13 | block/vhdx-log.c | 36 +++++++++++++----------- | 13 | hw/scsi/scsi-bus.c | 1 + |
14 | block/vhdx.c | 73 +++++++++++++++++++++++------------------------- | 14 | hw/scsi/scsi-disk.c | 17 ++++++----------- |
15 | 3 files changed, 57 insertions(+), 57 deletions(-) | 15 | 3 files changed, 8 insertions(+), 11 deletions(-) |
16 | 16 | ||
17 | diff --git a/block/vhdx.h b/block/vhdx.h | 17 | diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h |
18 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
19 | --- a/block/vhdx.h | 19 | --- a/include/hw/scsi/scsi.h |
20 | +++ b/block/vhdx.h | 20 | +++ b/include/hw/scsi/scsi.h |
21 | @@ -XXX,XX +XXX,XX @@ bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset); | 21 | @@ -XXX,XX +XXX,XX @@ struct SCSIRequest { |
22 | int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, | 22 | SCSIBus *bus; |
23 | Error **errp); | 23 | SCSIDevice *dev; |
24 | 24 | const SCSIReqOps *ops; | |
25 | -int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, | 25 | + AioContext *ctx; |
26 | - void *data, uint32_t length, uint64_t offset); | 26 | uint32_t refcount; |
27 | +int coroutine_fn GRAPH_RDLOCK | 27 | uint32_t tag; |
28 | +vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, | 28 | uint32_t lun; |
29 | + void *data, uint32_t length, uint64_t offset); | 29 | diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c |
30 | |||
31 | static inline void leguid_to_cpus(MSGUID *guid) | ||
32 | { | ||
33 | diff --git a/block/vhdx-log.c b/block/vhdx-log.c | ||
34 | index XXXXXXX..XXXXXXX 100644 | 30 | index XXXXXXX..XXXXXXX 100644 |
35 | --- a/block/vhdx-log.c | 31 | --- a/hw/scsi/scsi-bus.c |
36 | +++ b/block/vhdx-log.c | 32 | +++ b/hw/scsi/scsi-bus.c |
37 | @@ -XXX,XX +XXX,XX @@ exit: | 33 | @@ -XXX,XX +XXX,XX @@ invalid_opcode: |
38 | * It is assumed that 'buffer' is at least 4096*num_sectors large. | ||
39 | * | ||
40 | * 0 is returned on success, -errno otherwise */ | ||
41 | -static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log, | ||
42 | - uint32_t *sectors_written, void *buffer, | ||
43 | - uint32_t num_sectors) | ||
44 | +static int coroutine_fn GRAPH_RDLOCK | ||
45 | +vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log, | ||
46 | + uint32_t *sectors_written, void *buffer, | ||
47 | + uint32_t num_sectors) | ||
48 | { | ||
49 | int ret = 0; | ||
50 | uint64_t offset; | ||
51 | @@ -XXX,XX +XXX,XX @@ static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log, | ||
52 | /* full */ | ||
53 | break; | ||
54 | } | ||
55 | - ret = bdrv_pwrite(bs->file, offset, VHDX_LOG_SECTOR_SIZE, buffer_tmp, | ||
56 | - 0); | ||
57 | + ret = bdrv_co_pwrite(bs->file, offset, VHDX_LOG_SECTOR_SIZE, buffer_tmp, 0); | ||
58 | if (ret < 0) { | ||
59 | goto exit; | ||
60 | } | ||
61 | @@ -XXX,XX +XXX,XX @@ static void vhdx_log_raw_to_le_sector(VHDXLogDescriptor *desc, | ||
62 | } | ||
63 | |||
64 | |||
65 | -static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, | ||
66 | - void *data, uint32_t length, uint64_t offset) | ||
67 | +static int coroutine_fn GRAPH_RDLOCK | ||
68 | +vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, | ||
69 | + void *data, uint32_t length, uint64_t offset) | ||
70 | { | ||
71 | int ret = 0; | ||
72 | void *buffer = NULL; | ||
73 | @@ -XXX,XX +XXX,XX @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, | ||
74 | |||
75 | sectors += partial_sectors; | ||
76 | |||
77 | - file_length = bdrv_getlength(bs->file->bs); | ||
78 | + file_length = bdrv_co_getlength(bs->file->bs); | ||
79 | if (file_length < 0) { | ||
80 | ret = file_length; | ||
81 | goto exit; | ||
82 | @@ -XXX,XX +XXX,XX @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, | ||
83 | |||
84 | if (i == 0 && leading_length) { | ||
85 | /* partial sector at the front of the buffer */ | ||
86 | - ret = bdrv_pread(bs->file, file_offset, VHDX_LOG_SECTOR_SIZE, | ||
87 | - merged_sector, 0); | ||
88 | + ret = bdrv_co_pread(bs->file, file_offset, VHDX_LOG_SECTOR_SIZE, | ||
89 | + merged_sector, 0); | ||
90 | if (ret < 0) { | ||
91 | goto exit; | ||
92 | } | ||
93 | @@ -XXX,XX +XXX,XX @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, | ||
94 | sector_write = merged_sector; | ||
95 | } else if (i == sectors - 1 && trailing_length) { | ||
96 | /* partial sector at the end of the buffer */ | ||
97 | - ret = bdrv_pread(bs->file, file_offset + trailing_length, | ||
98 | - VHDX_LOG_SECTOR_SIZE - trailing_length, | ||
99 | - merged_sector + trailing_length, 0); | ||
100 | + ret = bdrv_co_pread(bs->file, file_offset + trailing_length, | ||
101 | + VHDX_LOG_SECTOR_SIZE - trailing_length, | ||
102 | + merged_sector + trailing_length, 0); | ||
103 | if (ret < 0) { | ||
104 | goto exit; | ||
105 | } | ||
106 | @@ -XXX,XX +XXX,XX @@ exit: | ||
107 | } | ||
108 | |||
109 | /* Perform a log write, and then immediately flush the entire log */ | ||
110 | -int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, | ||
111 | - void *data, uint32_t length, uint64_t offset) | ||
112 | +int coroutine_fn | ||
113 | +vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, | ||
114 | + void *data, uint32_t length, uint64_t offset) | ||
115 | { | ||
116 | int ret = 0; | ||
117 | VHDXLogSequence logs = { .valid = true, | ||
118 | @@ -XXX,XX +XXX,XX @@ int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, | ||
119 | |||
120 | /* Make sure data written (new and/or changed blocks) is stable | ||
121 | * on disk, before creating log entry */ | ||
122 | - ret = bdrv_flush(bs); | ||
123 | + ret = bdrv_co_flush(bs); | ||
124 | if (ret < 0) { | ||
125 | goto exit; | ||
126 | } | ||
127 | @@ -XXX,XX +XXX,XX @@ int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, | ||
128 | logs.log = s->log; | ||
129 | |||
130 | /* Make sure log is stable on disk */ | ||
131 | - ret = bdrv_flush(bs); | ||
132 | + ret = bdrv_co_flush(bs); | ||
133 | if (ret < 0) { | ||
134 | goto exit; | ||
135 | } | ||
136 | diff --git a/block/vhdx.c b/block/vhdx.c | ||
137 | index XXXXXXX..XXXXXXX 100644 | ||
138 | --- a/block/vhdx.c | ||
139 | +++ b/block/vhdx.c | ||
140 | @@ -XXX,XX +XXX,XX @@ exit: | ||
141 | * | ||
142 | * Returns the file offset start of the new payload block | ||
143 | */ | ||
144 | -static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, | ||
145 | - uint64_t *new_offset, bool *need_zero) | ||
146 | +static int coroutine_fn GRAPH_RDLOCK | ||
147 | +vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, | ||
148 | + uint64_t *new_offset, bool *need_zero) | ||
149 | { | ||
150 | int64_t current_len; | ||
151 | |||
152 | - current_len = bdrv_getlength(bs->file->bs); | ||
153 | + current_len = bdrv_co_getlength(bs->file->bs); | ||
154 | if (current_len < 0) { | ||
155 | return current_len; | ||
156 | } | ||
157 | @@ -XXX,XX +XXX,XX @@ static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, | ||
158 | if (*need_zero) { | ||
159 | int ret; | ||
160 | |||
161 | - ret = bdrv_truncate(bs->file, *new_offset + s->block_size, false, | ||
162 | - PREALLOC_MODE_OFF, BDRV_REQ_ZERO_WRITE, NULL); | ||
163 | + ret = bdrv_co_truncate(bs->file, *new_offset + s->block_size, false, | ||
164 | + PREALLOC_MODE_OFF, BDRV_REQ_ZERO_WRITE, NULL); | ||
165 | if (ret != -ENOTSUP) { | ||
166 | *need_zero = false; | ||
167 | return ret; | ||
168 | } | 34 | } |
169 | } | 35 | } |
170 | 36 | ||
171 | - return bdrv_truncate(bs->file, *new_offset + s->block_size, false, | 37 | + req->ctx = qemu_get_current_aio_context(); |
172 | - PREALLOC_MODE_OFF, 0, NULL); | 38 | req->cmd = cmd; |
173 | + return bdrv_co_truncate(bs->file, *new_offset + s->block_size, false, | 39 | req->residual = req->cmd.xfer; |
174 | + PREALLOC_MODE_OFF, 0, NULL); | 40 | |
175 | } | 41 | diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c |
176 | 42 | index XXXXXXX..XXXXXXX 100644 | |
177 | /* | 43 | --- a/hw/scsi/scsi-disk.c |
178 | @@ -XXX,XX +XXX,XX @@ exit: | 44 | +++ b/hw/scsi/scsi-disk.c |
179 | * The first 64KB of the Metadata section is reserved for the metadata | 45 | @@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret) |
180 | * header and entries; beyond that, the metadata items themselves reside. | 46 | SCSIDiskReq *r = (SCSIDiskReq *)opaque; |
181 | */ | 47 | SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); |
182 | -static int vhdx_create_new_metadata(BlockBackend *blk, | 48 | |
183 | - uint64_t image_size, | 49 | - /* The request must only run in the BlockBackend's AioContext */ |
184 | - uint32_t block_size, | 50 | - assert(blk_get_aio_context(s->qdev.conf.blk) == |
185 | - uint32_t sector_size, | 51 | - qemu_get_current_aio_context()); |
186 | - uint64_t metadata_offset, | 52 | + /* The request must run in its AioContext */ |
187 | - VHDXImageType type) | 53 | + assert(r->req.ctx == qemu_get_current_aio_context()); |
188 | +static int coroutine_fn | 54 | |
189 | +vhdx_create_new_metadata(BlockBackend *blk, uint64_t image_size, | 55 | assert(r->req.aiocb != NULL); |
190 | + uint32_t block_size, uint32_t sector_size, | 56 | r->req.aiocb = NULL; |
191 | + uint64_t metadata_offset, VHDXImageType type) | 57 | @@ -XXX,XX +XXX,XX @@ static void scsi_dma_complete(void *opaque, int ret) |
58 | |||
59 | static void scsi_read_complete_noio(SCSIDiskReq *r, int ret) | ||
192 | { | 60 | { |
193 | int ret = 0; | 61 | - SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); |
194 | uint32_t offset = 0; | 62 | uint32_t n; |
195 | @@ -XXX,XX +XXX,XX @@ static int vhdx_create_new_metadata(BlockBackend *blk, | 63 | |
196 | VHDX_META_FLAGS_IS_VIRTUAL_DISK; | 64 | - /* The request must only run in the BlockBackend's AioContext */ |
197 | vhdx_metadata_entry_le_export(&md_table_entry[4]); | 65 | - assert(blk_get_aio_context(s->qdev.conf.blk) == |
198 | 66 | - qemu_get_current_aio_context()); | |
199 | - ret = blk_pwrite(blk, metadata_offset, VHDX_HEADER_BLOCK_SIZE, buffer, 0); | 67 | + /* The request must run in its AioContext */ |
200 | + ret = blk_co_pwrite(blk, metadata_offset, VHDX_HEADER_BLOCK_SIZE, buffer, 0); | 68 | + assert(r->req.ctx == qemu_get_current_aio_context()); |
201 | if (ret < 0) { | 69 | |
202 | goto exit; | 70 | assert(r->req.aiocb == NULL); |
203 | } | 71 | if (scsi_disk_req_check_error(r, ret, ret > 0)) { |
204 | 72 | @@ -XXX,XX +XXX,XX @@ static void scsi_read_data(SCSIRequest *req) | |
205 | - ret = blk_pwrite(blk, metadata_offset + (64 * KiB), | 73 | |
206 | - VHDX_METADATA_ENTRY_BUFFER_SIZE, entry_buffer, 0); | 74 | static void scsi_write_complete_noio(SCSIDiskReq *r, int ret) |
207 | + ret = blk_co_pwrite(blk, metadata_offset + (64 * KiB), | ||
208 | + VHDX_METADATA_ENTRY_BUFFER_SIZE, entry_buffer, 0); | ||
209 | if (ret < 0) { | ||
210 | goto exit; | ||
211 | } | ||
212 | @@ -XXX,XX +XXX,XX @@ exit: | ||
213 | * Fixed images: default state of the BAT is fully populated, with | ||
214 | * file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT. | ||
215 | */ | ||
216 | -static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s, | ||
217 | - uint64_t image_size, VHDXImageType type, | ||
218 | - bool use_zero_blocks, uint64_t file_offset, | ||
219 | - uint32_t length, Error **errp) | ||
220 | +static int coroutine_fn | ||
221 | +vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s, | ||
222 | + uint64_t image_size, VHDXImageType type, | ||
223 | + bool use_zero_blocks, uint64_t file_offset, | ||
224 | + uint32_t length, Error **errp) | ||
225 | { | 75 | { |
226 | int ret = 0; | 76 | - SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); |
227 | uint64_t data_file_offset; | 77 | uint32_t n; |
228 | @@ -XXX,XX +XXX,XX @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s, | 78 | |
229 | if (type == VHDX_TYPE_DYNAMIC) { | 79 | - /* The request must only run in the BlockBackend's AioContext */ |
230 | /* All zeroes, so we can just extend the file - the end of the BAT | 80 | - assert(blk_get_aio_context(s->qdev.conf.blk) == |
231 | * is the furthest thing we have written yet */ | 81 | - qemu_get_current_aio_context()); |
232 | - ret = blk_truncate(blk, data_file_offset, false, PREALLOC_MODE_OFF, | 82 | + /* The request must run in its AioContext */ |
233 | - 0, errp); | 83 | + assert(r->req.ctx == qemu_get_current_aio_context()); |
234 | + ret = blk_co_truncate(blk, data_file_offset, false, PREALLOC_MODE_OFF, | 84 | |
235 | + 0, errp); | 85 | assert (r->req.aiocb == NULL); |
236 | if (ret < 0) { | 86 | if (scsi_disk_req_check_error(r, ret, ret > 0)) { |
237 | goto exit; | ||
238 | } | ||
239 | } else if (type == VHDX_TYPE_FIXED) { | ||
240 | - ret = blk_truncate(blk, data_file_offset + image_size, false, | ||
241 | - PREALLOC_MODE_OFF, 0, errp); | ||
242 | + ret = blk_co_truncate(blk, data_file_offset + image_size, false, | ||
243 | + PREALLOC_MODE_OFF, 0, errp); | ||
244 | if (ret < 0) { | ||
245 | goto exit; | ||
246 | } | ||
247 | @@ -XXX,XX +XXX,XX @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s, | ||
248 | s->bat[sinfo.bat_idx] = cpu_to_le64(s->bat[sinfo.bat_idx]); | ||
249 | sector_num += s->sectors_per_block; | ||
250 | } | ||
251 | - ret = blk_pwrite(blk, file_offset, length, s->bat, 0); | ||
252 | + ret = blk_co_pwrite(blk, file_offset, length, s->bat, 0); | ||
253 | if (ret < 0) { | ||
254 | error_setg_errno(errp, -ret, "Failed to write the BAT"); | ||
255 | goto exit; | ||
256 | @@ -XXX,XX +XXX,XX @@ exit: | ||
257 | * to create the BAT itself, we will also cause the BAT to be | ||
258 | * created. | ||
259 | */ | ||
260 | -static int vhdx_create_new_region_table(BlockBackend *blk, | ||
261 | - uint64_t image_size, | ||
262 | - uint32_t block_size, | ||
263 | - uint32_t sector_size, | ||
264 | - uint32_t log_size, | ||
265 | - bool use_zero_blocks, | ||
266 | - VHDXImageType type, | ||
267 | - uint64_t *metadata_offset, | ||
268 | - Error **errp) | ||
269 | +static int coroutine_fn | ||
270 | +vhdx_create_new_region_table(BlockBackend *blk, uint64_t image_size, | ||
271 | + uint32_t block_size, uint32_t sector_size, | ||
272 | + uint32_t log_size, bool use_zero_blocks, | ||
273 | + VHDXImageType type, uint64_t *metadata_offset, | ||
274 | + Error **errp) | ||
275 | { | ||
276 | int ret = 0; | ||
277 | uint32_t offset = 0; | ||
278 | @@ -XXX,XX +XXX,XX @@ static int vhdx_create_new_region_table(BlockBackend *blk, | ||
279 | } | ||
280 | |||
281 | /* Now write out the region headers to disk */ | ||
282 | - ret = blk_pwrite(blk, VHDX_REGION_TABLE_OFFSET, VHDX_HEADER_BLOCK_SIZE, | ||
283 | - buffer, 0); | ||
284 | + ret = blk_co_pwrite(blk, VHDX_REGION_TABLE_OFFSET, VHDX_HEADER_BLOCK_SIZE, | ||
285 | + buffer, 0); | ||
286 | if (ret < 0) { | ||
287 | error_setg_errno(errp, -ret, "Failed to write first region table"); | ||
288 | goto exit; | ||
289 | } | ||
290 | |||
291 | - ret = blk_pwrite(blk, VHDX_REGION_TABLE2_OFFSET, VHDX_HEADER_BLOCK_SIZE, | ||
292 | - buffer, 0); | ||
293 | + ret = blk_co_pwrite(blk, VHDX_REGION_TABLE2_OFFSET, VHDX_HEADER_BLOCK_SIZE, | ||
294 | + buffer, 0); | ||
295 | if (ret < 0) { | ||
296 | error_setg_errno(errp, -ret, "Failed to write second region table"); | ||
297 | goto exit; | ||
298 | -- | 87 | -- |
299 | 2.41.0 | 88 | 2.48.1
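
The invariant introduced by the per-SCSIRequest AioContext patch (new series, right column above) is simple: a request remembers the AioContext of the thread that created it, and completion code asserts that it is running there. A minimal sketch of that pattern, with invented type names and QEMU-tree headers assumed:

#include "qemu/osdep.h"
#include "block/aio.h"

typedef struct ExampleReq {
    AioContext *ctx;   /* AioContext this request runs in */
} ExampleReq;

static void example_req_init(ExampleReq *req)
{
    /* Capture the submitting thread's AioContext at creation time. */
    req->ctx = qemu_get_current_aio_context();
}

static void example_req_complete(ExampleReq *req)
{
    /* The request must run in its AioContext. */
    assert(req->ctx == qemu_get_current_aio_context());
}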
1 | If the caller keeps the AioContext lock for a block node in an iothread, | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | polling in bdrv_graph_wrlock() deadlocks if the condition isn't | ||
3 | fulfilled immediately. | ||
4 | 2 | ||
5 | Now that all callers make sure to actually have the AioContext locked | 3 | SCSIDevice keeps track of in-flight requests for device reset and Task |
6 | when they call bdrv_replace_child_noperm() like they should, we can | 4 | Management Functions (TMFs). The request list requires protection so |
7 | change bdrv_graph_wrlock() to take a BlockDriverState whose AioContext | 5 | that multi-threaded SCSI emulation can be implemented in commits that |
8 | lock the caller holds (NULL if it doesn't) and unlock it temporarily | 6 | follow. |
9 | while polling. | ||
10 | 7 | ||
11 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
12 | Message-ID: <20230605085711.21261-11-kwolf@redhat.com> | 9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> |
13 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 10 | Message-ID: <20250311132616.1049687-5-stefanha@redhat.com> |
14 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 11 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
15 | --- | 12 | --- |
16 | include/block/graph-lock.h | 6 ++++-- | 13 | include/hw/scsi/scsi.h | 7 ++- |
17 | block.c | 4 ++-- | 14 | hw/scsi/scsi-bus.c | 120 +++++++++++++++++++++++++++++------------ |
18 | block/graph-lock.c | 23 ++++++++++++++++++++++- | 15 | 2 files changed, 88 insertions(+), 39 deletions(-) |
19 | 3 files changed, 28 insertions(+), 5 deletions(-) | ||
20 | 16 | ||
21 | diff --git a/include/block/graph-lock.h b/include/block/graph-lock.h | 17 | diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h |
22 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
23 | --- a/include/block/graph-lock.h | 19 | --- a/include/hw/scsi/scsi.h |
24 | +++ b/include/block/graph-lock.h | 20 | +++ b/include/hw/scsi/scsi.h |
25 | @@ -XXX,XX +XXX,XX @@ void unregister_aiocontext(AioContext *ctx); | 21 | @@ -XXX,XX +XXX,XX @@ struct SCSIRequest { |
26 | * The wrlock can only be taken from the main loop, with BQL held, as only the | 22 | bool dma_started; |
27 | * main loop is allowed to modify the graph. | 23 | BlockAIOCB *aiocb; |
28 | * | 24 | QEMUSGList *sg; |
29 | + * If @bs is non-NULL, its AioContext is temporarily released. | 25 | + |
26 | + /* Protected by SCSIDevice->requests_lock */ | ||
27 | QTAILQ_ENTRY(SCSIRequest) next; | ||
28 | }; | ||
29 | |||
30 | @@ -XXX,XX +XXX,XX @@ struct SCSIDevice | ||
31 | uint8_t sense[SCSI_SENSE_BUF_SIZE]; | ||
32 | uint32_t sense_len; | ||
33 | |||
34 | - /* | ||
35 | - * The requests list is only accessed from the AioContext that executes | ||
36 | - * requests or from the main loop when IOThread processing is stopped. | ||
37 | - */ | ||
38 | + QemuMutex requests_lock; /* protects the requests list */ | ||
39 | QTAILQ_HEAD(, SCSIRequest) requests; | ||
40 | |||
41 | uint32_t channel; | ||
42 | diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c | ||
43 | index XXXXXXX..XXXXXXX 100644 | ||
44 | --- a/hw/scsi/scsi-bus.c | ||
45 | +++ b/hw/scsi/scsi-bus.c | ||
46 | @@ -XXX,XX +XXX,XX @@ static void scsi_device_for_each_req_sync(SCSIDevice *s, | ||
47 | assert(!runstate_is_running()); | ||
48 | assert(qemu_in_main_thread()); | ||
49 | |||
50 | - QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) { | ||
51 | - fn(req, opaque); | ||
52 | + /* | ||
53 | + * Locking is not necessary because the guest is stopped and no other | ||
54 | + * threads can be accessing the requests list, but take the lock for | ||
55 | + * consistency. | ||
56 | + */ | ||
57 | + WITH_QEMU_LOCK_GUARD(&s->requests_lock) { | ||
58 | + QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) { | ||
59 | + fn(req, opaque); | ||
60 | + } | ||
61 | } | ||
62 | } | ||
63 | |||
64 | @@ -XXX,XX +XXX,XX @@ static void scsi_device_for_each_req_async_bh(void *opaque) | ||
65 | { | ||
66 | g_autofree SCSIDeviceForEachReqAsyncData *data = opaque; | ||
67 | SCSIDevice *s = data->s; | ||
68 | - AioContext *ctx; | ||
69 | - SCSIRequest *req; | ||
70 | - SCSIRequest *next; | ||
71 | + g_autoptr(GList) reqs = NULL; | ||
72 | |||
73 | /* | ||
74 | - * The BB cannot have changed contexts between this BH being scheduled and | ||
75 | - * now: BBs' AioContexts, when they have a node attached, can only be | ||
76 | - * changed via bdrv_try_change_aio_context(), in a drained section. While | ||
77 | - * we have the in-flight counter incremented, that drain must block. | ||
78 | + * Build a list of requests in this AioContext so fn() can be invoked later | ||
79 | + * outside requests_lock. | ||
80 | */ | ||
81 | - ctx = blk_get_aio_context(s->conf.blk); | ||
82 | - assert(ctx == qemu_get_current_aio_context()); | ||
83 | + WITH_QEMU_LOCK_GUARD(&s->requests_lock) { | ||
84 | + AioContext *ctx = qemu_get_current_aio_context(); | ||
85 | + SCSIRequest *req; | ||
86 | + SCSIRequest *next; | ||
87 | + | ||
88 | + QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) { | ||
89 | + if (req->ctx == ctx) { | ||
90 | + scsi_req_ref(req); /* dropped after calling fn() */ | ||
91 | + reqs = g_list_prepend(reqs, req); | ||
92 | + } | ||
93 | + } | ||
94 | + } | ||
95 | |||
96 | - QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) { | ||
97 | - data->fn(req, data->fn_opaque); | ||
98 | + /* Call fn() on each request */ | ||
99 | + for (GList *elem = g_list_first(reqs); elem; elem = g_list_next(elem)) { | ||
100 | + data->fn(elem->data, data->fn_opaque); | ||
101 | + scsi_req_unref(elem->data); | ||
102 | } | ||
103 | |||
104 | /* Drop the reference taken by scsi_device_for_each_req_async() */ | ||
105 | @@ -XXX,XX +XXX,XX @@ static void scsi_device_for_each_req_async_bh(void *opaque) | ||
106 | blk_dec_in_flight(s->conf.blk); | ||
107 | } | ||
108 | |||
109 | +static void scsi_device_for_each_req_async_do_ctx(gpointer key, gpointer value, | ||
110 | + gpointer user_data) | ||
111 | +{ | ||
112 | + AioContext *ctx = key; | ||
113 | + SCSIDeviceForEachReqAsyncData *params = user_data; | ||
114 | + SCSIDeviceForEachReqAsyncData *data; | ||
115 | + | ||
116 | + data = g_new(SCSIDeviceForEachReqAsyncData, 1); | ||
117 | + data->s = params->s; | ||
118 | + data->fn = params->fn; | ||
119 | + data->fn_opaque = params->fn_opaque; | ||
120 | + | ||
121 | + /* | ||
122 | + * Hold a reference to the SCSIDevice until | ||
123 | + * scsi_device_for_each_req_async_bh() finishes. | ||
124 | + */ | ||
125 | + object_ref(OBJECT(data->s)); | ||
126 | + | ||
127 | + /* Paired with scsi_device_for_each_req_async_bh() */ | ||
128 | + blk_inc_in_flight(data->s->conf.blk); | ||
129 | + | ||
130 | + aio_bh_schedule_oneshot(ctx, scsi_device_for_each_req_async_bh, data); | ||
131 | +} | ||
132 | + | ||
133 | /* | ||
134 | * Schedule @fn() to be invoked for each enqueued request in device @s. @fn() | ||
135 | - * runs in the AioContext that is executing the request. | ||
136 | + * must be thread-safe because it runs concurrently in each AioContext that is | ||
137 | + * executing a request. | ||
30 | + * | 138 | + * |
31 | * This function polls. Callers must not hold the lock of any AioContext other | 139 | * Keeps the BlockBackend's in-flight counter incremented until everything is |
32 | - * than the current one. | 140 | * done, so draining it will settle all scheduled @fn() calls. |
33 | + * than the current one and the one of @bs. | ||
34 | */ | 141 | */ |
35 | -void bdrv_graph_wrlock(void) TSA_ACQUIRE(graph_lock) TSA_NO_TSA; | 142 | @@ -XXX,XX +XXX,XX @@ static void scsi_device_for_each_req_async(SCSIDevice *s, |
36 | +void bdrv_graph_wrlock(BlockDriverState *bs) TSA_ACQUIRE(graph_lock) TSA_NO_TSA; | ||
37 | |||
38 | /* | ||
39 | * bdrv_graph_wrunlock: | ||
40 | diff --git a/block.c b/block.c | ||
41 | index XXXXXXX..XXXXXXX 100644 | ||
42 | --- a/block.c | ||
43 | +++ b/block.c | ||
44 | @@ -XXX,XX +XXX,XX @@ uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm) | ||
45 | * Replaces the node that a BdrvChild points to without updating permissions. | ||
46 | * | ||
47 | * If @new_bs is non-NULL, the parent of @child must already be drained through | ||
48 | - * @child. | ||
49 | + * @child and the caller must hold the AioContext lock for @new_bs. | ||
50 | */ | ||
51 | static void bdrv_replace_child_noperm(BdrvChild *child, | ||
52 | BlockDriverState *new_bs) | ||
53 | @@ -XXX,XX +XXX,XX @@ static void bdrv_replace_child_noperm(BdrvChild *child, | ||
54 | } | ||
55 | |||
56 | /* TODO Pull this up into the callers to avoid polling here */ | ||
57 | - bdrv_graph_wrlock(); | ||
58 | + bdrv_graph_wrlock(new_bs); | ||
59 | if (old_bs) { | ||
60 | if (child->klass->detach) { | ||
61 | child->klass->detach(child); | ||
62 | diff --git a/block/graph-lock.c b/block/graph-lock.c | ||
63 | index XXXXXXX..XXXXXXX 100644 | ||
64 | --- a/block/graph-lock.c | ||
65 | +++ b/block/graph-lock.c | ||
66 | @@ -XXX,XX +XXX,XX @@ static uint32_t reader_count(void) | ||
67 | } | ||
68 | #endif | ||
69 | |||
70 | -void bdrv_graph_wrlock(void) | ||
71 | +void bdrv_graph_wrlock(BlockDriverState *bs) | ||
72 | { | 143 | { |
73 | + AioContext *ctx = NULL; | 144 | assert(qemu_in_main_thread()); |
74 | + | 145 | |
75 | GLOBAL_STATE_CODE(); | 146 | - SCSIDeviceForEachReqAsyncData *data = |
76 | /* | 147 | - g_new(SCSIDeviceForEachReqAsyncData, 1); |
77 | * TODO Some callers hold an AioContext lock when this is called, which | 148 | - |
78 | @@ -XXX,XX +XXX,XX @@ void bdrv_graph_wrlock(void) | 149 | - data->s = s; |
79 | */ | 150 | - data->fn = fn; |
80 | #if 0 | 151 | - data->fn_opaque = opaque; |
81 | assert(!qatomic_read(&has_writer)); | 152 | - |
82 | +#endif | 153 | - /* |
83 | + | 154 | - * Hold a reference to the SCSIDevice until |
84 | + /* | 155 | - * scsi_device_for_each_req_async_bh() finishes. |
85 | + * Release only non-mainloop AioContext. The mainloop often relies on the | 156 | - */ |
86 | + * BQL and doesn't lock the main AioContext before doing things. | 157 | - object_ref(OBJECT(s)); |
87 | + */ | 158 | + /* The set of AioContexts where the requests are being processed */ |
88 | + if (bs) { | 159 | + g_autoptr(GHashTable) aio_contexts = g_hash_table_new(NULL, NULL); |
89 | + ctx = bdrv_get_aio_context(bs); | 160 | + WITH_QEMU_LOCK_GUARD(&s->requests_lock) { |
90 | + if (ctx != qemu_get_aio_context()) { | 161 | + SCSIRequest *req; |
91 | + aio_context_release(ctx); | 162 | + QTAILQ_FOREACH(req, &s->requests, next) { |
92 | + } else { | 163 | + g_hash_table_add(aio_contexts, req->ctx); |
93 | + ctx = NULL; | ||
94 | + } | 164 | + } |
95 | + } | 165 | + } |
96 | 166 | ||
97 | +#if 0 | 167 | - /* Paired with blk_dec_in_flight() in scsi_device_for_each_req_async_bh() */ |
98 | /* Make sure that constantly arriving new I/O doesn't cause starvation */ | 168 | - blk_inc_in_flight(s->conf.blk); |
99 | bdrv_drain_all_begin_nopoll(); | 169 | - aio_bh_schedule_oneshot(blk_get_aio_context(s->conf.blk), |
100 | 170 | - scsi_device_for_each_req_async_bh, | |
101 | @@ -XXX,XX +XXX,XX @@ void bdrv_graph_wrlock(void) | 171 | - data); |
102 | 172 | + /* Schedule a BH for each AioContext */ | |
103 | bdrv_drain_all_end(); | 173 | + SCSIDeviceForEachReqAsyncData params = { |
104 | #endif | 174 | + .s = s, |
105 | + | 175 | + .fn = fn, |
106 | + if (ctx) { | 176 | + .fn_opaque = opaque, |
107 | + aio_context_acquire(bdrv_get_aio_context(bs)); | 177 | + }; |
178 | + g_hash_table_foreach( | ||
179 | + aio_contexts, | ||
180 | + scsi_device_for_each_req_async_do_ctx, | ||
181 | + ¶ms | ||
182 | + ); | ||
183 | } | ||
184 | |||
185 | static void scsi_device_realize(SCSIDevice *s, Error **errp) | ||
186 | @@ -XXX,XX +XXX,XX @@ static void scsi_qdev_realize(DeviceState *qdev, Error **errp) | ||
187 | dev->lun = lun; | ||
188 | } | ||
189 | |||
190 | + qemu_mutex_init(&dev->requests_lock); | ||
191 | QTAILQ_INIT(&dev->requests); | ||
192 | scsi_device_realize(dev, &local_err); | ||
193 | if (local_err) { | ||
194 | @@ -XXX,XX +XXX,XX @@ static void scsi_qdev_unrealize(DeviceState *qdev) | ||
195 | |||
196 | scsi_device_purge_requests(dev, SENSE_CODE(NO_SENSE)); | ||
197 | |||
198 | + qemu_mutex_destroy(&dev->requests_lock); | ||
199 | + | ||
200 | scsi_device_unrealize(dev); | ||
201 | |||
202 | blockdev_mark_auto_del(dev->conf.blk); | ||
203 | @@ -XXX,XX +XXX,XX @@ static void scsi_req_enqueue_internal(SCSIRequest *req) | ||
204 | req->sg = NULL; | ||
205 | } | ||
206 | req->enqueued = true; | ||
207 | - QTAILQ_INSERT_TAIL(&req->dev->requests, req, next); | ||
208 | + | ||
209 | + WITH_QEMU_LOCK_GUARD(&req->dev->requests_lock) { | ||
210 | + QTAILQ_INSERT_TAIL(&req->dev->requests, req, next); | ||
108 | + } | 211 | + } |
109 | } | 212 | } |
110 | 213 | ||
111 | void bdrv_graph_wrunlock(void) | 214 | int32_t scsi_req_enqueue(SCSIRequest *req) |
215 | @@ -XXX,XX +XXX,XX @@ static void scsi_req_dequeue(SCSIRequest *req) | ||
216 | trace_scsi_req_dequeue(req->dev->id, req->lun, req->tag); | ||
217 | req->retry = false; | ||
218 | if (req->enqueued) { | ||
219 | - QTAILQ_REMOVE(&req->dev->requests, req, next); | ||
220 | + WITH_QEMU_LOCK_GUARD(&req->dev->requests_lock) { | ||
221 | + QTAILQ_REMOVE(&req->dev->requests, req, next); | ||
222 | + } | ||
223 | req->enqueued = false; | ||
224 | scsi_req_unref(req); | ||
225 | } | ||
226 | @@ -XXX,XX +XXX,XX @@ static void scsi_device_class_init(ObjectClass *klass, void *data) | ||
227 | |||
228 | static void scsi_dev_instance_init(Object *obj) | ||
229 | { | ||
230 | - DeviceState *dev = DEVICE(obj); | ||
231 | - SCSIDevice *s = SCSI_DEVICE(dev); | ||
232 | + SCSIDevice *s = SCSI_DEVICE(obj); | ||
233 | |||
234 | device_add_bootindex_property(obj, &s->conf.bootindex, | ||
235 | "bootindex", NULL, | ||
112 | -- | 236 | -- |
113 | 2.41.0 | 237 | 2.48.1
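
The core of the requests_lock patch (new series, right column above) is a classic lock-protected intrusive list: enqueue, dequeue and traversal take the mutex, and WITH_QEMU_LOCK_GUARD keeps the critical sections scoped. A reduced sketch with invented types, assuming the QEMU tree's headers:

#include "qemu/osdep.h"
#include "qemu/lockable.h"
#include "qemu/queue.h"
#include "qemu/thread.h"

typedef struct ExampleReq {
    QTAILQ_ENTRY(ExampleReq) next;   /* protected by ExampleDev->lock */
} ExampleReq;

typedef struct ExampleDev {
    QemuMutex lock;                  /* protects the requests list */
    QTAILQ_HEAD(, ExampleReq) requests;
} ExampleDev;

static void example_dev_init(ExampleDev *d)
{
    qemu_mutex_init(&d->lock);
    QTAILQ_INIT(&d->requests);
}

static void example_enqueue(ExampleDev *d, ExampleReq *r)
{
    /* The guard drops the mutex when the block is left on any path. */
    WITH_QEMU_LOCK_GUARD(&d->lock) {
        QTAILQ_INSERT_TAIL(&d->requests, r, next);
    }
}

static void example_dequeue(ExampleDev *d, ExampleReq *r)
{
    WITH_QEMU_LOCK_GUARD(&d->lock) {
        QTAILQ_REMOVE(&d->requests, r, next);
    }
}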
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Mark functions as coroutine_fn when they are only called by other coroutine_fns | 3 | Virtqueues are not thread-safe. Until now this was not a major issue |
4 | and they can suspend. Change calls to co_wrappers to use the non-wrapped | 4 | since all virtqueue processing happened in the same thread. The ctrl |
5 | functions, which in turn requires adding GRAPH_RDLOCK annotations. | 5 | queue's Task Management Function (TMF) requests sometimes need the main |
6 | 6 | loop, so a BH was used to schedule the virtqueue completion back in the | |
7 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | 7 | thread that has virtqueue access. |
8 | Message-ID: <20230601115145.196465-4-pbonzini@redhat.com> | 8 | |
9 | When IOThread Virtqueue Mapping is introduced in later commits, event | ||
10 | and ctrl virtqueue accesses from other threads will become necessary. | ||
11 | Introduce an optional per-virtqueue lock so the event and ctrl | ||
12 | virtqueues can be protected in the commits that follow. | ||
13 | |||
14 | The addition of the ctrl virtqueue lock makes | ||
15 | virtio_scsi_complete_req_from_main_loop() and its BH unnecessary. | ||
16 | Instead, take the ctrl virtqueue lock from the main loop thread. | ||
17 | |||
18 | The cmd virtqueue does not have a lock because the entirety of SCSI | ||
19 | command processing happens in one thread. Only one thread accesses the | ||
20 | cmd virtqueue and a lock is unnecessary. | ||
21 | |||
22 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 23 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> |
24 | Message-ID: <20250311132616.1049687-6-stefanha@redhat.com> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 25 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 26 | --- |
12 | block/vpc.c | 52 ++++++++++++++++++++++++++-------------------------- | 27 | include/hw/virtio/virtio-scsi.h | 3 ++ |
13 | 1 file changed, 26 insertions(+), 26 deletions(-) | 28 | hw/scsi/virtio-scsi.c | 84 ++++++++++++++++++--------------- |
14 | 29 | 2 files changed, 49 insertions(+), 38 deletions(-) | |
15 | diff --git a/block/vpc.c b/block/vpc.c | 30 | |
31 | diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h | ||
16 | index XXXXXXX..XXXXXXX 100644 | 32 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/vpc.c | 33 | --- a/include/hw/virtio/virtio-scsi.h |
18 | +++ b/block/vpc.c | 34 | +++ b/include/hw/virtio/virtio-scsi.h |
19 | @@ -XXX,XX +XXX,XX @@ static int vpc_reopen_prepare(BDRVReopenState *state, | 35 | @@ -XXX,XX +XXX,XX @@ struct VirtIOSCSI { |
20 | * operation (the block bitmaps is updated then), 0 otherwise. | 36 | int resetting; /* written from main loop thread, read from any thread */ |
21 | * If write is true then err must not be NULL. | 37 | bool events_dropped; |
22 | */ | 38 | |
23 | -static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset, | 39 | + QemuMutex ctrl_lock; /* protects ctrl_vq */ |
24 | - bool write, int *err) | 40 | + QemuMutex event_lock; /* protects event_vq */ |
25 | +static int64_t coroutine_fn GRAPH_RDLOCK | 41 | + |
26 | +get_image_offset(BlockDriverState *bs, uint64_t offset, bool write, int *err) | 42 | /* |
27 | { | 43 | * TMFs deferred to main loop BH. These fields are protected by |
28 | BDRVVPCState *s = bs->opaque; | 44 | * tmf_bh_lock. |
29 | uint64_t bitmap_offset, block_offset; | 45 | diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c |
30 | @@ -XXX,XX +XXX,XX @@ static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset, | 46 | index XXXXXXX..XXXXXXX 100644 |
31 | 47 | --- a/hw/scsi/virtio-scsi.c | |
32 | s->last_bitmap_offset = bitmap_offset; | 48 | +++ b/hw/scsi/virtio-scsi.c |
33 | memset(bitmap, 0xff, s->bitmap_size); | 49 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_free_req(VirtIOSCSIReq *req) |
34 | - r = bdrv_pwrite_sync(bs->file, bitmap_offset, s->bitmap_size, bitmap, | 50 | g_free(req); |
35 | - 0); | 51 | } |
36 | + r = bdrv_co_pwrite_sync(bs->file, bitmap_offset, s->bitmap_size, bitmap, 0); | 52 | |
37 | if (r < 0) { | 53 | -static void virtio_scsi_complete_req(VirtIOSCSIReq *req) |
38 | *err = r; | 54 | +static void virtio_scsi_complete_req(VirtIOSCSIReq *req, QemuMutex *vq_lock) |
39 | return -2; | 55 | { |
40 | @@ -XXX,XX +XXX,XX @@ static inline int64_t get_image_offset(BlockDriverState *bs, uint64_t offset, | 56 | VirtIOSCSI *s = req->dev; |
41 | * | 57 | VirtQueue *vq = req->vq; |
42 | * Returns 0 on success and < 0 on error | 58 | VirtIODevice *vdev = VIRTIO_DEVICE(s); |
43 | */ | 59 | |
44 | -static int rewrite_footer(BlockDriverState *bs) | 60 | qemu_iovec_from_buf(&req->resp_iov, 0, &req->resp, req->resp_size); |
45 | +static int coroutine_fn GRAPH_RDLOCK rewrite_footer(BlockDriverState *bs) | 61 | + |
46 | { | 62 | + if (vq_lock) { |
47 | int ret; | 63 | + qemu_mutex_lock(vq_lock); |
48 | BDRVVPCState *s = bs->opaque; | 64 | + } |
49 | int64_t offset = s->free_data_block_offset; | 65 | + |
50 | 66 | virtqueue_push(vq, &req->elem, req->qsgl.size + req->resp_iov.size); | |
51 | - ret = bdrv_pwrite_sync(bs->file, offset, sizeof(s->footer), &s->footer, 0); | 67 | if (s->dataplane_started && !s->dataplane_fenced) { |
52 | + ret = bdrv_co_pwrite_sync(bs->file, offset, sizeof(s->footer), &s->footer, 0); | 68 | virtio_notify_irqfd(vdev, vq); |
53 | if (ret < 0) | 69 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req) |
54 | return ret; | 70 | virtio_notify(vdev, vq); |
55 | 71 | } | |
56 | @@ -XXX,XX +XXX,XX @@ static int rewrite_footer(BlockDriverState *bs) | 72 | |
57 | * | 73 | + if (vq_lock) { |
58 | * Returns the sectors' offset in the image file on success and < 0 on error | 74 | + qemu_mutex_unlock(vq_lock); |
59 | */ | 75 | + } |
60 | -static int64_t alloc_block(BlockDriverState *bs, int64_t offset) | 76 | + |
61 | +static int64_t coroutine_fn GRAPH_RDLOCK | 77 | if (req->sreq) { |
62 | +alloc_block(BlockDriverState *bs, int64_t offset) | 78 | req->sreq->hba_private = NULL; |
63 | { | 79 | scsi_req_unref(req->sreq); |
64 | BDRVVPCState *s = bs->opaque; | 80 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req) |
65 | int64_t bat_offset; | 81 | virtio_scsi_free_req(req); |
66 | @@ -XXX,XX +XXX,XX @@ static int64_t alloc_block(BlockDriverState *bs, int64_t offset) | 82 | } |
67 | 83 | ||
68 | /* Initialize the block's bitmap */ | 84 | -static void virtio_scsi_complete_req_bh(void *opaque) |
69 | memset(bitmap, 0xff, s->bitmap_size); | 85 | +static void virtio_scsi_bad_req(VirtIOSCSIReq *req, QemuMutex *vq_lock) |
70 | - ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, | 86 | { |
71 | - s->bitmap_size, bitmap, 0); | 87 | - VirtIOSCSIReq *req = opaque; |
72 | + ret = bdrv_co_pwrite_sync(bs->file, s->free_data_block_offset, | 88 | + virtio_error(VIRTIO_DEVICE(req->dev), "wrong size for virtio-scsi headers"); |
73 | + s->bitmap_size, bitmap, 0); | 89 | |
74 | if (ret < 0) { | 90 | - virtio_scsi_complete_req(req); |
75 | return ret; | 91 | -} |
76 | } | 92 | + if (vq_lock) { |
77 | @@ -XXX,XX +XXX,XX @@ static int64_t alloc_block(BlockDriverState *bs, int64_t offset) | 93 | + qemu_mutex_lock(vq_lock); |
78 | /* Write BAT entry to disk */ | 94 | + } |
79 | bat_offset = s->bat_offset + (4 * index); | 95 | |
80 | bat_value = cpu_to_be32(s->pagetable[index]); | 96 | -/* |
81 | - ret = bdrv_pwrite_sync(bs->file, bat_offset, 4, &bat_value, 0); | 97 | - * Called from virtio_scsi_do_one_tmf_bh() in main loop thread. The main loop |
82 | + ret = bdrv_co_pwrite_sync(bs->file, bat_offset, 4, &bat_value, 0); | 98 | - * thread cannot touch the virtqueue since that could race with an IOThread. |
83 | if (ret < 0) | 99 | - */ |
84 | goto fail; | 100 | -static void virtio_scsi_complete_req_from_main_loop(VirtIOSCSIReq *req) |
85 | 101 | -{ | |
86 | @@ -XXX,XX +XXX,XX @@ fail: | 102 | - VirtIOSCSI *s = req->dev; |
87 | return ret; | 103 | + virtqueue_detach_element(req->vq, &req->elem, 0); |
88 | } | 104 | |
89 | 105 | - if (!s->ctx || s->ctx == qemu_get_aio_context()) { | |
90 | -static int coroutine_fn vpc_co_block_status(BlockDriverState *bs, | 106 | - /* No need to schedule a BH when there is no IOThread */ |
91 | - bool want_zero, | 107 | - virtio_scsi_complete_req(req); |
92 | - int64_t offset, int64_t bytes, | 108 | - } else { |
93 | - int64_t *pnum, int64_t *map, | 109 | - /* Run request completion in the IOThread */ |
94 | - BlockDriverState **file) | 110 | - aio_wait_bh_oneshot(s->ctx, virtio_scsi_complete_req_bh, req); |
95 | +static int coroutine_fn GRAPH_RDLOCK | 111 | + if (vq_lock) { |
96 | +vpc_co_block_status(BlockDriverState *bs, bool want_zero, | 112 | + qemu_mutex_unlock(vq_lock); |
97 | + int64_t offset, int64_t bytes, | 113 | } |
98 | + int64_t *pnum, int64_t *map, | 114 | -} |
99 | + BlockDriverState **file) | 115 | |
100 | { | 116 | -static void virtio_scsi_bad_req(VirtIOSCSIReq *req) |
101 | BDRVVPCState *s = bs->opaque; | 117 | -{ |
102 | int64_t image_offset; | 118 | - virtio_error(VIRTIO_DEVICE(req->dev), "wrong size for virtio-scsi headers"); |
103 | @@ -XXX,XX +XXX,XX @@ static int calculate_geometry(int64_t total_sectors, uint16_t *cyls, | 119 | - virtqueue_detach_element(req->vq, &req->elem, 0); |
120 | virtio_scsi_free_req(req); | ||
121 | } | ||
122 | |||
123 | @@ -XXX,XX +XXX,XX @@ static int virtio_scsi_parse_req(VirtIOSCSIReq *req, | ||
104 | return 0; | 124 | return 0; |
105 | } | 125 | } |
106 | 126 | ||
107 | -static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer, | 127 | -static VirtIOSCSIReq *virtio_scsi_pop_req(VirtIOSCSI *s, VirtQueue *vq) |
108 | - int64_t total_sectors) | 128 | +static VirtIOSCSIReq *virtio_scsi_pop_req(VirtIOSCSI *s, VirtQueue *vq, QemuMutex *vq_lock) |
109 | +static int coroutine_fn create_dynamic_disk(BlockBackend *blk, VHDFooter *footer, | 129 | { |
110 | + int64_t total_sectors) | 130 | VirtIOSCSICommon *vs = (VirtIOSCSICommon *)s; |
111 | { | 131 | VirtIOSCSIReq *req; |
112 | VHDDynDiskHeader dyndisk_header; | 132 | |
113 | uint8_t bat_sector[512]; | 133 | + if (vq_lock) { |
114 | @@ -XXX,XX +XXX,XX @@ static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer, | 134 | + qemu_mutex_lock(vq_lock); |
115 | block_size = 0x200000; | 135 | + } |
116 | num_bat_entries = DIV_ROUND_UP(total_sectors, block_size / 512); | 136 | + |
117 | 137 | req = virtqueue_pop(vq, sizeof(VirtIOSCSIReq) + vs->cdb_size); | |
118 | - ret = blk_pwrite(blk, offset, sizeof(*footer), footer, 0); | 138 | + |
119 | + ret = blk_co_pwrite(blk, offset, sizeof(*footer), footer, 0); | 139 | + if (vq_lock) { |
120 | if (ret < 0) { | 140 | + qemu_mutex_unlock(vq_lock); |
121 | goto fail; | 141 | + } |
122 | } | 142 | + |
123 | 143 | if (!req) { | |
124 | offset = 1536 + ((num_bat_entries * 4 + 511) & ~511); | 144 | return NULL; |
125 | - ret = blk_pwrite(blk, offset, sizeof(*footer), footer, 0); | 145 | } |
126 | + ret = blk_co_pwrite(blk, offset, sizeof(*footer), footer, 0); | 146 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_cancel_notify(Notifier *notifier, void *data) |
127 | if (ret < 0) { | 147 | |
128 | goto fail; | 148 | trace_virtio_scsi_tmf_resp(virtio_scsi_get_lun(req->req.tmf.lun), |
129 | } | 149 | req->req.tmf.tag, req->resp.tmf.response); |
130 | @@ -XXX,XX +XXX,XX @@ static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer, | 150 | - virtio_scsi_complete_req(req); |
131 | 151 | + virtio_scsi_complete_req(req, &req->dev->ctrl_lock); | |
132 | memset(bat_sector, 0xFF, 512); | 152 | } |
133 | for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) { | 153 | g_free(n); |
134 | - ret = blk_pwrite(blk, offset, 512, bat_sector, 0); | 154 | } |
135 | + ret = blk_co_pwrite(blk, offset, 512, bat_sector, 0); | 155 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_do_one_tmf_bh(VirtIOSCSIReq *req) |
136 | if (ret < 0) { | 156 | |
137 | goto fail; | 157 | out: |
158 | object_unref(OBJECT(d)); | ||
159 | - virtio_scsi_complete_req_from_main_loop(req); | ||
160 | + virtio_scsi_complete_req(req, &s->ctrl_lock); | ||
161 | } | ||
162 | |||
163 | /* Some TMFs must be processed from the main loop thread */ | ||
164 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_reset_tmf_bh(VirtIOSCSI *s) | ||
165 | |||
166 | /* SAM-6 6.3.2 Hard reset */ | ||
167 | req->resp.tmf.response = VIRTIO_SCSI_S_TARGET_FAILURE; | ||
168 | - virtio_scsi_complete_req(req); | ||
169 | + virtio_scsi_complete_req(req, &req->dev->ctrl_lock); | ||
170 | } | ||
171 | } | ||
172 | |||
173 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req) | ||
174 | |||
175 | if (iov_to_buf(req->elem.out_sg, req->elem.out_num, 0, | ||
176 | &type, sizeof(type)) < sizeof(type)) { | ||
177 | - virtio_scsi_bad_req(req); | ||
178 | + virtio_scsi_bad_req(req, &s->ctrl_lock); | ||
179 | return; | ||
180 | } | ||
181 | |||
182 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req) | ||
183 | if (type == VIRTIO_SCSI_T_TMF) { | ||
184 | if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlTMFReq), | ||
185 | sizeof(VirtIOSCSICtrlTMFResp)) < 0) { | ||
186 | - virtio_scsi_bad_req(req); | ||
187 | + virtio_scsi_bad_req(req, &s->ctrl_lock); | ||
188 | return; | ||
189 | } else { | ||
190 | r = virtio_scsi_do_tmf(s, req); | ||
191 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req) | ||
192 | type == VIRTIO_SCSI_T_AN_SUBSCRIBE) { | ||
193 | if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlANReq), | ||
194 | sizeof(VirtIOSCSICtrlANResp)) < 0) { | ||
195 | - virtio_scsi_bad_req(req); | ||
196 | + virtio_scsi_bad_req(req, &s->ctrl_lock); | ||
197 | return; | ||
198 | } else { | ||
199 | req->req.an.event_requested = | ||
200 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req) | ||
201 | type == VIRTIO_SCSI_T_AN_SUBSCRIBE) | ||
202 | trace_virtio_scsi_an_resp(virtio_scsi_get_lun(req->req.an.lun), | ||
203 | req->resp.an.response); | ||
204 | - virtio_scsi_complete_req(req); | ||
205 | + virtio_scsi_complete_req(req, &s->ctrl_lock); | ||
206 | } else { | ||
207 | assert(r == -EINPROGRESS); | ||
208 | } | ||
209 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq) | ||
210 | { | ||
211 | VirtIOSCSIReq *req; | ||
212 | |||
213 | - while ((req = virtio_scsi_pop_req(s, vq))) { | ||
214 | + while ((req = virtio_scsi_pop_req(s, vq, &s->ctrl_lock))) { | ||
215 | virtio_scsi_handle_ctrl_req(s, req); | ||
216 | } | ||
217 | } | ||
218 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_complete_cmd_req(VirtIOSCSIReq *req) | ||
219 | * in virtio_scsi_command_complete. | ||
220 | */ | ||
221 | req->resp_size = sizeof(VirtIOSCSICmdResp); | ||
222 | - virtio_scsi_complete_req(req); | ||
223 | + virtio_scsi_complete_req(req, NULL); | ||
224 | } | ||
225 | |||
226 | static void virtio_scsi_command_failed(SCSIRequest *r) | ||
227 | @@ -XXX,XX +XXX,XX @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req) | ||
228 | virtio_scsi_fail_cmd_req(req); | ||
229 | return -ENOTSUP; | ||
230 | } else { | ||
231 | - virtio_scsi_bad_req(req); | ||
232 | + virtio_scsi_bad_req(req, NULL); | ||
233 | return -EINVAL; | ||
138 | } | 234 | } |
139 | @@ -XXX,XX +XXX,XX @@ static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer, | 235 | } |
140 | /* Write the header */ | 236 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) |
141 | offset = 512; | 237 | virtio_queue_set_notification(vq, 0); |
142 | 238 | } | |
143 | - ret = blk_pwrite(blk, offset, sizeof(dyndisk_header), &dyndisk_header, 0); | 239 | |
144 | + ret = blk_co_pwrite(blk, offset, sizeof(dyndisk_header), &dyndisk_header, 0); | 240 | - while ((req = virtio_scsi_pop_req(s, vq))) { |
145 | if (ret < 0) { | 241 | + while ((req = virtio_scsi_pop_req(s, vq, NULL))) { |
146 | goto fail; | 242 | ret = virtio_scsi_handle_cmd_req_prepare(s, req); |
147 | } | 243 | if (!ret) { |
148 | @@ -XXX,XX +XXX,XX @@ static int create_dynamic_disk(BlockBackend *blk, VHDFooter *footer, | 244 | QTAILQ_INSERT_TAIL(&reqs, req, next); |
149 | return ret; | 245 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_push_event(VirtIOSCSI *s, |
150 | } | 246 | return; |
151 | 247 | } | |
152 | -static int create_fixed_disk(BlockBackend *blk, VHDFooter *footer, | 248 | |
153 | - int64_t total_size, Error **errp) | 249 | - req = virtio_scsi_pop_req(s, vs->event_vq); |
154 | +static int coroutine_fn create_fixed_disk(BlockBackend *blk, VHDFooter *footer, | 250 | + req = virtio_scsi_pop_req(s, vs->event_vq, &s->event_lock); |
155 | + int64_t total_size, Error **errp) | 251 | if (!req) { |
156 | { | 252 | s->events_dropped = true; |
157 | int ret; | 253 | return; |
158 | 254 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_push_event(VirtIOSCSI *s, | |
159 | /* Add footer to total size */ | 255 | } |
160 | total_size += sizeof(*footer); | 256 | |
161 | 257 | if (virtio_scsi_parse_req(req, 0, sizeof(VirtIOSCSIEvent))) { | |
162 | - ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp); | 258 | - virtio_scsi_bad_req(req); |
163 | + ret = blk_co_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp); | 259 | + virtio_scsi_bad_req(req, &s->event_lock); |
164 | if (ret < 0) { | 260 | return; |
165 | return ret; | 261 | } |
166 | } | 262 | |
167 | 263 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_push_event(VirtIOSCSI *s, | |
168 | - ret = blk_pwrite(blk, total_size - sizeof(*footer), sizeof(*footer), | 264 | } |
169 | - footer, 0); | 265 | trace_virtio_scsi_event(virtio_scsi_get_lun(evt->lun), event, reason); |
170 | + ret = blk_co_pwrite(blk, total_size - sizeof(*footer), sizeof(*footer), | 266 | |
171 | + footer, 0); | 267 | - virtio_scsi_complete_req(req); |
172 | if (ret < 0) { | 268 | + virtio_scsi_complete_req(req, &s->event_lock); |
173 | error_setg_errno(errp, -ret, "Unable to write VHD header"); | 269 | } |
174 | return ret; | 270 | |
271 | static void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq) | ||
272 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_device_realize(DeviceState *dev, Error **errp) | ||
273 | Error *err = NULL; | ||
274 | |||
275 | QTAILQ_INIT(&s->tmf_bh_list); | ||
276 | + qemu_mutex_init(&s->ctrl_lock); | ||
277 | + qemu_mutex_init(&s->event_lock); | ||
278 | qemu_mutex_init(&s->tmf_bh_lock); | ||
279 | |||
280 | virtio_scsi_common_realize(dev, | ||
281 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_device_unrealize(DeviceState *dev) | ||
282 | qbus_set_hotplug_handler(BUS(&s->bus), NULL); | ||
283 | virtio_scsi_common_unrealize(dev); | ||
284 | qemu_mutex_destroy(&s->tmf_bh_lock); | ||
285 | + qemu_mutex_destroy(&s->event_lock); | ||
286 | + qemu_mutex_destroy(&s->ctrl_lock); | ||
287 | } | ||
288 | |||
289 | static const Property virtio_scsi_properties[] = { | ||
175 | -- | 290 | -- |
176 | 2.41.0 | 291 | 2.48.1
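The completion helpers in the patch above thread an optional QemuMutex through their signatures: callers that share a virtqueue across threads pass its lock, while the single-threaded cmd path passes NULL and pays no locking cost. A minimal sketch of that optional-lock pattern, assuming only QEMU's mutex API (the Ring type and ring_push() are illustrative placeholders, not virtio-scsi code):

    #include "qemu/osdep.h"
    #include "qemu/thread.h"

    /* Illustrative stand-in for a virtqueue. */
    typedef struct Ring {
        unsigned head;
    } Ring;

    /*
     * Push an element, optionally under @lock. Callers that are the only
     * thread touching @ring pass NULL, mirroring how the cmd virtqueue
     * path above passes NULL to virtio_scsi_complete_req().
     */
    static void ring_push(Ring *ring, QemuMutex *lock, unsigned elem)
    {
        if (lock) {
            qemu_mutex_lock(lock);
        }
        ring->head = elem;          /* stand-in for virtqueue_push() */
        if (lock) {
            qemu_mutex_unlock(lock);
        }
    }

As in the realize/unrealize hunks above, such a lock is created with qemu_mutex_init() and torn down with qemu_mutex_destroy() in the reverse order.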
1 | bdrv_set_file_or_backing_noperm() requires the caller to hold the | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | AioContext lock for the child node, but we hold the one for the parent | ||
3 | node in bdrv_reopen_parse_file_or_backing(). Take the other one | ||
4 | temporarily. | ||
5 | 2 | ||
6 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 3 | The block layer can invoke the resize callback from any AioContext that |
7 | Message-ID: <20230605085711.21261-7-kwolf@redhat.com> | 4 | is processing requests. The virtqueue is already protected but the |
8 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 5 | events_dropped field also needs to be protected against races. Cover it |
6 | using the event virtqueue lock because it is closely associated with | ||
7 | accesses to the virtqueue. | ||
8 | |||
9 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
10 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | ||
11 | Message-ID: <20250311132616.1049687-7-stefanha@redhat.com> | ||
9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 12 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
10 | --- | 13 | --- |
11 | block.c | 35 +++++++++++++++++++++++++++++++++-- | 14 | include/hw/virtio/virtio-scsi.h | 3 ++- |
12 | 1 file changed, 33 insertions(+), 2 deletions(-) | 15 | hw/scsi/virtio-scsi.c | 29 ++++++++++++++++++++--------- |
16 | 2 files changed, 22 insertions(+), 10 deletions(-) | ||
13 | 17 | ||
14 | diff --git a/block.c b/block.c | 18 | diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h |
15 | index XXXXXXX..XXXXXXX 100644 | 19 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/block.c | 20 | --- a/include/hw/virtio/virtio-scsi.h |
17 | +++ b/block.c | 21 | +++ b/include/hw/virtio/virtio-scsi.h |
18 | @@ -XXX,XX +XXX,XX @@ static BdrvChildRole bdrv_backing_role(BlockDriverState *bs) | 22 | @@ -XXX,XX +XXX,XX @@ struct VirtIOSCSI { |
19 | * callers which don't need their own reference any more must call bdrv_unref(). | 23 | |
20 | * | 24 | SCSIBus bus; |
21 | * Function doesn't update permissions, caller is responsible for this. | 25 | int resetting; /* written from main loop thread, read from any thread */ |
22 | + * | 26 | + |
23 | + * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and | 27 | + QemuMutex event_lock; /* protects event_vq and events_dropped */ |
24 | + * @child_bs can move to a different AioContext in this function. Callers must | 28 | bool events_dropped; |
25 | + * make sure that their AioContext locking is still correct after this. | 29 | |
26 | */ | 30 | QemuMutex ctrl_lock; /* protects ctrl_vq */ |
27 | static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs, | 31 | - QemuMutex event_lock; /* protects event_vq */ |
28 | BlockDriverState *child_bs, | 32 | |
29 | @@ -XXX,XX +XXX,XX @@ out: | 33 | /* |
30 | return 0; | 34 | * TMFs deferred to main loop BH. These fields are protected by |
35 | diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c | ||
36 | index XXXXXXX..XXXXXXX 100644 | ||
37 | --- a/hw/scsi/virtio-scsi.c | ||
38 | +++ b/hw/scsi/virtio-scsi.c | ||
39 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_reset(VirtIODevice *vdev) | ||
40 | |||
41 | vs->sense_size = VIRTIO_SCSI_SENSE_DEFAULT_SIZE; | ||
42 | vs->cdb_size = VIRTIO_SCSI_CDB_DEFAULT_SIZE; | ||
43 | - s->events_dropped = false; | ||
44 | + | ||
45 | + WITH_QEMU_LOCK_GUARD(&s->event_lock) { | ||
46 | + s->events_dropped = false; | ||
47 | + } | ||
31 | } | 48 | } |
32 | 49 | ||
33 | +/* | 50 | typedef struct { |
34 | + * The caller must hold the AioContext lock for @backing_hd. Both @bs and | 51 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_push_event(VirtIOSCSI *s, |
35 | + * @backing_hd can move to a different AioContext in this function. Callers must | ||
36 | + * make sure that their AioContext locking is still correct after this. | ||
37 | + */ | ||
38 | static int bdrv_set_backing_noperm(BlockDriverState *bs, | ||
39 | BlockDriverState *backing_hd, | ||
40 | Transaction *tran, Error **errp) | ||
41 | @@ -XXX,XX +XXX,XX @@ int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, | ||
42 | * backing BlockDriverState (or NULL). | ||
43 | * | ||
44 | * Return 0 on success, otherwise return < 0 and set @errp. | ||
45 | + * | ||
46 | + * The caller must hold the AioContext lock of @reopen_state->bs. | ||
47 | + * @reopen_state->bs can move to a different AioContext in this function. | ||
48 | + * Callers must make sure that their AioContext locking is still correct after | ||
49 | + * this. | ||
50 | */ | ||
51 | static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state, | ||
52 | bool is_backing, Transaction *tran, | ||
53 | @@ -XXX,XX +XXX,XX @@ static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state, | ||
54 | const char *child_name = is_backing ? "backing" : "file"; | ||
55 | QObject *value; | ||
56 | const char *str; | ||
57 | + AioContext *ctx, *old_ctx; | ||
58 | + int ret; | ||
59 | |||
60 | GLOBAL_STATE_CODE(); | ||
61 | |||
62 | @@ -XXX,XX +XXX,XX @@ static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state, | ||
63 | reopen_state->old_file_bs = old_child_bs; | ||
64 | } | 52 | } |
65 | 53 | ||
66 | - return bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing, | 54 | req = virtio_scsi_pop_req(s, vs->event_vq, &s->event_lock); |
67 | - tran, errp); | 55 | - if (!req) { |
68 | + old_ctx = bdrv_get_aio_context(bs); | 56 | - s->events_dropped = true; |
69 | + ctx = bdrv_get_aio_context(new_child_bs); | 57 | - return; |
70 | + if (old_ctx != ctx) { | 58 | - } |
71 | + aio_context_release(old_ctx); | 59 | + WITH_QEMU_LOCK_GUARD(&s->event_lock) { |
72 | + aio_context_acquire(ctx); | 60 | + if (!req) { |
61 | + s->events_dropped = true; | ||
62 | + return; | ||
63 | + } | ||
64 | |||
65 | - if (s->events_dropped) { | ||
66 | - event |= VIRTIO_SCSI_T_EVENTS_MISSED; | ||
67 | - s->events_dropped = false; | ||
68 | + if (s->events_dropped) { | ||
69 | + event |= VIRTIO_SCSI_T_EVENTS_MISSED; | ||
70 | + s->events_dropped = false; | ||
71 | + } | ||
72 | } | ||
73 | |||
74 | if (virtio_scsi_parse_req(req, 0, sizeof(VirtIOSCSIEvent))) { | ||
75 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_push_event(VirtIOSCSI *s, | ||
76 | |||
77 | static void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq) | ||
78 | { | ||
79 | - if (s->events_dropped) { | ||
80 | + bool events_dropped; | ||
81 | + | ||
82 | + WITH_QEMU_LOCK_GUARD(&s->event_lock) { | ||
83 | + events_dropped = s->events_dropped; | ||
73 | + } | 84 | + } |
74 | + | 85 | + |
75 | + ret = bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing, | 86 | + if (events_dropped) { |
76 | + tran, errp); | 87 | VirtIOSCSIEventInfo info = { |
77 | + | 88 | .event = VIRTIO_SCSI_T_NO_EVENT, |
78 | + if (old_ctx != ctx) { | 89 | }; |
79 | + aio_context_release(ctx); | ||
80 | + aio_context_acquire(old_ctx); | ||
81 | + } | ||
82 | + | ||
83 | + return ret; | ||
84 | } | ||
85 | |||
86 | /* | ||
87 | @@ -XXX,XX +XXX,XX @@ static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state, | ||
88 | * It is the responsibility of the caller to then call the abort() or | ||
89 | * commit() for any other BDS that have been left in a prepare() state | ||
90 | * | ||
91 | + * The caller must hold the AioContext lock of @reopen_state->bs. | ||
92 | */ | ||
93 | static int bdrv_reopen_prepare(BDRVReopenState *reopen_state, | ||
94 | BlockReopenQueue *queue, | ||
95 | -- | 90 | -- |
96 | 2.41.0 | 91 | 2.48.1
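WITH_QEMU_LOCK_GUARD() scopes the critical section to a block and releases the mutex on every exit path, including early returns. A sketch of the snapshot-then-act pattern the patch applies to events_dropped, assuming only qemu/lockable.h (the State type and field names are illustrative):

    #include "qemu/osdep.h"
    #include "qemu/lockable.h"

    typedef struct {
        QemuMutex lock;
        bool dropped;
    } State;

    /* Read and clear the flag under the lock; act on the copy outside. */
    static bool state_test_and_clear(State *s)
    {
        bool dropped;

        WITH_QEMU_LOCK_GUARD(&s->lock) {
            dropped = s->dropped;
            s->dropped = false;
        }
        return dropped;
    }

Copying the flag out keeps the critical section short; the caller decides outside the lock whether to push a VIRTIO_SCSI_T_EVENTS_MISSED event, much as virtio_scsi_handle_event_vq() does above.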
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Mark functions as coroutine_fn when they are only called by other coroutine_fns | 3 | With IOThread Virtqueue Mapping there will be multiple AioContexts |
4 | and they can suspend. Change calls to co_wrappers to use the non-wrapped | 4 | processing SCSI requests. scsi_req_cancel() and other SCSI request |
5 | functions, which in turn requires adding GRAPH_RDLOCK annotations. | 5 | operations must be performed from the AioContext where the request is |
6 | 6 | running. | |
7 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | 7 | |
8 | Message-ID: <20230601115145.196465-7-pbonzini@redhat.com> | 8 | Introduce a virtio_scsi_defer_tmf_to_aio_context() function and the |
9 | necessary VirtIOSCSIReq->remaining refcount infrastructure to move the | ||
10 | TMF code into the AioContext where the request is running. | ||
11 | |||
12 | For the time being there is still just one AioContext: the main loop or | ||
13 | the IOThread. When the iothread-vq-mapping parameter is added in a later | ||
14 | patch this will be changed to per-virtqueue AioContexts. | ||
15 | |||
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 17 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> |
18 | Message-ID: <20250311132616.1049687-8-stefanha@redhat.com> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 19 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 20 | --- |
12 | block/cloop.c | 9 +++++---- | 21 | hw/scsi/virtio-scsi.c | 270 ++++++++++++++++++++++++++++++++---------- |
13 | 1 file changed, 5 insertions(+), 4 deletions(-) | 22 | 1 file changed, 206 insertions(+), 64 deletions(-) |
14 | 23 | ||
15 | diff --git a/block/cloop.c b/block/cloop.c | 24 | diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c |
16 | index XXXXXXX..XXXXXXX 100644 | 25 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/cloop.c | 26 | --- a/hw/scsi/virtio-scsi.c |
18 | +++ b/block/cloop.c | 27 | +++ b/hw/scsi/virtio-scsi.c |
19 | @@ -XXX,XX +XXX,XX @@ static void cloop_refresh_limits(BlockDriverState *bs, Error **errp) | 28 | @@ -XXX,XX +XXX,XX @@ typedef struct VirtIOSCSIReq { |
20 | bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */ | 29 | /* Used for two-stage request submission and TMFs deferred to BH */ |
30 | QTAILQ_ENTRY(VirtIOSCSIReq) next; | ||
31 | |||
32 | - /* Used for cancellation of request during TMFs */ | ||
33 | + /* Used for cancellation of request during TMFs. Atomic. */ | ||
34 | int remaining; | ||
35 | |||
36 | SCSIRequest *sreq; | ||
37 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
38 | VirtIOSCSIReq *tmf_req; | ||
39 | } VirtIOSCSICancelNotifier; | ||
40 | |||
41 | +static void virtio_scsi_tmf_dec_remaining(VirtIOSCSIReq *tmf) | ||
42 | +{ | ||
43 | + if (qatomic_fetch_dec(&tmf->remaining) == 1) { | ||
44 | + trace_virtio_scsi_tmf_resp(virtio_scsi_get_lun(tmf->req.tmf.lun), | ||
45 | + tmf->req.tmf.tag, tmf->resp.tmf.response); | ||
46 | + | ||
47 | + virtio_scsi_complete_req(tmf, &tmf->dev->ctrl_lock); | ||
48 | + } | ||
49 | +} | ||
50 | + | ||
51 | static void virtio_scsi_cancel_notify(Notifier *notifier, void *data) | ||
52 | { | ||
53 | VirtIOSCSICancelNotifier *n = container_of(notifier, | ||
54 | VirtIOSCSICancelNotifier, | ||
55 | notifier); | ||
56 | |||
57 | - if (--n->tmf_req->remaining == 0) { | ||
58 | - VirtIOSCSIReq *req = n->tmf_req; | ||
59 | - | ||
60 | - trace_virtio_scsi_tmf_resp(virtio_scsi_get_lun(req->req.tmf.lun), | ||
61 | - req->req.tmf.tag, req->resp.tmf.response); | ||
62 | - virtio_scsi_complete_req(req, &req->dev->ctrl_lock); | ||
63 | - } | ||
64 | + virtio_scsi_tmf_dec_remaining(n->tmf_req); | ||
65 | g_free(n); | ||
21 | } | 66 | } |
22 | 67 | ||
23 | -static inline int cloop_read_block(BlockDriverState *bs, int block_num) | 68 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_reset_tmf_bh(VirtIOSCSI *s) |
24 | +static int coroutine_fn GRAPH_RDLOCK | 69 | } |
25 | +cloop_read_block(BlockDriverState *bs, int block_num) | 70 | } |
71 | |||
72 | -static void virtio_scsi_defer_tmf_to_bh(VirtIOSCSIReq *req) | ||
73 | +static void virtio_scsi_defer_tmf_to_main_loop(VirtIOSCSIReq *req) | ||
26 | { | 74 | { |
27 | BDRVCloopState *s = bs->opaque; | 75 | VirtIOSCSI *s = req->dev; |
28 | 76 | ||
29 | @@ -XXX,XX +XXX,XX @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num) | 77 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_defer_tmf_to_bh(VirtIOSCSIReq *req) |
30 | int ret; | 78 | } |
31 | uint32_t bytes = s->offsets[block_num + 1] - s->offsets[block_num]; | ||
32 | |||
33 | - ret = bdrv_pread(bs->file, s->offsets[block_num], bytes, | ||
34 | - s->compressed_block, 0); | ||
35 | + ret = bdrv_co_pread(bs->file, s->offsets[block_num], bytes, | ||
36 | + s->compressed_block, 0); | ||
37 | if (ret < 0) { | ||
38 | return -1; | ||
39 | } | ||
40 | @@ -XXX,XX +XXX,XX @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num) | ||
41 | return 0; | ||
42 | } | 79 | } |
43 | 80 | ||
44 | -static int coroutine_fn | 81 | +static void virtio_scsi_tmf_cancel_req(VirtIOSCSIReq *tmf, SCSIRequest *r) |
45 | +static int coroutine_fn GRAPH_RDLOCK | 82 | +{ |
46 | cloop_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, | 83 | + VirtIOSCSICancelNotifier *notifier; |
47 | QEMUIOVector *qiov, BdrvRequestFlags flags) | 84 | + |
85 | + assert(r->ctx == qemu_get_current_aio_context()); | ||
86 | + | ||
87 | + /* Decremented in virtio_scsi_cancel_notify() */ | ||
88 | + qatomic_inc(&tmf->remaining); | ||
89 | + | ||
90 | + notifier = g_new(VirtIOSCSICancelNotifier, 1); | ||
91 | + notifier->notifier.notify = virtio_scsi_cancel_notify; | ||
92 | + notifier->tmf_req = tmf; | ||
93 | + scsi_req_cancel_async(r, ¬ifier->notifier); | ||
94 | +} | ||
95 | + | ||
96 | +/* Execute a TMF on the requests in the current AioContext */ | ||
97 | +static void virtio_scsi_do_tmf_aio_context(void *opaque) | ||
98 | +{ | ||
99 | + AioContext *ctx = qemu_get_current_aio_context(); | ||
100 | + VirtIOSCSIReq *tmf = opaque; | ||
101 | + VirtIOSCSI *s = tmf->dev; | ||
102 | + SCSIDevice *d = virtio_scsi_device_get(s, tmf->req.tmf.lun); | ||
103 | + SCSIRequest *r; | ||
104 | + bool match_tag; | ||
105 | + | ||
106 | + if (!d) { | ||
107 | + tmf->resp.tmf.response = VIRTIO_SCSI_S_BAD_TARGET; | ||
108 | + virtio_scsi_tmf_dec_remaining(tmf); | ||
109 | + return; | ||
110 | + } | ||
111 | + | ||
112 | + /* | ||
113 | + * This function could handle other subtypes that need to be processed in | ||
114 | + * the request's AioContext in the future, but for now only request | ||
115 | + * cancelation subtypes are performed here. | ||
116 | + */ | ||
117 | + switch (tmf->req.tmf.subtype) { | ||
118 | + case VIRTIO_SCSI_T_TMF_ABORT_TASK: | ||
119 | + match_tag = true; | ||
120 | + break; | ||
121 | + case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: | ||
122 | + case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: | ||
123 | + match_tag = false; | ||
124 | + break; | ||
125 | + default: | ||
126 | + g_assert_not_reached(); | ||
127 | + } | ||
128 | + | ||
129 | + WITH_QEMU_LOCK_GUARD(&d->requests_lock) { | ||
130 | + QTAILQ_FOREACH(r, &d->requests, next) { | ||
131 | + VirtIOSCSIReq *cmd_req = r->hba_private; | ||
132 | + assert(cmd_req); /* request has hba_private while enqueued */ | ||
133 | + | ||
134 | + if (r->ctx != ctx) { | ||
135 | + continue; | ||
136 | + } | ||
137 | + if (match_tag && cmd_req->req.cmd.tag != tmf->req.tmf.tag) { | ||
138 | + continue; | ||
139 | + } | ||
140 | + virtio_scsi_tmf_cancel_req(tmf, r); | ||
141 | + } | ||
142 | + } | ||
143 | + | ||
144 | + /* Incremented by virtio_scsi_do_tmf() */ | ||
145 | + virtio_scsi_tmf_dec_remaining(tmf); | ||
146 | + | ||
147 | + object_unref(d); | ||
148 | +} | ||
149 | + | ||
150 | +static void dummy_bh(void *opaque) | ||
151 | +{ | ||
152 | + /* Do nothing */ | ||
153 | +} | ||
154 | + | ||
155 | +/* | ||
156 | + * Wait for pending virtio_scsi_defer_tmf_to_aio_context() BHs. | ||
157 | + */ | ||
158 | +static void virtio_scsi_flush_defer_tmf_to_aio_context(VirtIOSCSI *s) | ||
159 | +{ | ||
160 | + GLOBAL_STATE_CODE(); | ||
161 | + | ||
162 | + assert(!s->dataplane_started); | ||
163 | + | ||
164 | + if (s->ctx) { | ||
165 | + /* Our BH only runs after previously scheduled BHs */ | ||
166 | + aio_wait_bh_oneshot(s->ctx, dummy_bh, NULL); | ||
167 | + } | ||
168 | +} | ||
169 | + | ||
170 | +/* | ||
171 | + * Run the TMF in a specific AioContext, handling only requests in that | ||
172 | + * AioContext. This is necessary because requests can run in different | ||
173 | + * AioContext and it is only possible to cancel them from the AioContext where | ||
174 | + * they are running. | ||
175 | + */ | ||
176 | +static void virtio_scsi_defer_tmf_to_aio_context(VirtIOSCSIReq *tmf, | ||
177 | + AioContext *ctx) | ||
178 | +{ | ||
179 | + /* Decremented in virtio_scsi_do_tmf_aio_context() */ | ||
180 | + qatomic_inc(&tmf->remaining); | ||
181 | + | ||
182 | + /* See virtio_scsi_flush_defer_tmf_to_aio_context() cleanup during reset */ | ||
183 | + aio_bh_schedule_oneshot(ctx, virtio_scsi_do_tmf_aio_context, tmf); | ||
184 | +} | ||
185 | + | ||
186 | +/* | ||
187 | + * Returns the AioContext for a given TMF's tag field or NULL. Note that the | ||
188 | + * request identified by the tag may have completed by the time you can execute | ||
189 | + * a BH in the AioContext, so don't assume the request still exists in your BH. | ||
190 | + */ | ||
191 | +static AioContext *find_aio_context_for_tmf_tag(SCSIDevice *d, | ||
192 | + VirtIOSCSIReq *tmf) | ||
193 | +{ | ||
194 | + WITH_QEMU_LOCK_GUARD(&d->requests_lock) { | ||
195 | + SCSIRequest *r; | ||
196 | + SCSIRequest *next; | ||
197 | + | ||
198 | + QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) { | ||
199 | + VirtIOSCSIReq *cmd_req = r->hba_private; | ||
200 | + | ||
201 | + /* hba_private is non-NULL while the request is enqueued */ | ||
202 | + assert(cmd_req); | ||
203 | + | ||
204 | + if (cmd_req->req.cmd.tag == tmf->req.tmf.tag) { | ||
205 | + return r->ctx; | ||
206 | + } | ||
207 | + } | ||
208 | + } | ||
209 | + return NULL; | ||
210 | +} | ||
211 | + | ||
212 | /* Return 0 if the request is ready to be completed and return to guest; | ||
213 | * -EINPROGRESS if the request is submitted and will be completed later, in the | ||
214 | * case of async cancellation. */ | ||
215 | @@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req) | ||
48 | { | 216 | { |
217 | SCSIDevice *d = virtio_scsi_device_get(s, req->req.tmf.lun); | ||
218 | SCSIRequest *r, *next; | ||
219 | + AioContext *ctx; | ||
220 | int ret = 0; | ||
221 | |||
222 | virtio_scsi_ctx_check(s, d); | ||
223 | @@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req) | ||
224 | req->req.tmf.tag, req->req.tmf.subtype); | ||
225 | |||
226 | switch (req->req.tmf.subtype) { | ||
227 | - case VIRTIO_SCSI_T_TMF_ABORT_TASK: | ||
228 | - case VIRTIO_SCSI_T_TMF_QUERY_TASK: | ||
229 | + case VIRTIO_SCSI_T_TMF_ABORT_TASK: { | ||
230 | if (!d) { | ||
231 | goto fail; | ||
232 | } | ||
233 | if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) { | ||
234 | goto incorrect_lun; | ||
235 | } | ||
236 | - QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) { | ||
237 | - VirtIOSCSIReq *cmd_req = r->hba_private; | ||
238 | - if (cmd_req && cmd_req->req.cmd.tag == req->req.tmf.tag) { | ||
239 | - break; | ||
240 | - } | ||
241 | + | ||
242 | + ctx = find_aio_context_for_tmf_tag(d, req); | ||
243 | + if (ctx) { | ||
244 | + virtio_scsi_defer_tmf_to_aio_context(req, ctx); | ||
245 | + ret = -EINPROGRESS; | ||
246 | } | ||
247 | - if (r) { | ||
248 | - /* | ||
249 | - * Assert that the request has not been completed yet, we | ||
250 | - * check for it in the loop above. | ||
251 | - */ | ||
252 | - assert(r->hba_private); | ||
253 | - if (req->req.tmf.subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK) { | ||
254 | - /* "If the specified command is present in the task set, then | ||
255 | - * return a service response set to FUNCTION SUCCEEDED". | ||
256 | - */ | ||
257 | - req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED; | ||
258 | - } else { | ||
259 | - VirtIOSCSICancelNotifier *notifier; | ||
260 | - | ||
261 | - req->remaining = 1; | ||
262 | - notifier = g_new(VirtIOSCSICancelNotifier, 1); | ||
263 | - notifier->tmf_req = req; | ||
264 | - notifier->notifier.notify = virtio_scsi_cancel_notify; | ||
265 | - scsi_req_cancel_async(r, ¬ifier->notifier); | ||
266 | - ret = -EINPROGRESS; | ||
267 | + break; | ||
268 | + } | ||
269 | + | ||
270 | + case VIRTIO_SCSI_T_TMF_QUERY_TASK: | ||
271 | + if (!d) { | ||
272 | + goto fail; | ||
273 | + } | ||
274 | + if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) { | ||
275 | + goto incorrect_lun; | ||
276 | + } | ||
277 | + | ||
278 | + WITH_QEMU_LOCK_GUARD(&d->requests_lock) { | ||
279 | + QTAILQ_FOREACH(r, &d->requests, next) { | ||
280 | + VirtIOSCSIReq *cmd_req = r->hba_private; | ||
281 | + assert(cmd_req); /* request has hba_private while enqueued */ | ||
282 | + | ||
283 | + if (cmd_req->req.cmd.tag == req->req.tmf.tag) { | ||
284 | + /* | ||
285 | + * "If the specified command is present in the task set, | ||
286 | + * then return a service response set to FUNCTION | ||
287 | + * SUCCEEDED". | ||
288 | + */ | ||
289 | + req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED; | ||
290 | + } | ||
291 | } | ||
292 | } | ||
293 | break; | ||
294 | |||
295 | case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: | ||
296 | case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: | ||
297 | - virtio_scsi_defer_tmf_to_bh(req); | ||
298 | + virtio_scsi_defer_tmf_to_main_loop(req); | ||
299 | ret = -EINPROGRESS; | ||
300 | break; | ||
301 | |||
302 | case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: | ||
303 | - case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: | ||
304 | + case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: { | ||
305 | + if (!d) { | ||
306 | + goto fail; | ||
307 | + } | ||
308 | + if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) { | ||
309 | + goto incorrect_lun; | ||
310 | + } | ||
311 | + | ||
312 | + qatomic_inc(&req->remaining); | ||
313 | + | ||
314 | + ctx = s->ctx ?: qemu_get_aio_context(); | ||
315 | + virtio_scsi_defer_tmf_to_aio_context(req, ctx); | ||
316 | + | ||
317 | + virtio_scsi_tmf_dec_remaining(req); | ||
318 | + ret = -EINPROGRESS; | ||
319 | + break; | ||
320 | + } | ||
321 | + | ||
322 | case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: | ||
323 | if (!d) { | ||
324 | goto fail; | ||
325 | @@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req) | ||
326 | goto incorrect_lun; | ||
327 | } | ||
328 | |||
329 | - /* Add 1 to "remaining" until virtio_scsi_do_tmf returns. | ||
330 | - * This way, if the bus starts calling back to the notifiers | ||
331 | - * even before we finish the loop, virtio_scsi_cancel_notify | ||
332 | - * will not complete the TMF too early. | ||
333 | - */ | ||
334 | - req->remaining = 1; | ||
335 | - QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) { | ||
336 | - if (r->hba_private) { | ||
337 | - if (req->req.tmf.subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK_SET) { | ||
338 | - /* "If there is any command present in the task set, then | ||
339 | - * return a service response set to FUNCTION SUCCEEDED". | ||
340 | - */ | ||
341 | - req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED; | ||
342 | - break; | ||
343 | - } else { | ||
344 | - VirtIOSCSICancelNotifier *notifier; | ||
345 | - | ||
346 | - req->remaining++; | ||
347 | - notifier = g_new(VirtIOSCSICancelNotifier, 1); | ||
348 | - notifier->notifier.notify = virtio_scsi_cancel_notify; | ||
349 | - notifier->tmf_req = req; | ||
350 | - scsi_req_cancel_async(r, ¬ifier->notifier); | ||
351 | - } | ||
352 | + WITH_QEMU_LOCK_GUARD(&d->requests_lock) { | ||
353 | + QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) { | ||
354 | + /* Request has hba_private while enqueued */ | ||
355 | + assert(r->hba_private); | ||
356 | + | ||
357 | + /* | ||
358 | + * "If there is any command present in the task set, then | ||
359 | + * return a service response set to FUNCTION SUCCEEDED". | ||
360 | + */ | ||
361 | + req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED; | ||
362 | + break; | ||
363 | } | ||
364 | } | ||
365 | - if (--req->remaining > 0) { | ||
366 | - ret = -EINPROGRESS; | ||
367 | - } | ||
368 | break; | ||
369 | |||
370 | case VIRTIO_SCSI_T_TMF_CLEAR_ACA: | ||
371 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_reset(VirtIODevice *vdev) | ||
372 | assert(!s->dataplane_started); | ||
373 | |||
374 | virtio_scsi_reset_tmf_bh(s); | ||
375 | + virtio_scsi_flush_defer_tmf_to_aio_context(s); | ||
376 | |||
377 | qatomic_inc(&s->resetting); | ||
378 | bus_cold_reset(BUS(&s->bus)); | ||
49 | -- | 379 | -- |
50 | 2.41.0 | 380 | 2.48.1
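The cancellation machinery above rests on a small atomic reference count: the submitter holds an initial reference, each deferred piece of work takes another, and whichever context drops the count to zero completes the request. A sketch of that pattern with QEMU's qatomic helpers (Tmf and complete_tmf() are hypothetical stand-ins for VirtIOSCSIReq and its completion path):

    #include "qemu/osdep.h"
    #include "qemu/atomic.h"

    typedef struct {
        int remaining;          /* atomic; submitter holds one reference */
    } Tmf;

    /* Stand-in for virtio_scsi_complete_req() in the patch above. */
    static void complete_tmf(Tmf *tmf)
    {
        (void)tmf;
    }

    /* Take a reference before deferring work to another AioContext. */
    static void tmf_ref(Tmf *tmf)
    {
        qatomic_inc(&tmf->remaining);
    }

    /* Drop a reference; whoever drops the last one completes the TMF. */
    static void tmf_unref(Tmf *tmf)
    {
        if (qatomic_fetch_dec(&tmf->remaining) == 1) {
            complete_tmf(tmf);
        }
    }

The extra reference virtio_scsi_do_tmf() takes around the deferral is what keeps early cancellation notifiers from completing the TMF while the request list is still being scanned.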
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Mark functions as coroutine_fn when they are only called by other coroutine_fns | 3 | This is the cleanup function that must be called after |
4 | and they can suspend. Change calls to co_wrappers to use the non-wrapped | 4 | apply_iothread_vq_mapping() succeeds. virtio-scsi will need this |
5 | functions, which in turn requires adding GRAPH_RDLOCK annotations. | 5 | function too, so extract it. |
6 | 6 | ||
7 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | 7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
8 | Message-ID: <20230601115145.196465-3-pbonzini@redhat.com> | ||
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 8 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> |
9 | Message-ID: <20250311132616.1049687-9-stefanha@redhat.com> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 11 | --- |
12 | block/qed-check.c | 5 +++-- | 12 | hw/block/virtio-blk.c | 27 +++++++++++++++++++++------ |
13 | block/qed.c | 7 ++++--- | 13 | 1 file changed, 21 insertions(+), 6 deletions(-) |
14 | 2 files changed, 7 insertions(+), 5 deletions(-) | ||
15 | 14 | ||
16 | diff --git a/block/qed-check.c b/block/qed-check.c | 15 | diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c |
17 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
18 | --- a/block/qed-check.c | 17 | --- a/hw/block/virtio-blk.c |
19 | +++ b/block/qed-check.c | 18 | +++ b/hw/block/virtio-blk.c |
20 | @@ -XXX,XX +XXX,XX @@ static void qed_check_for_leaks(QEDCheck *check) | 19 | @@ -XXX,XX +XXX,XX @@ validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list, |
21 | /** | 20 | * Fill in the AioContext for each virtqueue in the @vq_aio_context array given |
22 | * Mark an image clean once it passes check or has been repaired | 21 | * the iothread-vq-mapping parameter in @iothread_vq_mapping_list. |
23 | */ | 22 | * |
24 | -static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result) | 23 | + * cleanup_iothread_vq_mapping() must be called to free IOThread object |
25 | +static void coroutine_fn GRAPH_RDLOCK | 24 | + * references after this function returns success. |
26 | +qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result) | 25 | + * |
26 | * Returns: %true on success, %false on failure. | ||
27 | **/ | ||
28 | static bool apply_iothread_vq_mapping( | ||
29 | @@ -XXX,XX +XXX,XX @@ static bool apply_iothread_vq_mapping( | ||
30 | return true; | ||
31 | } | ||
32 | |||
33 | +/** | ||
34 | + * cleanup_iothread_vq_mapping: | ||
35 | + * @list: The mapping of virtqueues to IOThreads. | ||
36 | + * | ||
37 | + * Release IOThread object references that were acquired by | ||
38 | + * apply_iothread_vq_mapping(). | ||
39 | + */ | ||
40 | +static void cleanup_iothread_vq_mapping(IOThreadVirtQueueMappingList *list) | ||
41 | +{ | ||
42 | + IOThreadVirtQueueMappingList *node; | ||
43 | + | ||
44 | + for (node = list; node; node = node->next) { | ||
45 | + IOThread *iothread = iothread_by_id(node->value->iothread); | ||
46 | + object_unref(OBJECT(iothread)); | ||
47 | + } | ||
48 | +} | ||
49 | + | ||
50 | /* Context: BQL held */ | ||
51 | static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp) | ||
27 | { | 52 | { |
28 | /* Skip if there were unfixable corruptions or I/O errors */ | 53 | @@ -XXX,XX +XXX,XX @@ static void virtio_blk_vq_aio_context_cleanup(VirtIOBlock *s) |
29 | if (result->corruptions > 0 || result->check_errors > 0) { | 54 | assert(!s->ioeventfd_started); |
30 | @@ -XXX,XX +XXX,XX @@ static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result) | 55 | |
56 | if (conf->iothread_vq_mapping_list) { | ||
57 | - IOThreadVirtQueueMappingList *node; | ||
58 | - | ||
59 | - for (node = conf->iothread_vq_mapping_list; node; node = node->next) { | ||
60 | - IOThread *iothread = iothread_by_id(node->value->iothread); | ||
61 | - object_unref(OBJECT(iothread)); | ||
62 | - } | ||
63 | + cleanup_iothread_vq_mapping(conf->iothread_vq_mapping_list); | ||
31 | } | 64 | } |
32 | 65 | ||
33 | /* Ensure fixes reach storage before clearing check bit */ | 66 | if (conf->iothread) { |
34 | - bdrv_flush(s->bs); | ||
35 | + bdrv_co_flush(s->bs); | ||
36 | |||
37 | s->header.features &= ~QED_F_NEED_CHECK; | ||
38 | qed_write_header_sync(s); | ||
39 | diff --git a/block/qed.c b/block/qed.c | ||
40 | index XXXXXXX..XXXXXXX 100644 | ||
41 | --- a/block/qed.c | ||
42 | +++ b/block/qed.c | ||
43 | @@ -XXX,XX +XXX,XX @@ static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, | ||
44 | * | ||
45 | * The string is NUL-terminated. | ||
46 | */ | ||
47 | -static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n, | ||
48 | - char *buf, size_t buflen) | ||
49 | +static int coroutine_fn GRAPH_RDLOCK | ||
50 | +qed_read_string(BdrvChild *file, uint64_t offset, | ||
51 | + size_t n, char *buf, size_t buflen) | ||
52 | { | ||
53 | int ret; | ||
54 | if (n >= buflen) { | ||
55 | return -EINVAL; | ||
56 | } | ||
57 | - ret = bdrv_pread(file, offset, n, buf, 0); | ||
58 | + ret = bdrv_co_pread(file, offset, n, buf, 0); | ||
59 | if (ret < 0) { | ||
60 | return ret; | ||
61 | } | ||
62 | -- | 67 | -- |
63 | 2.41.0 | 68 | 2.48.1
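Pairing the extraction with its cleanup keeps QEMU's usual reference discipline visible in one place: every IOThread reference acquired when the mapping is applied is dropped by exactly one object_unref() on the way out. A sketch of the shape, with a hypothetical Mapping list standing in for IOThreadVirtQueueMappingList:

    #include "qemu/osdep.h"
    #include "qom/object.h"

    typedef struct Mapping {
        struct Mapping *next;
        Object *iothread;   /* reference taken when the mapping was applied */
    } Mapping;

    /* Release the references acquired by the corresponding apply step. */
    static void mapping_cleanup(Mapping *list)
    {
        for (Mapping *node = list; node; node = node->next) {
            object_unref(node->iothread);
        }
    }

Naming the release path lets the later virtio-scsi reuse be a single call rather than a copied loop.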
1 | bdrv_open_inherit() calls several functions for which it needs to hold | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | the AioContext lock, but currently doesn't. This includes calls in | ||
3 | bdrv_append_temp_snapshot(), for which bdrv_open_inherit() is the only | ||
4 | caller. Fix the locking in these places. | ||
5 | 2 | ||
6 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 3 | Use noun_verb() function naming instead of verb_noun() because the |
7 | Message-ID: <20230605085711.21261-8-kwolf@redhat.com> | 4 | former is the most common naming style for APIs. The next commit will |
8 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 5 | move these functions into a header file so that virtio-scsi can call |
6 | them. | ||
7 | |||
8 | Shorten iothread_vq_mapping_apply()'s iothread_vq_mapping_list argument | ||
9 | to just "list" like in the other functions. | ||
10 | |||
11 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | ||
13 | Message-ID: <20250311132616.1049687-10-stefanha@redhat.com> | ||
9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 14 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
10 | --- | 15 | --- |
11 | block.c | 25 ++++++++++++++++++++++++- | 16 | hw/block/virtio-blk.c | 33 ++++++++++++++++----------------- |
12 | 1 file changed, 24 insertions(+), 1 deletion(-) | 17 | 1 file changed, 16 insertions(+), 17 deletions(-) |
13 | 18 | ||
14 | diff --git a/block.c b/block.c | 19 | diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c |
15 | index XXXXXXX..XXXXXXX 100644 | 20 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/block.c | 21 | --- a/hw/block/virtio-blk.c |
17 | +++ b/block.c | 22 | +++ b/hw/block/virtio-blk.c |
18 | @@ -XXX,XX +XXX,XX @@ static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs, | 23 | @@ -XXX,XX +XXX,XX @@ static const BlockDevOps virtio_block_ops = { |
19 | int64_t total_size; | 24 | }; |
20 | QemuOpts *opts = NULL; | 25 | |
21 | BlockDriverState *bs_snapshot = NULL; | 26 | static bool |
22 | + AioContext *ctx = bdrv_get_aio_context(bs); | 27 | -validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list, |
23 | int ret; | 28 | - uint16_t num_queues, Error **errp) |
24 | 29 | +iothread_vq_mapping_validate(IOThreadVirtQueueMappingList *list, uint16_t | |
25 | GLOBAL_STATE_CODE(); | 30 | + num_queues, Error **errp) |
26 | @@ -XXX,XX +XXX,XX @@ static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs, | 31 | { |
27 | instead of opening 'filename' directly */ | 32 | g_autofree unsigned long *vqs = bitmap_new(num_queues); |
28 | 33 | g_autoptr(GHashTable) iothreads = | |
29 | /* Get the required size from the image */ | 34 | @@ -XXX,XX +XXX,XX @@ validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list, |
30 | + aio_context_acquire(ctx); | 35 | } |
31 | total_size = bdrv_getlength(bs); | 36 | |
32 | + aio_context_release(ctx); | 37 | /** |
33 | + | 38 | - * apply_iothread_vq_mapping: |
34 | if (total_size < 0) { | 39 | - * @iothread_vq_mapping_list: The mapping of virtqueues to IOThreads. |
35 | error_setg_errno(errp, -total_size, "Could not get image size"); | 40 | + * iothread_vq_mapping_apply: |
36 | goto out; | 41 | + * @list: The mapping of virtqueues to IOThreads. |
37 | @@ -XXX,XX +XXX,XX @@ static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs, | 42 | * @vq_aio_context: The array of AioContext pointers to fill in. |
38 | goto out; | 43 | * @num_queues: The length of @vq_aio_context. |
44 | * @errp: If an error occurs, a pointer to the area to store the error. | ||
45 | * | ||
46 | * Fill in the AioContext for each virtqueue in the @vq_aio_context array given | ||
47 | - * the iothread-vq-mapping parameter in @iothread_vq_mapping_list. | ||
48 | + * the iothread-vq-mapping parameter in @list. | ||
49 | * | ||
50 | - * cleanup_iothread_vq_mapping() must be called to free IOThread object | ||
51 | + * iothread_vq_mapping_cleanup() must be called to free IOThread object | ||
52 | * references after this function returns success. | ||
53 | * | ||
54 | * Returns: %true on success, %false on failure. | ||
55 | **/ | ||
56 | -static bool apply_iothread_vq_mapping( | ||
57 | - IOThreadVirtQueueMappingList *iothread_vq_mapping_list, | ||
58 | +static bool iothread_vq_mapping_apply( | ||
59 | + IOThreadVirtQueueMappingList *list, | ||
60 | AioContext **vq_aio_context, | ||
61 | uint16_t num_queues, | ||
62 | Error **errp) | ||
63 | @@ -XXX,XX +XXX,XX @@ static bool apply_iothread_vq_mapping( | ||
64 | size_t num_iothreads = 0; | ||
65 | size_t cur_iothread = 0; | ||
66 | |||
67 | - if (!validate_iothread_vq_mapping_list(iothread_vq_mapping_list, | ||
68 | - num_queues, errp)) { | ||
69 | + if (!iothread_vq_mapping_validate(list, num_queues, errp)) { | ||
70 | return false; | ||
39 | } | 71 | } |
40 | 72 | ||
41 | + aio_context_acquire(ctx); | 73 | - for (node = iothread_vq_mapping_list; node; node = node->next) { |
42 | ret = bdrv_append(bs_snapshot, bs, errp); | 74 | + for (node = list; node; node = node->next) { |
43 | + aio_context_release(ctx); | 75 | num_iothreads++; |
44 | + | ||
45 | if (ret < 0) { | ||
46 | bs_snapshot = NULL; | ||
47 | goto out; | ||
48 | @@ -XXX,XX +XXX,XX @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, | ||
49 | Error *local_err = NULL; | ||
50 | QDict *snapshot_options = NULL; | ||
51 | int snapshot_flags = 0; | ||
52 | + AioContext *ctx = qemu_get_aio_context(); | ||
53 | |||
54 | assert(!child_class || !flags); | ||
55 | assert(!child_class == !parent); | ||
56 | @@ -XXX,XX +XXX,XX @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, | ||
57 | /* Not requesting BLK_PERM_CONSISTENT_READ because we're only | ||
58 | * looking at the header to guess the image format. This works even | ||
59 | * in cases where a guest would not see a consistent state. */ | ||
60 | - file = blk_new(bdrv_get_aio_context(file_bs), 0, BLK_PERM_ALL); | ||
61 | + ctx = bdrv_get_aio_context(file_bs); | ||
62 | + aio_context_acquire(ctx); | ||
63 | + file = blk_new(ctx, 0, BLK_PERM_ALL); | ||
64 | blk_insert_bs(file, file_bs, &local_err); | ||
65 | bdrv_unref(file_bs); | ||
66 | + aio_context_release(ctx); | ||
67 | + | ||
68 | if (local_err) { | ||
69 | goto fail; | ||
70 | } | ||
71 | @@ -XXX,XX +XXX,XX @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, | ||
72 | goto fail; | ||
73 | } | 76 | } |
74 | 77 | ||
75 | + /* The AioContext could have changed during bdrv_open_common() */ | 78 | - for (node = iothread_vq_mapping_list; node; node = node->next) { |
76 | + ctx = bdrv_get_aio_context(bs); | 79 | + for (node = list; node; node = node->next) { |
77 | + | 80 | IOThread *iothread = iothread_by_id(node->value->iothread); |
78 | if (file) { | 81 | AioContext *ctx = iothread_get_aio_context(iothread); |
79 | + aio_context_acquire(ctx); | 82 | |
80 | blk_unref(file); | 83 | @@ -XXX,XX +XXX,XX @@ static bool apply_iothread_vq_mapping( |
81 | + aio_context_release(ctx); | 84 | } |
82 | file = NULL; | 85 | |
86 | /** | ||
87 | - * cleanup_iothread_vq_mapping: | ||
88 | + * iothread_vq_mapping_cleanup: | ||
89 | * @list: The mapping of virtqueues to IOThreads. | ||
90 | * | ||
91 | * Release IOThread object references that were acquired by | ||
92 | - * apply_iothread_vq_mapping(). | ||
93 | + * iothread_vq_mapping_apply(). | ||
94 | */ | ||
95 | -static void cleanup_iothread_vq_mapping(IOThreadVirtQueueMappingList *list) | ||
96 | +static void iothread_vq_mapping_cleanup(IOThreadVirtQueueMappingList *list) | ||
97 | { | ||
98 | IOThreadVirtQueueMappingList *node; | ||
99 | |||
100 | @@ -XXX,XX +XXX,XX @@ static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp) | ||
101 | s->vq_aio_context = g_new(AioContext *, conf->num_queues); | ||
102 | |||
103 | if (conf->iothread_vq_mapping_list) { | ||
104 | - if (!apply_iothread_vq_mapping(conf->iothread_vq_mapping_list, | ||
105 | + if (!iothread_vq_mapping_apply(conf->iothread_vq_mapping_list, | ||
106 | s->vq_aio_context, | ||
107 | conf->num_queues, | ||
108 | errp)) { | ||
109 | @@ -XXX,XX +XXX,XX @@ static void virtio_blk_vq_aio_context_cleanup(VirtIOBlock *s) | ||
110 | assert(!s->ioeventfd_started); | ||
111 | |||
112 | if (conf->iothread_vq_mapping_list) { | ||
113 | - cleanup_iothread_vq_mapping(conf->iothread_vq_mapping_list); | ||
114 | + iothread_vq_mapping_cleanup(conf->iothread_vq_mapping_list); | ||
83 | } | 115 | } |
84 | 116 | ||
85 | @@ -XXX,XX +XXX,XX @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, | 117 | if (conf->iothread) { |
86 | * (snapshot_bs); thus, we have to drop the strong reference to bs | ||
87 | * (which we obtained by calling bdrv_new()). bs will not be deleted, | ||
88 | * though, because the overlay still has a reference to it. */ | ||
89 | + aio_context_acquire(ctx); | ||
90 | bdrv_unref(bs); | ||
91 | + aio_context_release(ctx); | ||
92 | bs = snapshot_bs; | ||
93 | } | ||
94 | |||
95 | return bs; | ||
96 | |||
97 | fail: | ||
98 | + aio_context_acquire(ctx); | ||
99 | blk_unref(file); | ||
100 | qobject_unref(snapshot_options); | ||
101 | qobject_unref(bs->explicit_options); | ||
102 | @@ -XXX,XX +XXX,XX @@ fail: | ||
103 | bs->options = NULL; | ||
104 | bs->explicit_options = NULL; | ||
105 | bdrv_unref(bs); | ||
106 | + aio_context_release(ctx); | ||
107 | error_propagate(errp, local_err); | ||
108 | return NULL; | ||
109 | |||
110 | close_and_fail: | ||
111 | + aio_context_acquire(ctx); | ||
112 | bdrv_unref(bs); | ||
113 | + aio_context_release(ctx); | ||
114 | qobject_unref(snapshot_options); | ||
115 | qobject_unref(options); | ||
116 | error_propagate(errp, local_err); | ||
117 | -- | 118 | -- |
118 | 2.41.0 | 119 | 2.48.1
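On the left-hand side, the recurring shape is a temporary swap of AioContext locks around a call that needs the child node's context. A sketch of that swap, assuming the aio_context_acquire()/aio_context_release() API used throughout the left-hand series (run_locked() and its callback are illustrative, not an existing helper):

    #include "qemu/osdep.h"
    #include "block/aio.h"

    /*
     * Call @fn(@opaque) while holding @ctx instead of @old_ctx, then
     * restore the original lock. The swap is skipped when both contexts
     * match, mirroring the old_ctx != ctx checks in the patches above.
     */
    static void run_locked(AioContext *old_ctx, AioContext *ctx,
                           void (*fn)(void *), void *opaque)
    {
        if (old_ctx != ctx) {
            aio_context_release(old_ctx);
            aio_context_acquire(ctx);
        }

        fn(opaque);

        if (old_ctx != ctx) {
            aio_context_release(ctx);
            aio_context_acquire(old_ctx);
        }
    }

Releasing the held context before acquiring the other one avoids ever holding two AioContext locks at once, which is what keeps the ordering deadlock-safe.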
1 | This is a better regression test for the bugs hidden by commit 80fc5d26 | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | ('graph-lock: Disable locking for now'). With that commit reverted, it | ||
3 | hangs instantaneously and reliably for me. | ||
4 | 2 | ||
5 | It is important to have a reliable test like this, because the following | 3 | The code that builds an array of AioContext pointers indexed by the |
6 | commits will set out to fix the actual root cause of the deadlocks and | 4 | virtqueue is not specific to virtio-blk. virtio-scsi will need to do the |
7 | then finally revert commit 80fc5d26, which was only a stopgap solution. | 5 | same thing, so extract the functions. |
8 | 6 | ||
9 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
10 | Message-ID: <20230605085711.21261-2-kwolf@redhat.com> | 8 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> |
11 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 9 | Message-ID: <20250311132616.1049687-11-stefanha@redhat.com> |
12 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
13 | --- | 11 | --- |
14 | .../tests/iothreads-commit-active | 85 +++++++++++++++++++ | 12 | include/hw/virtio/iothread-vq-mapping.h | 45 ++++++++ |
15 | .../tests/iothreads-commit-active.out | 23 +++++ | 13 | hw/block/virtio-blk.c | 142 +----------------------- |
16 | 2 files changed, 108 insertions(+) | 14 | hw/virtio/iothread-vq-mapping.c | 131 ++++++++++++++++++++++ |
17 | create mode 100755 tests/qemu-iotests/tests/iothreads-commit-active | 15 | hw/virtio/meson.build | 1 + |
18 | create mode 100644 tests/qemu-iotests/tests/iothreads-commit-active.out | 16 | 4 files changed, 178 insertions(+), 141 deletions(-) |
17 | create mode 100644 include/hw/virtio/iothread-vq-mapping.h | ||
18 | create mode 100644 hw/virtio/iothread-vq-mapping.c | ||
19 | 19 | ||
20 | diff --git a/tests/qemu-iotests/tests/iothreads-commit-active b/tests/qemu-iotests/tests/iothreads-commit-active | 20 | diff --git a/include/hw/virtio/iothread-vq-mapping.h b/include/hw/virtio/iothread-vq-mapping.h |
21 | new file mode 100755 | ||
22 | index XXXXXXX..XXXXXXX | ||
23 | --- /dev/null | ||
24 | +++ b/tests/qemu-iotests/tests/iothreads-commit-active | ||
25 | @@ -XXX,XX +XXX,XX @@ | ||
26 | +#!/usr/bin/env python3 | ||
27 | +# group: rw quick auto | ||
28 | +# | ||
29 | +# Copyright (C) 2023 Red Hat, Inc. | ||
30 | +# | ||
31 | +# This program is free software; you can redistribute it and/or modify | ||
32 | +# it under the terms of the GNU General Public License as published by | ||
33 | +# the Free Software Foundation; either version 2 of the License, or | ||
34 | +# (at your option) any later version. | ||
35 | +# | ||
36 | +# This program is distributed in the hope that it will be useful, | ||
37 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
38 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
39 | +# GNU General Public License for more details. | ||
40 | +# | ||
41 | +# You should have received a copy of the GNU General Public License | ||
42 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
43 | +# | ||
44 | +# Creator/Owner: Kevin Wolf <kwolf@redhat.com> | ||
45 | + | ||
46 | +import asyncio | ||
47 | +import iotests | ||
48 | + | ||
49 | +iotests.script_initialize(supported_fmts=['qcow2'], | ||
50 | + supported_platforms=['linux']) | ||
51 | +iotests.verify_virtio_scsi_pci_or_ccw() | ||
52 | + | ||
53 | +with iotests.FilePath('disk0.img') as img_path, \ | ||
54 | + iotests.FilePath('disk0-snap.img') as snap_path, \ | ||
55 | + iotests.FilePath('mirror-src.img') as src_path, \ | ||
56 | + iotests.FilePath('mirror-dst.img') as dst_path, \ | ||
57 | + iotests.VM() as vm: | ||
58 | + | ||
59 | + img_size = '10M' | ||
60 | + iotests.qemu_img_create('-f', iotests.imgfmt, img_path, img_size) | ||
61 | + iotests.qemu_img_create('-f', iotests.imgfmt, '-b', img_path, | ||
62 | + '-F', iotests.imgfmt, snap_path) | ||
63 | + iotests.qemu_img_create('-f', iotests.imgfmt, src_path, img_size) | ||
64 | + iotests.qemu_img_create('-f', iotests.imgfmt, dst_path, img_size) | ||
65 | + | ||
66 | + iotests.qemu_io_log('-c', 'write 0 64k', img_path) | ||
67 | + iotests.qemu_io_log('-c', 'write 1M 64k', snap_path) | ||
68 | + iotests.qemu_io_log('-c', 'write 3M 64k', snap_path) | ||
69 | + | ||
70 | + iotests.qemu_io_log('-c', f'write 0 {img_size}', src_path) | ||
71 | + | ||
72 | + iotests.log('Launching VM...') | ||
73 | + vm.add_object('iothread,id=iothread0') | ||
74 | + vm.add_object('throttle-group,x-bps-write=1048576,id=tg0') | ||
75 | + vm.add_blockdev(f'file,node-name=disk0-file,filename={img_path}') | ||
76 | + vm.add_blockdev('qcow2,node-name=disk0-fmt,file=disk0-file') | ||
77 | + vm.add_drive(snap_path, 'backing=disk0-fmt,node-name=disk0', | ||
78 | + interface='none') | ||
79 | + vm.add_device('virtio-scsi,iothread=iothread0') | ||
80 | + vm.add_device('scsi-hd,drive=drive0') | ||
81 | + | ||
82 | + vm.add_blockdev(f'file,filename={src_path},node-name=mirror-src-file') | ||
83 | + vm.add_blockdev('qcow2,file=mirror-src-file,node-name=mirror-src') | ||
84 | + vm.add_blockdev(f'file,filename={dst_path},node-name=mirror-dst-file') | ||
85 | + vm.add_blockdev('qcow2,file=mirror-dst-file,node-name=mirror-dst-fmt') | ||
86 | + vm.add_blockdev('throttle,throttle-group=tg0,file=mirror-dst-fmt,' | ||
87 | + 'node-name=mirror-dst') | ||
88 | + vm.add_device('scsi-hd,drive=mirror-src') | ||
89 | + | ||
90 | + vm.launch() | ||
91 | + | ||
92 | + # The background I/O is created on unrelated nodes (so that they won't be | ||
93 | + # drained together with the other ones), but on the same iothread | ||
94 | + iotests.log('Creating some background I/O...') | ||
95 | + iotests.log(vm.qmp('blockdev-mirror', job_id='job0', sync='full', | ||
96 | + device='mirror-src', target='mirror-dst', | ||
97 | + auto_dismiss=False)) | ||
98 | + | ||
99 | + iotests.log('Starting active commit...') | ||
100 | + iotests.log(vm.qmp('block-commit', device='disk0', job_id='job1', | ||
101 | + auto_dismiss=False)) | ||
102 | + | ||
103 | + # Should succeed and not time out | ||
104 | + try: | ||
105 | + vm.run_job('job1', wait=5.0) | ||
106 | + vm.shutdown() | ||
107 | + except asyncio.TimeoutError: | ||
108 | + # VM may be stuck, kill it | ||
109 | + vm.kill() | ||
110 | + raise | ||
111 | diff --git a/tests/qemu-iotests/tests/iothreads-commit-active.out b/tests/qemu-iotests/tests/iothreads-commit-active.out | ||
112 | new file mode 100644 | 21 | new file mode 100644 |
113 | index XXXXXXX..XXXXXXX | 22 | index XXXXXXX..XXXXXXX |
114 | --- /dev/null | 23 | --- /dev/null |
115 | +++ b/tests/qemu-iotests/tests/iothreads-commit-active.out | 24 | +++ b/include/hw/virtio/iothread-vq-mapping.h |
116 | @@ -XXX,XX +XXX,XX @@ | 25 | @@ -XXX,XX +XXX,XX @@ |
117 | +wrote 65536/65536 bytes at offset 0 | 26 | +/* |
118 | +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 27 | + * IOThread Virtqueue Mapping |
119 | + | 28 | + * |
120 | +wrote 65536/65536 bytes at offset 1048576 | 29 | + * Copyright Red Hat, Inc |
121 | +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 30 | + * |
122 | + | 31 | + * SPDX-License-Identifier: GPL-2.0-only |
123 | +wrote 65536/65536 bytes at offset 3145728 | 32 | + */ |
124 | +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 33 | + |
125 | + | 34 | +#ifndef HW_VIRTIO_IOTHREAD_VQ_MAPPING_H |
126 | +wrote 10485760/10485760 bytes at offset 0 | 35 | +#define HW_VIRTIO_IOTHREAD_VQ_MAPPING_H |
127 | +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 36 | + |
128 | + | 37 | +#include "qapi/error.h" |
129 | +Launching VM... | 38 | +#include "qapi/qapi-types-virtio.h" |
130 | +Creating some background I/O... | 39 | + |
131 | +{"return": {}} | 40 | +/** |
132 | +Starting active commit... | 41 | + * iothread_vq_mapping_apply: |
133 | +{"return": {}} | 42 | + * @list: The mapping of virtqueues to IOThreads. |
134 | +{"execute": "job-complete", "arguments": {"id": "job1"}} | 43 | + * @vq_aio_context: The array of AioContext pointers to fill in. |
135 | +{"return": {}} | 44 | + * @num_queues: The length of @vq_aio_context. |
136 | +{"data": {"device": "job1", "len": 131072, "offset": 131072, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} | 45 | + * @errp: If an error occurs, a pointer to the area to store the error. |
137 | +{"data": {"device": "job1", "len": 131072, "offset": 131072, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} | 46 | + * |
138 | +{"execute": "job-dismiss", "arguments": {"id": "job1"}} | 47 | + * Fill in the AioContext for each virtqueue in the @vq_aio_context array given |
139 | +{"return": {}} | 48 | + * the iothread-vq-mapping parameter in @list. |
49 | + * | ||
50 | + * iothread_vq_mapping_cleanup() must be called to free IOThread object | ||
51 | + * references after this function returns success. | ||
52 | + * | ||
53 | + * Returns: %true on success, %false on failure. | ||
54 | + **/ | ||
55 | +bool iothread_vq_mapping_apply( | ||
56 | + IOThreadVirtQueueMappingList *list, | ||
57 | + AioContext **vq_aio_context, | ||
58 | + uint16_t num_queues, | ||
59 | + Error **errp); | ||
60 | + | ||
61 | +/** | ||
62 | + * iothread_vq_mapping_cleanup: | ||
63 | + * @list: The mapping of virtqueues to IOThreads. | ||
64 | + * | ||
65 | + * Release IOThread object references that were acquired by | ||
66 | + * iothread_vq_mapping_apply(). | ||
67 | + */ | ||
68 | +void iothread_vq_mapping_cleanup(IOThreadVirtQueueMappingList *list); | ||
69 | + | ||
70 | +#endif /* HW_VIRTIO_IOTHREAD_VQ_MAPPING_H */ | ||
71 | diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c | ||
72 | index XXXXXXX..XXXXXXX 100644 | ||
73 | --- a/hw/block/virtio-blk.c | ||
74 | +++ b/hw/block/virtio-blk.c | ||
75 | @@ -XXX,XX +XXX,XX @@ | ||
76 | #endif | ||
77 | #include "hw/virtio/virtio-bus.h" | ||
78 | #include "migration/qemu-file-types.h" | ||
79 | +#include "hw/virtio/iothread-vq-mapping.h" | ||
80 | #include "hw/virtio/virtio-access.h" | ||
81 | #include "hw/virtio/virtio-blk-common.h" | ||
82 | #include "qemu/coroutine.h" | ||
83 | @@ -XXX,XX +XXX,XX @@ static const BlockDevOps virtio_block_ops = { | ||
84 | .drained_end = virtio_blk_drained_end, | ||
85 | }; | ||
86 | |||
87 | -static bool | ||
88 | -iothread_vq_mapping_validate(IOThreadVirtQueueMappingList *list, uint16_t | ||
89 | - num_queues, Error **errp) | ||
90 | -{ | ||
91 | - g_autofree unsigned long *vqs = bitmap_new(num_queues); | ||
92 | - g_autoptr(GHashTable) iothreads = | ||
93 | - g_hash_table_new(g_str_hash, g_str_equal); | ||
94 | - | ||
95 | - for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) { | ||
96 | - const char *name = node->value->iothread; | ||
97 | - uint16List *vq; | ||
98 | - | ||
99 | - if (!iothread_by_id(name)) { | ||
100 | - error_setg(errp, "IOThread \"%s\" object does not exist", name); | ||
101 | - return false; | ||
102 | - } | ||
103 | - | ||
104 | - if (!g_hash_table_add(iothreads, (gpointer)name)) { | ||
105 | - error_setg(errp, | ||
106 | - "duplicate IOThread name \"%s\" in iothread-vq-mapping", | ||
107 | - name); | ||
108 | - return false; | ||
109 | - } | ||
110 | - | ||
111 | - if (node != list) { | ||
112 | - if (!!node->value->vqs != !!list->value->vqs) { | ||
113 | - error_setg(errp, "either all items in iothread-vq-mapping " | ||
114 | - "must have vqs or none of them must have it"); | ||
115 | - return false; | ||
116 | - } | ||
117 | - } | ||
118 | - | ||
119 | - for (vq = node->value->vqs; vq; vq = vq->next) { | ||
120 | - if (vq->value >= num_queues) { | ||
121 | - error_setg(errp, "vq index %u for IOThread \"%s\" must be " | ||
122 | - "less than num_queues %u in iothread-vq-mapping", | ||
123 | - vq->value, name, num_queues); | ||
124 | - return false; | ||
125 | - } | ||
126 | - | ||
127 | - if (test_and_set_bit(vq->value, vqs)) { | ||
128 | - error_setg(errp, "cannot assign vq %u to IOThread \"%s\" " | ||
129 | - "because it is already assigned", vq->value, name); | ||
130 | - return false; | ||
131 | - } | ||
132 | - } | ||
133 | - } | ||
134 | - | ||
135 | - if (list->value->vqs) { | ||
136 | - for (uint16_t i = 0; i < num_queues; i++) { | ||
137 | - if (!test_bit(i, vqs)) { | ||
138 | - error_setg(errp, | ||
139 | - "missing vq %u IOThread assignment in iothread-vq-mapping", | ||
140 | - i); | ||
141 | - return false; | ||
142 | - } | ||
143 | - } | ||
144 | - } | ||
145 | - | ||
146 | - return true; | ||
147 | -} | ||
148 | - | ||
149 | -/** | ||
150 | - * iothread_vq_mapping_apply: | ||
151 | - * @list: The mapping of virtqueues to IOThreads. | ||
152 | - * @vq_aio_context: The array of AioContext pointers to fill in. | ||
153 | - * @num_queues: The length of @vq_aio_context. | ||
154 | - * @errp: If an error occurs, a pointer to the area to store the error. | ||
155 | - * | ||
156 | - * Fill in the AioContext for each virtqueue in the @vq_aio_context array given | ||
157 | - * the iothread-vq-mapping parameter in @list. | ||
158 | - * | ||
159 | - * iothread_vq_mapping_cleanup() must be called to free IOThread object | ||
160 | - * references after this function returns success. | ||
161 | - * | ||
162 | - * Returns: %true on success, %false on failure. | ||
163 | - **/ | ||
164 | -static bool iothread_vq_mapping_apply( | ||
165 | - IOThreadVirtQueueMappingList *list, | ||
166 | - AioContext **vq_aio_context, | ||
167 | - uint16_t num_queues, | ||
168 | - Error **errp) | ||
169 | -{ | ||
170 | - IOThreadVirtQueueMappingList *node; | ||
171 | - size_t num_iothreads = 0; | ||
172 | - size_t cur_iothread = 0; | ||
173 | - | ||
174 | - if (!iothread_vq_mapping_validate(list, num_queues, errp)) { | ||
175 | - return false; | ||
176 | - } | ||
177 | - | ||
178 | - for (node = list; node; node = node->next) { | ||
179 | - num_iothreads++; | ||
180 | - } | ||
181 | - | ||
182 | - for (node = list; node; node = node->next) { | ||
183 | - IOThread *iothread = iothread_by_id(node->value->iothread); | ||
184 | - AioContext *ctx = iothread_get_aio_context(iothread); | ||
185 | - | ||
186 | - /* Released in virtio_blk_vq_aio_context_cleanup() */ | ||
187 | - object_ref(OBJECT(iothread)); | ||
188 | - | ||
189 | - if (node->value->vqs) { | ||
190 | - uint16List *vq; | ||
191 | - | ||
192 | - /* Explicit vq:IOThread assignment */ | ||
193 | - for (vq = node->value->vqs; vq; vq = vq->next) { | ||
194 | - assert(vq->value < num_queues); | ||
195 | - vq_aio_context[vq->value] = ctx; | ||
196 | - } | ||
197 | - } else { | ||
198 | - /* Round-robin vq:IOThread assignment */ | ||
199 | - for (unsigned i = cur_iothread; i < num_queues; | ||
200 | - i += num_iothreads) { | ||
201 | - vq_aio_context[i] = ctx; | ||
202 | - } | ||
203 | - } | ||
204 | - | ||
205 | - cur_iothread++; | ||
206 | - } | ||
207 | - | ||
208 | - return true; | ||
209 | -} | ||
210 | - | ||
211 | -/** | ||
212 | - * iothread_vq_mapping_cleanup: | ||
213 | - * @list: The mapping of virtqueues to IOThreads. | ||
214 | - * | ||
215 | - * Release IOThread object references that were acquired by | ||
216 | - * iothread_vq_mapping_apply(). | ||
217 | - */ | ||
218 | -static void iothread_vq_mapping_cleanup(IOThreadVirtQueueMappingList *list) | ||
219 | -{ | ||
220 | - IOThreadVirtQueueMappingList *node; | ||
221 | - | ||
222 | - for (node = list; node; node = node->next) { | ||
223 | - IOThread *iothread = iothread_by_id(node->value->iothread); | ||
224 | - object_unref(OBJECT(iothread)); | ||
225 | - } | ||
226 | -} | ||
227 | - | ||
228 | /* Context: BQL held */ | ||
229 | static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp) | ||
230 | { | ||
231 | diff --git a/hw/virtio/iothread-vq-mapping.c b/hw/virtio/iothread-vq-mapping.c | ||
232 | new file mode 100644 | ||
233 | index XXXXXXX..XXXXXXX | ||
234 | --- /dev/null | ||
235 | +++ b/hw/virtio/iothread-vq-mapping.c | ||
236 | @@ -XXX,XX +XXX,XX @@ | ||
237 | +/* | ||
238 | + * IOThread Virtqueue Mapping | ||
239 | + * | ||
240 | + * Copyright Red Hat, Inc | ||
241 | + * | ||
242 | + * SPDX-License-Identifier: GPL-2.0-only | ||
243 | + */ | ||
244 | + | ||
245 | +#include "qemu/osdep.h" | ||
246 | +#include "system/iothread.h" | ||
247 | +#include "hw/virtio/iothread-vq-mapping.h" | ||
248 | + | ||
249 | +static bool | ||
250 | +iothread_vq_mapping_validate(IOThreadVirtQueueMappingList *list, uint16_t | ||
251 | + num_queues, Error **errp) | ||
252 | +{ | ||
253 | + g_autofree unsigned long *vqs = bitmap_new(num_queues); | ||
254 | + g_autoptr(GHashTable) iothreads = | ||
255 | + g_hash_table_new(g_str_hash, g_str_equal); | ||
256 | + | ||
257 | + for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) { | ||
258 | + const char *name = node->value->iothread; | ||
259 | + uint16List *vq; | ||
260 | + | ||
261 | + if (!iothread_by_id(name)) { | ||
262 | + error_setg(errp, "IOThread \"%s\" object does not exist", name); | ||
263 | + return false; | ||
264 | + } | ||
265 | + | ||
266 | + if (!g_hash_table_add(iothreads, (gpointer)name)) { | ||
267 | + error_setg(errp, | ||
268 | + "duplicate IOThread name \"%s\" in iothread-vq-mapping", | ||
269 | + name); | ||
270 | + return false; | ||
271 | + } | ||
272 | + | ||
273 | + if (node != list) { | ||
274 | + if (!!node->value->vqs != !!list->value->vqs) { | ||
275 | + error_setg(errp, "either all items in iothread-vq-mapping " | ||
276 | + "must have vqs or none of them must have it"); | ||
277 | + return false; | ||
278 | + } | ||
279 | + } | ||
280 | + | ||
281 | + for (vq = node->value->vqs; vq; vq = vq->next) { | ||
282 | + if (vq->value >= num_queues) { | ||
283 | + error_setg(errp, "vq index %u for IOThread \"%s\" must be " | ||
284 | + "less than num_queues %u in iothread-vq-mapping", | ||
285 | + vq->value, name, num_queues); | ||
286 | + return false; | ||
287 | + } | ||
288 | + | ||
289 | + if (test_and_set_bit(vq->value, vqs)) { | ||
290 | + error_setg(errp, "cannot assign vq %u to IOThread \"%s\" " | ||
291 | + "because it is already assigned", vq->value, name); | ||
292 | + return false; | ||
293 | + } | ||
294 | + } | ||
295 | + } | ||
296 | + | ||
297 | + if (list->value->vqs) { | ||
298 | + for (uint16_t i = 0; i < num_queues; i++) { | ||
299 | + if (!test_bit(i, vqs)) { | ||
300 | + error_setg(errp, | ||
301 | + "missing vq %u IOThread assignment in iothread-vq-mapping", | ||
302 | + i); | ||
303 | + return false; | ||
304 | + } | ||
305 | + } | ||
306 | + } | ||
307 | + | ||
308 | + return true; | ||
309 | +} | ||
310 | + | ||
311 | +bool iothread_vq_mapping_apply( | ||
312 | + IOThreadVirtQueueMappingList *list, | ||
313 | + AioContext **vq_aio_context, | ||
314 | + uint16_t num_queues, | ||
315 | + Error **errp) | ||
316 | +{ | ||
317 | + IOThreadVirtQueueMappingList *node; | ||
318 | + size_t num_iothreads = 0; | ||
319 | + size_t cur_iothread = 0; | ||
320 | + | ||
321 | + if (!iothread_vq_mapping_validate(list, num_queues, errp)) { | ||
322 | + return false; | ||
323 | + } | ||
324 | + | ||
325 | + for (node = list; node; node = node->next) { | ||
326 | + num_iothreads++; | ||
327 | + } | ||
328 | + | ||
329 | + for (node = list; node; node = node->next) { | ||
330 | + IOThread *iothread = iothread_by_id(node->value->iothread); | ||
331 | + AioContext *ctx = iothread_get_aio_context(iothread); | ||
332 | + | ||
333 | + /* Released in virtio_blk_vq_aio_context_cleanup() */ | ||
334 | + object_ref(OBJECT(iothread)); | ||
335 | + | ||
336 | + if (node->value->vqs) { | ||
337 | + uint16List *vq; | ||
338 | + | ||
339 | + /* Explicit vq:IOThread assignment */ | ||
340 | + for (vq = node->value->vqs; vq; vq = vq->next) { | ||
341 | + assert(vq->value < num_queues); | ||
342 | + vq_aio_context[vq->value] = ctx; | ||
343 | + } | ||
344 | + } else { | ||
345 | + /* Round-robin vq:IOThread assignment */ | ||
346 | + for (unsigned i = cur_iothread; i < num_queues; | ||
347 | + i += num_iothreads) { | ||
348 | + vq_aio_context[i] = ctx; | ||
349 | + } | ||
350 | + } | ||
351 | + | ||
352 | + cur_iothread++; | ||
353 | + } | ||
354 | + | ||
355 | + return true; | ||
356 | +} | ||
357 | + | ||
358 | +void iothread_vq_mapping_cleanup(IOThreadVirtQueueMappingList *list) | ||
359 | +{ | ||
360 | + IOThreadVirtQueueMappingList *node; | ||
361 | + | ||
362 | + for (node = list; node; node = node->next) { | ||
363 | + IOThread *iothread = iothread_by_id(node->value->iothread); | ||
364 | + object_unref(OBJECT(iothread)); | ||
365 | + } | ||
366 | +} | ||
367 | + | ||
368 | diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build | ||
369 | index XXXXXXX..XXXXXXX 100644 | ||
370 | --- a/hw/virtio/meson.build | ||
371 | +++ b/hw/virtio/meson.build | ||
372 | @@ -XXX,XX +XXX,XX @@ | ||
373 | system_virtio_ss = ss.source_set() | ||
374 | system_virtio_ss.add(files('virtio-bus.c')) | ||
375 | +system_virtio_ss.add(files('iothread-vq-mapping.c')) | ||
376 | system_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: files('virtio-pci.c')) | ||
377 | system_virtio_ss.add(when: 'CONFIG_VIRTIO_MMIO', if_true: files('virtio-mmio.c')) | ||
378 | system_virtio_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto.c')) | ||
140 | -- | 379 | -- |
141 | 2.41.0 | 380 | 2.48.1 |
Deleted patch | |||
---|---|---|---|
1 | blk_insert_bs() requires that callers hold the AioContext lock for the | ||
2 | node that should be inserted. Take it. | ||
3 | 1 | ||
4 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
5 | Message-ID: <20230605085711.21261-3-kwolf@redhat.com> | ||
6 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
7 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | ||
8 | --- | ||
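In outline, the change below boils down to this sequence (a condensed view of the hunk, with bs being the node to insert):

    ctx = bdrv_get_aio_context(bs);      /* the node's home AioContext */
    aio_context_acquire(ctx);
    ret = blk_insert_bs(blk, bs, errp);  /* requires ctx to be held */
    aio_context_release(ctx);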
9 | hw/core/qdev-properties-system.c | 8 ++++++-- | ||
10 | 1 file changed, 6 insertions(+), 2 deletions(-) | ||
11 | |||
12 | diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/hw/core/qdev-properties-system.c | ||
15 | +++ b/hw/core/qdev-properties-system.c | ||
16 | @@ -XXX,XX +XXX,XX @@ static void set_drive_helper(Object *obj, Visitor *v, const char *name, | ||
17 | * aware of iothreads require their BlockBackends to be in the main | ||
18 | * AioContext. | ||
19 | */ | ||
20 | - ctx = iothread ? bdrv_get_aio_context(bs) : qemu_get_aio_context(); | ||
21 | - blk = blk_new(ctx, 0, BLK_PERM_ALL); | ||
22 | + ctx = bdrv_get_aio_context(bs); | ||
23 | + blk = blk_new(iothread ? ctx : qemu_get_aio_context(), | ||
24 | + 0, BLK_PERM_ALL); | ||
25 | blk_created = true; | ||
26 | |||
27 | + aio_context_acquire(ctx); | ||
28 | ret = blk_insert_bs(blk, bs, errp); | ||
29 | + aio_context_release(ctx); | ||
30 | + | ||
31 | if (ret < 0) { | ||
32 | goto fail; | ||
33 | } | ||
34 | -- | ||
35 | 2.41.0 |
1 | blk_insert_bs() requires that callers hold the AioContext lock for the | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | node that should be inserted. Take it. | ||
3 | 2 | ||
4 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 3 | Allow virtio-scsi virtqueues to be assigned to different IOThreads. This |
5 | Message-ID: <20230605085711.21261-4-kwolf@redhat.com> | 4 | makes it possible to take advantage of host multi-queue block layer |
6 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 5 | scalability by assigning virtqueues that have affinity with vCPUs to |
6 | different IOThreads that have affinity with host CPUs. The same feature | ||
7 | was introduced for virtio-blk in the past: | ||
8 | https://developers.redhat.com/articles/2024/09/05/scaling-virtio-blk-disk-io-iothread-virtqueue-mapping | ||
9 | |||
10 | Here are fio randread 4k iodepth=64 results from a 4 vCPU guest with an | ||
11 | Intel P4800X SSD: | ||
12 | iothreads IOPS | ||
13 | ------------------------------ | ||
14 | 1 189576 | ||
15 | 2 312698 | ||
16 | 4 346744 | ||
17 | |||
18 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
19 | Message-ID: <20250311132616.1049687-12-stefanha@redhat.com> | ||
7 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 20 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
8 | --- | 21 | --- |
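One indexing detail helps when reading the diff: the ctrl and event virtqueues occupy the first two slots of vq_aio_context[], so the AioContext for command queue i sits at a fixed offset. A hypothetical helper (not part of this patch) makes that explicit:

    /* VIRTIO_SCSI_VQ_NUM_FIXED skips the ctrl and event virtqueues */
    static AioContext *cmd_vq_aio_context(VirtIOSCSI *s, uint16_t i)
    {
        return s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
    }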
9 | tests/unit/test-block-iothread.c | 7 ++++++- | 22 | include/hw/virtio/virtio-scsi.h | 5 +- |
10 | 1 file changed, 6 insertions(+), 1 deletion(-) | 23 | hw/scsi/virtio-scsi-dataplane.c | 90 ++++++++++++++++++++++++--------- |
24 | hw/scsi/virtio-scsi.c | 63 ++++++++++++++--------- | ||
25 | 3 files changed, 107 insertions(+), 51 deletions(-) | ||
11 | 26 | ||
12 | diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c | 27 | diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h |
13 | index XXXXXXX..XXXXXXX 100644 | 28 | index XXXXXXX..XXXXXXX 100644 |
14 | --- a/tests/unit/test-block-iothread.c | 29 | --- a/include/hw/virtio/virtio-scsi.h |
15 | +++ b/tests/unit/test-block-iothread.c | 30 | +++ b/include/hw/virtio/virtio-scsi.h |
16 | @@ -XXX,XX +XXX,XX @@ static void test_attach_second_node(void) | 31 | @@ -XXX,XX +XXX,XX @@ |
17 | BlockDriverState *bs, *filter; | 32 | #include "hw/virtio/virtio.h" |
18 | QDict *options; | 33 | #include "hw/scsi/scsi.h" |
19 | 34 | #include "chardev/char-fe.h" | |
20 | + aio_context_acquire(main_ctx); | 35 | +#include "qapi/qapi-types-virtio.h" |
21 | blk = blk_new(ctx, BLK_PERM_ALL, BLK_PERM_ALL); | 36 | #include "system/iothread.h" |
22 | bs = bdrv_new_open_driver(&bdrv_test, "base", BDRV_O_RDWR, &error_abort); | 37 | |
23 | blk_insert_bs(blk, bs, &error_abort); | 38 | #define TYPE_VIRTIO_SCSI_COMMON "virtio-scsi-common" |
24 | @@ -XXX,XX +XXX,XX @@ static void test_attach_second_node(void) | 39 | @@ -XXX,XX +XXX,XX @@ struct VirtIOSCSIConf { |
25 | qdict_put_str(options, "driver", "raw"); | 40 | CharBackend chardev; |
26 | qdict_put_str(options, "file", "base"); | 41 | uint32_t boot_tpgt; |
27 | 42 | IOThread *iothread; | |
28 | - aio_context_acquire(main_ctx); | 43 | + IOThreadVirtQueueMappingList *iothread_vq_mapping_list; |
29 | filter = bdrv_open(NULL, NULL, options, BDRV_O_RDWR, &error_abort); | 44 | }; |
30 | aio_context_release(main_ctx); | 45 | |
31 | 46 | struct VirtIOSCSI; | |
32 | @@ -XXX,XX +XXX,XX @@ static void test_attach_preserve_blk_ctx(void) | 47 | @@ -XXX,XX +XXX,XX @@ struct VirtIOSCSI { |
33 | { | 48 | QTAILQ_HEAD(, VirtIOSCSIReq) tmf_bh_list; |
34 | IOThread *iothread = iothread_new(); | 49 | |
35 | AioContext *ctx = iothread_get_aio_context(iothread); | 50 | /* Fields for dataplane below */ |
36 | + AioContext *main_ctx = qemu_get_aio_context(); | 51 | - AioContext *ctx; /* one iothread per virtio-scsi-pci for now */ |
37 | BlockBackend *blk; | 52 | + AioContext **vq_aio_context; /* per-virtqueue AioContext pointer */ |
38 | BlockDriverState *bs; | 53 | |
39 | 54 | bool dataplane_started; | |
40 | + aio_context_acquire(main_ctx); | 55 | bool dataplane_starting; |
41 | blk = blk_new(ctx, BLK_PERM_ALL, BLK_PERM_ALL); | 56 | @@ -XXX,XX +XXX,XX @@ void virtio_scsi_common_realize(DeviceState *dev, |
42 | bs = bdrv_new_open_driver(&bdrv_test, "base", BDRV_O_RDWR, &error_abort); | 57 | void virtio_scsi_common_unrealize(DeviceState *dev); |
43 | bs->total_sectors = 65536 / BDRV_SECTOR_SIZE; | 58 | |
44 | @@ -XXX,XX +XXX,XX @@ static void test_attach_preserve_blk_ctx(void) | 59 | void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp); |
45 | blk_insert_bs(blk, bs, &error_abort); | 60 | +void virtio_scsi_dataplane_cleanup(VirtIOSCSI *s); |
46 | g_assert(blk_get_aio_context(blk) == ctx); | 61 | int virtio_scsi_dataplane_start(VirtIODevice *s); |
47 | g_assert(bdrv_get_aio_context(bs) == ctx); | 62 | void virtio_scsi_dataplane_stop(VirtIODevice *s); |
48 | + aio_context_release(main_ctx); | 63 | |
49 | 64 | diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c | |
50 | /* Remove the node again */ | 65 | index XXXXXXX..XXXXXXX 100644 |
51 | aio_context_acquire(ctx); | 66 | --- a/hw/scsi/virtio-scsi-dataplane.c |
52 | @@ -XXX,XX +XXX,XX @@ static void test_attach_preserve_blk_ctx(void) | 67 | +++ b/hw/scsi/virtio-scsi-dataplane.c |
53 | g_assert(bdrv_get_aio_context(bs) == qemu_get_aio_context()); | 68 | @@ -XXX,XX +XXX,XX @@ |
54 | 69 | #include "system/block-backend.h" | |
55 | /* Re-attach the node */ | 70 | #include "hw/scsi/scsi.h" |
56 | + aio_context_acquire(main_ctx); | 71 | #include "scsi/constants.h" |
57 | blk_insert_bs(blk, bs, &error_abort); | 72 | +#include "hw/virtio/iothread-vq-mapping.h" |
58 | + aio_context_release(main_ctx); | 73 | #include "hw/virtio/virtio-bus.h" |
59 | g_assert(blk_get_aio_context(blk) == ctx); | 74 | |
60 | g_assert(bdrv_get_aio_context(bs) == ctx); | 75 | /* Context: BQL held */ |
61 | 76 | @@ -XXX,XX +XXX,XX @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp) | |
77 | VirtIODevice *vdev = VIRTIO_DEVICE(s); | ||
78 | BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); | ||
79 | VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); | ||
80 | + uint16_t num_vqs = vs->conf.num_queues + VIRTIO_SCSI_VQ_NUM_FIXED; | ||
81 | |||
82 | - if (vs->conf.iothread) { | ||
83 | + if (vs->conf.iothread && vs->conf.iothread_vq_mapping_list) { | ||
84 | + error_setg(errp, | ||
85 | + "iothread and iothread-vq-mapping properties cannot be set " | ||
86 | + "at the same time"); | ||
87 | + return; | ||
88 | + } | ||
89 | + | ||
90 | + if (vs->conf.iothread || vs->conf.iothread_vq_mapping_list) { | ||
91 | if (!k->set_guest_notifiers || !k->ioeventfd_assign) { | ||
92 | error_setg(errp, | ||
93 | "device is incompatible with iothread " | ||
94 | @@ -XXX,XX +XXX,XX @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp) | ||
95 | error_setg(errp, "ioeventfd is required for iothread"); | ||
96 | return; | ||
97 | } | ||
98 | - s->ctx = iothread_get_aio_context(vs->conf.iothread); | ||
99 | - } else { | ||
100 | - if (!virtio_device_ioeventfd_enabled(vdev)) { | ||
101 | + } | ||
102 | + | ||
103 | + s->vq_aio_context = g_new(AioContext *, num_vqs); | ||
104 | + | ||
105 | + if (vs->conf.iothread_vq_mapping_list) { | ||
106 | + if (!iothread_vq_mapping_apply(vs->conf.iothread_vq_mapping_list, | ||
107 | + s->vq_aio_context, num_vqs, errp)) { | ||
108 | + g_free(s->vq_aio_context); | ||
109 | + s->vq_aio_context = NULL; | ||
110 | return; | ||
111 | } | ||
112 | - s->ctx = qemu_get_aio_context(); | ||
113 | + } else if (vs->conf.iothread) { | ||
114 | + AioContext *ctx = iothread_get_aio_context(vs->conf.iothread); | ||
115 | + for (uint16_t i = 0; i < num_vqs; i++) { | ||
116 | + s->vq_aio_context[i] = ctx; | ||
117 | + } | ||
118 | + | ||
119 | + /* Released in virtio_scsi_dataplane_cleanup() */ | ||
120 | + object_ref(OBJECT(vs->conf.iothread)); | ||
121 | + } else { | ||
122 | + AioContext *ctx = qemu_get_aio_context(); | ||
123 | + for (unsigned i = 0; i < num_vqs; i++) { | ||
124 | + s->vq_aio_context[i] = ctx; | ||
125 | + } | ||
126 | + } | ||
127 | +} | ||
128 | + | ||
129 | +/* Context: BQL held */ | ||
130 | +void virtio_scsi_dataplane_cleanup(VirtIOSCSI *s) | ||
131 | +{ | ||
132 | + VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s); | ||
133 | + | ||
134 | + if (vs->conf.iothread_vq_mapping_list) { | ||
135 | + iothread_vq_mapping_cleanup(vs->conf.iothread_vq_mapping_list); | ||
136 | } | ||
137 | + | ||
138 | + if (vs->conf.iothread) { | ||
139 | + object_unref(OBJECT(vs->conf.iothread)); | ||
140 | + } | ||
141 | + | ||
142 | + g_free(s->vq_aio_context); | ||
143 | + s->vq_aio_context = NULL; | ||
144 | } | ||
145 | |||
146 | static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n) | ||
147 | @@ -XXX,XX +XXX,XX @@ static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n) | ||
148 | } | ||
149 | |||
150 | /* Context: BH in IOThread */ | ||
151 | -static void virtio_scsi_dataplane_stop_bh(void *opaque) | ||
152 | +static void virtio_scsi_dataplane_stop_vq_bh(void *opaque) | ||
153 | { | ||
154 | - VirtIOSCSI *s = opaque; | ||
155 | - VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s); | ||
156 | + AioContext *ctx = qemu_get_current_aio_context(); | ||
157 | + VirtQueue *vq = opaque; | ||
158 | EventNotifier *host_notifier; | ||
159 | - int i; | ||
160 | |||
161 | - virtio_queue_aio_detach_host_notifier(vs->ctrl_vq, s->ctx); | ||
162 | - host_notifier = virtio_queue_get_host_notifier(vs->ctrl_vq); | ||
163 | + virtio_queue_aio_detach_host_notifier(vq, ctx); | ||
164 | + host_notifier = virtio_queue_get_host_notifier(vq); | ||
165 | |||
166 | /* | ||
167 | * Test and clear notifier after disabling event, in case poll callback | ||
168 | * didn't have time to run. | ||
169 | */ | ||
170 | virtio_queue_host_notifier_read(host_notifier); | ||
171 | - | ||
172 | - virtio_queue_aio_detach_host_notifier(vs->event_vq, s->ctx); | ||
173 | - host_notifier = virtio_queue_get_host_notifier(vs->event_vq); | ||
174 | - virtio_queue_host_notifier_read(host_notifier); | ||
175 | - | ||
176 | - for (i = 0; i < vs->conf.num_queues; i++) { | ||
177 | - virtio_queue_aio_detach_host_notifier(vs->cmd_vqs[i], s->ctx); | ||
178 | - host_notifier = virtio_queue_get_host_notifier(vs->cmd_vqs[i]); | ||
179 | - virtio_queue_host_notifier_read(host_notifier); | ||
180 | - } | ||
181 | } | ||
182 | |||
183 | /* Context: BQL held */ | ||
184 | @@ -XXX,XX +XXX,XX @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev) | ||
185 | smp_wmb(); /* paired with aio_notify_accept() */ | ||
186 | |||
187 | if (s->bus.drain_count == 0) { | ||
188 | - virtio_queue_aio_attach_host_notifier(vs->ctrl_vq, s->ctx); | ||
189 | - virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq, s->ctx); | ||
190 | + virtio_queue_aio_attach_host_notifier(vs->ctrl_vq, | ||
191 | + s->vq_aio_context[0]); | ||
192 | + virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq, | ||
193 | + s->vq_aio_context[1]); | ||
194 | |||
195 | for (i = 0; i < vs->conf.num_queues; i++) { | ||
196 | - virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], s->ctx); | ||
197 | + AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i]; | ||
198 | + virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], ctx); | ||
199 | } | ||
200 | } | ||
201 | return 0; | ||
202 | @@ -XXX,XX +XXX,XX @@ void virtio_scsi_dataplane_stop(VirtIODevice *vdev) | ||
203 | s->dataplane_stopping = true; | ||
204 | |||
205 | if (s->bus.drain_count == 0) { | ||
206 | - aio_wait_bh_oneshot(s->ctx, virtio_scsi_dataplane_stop_bh, s); | ||
207 | + for (i = 0; i < vs->conf.num_queues + VIRTIO_SCSI_VQ_NUM_FIXED; i++) { | ||
208 | + VirtQueue *vq = virtio_get_queue(&vs->parent_obj, i); | ||
209 | + AioContext *ctx = s->vq_aio_context[i]; | ||
210 | + aio_wait_bh_oneshot(ctx, virtio_scsi_dataplane_stop_vq_bh, vq); | ||
211 | + } | ||
212 | } | ||
213 | |||
214 | blk_drain_all(); /* ensure there are no in-flight requests */ | ||
215 | diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c | ||
216 | index XXXXXXX..XXXXXXX 100644 | ||
217 | --- a/hw/scsi/virtio-scsi.c | ||
218 | +++ b/hw/scsi/virtio-scsi.c | ||
219 | @@ -XXX,XX +XXX,XX @@ | ||
220 | #include "hw/qdev-properties.h" | ||
221 | #include "hw/scsi/scsi.h" | ||
222 | #include "scsi/constants.h" | ||
223 | +#include "hw/virtio/iothread-vq-mapping.h" | ||
224 | #include "hw/virtio/virtio-bus.h" | ||
225 | #include "hw/virtio/virtio-access.h" | ||
226 | #include "trace.h" | ||
227 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_cancel_notify(Notifier *notifier, void *data) | ||
228 | g_free(n); | ||
229 | } | ||
230 | |||
231 | -static inline void virtio_scsi_ctx_check(VirtIOSCSI *s, SCSIDevice *d) | ||
232 | -{ | ||
233 | - if (s->dataplane_started && d && blk_is_available(d->conf.blk)) { | ||
234 | - assert(blk_get_aio_context(d->conf.blk) == s->ctx); | ||
235 | - } | ||
236 | -} | ||
237 | - | ||
238 | static void virtio_scsi_do_one_tmf_bh(VirtIOSCSIReq *req) | ||
239 | { | ||
240 | VirtIOSCSI *s = req->dev; | ||
241 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_flush_defer_tmf_to_aio_context(VirtIOSCSI *s) | ||
242 | |||
243 | assert(!s->dataplane_started); | ||
244 | |||
245 | - if (s->ctx) { | ||
246 | + for (uint32_t i = 0; i < s->parent_obj.conf.num_queues; i++) { | ||
247 | + AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i]; | ||
248 | + | ||
249 | /* Our BH only runs after previously scheduled BHs */ | ||
250 | - aio_wait_bh_oneshot(s->ctx, dummy_bh, NULL); | ||
251 | + aio_wait_bh_oneshot(ctx, dummy_bh, NULL); | ||
252 | } | ||
253 | } | ||
254 | |||
255 | @@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req) | ||
256 | AioContext *ctx; | ||
257 | int ret = 0; | ||
258 | |||
259 | - virtio_scsi_ctx_check(s, d); | ||
260 | /* Here VIRTIO_SCSI_S_OK means "FUNCTION COMPLETE". */ | ||
261 | req->resp.tmf.response = VIRTIO_SCSI_S_OK; | ||
262 | |||
263 | @@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req) | ||
264 | |||
265 | case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: | ||
266 | case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: { | ||
267 | + g_autoptr(GHashTable) aio_contexts = g_hash_table_new(NULL, NULL); | ||
268 | + | ||
269 | if (!d) { | ||
270 | goto fail; | ||
271 | } | ||
272 | @@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req) | ||
273 | |||
274 | qatomic_inc(&req->remaining); | ||
275 | |||
276 | - ctx = s->ctx ?: qemu_get_aio_context(); | ||
277 | - virtio_scsi_defer_tmf_to_aio_context(req, ctx); | ||
278 | + for (uint32_t i = 0; i < s->parent_obj.conf.num_queues; i++) { | ||
279 | + ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i]; | ||
280 | + | ||
281 | + if (!g_hash_table_add(aio_contexts, ctx)) { | ||
282 | + continue; /* skip previously added AioContext */ | ||
283 | + } | ||
284 | + | ||
285 | + virtio_scsi_defer_tmf_to_aio_context(req, ctx); | ||
286 | + } | ||
287 | |||
288 | virtio_scsi_tmf_dec_remaining(req); | ||
289 | ret = -EINPROGRESS; | ||
290 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq) | ||
291 | */ | ||
292 | static bool virtio_scsi_defer_to_dataplane(VirtIOSCSI *s) | ||
293 | { | ||
294 | - if (!s->ctx || s->dataplane_started) { | ||
295 | + if (s->dataplane_started) { | ||
296 | return false; | ||
297 | } | ||
298 | + if (s->vq_aio_context[0] == qemu_get_aio_context()) { | ||
299 | + return false; /* not using IOThreads */ | ||
300 | + } | ||
301 | |||
302 | virtio_device_start_ioeventfd(&s->parent_obj.parent_obj); | ||
303 | return !s->dataplane_fenced; | ||
304 | @@ -XXX,XX +XXX,XX @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req) | ||
305 | virtio_scsi_complete_cmd_req(req); | ||
306 | return -ENOENT; | ||
307 | } | ||
308 | - virtio_scsi_ctx_check(s, d); | ||
309 | req->sreq = scsi_req_new(d, req->req.cmd.tag, | ||
310 | virtio_scsi_get_lun(req->req.cmd.lun), | ||
311 | req->req.cmd.cdb, vs->cdb_size, req); | ||
312 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev, | ||
313 | { | ||
314 | VirtIODevice *vdev = VIRTIO_DEVICE(hotplug_dev); | ||
315 | VirtIOSCSI *s = VIRTIO_SCSI(vdev); | ||
316 | + AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED]; | ||
317 | SCSIDevice *sd = SCSI_DEVICE(dev); | ||
318 | - int ret; | ||
319 | |||
320 | - if (s->ctx && !s->dataplane_fenced) { | ||
321 | - ret = blk_set_aio_context(sd->conf.blk, s->ctx, errp); | ||
322 | - if (ret < 0) { | ||
323 | - return; | ||
324 | - } | ||
325 | + if (ctx != qemu_get_aio_context() && !s->dataplane_fenced) { | ||
326 | + /* | ||
327 | + * Try to make the BlockBackend's AioContext match ours. Ignore failure | ||
328 | + * because I/O will still work although block jobs and other users | ||
329 | + * might be slower when multiple AioContexts use a BlockBackend. | ||
330 | + */ | ||
331 | + blk_set_aio_context(sd->conf.blk, ctx, errp); | ||
332 | } | ||
333 | |||
334 | if (virtio_vdev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) { | ||
335 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev, | ||
336 | |||
337 | qdev_simple_device_unplug_cb(hotplug_dev, dev, errp); | ||
338 | |||
339 | - if (s->ctx) { | ||
340 | + if (s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED] != qemu_get_aio_context()) { | ||
341 | /* If other users keep the BlockBackend in the iothread, that's ok */ | ||
342 | blk_set_aio_context(sd->conf.blk, qemu_get_aio_context(), NULL); | ||
343 | } | ||
344 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_drained_begin(SCSIBus *bus) | ||
345 | |||
346 | for (uint32_t i = 0; i < total_queues; i++) { | ||
347 | VirtQueue *vq = virtio_get_queue(vdev, i); | ||
348 | - virtio_queue_aio_detach_host_notifier(vq, s->ctx); | ||
349 | + virtio_queue_aio_detach_host_notifier(vq, s->vq_aio_context[i]); | ||
350 | } | ||
351 | } | ||
352 | |||
353 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_drained_end(SCSIBus *bus) | ||
354 | |||
355 | for (uint32_t i = 0; i < total_queues; i++) { | ||
356 | VirtQueue *vq = virtio_get_queue(vdev, i); | ||
357 | + AioContext *ctx = s->vq_aio_context[i]; | ||
358 | + | ||
359 | if (vq == vs->event_vq) { | ||
360 | - virtio_queue_aio_attach_host_notifier_no_poll(vq, s->ctx); | ||
361 | + virtio_queue_aio_attach_host_notifier_no_poll(vq, ctx); | ||
362 | } else { | ||
363 | - virtio_queue_aio_attach_host_notifier(vq, s->ctx); | ||
364 | + virtio_queue_aio_attach_host_notifier(vq, ctx); | ||
365 | } | ||
366 | } | ||
367 | } | ||
368 | @@ -XXX,XX +XXX,XX @@ void virtio_scsi_common_unrealize(DeviceState *dev) | ||
369 | virtio_cleanup(vdev); | ||
370 | } | ||
371 | |||
372 | +/* main loop */ | ||
373 | static void virtio_scsi_device_unrealize(DeviceState *dev) | ||
374 | { | ||
375 | VirtIOSCSI *s = VIRTIO_SCSI(dev); | ||
376 | |||
377 | virtio_scsi_reset_tmf_bh(s); | ||
378 | - | ||
379 | + virtio_scsi_dataplane_cleanup(s); | ||
380 | qbus_set_hotplug_handler(BUS(&s->bus), NULL); | ||
381 | virtio_scsi_common_unrealize(dev); | ||
382 | qemu_mutex_destroy(&s->tmf_bh_lock); | ||
383 | @@ -XXX,XX +XXX,XX @@ static const Property virtio_scsi_properties[] = { | ||
384 | VIRTIO_SCSI_F_CHANGE, true), | ||
385 | DEFINE_PROP_LINK("iothread", VirtIOSCSI, parent_obj.conf.iothread, | ||
386 | TYPE_IOTHREAD, IOThread *), | ||
387 | + DEFINE_PROP_IOTHREAD_VQ_MAPPING_LIST("iothread-vq-mapping", VirtIOSCSI, | ||
388 | + parent_obj.conf.iothread_vq_mapping_list), | ||
389 | }; | ||
390 | |||
391 | static const VMStateDescription vmstate_virtio_scsi = { | ||
62 | -- | 392 | -- |
63 | 2.41.0 | 393 | 2.48.1 |
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Mark functions as coroutine_fn when they are only called by other coroutine_fns | 3 | Previously the ctrl virtqueue was handled in the AioContext where SCSI |
4 | and they can suspend. Change calls to co_wrappers to use the non-wrapped | 4 | requests are processed. When IOThread Virtqueue Mapping was added, things |
5 | functions, which in turn requires adding GRAPH_RDLOCK annotations. | 5 | became more complicated because SCSI requests could run in other |
6 | 6 | AioContexts. | |
7 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | 7 | |
8 | Message-ID: <20230601115145.196465-5-pbonzini@redhat.com> | 8 | Simplify by handling the ctrl virtqueue in the main loop where reset |
9 | Reviewed-by: Kevin Wolf <kwolf@redhat.com> | 9 | operations can be performed. Note that BHs are still used for canceling SCSI |
10 | requests in their AioContexts, but at least the main loop activity | ||
11 | doesn't need BHs anymore. | ||
12 | |||
13 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
14 | Message-ID: <20250311132616.1049687-13-stefanha@redhat.com> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 15 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 16 | --- |
12 | block/bochs.c | 7 ++++--- | 17 | include/hw/virtio/virtio-scsi.h | 8 -- |
13 | 1 file changed, 4 insertions(+), 3 deletions(-) | 18 | hw/scsi/virtio-scsi-dataplane.c | 6 ++ |
14 | 19 | hw/scsi/virtio-scsi.c | 144 ++++++-------------------------- | |
15 | diff --git a/block/bochs.c b/block/bochs.c | 20 | 3 files changed, 33 insertions(+), 125 deletions(-) |
21 | |||
22 | diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h | ||
16 | index XXXXXXX..XXXXXXX 100644 | 23 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/bochs.c | 24 | --- a/include/hw/virtio/virtio-scsi.h |
18 | +++ b/block/bochs.c | 25 | +++ b/include/hw/virtio/virtio-scsi.h |
19 | @@ -XXX,XX +XXX,XX @@ static void bochs_refresh_limits(BlockDriverState *bs, Error **errp) | 26 | @@ -XXX,XX +XXX,XX @@ struct VirtIOSCSI { |
20 | bs->bl.request_alignment = BDRV_SECTOR_SIZE; /* No sub-sector I/O */ | 27 | |
28 | QemuMutex ctrl_lock; /* protects ctrl_vq */ | ||
29 | |||
30 | - /* | ||
31 | - * TMFs deferred to main loop BH. These fields are protected by | ||
32 | - * tmf_bh_lock. | ||
33 | - */ | ||
34 | - QemuMutex tmf_bh_lock; | ||
35 | - QEMUBH *tmf_bh; | ||
36 | - QTAILQ_HEAD(, VirtIOSCSIReq) tmf_bh_list; | ||
37 | - | ||
38 | /* Fields for dataplane below */ | ||
39 | AioContext **vq_aio_context; /* per-virtqueue AioContext pointer */ | ||
40 | |||
41 | diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c | ||
42 | index XXXXXXX..XXXXXXX 100644 | ||
43 | --- a/hw/scsi/virtio-scsi-dataplane.c | ||
44 | +++ b/hw/scsi/virtio-scsi-dataplane.c | ||
45 | @@ -XXX,XX +XXX,XX @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp) | ||
46 | s->vq_aio_context[i] = ctx; | ||
47 | } | ||
48 | } | ||
49 | + | ||
50 | + /* | ||
51 | + * Always handle the ctrl virtqueue in the main loop thread where device | ||
52 | + * resets can be performed. | ||
53 | + */ | ||
54 | + s->vq_aio_context[0] = qemu_get_aio_context(); | ||
21 | } | 55 | } |
22 | 56 | ||
23 | -static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) | 57 | /* Context: BQL held */ |
24 | +static int64_t coroutine_fn GRAPH_RDLOCK | 58 | diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c |
25 | +seek_to_sector(BlockDriverState *bs, int64_t sector_num) | 59 | index XXXXXXX..XXXXXXX 100644 |
60 | --- a/hw/scsi/virtio-scsi.c | ||
61 | +++ b/hw/scsi/virtio-scsi.c | ||
62 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_cancel_notify(Notifier *notifier, void *data) | ||
63 | g_free(n); | ||
64 | } | ||
65 | |||
66 | -static void virtio_scsi_do_one_tmf_bh(VirtIOSCSIReq *req) | ||
67 | -{ | ||
68 | - VirtIOSCSI *s = req->dev; | ||
69 | - SCSIDevice *d = virtio_scsi_device_get(s, req->req.tmf.lun); | ||
70 | - BusChild *kid; | ||
71 | - int target; | ||
72 | - | ||
73 | - switch (req->req.tmf.subtype) { | ||
74 | - case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: | ||
75 | - if (!d) { | ||
76 | - req->resp.tmf.response = VIRTIO_SCSI_S_BAD_TARGET; | ||
77 | - goto out; | ||
78 | - } | ||
79 | - if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) { | ||
80 | - req->resp.tmf.response = VIRTIO_SCSI_S_INCORRECT_LUN; | ||
81 | - goto out; | ||
82 | - } | ||
83 | - qatomic_inc(&s->resetting); | ||
84 | - device_cold_reset(&d->qdev); | ||
85 | - qatomic_dec(&s->resetting); | ||
86 | - break; | ||
87 | - | ||
88 | - case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: | ||
89 | - target = req->req.tmf.lun[1]; | ||
90 | - qatomic_inc(&s->resetting); | ||
91 | - | ||
92 | - rcu_read_lock(); | ||
93 | - QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) { | ||
94 | - SCSIDevice *d1 = SCSI_DEVICE(kid->child); | ||
95 | - if (d1->channel == 0 && d1->id == target) { | ||
96 | - device_cold_reset(&d1->qdev); | ||
97 | - } | ||
98 | - } | ||
99 | - rcu_read_unlock(); | ||
100 | - | ||
101 | - qatomic_dec(&s->resetting); | ||
102 | - break; | ||
103 | - | ||
104 | - default: | ||
105 | - g_assert_not_reached(); | ||
106 | - } | ||
107 | - | ||
108 | -out: | ||
109 | - object_unref(OBJECT(d)); | ||
110 | - virtio_scsi_complete_req(req, &s->ctrl_lock); | ||
111 | -} | ||
112 | - | ||
113 | -/* Some TMFs must be processed from the main loop thread */ | ||
114 | -static void virtio_scsi_do_tmf_bh(void *opaque) | ||
115 | -{ | ||
116 | - VirtIOSCSI *s = opaque; | ||
117 | - QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs); | ||
118 | - VirtIOSCSIReq *req; | ||
119 | - VirtIOSCSIReq *tmp; | ||
120 | - | ||
121 | - GLOBAL_STATE_CODE(); | ||
122 | - | ||
123 | - WITH_QEMU_LOCK_GUARD(&s->tmf_bh_lock) { | ||
124 | - QTAILQ_FOREACH_SAFE(req, &s->tmf_bh_list, next, tmp) { | ||
125 | - QTAILQ_REMOVE(&s->tmf_bh_list, req, next); | ||
126 | - QTAILQ_INSERT_TAIL(&reqs, req, next); | ||
127 | - } | ||
128 | - | ||
129 | - qemu_bh_delete(s->tmf_bh); | ||
130 | - s->tmf_bh = NULL; | ||
131 | - } | ||
132 | - | ||
133 | - QTAILQ_FOREACH_SAFE(req, &reqs, next, tmp) { | ||
134 | - QTAILQ_REMOVE(&reqs, req, next); | ||
135 | - virtio_scsi_do_one_tmf_bh(req); | ||
136 | - } | ||
137 | -} | ||
138 | - | ||
139 | -static void virtio_scsi_reset_tmf_bh(VirtIOSCSI *s) | ||
140 | -{ | ||
141 | - VirtIOSCSIReq *req; | ||
142 | - VirtIOSCSIReq *tmp; | ||
143 | - | ||
144 | - GLOBAL_STATE_CODE(); | ||
145 | - | ||
146 | - /* Called after ioeventfd has been stopped, so tmf_bh_lock is not needed */ | ||
147 | - if (s->tmf_bh) { | ||
148 | - qemu_bh_delete(s->tmf_bh); | ||
149 | - s->tmf_bh = NULL; | ||
150 | - } | ||
151 | - | ||
152 | - QTAILQ_FOREACH_SAFE(req, &s->tmf_bh_list, next, tmp) { | ||
153 | - QTAILQ_REMOVE(&s->tmf_bh_list, req, next); | ||
154 | - | ||
155 | - /* SAM-6 6.3.2 Hard reset */ | ||
156 | - req->resp.tmf.response = VIRTIO_SCSI_S_TARGET_FAILURE; | ||
157 | - virtio_scsi_complete_req(req, &req->dev->ctrl_lock); | ||
158 | - } | ||
159 | -} | ||
160 | - | ||
161 | -static void virtio_scsi_defer_tmf_to_main_loop(VirtIOSCSIReq *req) | ||
162 | -{ | ||
163 | - VirtIOSCSI *s = req->dev; | ||
164 | - | ||
165 | - WITH_QEMU_LOCK_GUARD(&s->tmf_bh_lock) { | ||
166 | - QTAILQ_INSERT_TAIL(&s->tmf_bh_list, req, next); | ||
167 | - | ||
168 | - if (!s->tmf_bh) { | ||
169 | - s->tmf_bh = qemu_bh_new(virtio_scsi_do_tmf_bh, s); | ||
170 | - qemu_bh_schedule(s->tmf_bh); | ||
171 | - } | ||
172 | - } | ||
173 | -} | ||
174 | - | ||
175 | static void virtio_scsi_tmf_cancel_req(VirtIOSCSIReq *tmf, SCSIRequest *r) | ||
26 | { | 176 | { |
27 | BDRVBochsState *s = bs->opaque; | 177 | VirtIOSCSICancelNotifier *notifier; |
28 | uint64_t offset = sector_num * 512; | 178 | @@ -XXX,XX +XXX,XX @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req) |
29 | @@ -XXX,XX +XXX,XX @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) | 179 | break; |
30 | (s->extent_blocks + s->bitmap_blocks)); | 180 | |
31 | 181 | case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: | |
32 | /* read in bitmap for current extent */ | 182 | - case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: |
33 | - ret = bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8), 1, | 183 | - virtio_scsi_defer_tmf_to_main_loop(req); |
34 | - &bitmap_entry, 0); | 184 | - ret = -EINPROGRESS; |
35 | + ret = bdrv_co_pread(bs->file, bitmap_offset + (extent_offset / 8), 1, | 185 | + if (!d) { |
36 | + &bitmap_entry, 0); | 186 | + goto fail; |
37 | if (ret < 0) { | 187 | + } |
38 | return ret; | 188 | + if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) { |
39 | } | 189 | + goto incorrect_lun; |
190 | + } | ||
191 | + qatomic_inc(&s->resetting); | ||
192 | + device_cold_reset(&d->qdev); | ||
193 | + qatomic_dec(&s->resetting); | ||
194 | break; | ||
195 | |||
196 | + case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: { | ||
197 | + BusChild *kid; | ||
198 | + int target = req->req.tmf.lun[1]; | ||
199 | + qatomic_inc(&s->resetting); | ||
200 | + | ||
201 | + rcu_read_lock(); | ||
202 | + QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) { | ||
203 | + SCSIDevice *d1 = SCSI_DEVICE(kid->child); | ||
204 | + if (d1->channel == 0 && d1->id == target) { | ||
205 | + device_cold_reset(&d1->qdev); | ||
206 | + } | ||
207 | + } | ||
208 | + rcu_read_unlock(); | ||
209 | + | ||
210 | + qatomic_dec(&s->resetting); | ||
211 | + break; | ||
212 | + } | ||
213 | + | ||
214 | case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: | ||
215 | case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: { | ||
216 | g_autoptr(GHashTable) aio_contexts = g_hash_table_new(NULL, NULL); | ||
217 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_reset(VirtIODevice *vdev) | ||
218 | |||
219 | assert(!s->dataplane_started); | ||
220 | |||
221 | - virtio_scsi_reset_tmf_bh(s); | ||
222 | virtio_scsi_flush_defer_tmf_to_aio_context(s); | ||
223 | |||
224 | qatomic_inc(&s->resetting); | ||
225 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_device_realize(DeviceState *dev, Error **errp) | ||
226 | VirtIOSCSI *s = VIRTIO_SCSI(dev); | ||
227 | Error *err = NULL; | ||
228 | |||
229 | - QTAILQ_INIT(&s->tmf_bh_list); | ||
230 | qemu_mutex_init(&s->ctrl_lock); | ||
231 | qemu_mutex_init(&s->event_lock); | ||
232 | - qemu_mutex_init(&s->tmf_bh_lock); | ||
233 | |||
234 | virtio_scsi_common_realize(dev, | ||
235 | virtio_scsi_handle_ctrl, | ||
236 | @@ -XXX,XX +XXX,XX @@ static void virtio_scsi_device_unrealize(DeviceState *dev) | ||
237 | { | ||
238 | VirtIOSCSI *s = VIRTIO_SCSI(dev); | ||
239 | |||
240 | - virtio_scsi_reset_tmf_bh(s); | ||
241 | virtio_scsi_dataplane_cleanup(s); | ||
242 | qbus_set_hotplug_handler(BUS(&s->bus), NULL); | ||
243 | virtio_scsi_common_unrealize(dev); | ||
244 | - qemu_mutex_destroy(&s->tmf_bh_lock); | ||
245 | qemu_mutex_destroy(&s->event_lock); | ||
246 | qemu_mutex_destroy(&s->ctrl_lock); | ||
247 | } | ||
40 | -- | 248 | -- |
41 | 2.41.0 | 249 | 2.48.1 |
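To recap the patch above: with the BH indirection removed, the LOGICAL UNIT RESET and I_T NEXUS RESET task management functions now run synchronously from the ctrl virtqueue handler in the main loop; schematically (a sketch, error handling omitted):

    /* ctrl vq is serviced in the main loop, so resets can run inline */
    qatomic_inc(&s->resetting);
    device_cold_reset(&d->qdev);
    qatomic_dec(&s->resetting);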
1 | Now that bdrv_graph_wrlock() temporarily drops the AioContext lock that | 1 | From: Stefan Hajnoczi <stefanha@redhat.com> |
---|---|---|---|
2 | its caller holds, it can poll without causing deadlocks. We can now | ||
3 | re-enable graph locking. | ||
4 | 2 | ||
5 | This reverts commit ad128dff0bf4b6f971d05eb4335a627883a19c1d. | 3 | Peter Krempa and Kevin Wolf observed that iothread-vq-mapping is |
4 | confusing to use because the control and event virtqueues have a fixed | ||
5 | location before the command virtqueues but need to be treated | ||
6 | differently. | ||
6 | 7 | ||
7 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 8 | Only expose the command virtqueues via iothread-vq-mapping so that the |
8 | Message-ID: <20230605085711.21261-12-kwolf@redhat.com> | 9 | command-line parameter is intuitive: it controls where SCSI requests are |
9 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | 10 | processed. |
11 | |||
12 | The control virtqueue needs to be hardcoded to the main loop thread for | ||
13 | technical reasons anyway. Kevin also pointed out that it's better to | ||
14 | place the event virtqueue in the main loop thread since its no poll | ||
15 | behavior would prevent polling if assigned to an IOThread. | ||
16 | |||
17 | This change is its own commit to avoid squashing the previous commit. | ||
18 | |||
19 | Suggested-by: Kevin Wolf <kwolf@redhat.com> | ||
20 | Suggested-by: Peter Krempa <pkrempa@redhat.com> | ||
21 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
22 | Message-ID: <20250311132616.1049687-14-stefanha@redhat.com> | ||
10 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> | 23 | Signed-off-by: Kevin Wolf <kwolf@redhat.com> |
11 | --- | 24 | --- |
12 | block/graph-lock.c | 26 -------------------------- | 25 | hw/scsi/virtio-scsi-dataplane.c | 33 ++++++++++++++++++++------------- |
13 | 1 file changed, 26 deletions(-) | 26 | 1 file changed, 20 insertions(+), 13 deletions(-) |
14 | 27 | ||
15 | diff --git a/block/graph-lock.c b/block/graph-lock.c | 28 | diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c |
16 | index XXXXXXX..XXXXXXX 100644 | 29 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/graph-lock.c | 30 | --- a/hw/scsi/virtio-scsi-dataplane.c |
18 | +++ b/block/graph-lock.c | 31 | +++ b/hw/scsi/virtio-scsi-dataplane.c |
19 | @@ -XXX,XX +XXX,XX @@ BdrvGraphLock graph_lock; | 32 | @@ -XXX,XX +XXX,XX @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp) |
20 | /* Protects the list of aiocontext and orphaned_reader_count */ | 33 | VirtIODevice *vdev = VIRTIO_DEVICE(s); |
21 | static QemuMutex aio_context_list_lock; | 34 | BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); |
22 | 35 | VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); | |
23 | -#if 0 | 36 | - uint16_t num_vqs = vs->conf.num_queues + VIRTIO_SCSI_VQ_NUM_FIXED; |
24 | /* Written and read with atomic operations. */ | 37 | |
25 | static int has_writer; | 38 | if (vs->conf.iothread && vs->conf.iothread_vq_mapping_list) { |
26 | -#endif | 39 | error_setg(errp, |
27 | 40 | @@ -XXX,XX +XXX,XX @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp) | |
28 | /* | ||
29 | * A reader coroutine could move from an AioContext to another. | ||
30 | @@ -XXX,XX +XXX,XX @@ void unregister_aiocontext(AioContext *ctx) | ||
31 | g_free(ctx->bdrv_graph); | ||
32 | } | ||
33 | |||
34 | -#if 0 | ||
35 | static uint32_t reader_count(void) | ||
36 | { | ||
37 | BdrvGraphRWlock *brdv_graph; | ||
38 | @@ -XXX,XX +XXX,XX @@ static uint32_t reader_count(void) | ||
39 | assert((int32_t)rd >= 0); | ||
40 | return rd; | ||
41 | } | ||
42 | -#endif | ||
43 | |||
44 | void bdrv_graph_wrlock(BlockDriverState *bs) | ||
45 | { | ||
46 | AioContext *ctx = NULL; | ||
47 | |||
48 | GLOBAL_STATE_CODE(); | ||
49 | - /* | ||
50 | - * TODO Some callers hold an AioContext lock when this is called, which | ||
51 | - * causes deadlocks. Reenable once the AioContext locking is cleaned up (or | ||
52 | - * AioContext locks are gone). | ||
53 | - */ | ||
54 | -#if 0 | ||
55 | assert(!qatomic_read(&has_writer)); | ||
56 | -#endif | ||
57 | |||
58 | /* | ||
59 | * Release only non-mainloop AioContext. The mainloop often relies on the | ||
60 | @@ -XXX,XX +XXX,XX @@ void bdrv_graph_wrlock(BlockDriverState *bs) | ||
61 | } | 41 | } |
62 | } | 42 | } |
63 | 43 | ||
64 | -#if 0 | 44 | - s->vq_aio_context = g_new(AioContext *, num_vqs); |
65 | /* Make sure that constantly arriving new I/O doesn't cause starvation */ | 45 | + s->vq_aio_context = g_new(AioContext *, vs->conf.num_queues + |
66 | bdrv_drain_all_begin_nopoll(); | 46 | + VIRTIO_SCSI_VQ_NUM_FIXED); |
67 | 47 | + | |
68 | @@ -XXX,XX +XXX,XX @@ void bdrv_graph_wrlock(BlockDriverState *bs) | 48 | + /* |
69 | } while (reader_count() >= 1); | 49 | + * Handle the ctrl virtqueue in the main loop thread where device resets |
70 | 50 | + * can be performed. | |
71 | bdrv_drain_all_end(); | 51 | + */ |
72 | -#endif | 52 | + s->vq_aio_context[0] = qemu_get_aio_context(); |
73 | 53 | + | |
74 | if (ctx) { | 54 | + /* |
75 | aio_context_acquire(bdrv_get_aio_context(bs)); | 55 | + * Handle the event virtqueue in the main loop thread where its no_poll |
76 | @@ -XXX,XX +XXX,XX @@ void bdrv_graph_wrlock(BlockDriverState *bs) | 56 | + * behavior won't stop IOThread polling. |
77 | void bdrv_graph_wrunlock(void) | 57 | + */ |
78 | { | 58 | + s->vq_aio_context[1] = qemu_get_aio_context(); |
79 | GLOBAL_STATE_CODE(); | 59 | |
80 | -#if 0 | 60 | if (vs->conf.iothread_vq_mapping_list) { |
81 | QEMU_LOCK_GUARD(&aio_context_list_lock); | 61 | if (!iothread_vq_mapping_apply(vs->conf.iothread_vq_mapping_list, |
82 | assert(qatomic_read(&has_writer)); | 62 | - s->vq_aio_context, num_vqs, errp)) { |
83 | 63 | + &s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED], | |
84 | @@ -XXX,XX +XXX,XX @@ void bdrv_graph_wrunlock(void) | 64 | + vs->conf.num_queues, errp)) { |
85 | 65 | g_free(s->vq_aio_context); | |
86 | /* Wake up all coroutine that are waiting to read the graph */ | 66 | s->vq_aio_context = NULL; |
87 | qemu_co_enter_all(&reader_queue, &aio_context_list_lock); | 67 | return; |
88 | -#endif | 68 | } |
89 | } | 69 | } else if (vs->conf.iothread) { |
90 | 70 | AioContext *ctx = iothread_get_aio_context(vs->conf.iothread); | |
91 | void coroutine_fn bdrv_graph_co_rdlock(void) | 71 | - for (uint16_t i = 0; i < num_vqs; i++) { |
92 | { | 72 | - s->vq_aio_context[i] = ctx; |
93 | - /* TODO Reenable when wrlock is reenabled */ | 73 | + for (uint16_t i = 0; i < vs->conf.num_queues; i++) { |
94 | -#if 0 | 74 | + s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i] = ctx; |
95 | BdrvGraphRWlock *bdrv_graph; | 75 | } |
96 | bdrv_graph = qemu_get_current_aio_context()->bdrv_graph; | 76 | |
97 | 77 | /* Released in virtio_scsi_dataplane_cleanup() */ | |
98 | @@ -XXX,XX +XXX,XX @@ void coroutine_fn bdrv_graph_co_rdlock(void) | 78 | object_ref(OBJECT(vs->conf.iothread)); |
99 | qemu_co_queue_wait(&reader_queue, &aio_context_list_lock); | 79 | } else { |
80 | AioContext *ctx = qemu_get_aio_context(); | ||
81 | - for (unsigned i = 0; i < num_vqs; i++) { | ||
82 | - s->vq_aio_context[i] = ctx; | ||
83 | + for (unsigned i = 0; i < vs->conf.num_queues; i++) { | ||
84 | + s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i] = ctx; | ||
100 | } | 85 | } |
101 | } | 86 | } |
102 | -#endif | 87 | - |
88 | - /* | ||
89 | - * Always handle the ctrl virtqueue in the main loop thread where device | ||
90 | - * resets can be performed. | ||
91 | - */ | ||
92 | - s->vq_aio_context[0] = qemu_get_aio_context(); | ||
103 | } | 93 | } |
104 | 94 | ||
105 | void coroutine_fn bdrv_graph_co_rdunlock(void) | 95 | /* Context: BQL held */ |
106 | { | ||
107 | -#if 0 | ||
108 | BdrvGraphRWlock *bdrv_graph; | ||
109 | bdrv_graph = qemu_get_current_aio_context()->bdrv_graph; | ||
110 | |||
111 | @@ -XXX,XX +XXX,XX @@ void coroutine_fn bdrv_graph_co_rdunlock(void) | ||
112 | if (qatomic_read(&has_writer)) { | ||
113 | aio_wait_kick(); | ||
114 | } | ||
115 | -#endif | ||
116 | } | ||
117 | |||
118 | void bdrv_graph_rdlock_main_loop(void) | ||
119 | @@ -XXX,XX +XXX,XX @@ void bdrv_graph_rdunlock_main_loop(void) | ||
120 | void assert_bdrv_graph_readable(void) | ||
121 | { | ||
122 | /* reader_count() is slow due to aio_context_list_lock lock contention */ | ||
123 | - /* TODO Reenable when wrlock is reenabled */ | ||
124 | -#if 0 | ||
125 | #ifdef CONFIG_DEBUG_GRAPH_LOCK | ||
126 | assert(qemu_in_main_thread() || reader_count()); | ||
127 | #endif | ||
128 | -#endif | ||
129 | } | ||
130 | |||
131 | void assert_bdrv_graph_writable(void) | ||
132 | { | ||
133 | assert(qemu_in_main_thread()); | ||
134 | - /* TODO Reenable when wrlock is reenabled */ | ||
135 | -#if 0 | ||
136 | assert(qatomic_read(&has_writer)); | ||
137 | -#endif | ||
138 | } | ||
139 | -- | 96 | -- |
140 | 2.41.0 | 97 | 2.48.1 |
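Usage note: with only the cmd vqs exposed, a mapping list now covers the
command virtqueues alone. A hypothetical invocation, borrowing the
iothread-vq-mapping syntax that virtio-blk already accepts (the iothread
id iot0 is a placeholder), might look like:

  -object iothread,id=iot0 \
  -device '{"driver":"virtio-scsi-pci","iothread-vq-mapping":[{"iothread":"iot0"}]}'

Any explicit "vqs" index list in such an entry would number command
queues from 0, with no slots reserved for the ctrl and event virtqueues.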