1 | The following changes since commit 55a19ad8b2d0797e3a8fe90ab99a9bb713824059: | 1 | The following changes since commit 813bac3d8d70d85cb7835f7945eb9eed84c2d8d0: |
---|---|---|---|
2 | 2 | ||
3 | Update version for v2.9.0-rc1 release (2017-03-21 17:13:29 +0000) | 3 | Merge tag '2023q3-bsd-user-pull-request' of https://gitlab.com/bsdimp/qemu into staging (2023-08-29 08:58:00 -0400) |
4 | 4 | ||
5 | are available in the git repository at: | 5 | are available in the Git repository at: |
6 | 6 | ||
7 | https://github.com/codyprime/qemu-kvm-jtc.git tags/block-pull-request | 7 | https://gitlab.com/stefanha/qemu.git tags/block-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to 600ac6a0ef5c06418446ef2f37407bddcc51b21c: | 9 | for you to fetch changes up to 87ec6f55af38e29be5b2b65a8acf84da73e06d06: |
10 | 10 | ||
11 | blockjob: add devops to blockjob backends (2017-03-22 13:26:27 -0400) | 11 | aio-posix: zero out io_uring sqe user_data (2023-08-30 07:39:59 -0400) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Block patches for 2.9 | 14 | Pull request |
15 | |||
16 | v3: | ||
17 | - Drop UFS emulation due to CI failures | ||
18 | - Add "aio-posix: zero out io_uring sqe user_data" | ||
19 | |||
15 | ---------------------------------------------------------------- | 20 | ---------------------------------------------------------------- |
16 | 21 | ||
17 | John Snow (3): | 22 | Andrey Drobyshev (3): |
18 | blockjob: add block_job_start_shim | 23 | block: add subcluster_size field to BlockDriverInfo |
19 | block-backend: add drained_begin / drained_end ops | 24 | block/io: align requests to subcluster_size |
20 | blockjob: add devops to blockjob backends | 25 | tests/qemu-iotests/197: add testcase for CoR with subclusters |
21 | 26 | ||
22 | Paolo Bonzini (1): | 27 | Fabiano Rosas (1): |
23 | blockjob: avoid recursive AioContext locking | 28 | block-migration: Ensure we don't crash during migration cleanup |
24 | 29 | ||
25 | block/block-backend.c | 24 ++++++++++++++-- | 30 | Stefan Hajnoczi (1): |
26 | blockjob.c | 63 ++++++++++++++++++++++++++++++++---------- | 31 | aio-posix: zero out io_uring sqe user_data |
27 | include/sysemu/block-backend.h | 8 ++++++ | 32 | |
28 | 3 files changed, 79 insertions(+), 16 deletions(-) | 33 | include/block/block-common.h | 5 ++++ |
34 | include/block/block-io.h | 8 +++--- | ||
35 | block.c | 7 +++++ | ||
36 | block/io.c | 50 ++++++++++++++++++------------------ | ||
37 | block/mirror.c | 8 +++--- | ||
38 | block/qcow2.c | 1 + | ||
39 | migration/block.c | 11 ++++++-- | ||
40 | util/fdmon-io_uring.c | 2 ++ | ||
41 | tests/qemu-iotests/197 | 29 +++++++++++++++++++++ | ||
42 | tests/qemu-iotests/197.out | 24 +++++++++++++++++ | ||
43 | 10 files changed, 110 insertions(+), 35 deletions(-) | ||
29 | 44 | ||
30 | -- | 45 | -- |
31 | 2.9.3 | 46 | 2.41.0 |
32 | |||
33 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | From: Fabiano Rosas <farosas@suse.de> | ||
1 | 2 | ||
3 | We can fail the blk_insert_bs() at init_blk_migration(), leaving the | ||
4 | BlkMigDevState without a dirty_bitmap and BlockDriverState. Account | ||
5 | for the possibly missing elements when doing cleanup. | ||
6 | |||
7 | Fix the following crashes: | ||
8 | |||
9 | Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault. | ||
10 | 0x0000555555ec83ef in bdrv_release_dirty_bitmap (bitmap=0x0) at ../block/dirty-bitmap.c:359 | ||
11 | 359 BlockDriverState *bs = bitmap->bs; | ||
12 | #0 0x0000555555ec83ef in bdrv_release_dirty_bitmap (bitmap=0x0) at ../block/dirty-bitmap.c:359 | ||
13 | #1 0x0000555555bba331 in unset_dirty_tracking () at ../migration/block.c:371 | ||
14 | #2 0x0000555555bbad98 in block_migration_cleanup_bmds () at ../migration/block.c:681 | ||
15 | |||
16 | Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault. | ||
17 | 0x0000555555e971ff in bdrv_op_unblock (bs=0x0, op=BLOCK_OP_TYPE_BACKUP_SOURCE, reason=0x0) at ../block.c:7073 | ||
18 | 7073 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) { | ||
19 | #0 0x0000555555e971ff in bdrv_op_unblock (bs=0x0, op=BLOCK_OP_TYPE_BACKUP_SOURCE, reason=0x0) at ../block.c:7073 | ||
20 | #1 0x0000555555e9734a in bdrv_op_unblock_all (bs=0x0, reason=0x0) at ../block.c:7095 | ||
21 | #2 0x0000555555bbae13 in block_migration_cleanup_bmds () at ../migration/block.c:690 | ||
22 | |||
23 | Signed-off-by: Fabiano Rosas <farosas@suse.de> | ||
24 | Message-id: 20230731203338.27581-1-farosas@suse.de | ||
25 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
26 | --- | ||
27 | migration/block.c | 11 +++++++++-- | ||
28 | 1 file changed, 9 insertions(+), 2 deletions(-) | ||
29 | |||
30 | diff --git a/migration/block.c b/migration/block.c | ||
31 | index XXXXXXX..XXXXXXX 100644 | ||
32 | --- a/migration/block.c | ||
33 | +++ b/migration/block.c | ||
34 | @@ -XXX,XX +XXX,XX @@ static void unset_dirty_tracking(void) | ||
35 | BlkMigDevState *bmds; | ||
36 | |||
37 | QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { | ||
38 | - bdrv_release_dirty_bitmap(bmds->dirty_bitmap); | ||
39 | + if (bmds->dirty_bitmap) { | ||
40 | + bdrv_release_dirty_bitmap(bmds->dirty_bitmap); | ||
41 | + } | ||
42 | } | ||
43 | } | ||
44 | |||
45 | @@ -XXX,XX +XXX,XX @@ static int64_t get_remaining_dirty(void) | ||
46 | static void block_migration_cleanup_bmds(void) | ||
47 | { | ||
48 | BlkMigDevState *bmds; | ||
49 | + BlockDriverState *bs; | ||
50 | AioContext *ctx; | ||
51 | |||
52 | unset_dirty_tracking(); | ||
53 | |||
54 | while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) { | ||
55 | QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry); | ||
56 | - bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker); | ||
57 | + | ||
58 | + bs = blk_bs(bmds->blk); | ||
59 | + if (bs) { | ||
60 | + bdrv_op_unblock_all(bs, bmds->blocker); | ||
61 | + } | ||
62 | error_free(bmds->blocker); | ||
63 | |||
64 | /* Save ctx, because bmds->blk can disappear during blk_unref. */ | ||
65 | -- | ||
66 | 2.41.0 | diff view generated by jsdifflib |
1 | From: John Snow <jsnow@redhat.com> | 1 | From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> |
---|---|---|---|
2 | 2 | ||
3 | This lets us hook into drained_begin and drained_end requests from the | 3 | This is going to be used in the subsequent commit for request alignment |
4 | backend level, which is particularly useful for making sure that all | 4 | (in particular, during copy-on-read). This value only makes sense for |
5 | jobs associated with a particular node (whether the source or the target) | 5 | the formats which support subclusters (currently QCOW2 only). If this |
6 | receive a drain request. | 6 | field isn't set by driver's own bdrv_get_info() implementation, we |
7 | simply set it equal to the cluster size thus treating each cluster as | ||
8 | having a single subcluster. | ||
7 | 9 | ||
8 | Suggested-by: Kevin Wolf <kwolf@redhat.com> | 10 | Reviewed-by: Eric Blake <eblake@redhat.com> |
9 | Signed-off-by: John Snow <jsnow@redhat.com> | 11 | Reviewed-by: Denis V. Lunev <den@openvz.org> |
10 | Reviewed-by: Jeff Cody <jcody@redhat.com> | 12 | Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> |
11 | Message-id: 20170316212351.13797-4-jsnow@redhat.com | 13 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> |
12 | Signed-off-by: Jeff Cody <jcody@redhat.com> | 14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
15 | Message-ID: <20230711172553.234055-2-andrey.drobyshev@virtuozzo.com> | ||
13 | --- | 16 | --- |
14 | blockjob.c | 29 ++++++++++++++++++++++++----- | 17 | include/block/block-common.h | 5 +++++ |
15 | 1 file changed, 24 insertions(+), 5 deletions(-) | 18 | block.c | 7 +++++++ |
19 | block/qcow2.c | 1 + | ||
20 | 3 files changed, 13 insertions(+) | ||
16 | 21 | ||
17 | diff --git a/blockjob.c b/blockjob.c | 22 | diff --git a/include/block/block-common.h b/include/block/block-common.h |
18 | index XXXXXXX..XXXXXXX 100644 | 23 | index XXXXXXX..XXXXXXX 100644 |
19 | --- a/blockjob.c | 24 | --- a/include/block/block-common.h |
20 | +++ b/blockjob.c | 25 | +++ b/include/block/block-common.h |
21 | @@ -XXX,XX +XXX,XX @@ static const BdrvChildRole child_job = { | 26 | @@ -XXX,XX +XXX,XX @@ typedef struct BlockZoneWps { |
22 | .stay_at_node = true, | 27 | typedef struct BlockDriverInfo { |
23 | }; | 28 | /* in bytes, 0 if irrelevant */ |
24 | 29 | int cluster_size; | |
25 | +static void block_job_drained_begin(void *opaque) | 30 | + /* |
26 | +{ | 31 | + * A fraction of cluster_size, if supported (currently QCOW2 only); if |
27 | + BlockJob *job = opaque; | 32 | + * disabled or unsupported, set equal to cluster_size. |
28 | + block_job_pause(job); | 33 | + */ |
29 | +} | 34 | + int subcluster_size; |
30 | + | 35 | /* offset at which the VM state can be saved (0 if not possible) */ |
31 | +static void block_job_drained_end(void *opaque) | 36 | int64_t vm_state_offset; |
32 | +{ | 37 | bool is_dirty; |
33 | + BlockJob *job = opaque; | 38 | diff --git a/block.c b/block.c |
34 | + block_job_resume(job); | 39 | index XXXXXXX..XXXXXXX 100644 |
35 | +} | 40 | --- a/block.c |
36 | + | 41 | +++ b/block.c |
37 | +static const BlockDevOps block_job_dev_ops = { | 42 | @@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) |
38 | + .drained_begin = block_job_drained_begin, | 43 | } |
39 | + .drained_end = block_job_drained_end, | 44 | memset(bdi, 0, sizeof(*bdi)); |
40 | +}; | 45 | ret = drv->bdrv_co_get_info(bs, bdi); |
41 | + | 46 | + if (bdi->subcluster_size == 0) { |
42 | BlockJob *block_job_next(BlockJob *job) | 47 | + /* |
48 | + * If the driver left this unset, subclusters are not supported. | ||
49 | + * Then it is safe to treat each cluster as having only one subcluster. | ||
50 | + */ | ||
51 | + bdi->subcluster_size = bdi->cluster_size; | ||
52 | + } | ||
53 | if (ret < 0) { | ||
54 | return ret; | ||
55 | } | ||
56 | diff --git a/block/qcow2.c b/block/qcow2.c | ||
57 | index XXXXXXX..XXXXXXX 100644 | ||
58 | --- a/block/qcow2.c | ||
59 | +++ b/block/qcow2.c | ||
60 | @@ -XXX,XX +XXX,XX @@ qcow2_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) | ||
43 | { | 61 | { |
44 | if (!job) { | 62 | BDRVQcow2State *s = bs->opaque; |
45 | @@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, | 63 | bdi->cluster_size = s->cluster_size; |
46 | } | 64 | + bdi->subcluster_size = s->subcluster_size; |
47 | 65 | bdi->vm_state_offset = qcow2_vm_state_offset(s); | |
48 | job = g_malloc0(driver->instance_size); | 66 | bdi->is_dirty = s->incompatible_features & QCOW2_INCOMPAT_DIRTY; |
49 | - error_setg(&job->blocker, "block device is in use by block job: %s", | 67 | return 0; |
50 | - BlockJobType_lookup[driver->job_type]); | ||
51 | - block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort); | ||
52 | - bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker); | ||
53 | - | ||
54 | job->driver = driver; | ||
55 | job->id = g_strdup(job_id); | ||
56 | job->blk = blk; | ||
57 | @@ -XXX,XX +XXX,XX @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, | ||
58 | job->paused = true; | ||
59 | job->pause_count = 1; | ||
60 | job->refcnt = 1; | ||
61 | + | ||
62 | + error_setg(&job->blocker, "block device is in use by block job: %s", | ||
63 | + BlockJobType_lookup[driver->job_type]); | ||
64 | + block_job_add_bdrv(job, "main node", bs, 0, BLK_PERM_ALL, &error_abort); | ||
65 | bs->job = job; | ||
66 | |||
67 | + blk_set_dev_ops(blk, &block_job_dev_ops, job); | ||
68 | + bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker); | ||
69 | + | ||
70 | QLIST_INSERT_HEAD(&block_jobs, job, job_list); | ||
71 | |||
72 | blk_add_aio_context_notifier(blk, block_job_attached_aio_context, | ||
73 | -- | 68 | -- |
74 | 2.9.3 | 69 | 2.41.0 |
75 | |||
76 | diff view generated by jsdifflib |
1 | From: John Snow <jsnow@redhat.com> | 1 | From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> |
---|---|---|---|
2 | 2 | ||
3 | Allow block backends to forward drain requests to their devices/users. | 3 | When target image is using subclusters, and we align the request during |
4 | The initial intended purpose for this patch is to allow BBs to forward | 4 | copy-on-read, it makes sense to align to subcluster_size rather than |
5 | requests along to BlockJobs, which will want to pause if their associated | 5 | cluster_size. Otherwise we end up with unnecessary allocations. |
6 | BB has entered a drained region. | 6 | |
7 | 7 | This commit renames bdrv_round_to_clusters() to bdrv_round_to_subclusters() | |
8 | Signed-off-by: John Snow <jsnow@redhat.com> | 8 | and utilizes subcluster_size field of BlockDriverInfo to make necessary |
9 | Reviewed-by: Jeff Cody <jcody@redhat.com> | 9 | alignments. It affects copy-on-read as well as mirror job (which is |
10 | Message-id: 20170316212351.13797-3-jsnow@redhat.com | 10 | using bdrv_round_to_clusters()). |
11 | Signed-off-by: Jeff Cody <jcody@redhat.com> | 11 | |
12 | This change also fixes the following bug with failing assert (covered by | ||
13 | the test in the subsequent commit): | ||
14 | |||
15 | qemu-img create -f qcow2 base.qcow2 64K | ||
16 | qemu-img create -f qcow2 -o extended_l2=on,backing_file=base.qcow2,backing_fmt=qcow2 img.qcow2 64K | ||
17 | qemu-io -c "write -P 0xaa 0 2K" img.qcow2 | ||
18 | qemu-io -C -c "read -P 0x00 2K 62K" img.qcow2 | ||
19 | |||
20 | qemu-io: ../block/io.c:1236: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed. | ||
21 | |||
22 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
23 | Reviewed-by: Denis V. Lunev <den@openvz.org> | ||
24 | Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> | ||
25 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> | ||
26 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
27 | Message-ID: <20230711172553.234055-3-andrey.drobyshev@virtuozzo.com> | ||
12 | --- | 28 | --- |
13 | block/block-backend.c | 24 ++++++++++++++++++++++-- | 29 | include/block/block-io.h | 8 +++---- |
14 | include/sysemu/block-backend.h | 8 ++++++++ | 30 | block/io.c | 50 ++++++++++++++++++++-------------------- |
15 | 2 files changed, 30 insertions(+), 2 deletions(-) | 31 | block/mirror.c | 8 +++---- |
16 | 32 | 3 files changed, 33 insertions(+), 33 deletions(-) | |
17 | diff --git a/block/block-backend.c b/block/block-backend.c | 33 | |
34 | diff --git a/include/block/block-io.h b/include/block/block-io.h | ||
18 | index XXXXXXX..XXXXXXX 100644 | 35 | index XXXXXXX..XXXXXXX 100644 |
19 | --- a/block/block-backend.c | 36 | --- a/include/block/block-io.h |
20 | +++ b/block/block-backend.c | 37 | +++ b/include/block/block-io.h |
21 | @@ -XXX,XX +XXX,XX @@ struct BlockBackend { | 38 | @@ -XXX,XX +XXX,XX @@ bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi); |
22 | bool allow_write_beyond_eof; | 39 | ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs, |
23 | 40 | Error **errp); | |
24 | NotifierList remove_bs_notifiers, insert_bs_notifiers; | 41 | BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs); |
25 | + | 42 | -void bdrv_round_to_clusters(BlockDriverState *bs, |
26 | + int quiesce_counter; | 43 | - int64_t offset, int64_t bytes, |
27 | }; | 44 | - int64_t *cluster_offset, |
28 | 45 | - int64_t *cluster_bytes); | |
29 | typedef struct BlockBackendAIOCB { | 46 | +void bdrv_round_to_subclusters(BlockDriverState *bs, |
30 | @@ -XXX,XX +XXX,XX @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, | 47 | + int64_t offset, int64_t bytes, |
31 | void *opaque) | 48 | + int64_t *cluster_offset, |
49 | + int64_t *cluster_bytes); | ||
50 | |||
51 | void bdrv_get_backing_filename(BlockDriverState *bs, | ||
52 | char *filename, int filename_size); | ||
53 | diff --git a/block/io.c b/block/io.c | ||
54 | index XXXXXXX..XXXXXXX 100644 | ||
55 | --- a/block/io.c | ||
56 | +++ b/block/io.c | ||
57 | @@ -XXX,XX +XXX,XX @@ BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs) | ||
58 | } | ||
59 | |||
60 | /** | ||
61 | - * Round a region to cluster boundaries | ||
62 | + * Round a region to subcluster (if supported) or cluster boundaries | ||
63 | */ | ||
64 | void coroutine_fn GRAPH_RDLOCK | ||
65 | -bdrv_round_to_clusters(BlockDriverState *bs, int64_t offset, int64_t bytes, | ||
66 | - int64_t *cluster_offset, int64_t *cluster_bytes) | ||
67 | +bdrv_round_to_subclusters(BlockDriverState *bs, int64_t offset, int64_t bytes, | ||
68 | + int64_t *align_offset, int64_t *align_bytes) | ||
32 | { | 69 | { |
33 | /* All drivers that use blk_set_dev_ops() are qdevified and we want to keep | 70 | BlockDriverInfo bdi; |
34 | - * it that way, so we can assume blk->dev is a DeviceState if blk->dev_ops | 71 | IO_CODE(); |
35 | - * is set. */ | 72 | - if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { |
36 | + * it that way, so we can assume blk->dev, if present, is a DeviceState if | 73 | - *cluster_offset = offset; |
37 | + * blk->dev_ops is set. Non-device users may use dev_ops without device. */ | 74 | - *cluster_bytes = bytes; |
38 | assert(!blk->legacy_dev); | 75 | + if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.subcluster_size == 0) { |
39 | 76 | + *align_offset = offset; | |
40 | blk->dev_ops = ops; | 77 | + *align_bytes = bytes; |
41 | blk->dev_opaque = opaque; | 78 | } else { |
42 | + | 79 | - int64_t c = bdi.cluster_size; |
43 | + /* Are we currently quiesced? Should we enforce this right now? */ | 80 | - *cluster_offset = QEMU_ALIGN_DOWN(offset, c); |
44 | + if (blk->quiesce_counter && ops->drained_begin) { | 81 | - *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c); |
45 | + ops->drained_begin(opaque); | 82 | + int64_t c = bdi.subcluster_size; |
46 | + } | 83 | + *align_offset = QEMU_ALIGN_DOWN(offset, c); |
84 | + *align_bytes = QEMU_ALIGN_UP(offset - *align_offset + bytes, c); | ||
85 | } | ||
47 | } | 86 | } |
48 | 87 | ||
49 | /* | 88 | @@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes, |
50 | @@ -XXX,XX +XXX,XX @@ static void blk_root_drained_begin(BdrvChild *child) | 89 | void *bounce_buffer = NULL; |
51 | { | 90 | |
52 | BlockBackend *blk = child->opaque; | 91 | BlockDriver *drv = bs->drv; |
53 | 92 | - int64_t cluster_offset; | |
54 | + if (++blk->quiesce_counter == 1) { | 93 | - int64_t cluster_bytes; |
55 | + if (blk->dev_ops && blk->dev_ops->drained_begin) { | 94 | + int64_t align_offset; |
56 | + blk->dev_ops->drained_begin(blk->dev_opaque); | 95 | + int64_t align_bytes; |
57 | + } | 96 | int64_t skip_bytes; |
58 | + } | 97 | int ret; |
59 | + | 98 | int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, |
60 | /* Note that blk->root may not be accessible here yet if we are just | 99 | @@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes, |
61 | * attaching to a BlockDriverState that is drained. Use child instead. */ | 100 | * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which |
62 | 101 | * is one reason we loop rather than doing it all at once. | |
63 | @@ -XXX,XX +XXX,XX @@ static void blk_root_drained_begin(BdrvChild *child) | 102 | */ |
64 | static void blk_root_drained_end(BdrvChild *child) | 103 | - bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); |
65 | { | 104 | - skip_bytes = offset - cluster_offset; |
66 | BlockBackend *blk = child->opaque; | 105 | + bdrv_round_to_subclusters(bs, offset, bytes, &align_offset, &align_bytes); |
67 | + assert(blk->quiesce_counter); | 106 | + skip_bytes = offset - align_offset; |
68 | 107 | ||
69 | assert(blk->public.io_limits_disabled); | 108 | trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, |
70 | --blk->public.io_limits_disabled; | 109 | - cluster_offset, cluster_bytes); |
71 | + | 110 | + align_offset, align_bytes); |
72 | + if (--blk->quiesce_counter == 0) { | 111 | |
73 | + if (blk->dev_ops && blk->dev_ops->drained_end) { | 112 | - while (cluster_bytes) { |
74 | + blk->dev_ops->drained_end(blk->dev_opaque); | 113 | + while (align_bytes) { |
75 | + } | 114 | int64_t pnum; |
76 | + } | 115 | |
77 | } | 116 | if (skip_write) { |
78 | diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h | 117 | ret = 1; /* "already allocated", so nothing will be copied */ |
118 | - pnum = MIN(cluster_bytes, max_transfer); | ||
119 | + pnum = MIN(align_bytes, max_transfer); | ||
120 | } else { | ||
121 | - ret = bdrv_is_allocated(bs, cluster_offset, | ||
122 | - MIN(cluster_bytes, max_transfer), &pnum); | ||
123 | + ret = bdrv_is_allocated(bs, align_offset, | ||
124 | + MIN(align_bytes, max_transfer), &pnum); | ||
125 | if (ret < 0) { | ||
126 | /* | ||
127 | * Safe to treat errors in querying allocation as if | ||
128 | * unallocated; we'll probably fail again soon on the | ||
129 | * read, but at least that will set a decent errno. | ||
130 | */ | ||
131 | - pnum = MIN(cluster_bytes, max_transfer); | ||
132 | + pnum = MIN(align_bytes, max_transfer); | ||
133 | } | ||
134 | |||
135 | /* Stop at EOF if the image ends in the middle of the cluster */ | ||
136 | @@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes, | ||
137 | /* Must copy-on-read; use the bounce buffer */ | ||
138 | pnum = MIN(pnum, MAX_BOUNCE_BUFFER); | ||
139 | if (!bounce_buffer) { | ||
140 | - int64_t max_we_need = MAX(pnum, cluster_bytes - pnum); | ||
141 | + int64_t max_we_need = MAX(pnum, align_bytes - pnum); | ||
142 | int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER); | ||
143 | int64_t bounce_buffer_len = MIN(max_we_need, max_allowed); | ||
144 | |||
145 | @@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes, | ||
146 | } | ||
147 | qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum); | ||
148 | |||
149 | - ret = bdrv_driver_preadv(bs, cluster_offset, pnum, | ||
150 | + ret = bdrv_driver_preadv(bs, align_offset, pnum, | ||
151 | &local_qiov, 0, 0); | ||
152 | if (ret < 0) { | ||
153 | goto err; | ||
154 | @@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes, | ||
155 | /* FIXME: Should we (perhaps conditionally) be setting | ||
156 | * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy | ||
157 | * that still correctly reads as zero? */ | ||
158 | - ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, | ||
159 | + ret = bdrv_co_do_pwrite_zeroes(bs, align_offset, pnum, | ||
160 | BDRV_REQ_WRITE_UNCHANGED); | ||
161 | } else { | ||
162 | /* This does not change the data on the disk, it is not | ||
163 | * necessary to flush even in cache=writethrough mode. | ||
164 | */ | ||
165 | - ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, | ||
166 | + ret = bdrv_driver_pwritev(bs, align_offset, pnum, | ||
167 | &local_qiov, 0, | ||
168 | BDRV_REQ_WRITE_UNCHANGED); | ||
169 | } | ||
170 | @@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes, | ||
171 | } | ||
172 | } | ||
173 | |||
174 | - cluster_offset += pnum; | ||
175 | - cluster_bytes -= pnum; | ||
176 | + align_offset += pnum; | ||
177 | + align_bytes -= pnum; | ||
178 | progress += pnum - skip_bytes; | ||
179 | skip_bytes = 0; | ||
180 | } | ||
181 | diff --git a/block/mirror.c b/block/mirror.c | ||
79 | index XXXXXXX..XXXXXXX 100644 | 182 | index XXXXXXX..XXXXXXX 100644 |
80 | --- a/include/sysemu/block-backend.h | 183 | --- a/block/mirror.c |
81 | +++ b/include/sysemu/block-backend.h | 184 | +++ b/block/mirror.c |
82 | @@ -XXX,XX +XXX,XX @@ typedef struct BlockDevOps { | 185 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn mirror_cow_align(MirrorBlockJob *s, int64_t *offset, |
83 | * Runs when the size changed (e.g. monitor command block_resize) | 186 | need_cow |= !test_bit((*offset + *bytes - 1) / s->granularity, |
84 | */ | 187 | s->cow_bitmap); |
85 | void (*resize_cb)(void *opaque); | 188 | if (need_cow) { |
86 | + /* | 189 | - bdrv_round_to_clusters(blk_bs(s->target), *offset, *bytes, |
87 | + * Runs when the backend receives a drain request. | 190 | - &align_offset, &align_bytes); |
88 | + */ | 191 | + bdrv_round_to_subclusters(blk_bs(s->target), *offset, *bytes, |
89 | + void (*drained_begin)(void *opaque); | 192 | + &align_offset, &align_bytes); |
90 | + /* | 193 | } |
91 | + * Runs when the backend's last drain request ends. | 194 | |
92 | + */ | 195 | if (align_bytes > max_bytes) { |
93 | + void (*drained_end)(void *opaque); | 196 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) |
94 | } BlockDevOps; | 197 | int64_t target_offset; |
95 | 198 | int64_t target_bytes; | |
96 | /* This struct is embedded in (the private) BlockBackend struct and contains | 199 | WITH_GRAPH_RDLOCK_GUARD() { |
200 | - bdrv_round_to_clusters(blk_bs(s->target), offset, io_bytes, | ||
201 | - &target_offset, &target_bytes); | ||
202 | + bdrv_round_to_subclusters(blk_bs(s->target), offset, io_bytes, | ||
203 | + &target_offset, &target_bytes); | ||
204 | } | ||
205 | if (target_offset == offset && | ||
206 | target_bytes == io_bytes) { | ||
97 | -- | 207 | -- |
98 | 2.9.3 | 208 | 2.41.0 |
99 | |||
100 | diff view generated by jsdifflib |
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> |
---|---|---|---|
2 | 2 | ||
3 | Streaming or any other block job hangs when performed on a block device | 3 | Add a testcase which checks that allocations during copy-on-read are |
4 | that has a non-default iothread. This happens because the AioContext | 4 | performed on a subcluster basis when subclusters are enabled in the target |
5 | is acquired twice by block_job_defer_to_main_loop_bh and then released | 5 | image. |
6 | only once by BDRV_POLL_WHILE. (Insert rants on recursive mutexes, which | ||
7 | unfortunately are a temporary but necessary evil for iothreads at the | ||
8 | moment). | ||
9 | 6 | ||
10 | Luckily, the reason for the double acquisition is simple; the function | 7 | This testcase also triggers the following assert when the previous commit |
11 | acquires the AioContext for both the job iothread and the BDS iothread, | 8 | is not applied, so we check that as well: |
12 | in case the BDS iothread was changed while the job was running. It | ||
13 | is therefore enough to skip the second acquisition when the two | ||
14 | AioContexts are one and the same. | ||
15 | 9 | ||
16 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | 10 | qemu-io: ../block/io.c:1236: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed. |
11 | |||
17 | Reviewed-by: Eric Blake <eblake@redhat.com> | 12 | Reviewed-by: Eric Blake <eblake@redhat.com> |
18 | Reviewed-by: Jeff Cody <jcody@redhat.com> | 13 | Reviewed-by: Denis V. Lunev <den@openvz.org> |
19 | Message-id: 1490118490-5597-1-git-send-email-pbonzini@redhat.com | 14 | Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> |
20 | Signed-off-by: Jeff Cody <jcody@redhat.com> | 15 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> |
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
17 | Message-ID: <20230711172553.234055-4-andrey.drobyshev@virtuozzo.com> | ||
21 | --- | 18 | --- |
22 | blockjob.c | 8 ++++++-- | 19 | tests/qemu-iotests/197 | 29 +++++++++++++++++++++++++++++ |
23 | 1 file changed, 6 insertions(+), 2 deletions(-) | 20 | tests/qemu-iotests/197.out | 24 ++++++++++++++++++++++++ |
21 | 2 files changed, 53 insertions(+) | ||
24 | 22 | ||
25 | diff --git a/blockjob.c b/blockjob.c | 23 | diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197 |
24 | index XXXXXXX..XXXXXXX 100755 | ||
25 | --- a/tests/qemu-iotests/197 | ||
26 | +++ b/tests/qemu-iotests/197 | ||
27 | @@ -XXX,XX +XXX,XX @@ $QEMU_IO -f qcow2 -C -c 'read 0 1024' "$TEST_WRAP" | _filter_qemu_io | ||
28 | $QEMU_IO -f qcow2 -c map "$TEST_WRAP" | ||
29 | _check_test_img | ||
30 | |||
31 | +echo | ||
32 | +echo '=== Copy-on-read with subclusters ===' | ||
33 | +echo | ||
34 | + | ||
35 | +# Create base and top images 64K (1 cluster) each. Make subclusters enabled | ||
36 | +# for the top image | ||
37 | +_make_test_img 64K | ||
38 | +IMGPROTO=file IMGFMT=qcow2 TEST_IMG_FILE="$TEST_WRAP" \ | ||
39 | + _make_test_img --no-opts -o extended_l2=true -F "$IMGFMT" -b "$TEST_IMG" \ | ||
40 | + 64K | _filter_img_create | ||
41 | + | ||
42 | +$QEMU_IO -c "write -P 0xaa 0 64k" "$TEST_IMG" | _filter_qemu_io | ||
43 | + | ||
44 | +# Allocate individual subclusters in the top image, and not the whole cluster | ||
45 | +$QEMU_IO -c "write -P 0xbb 28K 2K" -c "write -P 0xcc 34K 2K" "$TEST_WRAP" \ | ||
46 | + | _filter_qemu_io | ||
47 | + | ||
48 | +# Only 2 subclusters should be allocated in the top image at this point | ||
49 | +$QEMU_IMG map "$TEST_WRAP" | _filter_qemu_img_map | ||
50 | + | ||
51 | +# Actual copy-on-read operation | ||
52 | +$QEMU_IO -C -c "read -P 0xaa 30K 4K" "$TEST_WRAP" | _filter_qemu_io | ||
53 | + | ||
54 | +# And here we should have 4 subclusters allocated right in the middle of the | ||
55 | +# top image. Make sure the whole cluster remains unallocated | ||
56 | +$QEMU_IMG map "$TEST_WRAP" | _filter_qemu_img_map | ||
57 | + | ||
58 | +_check_test_img | ||
59 | + | ||
60 | # success, all done | ||
61 | echo '*** done' | ||
62 | status=0 | ||
63 | diff --git a/tests/qemu-iotests/197.out b/tests/qemu-iotests/197.out | ||
26 | index XXXXXXX..XXXXXXX 100644 | 64 | index XXXXXXX..XXXXXXX 100644 |
27 | --- a/blockjob.c | 65 | --- a/tests/qemu-iotests/197.out |
28 | +++ b/blockjob.c | 66 | +++ b/tests/qemu-iotests/197.out |
29 | @@ -XXX,XX +XXX,XX @@ static void block_job_defer_to_main_loop_bh(void *opaque) | 67 | @@ -XXX,XX +XXX,XX @@ read 1024/1024 bytes at offset 0 |
30 | 68 | 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | |
31 | /* Fetch BDS AioContext again, in case it has changed */ | 69 | 1 KiB (0x400) bytes allocated at offset 0 bytes (0x0) |
32 | aio_context = blk_get_aio_context(data->job->blk); | 70 | No errors were found on the image. |
33 | - aio_context_acquire(aio_context); | 71 | + |
34 | + if (aio_context != data->aio_context) { | 72 | +=== Copy-on-read with subclusters === |
35 | + aio_context_acquire(aio_context); | 73 | + |
36 | + } | 74 | +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536 |
37 | 75 | +Formatting 'TEST_DIR/t.wrap.IMGFMT', fmt=IMGFMT size=65536 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT | |
38 | data->job->deferred_to_main_loop = false; | 76 | +wrote 65536/65536 bytes at offset 0 |
39 | data->fn(data->job, data->opaque); | 77 | +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) |
40 | 78 | +wrote 2048/2048 bytes at offset 28672 | |
41 | - aio_context_release(aio_context); | 79 | +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) |
42 | + if (aio_context != data->aio_context) { | 80 | +wrote 2048/2048 bytes at offset 34816 |
43 | + aio_context_release(aio_context); | 81 | +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) |
44 | + } | 82 | +Offset Length File |
45 | 83 | +0 0x7000 TEST_DIR/t.IMGFMT | |
46 | aio_context_release(data->aio_context); | 84 | +0x7000 0x800 TEST_DIR/t.wrap.IMGFMT |
47 | 85 | +0x7800 0x1000 TEST_DIR/t.IMGFMT | |
86 | +0x8800 0x800 TEST_DIR/t.wrap.IMGFMT | ||
87 | +0x9000 0x7000 TEST_DIR/t.IMGFMT | ||
88 | +read 4096/4096 bytes at offset 30720 | ||
89 | +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | ||
90 | +Offset Length File | ||
91 | +0 0x7000 TEST_DIR/t.IMGFMT | ||
92 | +0x7000 0x2000 TEST_DIR/t.wrap.IMGFMT | ||
93 | +0x9000 0x7000 TEST_DIR/t.IMGFMT | ||
94 | +No errors were found on the image. | ||
95 | *** done | ||
48 | -- | 96 | -- |
49 | 2.9.3 | 97 | 2.41.0 |
50 | |||
51 | diff view generated by jsdifflib |
1 | From: John Snow <jsnow@redhat.com> | 1 | liburing does not clear sqe->user_data. We must do it ourselves to avoid |
---|---|---|---|
2 | undefined behavior in process_cqe() when user_data is used. | ||
2 | 3 | ||
3 | The purpose of this shim is to allow us to pause pre-started jobs. | 4 | Note that fdmon-io_uring is currently disabled, so this is a latent bug |
4 | The purpose of *that* is to allow us to buffer a pause request that | 5 | that does not affect users. Let's merge this fix now to make it easier |
5 | will be able to take effect before the job ever does any work, allowing | 6 | to enable fdmon-io_uring in the future (and I'm working on that). |
6 | us to create jobs during a quiescent state (under which they will be | ||
7 | automatically paused), then resuming the jobs after the critical section | ||
8 | in any order, either: | ||
9 | 7 | ||
10 | (1) -block_job_start | 8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
11 | -block_job_resume (via e.g. drained_end) | 9 | Message-ID: <20230426212639.82310-1-stefanha@redhat.com> |
10 | --- | ||
11 | util/fdmon-io_uring.c | 2 ++ | ||
12 | 1 file changed, 2 insertions(+) | ||
12 | 13 | ||
13 | (2) -block_job_resume (via e.g. drained_end) | 14 | diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c |
14 | -block_job_start | ||
15 | |||
16 | The problem that requires a startup wrapper is the idea that a job must | ||
17 | start in the busy=true state only its first time-- all subsequent entries | ||
18 | require busy to be false, and the toggling of this state is otherwise | ||
19 | handled during existing pause and yield points. | ||
20 | |||
21 | The wrapper simply allows us to mandate that a job can "start," set busy | ||
22 | to true, then immediately pause only if necessary. We could avoid | ||
23 | requiring a wrapper, but all jobs would need to do it, so it's been | ||
24 | factored out here. | ||
25 | |||
26 | Signed-off-by: John Snow <jsnow@redhat.com> | ||
27 | Reviewed-by: Jeff Cody <jcody@redhat.com> | ||
28 | Message-id: 20170316212351.13797-2-jsnow@redhat.com | ||
29 | Signed-off-by: Jeff Cody <jcody@redhat.com> | ||
30 | --- | ||
31 | blockjob.c | 26 +++++++++++++++++++------- | ||
32 | 1 file changed, 19 insertions(+), 7 deletions(-) | ||
33 | |||
34 | diff --git a/blockjob.c b/blockjob.c | ||
35 | index XXXXXXX..XXXXXXX 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
36 | --- a/blockjob.c | 16 | --- a/util/fdmon-io_uring.c |
37 | +++ b/blockjob.c | 17 | +++ b/util/fdmon-io_uring.c |
38 | @@ -XXX,XX +XXX,XX @@ static bool block_job_started(BlockJob *job) | 18 | @@ -XXX,XX +XXX,XX @@ static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node) |
39 | return job->co; | 19 | #else |
20 | io_uring_prep_poll_remove(sqe, node); | ||
21 | #endif | ||
22 | + io_uring_sqe_set_data(sqe, NULL); | ||
40 | } | 23 | } |
41 | 24 | ||
42 | +/** | 25 | /* Add a timeout that self-cancels when another cqe becomes ready */ |
43 | + * All jobs must allow a pause point before entering their job proper. This | 26 | @@ -XXX,XX +XXX,XX @@ static void add_timeout_sqe(AioContext *ctx, int64_t ns) |
44 | + * ensures that jobs can be paused prior to being started, then resumed later. | 27 | |
45 | + */ | 28 | sqe = get_sqe(ctx); |
46 | +static void coroutine_fn block_job_co_entry(void *opaque) | 29 | io_uring_prep_timeout(sqe, &ts, 1, 0); |
47 | +{ | 30 | + io_uring_sqe_set_data(sqe, NULL); |
48 | + BlockJob *job = opaque; | ||
49 | + | ||
50 | + assert(job && job->driver && job->driver->start); | ||
51 | + block_job_pause_point(job); | ||
52 | + job->driver->start(job); | ||
53 | +} | ||
54 | + | ||
55 | void block_job_start(BlockJob *job) | ||
56 | { | ||
57 | assert(job && !block_job_started(job) && job->paused && | ||
58 | - !job->busy && job->driver->start); | ||
59 | - job->co = qemu_coroutine_create(job->driver->start, job); | ||
60 | - if (--job->pause_count == 0) { | ||
61 | - job->paused = false; | ||
62 | - job->busy = true; | ||
63 | - qemu_coroutine_enter(job->co); | ||
64 | - } | ||
65 | + job->driver && job->driver->start); | ||
66 | + job->co = qemu_coroutine_create(block_job_co_entry, job); | ||
67 | + job->pause_count--; | ||
68 | + job->busy = true; | ||
69 | + job->paused = false; | ||
70 | + qemu_coroutine_enter(job->co); | ||
71 | } | 31 | } |
72 | 32 | ||
73 | void block_job_ref(BlockJob *job) | 33 | /* Add sqes from ctx->submit_list for submission */ |
74 | -- | 34 | -- |
75 | 2.9.3 | 35 | 2.41.0 |
76 | |||
77 | diff view generated by jsdifflib |