It's safer to expand the in_flight section of a request so that it
starts before the coroutine is entered in the synchronous wrappers and
ends after the BDRV_POLL_WHILE loop. Note that in some circumstances
qemu_coroutine_enter may only schedule the coroutine rather than enter
it immediately.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
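A minimal sketch of the pattern applied below (bdrv_foo, FooCo and
bdrv_foo_co_entry are placeholder names, not code from this diff): the
in_flight increment moves out of the coroutine body and into the
synchronous wrapper, bracketing both the enter and the poll.

    /* Before: only the coroutine body marks the request in flight.
     * Between bdrv_coroutine_enter() (which may merely schedule the
     * coroutine) and its first run, in_flight is still zero. */
    int bdrv_foo(BlockDriverState *bs)
    {
        FooCo fooco = { .bs = bs, .ret = NOT_DONE };
        Coroutine *co;

        if (qemu_in_coroutine()) {
            bdrv_foo_co_entry(&fooco);            /* fast path */
        } else {
            co = qemu_coroutine_create(bdrv_foo_co_entry, &fooco);
            bdrv_coroutine_enter(bs, co);         /* may only schedule */
            BDRV_POLL_WHILE(bs, fooco.ret == NOT_DONE);
        }
        return fooco.ret;
    }

    /* After: the wrapper holds in_flight across the whole enter + poll
     * sequence, so a merely scheduled coroutine is already counted. */
    int bdrv_foo(BlockDriverState *bs)
    {
        FooCo fooco = { .bs = bs, .ret = NOT_DONE };
        Coroutine *co;

        bdrv_inc_in_flight(bs);
        if (qemu_in_coroutine()) {
            bdrv_foo_co_entry(&fooco);
        } else {
            co = qemu_coroutine_create(bdrv_foo_co_entry, &fooco);
            bdrv_coroutine_enter(bs, co);
            BDRV_POLL_WHILE(bs, fooco.ret == NOT_DONE);
        }
        bdrv_dec_in_flight(bs);

        return fooco.ret;
    }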
block/io.c | 155 ++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 119 insertions(+), 36 deletions(-)
diff --git a/block/io.c b/block/io.c
index dfbe68f428..9b57c7e422 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1511,7 +1511,8 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child,
return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
}
-int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
+/* To be called between exactly one pair of bdrv_inc/dec_in_flight() */
+static int coroutine_fn bdrv_do_preadv_part(BdrvChild *child,
int64_t offset, unsigned int bytes,
QEMUIOVector *qiov, size_t qiov_offset,
BdrvRequestFlags flags)
@@ -1540,8 +1541,6 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
return 0;
}
- bdrv_inc_in_flight(bs);
-
/* Don't do copy-on-read if we read data before write operation */
if (atomic_read(&bs->copy_on_read)) {
flags |= BDRV_REQ_COPY_ON_READ;
@@ -1554,13 +1553,26 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
bs->bl.request_alignment,
qiov, qiov_offset, flags);
tracked_request_end(&req);
- bdrv_dec_in_flight(bs);
bdrv_padding_destroy(&pad);
return ret;
}
+int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
+ int64_t offset, unsigned int bytes,
+ QEMUIOVector *qiov, size_t qiov_offset,
+ BdrvRequestFlags flags)
+{
+ int ret;
+
+ bdrv_inc_in_flight(child->bs);
+ ret = bdrv_do_preadv_part(child, offset, bytes, qiov, qiov_offset, flags);
+ bdrv_dec_in_flight(child->bs);
+
+ return ret;
+}
+
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
int64_t offset, int bytes, BdrvRequestFlags flags)
{
@@ -1922,7 +1934,8 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
}
-int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
+/* To be called between exactly one pair of bdrv_inc/dec_in_flight() */
+static int coroutine_fn bdrv_do_pwritev_part(BdrvChild *child,
int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset,
BdrvRequestFlags flags)
{
@@ -1962,7 +1975,6 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
return 0;
}
- bdrv_inc_in_flight(bs);
/*
* Align write if necessary by performing a read-modify-write cycle.
* Pad qiov with the read parts and be sure to have a tracked request not
@@ -1987,7 +1999,19 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
out:
tracked_request_end(&req);
- bdrv_dec_in_flight(bs);
+
+ return ret;
+}
+
+int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset,
+ BdrvRequestFlags flags)
+{
+ int ret;
+
+ bdrv_inc_in_flight(child->bs);
+ ret = bdrv_do_pwritev_part(child, offset, bytes, qiov, qiov_offset, flags);
+ bdrv_dec_in_flight(child->bs);
return ret;
}
@@ -2019,12 +2043,12 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque)
RwCo *rwco = opaque;
if (!rwco->is_write) {
- rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
- rwco->qiov->size, rwco->qiov,
+ rwco->ret = bdrv_do_preadv_part(rwco->child, rwco->offset,
+ rwco->qiov->size, rwco->qiov, 0,
rwco->flags);
} else {
- rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
- rwco->qiov->size, rwco->qiov,
+ rwco->ret = bdrv_do_pwritev_part(rwco->child, rwco->offset,
+ rwco->qiov->size, rwco->qiov, 0,
rwco->flags);
}
aio_wait_kick();
@@ -2047,6 +2071,8 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
.flags = flags,
};
+ bdrv_inc_in_flight(child->bs);
+
if (qemu_in_coroutine()) {
/* Fast-path if already in coroutine context */
bdrv_rw_co_entry(&rwco);
@@ -2055,6 +2081,9 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
bdrv_coroutine_enter(child->bs, co);
BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
}
+
+ bdrv_dec_in_flight(child->bs);
+
return rwco.ret;
}
@@ -2700,15 +2729,14 @@ typedef struct BdrvVmstateCo {
int ret;
} BdrvVmstateCo;
+/* To be called between exactly one pair of bdrv_inc/dec_in_flight() */
static int coroutine_fn
-bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
+bdrv_do_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
bool is_read)
{
BlockDriver *drv = bs->drv;
int ret = -ENOTSUP;
- bdrv_inc_in_flight(bs);
-
if (!drv) {
ret = -ENOMEDIUM;
} else if (drv->bdrv_load_vmstate) {
@@ -2718,17 +2746,18 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
ret = drv->bdrv_save_vmstate(bs, qiov, pos);
}
} else if (bs->file) {
- ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
+ bdrv_inc_in_flight(bs->file->bs);
+ ret = bdrv_do_rw_vmstate(bs->file->bs, qiov, pos, is_read);
+ bdrv_dec_in_flight(bs->file->bs);
}
- bdrv_dec_in_flight(bs);
return ret;
}
static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
BdrvVmstateCo *co = opaque;
- co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
+ co->ret = bdrv_do_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
aio_wait_kick();
}
@@ -2736,8 +2765,12 @@ static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
bool is_read)
{
+ int ret;
+
+ bdrv_inc_in_flight(bs);
+
if (qemu_in_coroutine()) {
- return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
+ ret = bdrv_do_rw_vmstate(bs, qiov, pos, is_read);
} else {
BdrvVmstateCo data = {
.bs = bs,
@@ -2750,8 +2783,12 @@ bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
bdrv_coroutine_enter(bs, co);
BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
- return data.ret;
+ ret = data.ret;
}
+
+ bdrv_dec_in_flight(bs);
+
+ return ret;
}
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
@@ -2829,16 +2866,14 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb)
/**************************************************************/
/* Coroutine block device emulation */
-int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
+/* To be called between exactly one pair of bdrv_inc/dec_in_flight() */
+static int coroutine_fn bdrv_do_flush(BlockDriverState *bs)
{
int current_gen;
- int ret = 0;
-
- bdrv_inc_in_flight(bs);
+ int ret;
- if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
- bdrv_is_sg(bs)) {
- goto early_exit;
+ if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || bdrv_is_sg(bs)) {
+ return 0;
}
qemu_co_mutex_lock(&bs->reqs_lock);
@@ -2936,8 +2971,17 @@ out:
qemu_co_queue_next(&bs->flush_queue);
qemu_co_mutex_unlock(&bs->reqs_lock);
-early_exit:
+ return ret;
+}
+
+int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
+{
+ int ret;
+
+ bdrv_inc_in_flight(bs);
+ ret = bdrv_do_flush(bs);
bdrv_dec_in_flight(bs);
+
return ret;
}
@@ -2950,7 +2994,7 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
FlushCo *rwco = opaque;
- rwco->ret = bdrv_co_flush(rwco->bs);
+ rwco->ret = bdrv_do_flush(rwco->bs);
aio_wait_kick();
}
@@ -2962,6 +3006,8 @@ int bdrv_flush(BlockDriverState *bs)
.ret = NOT_DONE,
};
+ bdrv_inc_in_flight(bs);
+
if (qemu_in_coroutine()) {
/* Fast-path if already in coroutine context */
bdrv_flush_co_entry(&flush_co);
@@ -2971,11 +3017,14 @@ int bdrv_flush(BlockDriverState *bs)
BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
}
+ bdrv_dec_in_flight(bs);
+
return flush_co.ret;
}
-int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
- int64_t bytes)
+/* To be called between exactly one pair of bdrv_inc/dec_in_flight() */
+static int coroutine_fn bdrv_do_pdiscard(BdrvChild *child, int64_t offset,
+ int64_t bytes)
{
BdrvTrackedRequest req;
int max_pdiscard, ret;
@@ -3013,7 +3062,6 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
head = offset % align;
tail = (offset + bytes) % align;
- bdrv_inc_in_flight(bs);
tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
@@ -3084,7 +3132,19 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
out:
bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
tracked_request_end(&req);
- bdrv_dec_in_flight(bs);
+ return ret;
+}
+
+int coroutine_fn bdrv_co_pdiscard(BdrvChild *child,
+ int64_t offset, int64_t bytes)
+{
+ int ret;
+ BlockDriverState *bs = child->bs;
+
+ bdrv_inc_in_flight(bs);
+ ret = bdrv_do_pdiscard(child, offset, bytes);
+ bdrv_dec_in_flight(bs);
+
return ret;
}
@@ -3113,6 +3173,8 @@ int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes)
.ret = NOT_DONE,
};
+ bdrv_inc_in_flight(child->bs);
+
if (qemu_in_coroutine()) {
/* Fast-path if already in coroutine context */
bdrv_pdiscard_co_entry(&rwco);
@@ -3122,6 +3184,8 @@ int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes)
BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
}
+ bdrv_dec_in_flight(child->bs);
+
return rwco.ret;
}
@@ -3412,9 +3476,12 @@ static void bdrv_parent_cb_resize(BlockDriverState *bs)
* If 'exact' is true, the file must be resized to exactly the given
* 'offset'. Otherwise, it is sufficient for the node to be at least
* 'offset' bytes in length.
+ *
+ * To be called between exactly one pair of bdrv_inc/dec_in_flight()
*/
-int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
- PreallocMode prealloc, Error **errp)
+static int coroutine_fn bdrv_do_truncate(BdrvChild *child,
+ int64_t offset, bool exact,
+ PreallocMode prealloc, Error **errp)
{
BlockDriverState *bs = child->bs;
BlockDriver *drv = bs->drv;
@@ -3445,7 +3512,6 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
new_bytes = 0;
}
- bdrv_inc_in_flight(bs);
tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
BDRV_TRACKED_TRUNCATE);
@@ -3494,6 +3560,19 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
out:
tracked_request_end(&req);
+
+ return ret;
+}
+
+int coroutine_fn bdrv_co_truncate(BdrvChild *child,
+ int64_t offset, bool exact,
+ PreallocMode prealloc, Error **errp)
+{
+ int ret;
+ BlockDriverState *bs = child->bs;
+
+ bdrv_inc_in_flight(bs);
+ ret = bdrv_do_truncate(child, offset, exact, prealloc, errp);
bdrv_dec_in_flight(bs);
return ret;
@@ -3511,7 +3590,7 @@ typedef struct TruncateCo {
static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
{
TruncateCo *tco = opaque;
- tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->exact,
+ tco->ret = bdrv_do_truncate(tco->child, tco->offset, tco->exact,
tco->prealloc, tco->errp);
aio_wait_kick();
}
@@ -3529,6 +3608,8 @@ int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
.ret = NOT_DONE,
};
+ bdrv_inc_in_flight(child->bs);
+
if (qemu_in_coroutine()) {
/* Fast-path if already in coroutine context */
bdrv_truncate_co_entry(&tco);
@@ -3538,5 +3619,7 @@ int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE);
}
+ bdrv_dec_in_flight(child->bs);
+
return tco.ret;
}
--
2.21.0
On Wed, Apr 08, 2020 at 12:30:47PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> It's safer to expand the in_flight section of a request so that it
Please explain what exactly "safer" means. If I understand correctly
this is just a refactoring and does not fix bugs that have been hit in
the real world.

Is this just a general attempt to avoid accidentally performing
operations that need to happen as part of the request after the dec
call?
> @@ -2718,17 +2746,18 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
> ret = drv->bdrv_save_vmstate(bs, qiov, pos);
> }
> } else if (bs->file) {
> - ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
> + bdrv_inc_in_flight(bs->file->bs);
> + ret = bdrv_do_rw_vmstate(bs->file->bs, qiov, pos, is_read);
> + bdrv_dec_in_flight(bs->file->bs);
Here we inc/dec...
> }
>
> - bdrv_dec_in_flight(bs);
> return ret;
> }
>
> static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
> {
> BdrvVmstateCo *co = opaque;
> - co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
> + co->ret = bdrv_do_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
...here we don't. The code is correct, but bdrv_co_rw_vmstate_entry()
should also document that its caller must inc/dec.
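For instance (the wording below is a suggestion in the style of the
comments this patch already adds, not text from the patch itself):

    /* To be called between exactly one pair of bdrv_inc/dec_in_flight() */
    static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)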
> @@ -2950,7 +2994,7 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
> {
> FlushCo *rwco = opaque;
>
> - rwco->ret = bdrv_co_flush(rwco->bs);
> + rwco->ret = bdrv_do_flush(rwco->bs);
> aio_wait_kick();
> }
This function should also document that the caller must inc/dec.
20.04.2020 19:22, Stefan Hajnoczi wrote:
> On Wed, Apr 08, 2020 at 12:30:47PM +0300, Vladimir Sementsov-Ogievskiy wrote:
>> It's safer to expand the in_flight section of a request so that it
>
> Please explain what exactly "safer" means. If I understand correctly
> this is just a refactoring and does not fix bugs that have been hit in
> the real world.
>
> Is this just a general attempt to avoid accidentally performing
> operations that need to happen as part of the request after the dec
> call?
Consider write.

It's possible that qemu_coroutine_enter only schedules execution; assume
such a case. Then we may possibly have the following:

1. Somehow check that we are not in a drained section in outer code.

2. Call bdrv_pwritev(), assuming that it will increase in_flight, which
   will protect us from starting a drained section.

3. It calls bdrv_prwv_co -> bdrv_coroutine_enter (in_flight not yet
   increased).

4. Assume the coroutine is not yet actually entered, only scheduled, and
   we go to some code which starts a drained section (as in_flight is
   zero).

5. The scheduled coroutine starts and blindly increases in_flight, and
   we are in a drained section with an in-flight request.

The series does the same thing for block/io.c that Kevin's "block: Fix
blk->in_flight during blk_wait_while_drained()" does for the blk layer.
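To make the window concrete, here is that interleaving as a sketch
(assuming bdrv_coroutine_enter only schedules the coroutine, e.g. when
entering from another AioContext; the comments mark who runs what):

    /* Main thread */
    bdrv_pwritev(child, ...);
        /* -> bdrv_prwv_co(): creates the coroutine and calls
         * bdrv_coroutine_enter(bs, co), which only *schedules* co;
         * bs->in_flight is still 0 here (before this patch). */

    bdrv_drained_begin(bs);
        /* Sees bs->in_flight == 0 and concludes the node is quiesced. */

    /* Scheduled coroutine, running later */
    bdrv_co_pwritev(child, ...);
        /* Blindly does bdrv_inc_in_flight(bs): the write now runs
         * inside a supposedly drained section. */

With the patch, bdrv_prwv_co() increments in_flight before
bdrv_coroutine_enter(), so bdrv_drained_begin() waits for the request
even if the coroutine has only been scheduled.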
>
>> @@ -2718,17 +2746,18 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
>> ret = drv->bdrv_save_vmstate(bs, qiov, pos);
>> }
>> } else if (bs->file) {
>> - ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
>> + bdrv_inc_in_flight(bs->file->bs);
>> + ret = bdrv_do_rw_vmstate(bs->file->bs, qiov, pos, is_read);
>> + bdrv_dec_in_flight(bs->file->bs);
>
> Here we inc/dec...
>
>> }
>>
>> - bdrv_dec_in_flight(bs);
>> return ret;
>> }
>>
>> static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
>> {
>> BdrvVmstateCo *co = opaque;
>> - co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
>> + co->ret = bdrv_do_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
>
> ...here we don't. The code is correct, but bdrv_co_rw_vmstate_entry()
> should also document that its caller must inc/dec.
>
>> @@ -2950,7 +2994,7 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
>> {
>> FlushCo *rwco = opaque;
>>
>> - rwco->ret = bdrv_co_flush(rwco->bs);
>> + rwco->ret = bdrv_do_flush(rwco->bs);
>> aio_wait_kick();
>> }
>
> This function should also document that the caller must inc/dec.
>
--
Best regards,
Vladimir
On Wed, Apr 22, 2020 at 04:47:07PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> 20.04.2020 19:22, Stefan Hajnoczi wrote:
> > On Wed, Apr 08, 2020 at 12:30:47PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> > > It's safer to expand the in_flight section of a request so that it
> >
> > Please explain what exactly "safer" means. If I understand correctly
> > this is just a refactoring and does not fix bugs that have been hit in
> > the real world.
> >
> > Is this just a general attempt to avoid accidentally performing
> > operations that need to happen as part of the request after the dec
> > call?
>
> Consider write.
>
> It's possible that qemu_coroutine_enter only schedules execution; assume
> such a case. Then we may possibly have the following:
>
> 1. Somehow check that we are not in a drained section in outer code.
>
> 2. Call bdrv_pwritev(), assuming that it will increase in_flight, which
>    will protect us from starting a drained section.
>
> 3. It calls bdrv_prwv_co -> bdrv_coroutine_enter (in_flight not yet
>    increased).
>
> 4. Assume the coroutine is not yet actually entered, only scheduled, and
>    we go to some code which starts a drained section (as in_flight is
>    zero).
>
> 5. The scheduled coroutine starts and blindly increases in_flight, and
>    we are in a drained section with an in-flight request.
>
> The series does the same thing for block/io.c that Kevin's "block: Fix
> blk->in_flight during blk_wait_while_drained()" does for the blk layer.

Please include this in the commit description.

Thanks!

Stefan