It's safer to expand the in_flight section of a request so that it
starts before the coroutine is entered in the synchronous wrappers and
ends after the BDRV_POLL_WHILE loop. Note that in some circumstances
qemu_coroutine_enter may only schedule the coroutine rather than enter
it immediately.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
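A minimal sketch of the pattern applied below (bdrv_foo, FooCo and
bdrv_foo_co_entry are placeholder names, not code from this diff): the
in_flight increment moves out of the coroutine body and into the
synchronous wrapper, bracketing both the enter and the poll.

    /* Before: only the coroutine body marks the request in flight.
     * Between bdrv_coroutine_enter() (which may merely schedule the
     * coroutine) and its first run, in_flight is still zero. */
    int bdrv_foo(BlockDriverState *bs)
    {
        FooCo fooco = { .bs = bs, .ret = NOT_DONE };
        Coroutine *co;

        if (qemu_in_coroutine()) {
            bdrv_foo_co_entry(&fooco);            /* fast path */
        } else {
            co = qemu_coroutine_create(bdrv_foo_co_entry, &fooco);
            bdrv_coroutine_enter(bs, co);         /* may only schedule */
            BDRV_POLL_WHILE(bs, fooco.ret == NOT_DONE);
        }
        return fooco.ret;
    }

    /* After: the wrapper holds in_flight across the whole enter + poll
     * sequence, so a merely scheduled coroutine is already counted. */
    int bdrv_foo(BlockDriverState *bs)
    {
        FooCo fooco = { .bs = bs, .ret = NOT_DONE };
        Coroutine *co;

        bdrv_inc_in_flight(bs);
        if (qemu_in_coroutine()) {
            bdrv_foo_co_entry(&fooco);
        } else {
            co = qemu_coroutine_create(bdrv_foo_co_entry, &fooco);
            bdrv_coroutine_enter(bs, co);
            BDRV_POLL_WHILE(bs, fooco.ret == NOT_DONE);
        }
        bdrv_dec_in_flight(bs);

        return fooco.ret;
    }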
block/io.c | 155 ++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 119 insertions(+), 36 deletions(-)
diff --git a/block/io.c b/block/io.c
index dfbe68f428..9b57c7e422 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1511,7 +1511,8 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child,
return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
}
-int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
+/* To be called between exactly one pair of bdrv_inc/dec_in_flight() */
+static int coroutine_fn bdrv_do_preadv_part(BdrvChild *child,
int64_t offset, unsigned int bytes,
QEMUIOVector *qiov, size_t qiov_offset,
BdrvRequestFlags flags)
@@ -1540,8 +1541,6 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
return 0;
}
- bdrv_inc_in_flight(bs);
-
/* Don't do copy-on-read if we read data before write operation */
if (atomic_read(&bs->copy_on_read)) {
flags |= BDRV_REQ_COPY_ON_READ;
@@ -1554,13 +1553,26 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
bs->bl.request_alignment,
qiov, qiov_offset, flags);
tracked_request_end(&req);
- bdrv_dec_in_flight(bs);
bdrv_padding_destroy(&pad);
return ret;
}
+int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
+ int64_t offset, unsigned int bytes,
+ QEMUIOVector *qiov, size_t qiov_offset,
+ BdrvRequestFlags flags)
+{
+ int ret;
+
+ bdrv_inc_in_flight(child->bs);
+ ret = bdrv_do_preadv_part(child, offset, bytes, qiov, qiov_offset, flags);
+ bdrv_dec_in_flight(child->bs);
+
+ return ret;
+}
+
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
int64_t offset, int bytes, BdrvRequestFlags flags)
{
@@ -1922,7 +1934,8 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
}
-int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
+/* To be called between exactly one pair of bdrv_inc/dec_in_flight() */
+static int coroutine_fn bdrv_do_pwritev_part(BdrvChild *child,
int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset,
BdrvRequestFlags flags)
{
@@ -1962,7 +1975,6 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
return 0;
}
- bdrv_inc_in_flight(bs);
/*
* Align write if necessary by performing a read-modify-write cycle.
* Pad qiov with the read parts and be sure to have a tracked request not
@@ -1987,7 +1999,19 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
out:
tracked_request_end(&req);
- bdrv_dec_in_flight(bs);
+
+ return ret;
+}
+
+int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset,
+ BdrvRequestFlags flags)
+{
+ int ret;
+
+ bdrv_inc_in_flight(child->bs);
+ ret = bdrv_do_pwritev_part(child, offset, bytes, qiov, qiov_offset, flags);
+ bdrv_dec_in_flight(child->bs);
return ret;
}
@@ -2019,12 +2043,12 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque)
RwCo *rwco = opaque;
if (!rwco->is_write) {
- rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
- rwco->qiov->size, rwco->qiov,
+ rwco->ret = bdrv_do_preadv_part(rwco->child, rwco->offset,
+ rwco->qiov->size, rwco->qiov, 0,
rwco->flags);
} else {
- rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
- rwco->qiov->size, rwco->qiov,
+ rwco->ret = bdrv_do_pwritev_part(rwco->child, rwco->offset,
+ rwco->qiov->size, rwco->qiov, 0,
rwco->flags);
}
aio_wait_kick();
@@ -2047,6 +2071,8 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
.flags = flags,
};
+ bdrv_inc_in_flight(child->bs);
+
if (qemu_in_coroutine()) {
/* Fast-path if already in coroutine context */
bdrv_rw_co_entry(&rwco);
@@ -2055,6 +2081,9 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
bdrv_coroutine_enter(child->bs, co);
BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
}
+
+ bdrv_dec_in_flight(child->bs);
+
return rwco.ret;
}
@@ -2700,15 +2729,14 @@ typedef struct BdrvVmstateCo {
int ret;
} BdrvVmstateCo;
+/* To be called between exactly one pair of bdrv_inc/dec_in_flight() */
static int coroutine_fn
-bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
+bdrv_do_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
bool is_read)
{
BlockDriver *drv = bs->drv;
int ret = -ENOTSUP;
- bdrv_inc_in_flight(bs);
-
if (!drv) {
ret = -ENOMEDIUM;
} else if (drv->bdrv_load_vmstate) {
@@ -2718,17 +2746,18 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
ret = drv->bdrv_save_vmstate(bs, qiov, pos);
}
} else if (bs->file) {
- ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
+ bdrv_inc_in_flight(bs->file->bs);
+ ret = bdrv_do_rw_vmstate(bs->file->bs, qiov, pos, is_read);
+ bdrv_dec_in_flight(bs->file->bs);
}
- bdrv_dec_in_flight(bs);
return ret;
}
static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
BdrvVmstateCo *co = opaque;
- co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
+ co->ret = bdrv_do_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
aio_wait_kick();
}
@@ -2736,8 +2765,12 @@ static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
bool is_read)
{
+ int ret;
+
+ bdrv_inc_in_flight(bs);
+
if (qemu_in_coroutine()) {
- return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
+ ret = bdrv_do_rw_vmstate(bs, qiov, pos, is_read);
} else {
BdrvVmstateCo data = {
.bs = bs,
@@ -2750,8 +2783,12 @@ bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
bdrv_coroutine_enter(bs, co);
BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
- return data.ret;
+ ret = data.ret;
}
+
+ bdrv_dec_in_flight(bs);
+
+ return ret;
}
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
@@ -2829,16 +2866,14 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb)
/**************************************************************/
/* Coroutine block device emulation */
-int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
+/* To be called between exactly one pair of bdrv_inc/dec_in_flight() */
+static int coroutine_fn bdrv_do_flush(BlockDriverState *bs)
{
int current_gen;
- int ret = 0;
-
- bdrv_inc_in_flight(bs);
+ int ret;
- if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
- bdrv_is_sg(bs)) {
- goto early_exit;
+ if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || bdrv_is_sg(bs)) {
+ return 0;
}
qemu_co_mutex_lock(&bs->reqs_lock);
@@ -2936,8 +2971,17 @@ out:
qemu_co_queue_next(&bs->flush_queue);
qemu_co_mutex_unlock(&bs->reqs_lock);
-early_exit:
+ return ret;
+}
+
+int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
+{
+ int ret;
+
+ bdrv_inc_in_flight(bs);
+ ret = bdrv_do_flush(bs);
bdrv_dec_in_flight(bs);
+
return ret;
}
@@ -2950,7 +2994,7 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
FlushCo *rwco = opaque;
- rwco->ret = bdrv_co_flush(rwco->bs);
+ rwco->ret = bdrv_do_flush(rwco->bs);
aio_wait_kick();
}
@@ -2962,6 +3006,8 @@ int bdrv_flush(BlockDriverState *bs)
.ret = NOT_DONE,
};
+ bdrv_inc_in_flight(bs);
+
if (qemu_in_coroutine()) {
/* Fast-path if already in coroutine context */
bdrv_flush_co_entry(&flush_co);
@@ -2971,11 +3017,14 @@ int bdrv_flush(BlockDriverState *bs)
BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
}
+ bdrv_dec_in_flight(bs);
+
return flush_co.ret;
}
-int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
- int64_t bytes)
+/* To be called between exactly one pair of bdrv_inc/dec_in_flight() */
+static int coroutine_fn bdrv_do_pdiscard(BdrvChild *child, int64_t offset,
+ int64_t bytes)
{
BdrvTrackedRequest req;
int max_pdiscard, ret;
@@ -3013,7 +3062,6 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
head = offset % align;
tail = (offset + bytes) % align;
- bdrv_inc_in_flight(bs);
tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
@@ -3084,7 +3132,19 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
out:
bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
tracked_request_end(&req);
- bdrv_dec_in_flight(bs);
+ return ret;
+}
+
+int coroutine_fn bdrv_co_pdiscard(BdrvChild *child,
+ int64_t offset, int64_t bytes)
+{
+ int ret;
+ BlockDriverState *bs = child->bs;
+
+ bdrv_inc_in_flight(bs);
+ ret = bdrv_do_pdiscard(child, offset, bytes);
+ bdrv_dec_in_flight(bs);
+
return ret;
}
@@ -3113,6 +3173,8 @@ int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes)
.ret = NOT_DONE,
};
+ bdrv_inc_in_flight(child->bs);
+
if (qemu_in_coroutine()) {
/* Fast-path if already in coroutine context */
bdrv_pdiscard_co_entry(&rwco);
@@ -3122,6 +3184,8 @@ int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes)
BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
}
+ bdrv_dec_in_flight(child->bs);
+
return rwco.ret;
}
@@ -3412,9 +3476,12 @@ static void bdrv_parent_cb_resize(BlockDriverState *bs)
* If 'exact' is true, the file must be resized to exactly the given
* 'offset'. Otherwise, it is sufficient for the node to be at least
* 'offset' bytes in length.
+ *
+ * To be called between exactly one pair of bdrv_inc/dec_in_flight()
*/
-int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
- PreallocMode prealloc, Error **errp)
+static int coroutine_fn bdrv_do_truncate(BdrvChild *child,
+ int64_t offset, bool exact,
+ PreallocMode prealloc, Error **errp)
{
BlockDriverState *bs = child->bs;
BlockDriver *drv = bs->drv;
@@ -3445,7 +3512,6 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
new_bytes = 0;
}
- bdrv_inc_in_flight(bs);
tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
BDRV_TRACKED_TRUNCATE);
@@ -3494,6 +3560,19 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
out:
tracked_request_end(&req);
+
+ return ret;
+}
+
+int coroutine_fn bdrv_co_truncate(BdrvChild *child,
+ int64_t offset, bool exact,
+ PreallocMode prealloc, Error **errp)
+{
+ int ret;
+ BlockDriverState *bs = child->bs;
+
+ bdrv_inc_in_flight(bs);
+ ret = bdrv_do_truncate(child, offset, exact, prealloc, errp);
bdrv_dec_in_flight(bs);
return ret;
@@ -3511,7 +3590,7 @@ typedef struct TruncateCo {
static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
{
TruncateCo *tco = opaque;
- tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->exact,
+ tco->ret = bdrv_do_truncate(tco->child, tco->offset, tco->exact,
tco->prealloc, tco->errp);
aio_wait_kick();
}
@@ -3529,6 +3608,8 @@ int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
.ret = NOT_DONE,
};
+ bdrv_inc_in_flight(child->bs);
+
if (qemu_in_coroutine()) {
/* Fast-path if already in coroutine context */
bdrv_truncate_co_entry(&tco);
@@ -3538,5 +3619,7 @@ int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE);
}
+ bdrv_dec_in_flight(child->bs);
+
return tco.ret;
}
--
2.21.0
On Wed, Apr 08, 2020 at 12:30:47PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> It's safer to expand the in_flight section of a request so that it
Please explain what exactly "safer" means. If I understand correctly
this is just a refactoring and does not fix bugs that have been hit in
the real world.

Is this just a general attempt to avoid accidentally performing
operations that need to happen as part of the request after the dec
call?
> @@ -2718,17 +2746,18 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
> ret = drv->bdrv_save_vmstate(bs, qiov, pos);
> }
> } else if (bs->file) {
> - ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
> + bdrv_inc_in_flight(bs->file->bs);
> + ret = bdrv_do_rw_vmstate(bs->file->bs, qiov, pos, is_read);
> + bdrv_dec_in_flight(bs->file->bs);
Here we inc/dec...
> }
>
> - bdrv_dec_in_flight(bs);
> return ret;
> }
>
> static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
> {
> BdrvVmstateCo *co = opaque;
> - co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
> + co->ret = bdrv_do_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
...here we don't. The code is correct, but bdrv_co_rw_vmstate_entry()
should also document that its caller must inc/dec.
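For instance (the wording below is a suggestion in the style of the
comments this patch already adds, not text from the patch itself):

    /* To be called between exactly one pair of bdrv_inc/dec_in_flight() */
    static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)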
> @@ -2950,7 +2994,7 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
> {
> FlushCo *rwco = opaque;
>
> - rwco->ret = bdrv_co_flush(rwco->bs);
> + rwco->ret = bdrv_do_flush(rwco->bs);
> aio_wait_kick();
> }
This function should also document that the caller must inc/dec.
20.04.2020 19:22, Stefan Hajnoczi wrote:
> On Wed, Apr 08, 2020 at 12:30:47PM +0300, Vladimir Sementsov-Ogievskiy wrote:
>> It's safer to expand the in_flight section of a request so that it
>
> Please explain what exactly "safer" means. If I understand correctly
> this is just a refactoring and does not fix bugs that have been hit in
> the real world.
>
> Is this just a general attempt to avoid accidentally performing
> operations that need to happen as part of the request after the dec
> call?
Consider write.

It's possible that qemu_coroutine_enter only schedules execution; assume
such a case. Then we may possibly have the following:

1. Somehow check that we are not in a drained section in outer code.

2. Call bdrv_pwritev(), assuming that it will increase in_flight, which
   will protect us from starting a drained section.

3. It calls bdrv_prwv_co -> bdrv_coroutine_enter (in_flight not yet
   increased).

4. Assume the coroutine is not yet actually entered, only scheduled, and
   we go to some code which starts a drained section (as in_flight is
   zero).

5. The scheduled coroutine starts and blindly increases in_flight, and
   we are in a drained section with an in-flight request.

The series does the same thing for block/io.c that Kevin's "block: Fix
blk->in_flight during blk_wait_while_drained()" does for the blk layer.
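To make the window concrete, here is that interleaving as a sketch
(assuming bdrv_coroutine_enter only schedules the coroutine, e.g. when
entering from another AioContext; the comments mark who runs what):

    /* Main thread */
    bdrv_pwritev(child, ...);
        /* -> bdrv_prwv_co(): creates the coroutine and calls
         * bdrv_coroutine_enter(bs, co), which only *schedules* co;
         * bs->in_flight is still 0 here (before this patch). */

    bdrv_drained_begin(bs);
        /* Sees bs->in_flight == 0 and concludes the node is quiesced. */

    /* Scheduled coroutine, running later */
    bdrv_co_pwritev(child, ...);
        /* Blindly does bdrv_inc_in_flight(bs): the write now runs
         * inside a supposedly drained section. */

With the patch, bdrv_prwv_co() increments in_flight before
bdrv_coroutine_enter(), so bdrv_drained_begin() waits for the request
even if the coroutine has only been scheduled.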
>
>> @@ -2718,17 +2746,18 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
>> ret = drv->bdrv_save_vmstate(bs, qiov, pos);
>> }
>> } else if (bs->file) {
>> - ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
>> + bdrv_inc_in_flight(bs->file->bs);
>> + ret = bdrv_do_rw_vmstate(bs->file->bs, qiov, pos, is_read);
>> + bdrv_dec_in_flight(bs->file->bs);
>
> Here we inc/dec...
>
>> }
>>
>> - bdrv_dec_in_flight(bs);
>> return ret;
>> }
>>
>> static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
>> {
>> BdrvVmstateCo *co = opaque;
>> - co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
>> + co->ret = bdrv_do_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
>
> ...here we don't. The code is correct, but bdrv_co_rw_vmstate_entry()
> should also document that its caller must inc/dec.
>
>> @@ -2950,7 +2994,7 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
>> {
>> FlushCo *rwco = opaque;
>>
>> - rwco->ret = bdrv_co_flush(rwco->bs);
>> + rwco->ret = bdrv_do_flush(rwco->bs);
>> aio_wait_kick();
>> }
>
> This function should also document that the caller must inc/dec.
>
--
Best regards,
Vladimir
On Wed, Apr 22, 2020 at 04:47:07PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> 20.04.2020 19:22, Stefan Hajnoczi wrote:
> > On Wed, Apr 08, 2020 at 12:30:47PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> > > It's safer to expand the in_flight section of a request so that it
> >
> > Please explain what exactly "safer" means. If I understand correctly
> > this is just a refactoring and does not fix bugs that have been hit in
> > the real world.
> >
> > Is this just a general attempt to avoid accidentally performing
> > operations that need to happen as part of the request after the dec
> > call?
>
> Consider write.
>
> It's possible that qemu_coroutine_enter only schedules execution; assume
> such a case. Then we may possibly have the following:
>
> 1. Somehow check that we are not in a drained section in outer code.
>
> 2. Call bdrv_pwritev(), assuming that it will increase in_flight, which
>    will protect us from starting a drained section.
>
> 3. It calls bdrv_prwv_co -> bdrv_coroutine_enter (in_flight not yet
>    increased).
>
> 4. Assume the coroutine is not yet actually entered, only scheduled, and
>    we go to some code which starts a drained section (as in_flight is
>    zero).
>
> 5. The scheduled coroutine starts and blindly increases in_flight, and
>    we are in a drained section with an in-flight request.
>
> The series does the same thing for block/io.c that Kevin's "block: Fix
> blk->in_flight during blk_wait_while_drained()" does for the blk layer.

Please include this in the commit description.

Thanks!

Stefan