[Qemu-devel] [PATCH 4/4] block-backend: Queue requests while drained

Kevin Wolf posted 4 patches 6 years, 6 months ago
Maintainers: Max Reitz <mreitz@redhat.com>, John Snow <jsnow@redhat.com>, Kevin Wolf <kwolf@redhat.com>
There is a newer version of this series
[Qemu-devel] [PATCH 4/4] block-backend: Queue requests while drained
Posted by Kevin Wolf 6 years, 6 months ago
This fixes devices like IDE that can still start new requests from I/O
handlers in the CPU thread while the block backend is drained.

The basic assumption is that in a drain section, no new requests should
be allowed through a BlockBackend (blk_drained_begin/end don't exist,
we get drain sections only on the node level). However, there are two
special cases where requests should not be queued:

1. Block jobs: We already make sure that block jobs are paused in a
   drain section, so they won't start new requests. However, if the
   drain_begin is called on the job's BlockBackend first, it can happen
   that we deadlock because the job stays busy until it reaches a pause
   point - which it can't if its requests aren't processed any more.

   The proper solution here would be to make all requests through the
   job's filter node instead of using a BlockBackend. For now, just
   disabling request queuing on the job BlockBackend is simpler.

2. In test cases where making requests through bdrv_* would be
   cumbersome because we'd need a BdrvChild. As we already got the
   functionality to disable request queuing from 1., use it in tests,
   too, for convenience.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/sysemu/block-backend.h | 11 +++---
 block/backup.c                 |  1 +
 block/block-backend.c          | 69 +++++++++++++++++++++++++++++-----
 block/commit.c                 |  2 +
 block/mirror.c                 |  6 ++-
 blockjob.c                     |  3 ++
 tests/test-bdrv-drain.c        |  1 +
 7 files changed, 76 insertions(+), 17 deletions(-)

diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 7320b58467..d453a4e9a1 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -104,6 +104,7 @@ void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm);
 
 void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow);
 void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow);
+void blk_set_disable_request_queuing(BlockBackend *blk, bool disable);
 void blk_iostatus_enable(BlockBackend *blk);
 bool blk_iostatus_is_enabled(const BlockBackend *blk);
 BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk);
@@ -119,10 +120,10 @@ BlockBackend *blk_by_qdev_id(const char *id, Error **errp);
 void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, void *opaque);
 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                                unsigned int bytes, QEMUIOVector *qiov,
-                               BdrvRequestFlags flags);
+                               BdrvRequestFlags flags, bool wait_while_drained);
 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
-                               unsigned int bytes, QEMUIOVector *qiov,
-                               BdrvRequestFlags flags);
+                                unsigned int bytes, QEMUIOVector *qiov,
+                                BdrvRequestFlags flags, bool wait_while_drained);
 
 static inline int coroutine_fn blk_co_pread(BlockBackend *blk, int64_t offset,
                                             unsigned int bytes, void *buf,
@@ -130,7 +131,7 @@ static inline int coroutine_fn blk_co_pread(BlockBackend *blk, int64_t offset,
 {
     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
 
-    return blk_co_preadv(blk, offset, bytes, &qiov, flags);
+    return blk_co_preadv(blk, offset, bytes, &qiov, flags, true);
 }
 
 static inline int coroutine_fn blk_co_pwrite(BlockBackend *blk, int64_t offset,
@@ -139,7 +140,7 @@ static inline int coroutine_fn blk_co_pwrite(BlockBackend *blk, int64_t offset,
 {
     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
 
-    return blk_co_pwritev(blk, offset, bytes, &qiov, flags);
+    return blk_co_pwritev(blk, offset, bytes, &qiov, flags, true);
 }
 
 int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
diff --git a/block/backup.c b/block/backup.c
index 715e1d3be8..f66b2f4ee7 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -635,6 +635,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
     if (ret < 0) {
         goto error;
     }
+    blk_set_disable_request_queuing(job->target, true);
 
     job->on_source_error = on_source_error;
     job->on_target_error = on_target_error;
diff --git a/block/block-backend.c b/block/block-backend.c
index fdd6b01ecf..603b281743 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -79,6 +79,9 @@ struct BlockBackend {
     QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;
 
     int quiesce_counter;
+    CoQueue queued_requests;
+    bool disable_request_queuing;
+
     VMChangeStateEntry *vmsh;
     bool force_allow_inactivate;
 
@@ -339,6 +342,7 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
 
     block_acct_init(&blk->stats);
 
+    qemu_co_queue_init(&blk->queued_requests);
     notifier_list_init(&blk->remove_bs_notifiers);
     notifier_list_init(&blk->insert_bs_notifiers);
     QLIST_INIT(&blk->aio_notifiers);
@@ -1096,6 +1100,11 @@ void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
     blk->allow_aio_context_change = allow;
 }
 
+void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
+{
+    blk->disable_request_queuing = disable;
+}
+
 static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
                                   size_t size)
 {
@@ -1127,13 +1136,26 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
     return 0;
 }
 
+static void blk_wait_while_drained(BlockBackend *blk)
+{
+    if (blk->quiesce_counter && !blk->disable_request_queuing) {
+        qemu_co_queue_wait(&blk->queued_requests, NULL);
+    }
+}
+
 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                                unsigned int bytes, QEMUIOVector *qiov,
-                               BdrvRequestFlags flags)
+                               BdrvRequestFlags flags, bool wait_while_drained)
 {
     int ret;
-    BlockDriverState *bs = blk_bs(blk);
+    BlockDriverState *bs;
 
+    if (wait_while_drained) {
+        blk_wait_while_drained(blk);
+    }
+
+    /* Call blk_bs() only after waiting, the graph may have changed */
+    bs = blk_bs(blk);
     trace_blk_co_preadv(blk, bs, offset, bytes, flags);
 
     ret = blk_check_byte_request(blk, offset, bytes);
@@ -1156,11 +1178,17 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
 
 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
                                 unsigned int bytes, QEMUIOVector *qiov,
-                                BdrvRequestFlags flags)
+                                BdrvRequestFlags flags, bool wait_while_drained)
 {
     int ret;
-    BlockDriverState *bs = blk_bs(blk);
+    BlockDriverState *bs;
+
+    if (wait_while_drained) {
+        blk_wait_while_drained(blk);
+    }
 
+    /* Call blk_bs() only after waiting, the graph may have changed */
+    bs = blk_bs(blk);
     trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
 
     ret = blk_check_byte_request(blk, offset, bytes);
@@ -1198,7 +1226,7 @@ static void blk_read_entry(void *opaque)
     QEMUIOVector *qiov = rwco->iobuf;
 
     rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size,
-                              qiov, rwco->flags);
+                              qiov, rwco->flags, true);
     aio_wait_kick();
 }
 
@@ -1208,7 +1236,7 @@ static void blk_write_entry(void *opaque)
     QEMUIOVector *qiov = rwco->iobuf;
 
     rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size,
-                               qiov, rwco->flags);
+                               qiov, rwco->flags, true);
     aio_wait_kick();
 }
 
@@ -1349,9 +1377,15 @@ static void blk_aio_read_entry(void *opaque)
     BlkRwCo *rwco = &acb->rwco;
     QEMUIOVector *qiov = rwco->iobuf;
 
+    if (rwco->blk->quiesce_counter) {
+        blk_dec_in_flight(rwco->blk);
+        blk_wait_while_drained(rwco->blk);
+        blk_inc_in_flight(rwco->blk);
+    }
+
     assert(qiov->size == acb->bytes);
     rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes,
-                              qiov, rwco->flags);
+                              qiov, rwco->flags, false);
     blk_aio_complete(acb);
 }
 
@@ -1361,9 +1395,15 @@ static void blk_aio_write_entry(void *opaque)
     BlkRwCo *rwco = &acb->rwco;
     QEMUIOVector *qiov = rwco->iobuf;
 
+    if (rwco->blk->quiesce_counter) {
+        blk_dec_in_flight(rwco->blk);
+        blk_wait_while_drained(rwco->blk);
+        blk_inc_in_flight(rwco->blk);
+    }
+
     assert(!qiov || qiov->size == acb->bytes);
     rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes,
-                               qiov, rwco->flags);
+                               qiov, rwco->flags, false);
     blk_aio_complete(acb);
 }
 
@@ -1482,6 +1522,8 @@ void blk_aio_cancel_async(BlockAIOCB *acb)
 
 int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 {
+    blk_wait_while_drained(blk);
+
     if (!blk_is_available(blk)) {
         return -ENOMEDIUM;
     }
@@ -1522,7 +1564,11 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
 
 int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
 {
-    int ret = blk_check_byte_request(blk, offset, bytes);
+    int ret;
+
+    blk_wait_while_drained(blk);
+
+    ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
     }
@@ -2004,7 +2050,7 @@ int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
                                       int bytes, BdrvRequestFlags flags)
 {
     return blk_co_pwritev(blk, offset, bytes, NULL,
-                          flags | BDRV_REQ_ZERO_WRITE);
+                          flags | BDRV_REQ_ZERO_WRITE, true);
 }
 
 int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
@@ -2232,6 +2278,9 @@ static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
         if (blk->dev_ops && blk->dev_ops->drained_end) {
             blk->dev_ops->drained_end(blk->dev_opaque);
         }
+        while (qemu_co_enter_next(&blk->queued_requests, NULL)) {
+            /* Resume all queued requests */
+        }
     }
 }
 
diff --git a/block/commit.c b/block/commit.c
index 2c5a6d4ebc..408ae15389 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -350,6 +350,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
     if (ret < 0) {
         goto fail;
     }
+    blk_set_disable_request_queuing(s->base, true);
     s->base_bs = base;
 
     /* Required permissions are already taken with block_job_add_bdrv() */
@@ -358,6 +359,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
     if (ret < 0) {
         goto fail;
     }
+    blk_set_disable_request_queuing(s->top, true);
 
     s->backing_file_str = g_strdup(backing_file_str);
     s->on_error = on_error;
diff --git a/block/mirror.c b/block/mirror.c
index 7483051f8d..8d0a3a987d 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -231,7 +231,8 @@ static void coroutine_fn mirror_read_complete(MirrorOp *op, int ret)
         return;
     }
 
-    ret = blk_co_pwritev(s->target, op->offset, op->qiov.size, &op->qiov, 0);
+    ret = blk_co_pwritev(s->target, op->offset, op->qiov.size, &op->qiov, 0,
+                         false);
     mirror_write_complete(op, ret);
 }
 
@@ -1237,7 +1238,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
         switch (method) {
         case MIRROR_METHOD_COPY:
             ret = blk_co_pwritev(job->target, dirty_offset, dirty_bytes,
-                                 qiov ? &target_qiov : NULL, flags);
+                                 qiov ? &target_qiov : NULL, flags, false);
             break;
 
         case MIRROR_METHOD_ZERO:
@@ -1624,6 +1625,7 @@ static BlockJob *mirror_start_job(
         blk_set_force_allow_inactivate(s->target);
     }
     blk_set_allow_aio_context_change(s->target, true);
+    blk_set_disable_request_queuing(s->target, true);
 
     s->replaces = g_strdup(replaces);
     s->on_source_error = on_source_error;
diff --git a/blockjob.c b/blockjob.c
index 20b7f557da..73d9f1ba2b 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -445,6 +445,9 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
 
     bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
 
+    /* Disable request queuing in the BlockBackend to avoid deadlocks on drain:
+     * The job reports that it's busy until it reaches a pause point. */
+    blk_set_disable_request_queuing(blk, true);
     blk_set_allow_aio_context_change(blk, true);
 
     /* Only set speed when necessary to avoid NotSupported error */
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index 03fa1142a1..3fcf7c1c95 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -677,6 +677,7 @@ static void test_iothread_common(enum drain_type drain_type, int drain_thread)
                               &error_abort);
     s = bs->opaque;
     blk_insert_bs(blk, bs, &error_abort);
+    blk_set_disable_request_queuing(blk, true);
 
     blk_set_aio_context(blk, ctx_a, &error_abort);
     aio_context_acquire(ctx_a);
-- 
2.20.1


Re: [Qemu-devel] [PATCH 4/4] block-backend: Queue requests while drained
Posted by Eric Blake 6 years, 6 months ago
On 7/25/19 11:27 AM, Kevin Wolf wrote:
> This fixes device like IDE that can still start new requests from I/O
> handlers in the CPU thread while the block backend is drained.
> 
> The basic assumption is that in a drain section, no new requests should
> be allowed through a BlockBackend (blk_drained_begin/end don't exist,
> we get drain sections only on the node level). However, there are two
> special cases where requests should not be queued:
> 
> 1. Block jobs: We already make sure that block jobs are paused in a
>    drain section, so they won't start new requests. However, if the
>    drain_begin is called on the job's BlockBackend first, it can happen
>    that we deadlock because the job stays busy until it reaches a pause
>    point - which it can't if it's requests aren't processed any more.

its (remember, "it's" is only okay if "it is" works as well)

> 
>    The proper solution here would be to make all requests through the
>    job's filter node instead of using a BlockBackend. For now, just
>    disabling request queuin on the job BlockBackend is simpler.

queuing

> 
> 2. In test cases where making requests through bdrv_* would be
>    cumbersome because we'd need a BdrvChild. As we already got the
>    functionality to disable request queuing from 1., use it in tests,
>    too, for convenience.
> 
-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3226
Virtualization:  qemu.org | libvirt.org

Re: [Qemu-devel] [PATCH 4/4] block-backend: Queue requests while drained
Posted by Max Reitz 6 years, 6 months ago
On 25.07.19 18:27, Kevin Wolf wrote:
> This fixes device like IDE that can still start new requests from I/O

*devices

> handlers in the CPU thread while the block backend is drained.
> 
> The basic assumption is that in a drain section, no new requests should
> be allowed through a BlockBackend (blk_drained_begin/end don't exist,
> we get drain sections only on the node level). However, there are two
> special cases where requests should not be queued:
> 
> 1. Block jobs: We already make sure that block jobs are paused in a
>    drain section, so they won't start new requests. However, if the
>    drain_begin is called on the job's BlockBackend first, it can happen
>    that we deadlock because the job stays busy until it reaches a pause
>    point - which it can't if it's requests aren't processed any more.
> 
>    The proper solution here would be to make all requests through the
>    job's filter node instead of using a BlockBackend. For now, just
>    disabling request queuin on the job BlockBackend is simpler.

Yep, seems reasonable.

(We’d need a relationship that a BB is owned by some job, and then pause
the job when the BB is drained, I suppose.  But that’s exactly
accomplished by not making the job use a BB, but its BdrvChild
references instead.)

> 2. In test cases where making requests through bdrv_* would be
>    cumbersome because we'd need a BdrvChild. As we already got the
>    functionality to disable request queuing from 1., use it in tests,
>    too, for convenience.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>  include/sysemu/block-backend.h | 11 +++---
>  block/backup.c                 |  1 +
>  block/block-backend.c          | 69 +++++++++++++++++++++++++++++-----
>  block/commit.c                 |  2 +
>  block/mirror.c                 |  6 ++-
>  blockjob.c                     |  3 ++
>  tests/test-bdrv-drain.c        |  1 +
>  7 files changed, 76 insertions(+), 17 deletions(-)

[...]

> diff --git a/block/block-backend.c b/block/block-backend.c
> index fdd6b01ecf..603b281743 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c

[...]

> @@ -1127,13 +1136,26 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
>      return 0;
>  }
>  
> +static void blk_wait_while_drained(BlockBackend *blk)

+coroutine_fn?  (Maybe even blk_co_wait...)

> +{
> +    if (blk->quiesce_counter && !blk->disable_request_queuing) {
> +        qemu_co_queue_wait(&blk->queued_requests, NULL);
> +    }
> +}
> +
>  int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
>                                 unsigned int bytes, QEMUIOVector *qiov,
> -                               BdrvRequestFlags flags)
> +                               BdrvRequestFlags flags, bool wait_while_drained)

What’s the purpose of this parameter?  How would it hurt to always
wait_while_drained?

I see the following callers of blk_co_p{read,write}v() that call it with
wait_while_drained=false:

1. blk_aio_{read,write}_entry(): They wait themselves, so they don’t
   need these functions to wait.  But OTOH, because they have waited, we
   know that the BB is not quiesced here, so we won’t wait here anyway.
   (These functions should be coroutine_fn, too, by the way)

2. mirror: It disables request queuing anyway, so wait_while_drained
   doesn’t have any effect.

>  {
>      int ret;
> -    BlockDriverState *bs = blk_bs(blk);
> +    BlockDriverState *bs;
>  
> +    if (wait_while_drained) {
> +        blk_wait_while_drained(blk);
> +    }

[...]

What about blk_co_flush()?  Should that wait, too?

> @@ -2232,6 +2278,9 @@ static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
>          if (blk->dev_ops && blk->dev_ops->drained_end) {
>              blk->dev_ops->drained_end(blk->dev_opaque);
>          }
> +        while (qemu_co_enter_next(&blk->queued_requests, NULL)) {
> +            /* Resume all queued requests */
> +        }

Wouldn’t qemu_co_queue_restart_all(&blk->queued_requests) achieve the same?

Max

Re: [Qemu-devel] [PATCH 4/4] block-backend: Queue requests while drained
Posted by Kevin Wolf 6 years, 6 months ago
Am 26.07.2019 um 12:50 hat Max Reitz geschrieben:
> On 25.07.19 18:27, Kevin Wolf wrote:
> > This fixes device like IDE that can still start new requests from I/O
> 
> *devices
> 
> > handlers in the CPU thread while the block backend is drained.
> > 
> > The basic assumption is that in a drain section, no new requests should
> > be allowed through a BlockBackend (blk_drained_begin/end don't exist,
> > we get drain sections only on the node level). However, there are two
> > special cases where requests should not be queued:
> > 
> > 1. Block jobs: We already make sure that block jobs are paused in a
> >    drain section, so they won't start new requests. However, if the
> >    drain_begin is called on the job's BlockBackend first, it can happen
> >    that we deadlock because the job stays busy until it reaches a pause
> >    point - which it can't if it's requests aren't processed any more.
> > 
> >    The proper solution here would be to make all requests through the
> >    job's filter node instead of using a BlockBackend. For now, just
> >    disabling request queuin on the job BlockBackend is simpler.
> 
> Yep, seems reasonable.
> 
> (We’d need a relationship that a BB is owned by some job, and then pause
> the job when the BB is drained, I suppose.  But that’s exactly
> accomplished by not making the job use a BB, but its BdrvChild
> references instead.)

We actually had this before commit ad90feba, when we changed it to use
the job's BdrvChild objects instead. All block jobs have both currently,
they just don't use their BdrvChild objects much.

> > 2. In test cases where making requests through bdrv_* would be
> >    cumbersome because we'd need a BdrvChild. As we already got the
> >    functionality to disable request queuing from 1., use it in tests,
> >    too, for convenience.
> > 
> > Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> > ---
> >  include/sysemu/block-backend.h | 11 +++---
> >  block/backup.c                 |  1 +
> >  block/block-backend.c          | 69 +++++++++++++++++++++++++++++-----
> >  block/commit.c                 |  2 +
> >  block/mirror.c                 |  6 ++-
> >  blockjob.c                     |  3 ++
> >  tests/test-bdrv-drain.c        |  1 +
> >  7 files changed, 76 insertions(+), 17 deletions(-)
> 
> [...]
> 
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index fdd6b01ecf..603b281743 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> 
> [...]
> 
> > @@ -1127,13 +1136,26 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
> >      return 0;
> >  }
> >  
> > +static void blk_wait_while_drained(BlockBackend *blk)
> 
> +coroutine_fn?  (Maybe even blk_co_wait...)
> 
> > +{
> > +    if (blk->quiesce_counter && !blk->disable_request_queuing) {
> > +        qemu_co_queue_wait(&blk->queued_requests, NULL);
> > +    }
> > +}
> > +
> >  int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
> >                                 unsigned int bytes, QEMUIOVector *qiov,
> > -                               BdrvRequestFlags flags)
> > +                               BdrvRequestFlags flags, bool wait_while_drained)
> 
> What’s the purpose of this parameter?  How would it hurt to always
> wait_while_drained?
> 
> I see the following callers of blk_co_p{read,write}v() that call it with
> wait_while_drained=false:
> 
> 1. blk_aio_{read,write}_entry(): They wait themselves, so they don’t
>    need these functions to wait.  But OTOH, because they have waited, we
>    know that the BB is not quiesced here, so we won’t wait here anyway.
>    (These functions should be coroutine_fn, too, by the way)

I think I was worried that the coroutine might yield between the two
places. Later I noticed that blk_wait_while_drained() must be the very
first thing anyway, so maybe it doesn't matter any more now.

If we did yield here for requests coming from blk_aio_prwv(), in_flight
would be increased and drain would deadlock.

Would you prefer if I just unconditionally wait if we're drained?

> 2. mirror: It disables request queuing anyway, so wait_while_drained
>    doesn’t have any effect.

Yes, I wasn't sure what to use there. false seemed like it would be
less likely to cause misunderstandings because it just repeats what
would happen anyway.

> >  {
> >      int ret;
> > -    BlockDriverState *bs = blk_bs(blk);
> > +    BlockDriverState *bs;
> >  
> > +    if (wait_while_drained) {
> > +        blk_wait_while_drained(blk);
> > +    }
> 
> [...]
> 
> What about blk_co_flush()?  Should that wait, too?

Hm, probably, yes.

> > @@ -2232,6 +2278,9 @@ static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
> >          if (blk->dev_ops && blk->dev_ops->drained_end) {
> >              blk->dev_ops->drained_end(blk->dev_opaque);
> >          }
> > +        while (qemu_co_enter_next(&blk->queued_requests, NULL)) {
> > +            /* Resume all queued requests */
> > +        }
> 
> Wouldn’t qemu_co_queue_restart_all(&blk->queued_requests) achieve the same?

It would fail an assertion because we're not in coroutine context.
(Guess what my first attempt was!)

Kevin
Re: [Qemu-devel] [PATCH 4/4] block-backend: Queue requests while drained
Posted by Max Reitz 6 years, 6 months ago
On 26.07.19 13:49, Kevin Wolf wrote:
> Am 26.07.2019 um 12:50 hat Max Reitz geschrieben:
>> On 25.07.19 18:27, Kevin Wolf wrote:
>>> This fixes device like IDE that can still start new requests from I/O
>>
>> *devices
>>
>>> handlers in the CPU thread while the block backend is drained.
>>>
>>> The basic assumption is that in a drain section, no new requests should
>>> be allowed through a BlockBackend (blk_drained_begin/end don't exist,
>>> we get drain sections only on the node level). However, there are two
>>> special cases where requests should not be queued:
>>>
>>> 1. Block jobs: We already make sure that block jobs are paused in a
>>>    drain section, so they won't start new requests. However, if the
>>>    drain_begin is called on the job's BlockBackend first, it can happen
>>>    that we deadlock because the job stays busy until it reaches a pause
>>>    point - which it can't if it's requests aren't processed any more.
>>>
>>>    The proper solution here would be to make all requests through the
>>>    job's filter node instead of using a BlockBackend. For now, just
>>>    disabling request queuin on the job BlockBackend is simpler.
>>
>> Yep, seems reasonable.
>>
>> (We’d need a relationship that a BB is owned by some job, and then pause
>> the job when the BB is drained, I suppose.  But that’s exactly
>> accomplished by not making the job use a BB, but its BdrvChild
>> references instead.)
> 
> We actually had this before commit ad90feba, when we changed it to use
> the job's BdrvChild objects instead. All block jobs have both currently,
> they just don't use their BdrvChild objects much.
> 
>>> 2. In test cases where making requests through bdrv_* would be
>>>    cumbersome because we'd need a BdrvChild. As we already got the
>>>    functionality to disable request queuing from 1., use it in tests,
>>>    too, for convenience.
>>>
>>> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
>>> ---
>>>  include/sysemu/block-backend.h | 11 +++---
>>>  block/backup.c                 |  1 +
>>>  block/block-backend.c          | 69 +++++++++++++++++++++++++++++-----
>>>  block/commit.c                 |  2 +
>>>  block/mirror.c                 |  6 ++-
>>>  blockjob.c                     |  3 ++
>>>  tests/test-bdrv-drain.c        |  1 +
>>>  7 files changed, 76 insertions(+), 17 deletions(-)
>>
>> [...]
>>
>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>> index fdd6b01ecf..603b281743 100644
>>> --- a/block/block-backend.c
>>> +++ b/block/block-backend.c
>>
>> [...]
>>
>>> @@ -1127,13 +1136,26 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
>>>      return 0;
>>>  }
>>>  
>>> +static void blk_wait_while_drained(BlockBackend *blk)
>>
>> +coroutine_fn?  (Maybe even blk_co_wait...)
>>
>>> +{
>>> +    if (blk->quiesce_counter && !blk->disable_request_queuing) {
>>> +        qemu_co_queue_wait(&blk->queued_requests, NULL);
>>> +    }
>>> +}
>>> +
>>>  int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
>>>                                 unsigned int bytes, QEMUIOVector *qiov,
>>> -                               BdrvRequestFlags flags)
>>> +                               BdrvRequestFlags flags, bool wait_while_drained)
>>
>> What’s the purpose of this parameter?  How would it hurt to always
>> wait_while_drained?
>>
>> I see the following callers of blk_co_p{read,write}v() that call it with
>> wait_while_drained=false:
>>
>> 1. blk_aio_{read,write}_entry(): They wait themselves, so they don’t
>>    need these functions to wait.  But OTOH, because they have waited, we
>>    know that the BB is not quiesced here, so we won’t wait here anyway.
>>    (These functions should be coroutine_fn, too, by the way)
> 
> I think I was worried that the coroutine might yield between the two
> places. Later I noticed that blk_wait_while_drained() must be the very
> first thing anyway, so maybe it doesn't matter any more now.
> 
> If we did yield here for requests coming from blk_aio_prwv(), in_flight
> would be increased and drain would deadlock.
> 
> Would you prefer if I just unconditionally wait if we're drained?

I think I would, yes.

>> 2. mirror: It disables request queuing anyway, so wait_while_drained
>>    doesn’t have any effect.
> 
> Yes, I wasn't sure what to use there. false seemed like it would be
> less likely to cause misunderstandings because it just repeats what
> would happen anyway.
> 
>>>  {
>>>      int ret;
>>> -    BlockDriverState *bs = blk_bs(blk);
>>> +    BlockDriverState *bs;
>>>  
>>> +    if (wait_while_drained) {
>>> +        blk_wait_while_drained(blk);
>>> +    }
>>
>> [...]
>>
>> What about blk_co_flush()?  Should that wait, too?
> 
> Hm, probably, yes.
> 
>>> @@ -2232,6 +2278,9 @@ static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
>>>          if (blk->dev_ops && blk->dev_ops->drained_end) {
>>>              blk->dev_ops->drained_end(blk->dev_opaque);
>>>          }
>>> +        while (qemu_co_enter_next(&blk->queued_requests, NULL)) {
>>> +            /* Resume all queued requests */
>>> +        }
>>
>> Wouldn’t qemu_co_queue_restart_all(&blk->queued_requests) achieve the same?
> 
> It would fail an assertion because we're not in coroutine context.
> (Guess what my first attempt was!)

:-)

Max