[Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

Denis Plotnikov posted 1 patch 1 week ago
Test checkpatch passed
Test docker-quick@centos7 passed
Test docker-clang@ubuntu passed
Test docker-mingw@fedora passed
Test asan passed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20181205122326.26625-1-dplotnikov@virtuozzo.com
block/block-backend.c | 31 +++++++++++++++++++++++++++++++
block/io.c            |  3 ++-
dma-helpers.c         |  4 ++--
hw/block/nvme.c       |  8 ++++----
hw/block/xen_disk.c   |  8 ++++----
hw/ide/core.c         |  6 ++++--
hw/scsi/scsi-disk.c   | 10 ++++++----
include/block/aio.h   | 37 ++++++++++++++++++++++++++++---------
include/block/block.h |  8 +++++++-
util/async.c          |  2 ++
10 files changed, 90 insertions(+), 27 deletions(-)

[Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

Posted by Denis Plotnikov 1 week ago
At the moment, the "drained section" doesn't protect the Block Driver State
from requests appearing in the vCPU threads.
This could lead to data loss because of a request coming to
an unexpected BDS.

For example, when a request comes to the ide controller from the guest,
the controller creates a request coroutine and executes the coroutine
in the vCPU thread. If another thread (iothread) has entered the
"drained section" on a BDS with bdrv_drained_begin, which protects
the BDS' AioContext from external requests, and has released the AioContext
because of finishing some coroutine by the moment the request
appears at the ide controller, the controller acquires the AioContext
and executes its request without any respect to the entered
"drained section", producing all kinds of data inconsistency.

The patch prevents this case by putting requests from external threads into
a queue on the AioContext while the context is protected from external requests,
and executes those requests later, when the protection from external requests
is removed.

Also, the patch marks requests generated in a vCPU thread as external ones
to make use of the request postponing.

How to reproduce:
1. start vm with an ide disk and a linux guest
2. in the guest run: dd if=... of=... bs=4K count=100000 oflag=direct
3. (qemu) drive_mirror "disk-name"
4. wait until block job can receive block_job_complete
5. (qemu) block_job_complete "disk-name"
6. blk_aio_p[read|write]v may appear in vCPU context (here is the race)

Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
---
 block/block-backend.c | 31 +++++++++++++++++++++++++++++++
 block/io.c            |  3 ++-
 dma-helpers.c         |  4 ++--
 hw/block/nvme.c       |  8 ++++----
 hw/block/xen_disk.c   |  8 ++++----
 hw/ide/core.c         |  6 ++++--
 hw/scsi/scsi-disk.c   | 10 ++++++----
 include/block/aio.h   | 37 ++++++++++++++++++++++++++++---------
 include/block/block.h |  8 +++++++-
 util/async.c          |  2 ++
 10 files changed, 90 insertions(+), 27 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 60d37a0c3d..10f7dd357d 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1148,6 +1148,8 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
     return 0;
 }
 
+static void coroutine_fn blk_postpone_request(BlockBackend *blk);
+
 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                                unsigned int bytes, QEMUIOVector *qiov,
                                BdrvRequestFlags flags)
@@ -1157,6 +1159,10 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
 
     trace_blk_co_preadv(blk, bs, offset, bytes, flags);
 
+    if ((flags & BDRV_REQ_EXTERNAL)) {
+        blk_postpone_request(blk);
+    }
+
     ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
@@ -1184,6 +1190,10 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
 
     trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
 
+    if ((flags & BDRV_REQ_EXTERNAL)) {
+        blk_postpone_request(blk);
+    }
+
     ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
@@ -1304,6 +1314,27 @@ static void blk_dec_in_flight(BlockBackend *blk)
     aio_wait_kick();
 }
 
+static void coroutine_fn blk_postpone_request(BlockBackend *blk)
+{
+    AioContext *ctx;
+
+    assert(qemu_in_coroutine());
+    ctx = qemu_coroutine_get_aio_context(qemu_coroutine_self());
+
+    /*
+     * Put the request into the postponed queue if
+     * external requests are not allowed currently.
+     * The request is continued when the context
+     * leaves the bdrv "drained" section allowing
+     * external requests
+     */
+    if (aio_external_disabled(ctx)) {
+        blk_dec_in_flight(blk);
+        qemu_co_queue_wait(&ctx->postponed_reqs, NULL);
+        blk_inc_in_flight(blk);
+    }
+}
+
 static void error_callback_bh(void *opaque)
 {
     struct BlockBackendAIOCB *acb = opaque;
diff --git a/block/io.c b/block/io.c
index bd9d688f8b..019da464a2 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1318,7 +1318,8 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
      * potential fallback support, if we ever implement any read flags
      * to pass through to drivers.  For now, there aren't any
      * passthrough flags.  */
-    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
+    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ |
+                       BDRV_REQ_EXTERNAL)));
 
     /* Handle Copy on Read and associated serialisation */
     if (flags & BDRV_REQ_COPY_ON_READ) {
diff --git a/dma-helpers.c b/dma-helpers.c
index 2d7e02d35e..53706031c5 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -235,7 +235,7 @@ BlockAIOCB *dma_blk_read_io_func(int64_t offset, QEMUIOVector *iov,
                                  void *opaque)
 {
     BlockBackend *blk = opaque;
-    return blk_aio_preadv(blk, offset, iov, 0, cb, cb_opaque);
+    return blk_aio_preadv(blk, offset, iov, BDRV_REQ_EXTERNAL, cb, cb_opaque);
 }
 
 BlockAIOCB *dma_blk_read(BlockBackend *blk,
@@ -253,7 +253,7 @@ BlockAIOCB *dma_blk_write_io_func(int64_t offset, QEMUIOVector *iov,
                                   void *opaque)
 {
     BlockBackend *blk = opaque;
-    return blk_aio_pwritev(blk, offset, iov, 0, cb, cb_opaque);
+    return blk_aio_pwritev(blk, offset, iov, BDRV_REQ_EXTERNAL, cb, cb_opaque);
 }
 
 BlockAIOCB *dma_blk_write(BlockBackend *blk,
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 7c8c63e8f5..e645180e18 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -380,10 +380,10 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     } else {
         req->has_sg = false;
         req->aiocb = is_write ?
-            blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
-                            req) :
-            blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
-                           req);
+            blk_aio_pwritev(n->conf.blk, data_offset, &req->iov,
+                            BDRV_REQ_EXTERNAL, nvme_rw_cb, req) :
+            blk_aio_preadv(n->conf.blk, data_offset, &req->iov,
+                           BDRV_REQ_EXTERNAL, nvme_rw_cb, req);
     }
 
     return NVME_NO_COMPLETE;
diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 36eff94f84..8888fcd070 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -411,8 +411,8 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
         block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                          ioreq->v.size, BLOCK_ACCT_READ);
         ioreq->aio_inflight++;
-        blk_aio_preadv(blkdev->blk, ioreq->start, &ioreq->v, 0,
-                       qemu_aio_complete, ioreq);
+        blk_aio_preadv(blkdev->blk, ioreq->start, &ioreq->v,
+                       BDRV_REQ_EXTERNAL, qemu_aio_complete, ioreq);
         break;
     case BLKIF_OP_WRITE:
     case BLKIF_OP_FLUSH_DISKCACHE:
@@ -426,8 +426,8 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
                          ioreq->req.operation == BLKIF_OP_WRITE ?
                          BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
         ioreq->aio_inflight++;
-        blk_aio_pwritev(blkdev->blk, ioreq->start, &ioreq->v, 0,
-                        qemu_aio_complete, ioreq);
+        blk_aio_pwritev(blkdev->blk, ioreq->start, &ioreq->v,
+                        BDRV_REQ_EXTERNAL, qemu_aio_complete, ioreq);
         break;
     case BLKIF_OP_DISCARD:
     {
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 04e22e751d..ae4b715eb1 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -665,7 +665,8 @@ BlockAIOCB *ide_buffered_readv(IDEState *s, int64_t sector_num,
     qemu_iovec_init_external(&req->qiov, &req->iov, 1);
 
     aioreq = blk_aio_preadv(s->blk, sector_num << BDRV_SECTOR_BITS,
-                            &req->qiov, 0, ide_buffered_readv_cb, req);
+                            &req->qiov, BDRV_REQ_EXTERNAL,
+                            ide_buffered_readv_cb, req);
 
     QLIST_INSERT_HEAD(&s->buffered_requests, req, list);
     return aioreq;
@@ -1052,7 +1053,8 @@ static void ide_sector_write(IDEState *s)
     block_acct_start(blk_get_stats(s->blk), &s->acct,
                      n * BDRV_SECTOR_SIZE, BLOCK_ACCT_WRITE);
     s->pio_aiocb = blk_aio_pwritev(s->blk, sector_num << BDRV_SECTOR_BITS,
-                                   &s->qiov, 0, ide_sector_write_cb, s);
+                                   &s->qiov, BDRV_REQ_EXTERNAL,
+                                   ide_sector_write_cb, s);
 }
 
 static void ide_flush_cb(void *opaque, int ret)
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 0e9027c8f3..ad7b236e67 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -1732,7 +1732,7 @@ static void scsi_write_same_complete(void *opaque, int ret)
         qemu_iovec_init_external(&data->qiov, &data->iov, 1);
         r->req.aiocb = blk_aio_pwritev(s->qdev.conf.blk,
                                        data->sector << BDRV_SECTOR_BITS,
-                                       &data->qiov, 0,
+                                       &data->qiov, BDRV_REQ_EXTERNAL,
                                        scsi_write_same_complete, data);
         aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
         return;
@@ -1804,7 +1804,7 @@ static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
                      data->iov.iov_len, BLOCK_ACCT_WRITE);
     r->req.aiocb = blk_aio_pwritev(s->qdev.conf.blk,
                                    data->sector << BDRV_SECTOR_BITS,
-                                   &data->qiov, 0,
+                                   &data->qiov, BDRV_REQ_EXTERNAL,
                                    scsi_write_same_complete, data);
 }
 
@@ -2862,7 +2862,8 @@ BlockAIOCB *scsi_dma_readv(int64_t offset, QEMUIOVector *iov,
 {
     SCSIDiskReq *r = opaque;
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
-    return blk_aio_preadv(s->qdev.conf.blk, offset, iov, 0, cb, cb_opaque);
+    return blk_aio_preadv(s->qdev.conf.blk, offset, iov, BDRV_REQ_EXTERNAL,
+                          cb, cb_opaque);
 }
 
 static
@@ -2872,7 +2873,8 @@ BlockAIOCB *scsi_dma_writev(int64_t offset, QEMUIOVector *iov,
 {
     SCSIDiskReq *r = opaque;
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
-    return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, 0, cb, cb_opaque);
+    return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, BDRV_REQ_EXTERNAL,
+                           cb, cb_opaque);
 }
 
 static void scsi_disk_base_class_initfn(ObjectClass *klass, void *data)
diff --git a/include/block/aio.h b/include/block/aio.h
index 0ca25dfec6..8512bda44e 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -19,6 +19,7 @@
 #include "qemu/event_notifier.h"
 #include "qemu/thread.h"
 #include "qemu/timer.h"
+#include "qemu/coroutine.h"
 
 typedef struct BlockAIOCB BlockAIOCB;
 typedef void BlockCompletionFunc(void *opaque, int ret);
@@ -130,6 +131,11 @@ struct AioContext {
     QEMUTimerListGroup tlg;
 
     int external_disable_cnt;
+    /* Queue to store the requests coming when the context is disabled for
+     * external requests.
+     * Don't use a separate lock for protection relying the context lock
+     */
+    CoQueue postponed_reqs;
 
     /* Number of AioHandlers without .io_poll() */
     int poll_disable_cnt;
@@ -483,6 +489,15 @@ static inline void aio_timer_init(AioContext *ctx,
  */
 int64_t aio_compute_timeout(AioContext *ctx);
 
+/**
+ * aio_co_enter:
+ * @ctx: the context to run the coroutine
+ * @co: the coroutine to run
+ *
+ * Enter a coroutine in the specified AioContext.
+ */
+void aio_co_enter(AioContext *ctx, struct Coroutine *co);
+
 /**
  * aio_disable_external:
  * @ctx: the aio context
@@ -491,9 +506,17 @@ int64_t aio_compute_timeout(AioContext *ctx);
  */
 static inline void aio_disable_external(AioContext *ctx)
 {
+    aio_context_acquire(ctx);
     atomic_inc(&ctx->external_disable_cnt);
+    aio_context_release(ctx);
 }
 
+static void run_postponed_co(void *opaque)
+{
+    AioContext *ctx = (AioContext *) opaque;
+
+    qemu_co_queue_restart_all(&ctx->postponed_reqs);
+}
 /**
  * aio_enable_external:
  * @ctx: the aio context
@@ -504,12 +527,17 @@ static inline void aio_enable_external(AioContext *ctx)
 {
     int old;
 
+    aio_context_acquire(ctx);
     old = atomic_fetch_dec(&ctx->external_disable_cnt);
     assert(old > 0);
     if (old == 1) {
+        Coroutine *co = qemu_coroutine_create(run_postponed_co, ctx);
+        aio_co_enter(ctx, co);
+
         /* Kick event loop so it re-arms file descriptors */
         aio_notify(ctx);
     }
+    aio_context_release(ctx);
 }
 
 /**
@@ -564,15 +592,6 @@ void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
  */
 void aio_co_wake(struct Coroutine *co);
 
-/**
- * aio_co_enter:
- * @ctx: the context to run the coroutine
- * @co: the coroutine to run
- *
- * Enter a coroutine in the specified AioContext.
- */
-void aio_co_enter(AioContext *ctx, struct Coroutine *co);
-
 /**
  * Return the AioContext whose event loop runs in the current thread.
  *
diff --git a/include/block/block.h b/include/block/block.h
index 7f5453b45b..0685a73975 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -83,8 +83,14 @@ typedef enum {
      */
     BDRV_REQ_SERIALISING        = 0x80,
 
+    /*
+     * marks requests coming from external sources,
+     * e.g. block requests from DMA running in the vCPU thread
+     */
+    BDRV_REQ_EXTERNAL   = 0x100,
+
     /* Mask of valid flags */
-    BDRV_REQ_MASK               = 0xff,
+    BDRV_REQ_MASK               = 0xfff,
 } BdrvRequestFlags;
 
 typedef struct BlockSizes {
diff --git a/util/async.c b/util/async.c
index c10642a385..ff1bafd344 100644
--- a/util/async.c
+++ b/util/async.c
@@ -441,6 +441,8 @@ AioContext *aio_context_new(Error **errp)
     ctx->poll_grow = 0;
     ctx->poll_shrink = 0;
 
+    qemu_co_queue_init(&ctx->postponed_reqs);
+
     return ctx;
 fail:
     g_source_destroy(&ctx->source);
-- 
2.17.0


Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

Posted by Kevin Wolf 1 week ago
Am 05.12.2018 um 13:23 hat Denis Plotnikov geschrieben:
> At the time, the "drained section" doesn't protect Block Driver State
> from the requests appearing in the vCPU threads.
> This could lead to the data loss because of request coming to
> an unexpected BDS.
> 
> For example, when a request comes to ide controller from the guest,
> the controller creates a request coroutine and executes the coroutine
> in the vCPU thread. If another thread(iothread) has entered the
> "drained section" on a BDS with bdrv_drained_begin, which protects
> BDS' AioContext from external requests, and released the AioContext
> because of finishing some coroutine by the moment of the request
> appearing at the ide controller, the controller acquires the AioContext
> and executes its request without any respect to the entered
> "drained section" producing any kinds of data inconsistency.
> 
> The patch prevents this case by putting requests from external threads to
> the queue on AioContext while the context is protected for external requests
> and executes those requests later on the external requests protection removing.
> 
> Also, the patch marks requests generated in a vCPU thread as external ones
> to make use of the request postponing.
> 
> How to reproduce:
> 1. start vm with an ide disk and a linux guest
> 2. in the guest run: dd if=... of=... bs=4K count=100000 oflag=direct
> 3. (qemu) drive_mirror "disk-name"
> 4. wait until block job can receive block_job_complete
> 5. (qemu) block_job_complete "disk-name"
> 6. blk_aio_p[read|write]v may appear in vCPU context (here is the race)
> 
> Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>

This is getting closer, but I'd like to see two more major changes:

> diff --git a/include/block/aio.h b/include/block/aio.h
> index 0ca25dfec6..8512bda44e 100644
> --- a/include/block/aio.h
> +++ b/include/block/aio.h
> @@ -19,6 +19,7 @@
>  #include "qemu/event_notifier.h"
>  #include "qemu/thread.h"
>  #include "qemu/timer.h"
> +#include "qemu/coroutine.h"
>  
>  typedef struct BlockAIOCB BlockAIOCB;
>  typedef void BlockCompletionFunc(void *opaque, int ret);
> @@ -130,6 +131,11 @@ struct AioContext {
>      QEMUTimerListGroup tlg;
>  
>      int external_disable_cnt;
> +    /* Queue to store the requests coming when the context is disabled for
> +     * external requests.
> +     * Don't use a separate lock for protection relying the context lock
> +     */
> +    CoQueue postponed_reqs;

Why involve the AioContext at all? This could all be kept at the
BlockBackend level without extending the layering violation that
aio_disable_external() is.

BlockBackends get notified when their root node is drained, so hooking
things up there should be as easy, if not even easier than in
AioContext.

>      /* Number of AioHandlers without .io_poll() */
>      int poll_disable_cnt;
> @@ -483,6 +489,15 @@ static inline void aio_timer_init(AioContext *ctx,
>   */
>  int64_t aio_compute_timeout(AioContext *ctx);
>  
> +/**
> + * aio_co_enter:
> + * @ctx: the context to run the coroutine
> + * @co: the coroutine to run
> + *
> + * Enter a coroutine in the specified AioContext.
> + */
> +void aio_co_enter(AioContext *ctx, struct Coroutine *co);
> +
>  /**
>   * aio_disable_external:
>   * @ctx: the aio context
> @@ -491,9 +506,17 @@ int64_t aio_compute_timeout(AioContext *ctx);
>   */
>  static inline void aio_disable_external(AioContext *ctx)
>  {
> +    aio_context_acquire(ctx);
>      atomic_inc(&ctx->external_disable_cnt);
> +    aio_context_release(ctx);
>  }

This acquire/release pair looks rather useless?

> +static void run_postponed_co(void *opaque)
> +{
> +    AioContext *ctx = (AioContext *) opaque;
> +
> +    qemu_co_queue_restart_all(&ctx->postponed_reqs);
> +}
>  /**
>   * aio_enable_external:
>   * @ctx: the aio context
> @@ -504,12 +527,17 @@ static inline void aio_enable_external(AioContext *ctx)
>  {
>      int old;
>  
> +    aio_context_acquire(ctx);
>      old = atomic_fetch_dec(&ctx->external_disable_cnt);
>      assert(old > 0);
>      if (old == 1) {
> +        Coroutine *co = qemu_coroutine_create(run_postponed_co, ctx);
> +        aio_co_enter(ctx, co);
> +
>          /* Kick event loop so it re-arms file descriptors */
>          aio_notify(ctx);
>      }
> +    aio_context_release(ctx);
>  }
>  
>  /**
> @@ -564,15 +592,6 @@ void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
>   */
>  void aio_co_wake(struct Coroutine *co);
>  
> -/**
> - * aio_co_enter:
> - * @ctx: the context to run the coroutine
> - * @co: the coroutine to run
> - *
> - * Enter a coroutine in the specified AioContext.
> - */
> -void aio_co_enter(AioContext *ctx, struct Coroutine *co);
> -
>  /**
>   * Return the AioContext whose event loop runs in the current thread.
>   *
> diff --git a/include/block/block.h b/include/block/block.h
> index 7f5453b45b..0685a73975 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -83,8 +83,14 @@ typedef enum {
>       */
>      BDRV_REQ_SERIALISING        = 0x80,
>  
> +    /*
> +     * marks requests comming from external sources,
> +     * e.g block requests from dma running in the vCPU thread
> +     */
> +    BDRV_REQ_EXTERNAL   = 0x100,

I don't like this one because BlockBackends should be used _only_ from
external sources.

I know that this isn't quite true today and there are still block jobs
calling BlockBackend internally while handling a BDS request, but I hope
with Vladimir's backup patches going it, this can go away.

So I suggest that for the time being, we invert the flag and have a
BDRV_REQ_INTERNAL instead, which is only used for BlockBackend requests,
hopefully doesn't have to be used in many places and which can go away
eventually.

>      /* Mask of valid flags */
> -    BDRV_REQ_MASK               = 0xff,
> +    BDRV_REQ_MASK               = 0xfff,
>  } BdrvRequestFlags;
>  
>  typedef struct BlockSizes {

Kevin

Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

Posted by Denis Plotnikov 5 days ago

On 07.12.2018 15:26, Kevin Wolf wrote:
> Am 05.12.2018 um 13:23 hat Denis Plotnikov geschrieben:
>> At the time, the "drained section" doesn't protect Block Driver State
>> from the requests appearing in the vCPU threads.
>> This could lead to the data loss because of request coming to
>> an unexpected BDS.
>>
>> For example, when a request comes to ide controller from the guest,
>> the controller creates a request coroutine and executes the coroutine
>> in the vCPU thread. If another thread(iothread) has entered the
>> "drained section" on a BDS with bdrv_drained_begin, which protects
>> BDS' AioContext from external requests, and released the AioContext
>> because of finishing some coroutine by the moment of the request
>> appearing at the ide controller, the controller acquires the AioContext
>> and executes its request without any respect to the entered
>> "drained section" producing any kinds of data inconsistency.
>>
>> The patch prevents this case by putting requests from external threads to
>> the queue on AioContext while the context is protected for external requests
>> and executes those requests later on the external requests protection removing.
>>
>> Also, the patch marks requests generated in a vCPU thread as external ones
>> to make use of the request postponing.
>>
>> How to reproduce:
>> 1. start vm with an ide disk and a linux guest
>> 2. in the guest run: dd if=... of=... bs=4K count=100000 oflag=direct
>> 3. (qemu) drive_mirror "disk-name"
>> 4. wait until block job can receive block_job_complete
>> 5. (qemu) block_job_complete "disk-name"
>> 6. blk_aio_p[read|write]v may appear in vCPU context (here is the race)
>>
>> Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
> 
> This is getting closer, but I'd like to see two more major changes:
> 
>> diff --git a/include/block/aio.h b/include/block/aio.h
>> index 0ca25dfec6..8512bda44e 100644
>> --- a/include/block/aio.h
>> +++ b/include/block/aio.h
>> @@ -19,6 +19,7 @@
>>   #include "qemu/event_notifier.h"
>>   #include "qemu/thread.h"
>>   #include "qemu/timer.h"
>> +#include "qemu/coroutine.h"
>>   
>>   typedef struct BlockAIOCB BlockAIOCB;
>>   typedef void BlockCompletionFunc(void *opaque, int ret);
>> @@ -130,6 +131,11 @@ struct AioContext {
>>       QEMUTimerListGroup tlg;
>>   
>>       int external_disable_cnt;
>> +    /* Queue to store the requests coming when the context is disabled for
>> +     * external requests.
>> +     * Don't use a separate lock for protection relying the context lock
>> +     */
>> +    CoQueue postponed_reqs;
> 
> Why involve the AioContext at all? This could all be kept at the
> BlockBackend level without extending the layering violation that
> aio_disable_external() is.
> 
> BlockBackends get notified when their root node is drained, so hooking
> things up there should be as easy, if not even easier than in
> AioContext.

Just want to make sure that I understood correctly what you meant by 
"BlockBackends get notified". Did you mean that bdrv_drain_end calls
child's role callback blk_root_drained_end by calling 
bdrv_parent_drained_end?

If it's so, it won't work if we resume the postponed requests in 
blk_root_drained_end, since we can't know if external is disabled for the 
context, because the counter showing that is decreased only after the roles' 
drained callbacks are finished at bdrv_do_drained_end.
Please correct me if I'm wrong.

Looking at the patch again, I think that it might be useful to keep the 
requests in the structure that limits their execution and also protects 
the access (context acquire/release). Although it's indeed a layering 
violation, at least we can store the related parts in the same place
and later on move them somewhere else alongside the request restrictor.

Denis



> 
>>       /* Number of AioHandlers without .io_poll() */
>>       int poll_disable_cnt;
>> @@ -483,6 +489,15 @@ static inline void aio_timer_init(AioContext *ctx,
>>    */
>>   int64_t aio_compute_timeout(AioContext *ctx);
>>   
>> +/**
>> + * aio_co_enter:
>> + * @ctx: the context to run the coroutine
>> + * @co: the coroutine to run
>> + *
>> + * Enter a coroutine in the specified AioContext.
>> + */
>> +void aio_co_enter(AioContext *ctx, struct Coroutine *co);
>> +
>>   /**
>>    * aio_disable_external:
>>    * @ctx: the aio context
>> @@ -491,9 +506,17 @@ int64_t aio_compute_timeout(AioContext *ctx);
>>    */
>>   static inline void aio_disable_external(AioContext *ctx)
>>   {
>> +    aio_context_acquire(ctx);
>>       atomic_inc(&ctx->external_disable_cnt);
>> +    aio_context_release(ctx);
>>   }
> 
> This acquire/release pair looks rather useless?
> 
>> +static void run_postponed_co(void *opaque)
>> +{
>> +    AioContext *ctx = (AioContext *) opaque;
>> +
>> +    qemu_co_queue_restart_all(&ctx->postponed_reqs);
>> +}
>>   /**
>>    * aio_enable_external:
>>    * @ctx: the aio context
>> @@ -504,12 +527,17 @@ static inline void aio_enable_external(AioContext *ctx)
>>   {
>>       int old;
>>   
>> +    aio_context_acquire(ctx);
>>       old = atomic_fetch_dec(&ctx->external_disable_cnt);
>>       assert(old > 0);
>>       if (old == 1) {
>> +        Coroutine *co = qemu_coroutine_create(run_postponed_co, ctx);
>> +        aio_co_enter(ctx, co);
>> +
>>           /* Kick event loop so it re-arms file descriptors */
>>           aio_notify(ctx);
>>       }
>> +    aio_context_release(ctx);
>>   }
>>   
>>   /**
>> @@ -564,15 +592,6 @@ void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
>>    */
>>   void aio_co_wake(struct Coroutine *co);
>>   
>> -/**
>> - * aio_co_enter:
>> - * @ctx: the context to run the coroutine
>> - * @co: the coroutine to run
>> - *
>> - * Enter a coroutine in the specified AioContext.
>> - */
>> -void aio_co_enter(AioContext *ctx, struct Coroutine *co);
>> -
>>   /**
>>    * Return the AioContext whose event loop runs in the current thread.
>>    *
>> diff --git a/include/block/block.h b/include/block/block.h
>> index 7f5453b45b..0685a73975 100644
>> --- a/include/block/block.h
>> +++ b/include/block/block.h
>> @@ -83,8 +83,14 @@ typedef enum {
>>        */
>>       BDRV_REQ_SERIALISING        = 0x80,
>>   
>> +    /*
>> +     * marks requests comming from external sources,
>> +     * e.g block requests from dma running in the vCPU thread
>> +     */
>> +    BDRV_REQ_EXTERNAL   = 0x100,
> 
> I don't like this one because BlockBackends should be used _only_ from
> external sources.
> 
> I know that this isn't quite true today and there are still block jobs
> calling BlockBackend internally while handling a BDS request, but I hope
> with Vladimir's backup patches going it, this can go away.
> 
> So I suggest that for the time being, we invert the flag and have a
> BDRV_REQ_INTERNAL instead, which is only used for BlockBackend requests,
> hopefully doesn't have to be used in many places and which can go away
> eventually.
> 
>>       /* Mask of valid flags */
>> -    BDRV_REQ_MASK               = 0xff,
>> +    BDRV_REQ_MASK               = 0xfff,
>>   } BdrvRequestFlags;
>>   
>>   typedef struct BlockSizes {
> 
> Kevin
> 

-- 
Best,
Denis

Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

Posted by Kevin Wolf 4 days ago
Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben:
> > Why involve the AioContext at all? This could all be kept at the
> > BlockBackend level without extending the layering violation that
> > aio_disable_external() is.
> > 
> > BlockBackends get notified when their root node is drained, so hooking
> > things up there should be as easy, if not even easier than in
> > AioContext.
> 
> Just want to make sure that I understood correctly what you meant by 
> "BlockBackends get notified". Did you mean that bdrv_drain_end calls
> child's role callback blk_root_drained_end by calling 
> bdrv_parent_drained_end?

Yes, blk_root_drained_begin/end calls are all you need. Specifically,
their adjustments to blk->quiesce_counter that are already there, and in
the 'if (--blk->quiesce_counter == 0)' block of blk_root_drained_end()
we can resume the queued requests.

> In case if it's so, it won't work if resume postponed requests in 
> blk_root_drained_end since we can't know if external is disabled for the 
> context because the counter showing that is decreased only after roles' 
> drained callbacks are finished at bdrv_do_drained_end.
> Please correct me if I'm wrong.

You don't need to know about the AioContext state, this is the whole
point. blk->quiesce_counter is what tells you whether to postpone
requests.

> Looking at the patch again, I think that it might be useful to keep the 
> requests in the structure that limits their execution and also protects 
> the access (context acquire/release) although it's indeed the layering 
> violation but at least we can store the parts related at the same place
> and later on move somewhere else alongside the request restrictor.

You can keep everything you need in BlockBackend (and that's also where
your code is that really postpones request).

Kevin

Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

Posted by Denis Plotnikov 3 days ago

On 12.12.2018 15:24, Kevin Wolf wrote:
> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben:
>>> Why involve the AioContext at all? This could all be kept at the
>>> BlockBackend level without extending the layering violation that
>>> aio_disable_external() is.
>>>
>>> BlockBackends get notified when their root node is drained, so hooking
>>> things up there should be as easy, if not even easier than in
>>> AioContext.
>>
>> Just want to make sure that I understood correctly what you meant by
>> "BlockBackends get notified". Did you mean that bdrv_drain_end calls
>> child's role callback blk_root_drained_end by calling
>> bdrv_parent_drained_end?
> 
> Yes, blk_root_drained_begin/end calls are all you need. Specifically,
> their adjustments to blk->quiesce_counter that are already there, and in
> the 'if (--blk->quiesce_counter == 0)' block of blk_root_drained_end()
> we can resume the queued requests.
It sounds like it should be so, but it doesn't work that way, and here is why:
when doing mirror we may resume the postponed coroutines too early, while the 
underlying bs is still protected from writing, and thus we hit the 
assert on write request execution at bdrv_co_write_req_prepare when 
resuming the postponed coroutines.

The thing is that the bs is protected from writing before the execution of 
bdrv_replace_node at mirror_exit_common, and bdrv_replace_node calls 
bdrv_replace_child_noperm which, in turn, calls child->role->drained_end, 
where one of the callbacks is blk_root_drained_end, which checks 
if (--blk->quiesce_counter == 0) and runs the postponed requests 
(coroutines) if the condition is true.

It seems that if external requests are disabled on the context we can't 
rely on anything, or we should check whether the underlying bs and its 
underlying nodes are ready to receive requests, which sounds quite 
complicated.
Please correct me if I still don't understand something in that routine.

Denis
>> In case if it's so, it won't work if resume postponed requests in
>> blk_root_drained_end since we can't know if external is disabled for the
>> context because the counter showing that is decreased only after roles'
>> drained callbacks are finished at bdrv_do_drained_end.
>> Please correct me if I'm wrong.
> 
> You don't need to know about the AioContext state, this is the whole
> point. blk->quiesce_counter is what tells you whether to postpone
> requests.
> 
>> Looking at the patch again, I think that it might be useful to keep the
>> requests in the structure that limits their execution and also protects
>> the access (context acquire/release) although it's indeed the layering
>> violation but at least we can store the parts related at the same place
>> and later on move somewhere else alongside the request restrictor.
> 
> You can keep everything you need in BlockBackend (and that's also where
> your code is that really postpones request).
> 
> Kevin
> 

-- 
Best,
Denis

Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

Posted by Kevin Wolf 3 days ago
Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben:
> On 12.12.2018 15:24, Kevin Wolf wrote:
> > Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben:
> >>> Why involve the AioContext at all? This could all be kept at the
> >>> BlockBackend level without extending the layering violation that
> >>> aio_disable_external() is.
> >>>
> >>> BlockBackends get notified when their root node is drained, so hooking
> >>> things up there should be as easy, if not even easier than in
> >>> AioContext.
> >>
> >> Just want to make sure that I understood correctly what you meant by
> >> "BlockBackends get notified". Did you mean that bdrv_drain_end calls
> >> child's role callback blk_root_drained_end by calling
> >> bdrv_parent_drained_end?
> > 
> > Yes, blk_root_drained_begin/end calls are all you need. Specifically,
> > their adjustments to blk->quiesce_counter that are already there, and in
> > the 'if (--blk->quiesce_counter == 0)' block of blk_root_drained_end()
> > we can resume the queued requests.
> Sounds it should be so, but it doesn't work that way and that's why:
> when doing mirror we may resume postponed coroutines too early when the 
> underlying bs is protected from writing at and thus we encounter the 
> assert on a write request execution at bdrv_co_write_req_prepare when 
> resuming the postponed coroutines.
> 
> The thing is that the bs is protected for writing before execution of 
> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls 
> bdrv_replace_child_noperm which, in turn, calls child->role->drained_end 
> where one of the callbacks is blk_root_drained_end which check 
> if(--blk->quiesce_counter == 0) and runs the postponed requests 
> (coroutines) if the condition is true.

Hm, so something is messed up with the drain sections in the mirror
driver. We have:

    bdrv_drained_begin(target_bs);
    bdrv_replace_node(to_replace, target_bs, &local_err);
    bdrv_drained_end(target_bs);

Obviously, the intention was to keep the BlockBackend drained during
bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0
inside bdrv_replace_node() when target_bs is drained?

Looking at bdrv_replace_child_noperm(), it seems that the function has
a bug: Even if old_bs and new_bs are both drained, the quiesce_counter
for the parent reaches 0 for a moment because we call .drained_end for
the old child first and .drained_begin for the new one later.

So it seems the fix would be to reverse the order and first call
.drained_begin for the new child and then .drained_end for the old
child. Sounds like a good new testcase for tests/test-bdrv-drain.c, too.

> In seems that if the external requests disabled on the context we can't 
> rely on anything or should check where the underlying bs and its 
> underlying nodes are ready to receive requests which sounds quite 
> complicated.
> Please correct me if still don't understand something in that routine.

I think the reason why relying on aio_disable_external() works is simply
because src is also drained, which keeps external events in the
AioContext disabled despite the bug in draining the target node.

The bug would become apparent even with aio_disable_external() if we
didn't drain src, or even if we just supported src and target being in
different AioContexts.

Kevin

Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

Posted by Denis Plotnikov 2 days ago

On 13.12.2018 15:20, Kevin Wolf wrote:
> Am 13.12.2018 um 12:07 hat Denis Plotnikov geschrieben:
>> On 12.12.2018 15:24, Kevin Wolf wrote:
>>> Am 11.12.2018 um 17:55 hat Denis Plotnikov geschrieben:
>>>>> Why involve the AioContext at all? This could all be kept at the
>>>>> BlockBackend level without extending the layering violation that
>>>>> aio_disable_external() is.
>>>>>
>>>>> BlockBackends get notified when their root node is drained, so hooking
>>>>> things up there should be as easy, if not even easier than in
>>>>> AioContext.
>>>>
>>>> Just want to make sure that I understood correctly what you meant by
>>>> "BlockBackends get notified". Did you mean that bdrv_drain_end calls
>>>> child's role callback blk_root_drained_end by calling
>>>> bdrv_parent_drained_end?
>>>
>>> Yes, blk_root_drained_begin/end calls are all you need. Specifically,
>>> their adjustments to blk->quiesce_counter that are already there, and in
>>> the 'if (--blk->quiesce_counter == 0)' block of blk_root_drained_end()
>>> we can resume the queued requests.
>> Sounds it should be so, but it doesn't work that way and that's why:
>> when doing mirror we may resume postponed coroutines too early when the
>> underlying bs is protected from writing at and thus we encounter the
>> assert on a write request execution at bdrv_co_write_req_prepare when
>> resuming the postponed coroutines.
>>
>> The thing is that the bs is protected for writing before execution of
>> bdrv_replace_node at mirror_exit_common and bdrv_replace_node calls
>> bdrv_replace_child_noperm which, in turn, calls child->role->drained_end
>> where one of the callbacks is blk_root_drained_end which check
>> if(--blk->quiesce_counter == 0) and runs the postponed requests
>> (coroutines) if the condition is true.
> 
> Hm, so something is messed up with the drain sections in the mirror
> driver. We have:
> 
>      bdrv_drained_begin(target_bs);
>      bdrv_replace_node(to_replace, target_bs, &local_err);
>      bdrv_drained_end(target_bs);
> 
> Obviously, the intention was to keep the BlockBackend drained during
> bdrv_replace_node(). So how could blk->quiesce_counter ever get to 0
> inside bdrv_replace_node() when target_bs is drained?
> 
> Looking at bdrv_replace_child_noperm(), it seems that the function has
> a bug: Even if old_bs and new_bs are both drained, the quiesce_counter
> for the parent reaches 0 for a moment because we call .drained_end for
> the old child first and .drained_begin for the new one later.
> 
> So it seems the fix would be to reverse the order and first call
> .drained_begin for the new child and then .drained_end for the old
> child. Sounds like a good new testcase for tests/test-bdrv-drain.c, too.
Yes, that's true, but it's not enough...
In mirror_exit_common() we actively manipulate block driver states.
When we replaced a node in the snippet you showed we can't allow the 
postponed coroutines to run because the block tree isn't ready to 
receive the requests yet.
To be ready, we need to insert a proper block driver state into the 
block backend, which is done here

     blk_remove_bs(bjob->blk);
     blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort);
     blk_insert_bs(bjob->blk, mirror_top_bs, &error_abort); << << << <<

     bs_opaque->job = NULL;

     bdrv_drained_end(src);

If the tree isn't ready and we resume the coroutines, we'll end up with 
the request landing in a wrong block driver state.

So, we explicitly should stop all activities on all the driver states
and its parents and allow the activities when everything is ready to go.

Why explicitly, because the block driver states may belong to different 
block backends at the moment of the manipulation beginning.

So, it seems we need to disable all their contexts until the 
manipulation ends.

Please, correct me if I'm wrong.

> 
>> In seems that if the external requests disabled on the context we can't
>> rely on anything or should check where the underlying bs and its
>> underlying nodes are ready to receive requests which sounds quite
>> complicated.
>> Please correct me if still don't understand something in that routine.
> 
> I think the reason why relying on aio_disable_external() works is simply
> because src is also drained, which keeps external events in the
> AioContext disabled despite the bug in draining the target node.
> 
> The bug would become apparent even with aio_disable_external() if we
> didn't drain src, or even if we just supported src and target being in
> different AioContexts.

Why don't we disable all those contexts involved until the end of the 
block device tree reconstruction?

Thanks!

Denis
> 
> Kevin
> 

-- 
Best,
Denis

Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

Posted by Denis Plotnikov 6 days ago

On 07.12.2018 15:26, Kevin Wolf wrote:
> Am 05.12.2018 um 13:23 hat Denis Plotnikov geschrieben:
>> At the time, the "drained section" doesn't protect Block Driver State
>> from the requests appearing in the vCPU threads.
>> This could lead to the data loss because of request coming to
>> an unexpected BDS.
>>
>> For example, when a request comes to ide controller from the guest,
>> the controller creates a request coroutine and executes the coroutine
>> in the vCPU thread. If another thread(iothread) has entered the
>> "drained section" on a BDS with bdrv_drained_begin, which protects
>> BDS' AioContext from external requests, and released the AioContext
>> because of finishing some coroutine by the moment of the request
>> appearing at the ide controller, the controller acquires the AioContext
>> and executes its request without any respect to the entered
>> "drained section" producing any kinds of data inconsistency.
>>
>> The patch prevents this case by putting requests from external threads to
>> the queue on AioContext while the context is protected for external requests
>> and executes those requests later on the external requests protection removing.
In general, I agree with the comments and going to make changes in the 
patches accordingly.

Also, I'd like to ask a question below
>>
>> Also, the patch marks requests generated in a vCPU thread as external ones
>> to make use of the request postponing.
>>
>> How to reproduce:
>> 1. start vm with an ide disk and a linux guest
>> 2. in the guest run: dd if=... of=... bs=4K count=100000 oflag=direct
>> 3. (qemu) drive_mirror "disk-name"
>> 4. wait until block job can receive block_job_complete
>> 5. (qemu) block_job_complete "disk-name"
>> 6. blk_aio_p[read|write]v may appear in vCPU context (here is the race)
>>
>> Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
> 
> This is getting closer, but I'd like to see two more major changes:
> 
>> diff --git a/include/block/aio.h b/include/block/aio.h
>> index 0ca25dfec6..8512bda44e 100644
>> --- a/include/block/aio.h
>> +++ b/include/block/aio.h
>> @@ -19,6 +19,7 @@
>>   #include "qemu/event_notifier.h"
>>   #include "qemu/thread.h"
>>   #include "qemu/timer.h"
>> +#include "qemu/coroutine.h"
>>   
>>   typedef struct BlockAIOCB BlockAIOCB;
>>   typedef void BlockCompletionFunc(void *opaque, int ret);
>> @@ -130,6 +131,11 @@ struct AioContext {
>>       QEMUTimerListGroup tlg;
>>   
>>       int external_disable_cnt;
>> +    /* Queue to store the requests coming when the context is disabled for
>> +     * external requests.
>> +     * Don't use a separate lock for protection relying the context lock
>> +     */
>> +    CoQueue postponed_reqs;
> 
> Why involve the AioContext at all? This could all be kept at the
> BlockBackend level without extending the layering violation that
> aio_disable_external() is.
> 
> BlockBackends get notified when their root node is drained, so hooking
> things up there should be as easy, if not even easier than in
> AioContext.
> 
>>       /* Number of AioHandlers without .io_poll() */
>>       int poll_disable_cnt;
>> @@ -483,6 +489,15 @@ static inline void aio_timer_init(AioContext *ctx,
>>    */
>>   int64_t aio_compute_timeout(AioContext *ctx);
>>   
>> +/**
>> + * aio_co_enter:
>> + * @ctx: the context to run the coroutine
>> + * @co: the coroutine to run
>> + *
>> + * Enter a coroutine in the specified AioContext.
>> + */
>> +void aio_co_enter(AioContext *ctx, struct Coroutine *co);
>> +
>>   /**
>>    * aio_disable_external:
>>    * @ctx: the aio context
>> @@ -491,9 +506,17 @@ int64_t aio_compute_timeout(AioContext *ctx);
>>    */
>>   static inline void aio_disable_external(AioContext *ctx)
>>   {
>> +    aio_context_acquire(ctx);
>>       atomic_inc(&ctx->external_disable_cnt);
>> +    aio_context_release(ctx);
>>   }
> 
> This acquire/release pair looks rather useless?

I'm not sure that I understand everything correctly...
but can a thread (context) try to disable external in another context?

> 
>> +static void run_postponed_co(void *opaque)
>> +{
>> +    AioContext *ctx = (AioContext *) opaque;
>> +
>> +    qemu_co_queue_restart_all(&ctx->postponed_reqs);
>> +}
>>   /**
>>    * aio_enable_external:
>>    * @ctx: the aio context
>> @@ -504,12 +527,17 @@ static inline void aio_enable_external(AioContext *ctx)
>>   {
>>       int old;
>>   
>> +    aio_context_acquire(ctx);
>>       old = atomic_fetch_dec(&ctx->external_disable_cnt);
>>       assert(old > 0);
>>       if (old == 1) {
>> +        Coroutine *co = qemu_coroutine_create(run_postponed_co, ctx);
>> +        aio_co_enter(ctx, co);
>> +
>>           /* Kick event loop so it re-arms file descriptors */
>>           aio_notify(ctx);
>>       }
>> +    aio_context_release(ctx);
>>   }
>>   
>>   /**
>> @@ -564,15 +592,6 @@ void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
>>    */
>>   void aio_co_wake(struct Coroutine *co);
>>   
>> -/**
>> - * aio_co_enter:
>> - * @ctx: the context to run the coroutine
>> - * @co: the coroutine to run
>> - *
>> - * Enter a coroutine in the specified AioContext.
>> - */
>> -void aio_co_enter(AioContext *ctx, struct Coroutine *co);
>> -
>>   /**
>>    * Return the AioContext whose event loop runs in the current thread.
>>    *
>> diff --git a/include/block/block.h b/include/block/block.h
>> index 7f5453b45b..0685a73975 100644
>> --- a/include/block/block.h
>> +++ b/include/block/block.h
>> @@ -83,8 +83,14 @@ typedef enum {
>>        */
>>       BDRV_REQ_SERIALISING        = 0x80,
>>   
>> +    /*
>> +     * marks requests comming from external sources,
>> +     * e.g block requests from dma running in the vCPU thread
>> +     */
>> +    BDRV_REQ_EXTERNAL   = 0x100,
> 
> I don't like this one because BlockBackends should be used _only_ from
> external sources.
> 
> I know that this isn't quite true today and there are still block jobs
> calling BlockBackend internally while handling a BDS request, but I hope
> with Vladimir's backup patches going it, this can go away.
> 
> So I suggest that for the time being, we invert the flag and have a
> BDRV_REQ_INTERNAL instead, which is only used for BlockBackend requests,
> hopefully doesn't have to be used in many places and which can go away
> eventually.
> 
>>       /* Mask of valid flags */
>> -    BDRV_REQ_MASK               = 0xff,
>> +    BDRV_REQ_MASK               = 0xfff,
>>   } BdrvRequestFlags;
>>   
>>   typedef struct BlockSizes {
> 
> Kevin
> 
Denis
-- 
Best,
Denis

Re: [Qemu-devel] [PATCH] blk: postpone request execution on a context protected with "drained section"

Posted by Kevin Wolf 6 days ago
Am 10.12.2018 um 13:14 hat Denis Plotnikov geschrieben:
> >> @@ -491,9 +506,17 @@ int64_t aio_compute_timeout(AioContext *ctx);
> >>    */
> >>   static inline void aio_disable_external(AioContext *ctx)
> >>   {
> >> +    aio_context_acquire(ctx);
> >>       atomic_inc(&ctx->external_disable_cnt);
> >> +    aio_context_release(ctx);
> >>   }
> > 
> > This acquire/release pair looks rather useless?
> 
> I'm not sure that I understand everything correctly...
> but can a thread (context) try to disable external in another context?

Yes, that can happen. For example, think of bdrv_drain_all().

Kevin