vduse_blk_detach_ctx() waits for in-flight requests using
AIO_WAIT_WHILE(). This is not allowed according to a comment in
bdrv_set_aio_context_commit():
/*
* Take the old AioContex when detaching it from bs.
* At this point, new_context lock is already acquired, and we are now
* also taking old_context. This is safe as long as bdrv_detach_aio_context
* does not call AIO_POLL_WHILE().
*/
Use this opportunity to rewrite the drain code in vduse-blk:
- Use the BlockExport refcount so that vduse_blk_exp_delete() is only
called when there are no more requests in flight.
- Implement .drained_poll() so in-flight request coroutines are stopped
by the time .bdrv_detach_aio_context() is called.
- Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the
.bdrv_detach_aio_context() constraint violation. It's no longer
needed due to the previous changes.
- Always handle the VDUSE file descriptor, even in drained sections. The
VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in
drained sections. This ensures that the VDUSE kernel code gets a fast
response.
- Suspend virtqueue fd handlers in .drained_begin() and resume them in
.drained_end(). This eliminates the need for the
aio_set_fd_handler(is_external=true) flag, which is being removed from
QEMU.
This is a long list but splitting it into individual commits would
probably lead to git bisect failures - the changes are all related.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------
1 file changed, 93 insertions(+), 39 deletions(-)
diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
index f7ae44e3ce..35dc8fcf45 100644
--- a/block/export/vduse-blk.c
+++ b/block/export/vduse-blk.c
@@ -31,7 +31,8 @@ typedef struct VduseBlkExport {
VduseDev *dev;
uint16_t num_queues;
char *recon_file;
- unsigned int inflight;
+ unsigned int inflight; /* atomic */
+ bool vqs_started;
} VduseBlkExport;
typedef struct VduseBlkReq {
@@ -41,13 +42,24 @@ typedef struct VduseBlkReq {
static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
{
- vblk_exp->inflight++;
+ if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) {
+ /* Prevent export from being deleted */
+ aio_context_acquire(vblk_exp->export.ctx);
+ blk_exp_ref(&vblk_exp->export);
+ aio_context_release(vblk_exp->export.ctx);
+ }
}
static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp)
{
- if (--vblk_exp->inflight == 0) {
+ if (qatomic_fetch_dec(&vblk_exp->inflight) == 1) {
+ /* Wake AIO_WAIT_WHILE() */
aio_wait_kick();
+
+ /* Now the export can be deleted */
+ aio_context_acquire(vblk_exp->export.ctx);
+ blk_exp_unref(&vblk_exp->export);
+ aio_context_release(vblk_exp->export.ctx);
}
}
@@ -124,8 +136,12 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
{
VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
+ if (!vblk_exp->vqs_started) {
+ return; /* vduse_blk_drained_end() will start vqs later */
+ }
+
aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
- true, on_vduse_vq_kick, NULL, NULL, NULL, vq);
+ false, on_vduse_vq_kick, NULL, NULL, NULL, vq);
/* Make sure we don't miss any kick afer reconnecting */
eventfd_write(vduse_queue_get_fd(vq), 1);
}
@@ -133,9 +149,14 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
{
VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
+ int fd = vduse_queue_get_fd(vq);
- aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
- true, NULL, NULL, NULL, NULL, NULL);
+ if (fd < 0) {
+ return;
+ }
+
+ aio_set_fd_handler(vblk_exp->export.ctx, fd, false,
+ NULL, NULL, NULL, NULL, NULL);
}
static const VduseOps vduse_blk_ops = {
@@ -152,42 +173,19 @@ static void on_vduse_dev_kick(void *opaque)
static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx)
{
- int i;
-
aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
- true, on_vduse_dev_kick, NULL, NULL, NULL,
+ false, on_vduse_dev_kick, NULL, NULL, NULL,
vblk_exp->dev);
- for (i = 0; i < vblk_exp->num_queues; i++) {
- VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
- int fd = vduse_queue_get_fd(vq);
-
- if (fd < 0) {
- continue;
- }
- aio_set_fd_handler(vblk_exp->export.ctx, fd, true,
- on_vduse_vq_kick, NULL, NULL, NULL, vq);
- }
+ /* Virtqueues are handled by vduse_blk_drained_end() */
}
static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp)
{
- int i;
-
- for (i = 0; i < vblk_exp->num_queues; i++) {
- VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
- int fd = vduse_queue_get_fd(vq);
-
- if (fd < 0) {
- continue;
- }
- aio_set_fd_handler(vblk_exp->export.ctx, fd,
- true, NULL, NULL, NULL, NULL, NULL);
- }
aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
- true, NULL, NULL, NULL, NULL, NULL);
+ false, NULL, NULL, NULL, NULL, NULL);
- AIO_WAIT_WHILE(vblk_exp->export.ctx, vblk_exp->inflight > 0);
+ /* Virtqueues are handled by vduse_blk_drained_begin() */
}
@@ -220,8 +218,55 @@ static void vduse_blk_resize(void *opaque)
(char *)&config.capacity);
}
+static void vduse_blk_stop_virtqueues(VduseBlkExport *vblk_exp)
+{
+ for (uint16_t i = 0; i < vblk_exp->num_queues; i++) {
+ VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
+ vduse_blk_disable_queue(vblk_exp->dev, vq);
+ }
+
+ vblk_exp->vqs_started = false;
+}
+
+static void vduse_blk_start_virtqueues(VduseBlkExport *vblk_exp)
+{
+ vblk_exp->vqs_started = true;
+
+ for (uint16_t i = 0; i < vblk_exp->num_queues; i++) {
+ VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
+ vduse_blk_enable_queue(vblk_exp->dev, vq);
+ }
+}
+
+static void vduse_blk_drained_begin(void *opaque)
+{
+ BlockExport *exp = opaque;
+ VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+
+ vduse_blk_stop_virtqueues(vblk_exp);
+}
+
+static void vduse_blk_drained_end(void *opaque)
+{
+ BlockExport *exp = opaque;
+ VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+
+ vduse_blk_start_virtqueues(vblk_exp);
+}
+
+static bool vduse_blk_drained_poll(void *opaque)
+{
+ BlockExport *exp = opaque;
+ VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
+
+ return qatomic_read(&vblk_exp->inflight) > 0;
+}
+
static const BlockDevOps vduse_block_ops = {
- .resize_cb = vduse_blk_resize,
+ .resize_cb = vduse_blk_resize,
+ .drained_begin = vduse_blk_drained_begin,
+ .drained_end = vduse_blk_drained_end,
+ .drained_poll = vduse_blk_drained_poll,
};
static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
@@ -268,6 +313,7 @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
vblk_exp->handler.serial = g_strdup(vblk_opts->serial ?: "");
vblk_exp->handler.logical_block_size = logical_block_size;
vblk_exp->handler.writable = opts->writable;
+ vblk_exp->vqs_started = true;
config.capacity =
cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
@@ -322,14 +368,20 @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
vduse_dev_setup_queue(vblk_exp->dev, i, queue_size);
}
- aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), true,
+ aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), false,
on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev);
blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
vblk_exp);
-
blk_set_dev_ops(exp->blk, &vduse_block_ops, exp);
+ /*
+ * We handle draining ourselves using an in-flight counter and by disabling
+ * virtqueue fd handlers. Do not queue BlockBackend requests, they need to
+ * complete so the in-flight counter reaches zero.
+ */
+ blk_set_disable_request_queuing(exp->blk, true);
+
return 0;
err:
vduse_dev_destroy(vblk_exp->dev);
@@ -344,6 +396,9 @@ static void vduse_blk_exp_delete(BlockExport *exp)
VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
int ret;
+ assert(qatomic_read(&vblk_exp->inflight) == 0);
+
+ vduse_blk_detach_ctx(vblk_exp);
blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
vblk_exp);
blk_set_dev_ops(exp->blk, NULL, NULL);
@@ -355,13 +410,12 @@ static void vduse_blk_exp_delete(BlockExport *exp)
g_free(vblk_exp->handler.serial);
}
+/* Called with exp->ctx acquired */
static void vduse_blk_exp_request_shutdown(BlockExport *exp)
{
VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
- aio_context_acquire(vblk_exp->export.ctx);
- vduse_blk_detach_ctx(vblk_exp);
- aio_context_acquire(vblk_exp->export.ctx);
+ vduse_blk_stop_virtqueues(vblk_exp);
}
const BlockExportDriver blk_exp_vduse_blk = {
--
2.39.2
Hi Stefan,
On Thu, Apr 20, 2023 at 7:39 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> vduse_blk_detach_ctx() waits for in-flight requests using
> AIO_WAIT_WHILE(). This is not allowed according to a comment in
> bdrv_set_aio_context_commit():
>
> /*
> * Take the old AioContex when detaching it from bs.
> * At this point, new_context lock is already acquired, and we are now
> * also taking old_context. This is safe as long as bdrv_detach_aio_context
> * does not call AIO_POLL_WHILE().
> */
>
> Use this opportunity to rewrite the drain code in vduse-blk:
>
> - Use the BlockExport refcount so that vduse_blk_exp_delete() is only
> called when there are no more requests in flight.
>
> - Implement .drained_poll() so in-flight request coroutines are stopped
> by the time .bdrv_detach_aio_context() is called.
>
> - Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the
> .bdrv_detach_aio_context() constraint violation. It's no longer
> needed due to the previous changes.
>
> - Always handle the VDUSE file descriptor, even in drained sections. The
> VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in
> drained sections. This ensures that the VDUSE kernel code gets a fast
> response.
>
> - Suspend virtqueue fd handlers in .drained_begin() and resume them in
> .drained_end(). This eliminates the need for the
> aio_set_fd_handler(is_external=true) flag, which is being removed from
> QEMU.
>
> This is a long list but splitting it into individual commits would
> probably lead to git bisect failures - the changes are all related.
>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
> block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------
> 1 file changed, 93 insertions(+), 39 deletions(-)
>
> diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
> index f7ae44e3ce..35dc8fcf45 100644
> --- a/block/export/vduse-blk.c
> +++ b/block/export/vduse-blk.c
> @@ -31,7 +31,8 @@ typedef struct VduseBlkExport {
> VduseDev *dev;
> uint16_t num_queues;
> char *recon_file;
> - unsigned int inflight;
> + unsigned int inflight; /* atomic */
> + bool vqs_started;
> } VduseBlkExport;
>
> typedef struct VduseBlkReq {
> @@ -41,13 +42,24 @@ typedef struct VduseBlkReq {
>
> static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
> {
> - vblk_exp->inflight++;
> + if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) {
I wonder why we need to use atomic operations here.
> + /* Prevent export from being deleted */
> + aio_context_acquire(vblk_exp->export.ctx);
> + blk_exp_ref(&vblk_exp->export);
> + aio_context_release(vblk_exp->export.ctx);
> + }
> }
>
> static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp)
> {
> - if (--vblk_exp->inflight == 0) {
> + if (qatomic_fetch_dec(&vblk_exp->inflight) == 1) {
> + /* Wake AIO_WAIT_WHILE() */
> aio_wait_kick();
> +
> + /* Now the export can be deleted */
> + aio_context_acquire(vblk_exp->export.ctx);
> + blk_exp_unref(&vblk_exp->export);
> + aio_context_release(vblk_exp->export.ctx);
> }
> }
>
> @@ -124,8 +136,12 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
> {
> VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
>
> + if (!vblk_exp->vqs_started) {
> + return; /* vduse_blk_drained_end() will start vqs later */
> + }
> +
> aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
> - true, on_vduse_vq_kick, NULL, NULL, NULL, vq);
> + false, on_vduse_vq_kick, NULL, NULL, NULL, vq);
> /* Make sure we don't miss any kick afer reconnecting */
> eventfd_write(vduse_queue_get_fd(vq), 1);
> }
> @@ -133,9 +149,14 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
> static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
> {
> VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
> + int fd = vduse_queue_get_fd(vq);
>
> - aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
> - true, NULL, NULL, NULL, NULL, NULL);
> + if (fd < 0) {
> + return;
> + }
> +
> + aio_set_fd_handler(vblk_exp->export.ctx, fd, false,
> + NULL, NULL, NULL, NULL, NULL);
> }
>
> static const VduseOps vduse_blk_ops = {
> @@ -152,42 +173,19 @@ static void on_vduse_dev_kick(void *opaque)
>
> static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx)
> {
> - int i;
> -
> aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
> - true, on_vduse_dev_kick, NULL, NULL, NULL,
> + false, on_vduse_dev_kick, NULL, NULL, NULL,
> vblk_exp->dev);
>
> - for (i = 0; i < vblk_exp->num_queues; i++) {
> - VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
> - int fd = vduse_queue_get_fd(vq);
> -
> - if (fd < 0) {
> - continue;
> - }
> - aio_set_fd_handler(vblk_exp->export.ctx, fd, true,
> - on_vduse_vq_kick, NULL, NULL, NULL, vq);
> - }
> + /* Virtqueues are handled by vduse_blk_drained_end() */
> }
>
> static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp)
> {
> - int i;
> -
> - for (i = 0; i < vblk_exp->num_queues; i++) {
> - VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
> - int fd = vduse_queue_get_fd(vq);
> -
> - if (fd < 0) {
> - continue;
> - }
> - aio_set_fd_handler(vblk_exp->export.ctx, fd,
> - true, NULL, NULL, NULL, NULL, NULL);
> - }
> aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
> - true, NULL, NULL, NULL, NULL, NULL);
> + false, NULL, NULL, NULL, NULL, NULL);
>
> - AIO_WAIT_WHILE(vblk_exp->export.ctx, vblk_exp->inflight > 0);
> + /* Virtqueues are handled by vduse_blk_drained_begin() */
> }
>
>
> @@ -220,8 +218,55 @@ static void vduse_blk_resize(void *opaque)
> (char *)&config.capacity);
> }
>
> +static void vduse_blk_stop_virtqueues(VduseBlkExport *vblk_exp)
> +{
> + for (uint16_t i = 0; i < vblk_exp->num_queues; i++) {
> + VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
> + vduse_blk_disable_queue(vblk_exp->dev, vq);
> + }
> +
> + vblk_exp->vqs_started = false;
> +}
> +
> +static void vduse_blk_start_virtqueues(VduseBlkExport *vblk_exp)
> +{
> + vblk_exp->vqs_started = true;
> +
> + for (uint16_t i = 0; i < vblk_exp->num_queues; i++) {
> + VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
> + vduse_blk_enable_queue(vblk_exp->dev, vq);
> + }
> +}
> +
> +static void vduse_blk_drained_begin(void *opaque)
> +{
> + BlockExport *exp = opaque;
> + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> +
> + vduse_blk_stop_virtqueues(vblk_exp);
> +}
> +
> +static void vduse_blk_drained_end(void *opaque)
> +{
> + BlockExport *exp = opaque;
> + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> +
> + vduse_blk_start_virtqueues(vblk_exp);
> +}
> +
> +static bool vduse_blk_drained_poll(void *opaque)
> +{
> + BlockExport *exp = opaque;
> + VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> +
> + return qatomic_read(&vblk_exp->inflight) > 0;
> +}
> +
> static const BlockDevOps vduse_block_ops = {
> - .resize_cb = vduse_blk_resize,
> + .resize_cb = vduse_blk_resize,
> + .drained_begin = vduse_blk_drained_begin,
> + .drained_end = vduse_blk_drained_end,
> + .drained_poll = vduse_blk_drained_poll,
> };
>
> static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
> @@ -268,6 +313,7 @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
> vblk_exp->handler.serial = g_strdup(vblk_opts->serial ?: "");
> vblk_exp->handler.logical_block_size = logical_block_size;
> vblk_exp->handler.writable = opts->writable;
> + vblk_exp->vqs_started = true;
>
> config.capacity =
> cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
> @@ -322,14 +368,20 @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
> vduse_dev_setup_queue(vblk_exp->dev, i, queue_size);
> }
>
> - aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), true,
> + aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), false,
> on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev);
>
> blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
> vblk_exp);
> -
> blk_set_dev_ops(exp->blk, &vduse_block_ops, exp);
>
> + /*
> + * We handle draining ourselves using an in-flight counter and by disabling
> + * virtqueue fd handlers. Do not queue BlockBackend requests, they need to
> + * complete so the in-flight counter reaches zero.
> + */
> + blk_set_disable_request_queuing(exp->blk, true);
> +
> return 0;
> err:
> vduse_dev_destroy(vblk_exp->dev);
> @@ -344,6 +396,9 @@ static void vduse_blk_exp_delete(BlockExport *exp)
> VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> int ret;
>
> + assert(qatomic_read(&vblk_exp->inflight) == 0);
> +
> + vduse_blk_detach_ctx(vblk_exp);
> blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
> vblk_exp);
> blk_set_dev_ops(exp->blk, NULL, NULL);
> @@ -355,13 +410,12 @@ static void vduse_blk_exp_delete(BlockExport *exp)
> g_free(vblk_exp->handler.serial);
> }
>
> +/* Called with exp->ctx acquired */
> static void vduse_blk_exp_request_shutdown(BlockExport *exp)
> {
> VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
>
> - aio_context_acquire(vblk_exp->export.ctx);
> - vduse_blk_detach_ctx(vblk_exp);
> - aio_context_acquire(vblk_exp->export.ctx);
> + vduse_blk_stop_virtqueues(vblk_exp);
Can we add a AIO_WAIT_WHILE() here? Then we don't need to
increase/decrease the BlockExport refcount during I/O processing.
Thanks,
Yongji
On Fri, Apr 21, 2023 at 11:36:02AM +0800, Yongji Xie wrote:
> Hi Stefan,
>
> On Thu, Apr 20, 2023 at 7:39 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >
> > vduse_blk_detach_ctx() waits for in-flight requests using
> > AIO_WAIT_WHILE(). This is not allowed according to a comment in
> > bdrv_set_aio_context_commit():
> >
> > /*
> > * Take the old AioContex when detaching it from bs.
> > * At this point, new_context lock is already acquired, and we are now
> > * also taking old_context. This is safe as long as bdrv_detach_aio_context
> > * does not call AIO_POLL_WHILE().
> > */
> >
> > Use this opportunity to rewrite the drain code in vduse-blk:
> >
> > - Use the BlockExport refcount so that vduse_blk_exp_delete() is only
> > called when there are no more requests in flight.
> >
> > - Implement .drained_poll() so in-flight request coroutines are stopped
> > by the time .bdrv_detach_aio_context() is called.
> >
> > - Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the
> > .bdrv_detach_aio_context() constraint violation. It's no longer
> > needed due to the previous changes.
> >
> > - Always handle the VDUSE file descriptor, even in drained sections. The
> > VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in
> > drained sections. This ensures that the VDUSE kernel code gets a fast
> > response.
> >
> > - Suspend virtqueue fd handlers in .drained_begin() and resume them in
> > .drained_end(). This eliminates the need for the
> > aio_set_fd_handler(is_external=true) flag, which is being removed from
> > QEMU.
> >
> > This is a long list but splitting it into individual commits would
> > probably lead to git bisect failures - the changes are all related.
> >
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> > block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------
> > 1 file changed, 93 insertions(+), 39 deletions(-)
> >
> > diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
> > index f7ae44e3ce..35dc8fcf45 100644
> > --- a/block/export/vduse-blk.c
> > +++ b/block/export/vduse-blk.c
> > @@ -31,7 +31,8 @@ typedef struct VduseBlkExport {
> > VduseDev *dev;
> > uint16_t num_queues;
> > char *recon_file;
> > - unsigned int inflight;
> > + unsigned int inflight; /* atomic */
> > + bool vqs_started;
> > } VduseBlkExport;
> >
> > typedef struct VduseBlkReq {
> > @@ -41,13 +42,24 @@ typedef struct VduseBlkReq {
> >
> > static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
> > {
> > - vblk_exp->inflight++;
> > + if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) {
>
> I wonder why we need to use atomic operations here.
The inflight counter is only modified by the vhost-user export thread,
but it may be read by another thread here:
static bool vduse_blk_drained_poll(void *opaque)
{
BlockExport *exp = opaque;
VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
return qatomic_read(&vblk_exp->inflight) > 0;
BlockDevOps->drained_poll() calls are invoked when BlockDriverStates are
drained (e.g. blk_drain_all() and related APIs).
> > @@ -355,13 +410,12 @@ static void vduse_blk_exp_delete(BlockExport *exp)
> > g_free(vblk_exp->handler.serial);
> > }
> >
> > +/* Called with exp->ctx acquired */
> > static void vduse_blk_exp_request_shutdown(BlockExport *exp)
> > {
> > VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> >
> > - aio_context_acquire(vblk_exp->export.ctx);
> > - vduse_blk_detach_ctx(vblk_exp);
> > - aio_context_acquire(vblk_exp->export.ctx);
> > + vduse_blk_stop_virtqueues(vblk_exp);
>
> Can we add a AIO_WAIT_WHILE() here? Then we don't need to
> increase/decrease the BlockExport refcount during I/O processing.
I don't think so because vduse_blk_exp_request_shutdown() is not the
only place where we wait for requests to complete. There would still
need to be away to wait for requests to finish (without calling
AIO_WAIT_WHILE()) in vduse_blk_drained_poll().
Stefan
On Wed, Apr 26, 2023 at 12:43 AM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> On Fri, Apr 21, 2023 at 11:36:02AM +0800, Yongji Xie wrote:
> > Hi Stefan,
> >
> > On Thu, Apr 20, 2023 at 7:39 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > >
> > > vduse_blk_detach_ctx() waits for in-flight requests using
> > > AIO_WAIT_WHILE(). This is not allowed according to a comment in
> > > bdrv_set_aio_context_commit():
> > >
> > > /*
> > > * Take the old AioContex when detaching it from bs.
> > > * At this point, new_context lock is already acquired, and we are now
> > > * also taking old_context. This is safe as long as bdrv_detach_aio_context
> > > * does not call AIO_POLL_WHILE().
> > > */
> > >
> > > Use this opportunity to rewrite the drain code in vduse-blk:
> > >
> > > - Use the BlockExport refcount so that vduse_blk_exp_delete() is only
> > > called when there are no more requests in flight.
> > >
> > > - Implement .drained_poll() so in-flight request coroutines are stopped
> > > by the time .bdrv_detach_aio_context() is called.
> > >
> > > - Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the
> > > .bdrv_detach_aio_context() constraint violation. It's no longer
> > > needed due to the previous changes.
> > >
> > > - Always handle the VDUSE file descriptor, even in drained sections. The
> > > VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in
> > > drained sections. This ensures that the VDUSE kernel code gets a fast
> > > response.
> > >
> > > - Suspend virtqueue fd handlers in .drained_begin() and resume them in
> > > .drained_end(). This eliminates the need for the
> > > aio_set_fd_handler(is_external=true) flag, which is being removed from
> > > QEMU.
> > >
> > > This is a long list but splitting it into individual commits would
> > > probably lead to git bisect failures - the changes are all related.
> > >
> > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > ---
> > > block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------
> > > 1 file changed, 93 insertions(+), 39 deletions(-)
> > >
> > > diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
> > > index f7ae44e3ce..35dc8fcf45 100644
> > > --- a/block/export/vduse-blk.c
> > > +++ b/block/export/vduse-blk.c
> > > @@ -31,7 +31,8 @@ typedef struct VduseBlkExport {
> > > VduseDev *dev;
> > > uint16_t num_queues;
> > > char *recon_file;
> > > - unsigned int inflight;
> > > + unsigned int inflight; /* atomic */
> > > + bool vqs_started;
> > > } VduseBlkExport;
> > >
> > > typedef struct VduseBlkReq {
> > > @@ -41,13 +42,24 @@ typedef struct VduseBlkReq {
> > >
> > > static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
> > > {
> > > - vblk_exp->inflight++;
> > > + if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) {
> >
> > I wonder why we need to use atomic operations here.
>
> The inflight counter is only modified by the vhost-user export thread,
> but it may be read by another thread here:
>
I see. I mean is it enough to just use volatile keywords here, since
the writers would not access the variable concurrently.
> static bool vduse_blk_drained_poll(void *opaque)
> {
> BlockExport *exp = opaque;
> VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
>
> return qatomic_read(&vblk_exp->inflight) > 0;
>
> BlockDevOps->drained_poll() calls are invoked when BlockDriverStates are
> drained (e.g. blk_drain_all() and related APIs).
>
> > > @@ -355,13 +410,12 @@ static void vduse_blk_exp_delete(BlockExport *exp)
> > > g_free(vblk_exp->handler.serial);
> > > }
> > >
> > > +/* Called with exp->ctx acquired */
> > > static void vduse_blk_exp_request_shutdown(BlockExport *exp)
> > > {
> > > VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> > >
> > > - aio_context_acquire(vblk_exp->export.ctx);
> > > - vduse_blk_detach_ctx(vblk_exp);
> > > - aio_context_acquire(vblk_exp->export.ctx);
> > > + vduse_blk_stop_virtqueues(vblk_exp);
> >
> > Can we add a AIO_WAIT_WHILE() here? Then we don't need to
> > increase/decrease the BlockExport refcount during I/O processing.
>
> I don't think so because vduse_blk_exp_request_shutdown() is not the
> only place where we wait for requests to complete. There would still
> need to be away to wait for requests to finish (without calling
> AIO_WAIT_WHILE()) in vduse_blk_drained_poll().
>
But the BlockExport would not be freed until we call
vduse_blk_exp_request_shutdown(). If we can ensure that there will be
no inflight I/O after we call vduse_blk_exp_request_shutdown(), the
BlockExport can be freed safely without increasing/decreasing the
BlockExport refcount during I/O processing.
Thanks,
Yongji
© 2016 - 2026 Red Hat, Inc.