[PATCH v6 14/15] block/io_uring: use aio_add_sqe()

Stefan Hajnoczi posted 15 patches 3 months, 1 week ago
Maintainers: Kevin Wolf <kwolf@redhat.com>, Hanna Reitz <hreitz@redhat.com>, Aarushi Mehta <mehta.aaru20@gmail.com>, Julia Suvorova <jusual@redhat.com>, Stefan Hajnoczi <stefanha@redhat.com>, Stefano Garzarella <sgarzare@redhat.com>, Fam Zheng <fam@euphon.net>, Paolo Bonzini <pbonzini@redhat.com>, Stefan Weil <sw@weilnetz.de>
Posted by Stefan Hajnoczi 3 months, 1 week ago
AioContext has its own io_uring instance for file descriptor monitoring.
The disk I/O io_uring code was developed separately. Originally I
thought the characteristics of file descriptor monitoring and disk I/O
were too different, requiring separate io_uring instances.

Now it has become clear to me that it's feasible to share a single
io_uring instance for file descriptor monitoring and disk I/O. We're not
using io_uring's IOPOLL feature or anything else that would require a
separate instance.

Unify block/io_uring.c and util/fdmon-io_uring.c using the new
aio_add_sqe() API that allows user-defined io_uring sqe submission. Now
block/io_uring.c just needs to submit readv/writev/fsync and most of the
io_uring-specific logic is handled by fdmon-io_uring.c.
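
In outline, the new submission path looks like this (a condensed sketch
of the block/io_uring.c changes below, not the literal code):

    /* Called by the event loop when an sqe slot becomes available */
    static void luring_prep_sqe(struct io_uring_sqe *sqe, void *opaque)
    {
        LuringRequest *req = opaque;

        /* e.g. a read request */
        io_uring_prep_readv(sqe, req->fd, req->qiov->iov, req->qiov->niov,
                            req->offset);
    }

    /* In luring_co_submit(): */
    req.cqe_handler.cb = luring_cqe_handler; /* invoked on completion */
    aio_add_sqe(luring_prep_sqe, &req, &req.cqe_handler);

fdmon-io_uring.c owns the shared ring: it calls prep_sqe() to fill in
the sqe, batches submission until the end of the event loop iteration,
and invokes the CqeHandler once the cqe arrives.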

There are two immediate advantages:
1. Fewer system calls. There is no need to monitor the disk I/O io_uring
   ring fd from the file descriptor monitoring io_uring instance. Disk
   I/O completions are now picked up directly. Also, sqes are
   accumulated in the sq ring until the end of the event loop iteration
   and there are fewer io_uring_enter(2) syscalls.
2. Less code duplication.

Note that error_setg() messages are not supposed to end with
punctuation, so I removed a '.' for the non-io_uring build error
message.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 include/block/aio.h     |   7 -
 include/block/raw-aio.h |   5 -
 block/file-posix.c      |  40 ++--
 block/io_uring.c        | 489 ++++++++++------------------------------
 stubs/io_uring.c        |  32 ---
 util/async.c            |  35 ---
 block/trace-events      |  12 +-
 stubs/meson.build       |   3 -
 8 files changed, 130 insertions(+), 493 deletions(-)
 delete mode 100644 stubs/io_uring.c

diff --git a/include/block/aio.h b/include/block/aio.h
index 05d1bf4036..540bbc5d60 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -310,8 +310,6 @@ struct AioContext {
     struct LinuxAioState *linux_aio;
 #endif
 #ifdef CONFIG_LINUX_IO_URING
-    LuringState *linux_io_uring;
-
     /* State for file descriptor monitoring using Linux io_uring */
     struct io_uring fdmon_io_uring;
     AioHandlerSList submit_list;
@@ -615,11 +613,6 @@ struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
 /* Return the LinuxAioState bound to this AioContext */
 struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
 
-/* Setup the LuringState bound to this AioContext */
-LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp);
-
-/* Return the LuringState bound to this AioContext */
-LuringState *aio_get_linux_io_uring(AioContext *ctx);
 /**
  * aio_timer_new_with_attrs:
  * @ctx: the aio context
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index 6570244496..30e5fc9a9f 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -74,15 +74,10 @@ static inline bool laio_has_fua(void)
 #endif
 /* io_uring.c - Linux io_uring implementation */
 #ifdef CONFIG_LINUX_IO_URING
-LuringState *luring_init(Error **errp);
-void luring_cleanup(LuringState *s);
-
 /* luring_co_submit: submit I/O requests in the thread's current AioContext. */
 int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
                                   QEMUIOVector *qiov, int type,
                                   BdrvRequestFlags flags);
-void luring_detach_aio_context(LuringState *s, AioContext *old_context);
-void luring_attach_aio_context(LuringState *s, AioContext *new_context);
 bool luring_has_fua(void);
 #else
 static inline bool luring_has_fua(void)
diff --git a/block/file-posix.c b/block/file-posix.c
index 8c738674ce..8b7c02d19a 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -755,14 +755,23 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
     }
 #endif /* !defined(CONFIG_LINUX_AIO) */
 
-#ifndef CONFIG_LINUX_IO_URING
     if (s->use_linux_io_uring) {
+#ifdef CONFIG_LINUX_IO_URING
+        if (!aio_has_io_uring()) {
+            error_setg(errp, "aio=io_uring was specified, but is not "
+                             "available (disabled via io_uring_disabled "
+                             "sysctl or blocked by container runtime "
+                             "seccomp policy?)");
+            ret = -EINVAL;
+            goto fail;
+        }
+#else
         error_setg(errp, "aio=io_uring was specified, but is not supported "
-                         "in this build.");
+                         "in this build");
         ret = -EINVAL;
         goto fail;
-    }
 #endif /* !defined(CONFIG_LINUX_IO_URING) */
+    }
 
     s->has_discard = true;
     s->has_write_zeroes = true;
@@ -2522,27 +2531,6 @@ static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
     return true;
 }
 
-#ifdef CONFIG_LINUX_IO_URING
-static inline bool raw_check_linux_io_uring(BDRVRawState *s)
-{
-    Error *local_err = NULL;
-    AioContext *ctx;
-
-    if (!s->use_linux_io_uring) {
-        return false;
-    }
-
-    ctx = qemu_get_current_aio_context();
-    if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) {
-        error_reportf_err(local_err, "Unable to use linux io_uring, "
-                                     "falling back to thread pool: ");
-        s->use_linux_io_uring = false;
-        return false;
-    }
-    return true;
-}
-#endif
-
 #ifdef CONFIG_LINUX_AIO
 static inline bool raw_check_linux_aio(BDRVRawState *s)
 {
@@ -2595,7 +2583,7 @@ raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, uint64_t bytes,
     if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
         type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_IO_URING
-    } else if (raw_check_linux_io_uring(s)) {
+    } else if (s->use_linux_io_uring) {
         assert(qiov->size == bytes);
         ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags);
         goto out;
@@ -2692,7 +2680,7 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
     };
 
 #ifdef CONFIG_LINUX_IO_URING
-    if (raw_check_linux_io_uring(s)) {
+    if (s->use_linux_io_uring) {
         return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
     }
 #endif
diff --git a/block/io_uring.c b/block/io_uring.c
index dd4f304910..dd930ee57e 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -11,28 +11,20 @@
 #include "qemu/osdep.h"
 #include <liburing.h>
 #include "block/aio.h"
-#include "qemu/queue.h"
 #include "block/block.h"
 #include "block/raw-aio.h"
 #include "qemu/coroutine.h"
-#include "qemu/defer-call.h"
-#include "qapi/error.h"
 #include "system/block-backend.h"
 #include "trace.h"
 
-/* Only used for assertions.  */
-#include "qemu/coroutine_int.h"
-
-/* io_uring ring size */
-#define MAX_ENTRIES 128
-
-typedef struct LuringAIOCB {
+typedef struct {
     Coroutine *co;
-    struct io_uring_sqe sqeq;
-    ssize_t ret;
     QEMUIOVector *qiov;
-    bool is_read;
-    QSIMPLEQ_ENTRY(LuringAIOCB) next;
+    uint64_t offset;
+    ssize_t ret;
+    int type;
+    int fd;
+    BdrvRequestFlags flags;
 
     /*
      * Buffered reads may require resubmission, see
@@ -40,36 +32,51 @@ typedef struct LuringAIOCB {
      */
     int total_read;
     QEMUIOVector resubmit_qiov;
-} LuringAIOCB;
 
-typedef struct LuringQueue {
-    unsigned int in_queue;
-    unsigned int in_flight;
-    bool blocked;
-    QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue;
-} LuringQueue;
+    CqeHandler cqe_handler;
+} LuringRequest;
 
-struct LuringState {
-    AioContext *aio_context;
-
-    struct io_uring ring;
-
-    /* No locking required, only accessed from AioContext home thread */
-    LuringQueue io_q;
-
-    QEMUBH *completion_bh;
-};
-
-/**
- * luring_resubmit:
- *
- * Resubmit a request by appending it to submit_queue.  The caller must ensure
- * that ioq_submit() is called later so that submit_queue requests are started.
- */
-static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb)
+static void luring_prep_sqe(struct io_uring_sqe *sqe, void *opaque)
 {
-    QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
-    s->io_q.in_queue++;
+    LuringRequest *req = opaque;
+    QEMUIOVector *qiov = req->qiov;
+    uint64_t offset = req->offset;
+    int fd = req->fd;
+    BdrvRequestFlags flags = req->flags;
+
+    switch (req->type) {
+    case QEMU_AIO_WRITE:
+#ifdef HAVE_IO_URING_PREP_WRITEV2
+    {
+        int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
+        io_uring_prep_writev2(sqe, fd, qiov->iov,
+                              qiov->niov, offset, luring_flags);
+    }
+#else
+        assert(flags == 0);
+        io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset);
+#endif
+        break;
+    case QEMU_AIO_ZONE_APPEND:
+        io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset);
+        break;
+    case QEMU_AIO_READ:
+    {
+        if (req->resubmit_qiov.iov != NULL) {
+            qiov = &req->resubmit_qiov;
+        }
+        io_uring_prep_readv(sqe, fd, qiov->iov, qiov->niov,
+                            offset + req->total_read);
+        break;
+    }
+    case QEMU_AIO_FLUSH:
+        io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);
+        break;
+    default:
+        fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n",
+                        __func__, req->type);
+        abort();
+    }
 }
 
 /**
@@ -78,385 +85,115 @@ static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb)
  * Short reads are rare but may occur. The remaining read request needs to be
  * resubmitted.
  */
-static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb,
-                                       int nread)
+static void luring_resubmit_short_read(LuringRequest *req, int nread)
 {
     QEMUIOVector *resubmit_qiov;
     size_t remaining;
 
-    trace_luring_resubmit_short_read(s, luringcb, nread);
+    trace_luring_resubmit_short_read(req, nread);
 
     /* Update read position */
-    luringcb->total_read += nread;
-    remaining = luringcb->qiov->size - luringcb->total_read;
+    req->total_read += nread;
+    remaining = req->qiov->size - req->total_read;
 
     /* Shorten qiov */
-    resubmit_qiov = &luringcb->resubmit_qiov;
+    resubmit_qiov = &req->resubmit_qiov;
     if (resubmit_qiov->iov == NULL) {
-        qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov);
+        qemu_iovec_init(resubmit_qiov, req->qiov->niov);
     } else {
         qemu_iovec_reset(resubmit_qiov);
     }
-    qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read,
-                      remaining);
+    qemu_iovec_concat(resubmit_qiov, req->qiov, req->total_read, remaining);
 
-    /* Update sqe */
-    luringcb->sqeq.off += nread;
-    luringcb->sqeq.addr = (uintptr_t)luringcb->resubmit_qiov.iov;
-    luringcb->sqeq.len = luringcb->resubmit_qiov.niov;
-
-    luring_resubmit(s, luringcb);
+    aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler);
 }
 
-/**
- * luring_process_completions:
- * @s: AIO state
- *
- * Fetches completed I/O requests, consumes cqes and invokes their callbacks
- * The function is somewhat tricky because it supports nested event loops, for
- * example when a request callback invokes aio_poll().
- *
- * Function schedules BH completion so it  can be called again in a nested
- * event loop.  When there are no events left  to complete the BH is being
- * canceled.
- *
- */
-static void luring_process_completions(LuringState *s)
+static void luring_cqe_handler(CqeHandler *cqe_handler)
 {
-    struct io_uring_cqe *cqes;
-    int total_bytes;
+    LuringRequest *req = container_of(cqe_handler, LuringRequest, cqe_handler);
+    int ret = cqe_handler->cqe.res;
 
-    defer_call_begin();
+    trace_luring_cqe_handler(req, ret);
 
-    /*
-     * Request completion callbacks can run the nested event loop.
-     * Schedule ourselves so the nested event loop will "see" remaining
-     * completed requests and process them.  Without this, completion
-     * callbacks that wait for other requests using a nested event loop
-     * would hang forever.
-     *
-     * This workaround is needed because io_uring uses poll_wait, which
-     * is woken up when new events are added to the uring, thus polling on
-     * the same uring fd will block unless more events are received.
-     *
-     * Other leaf block drivers (drivers that access the data themselves)
-     * are networking based, so they poll sockets for data and run the
-     * correct coroutine.
-     */
-    qemu_bh_schedule(s->completion_bh);
-
-    while (io_uring_peek_cqe(&s->ring, &cqes) == 0) {
-        LuringAIOCB *luringcb;
-        int ret;
-
-        if (!cqes) {
-            break;
+    if (ret < 0) {
+        /*
+         * Only writev/readv/fsync requests on regular files or host block
+         * devices are submitted. Therefore -EAGAIN is not expected but it's
+         * known to happen sometimes with Linux SCSI. Submit again and hope
+         * the request completes successfully.
+         *
+         * For more information, see:
+         * https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u
+         *
+         * If the code is changed to submit other types of requests in the
+         * future, then this workaround may need to be extended to deal with
+         * genuine -EAGAIN results that should not be resubmitted
+         * immediately.
+         */
+        if (ret == -EINTR || ret == -EAGAIN) {
+            aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler);
+            return;
         }
-
-        luringcb = io_uring_cqe_get_data(cqes);
-        ret = cqes->res;
-        io_uring_cqe_seen(&s->ring, cqes);
-        cqes = NULL;
-
-        /* Change counters one-by-one because we can be nested. */
-        s->io_q.in_flight--;
-        trace_luring_process_completion(s, luringcb, ret);
-
+    } else if (req->qiov) {
         /* total_read is non-zero only for resubmitted read requests */
-        total_bytes = ret + luringcb->total_read;
+        int total_bytes = ret + req->total_read;
 
-        if (ret < 0) {
-            /*
-             * Only writev/readv/fsync requests on regular files or host block
-             * devices are submitted. Therefore -EAGAIN is not expected but it's
-             * known to happen sometimes with Linux SCSI. Submit again and hope
-             * the request completes successfully.
-             *
-             * For more information, see:
-             * https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u
-             *
-             * If the code is changed to submit other types of requests in the
-             * future, then this workaround may need to be extended to deal with
-             * genuine -EAGAIN results that should not be resubmitted
-             * immediately.
-             */
-            if (ret == -EINTR || ret == -EAGAIN) {
-                luring_resubmit(s, luringcb);
-                continue;
-            }
-        } else if (!luringcb->qiov) {
-            goto end;
-        } else if (total_bytes == luringcb->qiov->size) {
+        if (total_bytes == req->qiov->size) {
             ret = 0;
-        /* Only read/write */
         } else {
             /* Short Read/Write */
-            if (luringcb->is_read) {
+            if (req->type == QEMU_AIO_READ) {
                 if (ret > 0) {
-                    luring_resubmit_short_read(s, luringcb, ret);
-                    continue;
-                } else {
-                    /* Pad with zeroes */
-                    qemu_iovec_memset(luringcb->qiov, total_bytes, 0,
-                                      luringcb->qiov->size - total_bytes);
-                    ret = 0;
+                    luring_resubmit_short_read(req, ret);
+                    return;
                 }
+
+                /* Pad with zeroes */
+                qemu_iovec_memset(req->qiov, total_bytes, 0,
+                                  req->qiov->size - total_bytes);
+                ret = 0;
             } else {
                 ret = -ENOSPC;
             }
         }
-end:
-        luringcb->ret = ret;
-        qemu_iovec_destroy(&luringcb->resubmit_qiov);
-
-        /*
-         * If the coroutine is already entered it must be in ioq_submit()
-         * and will notice luringcb->ret has been filled in when it
-         * eventually runs later. Coroutines cannot be entered recursively
-         * so avoid doing that!
-         */
-        assert(luringcb->co->ctx == s->aio_context);
-        if (!qemu_coroutine_entered(luringcb->co)) {
-            aio_co_wake(luringcb->co);
-        }
     }
 
-    qemu_bh_cancel(s->completion_bh);
+    req->ret = ret;
+    qemu_iovec_destroy(&req->resubmit_qiov);
 
-    defer_call_end();
-}
-
-static int ioq_submit(LuringState *s)
-{
-    int ret = 0;
-    LuringAIOCB *luringcb, *luringcb_next;
-
-    while (s->io_q.in_queue > 0) {
-        /*
-         * Try to fetch sqes from the ring for requests waiting in
-         * the overflow queue
-         */
-        QSIMPLEQ_FOREACH_SAFE(luringcb, &s->io_q.submit_queue, next,
-                              luringcb_next) {
-            struct io_uring_sqe *sqes = io_uring_get_sqe(&s->ring);
-            if (!sqes) {
-                break;
-            }
-            /* Prep sqe for submission */
-            *sqes = luringcb->sqeq;
-            QSIMPLEQ_REMOVE_HEAD(&s->io_q.submit_queue, next);
-        }
-        ret = io_uring_submit(&s->ring);
-        trace_luring_io_uring_submit(s, ret);
-        /* Prevent infinite loop if submission is refused */
-        if (ret <= 0) {
-            if (ret == -EAGAIN || ret == -EINTR) {
-                continue;
-            }
-            break;
-        }
-        s->io_q.in_flight += ret;
-        s->io_q.in_queue  -= ret;
-    }
-    s->io_q.blocked = (s->io_q.in_queue > 0);
-
-    if (s->io_q.in_flight) {
-        /*
-         * We can try to complete something just right away if there are
-         * still requests in-flight.
-         */
-        luring_process_completions(s);
-    }
-    return ret;
-}
-
-static void luring_process_completions_and_submit(LuringState *s)
-{
-    luring_process_completions(s);
-
-    if (s->io_q.in_queue > 0) {
-        ioq_submit(s);
+    /*
+     * If the coroutine is already entered it must be in luring_co_submit() and
+     * will notice req->ret has been filled in when it eventually runs later.
+     * Coroutines cannot be entered recursively so avoid doing that!
+     */
+    if (!qemu_coroutine_entered(req->co)) {
+        aio_co_wake(req->co);
     }
 }
 
-static void qemu_luring_completion_bh(void *opaque)
+int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd,
+                                  uint64_t offset, QEMUIOVector *qiov,
+                                  int type, BdrvRequestFlags flags)
 {
-    LuringState *s = opaque;
-    luring_process_completions_and_submit(s);
-}
-
-static void qemu_luring_completion_cb(void *opaque)
-{
-    LuringState *s = opaque;
-    luring_process_completions_and_submit(s);
-}
-
-static bool qemu_luring_poll_cb(void *opaque)
-{
-    LuringState *s = opaque;
-
-    return io_uring_cq_ready(&s->ring);
-}
-
-static void qemu_luring_poll_ready(void *opaque)
-{
-    LuringState *s = opaque;
-
-    luring_process_completions_and_submit(s);
-}
-
-static void ioq_init(LuringQueue *io_q)
-{
-    QSIMPLEQ_INIT(&io_q->submit_queue);
-    io_q->in_queue = 0;
-    io_q->in_flight = 0;
-    io_q->blocked = false;
-}
-
-static void luring_deferred_fn(void *opaque)
-{
-    LuringState *s = opaque;
-    trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
-                           s->io_q.in_flight);
-    if (!s->io_q.blocked && s->io_q.in_queue > 0) {
-        ioq_submit(s);
-    }
-}
-
-/**
- * luring_do_submit:
- * @fd: file descriptor for I/O
- * @luringcb: AIO control block
- * @s: AIO state
- * @offset: offset for request
- * @type: type of request
- *
- * Fetches sqes from ring, adds to pending queue and preps them
- *
- */
-static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
-                            uint64_t offset, int type, BdrvRequestFlags flags)
-{
-    int ret;
-    struct io_uring_sqe *sqes = &luringcb->sqeq;
-
-    switch (type) {
-    case QEMU_AIO_WRITE:
-#ifdef HAVE_IO_URING_PREP_WRITEV2
-    {
-        int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
-        io_uring_prep_writev2(sqes, fd, luringcb->qiov->iov,
-                              luringcb->qiov->niov, offset, luring_flags);
-    }
-#else
-        assert(flags == 0);
-        io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
-                             luringcb->qiov->niov, offset);
-#endif
-        break;
-    case QEMU_AIO_ZONE_APPEND:
-        io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
-                             luringcb->qiov->niov, offset);
-        break;
-    case QEMU_AIO_READ:
-        io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
-                            luringcb->qiov->niov, offset);
-        break;
-    case QEMU_AIO_FLUSH:
-        io_uring_prep_fsync(sqes, fd, IORING_FSYNC_DATASYNC);
-        break;
-    default:
-        fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n",
-                        __func__, type);
-        abort();
-    }
-    io_uring_sqe_set_data(sqes, luringcb);
-
-    QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
-    s->io_q.in_queue++;
-    trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue,
-                           s->io_q.in_flight);
-    if (!s->io_q.blocked) {
-        if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
-            ret = ioq_submit(s);
-            trace_luring_do_submit_done(s, ret);
-            return ret;
-        }
-
-        defer_call(luring_deferred_fn, s);
-    }
-    return 0;
-}
-
-int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
-                                  QEMUIOVector *qiov, int type,
-                                  BdrvRequestFlags flags)
-{
-    int ret;
-    AioContext *ctx = qemu_get_current_aio_context();
-    LuringState *s = aio_get_linux_io_uring(ctx);
-    LuringAIOCB luringcb = {
+    LuringRequest req = {
         .co         = qemu_coroutine_self(),
-        .ret        = -EINPROGRESS,
         .qiov       = qiov,
-        .is_read    = (type == QEMU_AIO_READ),
+        .ret        = -EINPROGRESS,
+        .type       = type,
+        .fd         = fd,
+        .offset     = offset,
+        .flags      = flags,
     };
-    trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
-                           type);
-    ret = luring_do_submit(fd, &luringcb, s, offset, type, flags);
 
-    if (ret < 0) {
-        return ret;
-    }
+    req.cqe_handler.cb = luring_cqe_handler;
 
-    if (luringcb.ret == -EINPROGRESS) {
+    trace_luring_co_submit(bs, &req, fd, offset, qiov ? qiov->size : 0, type);
+    aio_add_sqe(luring_prep_sqe, &req, &req.cqe_handler);
+
+    if (req.ret == -EINPROGRESS) {
         qemu_coroutine_yield();
     }
-    return luringcb.ret;
-}
-
-void luring_detach_aio_context(LuringState *s, AioContext *old_context)
-{
-    aio_set_fd_handler(old_context, s->ring.ring_fd,
-                       NULL, NULL, NULL, NULL, s);
-    qemu_bh_delete(s->completion_bh);
-    s->aio_context = NULL;
-}
-
-void luring_attach_aio_context(LuringState *s, AioContext *new_context)
-{
-    s->aio_context = new_context;
-    s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s);
-    aio_set_fd_handler(s->aio_context, s->ring.ring_fd,
-                       qemu_luring_completion_cb, NULL,
-                       qemu_luring_poll_cb, qemu_luring_poll_ready, s);
-}
-
-LuringState *luring_init(Error **errp)
-{
-    int rc;
-    LuringState *s = g_new0(LuringState, 1);
-    struct io_uring *ring = &s->ring;
-
-    trace_luring_init_state(s, sizeof(*s));
-
-    rc = io_uring_queue_init(MAX_ENTRIES, ring, 0);
-    if (rc < 0) {
-        error_setg_errno(errp, -rc, "failed to init linux io_uring ring");
-        g_free(s);
-        return NULL;
-    }
-
-    ioq_init(&s->io_q);
-    return s;
-
-}
-
-void luring_cleanup(LuringState *s)
-{
-    io_uring_queue_exit(&s->ring);
-    trace_luring_cleanup_state(s);
-    g_free(s);
+    return req.ret;
 }
 
 bool luring_has_fua(void)
diff --git a/stubs/io_uring.c b/stubs/io_uring.c
deleted file mode 100644
index 622d1e4648..0000000000
--- a/stubs/io_uring.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Linux io_uring support.
- *
- * Copyright (C) 2009 IBM, Corp.
- * Copyright (C) 2009 Red Hat, Inc.
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- */
-#include "qemu/osdep.h"
-#include "block/aio.h"
-#include "block/raw-aio.h"
-
-void luring_detach_aio_context(LuringState *s, AioContext *old_context)
-{
-    abort();
-}
-
-void luring_attach_aio_context(LuringState *s, AioContext *new_context)
-{
-    abort();
-}
-
-LuringState *luring_init(Error **errp)
-{
-    abort();
-}
-
-void luring_cleanup(LuringState *s)
-{
-    abort();
-}
diff --git a/util/async.c b/util/async.c
index 00e46b99f9..a216cf8695 100644
--- a/util/async.c
+++ b/util/async.c
@@ -386,14 +386,6 @@ aio_ctx_finalize(GSource *source)
     }
 #endif
 
-#ifdef CONFIG_LINUX_IO_URING
-    if (ctx->linux_io_uring) {
-        luring_detach_aio_context(ctx->linux_io_uring, ctx);
-        luring_cleanup(ctx->linux_io_uring);
-        ctx->linux_io_uring = NULL;
-    }
-#endif
-
     assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
     qemu_bh_delete(ctx->co_schedule_bh);
 
@@ -468,29 +460,6 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx)
 }
 #endif
 
-#ifdef CONFIG_LINUX_IO_URING
-LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp)
-{
-    if (ctx->linux_io_uring) {
-        return ctx->linux_io_uring;
-    }
-
-    ctx->linux_io_uring = luring_init(errp);
-    if (!ctx->linux_io_uring) {
-        return NULL;
-    }
-
-    luring_attach_aio_context(ctx->linux_io_uring, ctx);
-    return ctx->linux_io_uring;
-}
-
-LuringState *aio_get_linux_io_uring(AioContext *ctx)
-{
-    assert(ctx->linux_io_uring);
-    return ctx->linux_io_uring;
-}
-#endif
-
 void aio_notify(AioContext *ctx)
 {
     /*
@@ -630,10 +599,6 @@ AioContext *aio_context_new(Error **errp)
     ctx->linux_aio = NULL;
 #endif
 
-#ifdef CONFIG_LINUX_IO_URING
-    ctx->linux_io_uring = NULL;
-#endif
-
     ctx->thread_pool = NULL;
     qemu_rec_mutex_init(&ctx->lock);
     timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
diff --git a/block/trace-events b/block/trace-events
index 8e789e1f12..c9b4736ff8 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -62,15 +62,9 @@ qmp_block_stream(void *bs) "bs %p"
 file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d"
 
 # io_uring.c
-luring_init_state(void *s, size_t size) "s %p size %zu"
-luring_cleanup_state(void *s) "%p freed"
-luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
-luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
-luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d"
-luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d"
-luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
-luring_io_uring_submit(void *s, int ret) "LuringState %p ret %d"
-luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p luringcb %p nread %d"
+luring_cqe_handler(void *req, int ret) "req %p ret %d"
+luring_co_submit(void *bs, void *req, int fd, uint64_t offset, size_t nbytes, int type) "bs %p req %p fd %d offset %" PRId64 " nbytes %zd type %d"
+luring_resubmit_short_read(void *req, int nread) "req %p nread %d"
 
 # qcow2.c
 qcow2_add_task(void *co, void *bs, void *pool, const char *action, int cluster_type, uint64_t host_offset, uint64_t offset, uint64_t bytes, void *qiov, size_t qiov_offset) "co %p bs %p pool %p: %s: cluster_type %d file_cluster_offset %" PRIu64 " offset %" PRIu64 " bytes %" PRIu64 " qiov %p qiov_offset %zu"
diff --git a/stubs/meson.build b/stubs/meson.build
index 27be2dec9f..0b2778c568 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -32,9 +32,6 @@ if have_block or have_ga
   stub_ss.add(files('cpus-virtual-clock.c'))
   stub_ss.add(files('icount.c'))
   stub_ss.add(files('graph-lock.c'))
-  if linux_io_uring.found()
-    stub_ss.add(files('io_uring.c'))
-  endif
   if libaio.found()
     stub_ss.add(files('linux-aio.c'))
   endif
-- 
2.51.1
Re: [PATCH v6 14/15] block/io_uring: use aio_add_sqe()
Posted by Fiona Ebner 3 weeks, 3 days ago
Hi,

On 04.11.25 at 5:11 AM, Stefan Hajnoczi wrote:
> AioContext has its own io_uring instance for file descriptor monitoring.
> The disk I/O io_uring code was developed separately. Originally I
> thought the characteristics of file descriptor monitoring and disk I/O
> were too different, requiring separate io_uring instances.
> 
> Now it has become clear to me that it's feasible to share a single
> io_uring instance for file descriptor monitoring and disk I/O. We're not
> using io_uring's IOPOLL feature or anything else that would require a
> separate instance.
> 
> Unify block/io_uring.c and util/fdmon-io_uring.c using the new
> aio_add_sqe() API that allows user-defined io_uring sqe submission. Now
> block/io_uring.c just needs to submit readv/writev/fsync and most of the
> io_uring-specific logic is handled by fdmon-io_uring.c.
> 
> There are two immediate advantages:
> 1. Fewer system calls. There is no need to monitor the disk I/O io_uring
>    ring fd from the file descriptor monitoring io_uring instance. Disk
>    I/O completions are now picked up directly. Also, sqes are
>    accumulated in the sq ring until the end of the event loop iteration
>    and there are fewer io_uring_enter(2) syscalls.
> 2. Less code duplication.
> 
> Note that error_setg() messages are not supposed to end with
> punctuation, so I removed a '.' for the non-io_uring build error
> message.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> Reviewed-by: Eric Blake <eblake@redhat.com>

after commit 047dabef97 ("block/io_uring: use aio_add_sqe()") using
'aio=io_uring' for an 'ide-hd' drive becomes prohibitively slow: for me,
a Debian 12 guest [0] won't even reach the GRUB menu after multiple
minutes. Using aio=threads makes the issue go away, as does building
from the previous commit, i.e. 1eebdab3c3 ("aio-posix: add aio_add_sqe()
API for user-defined io_uring requests"). The issue still exists on
current master.

There are lots and lots of 512-byte read requests. MAX_BUFFERED_REQS in
ide_buffered_readv() is only 16, maybe that has something to do with it?
I haven't had time to dig into the issue yet, and wasn't able to
reproduce with 'qemu-img bench' or anything other than 'ide-hd'.
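
For reference, the kind of invocation I tried was along these lines
(exact flags from memory, so take it as a sketch; queue depth 1,
512-byte sequential reads):

    qemu-img bench -f raw -t none -i io_uring -d 1 -s 512 -c 100000 \
        /mnt/pve/dir/images/500/vm-500-disk-0.raw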

Best Regards,
Fiona

[0]:
> ./qemu-system-x86_64 \
>   -accel kvm \
>   -chardev 'socket,id=qmp,path=/var/run/qemu-server/500.qmp,server=on,wait=off' \
>   -mon 'chardev=qmp,mode=control' \
>   -pidfile /var/run/qemu-server/500.pid \
>   -nodefaults \
>   -vnc 'unix:/var/run/qemu-server/500.vnc,password=on' \
>   -m 4096 \
>   -device 'VGA,id=vga,bus=pci.0,addr=0x2' \
>   -drive 'file=/mnt/pve/dir/images/500/vm-500-disk-0.raw,if=none,id=drive-ide0,format=raw,aio=io_uring' \
>   -device 'ide-hd,bus=ide.0,unit=0,drive=drive-ide0,id=ide0'
Re: [PATCH v6 14/15] block/io_uring: use aio_add_sqe()
Posted by Fiona Ebner 2 weeks, 4 days ago
On 16.01.26 at 5:44 PM, Fiona Ebner wrote:
> Hi,
> 
> On 04.11.25 at 5:11 AM, Stefan Hajnoczi wrote:
>> AioContext has its own io_uring instance for file descriptor monitoring.
>> The disk I/O io_uring code was developed separately. Originally I
>> thought the characteristics of file descriptor monitoring and disk I/O
>> were too different, requiring separate io_uring instances.
>>
>> Now it has become clear to me that it's feasible to share a single
>> io_uring instance for file descriptor monitoring and disk I/O. We're not
>> using io_uring's IOPOLL feature or anything else that would require a
>> separate instance.
>>
>> Unify block/io_uring.c and util/fdmon-io_uring.c using the new
>> aio_add_sqe() API that allows user-defined io_uring sqe submission. Now
>> block/io_uring.c just needs to submit readv/writev/fsync and most of the
>> io_uring-specific logic is handled by fdmon-io_uring.c.
>>
>> There are two immediate advantages:
>> 1. Fewer system calls. There is no need to monitor the disk I/O io_uring
>>    ring fd from the file descriptor monitoring io_uring instance. Disk
>>    I/O completions are now picked up directly. Also, sqes are
>>    accumulated in the sq ring until the end of the event loop iteration
>>    and there are fewer io_uring_enter(2) syscalls.
>> 2. Less code duplication.
>>
>> Note that error_setg() messages are not supposed to end with
>> punctuation, so I removed a '.' for the non-io_uring build error
>> message.
>>
>> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
>> Reviewed-by: Eric Blake <eblake@redhat.com>
> 
> after commit 047dabef97 ("block/io_uring: use aio_add_sqe()") using
> 'aio=io_uring' for an 'ide-hd' drive becomes prohibitively slow: for me,
> a Debian 12 guest [0] won't even reach the GRUB menu after multiple
> minutes. Using aio=threads makes the issue go away, as does building
> from the previous commit, i.e. 1eebdab3c3 ("aio-posix: add aio_add_sqe()
> API for user-defined io_uring requests"). The issue still exists on
> current master.
> 
> There are lots and lots of 512-byte read requests. MAX_BUFFERED_REQS in
> ide_buffered_readv() is only 16, maybe that has something to do with it?
> I haven't had time to dig into the issue yet, and wasn't able to
> reproduce with 'qemu-img bench' or anything other than 'ide-hd'.

I did some more experimentation and tracing and found the likely issue.
The old implementation had this in its ioq_submit():

>     if (s->io_q.in_flight) {
>         /*
>          * We can try to complete something just right away if there are
>          * still requests in-flight.
>          */
>         luring_process_completions(s);
>     }

and thus at the end of luring_co_submit()

>     if (luringcb.ret == -EINPROGRESS) {
>         qemu_coroutine_yield();
>     }
>     return luringcb.ret;

all those small one-sector requests have already completed by that point
and the coroutine won't yield.

But for the new implementation, that doesn't happen and the coroutine
yields for every single request in my testing.

The following patch restores the speed and the coroutine doesn't yield
for all those one-sector requests anymore:

> diff --git a/util/aio-posix.c b/util/aio-posix.c
> index e24b955fd9..acaed8cec3 100644
> --- a/util/aio-posix.c
> +++ b/util/aio-posix.c
> @@ -813,5 +813,6 @@ void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
>  {
>      AioContext *ctx = qemu_get_current_aio_context();
>      ctx->fdmon_ops->add_sqe(ctx, prep_sqe, opaque, cqe_handler);
> +    aio_poll(ctx, false);
>  }
>  #endif /* CONFIG_LINUX_IO_URING */

Does the approach make sense? Might it be better to do this on the call
sites instead? Or add a flag to aio_add_sqe() to control whether it is done?
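
For illustration, the call-site variant would be something like this at
the end of luring_co_submit() (an untested sketch):

    aio_add_sqe(luring_prep_sqe, &req, &req.cqe_handler);

    /*
     * Reap completions that may already be available so that fast
     * requests finish without yielding, mirroring the old ioq_submit()
     * behavior.
     */
    aio_poll(qemu_get_current_aio_context(), false);

    if (req.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }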

> 
> Best Regards,
> Fiona
> 
> [0]:
>> ./qemu-system-x86_64 \
>>   -accel kvm \
>>   -chardev 'socket,id=qmp,path=/var/run/qemu-server/500.qmp,server=on,wait=off' \
>>   -mon 'chardev=qmp,mode=control' \
>>   -pidfile /var/run/qemu-server/500.pid \
>>   -nodefaults \
>>   -vnc 'unix:/var/run/qemu-server/500.vnc,password=on' \
>>   -m 4096 \
>>   -device 'VGA,id=vga,bus=pci.0,addr=0x2' \
>>   -drive 'file=/mnt/pve/dir/images/500/vm-500-disk-0.raw,if=none,id=drive-ide0,format=raw,aio=io_uring' \
>>   -device 'ide-hd,bus=ide.0,unit=0,drive=drive-ide0,id=ide0'