[v1] block: split qemu/aio.h out of block/aio.h

[PATCH] block: split qemu/aio.h out of block/aio.h

Posted by Paolo Bonzini 2 months, 1 week ago

Rust bindings are roughly broken up according to subdirectories of
include/ (that's not exact, but it's roughly an aim).  However,
block/aio.h contains both block layer-specific concepts (BlockAIOCB,
BlockCompletionFunc) and AioContext-related declarations that are
used by qemu/main-loop.h.

Break out the latter into their own header file, and use that to
break the inclusion of block/ from qemu/main-loop.h.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
	Based on top of
	https://lore.kernel.org/qemu-devel/20251127131516.80807-3-pbonzini@redhat.com/

 include/block/aio.h      | 838 +-------------------------------------
 include/qemu/aio.h       | 852 +++++++++++++++++++++++++++++++++++++++
 include/qemu/main-loop.h |   4 +-
 3 files changed, 857 insertions(+), 837 deletions(-)
 create mode 100644 include/qemu/aio.h

diff --git a/include/block/aio.h b/include/block/aio.h
index cc3d5f25a24..dba423f896e 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -11,22 +11,13 @@
  *
  */
 
-#ifndef QEMU_AIO_H
-#define QEMU_AIO_H
+#ifndef QEMU_BLOCK_AIO_H
+#define QEMU_BLOCK_AIO_H
 
-#ifdef CONFIG_LINUX_IO_URING
-#include <liburing.h>
-#endif
-#include "qemu/coroutine-core.h"
-#include "qemu/queue.h"
-#include "qemu/event_notifier.h"
-#include "qemu/lockcnt.h"
-#include "qemu/thread.h"
-#include "qemu/timer.h"
+#include "qemu/aio.h"
 #include "block/graph-lock.h"
 #include "hw/core/qdev.h"
 
-
 typedef struct BlockAIOCB BlockAIOCB;
 typedef void BlockCompletionFunc(void *opaque, int ret);
 
@@ -48,827 +39,4 @@ void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
 void qemu_aio_unref(void *p);
 void qemu_aio_ref(void *p);
 
-typedef struct AioHandler AioHandler;
-typedef QLIST_HEAD(, AioHandler) AioHandlerList;
-typedef void QEMUBHFunc(void *opaque);
-typedef bool AioPollFn(void *opaque);
-typedef void IOHandler(void *opaque);
-
-struct ThreadPoolAio;
-struct LinuxAioState;
-typedef struct LuringState LuringState;
-
-/* Is polling disabled? */
-bool aio_poll_disabled(AioContext *ctx);
-
-#ifdef CONFIG_LINUX_IO_URING
-/*
- * Each io_uring request must have a unique CqeHandler that processes the cqe.
- * The lifetime of a CqeHandler must be at least from aio_add_sqe() until
- * ->cb() invocation.
- */
-typedef struct CqeHandler CqeHandler;
-struct CqeHandler {
-    /* Called by the AioContext when the request has completed */
-    void (*cb)(CqeHandler *handler);
-
-    /* Used internally, do not access this */
-    QSIMPLEQ_ENTRY(CqeHandler) next;
-
-    /* This field is filled in before ->cb() is called */
-    struct io_uring_cqe cqe;
-};
-
-typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ;
-#endif /* CONFIG_LINUX_IO_URING */
-
-/* Callbacks for file descriptor monitoring implementations */
-typedef struct {
-    /*
-     * update:
-     * @ctx: the AioContext
-     * @old_node: the existing handler or NULL if this file descriptor is being
-     *            monitored for the first time
-     * @new_node: the new handler or NULL if this file descriptor is being
-     *            removed
-     *
-     * Add/remove/modify a monitored file descriptor.
-     *
-     * Called with ctx->list_lock acquired.
-     */
-    void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
-
-    /*
-     * wait:
-     * @ctx: the AioContext
-     * @ready_list: list for handlers that become ready
-     * @timeout: maximum duration to wait, in nanoseconds
-     *
-     * Wait for file descriptors to become ready and place them on ready_list.
-     *
-     * Called with ctx->list_lock incremented but not locked.
-     *
-     * Returns: number of ready file descriptors.
-     */
-    int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
-
-    /*
-     * need_wait:
-     * @ctx: the AioContext
-     *
-     * Tell aio_poll() when to stop userspace polling early because ->wait()
-     * has fds ready.
-     *
-     * File descriptor monitoring implementations that cannot poll fd readiness
-     * from userspace should use aio_poll_disabled() here.  This ensures that
-     * file descriptors are not starved by handlers that frequently make
-     * progress via userspace polling.
-     *
-     * Returns: true if ->wait() should be called, false otherwise.
-     */
-    bool (*need_wait)(AioContext *ctx);
-
-    /*
-     * dispatch:
-     * @ctx: the AioContext
-     *
-     * Dispatch any work that is specific to this file descriptor monitoring
-     * implementation. Usually the event loop's generic file descriptor
-     * monitoring, BH, and timer dispatching code is sufficient, but file
-     * descriptor monitoring implementations offering additional functionality
-     * may need to implement this function for custom behavior. Called at a
-     * point in the event loop when it is safe to invoke user-defined
-     * callbacks.
-     *
-     * This function is optional and may be NULL.
-     *
-     * Returns: true if progress was made (see aio_poll()'s return value),
-     * false otherwise.
-     */
-    bool (*dispatch)(AioContext *ctx);
-
-    /*
-     * gsource_prepare:
-     * @ctx: the AioContext
-     *
-     * Prepare for the glib event loop to wait for events instead of the usual
-     * ->wait() call. See glib's GSourceFuncs->prepare().
-     */
-    void (*gsource_prepare)(AioContext *ctx);
-
-    /*
-     * gsource_check:
-     * @ctx: the AioContext
-     *
-     * Called by the glib event loop from glib's GSourceFuncs->check() after
-     * waiting for events.
-     *
-     * Returns: true when ready to be dispatched.
-     */
-    bool (*gsource_check)(AioContext *ctx);
-
-    /*
-     * gsource_dispatch:
-     * @ctx: the AioContext
-     * @ready_list: list for handlers that become ready
-     *
-     * Place ready AioHandlers on ready_list. Called as part of the glib event
-     * loop from glib's GSourceFuncs->dispatch().
-     *
-     * Called with list_lock incremented.
-     */
-    void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);
-
-#ifdef CONFIG_LINUX_IO_URING
-    /**
-     * add_sqe: Add an io_uring sqe for submission.
-     * @prep_sqe: invoked with an sqe that should be prepared for submission
-     * @opaque: user-defined argument to @prep_sqe()
-     * @cqe_handler: the unique cqe handler associated with this request
-     *
-     * The caller's @prep_sqe() function is invoked to fill in the details of
-     * the sqe. Do not call io_uring_sqe_set_data() on this sqe.
-     *
-     * The kernel may see the sqe as soon as @prep_sqe() returns or it may take
-     * until the next event loop iteration.
-     *
-     * This function is called from the current AioContext and is not
-     * thread-safe.
-     */
-    void (*add_sqe)(AioContext *ctx,
-                    void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
-                    void *opaque, CqeHandler *cqe_handler);
-#endif /* CONFIG_LINUX_IO_URING */
-} FDMonOps;
-
-/*
- * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
- * scheduled BHs are not processed until the next aio_bh_poll() call.  All
- * active aio_bh_poll() calls chain their slices together in a list, so that
- * nested aio_bh_poll() calls process all scheduled bottom halves.
- */
-typedef QSLIST_HEAD(, QEMUBH) BHList;
-typedef struct BHListSlice BHListSlice;
-struct BHListSlice {
-    BHList bh_list;
-    QSIMPLEQ_ENTRY(BHListSlice) next;
-};
-
-typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
-
-typedef struct AioPolledEvent {
-    int64_t ns;        /* current polling time in nanoseconds */
-} AioPolledEvent;
-
-struct AioContext {
-    GSource source;
-
-    /* Used by AioContext users to protect from multi-threaded access.  */
-    QemuRecMutex lock;
-
-    /*
-     * Keep track of readers and writers of the block layer graph.
-     * This is essential to avoid performing additions and removal
-     * of nodes and edges from block graph while some
-     * other thread is traversing it.
-     */
-    BdrvGraphRWlock *bdrv_graph;
-
-    /* The list of registered AIO handlers.  Protected by ctx->list_lock. */
-    AioHandlerList aio_handlers;
-
-    /* The list of AIO handlers to be deleted.  Protected by ctx->list_lock. */
-    AioHandlerList deleted_aio_handlers;
-
-    /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
-     * only written from the AioContext home thread, or under the BQL in
-     * the case of the main AioContext.  However, it is read from any
-     * thread so it is still accessed with atomic primitives.
-     *
-     * If this field is 0, everything (file descriptors, bottom halves,
-     * timers) will be re-evaluated before the next blocking poll() or
-     * io_uring wait; therefore, the event_notifier_set call can be
-     * skipped.  If it is non-zero, you may need to wake up a concurrent
-     * aio_poll or the glib main event loop, making event_notifier_set
-     * necessary.
-     *
-     * Bit 0 is reserved for GSource usage of the AioContext, and is 1
-     * between a call to aio_ctx_prepare and the next call to aio_ctx_check.
-     * Bits 1-31 simply count the number of active calls to aio_poll
-     * that are in the prepare or poll phase.
-     *
-     * The GSource and aio_poll must use a different mechanism because
-     * there is no certainty that a call to GSource's prepare callback
-     * (via g_main_context_prepare) is indeed followed by check and
-     * dispatch.  It's not clear whether this would be a bug, but let's
-     * play safe and allow it---it will just cause extra calls to
-     * event_notifier_set until the next call to dispatch.
-     *
-     * Instead, the aio_poll calls include both the prepare and the
-     * dispatch phase, hence a simple counter is enough for them.
-     */
-    uint32_t notify_me;
-
-    /* A lock to protect between QEMUBH and AioHandler adders and deleter,
-     * and to ensure that no callbacks are removed while we're walking and
-     * dispatching them.
-     */
-    QemuLockCnt list_lock;
-
-    /* Bottom Halves pending aio_bh_poll() processing */
-    BHList bh_list;
-
-    /* Chained BH list slices for each nested aio_bh_poll() call */
-    QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list;
-
-    /* Used by aio_notify.
-     *
-     * "notified" is used to avoid expensive event_notifier_test_and_clear
-     * calls.  When it is clear, the EventNotifier is clear, or one thread
-     * is going to clear "notified" before processing more events.  False
-     * positives are possible, i.e. "notified" could be set even though the
-     * EventNotifier is clear.
-     *
-     * Note that event_notifier_set *cannot* be optimized the same way.  For
-     * more information on the problem that would result, see "#ifdef BUG2"
-     * in the docs/aio_notify_accept.promela formal model.
-     */
-    bool notified;
-    EventNotifier notifier;
-
-    QSLIST_HEAD(, Coroutine) scheduled_coroutines;
-    QEMUBH *co_schedule_bh;
-
-    int thread_pool_min;
-    int thread_pool_max;
-    /* Thread pool for performing work and receiving completion callbacks.
-     * Has its own locking.
-     */
-    struct ThreadPoolAio *thread_pool;
-
-#ifdef CONFIG_LINUX_AIO
-    struct LinuxAioState *linux_aio;
-#endif
-#ifdef CONFIG_LINUX_IO_URING
-    /* State for file descriptor monitoring using Linux io_uring */
-    struct io_uring fdmon_io_uring;
-    AioHandlerSList submit_list;
-    void *io_uring_fd_tag;
-
-    /* Pending callback state for cqe handlers */
-    CqeHandlerSimpleQ cqe_handler_ready_list;
-#endif /* CONFIG_LINUX_IO_URING */
-
-    /* TimerLists for calling timers - one per clock type.  Has its own
-     * locking.
-     */
-    QEMUTimerListGroup tlg;
-
-    /* Number of AioHandlers without .io_poll() */
-    int poll_disable_cnt;
-
-    /* Polling mode parameters */
-    int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
-    int64_t poll_grow;      /* polling time growth factor */
-    int64_t poll_shrink;    /* polling time shrink factor */
-
-    /* AIO engine parameters */
-    int64_t aio_max_batch;  /* maximum number of requests in a batch */
-
-    /*
-     * List of handlers participating in userspace polling.  Protected by
-     * ctx->list_lock.  Iterated and modified mostly by the event loop thread
-     * from aio_poll() with ctx->list_lock incremented.  aio_set_fd_handler()
-     * only touches the list to delete nodes if ctx->list_lock's count is zero.
-     */
-    AioHandlerList poll_aio_handlers;
-
-    /* Are we in polling mode or monitoring file descriptors? */
-    bool poll_started;
-
-    /* epoll(7) state used when built with CONFIG_EPOLL */
-    int epollfd;
-
-    /* The GSource unix fd tag for epollfd */
-    void *epollfd_tag;
-
-    const FDMonOps *fdmon_ops;
-
-    /* Was aio_context_new() successful? */
-    bool initialized;
-};
-
-/**
- * aio_context_new: Allocate a new AioContext.
- *
- * AioContext provide a mini event-loop that can be waited on synchronously.
- * They also provide bottom halves, a service to execute a piece of code
- * as soon as possible.
- */
-AioContext *aio_context_new(Error **errp);
-
-/**
- * aio_context_ref:
- * @ctx: The AioContext to operate on.
- *
- * Add a reference to an AioContext.
- */
-void aio_context_ref(AioContext *ctx);
-
-/**
- * aio_context_unref:
- * @ctx: The AioContext to operate on.
- *
- * Drop a reference to an AioContext.
- */
-void aio_context_unref(AioContext *ctx);
-
-/**
- * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
- * run only once and as soon as possible.
- *
- * @name: A human-readable identifier for debugging purposes.
- */
-void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
-                                  const char *name);
-
-/**
- * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run
- * only once and as soon as possible.
- *
- * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the
- * name string.
- */
-#define aio_bh_schedule_oneshot(ctx, cb, opaque) \
-    aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb)))
-
-/**
- * aio_bh_new_full: Allocate a new bottom half structure.
- *
- * Bottom halves are lightweight callbacks whose invocation is guaranteed
- * to be wait-free, thread-safe and signal-safe.  The #QEMUBH structure
- * is opaque and must be allocated prior to its use.
- *
- * @name: A human-readable identifier for debugging purposes.
- * @reentrancy_guard: A guard set when entering a cb to prevent
- * device-reentrancy issues
- */
-QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
-                        const char *name, MemReentrancyGuard *reentrancy_guard);
-
-/**
- * aio_bh_new: Allocate a new bottom half structure
- *
- * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
- * string.
- */
-#define aio_bh_new(ctx, cb, opaque) \
-    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL)
-
-/**
- * aio_bh_new_guarded: Allocate a new bottom half structure with a
- * reentrancy_guard
- *
- * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
- * string.
- */
-#define aio_bh_new_guarded(ctx, cb, opaque, guard) \
-    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard)
-
-/**
- * aio_notify: Force processing of pending events.
- *
- * Similar to signaling a condition variable, aio_notify forces
- * aio_poll to exit, so that the next call will re-examine pending events.
- * The caller of aio_notify will usually call aio_poll again very soon,
- * or go through another iteration of the GLib main loop.  Hence, aio_notify
- * also has the side effect of recalculating the sets of file descriptors
- * that the main loop waits for.
- *
- * Calling aio_notify is rarely necessary, because for example scheduling
- * a bottom half calls it already.
- */
-void aio_notify(AioContext *ctx);
-
-/**
- * aio_notify_accept: Acknowledge receiving an aio_notify.
- *
- * aio_notify() uses an EventNotifier in order to wake up a sleeping
- * aio_poll() or g_main_context_iteration().  Calls to aio_notify() are
- * usually rare, but the AioContext has to clear the EventNotifier on
- * every aio_poll() or g_main_context_iteration() in order to avoid
- * busy waiting.  This event_notifier_test_and_clear() cannot be done
- * using the usual aio_context_set_event_notifier(), because it must
- * be done before processing all events (file descriptors, bottom halves,
- * timers).
- *
- * aio_notify_accept() is an optimized event_notifier_test_and_clear()
- * that is specific to an AioContext's notifier; it is used internally
- * to clear the EventNotifier only if aio_notify() had been called.
- */
-void aio_notify_accept(AioContext *ctx);
-
-/**
- * aio_bh_call: Executes callback function of the specified BH.
- */
-void aio_bh_call(QEMUBH *bh);
-
-/**
- * aio_bh_poll: Poll bottom halves for an AioContext.
- *
- * These are internal functions used by the QEMU main loop.
- * And notice that multiple occurrences of aio_bh_poll cannot
- * be called concurrently
- */
-int aio_bh_poll(AioContext *ctx);
-
-/**
- * qemu_bh_schedule: Schedule a bottom half.
- *
- * Scheduling a bottom half interrupts the main loop and causes the
- * execution of the callback that was passed to qemu_bh_new.
- *
- * Bottom halves that are scheduled from a bottom half handler are instantly
- * invoked.  This can create an infinite loop if a bottom half handler
- * schedules itself.
- *
- * @bh: The bottom half to be scheduled.
- */
-void qemu_bh_schedule(QEMUBH *bh);
-
-/**
- * qemu_bh_cancel: Cancel execution of a bottom half.
- *
- * Canceling execution of a bottom half undoes the effect of calls to
- * qemu_bh_schedule without freeing its resources yet.  While cancellation
- * itself is also wait-free and thread-safe, it can of course race with the
- * loop that executes bottom halves unless you are holding the iothread
- * mutex.  This makes it mostly useless if you are not holding the mutex.
- *
- * @bh: The bottom half to be canceled.
- */
-void qemu_bh_cancel(QEMUBH *bh);
-
-/**
- *qemu_bh_delete: Cancel execution of a bottom half and free its resources.
- *
- * Deleting a bottom half frees the memory that was allocated for it by
- * qemu_bh_new.  It also implies canceling the bottom half if it was
- * scheduled.
- * This func is async. The bottom half will do the delete action at the finial
- * end.
- *
- * @bh: The bottom half to be deleted.
- */
-void qemu_bh_delete(QEMUBH *bh);
-
-/* Return whether there are any pending callbacks from the GSource
- * attached to the AioContext, before g_poll is invoked.
- *
- * This is used internally in the implementation of the GSource.
- */
-bool aio_prepare(AioContext *ctx);
-
-/* Return whether there are any pending callbacks from the GSource
- * attached to the AioContext, after g_poll is invoked.
- *
- * This is used internally in the implementation of the GSource.
- */
-bool aio_pending(AioContext *ctx);
-
-/* Dispatch any pending callbacks from the GSource attached to the AioContext.
- *
- * This is used internally in the implementation of the GSource.
- */
-void aio_dispatch(AioContext *ctx);
-
-/* Progress in completing AIO work to occur.  This can issue new pending
- * aio as a result of executing I/O completion or bh callbacks.
- *
- * Return whether any progress was made by executing AIO or bottom half
- * handlers.  If @blocking == true, this should always be true except
- * if someone called aio_notify.
- *
- * If there are no pending bottom halves, but there are pending AIO
- * operations, it may not be possible to make any progress without
- * blocking.  If @blocking is true, this function will wait until one
- * or more AIO events have completed, to ensure something has moved
- * before returning.
- */
-bool no_coroutine_fn aio_poll(AioContext *ctx, bool blocking);
-
-/* Register a file descriptor and associated callbacks.  Behaves very similarly
- * to qemu_set_fd_handler.  Unlike qemu_set_fd_handler, these callbacks will
- * be invoked when using aio_poll().
- *
- * Code that invokes AIO completion functions should rely on this function
- * instead of qemu_set_fd_handler[2].
- */
-void aio_set_fd_handler(AioContext *ctx,
-                        int fd,
-                        IOHandler *io_read,
-                        IOHandler *io_write,
-                        AioPollFn *io_poll,
-                        IOHandler *io_poll_ready,
-                        void *opaque);
-
-/* Register an event notifier and associated callbacks.  Behaves very similarly
- * to event_notifier_set_handler.  Unlike event_notifier_set_handler, these callbacks
- * will be invoked when using aio_poll().
- *
- * Code that invokes AIO completion functions should rely on this function
- * instead of event_notifier_set_handler.
- */
-void aio_set_event_notifier(AioContext *ctx,
-                            EventNotifier *notifier,
-                            EventNotifierHandler *io_read,
-                            AioPollFn *io_poll,
-                            EventNotifierHandler *io_poll_ready);
-
-/*
- * Set polling begin/end callbacks for an event notifier that has already been
- * registered with aio_set_event_notifier.  Do nothing if the event notifier is
- * not registered.
- *
- * Note that if the io_poll_end() callback (or the entire notifier) is removed
- * during polling, it will not be called, so an io_poll_begin() is not
- * necessarily always followed by an io_poll_end().
- */
-void aio_set_event_notifier_poll(AioContext *ctx,
-                                 EventNotifier *notifier,
-                                 EventNotifierHandler *io_poll_begin,
-                                 EventNotifierHandler *io_poll_end);
-
-/* Return a GSource that lets the main loop poll the file descriptors attached
- * to this AioContext.
- */
-GSource *aio_get_g_source(AioContext *ctx);
-
-/* Return the ThreadPoolAio bound to this AioContext */
-struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx);
-
-/* Setup the LinuxAioState bound to this AioContext */
-struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
-
-/* Return the LinuxAioState bound to this AioContext */
-struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
-
-/**
- * aio_timer_new_with_attrs:
- * @ctx: the aio context
- * @type: the clock type
- * @scale: the scale
- * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
- *              to assign
- * @cb: the callback to call on timer expiry
- * @opaque: the opaque pointer to pass to the callback
- *
- * Allocate a new timer (with attributes) attached to the context @ctx.
- * The function is responsible for memory allocation.
- *
- * The preferred interface is aio_timer_init or aio_timer_init_with_attrs.
- * Use that unless you really need dynamic memory allocation.
- *
- * Returns: a pointer to the new timer
- */
-static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx,
-                                                  QEMUClockType type,
-                                                  int scale, int attributes,
-                                                  QEMUTimerCB *cb, void *opaque)
-{
-    return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque);
-}
-
-/**
- * aio_timer_new:
- * @ctx: the aio context
- * @type: the clock type
- * @scale: the scale
- * @cb: the callback to call on timer expiry
- * @opaque: the opaque pointer to pass to the callback
- *
- * Allocate a new timer attached to the context @ctx.
- * See aio_timer_new_with_attrs for details.
- *
- * Returns: a pointer to the new timer
- */
-static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type,
-                                       int scale,
-                                       QEMUTimerCB *cb, void *opaque)
-{
-    return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque);
-}
-
-/**
- * aio_timer_init_with_attrs:
- * @ctx: the aio context
- * @ts: the timer
- * @type: the clock type
- * @scale: the scale
- * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
- *              to assign
- * @cb: the callback to call on timer expiry
- * @opaque: the opaque pointer to pass to the callback
- *
- * Initialise a new timer (with attributes) attached to the context @ctx.
- * The caller is responsible for memory allocation.
- */
-static inline void aio_timer_init_with_attrs(AioContext *ctx,
-                                             QEMUTimer *ts, QEMUClockType type,
-                                             int scale, int attributes,
-                                             QEMUTimerCB *cb, void *opaque)
-{
-    timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque);
-}
-
-/**
- * aio_timer_init:
- * @ctx: the aio context
- * @ts: the timer
- * @type: the clock type
- * @scale: the scale
- * @cb: the callback to call on timer expiry
- * @opaque: the opaque pointer to pass to the callback
- *
- * Initialise a new timer attached to the context @ctx.
- * See aio_timer_init_with_attrs for details.
- */
-static inline void aio_timer_init(AioContext *ctx,
-                                  QEMUTimer *ts, QEMUClockType type,
-                                  int scale,
-                                  QEMUTimerCB *cb, void *opaque)
-{
-    timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque);
-}
-
-/**
- * aio_compute_timeout:
- * @ctx: the aio context
- *
- * Compute the timeout that a blocking aio_poll should use.
- */
-int64_t aio_compute_timeout(AioContext *ctx);
-
-/**
- * aio_co_schedule:
- * @ctx: the aio context
- * @co: the coroutine
- *
- * Start a coroutine on a remote AioContext.
- *
- * The coroutine must not be entered by anyone else while aio_co_schedule()
- * is active.  In addition the coroutine must have yielded unless ctx
- * is the context in which the coroutine is running (i.e. the value of
- * qemu_get_current_aio_context() from the coroutine itself).
- */
-void aio_co_schedule(AioContext *ctx, Coroutine *co);
-
-/**
- * aio_co_reschedule_self:
- * @new_ctx: the new context
- *
- * Move the currently running coroutine to new_ctx. If the coroutine is already
- * running in new_ctx, do nothing.
- *
- * Note that this function cannot reschedule from iohandler_ctx to
- * qemu_aio_context.
- */
-void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
-
-/**
- * aio_co_wake:
- * @co: the coroutine
- *
- * Restart a coroutine on the AioContext where it was running last, thus
- * preventing coroutines from jumping from one context to another when they
- * go to sleep.
- *
- * aio_co_wake may be executed either in coroutine or non-coroutine
- * context.  The coroutine must not be entered by anyone else while
- * aio_co_wake() is active.
- *
- * If `co`'s AioContext differs from the current AioContext, this will call
- * aio_co_schedule(), which makes this safe to use even when `co` has not
- * yielded yet.  In such a case, it will be entered once it yields.
- *
- * In contrast, if `co`'s AioContext is equal to the current one, it is
- * required for `co` to currently be yielding.  This is generally the case
- * if the caller is not in `co` (i.e. invoked by `co`), because the only
- * other way for the caller to be running then is for `co` to currently be
- * yielding.
- *
- * Therefore, if there is no way for the caller to be invoked/entered by
- * `co`, it is generally safe to call this regardless of whether `co` is
- * known to already be yielding or not -- it only has to yield at some
- * point.
- */
-void aio_co_wake(Coroutine *co);
-
-/**
- * aio_co_enter:
- * @ctx: the context to run the coroutine
- * @co: the coroutine to run
- *
- * Enter a coroutine in the specified AioContext.
- */
-void aio_co_enter(AioContext *ctx, Coroutine *co);
-
-/**
- * Return the AioContext whose event loop runs in the current thread.
- *
- * If called from an IOThread this will be the IOThread's AioContext.  If
- * called from the main thread or with the "big QEMU lock" taken it
- * will be the main loop AioContext.
- *
- * Note that the return value is never the main loop's iohandler_ctx and the
- * return value is the main loop AioContext instead.
- */
-AioContext *qemu_get_current_aio_context(void);
-
-void qemu_set_current_aio_context(AioContext *ctx);
-
-/**
- * aio_context_setup:
- * @ctx: the aio context
- * @errp: error pointer
- *
- * Initialize the aio context.
- *
- * Returns: true on success, false otherwise
- */
-bool aio_context_setup(AioContext *ctx, Error **errp);
-
-/**
- * aio_context_destroy:
- * @ctx: the aio context
- *
- * Destroy the aio context.
- */
-void aio_context_destroy(AioContext *ctx);
-
-/**
- * aio_context_set_poll_params:
- * @ctx: the aio context
- * @max_ns: how long to busy poll for, in nanoseconds
- * @grow: polling time growth factor
- * @shrink: polling time shrink factor
- *
- * Poll mode can be disabled by setting poll_max_ns to 0.
- */
-void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
-                                 int64_t grow, int64_t shrink,
-                                 Error **errp);
-
-/**
- * aio_context_set_aio_params:
- * @ctx: the aio context
- * @max_batch: maximum number of requests in a batch, 0 means that the
- *             engine will use its default
- */
-void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch);
-
-/**
- * aio_context_set_thread_pool_params:
- * @ctx: the aio context
- * @min: min number of threads to have readily available in the thread pool
- * @min: max number of threads the thread pool can contain
- */
-void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
-                                        int64_t max, Error **errp);
-
-#ifdef CONFIG_LINUX_IO_URING
-/**
- * aio_has_io_uring: Return whether io_uring is available.
- *
- * io_uring is either available in all AioContexts or in none, so this only
- * needs to be called once from within any thread's AioContext.
- */
-static inline bool aio_has_io_uring(void)
-{
-    AioContext *ctx = qemu_get_current_aio_context();
-    return ctx->fdmon_ops->add_sqe;
-}
-
-/**
- * aio_add_sqe: Add an io_uring sqe for submission.
- * @prep_sqe: invoked with an sqe that should be prepared for submission
- * @opaque: user-defined argument to @prep_sqe()
- * @cqe_handler: the unique cqe handler associated with this request
- *
- * The caller's @prep_sqe() function is invoked to fill in the details of the
- * sqe. Do not call io_uring_sqe_set_data() on this sqe.
- *
- * The sqe is submitted by the current AioContext. The kernel may see the sqe
- * as soon as @prep_sqe() returns or it may take until the next event loop
- * iteration.
- *
- * When the AioContext is destroyed, pending sqes are ignored and their
- * CqeHandlers are not invoked.
- *
- * This function must be called only when aio_has_io_uring() returns true.
- */
-void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
-                 void *opaque, CqeHandler *cqe_handler);
-#endif /* CONFIG_LINUX_IO_URING */
-
 #endif
diff --git a/include/qemu/aio.h b/include/qemu/aio.h
new file mode 100644
index 00000000000..8cca2360d1a
--- /dev/null
+++ b/include/qemu/aio.h
@@ -0,0 +1,852 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_AIO_H
+#define QEMU_AIO_H
+
+#ifdef CONFIG_LINUX_IO_URING
+#include <liburing.h>
+#endif
+#include "qemu/coroutine-core.h"
+#include "qemu/queue.h"
+#include "qemu/event_notifier.h"
+#include "qemu/lockcnt.h"
+#include "qemu/thread.h"
+#include "qemu/timer.h"
+
+struct MemReentrancyGuard;
+
+typedef struct AioHandler AioHandler;
+typedef QLIST_HEAD(, AioHandler) AioHandlerList;
+typedef void QEMUBHFunc(void *opaque);
+typedef bool AioPollFn(void *opaque);
+typedef void IOHandler(void *opaque);
+
+struct ThreadPoolAio;
+struct LinuxAioState;
+typedef struct LuringState LuringState;
+
+/* Is polling disabled? */
+bool aio_poll_disabled(AioContext *ctx);
+
+#ifdef CONFIG_LINUX_IO_URING
+/*
+ * Each io_uring request must have a unique CqeHandler that processes the cqe.
+ * The lifetime of a CqeHandler must be at least from aio_add_sqe() until
+ * ->cb() invocation.
+ */
+typedef struct CqeHandler CqeHandler;
+struct CqeHandler {
+    /* Called by the AioContext when the request has completed */
+    void (*cb)(CqeHandler *handler);
+
+    /* Used internally, do not access this */
+    QSIMPLEQ_ENTRY(CqeHandler) next;
+
+    /* This field is filled in before ->cb() is called */
+    struct io_uring_cqe cqe;
+};
+
+typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ;
+#endif /* CONFIG_LINUX_IO_URING */
+
+/* Callbacks for file descriptor monitoring implementations */
+typedef struct {
+    /*
+     * update:
+     * @ctx: the AioContext
+     * @old_node: the existing handler or NULL if this file descriptor is being
+     *            monitored for the first time
+     * @new_node: the new handler or NULL if this file descriptor is being
+     *            removed
+     *
+     * Add/remove/modify a monitored file descriptor.
+     *
+     * Called with ctx->list_lock acquired.
+     */
+    void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
+
+    /*
+     * wait:
+     * @ctx: the AioContext
+     * @ready_list: list for handlers that become ready
+     * @timeout: maximum duration to wait, in nanoseconds
+     *
+     * Wait for file descriptors to become ready and place them on ready_list.
+     *
+     * Called with ctx->list_lock incremented but not locked.
+     *
+     * Returns: number of ready file descriptors.
+     */
+    int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
+
+    /*
+     * need_wait:
+     * @ctx: the AioContext
+     *
+     * Tell aio_poll() when to stop userspace polling early because ->wait()
+     * has fds ready.
+     *
+     * File descriptor monitoring implementations that cannot poll fd readiness
+     * from userspace should use aio_poll_disabled() here.  This ensures that
+     * file descriptors are not starved by handlers that frequently make
+     * progress via userspace polling.
+     *
+     * Returns: true if ->wait() should be called, false otherwise.
+     */
+    bool (*need_wait)(AioContext *ctx);
+
+    /*
+     * dispatch:
+     * @ctx: the AioContext
+     *
+     * Dispatch any work that is specific to this file descriptor monitoring
+     * implementation. Usually the event loop's generic file descriptor
+     * monitoring, BH, and timer dispatching code is sufficient, but file
+     * descriptor monitoring implementations offering additional functionality
+     * may need to implement this function for custom behavior. Called at a
+     * point in the event loop when it is safe to invoke user-defined
+     * callbacks.
+     *
+     * This function is optional and may be NULL.
+     *
+     * Returns: true if progress was made (see aio_poll()'s return value),
+     * false otherwise.
+     */
+    bool (*dispatch)(AioContext *ctx);
+
+    /*
+     * gsource_prepare:
+     * @ctx: the AioContext
+     *
+     * Prepare for the glib event loop to wait for events instead of the usual
+     * ->wait() call. See glib's GSourceFuncs->prepare().
+     */
+    void (*gsource_prepare)(AioContext *ctx);
+
+    /*
+     * gsource_check:
+     * @ctx: the AioContext
+     *
+     * Called by the glib event loop from glib's GSourceFuncs->check() after
+     * waiting for events.
+     *
+     * Returns: true when ready to be dispatched.
+     */
+    bool (*gsource_check)(AioContext *ctx);
+
+    /*
+     * gsource_dispatch:
+     * @ctx: the AioContext
+     * @ready_list: list for handlers that become ready
+     *
+     * Place ready AioHandlers on ready_list. Called as part of the glib event
+     * loop from glib's GSourceFuncs->dispatch().
+     *
+     * Called with list_lock incremented.
+     */
+    void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);
+
+#ifdef CONFIG_LINUX_IO_URING
+    /**
+     * add_sqe: Add an io_uring sqe for submission.
+     * @prep_sqe: invoked with an sqe that should be prepared for submission
+     * @opaque: user-defined argument to @prep_sqe()
+     * @cqe_handler: the unique cqe handler associated with this request
+     *
+     * The caller's @prep_sqe() function is invoked to fill in the details of
+     * the sqe. Do not call io_uring_sqe_set_data() on this sqe.
+     *
+     * The kernel may see the sqe as soon as @prep_sqe() returns or it may take
+     * until the next event loop iteration.
+     *
+     * This function is called from the current AioContext and is not
+     * thread-safe.
+     */
+    void (*add_sqe)(AioContext *ctx,
+                    void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
+                    void *opaque, CqeHandler *cqe_handler);
+#endif /* CONFIG_LINUX_IO_URING */
+} FDMonOps;
+
+/*
+ * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
+ * scheduled BHs are not processed until the next aio_bh_poll() call.  All
+ * active aio_bh_poll() calls chain their slices together in a list, so that
+ * nested aio_bh_poll() calls process all scheduled bottom halves.
+ */
+typedef QSLIST_HEAD(, QEMUBH) BHList;
+typedef struct BHListSlice BHListSlice;
+struct BHListSlice {
+    BHList bh_list;
+    QSIMPLEQ_ENTRY(BHListSlice) next;
+};
+
+typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
+
+typedef struct AioPolledEvent {
+    int64_t ns;        /* current polling time in nanoseconds */
+} AioPolledEvent;
+
+struct AioContext {
+    GSource source;
+
+    /* Used by AioContext users to protect from multi-threaded access.  */
+    QemuRecMutex lock;
+
+    /*
+     * Keep track of readers and writers of the block layer graph.
+     * This is essential to avoid performing additions and removal
+     * of nodes and edges from block graph while some
+     * other thread is traversing it.
+     */
+    struct BdrvGraphRWlock *bdrv_graph;
+
+    /* The list of registered AIO handlers.  Protected by ctx->list_lock. */
+    AioHandlerList aio_handlers;
+
+    /* The list of AIO handlers to be deleted.  Protected by ctx->list_lock. */
+    AioHandlerList deleted_aio_handlers;
+
+    /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
+     * only written from the AioContext home thread, or under the BQL in
+     * the case of the main AioContext.  However, it is read from any
+     * thread so it is still accessed with atomic primitives.
+     *
+     * If this field is 0, everything (file descriptors, bottom halves,
+     * timers) will be re-evaluated before the next blocking poll() or
+     * io_uring wait; therefore, the event_notifier_set call can be
+     * skipped.  If it is non-zero, you may need to wake up a concurrent
+     * aio_poll or the glib main event loop, making event_notifier_set
+     * necessary.
+     *
+     * Bit 0 is reserved for GSource usage of the AioContext, and is 1
+     * between a call to aio_ctx_prepare and the next call to aio_ctx_check.
+     * Bits 1-31 simply count the number of active calls to aio_poll
+     * that are in the prepare or poll phase.
+     *
+     * The GSource and aio_poll must use a different mechanism because
+     * there is no certainty that a call to GSource's prepare callback
+     * (via g_main_context_prepare) is indeed followed by check and
+     * dispatch.  It's not clear whether this would be a bug, but let's
+     * play safe and allow it---it will just cause extra calls to
+     * event_notifier_set until the next call to dispatch.
+     *
+     * Instead, the aio_poll calls include both the prepare and the
+     * dispatch phase, hence a simple counter is enough for them.
+     */
+    uint32_t notify_me;
+
+    /* A lock to protect between QEMUBH and AioHandler adders and deleters,
+     * and to ensure that no callbacks are removed while we're walking and
+     * dispatching them.
+     */
+    QemuLockCnt list_lock;
+
+    /* Bottom Halves pending aio_bh_poll() processing */
+    BHList bh_list;
+
+    /* Chained BH list slices for each nested aio_bh_poll() call */
+    QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list;
+
+    /* Used by aio_notify.
+     *
+     * "notified" is used to avoid expensive event_notifier_test_and_clear
+     * calls.  When it is clear, the EventNotifier is clear, or one thread
+     * is going to clear "notified" before processing more events.  False
+     * positives are possible, i.e. "notified" could be set even though the
+     * EventNotifier is clear.
+     *
+     * Note that event_notifier_set *cannot* be optimized the same way.  For
+     * more information on the problem that would result, see "#ifdef BUG2"
+     * in the docs/aio_notify_accept.promela formal model.
+     */
+    bool notified;
+    EventNotifier notifier;
+
+    QSLIST_HEAD(, Coroutine) scheduled_coroutines;
+    QEMUBH *co_schedule_bh;
+
+    int thread_pool_min;
+    int thread_pool_max;
+    /* Thread pool for performing work and receiving completion callbacks.
+     * Has its own locking.
+     */
+    struct ThreadPoolAio *thread_pool;
+
+#ifdef CONFIG_LINUX_AIO
+    struct LinuxAioState *linux_aio;
+#endif
+#ifdef CONFIG_LINUX_IO_URING
+    /* State for file descriptor monitoring using Linux io_uring */
+    struct io_uring fdmon_io_uring;
+    AioHandlerSList submit_list;
+    void *io_uring_fd_tag;
+
+    /* Pending callback state for cqe handlers */
+    CqeHandlerSimpleQ cqe_handler_ready_list;
+#endif /* CONFIG_LINUX_IO_URING */
+
+    /* TimerLists for calling timers - one per clock type.  Has its own
+     * locking.
+     */
+    QEMUTimerListGroup tlg;
+
+    /* Number of AioHandlers without .io_poll() */
+    int poll_disable_cnt;
+
+    /* Polling mode parameters */
+    int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
+    int64_t poll_grow;      /* polling time growth factor */
+    int64_t poll_shrink;    /* polling time shrink factor */
+
+    /* AIO engine parameters */
+    int64_t aio_max_batch;  /* maximum number of requests in a batch */
+
+    /*
+     * List of handlers participating in userspace polling.  Protected by
+     * ctx->list_lock.  Iterated and modified mostly by the event loop thread
+     * from aio_poll() with ctx->list_lock incremented.  aio_set_fd_handler()
+     * only touches the list to delete nodes if ctx->list_lock's count is zero.
+     */
+    AioHandlerList poll_aio_handlers;
+
+    /* Are we in polling mode or monitoring file descriptors? */
+    bool poll_started;
+
+    /* epoll(7) state used when built with CONFIG_EPOLL */
+    int epollfd;
+
+    /* The GSource unix fd tag for epollfd */
+    void *epollfd_tag;
+
+    const FDMonOps *fdmon_ops;
+
+    /* Was aio_context_new() successful? */
+    bool initialized;
+};
+
+/**
+ * aio_context_new: Allocate a new AioContext.
+ *
+ * AioContexts provide a mini event loop that can be waited on synchronously.
+ * They also provide bottom halves, a service to execute a piece of code
+ * as soon as possible.
+ */
+AioContext *aio_context_new(Error **errp);
+
+/**
+ * aio_context_ref:
+ * @ctx: The AioContext to operate on.
+ *
+ * Add a reference to an AioContext.
+ */
+void aio_context_ref(AioContext *ctx);
+
+/**
+ * aio_context_unref:
+ * @ctx: The AioContext to operate on.
+ *
+ * Drop a reference to an AioContext.
+ */
+void aio_context_unref(AioContext *ctx);
+
+/**
+ * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
+ * run only once and as soon as possible.
+ *
+ * @name: A human-readable identifier for debugging purposes.
+ */
+void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
+                                  const char *name);
+
+/**
+ * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run
+ * only once and as soon as possible.
+ *
+ * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the
+ * name string.
+ */
+#define aio_bh_schedule_oneshot(ctx, cb, opaque) \
+    aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb)))
+
+/**
+ * aio_bh_new_full: Allocate a new bottom half structure.
+ *
+ * Bottom halves are lightweight callbacks whose invocation is guaranteed
+ * to be wait-free, thread-safe and signal-safe.  The #QEMUBH structure
+ * is opaque and must be allocated prior to its use.
+ *
+ * @name: A human-readable identifier for debugging purposes.
+ * @reentrancy_guard: A guard set when entering a cb to prevent
+ * device-reentrancy issues
+ */
+QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
+                        const char *name, struct MemReentrancyGuard *reentrancy_guard);
+
+/**
+ * aio_bh_new: Allocate a new bottom half structure
+ *
+ * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
+ * string.
+ */
+#define aio_bh_new(ctx, cb, opaque) \
+    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL)
+
+/**
+ * aio_bh_new_guarded: Allocate a new bottom half structure with a
+ * reentrancy_guard
+ *
+ * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
+ * string.
+ */
+#define aio_bh_new_guarded(ctx, cb, opaque, guard) \
+    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard)
+
+/**
+ * aio_notify: Force processing of pending events.
+ *
+ * Similar to signaling a condition variable, aio_notify forces
+ * aio_poll to exit, so that the next call will re-examine pending events.
+ * The caller of aio_notify will usually call aio_poll again very soon,
+ * or go through another iteration of the GLib main loop.  Hence, aio_notify
+ * also has the side effect of recalculating the sets of file descriptors
+ * that the main loop waits for.
+ *
+ * Calling aio_notify is rarely necessary, because for example scheduling
+ * a bottom half calls it already.
+ */
+void aio_notify(AioContext *ctx);
+
+/**
+ * aio_notify_accept: Acknowledge receiving an aio_notify.
+ *
+ * aio_notify() uses an EventNotifier in order to wake up a sleeping
+ * aio_poll() or g_main_context_iteration().  Calls to aio_notify() are
+ * usually rare, but the AioContext has to clear the EventNotifier on
+ * every aio_poll() or g_main_context_iteration() in order to avoid
+ * busy waiting.  This event_notifier_test_and_clear() cannot be done
+ * using the usual aio_context_set_event_notifier(), because it must
+ * be done before processing all events (file descriptors, bottom halves,
+ * timers).
+ *
+ * aio_notify_accept() is an optimized event_notifier_test_and_clear()
+ * that is specific to an AioContext's notifier; it is used internally
+ * to clear the EventNotifier only if aio_notify() had been called.
+ */
+void aio_notify_accept(AioContext *ctx);
+
+/**
+ * aio_bh_call: Executes callback function of the specified BH.
+ */
+void aio_bh_call(QEMUBH *bh);
+
+/**
+ * aio_bh_poll: Poll bottom halves for an AioContext.
+ *
+ * These are internal functions used by the QEMU main loop.
+ * Note that multiple invocations of aio_bh_poll cannot be
+ * called concurrently.
+ */
+int aio_bh_poll(AioContext *ctx);
+
+/**
+ * qemu_bh_schedule: Schedule a bottom half.
+ *
+ * Scheduling a bottom half interrupts the main loop and causes the
+ * execution of the callback that was passed to qemu_bh_new.
+ *
+ * Bottom halves that are scheduled from a bottom half handler are instantly
+ * invoked.  This can create an infinite loop if a bottom half handler
+ * schedules itself.
+ *
+ * @bh: The bottom half to be scheduled.
+ */
+void qemu_bh_schedule(QEMUBH *bh);
+
+/**
+ * qemu_bh_cancel: Cancel execution of a bottom half.
+ *
+ * Canceling execution of a bottom half undoes the effect of calls to
+ * qemu_bh_schedule without freeing its resources yet.  While cancellation
+ * itself is also wait-free and thread-safe, it can of course race with the
+ * loop that executes bottom halves unless you are holding the iothread
+ * mutex.  This makes it mostly useless if you are not holding the mutex.
+ *
+ * @bh: The bottom half to be canceled.
+ */
+void qemu_bh_cancel(QEMUBH *bh);
+
+/**
+ * qemu_bh_delete: Cancel execution of a bottom half and free its resources.
+ *
+ * Deleting a bottom half frees the memory that was allocated for it by
+ * qemu_bh_new.  It also implies canceling the bottom half if it was
+ * scheduled.
+ * This function is asynchronous.  The bottom half performs the delete
+ * action at the very end.
+ *
+ * @bh: The bottom half to be deleted.
+ */
+void qemu_bh_delete(QEMUBH *bh);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, before g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_prepare(AioContext *ctx);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, after g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_pending(AioContext *ctx);
+
+/* Dispatch any pending callbacks from the GSource attached to the AioContext.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+void aio_dispatch(AioContext *ctx);
+
+/* Allow progress in completing AIO work to occur.  This can issue new pending
+ * aio as a result of executing I/O completion or bh callbacks.
+ *
+ * Return whether any progress was made by executing AIO or bottom half
+ * handlers.  If @blocking == true, this should always be true except
+ * if someone called aio_notify.
+ *
+ * If there are no pending bottom halves, but there are pending AIO
+ * operations, it may not be possible to make any progress without
+ * blocking.  If @blocking is true, this function will wait until one
+ * or more AIO events have completed, to ensure something has moved
+ * before returning.
+ */
+bool no_coroutine_fn aio_poll(AioContext *ctx, bool blocking);
+
+/* Register a file descriptor and associated callbacks.  Behaves very similarly
+ * to qemu_set_fd_handler.  Unlike qemu_set_fd_handler, these callbacks will
+ * be invoked when using aio_poll().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of qemu_set_fd_handler[2].
+ */
+void aio_set_fd_handler(AioContext *ctx,
+                        int fd,
+                        IOHandler *io_read,
+                        IOHandler *io_write,
+                        AioPollFn *io_poll,
+                        IOHandler *io_poll_ready,
+                        void *opaque);
+
+/* Register an event notifier and associated callbacks.  Behaves very similarly
+ * to event_notifier_set_handler.  Unlike event_notifier_set_handler, these callbacks
+ * will be invoked when using aio_poll().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of event_notifier_set_handler.
+ */
+void aio_set_event_notifier(AioContext *ctx,
+                            EventNotifier *notifier,
+                            EventNotifierHandler *io_read,
+                            AioPollFn *io_poll,
+                            EventNotifierHandler *io_poll_ready);
+
+/*
+ * Set polling begin/end callbacks for an event notifier that has already been
+ * registered with aio_set_event_notifier.  Do nothing if the event notifier is
+ * not registered.
+ *
+ * Note that if the io_poll_end() callback (or the entire notifier) is removed
+ * during polling, it will not be called, so an io_poll_begin() is not
+ * necessarily always followed by an io_poll_end().
+ */
+void aio_set_event_notifier_poll(AioContext *ctx,
+                                 EventNotifier *notifier,
+                                 EventNotifierHandler *io_poll_begin,
+                                 EventNotifierHandler *io_poll_end);
+
+/* Return a GSource that lets the main loop poll the file descriptors attached
+ * to this AioContext.
+ */
+GSource *aio_get_g_source(AioContext *ctx);
+
+/* Return the ThreadPoolAio bound to this AioContext */
+struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx);
+
+/* Setup the LinuxAioState bound to this AioContext */
+struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
+
+/* Return the LinuxAioState bound to this AioContext */
+struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
+
+/**
+ * aio_timer_new_with_attrs:
+ * @ctx: the aio context
+ * @type: the clock type
+ * @scale: the scale
+ * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
+ *              to assign
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Allocate a new timer (with attributes) attached to the context @ctx.
+ * The function is responsible for memory allocation.
+ *
+ * The preferred interface is aio_timer_init or aio_timer_init_with_attrs.
+ * Use that unless you really need dynamic memory allocation.
+ *
+ * Returns: a pointer to the new timer
+ */
+static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx,
+                                                  QEMUClockType type,
+                                                  int scale, int attributes,
+                                                  QEMUTimerCB *cb, void *opaque)
+{
+    return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque);
+}
+
+/**
+ * aio_timer_new:
+ * @ctx: the aio context
+ * @type: the clock type
+ * @scale: the scale
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Allocate a new timer attached to the context @ctx.
+ * See aio_timer_new_with_attrs for details.
+ *
+ * Returns: a pointer to the new timer
+ */
+static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type,
+                                       int scale,
+                                       QEMUTimerCB *cb, void *opaque)
+{
+    return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque);
+}
+
+/**
+ * aio_timer_init_with_attrs:
+ * @ctx: the aio context
+ * @ts: the timer
+ * @type: the clock type
+ * @scale: the scale
+ * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
+ *              to assign
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Initialise a new timer (with attributes) attached to the context @ctx.
+ * The caller is responsible for memory allocation.
+ */
+static inline void aio_timer_init_with_attrs(AioContext *ctx,
+                                             QEMUTimer *ts, QEMUClockType type,
+                                             int scale, int attributes,
+                                             QEMUTimerCB *cb, void *opaque)
+{
+    timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque);
+}
+
+/**
+ * aio_timer_init:
+ * @ctx: the aio context
+ * @ts: the timer
+ * @type: the clock type
+ * @scale: the scale
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Initialise a new timer attached to the context @ctx.
+ * See aio_timer_init_with_attrs for details.
+ */
+static inline void aio_timer_init(AioContext *ctx,
+                                  QEMUTimer *ts, QEMUClockType type,
+                                  int scale,
+                                  QEMUTimerCB *cb, void *opaque)
+{
+    timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque);
+}
+
+/**
+ * aio_compute_timeout:
+ * @ctx: the aio context
+ *
+ * Compute the timeout that a blocking aio_poll should use.
+ */
+int64_t aio_compute_timeout(AioContext *ctx);
+
+/**
+ * aio_co_schedule:
+ * @ctx: the aio context
+ * @co: the coroutine
+ *
+ * Start a coroutine on a remote AioContext.
+ *
+ * The coroutine must not be entered by anyone else while aio_co_schedule()
+ * is active.  In addition the coroutine must have yielded unless ctx
+ * is the context in which the coroutine is running (i.e. the value of
+ * qemu_get_current_aio_context() from the coroutine itself).
+ */
+void aio_co_schedule(AioContext *ctx, Coroutine *co);
+
+/**
+ * aio_co_reschedule_self:
+ * @new_ctx: the new context
+ *
+ * Move the currently running coroutine to new_ctx. If the coroutine is already
+ * running in new_ctx, do nothing.
+ *
+ * Note that this function cannot reschedule from iohandler_ctx to
+ * qemu_aio_context.
+ */
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
+
+/**
+ * aio_co_wake:
+ * @co: the coroutine
+ *
+ * Restart a coroutine on the AioContext where it was running last, thus
+ * preventing coroutines from jumping from one context to another when they
+ * go to sleep.
+ *
+ * aio_co_wake may be executed either in coroutine or non-coroutine
+ * context.  The coroutine must not be entered by anyone else while
+ * aio_co_wake() is active.
+ *
+ * If `co`'s AioContext differs from the current AioContext, this will call
+ * aio_co_schedule(), which makes this safe to use even when `co` has not
+ * yielded yet.  In such a case, it will be entered once it yields.
+ *
+ * In contrast, if `co`'s AioContext is equal to the current one, it is
+ * required for `co` to currently be yielding.  This is generally the case
+ * if the caller is not in `co` (i.e. invoked by `co`), because the only
+ * other way for the caller to be running then is for `co` to currently be
+ * yielding.
+ *
+ * Therefore, if there is no way for the caller to be invoked/entered by
+ * `co`, it is generally safe to call this regardless of whether `co` is
+ * known to already be yielding or not -- it only has to yield at some
+ * point.
+ */
+void aio_co_wake(Coroutine *co);
+
+/**
+ * aio_co_enter:
+ * @ctx: the context to run the coroutine
+ * @co: the coroutine to run
+ *
+ * Enter a coroutine in the specified AioContext.
+ */
+void aio_co_enter(AioContext *ctx, Coroutine *co);
+
+/**
+ * Return the AioContext whose event loop runs in the current thread.
+ *
+ * If called from an IOThread this will be the IOThread's AioContext.  If
+ * called from the main thread or with the "big QEMU lock" taken it
+ * will be the main loop AioContext.
+ *
+ * Note that the return value is never the main loop's iohandler_ctx; the
+ * main loop AioContext is returned instead.
+ */
+AioContext *qemu_get_current_aio_context(void);
+
+void qemu_set_current_aio_context(AioContext *ctx);
+
+/**
+ * aio_context_setup:
+ * @ctx: the aio context
+ * @errp: error pointer
+ *
+ * Initialize the aio context.
+ *
+ * Returns: true on success, false otherwise
+ */
+bool aio_context_setup(AioContext *ctx, Error **errp);
+
+/**
+ * aio_context_destroy:
+ * @ctx: the aio context
+ *
+ * Destroy the aio context.
+ */
+void aio_context_destroy(AioContext *ctx);
+
+/**
+ * aio_context_set_poll_params:
+ * @ctx: the aio context
+ * @max_ns: how long to busy poll for, in nanoseconds
+ * @grow: polling time growth factor
+ * @shrink: polling time shrink factor
+ *
+ * Poll mode can be disabled by setting poll_max_ns to 0.
+ */
+void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
+                                 int64_t grow, int64_t shrink,
+                                 Error **errp);
+
+/**
+ * aio_context_set_aio_params:
+ * @ctx: the aio context
+ * @max_batch: maximum number of requests in a batch, 0 means that the
+ *             engine will use its default
+ */
+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch);
+
+/**
+ * aio_context_set_thread_pool_params:
+ * @ctx: the aio context
+ * @min: min number of threads to have readily available in the thread pool
+ * @max: max number of threads the thread pool can contain
+ */
+void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
+                                        int64_t max, Error **errp);
+
+#ifdef CONFIG_LINUX_IO_URING
+/**
+ * aio_has_io_uring: Return whether io_uring is available.
+ *
+ * io_uring is either available in all AioContexts or in none, so this only
+ * needs to be called once from within any thread's AioContext.
+ */
+static inline bool aio_has_io_uring(void)
+{
+    AioContext *ctx = qemu_get_current_aio_context();
+    return ctx->fdmon_ops->add_sqe;
+}
+
+/**
+ * aio_add_sqe: Add an io_uring sqe for submission.
+ * @prep_sqe: invoked with an sqe that should be prepared for submission
+ * @opaque: user-defined argument to @prep_sqe()
+ * @cqe_handler: the unique cqe handler associated with this request
+ *
+ * The caller's @prep_sqe() function is invoked to fill in the details of the
+ * sqe. Do not call io_uring_sqe_set_data() on this sqe.
+ *
+ * The sqe is submitted by the current AioContext. The kernel may see the sqe
+ * as soon as @prep_sqe() returns or it may take until the next event loop
+ * iteration.
+ *
+ * When the AioContext is destroyed, pending sqes are ignored and their
+ * CqeHandlers are not invoked.
+ *
+ * This function must be called only when aio_has_io_uring() returns true.
+ */
+void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
+                 void *opaque, CqeHandler *cqe_handler);
+#endif /* CONFIG_LINUX_IO_URING */
+
+#endif
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 0d55c636b21..8c1241a2c11 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -25,7 +25,7 @@
 #ifndef QEMU_MAIN_LOOP_H
 #define QEMU_MAIN_LOOP_H
 
-#include "block/aio.h"
+#include "qemu/aio.h"
 #include "qom/object.h"
 #include "system/event-loop-base.h"
 
@@ -431,7 +431,7 @@ void qemu_cond_timedwait_bql(QemuCond *cond, int ms);
 #define qemu_bh_new(cb, opaque) \
     qemu_bh_new_full((cb), (opaque), (stringify(cb)), NULL)
 QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name,
-                         MemReentrancyGuard *reentrancy_guard);
+                         struct MemReentrancyGuard *reentrancy_guard);
 void qemu_bh_schedule_idle(QEMUBH *bh);
 
 enum {
-- 
2.51.1

Re: [PATCH] block: split qemu/aio.h out of block/aio.h

Posted by Kevin Wolf 2 months, 1 week ago

Am 28.11.2025 um 11:15 hat Paolo Bonzini geschrieben:
> Rust bindings are roughly broken up according to subdirectories of
> include/ (that's not exact, but it's roughly an aim).  However,
> block/aio.h contains both block layer-specific concepts (BlockAIOCB,
> BlockCompletionFunc) and AioContext-related declarations that are
> used by qemu/main-loop.h.
> 
> Break out the latter into their own header file, and use that to
> break the inclusion of block/ from qemu/main-loop.h.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> 	Based on top of
> 	https://lore.kernel.org/qemu-devel/20251127131516.80807-3-pbonzini@redhat.com/
> 
>  include/block/aio.h      | 838 +-------------------------------------
>  include/qemu/aio.h       | 852 +++++++++++++++++++++++++++++++++++++++
>  include/qemu/main-loop.h |   4 +-
>  3 files changed, 857 insertions(+), 837 deletions(-)
>  create mode 100644 include/qemu/aio.h

Is it really worth keeping a separate tiny include file for BlockAIOCB?
Maybe let's have a patch to move it and its three functions to
block-common.h, and then just move this file unchanged into the place
where it should always have been.

Kevin

Re: [PATCH] block: split qemu/aio.h out of block/aio.h

Posted by Paolo Bonzini 2 months, 1 week ago

Il mar 2 dic 2025, 17:03 Kevin Wolf <kwolf@redhat.com> ha scritto:

> Is it really worth keeping a separate tiny include file for BlockAIOCB?
> Maybe let's have a patch to move it and its three functions to
> block-common.h, and then just move this file unchanged into the place
> where it should always have been.
>

Ok, sure! I did it this way just because it avoided the full-tree change to
rename the include file, but I will reorganize it as a content
reorganization + rename instead.

Paolo


> Kevin
>
>

Re: [PATCH] block: split qemu/aio.h out of block/aio.h

Posted by Prasad Pandit 2 months, 1 week ago

On Fri, 28 Nov 2025 at 15:47, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Rust bindings are roughly broken up according to subdirectories of
> include/ (that's not exact, but it's roughly an aim).  However,
> block/aio.h contains both block layer-specific concepts (BlockAIOCB,
> BlockCompletionFunc) and AioContext-related declarations that are
> used by qemu/main-loop.h.
>
> Break out the latter into their own header file, and use that to
> break the inclusion of block/ from qemu/main-loop.h.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>         Based on top of
>         https://lore.kernel.org/qemu-devel/20251127131516.80807-3-pbonzini@redhat.com/
>
>  include/block/aio.h      | 838 +-------------------------------------
>  include/qemu/aio.h       | 852 +++++++++++++++++++++++++++++++++++++++
>  include/qemu/main-loop.h |   4 +-
>  3 files changed, 857 insertions(+), 837 deletions(-)
>  create mode 100644 include/qemu/aio.h
>
> diff --git a/include/block/aio.h b/include/block/aio.h
> index cc3d5f25a24..dba423f896e 100644
> --- a/include/block/aio.h
> +++ b/include/block/aio.h
> @@ -11,22 +11,13 @@
>   *
>   */
>
> -#ifndef QEMU_AIO_H
> -#define QEMU_AIO_H
> +#ifndef QEMU_BLOCK_AIO_H
> +#define QEMU_BLOCK_AIO_H
>
> -#ifdef CONFIG_LINUX_IO_URING
> -#include <liburing.h>
> -#endif
> -#include "qemu/coroutine-core.h"
> -#include "qemu/queue.h"
> -#include "qemu/event_notifier.h"
> -#include "qemu/lockcnt.h"
> -#include "qemu/thread.h"
> -#include "qemu/timer.h"
> +#include "qemu/aio.h"
>  #include "block/graph-lock.h"
>  #include "hw/core/qdev.h"
>
> -
>  typedef struct BlockAIOCB BlockAIOCB;
>  typedef void BlockCompletionFunc(void *opaque, int ret);
>
> @@ -48,827 +39,4 @@ void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
>  void qemu_aio_unref(void *p);
>  void qemu_aio_ref(void *p);
>
> -typedef struct AioHandler AioHandler;
> -typedef QLIST_HEAD(, AioHandler) AioHandlerList;
> -typedef void QEMUBHFunc(void *opaque);
> -typedef bool AioPollFn(void *opaque);
> -typedef void IOHandler(void *opaque);
> -
> -struct ThreadPoolAio;
> -struct LinuxAioState;
> -typedef struct LuringState LuringState;
> -
> -/* Is polling disabled? */
> -bool aio_poll_disabled(AioContext *ctx);
> -
> -#ifdef CONFIG_LINUX_IO_URING
> -/*
> - * Each io_uring request must have a unique CqeHandler that processes the cqe.
> - * The lifetime of a CqeHandler must be at least from aio_add_sqe() until
> - * ->cb() invocation.
> - */
> -typedef struct CqeHandler CqeHandler;
> -struct CqeHandler {
> -    /* Called by the AioContext when the request has completed */
> -    void (*cb)(CqeHandler *handler);
> -
> -    /* Used internally, do not access this */
> -    QSIMPLEQ_ENTRY(CqeHandler) next;
> -
> -    /* This field is filled in before ->cb() is called */
> -    struct io_uring_cqe cqe;
> -};
> -
> -typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ;
> -#endif /* CONFIG_LINUX_IO_URING */
> -
> -/* Callbacks for file descriptor monitoring implementations */
> -typedef struct {
> -    /*
> -     * update:
> -     * @ctx: the AioContext
> -     * @old_node: the existing handler or NULL if this file descriptor is being
> -     *            monitored for the first time
> -     * @new_node: the new handler or NULL if this file descriptor is being
> -     *            removed
> -     *
> -     * Add/remove/modify a monitored file descriptor.
> -     *
> -     * Called with ctx->list_lock acquired.
> -     */
> -    void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
> -
> -    /*
> -     * wait:
> -     * @ctx: the AioContext
> -     * @ready_list: list for handlers that become ready
> -     * @timeout: maximum duration to wait, in nanoseconds
> -     *
> -     * Wait for file descriptors to become ready and place them on ready_list.
> -     *
> -     * Called with ctx->list_lock incremented but not locked.
> -     *
> -     * Returns: number of ready file descriptors.
> -     */
> -    int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
> -
> -    /*
> -     * need_wait:
> -     * @ctx: the AioContext
> -     *
> -     * Tell aio_poll() when to stop userspace polling early because ->wait()
> -     * has fds ready.
> -     *
> -     * File descriptor monitoring implementations that cannot poll fd readiness
> -     * from userspace should use aio_poll_disabled() here.  This ensures that
> -     * file descriptors are not starved by handlers that frequently make
> -     * progress via userspace polling.
> -     *
> -     * Returns: true if ->wait() should be called, false otherwise.
> -     */
> -    bool (*need_wait)(AioContext *ctx);
> -
> -    /*
> -     * dispatch:
> -     * @ctx: the AioContext
> -     *
> -     * Dispatch any work that is specific to this file descriptor monitoring
> -     * implementation. Usually the event loop's generic file descriptor
> -     * monitoring, BH, and timer dispatching code is sufficient, but file
> -     * descriptor monitoring implementations offering additional functionality
> -     * may need to implement this function for custom behavior. Called at a
> -     * point in the event loop when it is safe to invoke user-defined
> -     * callbacks.
> -     *
> -     * This function is optional and may be NULL.
> -     *
> -     * Returns: true if progress was made (see aio_poll()'s return value),
> -     * false otherwise.
> -     */
> -    bool (*dispatch)(AioContext *ctx);
> -
> -    /*
> -     * gsource_prepare:
> -     * @ctx: the AioContext
> -     *
> -     * Prepare for the glib event loop to wait for events instead of the usual
> -     * ->wait() call. See glib's GSourceFuncs->prepare().
> -     */
> -    void (*gsource_prepare)(AioContext *ctx);
> -
> -    /*
> -     * gsource_check:
> -     * @ctx: the AioContext
> -     *
> -     * Called by the glib event loop from glib's GSourceFuncs->check() after
> -     * waiting for events.
> -     *
> -     * Returns: true when ready to be dispatched.
> -     */
> -    bool (*gsource_check)(AioContext *ctx);
> -
> -    /*
> -     * gsource_dispatch:
> -     * @ctx: the AioContext
> -     * @ready_list: list for handlers that become ready
> -     *
> -     * Place ready AioHandlers on ready_list. Called as part of the glib event
> -     * loop from glib's GSourceFuncs->dispatch().
> -     *
> -     * Called with list_lock incremented.
> -     */
> -    void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);
> -
> -#ifdef CONFIG_LINUX_IO_URING
> -    /**
> -     * add_sqe: Add an io_uring sqe for submission.
> -     * @prep_sqe: invoked with an sqe that should be prepared for submission
> -     * @opaque: user-defined argument to @prep_sqe()
> -     * @cqe_handler: the unique cqe handler associated with this request
> -     *
> -     * The caller's @prep_sqe() function is invoked to fill in the details of
> -     * the sqe. Do not call io_uring_sqe_set_data() on this sqe.
> -     *
> -     * The kernel may see the sqe as soon as @prep_sqe() returns or it may take
> -     * until the next event loop iteration.
> -     *
> -     * This function is called from the current AioContext and is not
> -     * thread-safe.
> -     */
> -    void (*add_sqe)(AioContext *ctx,
> -                    void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
> -                    void *opaque, CqeHandler *cqe_handler);
> -#endif /* CONFIG_LINUX_IO_URING */
> -} FDMonOps;
> -
> -/*
> - * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
> - * scheduled BHs are not processed until the next aio_bh_poll() call.  All
> - * active aio_bh_poll() calls chain their slices together in a list, so that
> - * nested aio_bh_poll() calls process all scheduled bottom halves.
> - */
> -typedef QSLIST_HEAD(, QEMUBH) BHList;
> -typedef struct BHListSlice BHListSlice;
> -struct BHListSlice {
> -    BHList bh_list;
> -    QSIMPLEQ_ENTRY(BHListSlice) next;
> -};
> -
> -typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
> -
> -typedef struct AioPolledEvent {
> -    int64_t ns;        /* current polling time in nanoseconds */
> -} AioPolledEvent;
> -
> -struct AioContext {
> -    GSource source;
> -
> -    /* Used by AioContext users to protect from multi-threaded access.  */
> -    QemuRecMutex lock;
> -
> -    /*
> -     * Keep track of readers and writers of the block layer graph.
> -     * This is essential to avoid performing additions and removal
> -     * of nodes and edges from block graph while some
> -     * other thread is traversing it.
> -     */
> -    BdrvGraphRWlock *bdrv_graph;
> -
> -    /* The list of registered AIO handlers.  Protected by ctx->list_lock. */
> -    AioHandlerList aio_handlers;
> -
> -    /* The list of AIO handlers to be deleted.  Protected by ctx->list_lock. */
> -    AioHandlerList deleted_aio_handlers;
> -
> -    /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
> -     * only written from the AioContext home thread, or under the BQL in
> -     * the case of the main AioContext.  However, it is read from any
> -     * thread so it is still accessed with atomic primitives.
> -     *
> -     * If this field is 0, everything (file descriptors, bottom halves,
> -     * timers) will be re-evaluated before the next blocking poll() or
> -     * io_uring wait; therefore, the event_notifier_set call can be
> -     * skipped.  If it is non-zero, you may need to wake up a concurrent
> -     * aio_poll or the glib main event loop, making event_notifier_set
> -     * necessary.
> -     *
> -     * Bit 0 is reserved for GSource usage of the AioContext, and is 1
> -     * between a call to aio_ctx_prepare and the next call to aio_ctx_check.
> -     * Bits 1-31 simply count the number of active calls to aio_poll
> -     * that are in the prepare or poll phase.
> -     *
> -     * The GSource and aio_poll must use a different mechanism because
> -     * there is no certainty that a call to GSource's prepare callback
> -     * (via g_main_context_prepare) is indeed followed by check and
> -     * dispatch.  It's not clear whether this would be a bug, but let's
> -     * play safe and allow it---it will just cause extra calls to
> -     * event_notifier_set until the next call to dispatch.
> -     *
> -     * Instead, the aio_poll calls include both the prepare and the
> -     * dispatch phase, hence a simple counter is enough for them.
> -     */
> -    uint32_t notify_me;
> -
> -    /* A lock to protect between QEMUBH and AioHandler adders and deleter,
> -     * and to ensure that no callbacks are removed while we're walking and
> -     * dispatching them.
> -     */
> -    QemuLockCnt list_lock;
> -
> -    /* Bottom Halves pending aio_bh_poll() processing */
> -    BHList bh_list;
> -
> -    /* Chained BH list slices for each nested aio_bh_poll() call */
> -    QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list;
> -
> -    /* Used by aio_notify.
> -     *
> -     * "notified" is used to avoid expensive event_notifier_test_and_clear
> -     * calls.  When it is clear, the EventNotifier is clear, or one thread
> -     * is going to clear "notified" before processing more events.  False
> -     * positives are possible, i.e. "notified" could be set even though the
> -     * EventNotifier is clear.
> -     *
> -     * Note that event_notifier_set *cannot* be optimized the same way.  For
> -     * more information on the problem that would result, see "#ifdef BUG2"
> -     * in the docs/aio_notify_accept.promela formal model.
> -     */
> -    bool notified;
> -    EventNotifier notifier;
> -
> -    QSLIST_HEAD(, Coroutine) scheduled_coroutines;
> -    QEMUBH *co_schedule_bh;
> -
> -    int thread_pool_min;
> -    int thread_pool_max;
> -    /* Thread pool for performing work and receiving completion callbacks.
> -     * Has its own locking.
> -     */
> -    struct ThreadPoolAio *thread_pool;
> -
> -#ifdef CONFIG_LINUX_AIO
> -    struct LinuxAioState *linux_aio;
> -#endif
> -#ifdef CONFIG_LINUX_IO_URING
> -    /* State for file descriptor monitoring using Linux io_uring */
> -    struct io_uring fdmon_io_uring;
> -    AioHandlerSList submit_list;
> -    void *io_uring_fd_tag;
> -
> -    /* Pending callback state for cqe handlers */
> -    CqeHandlerSimpleQ cqe_handler_ready_list;
> -#endif /* CONFIG_LINUX_IO_URING */
> -
> -    /* TimerLists for calling timers - one per clock type.  Has its own
> -     * locking.
> -     */
> -    QEMUTimerListGroup tlg;
> -
> -    /* Number of AioHandlers without .io_poll() */
> -    int poll_disable_cnt;
> -
> -    /* Polling mode parameters */
> -    int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
> -    int64_t poll_grow;      /* polling time growth factor */
> -    int64_t poll_shrink;    /* polling time shrink factor */
> -
> -    /* AIO engine parameters */
> -    int64_t aio_max_batch;  /* maximum number of requests in a batch */
> -
> -    /*
> -     * List of handlers participating in userspace polling.  Protected by
> -     * ctx->list_lock.  Iterated and modified mostly by the event loop thread
> -     * from aio_poll() with ctx->list_lock incremented.  aio_set_fd_handler()
> -     * only touches the list to delete nodes if ctx->list_lock's count is zero.
> -     */
> -    AioHandlerList poll_aio_handlers;
> -
> -    /* Are we in polling mode or monitoring file descriptors? */
> -    bool poll_started;
> -
> -    /* epoll(7) state used when built with CONFIG_EPOLL */
> -    int epollfd;
> -
> -    /* The GSource unix fd tag for epollfd */
> -    void *epollfd_tag;
> -
> -    const FDMonOps *fdmon_ops;
> -
> -    /* Was aio_context_new() successful? */
> -    bool initialized;
> -};
> -
> -/**
> - * aio_context_new: Allocate a new AioContext.
> - *
> - * AioContext provide a mini event-loop that can be waited on synchronously.
> - * They also provide bottom halves, a service to execute a piece of code
> - * as soon as possible.
> - */
> -AioContext *aio_context_new(Error **errp);
> -
> -/**
> - * aio_context_ref:
> - * @ctx: The AioContext to operate on.
> - *
> - * Add a reference to an AioContext.
> - */
> -void aio_context_ref(AioContext *ctx);
> -
> -/**
> - * aio_context_unref:
> - * @ctx: The AioContext to operate on.
> - *
> - * Drop a reference to an AioContext.
> - */
> -void aio_context_unref(AioContext *ctx);
> -
> -/**
> - * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
> - * run only once and as soon as possible.
> - *
> - * @name: A human-readable identifier for debugging purposes.
> - */
> -void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
> -                                  const char *name);
> -
> -/**
> - * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run
> - * only once and as soon as possible.
> - *
> - * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the
> - * name string.
> - */
> -#define aio_bh_schedule_oneshot(ctx, cb, opaque) \
> -    aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb)))
> -
> -/**
> - * aio_bh_new_full: Allocate a new bottom half structure.
> - *
> - * Bottom halves are lightweight callbacks whose invocation is guaranteed
> - * to be wait-free, thread-safe and signal-safe.  The #QEMUBH structure
> - * is opaque and must be allocated prior to its use.
> - *
> - * @name: A human-readable identifier for debugging purposes.
> - * @reentrancy_guard: A guard set when entering a cb to prevent
> - * device-reentrancy issues
> - */
> -QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
> -                        const char *name, MemReentrancyGuard *reentrancy_guard);
> -
> -/**
> - * aio_bh_new: Allocate a new bottom half structure
> - *
> - * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
> - * string.
> - */
> -#define aio_bh_new(ctx, cb, opaque) \
> -    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL)
> -
> -/**
> - * aio_bh_new_guarded: Allocate a new bottom half structure with a
> - * reentrancy_guard
> - *
> - * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
> - * string.
> - */
> -#define aio_bh_new_guarded(ctx, cb, opaque, guard) \
> -    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard)
> -
> -/**
> - * aio_notify: Force processing of pending events.
> - *
> - * Similar to signaling a condition variable, aio_notify forces
> - * aio_poll to exit, so that the next call will re-examine pending events.
> - * The caller of aio_notify will usually call aio_poll again very soon,
> - * or go through another iteration of the GLib main loop.  Hence, aio_notify
> - * also has the side effect of recalculating the sets of file descriptors
> - * that the main loop waits for.
> - *
> - * Calling aio_notify is rarely necessary, because for example scheduling
> - * a bottom half calls it already.
> - */
> -void aio_notify(AioContext *ctx);
> -
> -/**
> - * aio_notify_accept: Acknowledge receiving an aio_notify.
> - *
> - * aio_notify() uses an EventNotifier in order to wake up a sleeping
> - * aio_poll() or g_main_context_iteration().  Calls to aio_notify() are
> - * usually rare, but the AioContext has to clear the EventNotifier on
> - * every aio_poll() or g_main_context_iteration() in order to avoid
> - * busy waiting.  This event_notifier_test_and_clear() cannot be done
> - * using the usual aio_context_set_event_notifier(), because it must
> - * be done before processing all events (file descriptors, bottom halves,
> - * timers).
> - *
> - * aio_notify_accept() is an optimized event_notifier_test_and_clear()
> - * that is specific to an AioContext's notifier; it is used internally
> - * to clear the EventNotifier only if aio_notify() had been called.
> - */
> -void aio_notify_accept(AioContext *ctx);
> -
> -/**
> - * aio_bh_call: Executes callback function of the specified BH.
> - */
> -void aio_bh_call(QEMUBH *bh);
> -
> -/**
> - * aio_bh_poll: Poll bottom halves for an AioContext.
> - *
> - * These are internal functions used by the QEMU main loop.
> - * And notice that multiple occurrences of aio_bh_poll cannot
> - * be called concurrently
> - */
> -int aio_bh_poll(AioContext *ctx);
> -
> -/**
> - * qemu_bh_schedule: Schedule a bottom half.
> - *
> - * Scheduling a bottom half interrupts the main loop and causes the
> - * execution of the callback that was passed to qemu_bh_new.
> - *
> - * Bottom halves that are scheduled from a bottom half handler are instantly
> - * invoked.  This can create an infinite loop if a bottom half handler
> - * schedules itself.
> - *
> - * @bh: The bottom half to be scheduled.
> - */
> -void qemu_bh_schedule(QEMUBH *bh);
> -
> -/**
> - * qemu_bh_cancel: Cancel execution of a bottom half.
> - *
> - * Canceling execution of a bottom half undoes the effect of calls to
> - * qemu_bh_schedule without freeing its resources yet.  While cancellation
> - * itself is also wait-free and thread-safe, it can of course race with the
> - * loop that executes bottom halves unless you are holding the iothread
> - * mutex.  This makes it mostly useless if you are not holding the mutex.
> - *
> - * @bh: The bottom half to be canceled.
> - */
> -void qemu_bh_cancel(QEMUBH *bh);
> -
> -/**
> - *qemu_bh_delete: Cancel execution of a bottom half and free its resources.
> - *
> - * Deleting a bottom half frees the memory that was allocated for it by
> - * qemu_bh_new.  It also implies canceling the bottom half if it was
> - * scheduled.
> - * This func is async. The bottom half will do the delete action at the finial
> - * end.
> - *
> - * @bh: The bottom half to be deleted.
> - */
> -void qemu_bh_delete(QEMUBH *bh);
> -
> -/* Return whether there are any pending callbacks from the GSource
> - * attached to the AioContext, before g_poll is invoked.
> - *
> - * This is used internally in the implementation of the GSource.
> - */
> -bool aio_prepare(AioContext *ctx);
> -
> -/* Return whether there are any pending callbacks from the GSource
> - * attached to the AioContext, after g_poll is invoked.
> - *
> - * This is used internally in the implementation of the GSource.
> - */
> -bool aio_pending(AioContext *ctx);
> -
> -/* Dispatch any pending callbacks from the GSource attached to the AioContext.
> - *
> - * This is used internally in the implementation of the GSource.
> - */
> -void aio_dispatch(AioContext *ctx);
> -
> -/* Progress in completing AIO work to occur.  This can issue new pending
> - * aio as a result of executing I/O completion or bh callbacks.
> - *
> - * Return whether any progress was made by executing AIO or bottom half
> - * handlers.  If @blocking == true, this should always be true except
> - * if someone called aio_notify.
> - *
> - * If there are no pending bottom halves, but there are pending AIO
> - * operations, it may not be possible to make any progress without
> - * blocking.  If @blocking is true, this function will wait until one
> - * or more AIO events have completed, to ensure something has moved
> - * before returning.
> - */
> -bool no_coroutine_fn aio_poll(AioContext *ctx, bool blocking);
> -
> -/* Register a file descriptor and associated callbacks.  Behaves very similarly
> - * to qemu_set_fd_handler.  Unlike qemu_set_fd_handler, these callbacks will
> - * be invoked when using aio_poll().
> - *
> - * Code that invokes AIO completion functions should rely on this function
> - * instead of qemu_set_fd_handler[2].
> - */
> -void aio_set_fd_handler(AioContext *ctx,
> -                        int fd,
> -                        IOHandler *io_read,
> -                        IOHandler *io_write,
> -                        AioPollFn *io_poll,
> -                        IOHandler *io_poll_ready,
> -                        void *opaque);
> -
> -/* Register an event notifier and associated callbacks.  Behaves very similarly
> - * to event_notifier_set_handler.  Unlike event_notifier_set_handler, these callbacks
> - * will be invoked when using aio_poll().
> - *
> - * Code that invokes AIO completion functions should rely on this function
> - * instead of event_notifier_set_handler.
> - */
> -void aio_set_event_notifier(AioContext *ctx,
> -                            EventNotifier *notifier,
> -                            EventNotifierHandler *io_read,
> -                            AioPollFn *io_poll,
> -                            EventNotifierHandler *io_poll_ready);
> -
> -/*
> - * Set polling begin/end callbacks for an event notifier that has already been
> - * registered with aio_set_event_notifier.  Do nothing if the event notifier is
> - * not registered.
> - *
> - * Note that if the io_poll_end() callback (or the entire notifier) is removed
> - * during polling, it will not be called, so an io_poll_begin() is not
> - * necessarily always followed by an io_poll_end().
> - */
> -void aio_set_event_notifier_poll(AioContext *ctx,
> -                                 EventNotifier *notifier,
> -                                 EventNotifierHandler *io_poll_begin,
> -                                 EventNotifierHandler *io_poll_end);
> -
> -/* Return a GSource that lets the main loop poll the file descriptors attached
> - * to this AioContext.
> - */
> -GSource *aio_get_g_source(AioContext *ctx);
> -
> -/* Return the ThreadPoolAio bound to this AioContext */
> -struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx);
> -
> -/* Setup the LinuxAioState bound to this AioContext */
> -struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
> -
> -/* Return the LinuxAioState bound to this AioContext */
> -struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
> -
> -/**
> - * aio_timer_new_with_attrs:
> - * @ctx: the aio context
> - * @type: the clock type
> - * @scale: the scale
> - * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
> - *              to assign
> - * @cb: the callback to call on timer expiry
> - * @opaque: the opaque pointer to pass to the callback
> - *
> - * Allocate a new timer (with attributes) attached to the context @ctx.
> - * The function is responsible for memory allocation.
> - *
> - * The preferred interface is aio_timer_init or aio_timer_init_with_attrs.
> - * Use that unless you really need dynamic memory allocation.
> - *
> - * Returns: a pointer to the new timer
> - */
> -static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx,
> -                                                  QEMUClockType type,
> -                                                  int scale, int attributes,
> -                                                  QEMUTimerCB *cb, void *opaque)
> -{
> -    return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque);
> -}
> -
> -/**
> - * aio_timer_new:
> - * @ctx: the aio context
> - * @type: the clock type
> - * @scale: the scale
> - * @cb: the callback to call on timer expiry
> - * @opaque: the opaque pointer to pass to the callback
> - *
> - * Allocate a new timer attached to the context @ctx.
> - * See aio_timer_new_with_attrs for details.
> - *
> - * Returns: a pointer to the new timer
> - */
> -static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type,
> -                                       int scale,
> -                                       QEMUTimerCB *cb, void *opaque)
> -{
> -    return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque);
> -}
> -
> -/**
> - * aio_timer_init_with_attrs:
> - * @ctx: the aio context
> - * @ts: the timer
> - * @type: the clock type
> - * @scale: the scale
> - * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
> - *              to assign
> - * @cb: the callback to call on timer expiry
> - * @opaque: the opaque pointer to pass to the callback
> - *
> - * Initialise a new timer (with attributes) attached to the context @ctx.
> - * The caller is responsible for memory allocation.
> - */
> -static inline void aio_timer_init_with_attrs(AioContext *ctx,
> -                                             QEMUTimer *ts, QEMUClockType type,
> -                                             int scale, int attributes,
> -                                             QEMUTimerCB *cb, void *opaque)
> -{
> -    timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque);
> -}
> -
> -/**
> - * aio_timer_init:
> - * @ctx: the aio context
> - * @ts: the timer
> - * @type: the clock type
> - * @scale: the scale
> - * @cb: the callback to call on timer expiry
> - * @opaque: the opaque pointer to pass to the callback
> - *
> - * Initialise a new timer attached to the context @ctx.
> - * See aio_timer_init_with_attrs for details.
> - */
> -static inline void aio_timer_init(AioContext *ctx,
> -                                  QEMUTimer *ts, QEMUClockType type,
> -                                  int scale,
> -                                  QEMUTimerCB *cb, void *opaque)
> -{
> -    timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque);
> -}
> -
> -/**
> - * aio_compute_timeout:
> - * @ctx: the aio context
> - *
> - * Compute the timeout that a blocking aio_poll should use.
> - */
> -int64_t aio_compute_timeout(AioContext *ctx);
> -
> -/**
> - * aio_co_schedule:
> - * @ctx: the aio context
> - * @co: the coroutine
> - *
> - * Start a coroutine on a remote AioContext.
> - *
> - * The coroutine must not be entered by anyone else while aio_co_schedule()
> - * is active.  In addition the coroutine must have yielded unless ctx
> - * is the context in which the coroutine is running (i.e. the value of
> - * qemu_get_current_aio_context() from the coroutine itself).
> - */
> -void aio_co_schedule(AioContext *ctx, Coroutine *co);
> -
> -/**
> - * aio_co_reschedule_self:
> - * @new_ctx: the new context
> - *
> - * Move the currently running coroutine to new_ctx. If the coroutine is already
> - * running in new_ctx, do nothing.
> - *
> - * Note that this function cannot reschedule from iohandler_ctx to
> - * qemu_aio_context.
> - */
> -void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
> -
> -/**
> - * aio_co_wake:
> - * @co: the coroutine
> - *
> - * Restart a coroutine on the AioContext where it was running last, thus
> - * preventing coroutines from jumping from one context to another when they
> - * go to sleep.
> - *
> - * aio_co_wake may be executed either in coroutine or non-coroutine
> - * context.  The coroutine must not be entered by anyone else while
> - * aio_co_wake() is active.
> - *
> - * If `co`'s AioContext differs from the current AioContext, this will call
> - * aio_co_schedule(), which makes this safe to use even when `co` has not
> - * yielded yet.  In such a case, it will be entered once it yields.
> - *
> - * In contrast, if `co`'s AioContext is equal to the current one, it is
> - * required for `co` to currently be yielding.  This is generally the case
> - * if the caller is not in `co` (i.e. invoked by `co`), because the only
> - * other way for the caller to be running then is for `co` to currently be
> - * yielding.
> - *
> - * Therefore, if there is no way for the caller to be invoked/entered by
> - * `co`, it is generally safe to call this regardless of whether `co` is
> - * known to already be yielding or not -- it only has to yield at some
> - * point.
> - */
> -void aio_co_wake(Coroutine *co);
> -
> -/**
> - * aio_co_enter:
> - * @ctx: the context to run the coroutine
> - * @co: the coroutine to run
> - *
> - * Enter a coroutine in the specified AioContext.
> - */
> -void aio_co_enter(AioContext *ctx, Coroutine *co);
> -
> -/**
> - * Return the AioContext whose event loop runs in the current thread.
> - *
> - * If called from an IOThread this will be the IOThread's AioContext.  If
> - * called from the main thread or with the "big QEMU lock" taken it
> - * will be the main loop AioContext.
> - *
> - * Note that the return value is never the main loop's iohandler_ctx and the
> - * return value is the main loop AioContext instead.
> - */
> -AioContext *qemu_get_current_aio_context(void);
> -
> -void qemu_set_current_aio_context(AioContext *ctx);
> -
> -/**
> - * aio_context_setup:
> - * @ctx: the aio context
> - * @errp: error pointer
> - *
> - * Initialize the aio context.
> - *
> - * Returns: true on success, false otherwise
> - */
> -bool aio_context_setup(AioContext *ctx, Error **errp);
> -
> -/**
> - * aio_context_destroy:
> - * @ctx: the aio context
> - *
> - * Destroy the aio context.
> - */
> -void aio_context_destroy(AioContext *ctx);
> -
> -/**
> - * aio_context_set_poll_params:
> - * @ctx: the aio context
> - * @max_ns: how long to busy poll for, in nanoseconds
> - * @grow: polling time growth factor
> - * @shrink: polling time shrink factor
> - *
> - * Poll mode can be disabled by setting poll_max_ns to 0.
> - */
> -void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
> -                                 int64_t grow, int64_t shrink,
> -                                 Error **errp);
> -
> -/**
> - * aio_context_set_aio_params:
> - * @ctx: the aio context
> - * @max_batch: maximum number of requests in a batch, 0 means that the
> - *             engine will use its default
> - */
> -void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch);
> -
> -/**
> - * aio_context_set_thread_pool_params:
> - * @ctx: the aio context
> - * @min: min number of threads to have readily available in the thread pool
> - * @min: max number of threads the thread pool can contain
> - */
> -void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
> -                                        int64_t max, Error **errp);
> -
> -#ifdef CONFIG_LINUX_IO_URING
> -/**
> - * aio_has_io_uring: Return whether io_uring is available.
> - *
> - * io_uring is either available in all AioContexts or in none, so this only
> - * needs to be called once from within any thread's AioContext.
> - */
> -static inline bool aio_has_io_uring(void)
> -{
> -    AioContext *ctx = qemu_get_current_aio_context();
> -    return ctx->fdmon_ops->add_sqe;
> -}
> -
> -/**
> - * aio_add_sqe: Add an io_uring sqe for submission.
> - * @prep_sqe: invoked with an sqe that should be prepared for submission
> - * @opaque: user-defined argument to @prep_sqe()
> - * @cqe_handler: the unique cqe handler associated with this request
> - *
> - * The caller's @prep_sqe() function is invoked to fill in the details of the
> - * sqe. Do not call io_uring_sqe_set_data() on this sqe.
> - *
> - * The sqe is submitted by the current AioContext. The kernel may see the sqe
> - * as soon as @prep_sqe() returns or it may take until the next event loop
> - * iteration.
> - *
> - * When the AioContext is destroyed, pending sqes are ignored and their
> - * CqeHandlers are not invoked.
> - *
> - * This function must be called only when aio_has_io_uring() returns true.
> - */
> -void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
> -                 void *opaque, CqeHandler *cqe_handler);
> -#endif /* CONFIG_LINUX_IO_URING */
> -
>  #endif
> diff --git a/include/qemu/aio.h b/include/qemu/aio.h
> new file mode 100644
> index 00000000000..8cca2360d1a
> --- /dev/null
> +++ b/include/qemu/aio.h
> @@ -0,0 +1,852 @@
> +/*
> + * QEMU aio implementation
> + *
> + * Copyright IBM, Corp. 2008
> + *
> + * Authors:
> + *  Anthony Liguori   <aliguori@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef QEMU_AIO_H
> +#define QEMU_AIO_H
> +
> +#ifdef CONFIG_LINUX_IO_URING
> +#include <liburing.h>
> +#endif
> +#include "qemu/coroutine-core.h"
> +#include "qemu/queue.h"
> +#include "qemu/event_notifier.h"
> +#include "qemu/lockcnt.h"
> +#include "qemu/thread.h"
> +#include "qemu/timer.h"
> +
> +struct MemReentrancyGuard;
> +
> +typedef struct AioHandler AioHandler;
> +typedef QLIST_HEAD(, AioHandler) AioHandlerList;
> +typedef void QEMUBHFunc(void *opaque);
> +typedef bool AioPollFn(void *opaque);
> +typedef void IOHandler(void *opaque);
> +
> +struct ThreadPoolAio;
> +struct LinuxAioState;
> +typedef struct LuringState LuringState;
> +
> +/* Is polling disabled? */
> +bool aio_poll_disabled(AioContext *ctx);
> +
> +#ifdef CONFIG_LINUX_IO_URING
> +/*
> + * Each io_uring request must have a unique CqeHandler that processes the cqe.
> + * The lifetime of a CqeHandler must be at least from aio_add_sqe() until
> + * ->cb() invocation.
> + */
> +typedef struct CqeHandler CqeHandler;
> +struct CqeHandler {
> +    /* Called by the AioContext when the request has completed */
> +    void (*cb)(CqeHandler *handler);
> +
> +    /* Used internally, do not access this */
> +    QSIMPLEQ_ENTRY(CqeHandler) next;
> +
> +    /* This field is filled in before ->cb() is called */
> +    struct io_uring_cqe cqe;
> +};
> +
> +typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ;
> +#endif /* CONFIG_LINUX_IO_URING */
> +
> +/* Callbacks for file descriptor monitoring implementations */
> +typedef struct {
> +    /*
> +     * update:
> +     * @ctx: the AioContext
> +     * @old_node: the existing handler or NULL if this file descriptor is being
> +     *            monitored for the first time
> +     * @new_node: the new handler or NULL if this file descriptor is being
> +     *            removed
> +     *
> +     * Add/remove/modify a monitored file descriptor.
> +     *
> +     * Called with ctx->list_lock acquired.
> +     */
> +    void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
> +
> +    /*
> +     * wait:
> +     * @ctx: the AioContext
> +     * @ready_list: list for handlers that become ready
> +     * @timeout: maximum duration to wait, in nanoseconds
> +     *
> +     * Wait for file descriptors to become ready and place them on ready_list.
> +     *
> +     * Called with ctx->list_lock incremented but not locked.
> +     *
> +     * Returns: number of ready file descriptors.
> +     */
> +    int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
> +
> +    /*
> +     * need_wait:
> +     * @ctx: the AioContext
> +     *
> +     * Tell aio_poll() when to stop userspace polling early because ->wait()
> +     * has fds ready.
> +     *
> +     * File descriptor monitoring implementations that cannot poll fd readiness
> +     * from userspace should use aio_poll_disabled() here.  This ensures that
> +     * file descriptors are not starved by handlers that frequently make
> +     * progress via userspace polling.
> +     *
> +     * Returns: true if ->wait() should be called, false otherwise.
> +     */
> +    bool (*need_wait)(AioContext *ctx);
> +
> +    /*
> +     * dispatch:
> +     * @ctx: the AioContext
> +     *
> +     * Dispatch any work that is specific to this file descriptor monitoring
> +     * implementation. Usually the event loop's generic file descriptor
> +     * monitoring, BH, and timer dispatching code is sufficient, but file
> +     * descriptor monitoring implementations offering additional functionality
> +     * may need to implement this function for custom behavior. Called at a
> +     * point in the event loop when it is safe to invoke user-defined
> +     * callbacks.
> +     *
> +     * This function is optional and may be NULL.
> +     *
> +     * Returns: true if progress was made (see aio_poll()'s return value),
> +     * false otherwise.
> +     */
> +    bool (*dispatch)(AioContext *ctx);
> +
> +    /*
> +     * gsource_prepare:
> +     * @ctx: the AioContext
> +     *
> +     * Prepare for the glib event loop to wait for events instead of the usual
> +     * ->wait() call. See glib's GSourceFuncs->prepare().
> +     */
> +    void (*gsource_prepare)(AioContext *ctx);
> +
> +    /*
> +     * gsource_check:
> +     * @ctx: the AioContext
> +     *
> +     * Called by the glib event loop from glib's GSourceFuncs->check() after
> +     * waiting for events.
> +     *
> +     * Returns: true when ready to be dispatched.
> +     */
> +    bool (*gsource_check)(AioContext *ctx);
> +
> +    /*
> +     * gsource_dispatch:
> +     * @ctx: the AioContext
> +     * @ready_list: list for handlers that become ready
> +     *
> +     * Place ready AioHandlers on ready_list. Called as part of the glib event
> +     * loop from glib's GSourceFuncs->dispatch().
> +     *
> +     * Called with list_lock incremented.
> +     */
> +    void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);
> +
> +#ifdef CONFIG_LINUX_IO_URING
> +    /**
> +     * add_sqe: Add an io_uring sqe for submission.
> +     * @prep_sqe: invoked with an sqe that should be prepared for submission
> +     * @opaque: user-defined argument to @prep_sqe()
> +     * @cqe_handler: the unique cqe handler associated with this request
> +     *
> +     * The caller's @prep_sqe() function is invoked to fill in the details of
> +     * the sqe. Do not call io_uring_sqe_set_data() on this sqe.
> +     *
> +     * The kernel may see the sqe as soon as @prep_sqe() returns or it may take
> +     * until the next event loop iteration.
> +     *
> +     * This function is called from the current AioContext and is not
> +     * thread-safe.
> +     */
> +    void (*add_sqe)(AioContext *ctx,
> +                    void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
> +                    void *opaque, CqeHandler *cqe_handler);
> +#endif /* CONFIG_LINUX_IO_URING */
> +} FDMonOps;
> +
> +/*
> + * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
> + * scheduled BHs are not processed until the next aio_bh_poll() call.  All
> + * active aio_bh_poll() calls chain their slices together in a list, so that
> + * nested aio_bh_poll() calls process all scheduled bottom halves.
> + */
> +typedef QSLIST_HEAD(, QEMUBH) BHList;
> +typedef struct BHListSlice BHListSlice;
> +struct BHListSlice {
> +    BHList bh_list;
> +    QSIMPLEQ_ENTRY(BHListSlice) next;
> +};
> +
> +typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
> +
> +typedef struct AioPolledEvent {
> +    int64_t ns;        /* current polling time in nanoseconds */
> +} AioPolledEvent;
> +
> +struct AioContext {
> +    GSource source;
> +
> +    /* Used by AioContext users to protect from multi-threaded access.  */
> +    QemuRecMutex lock;
> +
> +    /*
> +     * Keep track of readers and writers of the block layer graph.
> +     * This is essential to avoid performing additions and removal
> +     * of nodes and edges from block graph while some
> +     * other thread is traversing it.
> +     */
> +    struct BdrvGraphRWlock *bdrv_graph;
> +
> +    /* The list of registered AIO handlers.  Protected by ctx->list_lock. */
> +    AioHandlerList aio_handlers;
> +
> +    /* The list of AIO handlers to be deleted.  Protected by ctx->list_lock. */
> +    AioHandlerList deleted_aio_handlers;
> +
> +    /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
> +     * only written from the AioContext home thread, or under the BQL in
> +     * the case of the main AioContext.  However, it is read from any
> +     * thread so it is still accessed with atomic primitives.
> +     *
> +     * If this field is 0, everything (file descriptors, bottom halves,
> +     * timers) will be re-evaluated before the next blocking poll() or
> +     * io_uring wait; therefore, the event_notifier_set call can be
> +     * skipped.  If it is non-zero, you may need to wake up a concurrent
> +     * aio_poll or the glib main event loop, making event_notifier_set
> +     * necessary.
> +     *
> +     * Bit 0 is reserved for GSource usage of the AioContext, and is 1
> +     * between a call to aio_ctx_prepare and the next call to aio_ctx_check.
> +     * Bits 1-31 simply count the number of active calls to aio_poll
> +     * that are in the prepare or poll phase.
> +     *
> +     * The GSource and aio_poll must use a different mechanism because
> +     * there is no certainty that a call to GSource's prepare callback
> +     * (via g_main_context_prepare) is indeed followed by check and
> +     * dispatch.  It's not clear whether this would be a bug, but let's
> +     * play safe and allow it---it will just cause extra calls to
> +     * event_notifier_set until the next call to dispatch.
> +     *
> +     * Instead, the aio_poll calls include both the prepare and the
> +     * dispatch phase, hence a simple counter is enough for them.
> +     */
> +    uint32_t notify_me;
> +
> +    /* A lock to protect between QEMUBH and AioHandler adders and deleters,
> +     * and to ensure that no callbacks are removed while we're walking and
> +     * dispatching them.
> +     */
> +    QemuLockCnt list_lock;
> +
> +    /* Bottom Halves pending aio_bh_poll() processing */
> +    BHList bh_list;
> +
> +    /* Chained BH list slices for each nested aio_bh_poll() call */
> +    QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list;
> +
> +    /* Used by aio_notify.
> +     *
> +     * "notified" is used to avoid expensive event_notifier_test_and_clear
> +     * calls.  When it is clear, the EventNotifier is clear, or one thread
> +     * is going to clear "notified" before processing more events.  False
> +     * positives are possible, i.e. "notified" could be set even though the
> +     * EventNotifier is clear.
> +     *
> +     * Note that event_notifier_set *cannot* be optimized the same way.  For
> +     * more information on the problem that would result, see "#ifdef BUG2"
> +     * in the docs/aio_notify_accept.promela formal model.
> +     */
> +    bool notified;
> +    EventNotifier notifier;
> +
> +    QSLIST_HEAD(, Coroutine) scheduled_coroutines;
> +    QEMUBH *co_schedule_bh;
> +
> +    int thread_pool_min;
> +    int thread_pool_max;
> +    /* Thread pool for performing work and receiving completion callbacks.
> +     * Has its own locking.
> +     */
> +    struct ThreadPoolAio *thread_pool;
> +
> +#ifdef CONFIG_LINUX_AIO
> +    struct LinuxAioState *linux_aio;
> +#endif
> +#ifdef CONFIG_LINUX_IO_URING
> +    /* State for file descriptor monitoring using Linux io_uring */
> +    struct io_uring fdmon_io_uring;
> +    AioHandlerSList submit_list;
> +    void *io_uring_fd_tag;
> +
> +    /* Pending callback state for cqe handlers */
> +    CqeHandlerSimpleQ cqe_handler_ready_list;
> +#endif /* CONFIG_LINUX_IO_URING */
> +
> +    /* TimerLists for calling timers - one per clock type.  Has its own
> +     * locking.
> +     */
> +    QEMUTimerListGroup tlg;
> +
> +    /* Number of AioHandlers without .io_poll() */
> +    int poll_disable_cnt;
> +
> +    /* Polling mode parameters */
> +    int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
> +    int64_t poll_grow;      /* polling time growth factor */
> +    int64_t poll_shrink;    /* polling time shrink factor */
> +
> +    /* AIO engine parameters */
> +    int64_t aio_max_batch;  /* maximum number of requests in a batch */
> +
> +    /*
> +     * List of handlers participating in userspace polling.  Protected by
> +     * ctx->list_lock.  Iterated and modified mostly by the event loop thread
> +     * from aio_poll() with ctx->list_lock incremented.  aio_set_fd_handler()
> +     * only touches the list to delete nodes if ctx->list_lock's count is zero.
> +     */
> +    AioHandlerList poll_aio_handlers;
> +
> +    /* Are we in polling mode or monitoring file descriptors? */
> +    bool poll_started;
> +
> +    /* epoll(7) state used when built with CONFIG_EPOLL */
> +    int epollfd;
> +
> +    /* The GSource unix fd tag for epollfd */
> +    void *epollfd_tag;
> +
> +    const FDMonOps *fdmon_ops;
> +
> +    /* Was aio_context_new() successful? */
> +    bool initialized;
> +};
> +
> +/**
> + * aio_context_new: Allocate a new AioContext.
> + *
> + * AioContexts provide a mini event-loop that can be waited on synchronously.
> + * They also provide bottom halves, a service to execute a piece of code
> + * as soon as possible.
> + */
> +AioContext *aio_context_new(Error **errp);
> +
> +/**
> + * aio_context_ref:
> + * @ctx: The AioContext to operate on.
> + *
> + * Add a reference to an AioContext.
> + */
> +void aio_context_ref(AioContext *ctx);
> +
> +/**
> + * aio_context_unref:
> + * @ctx: The AioContext to operate on.
> + *
> + * Drop a reference to an AioContext.
> + */
> +void aio_context_unref(AioContext *ctx);
> +
> +/**
> + * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
> + * run only once and as soon as possible.
> + *
> + * @name: A human-readable identifier for debugging purposes.
> + */
> +void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
> +                                  const char *name);
> +
> +/**
> + * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run
> + * only once and as soon as possible.
> + *
> + * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the
> + * name string.
> + */
> +#define aio_bh_schedule_oneshot(ctx, cb, opaque) \
> +    aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb)))
> +
> +/**
> + * aio_bh_new_full: Allocate a new bottom half structure.
> + *
> + * Bottom halves are lightweight callbacks whose invocation is guaranteed
> + * to be wait-free, thread-safe and signal-safe.  The #QEMUBH structure
> + * is opaque and must be allocated prior to its use.
> + *
> + * @name: A human-readable identifier for debugging purposes.
> + * @reentrancy_guard: A guard set when entering a cb to prevent
> + * device-reentrancy issues
> + */
> +QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
> +                        const char *name, struct MemReentrancyGuard *reentrancy_guard);
> +
> +/**
> + * aio_bh_new: Allocate a new bottom half structure
> + *
> + * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
> + * string.
> + */
> +#define aio_bh_new(ctx, cb, opaque) \
> +    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL)
> +
> +/**
> + * aio_bh_new_guarded: Allocate a new bottom half structure with a
> + * reentrancy_guard
> + *
> + * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
> + * string.
> + */
> +#define aio_bh_new_guarded(ctx, cb, opaque, guard) \
> +    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard)
> +
> +/**
> + * aio_notify: Force processing of pending events.
> + *
> + * Similar to signaling a condition variable, aio_notify forces
> + * aio_poll to exit, so that the next call will re-examine pending events.
> + * The caller of aio_notify will usually call aio_poll again very soon,
> + * or go through another iteration of the GLib main loop.  Hence, aio_notify
> + * also has the side effect of recalculating the sets of file descriptors
> + * that the main loop waits for.
> + *
> + * Calling aio_notify is rarely necessary, because for example scheduling
> + * a bottom half calls it already.
> + */
> +void aio_notify(AioContext *ctx);
> +
> +/**
> + * aio_notify_accept: Acknowledge receiving an aio_notify.
> + *
> + * aio_notify() uses an EventNotifier in order to wake up a sleeping
> + * aio_poll() or g_main_context_iteration().  Calls to aio_notify() are
> + * usually rare, but the AioContext has to clear the EventNotifier on
> + * every aio_poll() or g_main_context_iteration() in order to avoid
> + * busy waiting.  This event_notifier_test_and_clear() cannot be done
> + * using the usual aio_context_set_event_notifier(), because it must
> + * be done before processing all events (file descriptors, bottom halves,
> + * timers).
> + *
> + * aio_notify_accept() is an optimized event_notifier_test_and_clear()
> + * that is specific to an AioContext's notifier; it is used internally
> + * to clear the EventNotifier only if aio_notify() had been called.
> + */
> +void aio_notify_accept(AioContext *ctx);
> +
> +/**
> + * aio_bh_call: Executes callback function of the specified BH.
> + */
> +void aio_bh_call(QEMUBH *bh);
> +
> +/**
> + * aio_bh_poll: Poll bottom halves for an AioContext.
> + *
> + * This is an internal function used by the QEMU main loop.
> + * Note that multiple invocations of aio_bh_poll must not
> + * run concurrently.
> + */
> +int aio_bh_poll(AioContext *ctx);
> +
> +/**
> + * qemu_bh_schedule: Schedule a bottom half.
> + *
> + * Scheduling a bottom half interrupts the main loop and causes the
> + * execution of the callback that was passed to qemu_bh_new.
> + *
> + * Bottom halves that are scheduled from a bottom half handler are instantly
> + * invoked.  This can create an infinite loop if a bottom half handler
> + * schedules itself.
> + *
> + * @bh: The bottom half to be scheduled.
> + */
> +void qemu_bh_schedule(QEMUBH *bh);
> +
> +/**
> + * qemu_bh_cancel: Cancel execution of a bottom half.
> + *
> + * Canceling execution of a bottom half undoes the effect of calls to
> + * qemu_bh_schedule without freeing its resources yet.  While cancellation
> + * itself is also wait-free and thread-safe, it can of course race with the
> + * loop that executes bottom halves unless you are holding the iothread
> + * mutex.  This makes it mostly useless if you are not holding the mutex.
> + *
> + * @bh: The bottom half to be canceled.
> + */
> +void qemu_bh_cancel(QEMUBH *bh);
> +
> +/**
> + * qemu_bh_delete: Cancel execution of a bottom half and free its resources.
> + *
> + * Deleting a bottom half frees the memory that was allocated for it by
> + * qemu_bh_new.  It also implies canceling the bottom half if it was
> + * scheduled.
> + * This function is asynchronous. The bottom half will do the delete action
> + * at the very end.
> + *
> + * @bh: The bottom half to be deleted.
> + */
> +void qemu_bh_delete(QEMUBH *bh);
> +
> +/* Return whether there are any pending callbacks from the GSource
> + * attached to the AioContext, before g_poll is invoked.
> + *
> + * This is used internally in the implementation of the GSource.
> + */
> +bool aio_prepare(AioContext *ctx);
> +
> +/* Return whether there are any pending callbacks from the GSource
> + * attached to the AioContext, after g_poll is invoked.
> + *
> + * This is used internally in the implementation of the GSource.
> + */
> +bool aio_pending(AioContext *ctx);
> +
> +/* Dispatch any pending callbacks from the GSource attached to the AioContext.
> + *
> + * This is used internally in the implementation of the GSource.
> + */
> +void aio_dispatch(AioContext *ctx);
> +
> +/* Make progress in completing AIO work.  This can issue new pending
> + * aio as a result of executing I/O completion or bh callbacks.
> + *
> + * Return whether any progress was made by executing AIO or bottom half
> + * handlers.  If @blocking == true, this should always be true except
> + * if someone called aio_notify.
> + *
> + * If there are no pending bottom halves, but there are pending AIO
> + * operations, it may not be possible to make any progress without
> + * blocking.  If @blocking is true, this function will wait until one
> + * or more AIO events have completed, to ensure something has moved
> + * before returning.
> + */
> +bool no_coroutine_fn aio_poll(AioContext *ctx, bool blocking);
> +
> +/* Register a file descriptor and associated callbacks.  Behaves very similarly
> + * to qemu_set_fd_handler.  Unlike qemu_set_fd_handler, these callbacks will
> + * be invoked when using aio_poll().
> + *
> + * Code that invokes AIO completion functions should rely on this function
> + * instead of qemu_set_fd_handler[2].
> + */
> +void aio_set_fd_handler(AioContext *ctx,
> +                        int fd,
> +                        IOHandler *io_read,
> +                        IOHandler *io_write,
> +                        AioPollFn *io_poll,
> +                        IOHandler *io_poll_ready,
> +                        void *opaque);
> +
> +/* Register an event notifier and associated callbacks.  Behaves very similarly
> + * to event_notifier_set_handler.  Unlike event_notifier_set_handler, these callbacks
> + * will be invoked when using aio_poll().
> + *
> + * Code that invokes AIO completion functions should rely on this function
> + * instead of event_notifier_set_handler.
> + */
> +void aio_set_event_notifier(AioContext *ctx,
> +                            EventNotifier *notifier,
> +                            EventNotifierHandler *io_read,
> +                            AioPollFn *io_poll,
> +                            EventNotifierHandler *io_poll_ready);
> +
> +/*
> + * Set polling begin/end callbacks for an event notifier that has already been
> + * registered with aio_set_event_notifier.  Do nothing if the event notifier is
> + * not registered.
> + *
> + * Note that if the io_poll_end() callback (or the entire notifier) is removed
> + * during polling, it will not be called, so an io_poll_begin() is not
> + * necessarily always followed by an io_poll_end().
> + */
> +void aio_set_event_notifier_poll(AioContext *ctx,
> +                                 EventNotifier *notifier,
> +                                 EventNotifierHandler *io_poll_begin,
> +                                 EventNotifierHandler *io_poll_end);
> +
> +/* Return a GSource that lets the main loop poll the file descriptors attached
> + * to this AioContext.
> + */
> +GSource *aio_get_g_source(AioContext *ctx);
> +
> +/* Return the ThreadPoolAio bound to this AioContext */
> +struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx);
> +
> +/* Setup the LinuxAioState bound to this AioContext */
> +struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
> +
> +/* Return the LinuxAioState bound to this AioContext */
> +struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
> +
> +/**
> + * aio_timer_new_with_attrs:
> + * @ctx: the aio context
> + * @type: the clock type
> + * @scale: the scale
> + * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
> + *              to assign
> + * @cb: the callback to call on timer expiry
> + * @opaque: the opaque pointer to pass to the callback
> + *
> + * Allocate a new timer (with attributes) attached to the context @ctx.
> + * The function is responsible for memory allocation.
> + *
> + * The preferred interface is aio_timer_init or aio_timer_init_with_attrs.
> + * Use that unless you really need dynamic memory allocation.
> + *
> + * Returns: a pointer to the new timer
> + */
> +static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx,
> +                                                  QEMUClockType type,
> +                                                  int scale, int attributes,
> +                                                  QEMUTimerCB *cb, void *opaque)
> +{
> +    return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque);
> +}
> +
> +/**
> + * aio_timer_new:
> + * @ctx: the aio context
> + * @type: the clock type
> + * @scale: the scale
> + * @cb: the callback to call on timer expiry
> + * @opaque: the opaque pointer to pass to the callback
> + *
> + * Allocate a new timer attached to the context @ctx.
> + * See aio_timer_new_with_attrs for details.
> + *
> + * Returns: a pointer to the new timer
> + */
> +static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type,
> +                                       int scale,
> +                                       QEMUTimerCB *cb, void *opaque)
> +{
> +    return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque);
> +}
> +
> +/**
> + * aio_timer_init_with_attrs:
> + * @ctx: the aio context
> + * @ts: the timer
> + * @type: the clock type
> + * @scale: the scale
> + * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
> + *              to assign
> + * @cb: the callback to call on timer expiry
> + * @opaque: the opaque pointer to pass to the callback
> + *
> + * Initialise a new timer (with attributes) attached to the context @ctx.
> + * The caller is responsible for memory allocation.
> + */
> +static inline void aio_timer_init_with_attrs(AioContext *ctx,
> +                                             QEMUTimer *ts, QEMUClockType type,
> +                                             int scale, int attributes,
> +                                             QEMUTimerCB *cb, void *opaque)
> +{
> +    timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque);
> +}
> +
> +/**
> + * aio_timer_init:
> + * @ctx: the aio context
> + * @ts: the timer
> + * @type: the clock type
> + * @scale: the scale
> + * @cb: the callback to call on timer expiry
> + * @opaque: the opaque pointer to pass to the callback
> + *
> + * Initialise a new timer attached to the context @ctx.
> + * See aio_timer_init_with_attrs for details.
> + */
> +static inline void aio_timer_init(AioContext *ctx,
> +                                  QEMUTimer *ts, QEMUClockType type,
> +                                  int scale,
> +                                  QEMUTimerCB *cb, void *opaque)
> +{
> +    timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque);
> +}
> +
> +/**
> + * aio_compute_timeout:
> + * @ctx: the aio context
> + *
> + * Compute the timeout that a blocking aio_poll should use.
> + */
> +int64_t aio_compute_timeout(AioContext *ctx);
> +
> +/**
> + * aio_co_schedule:
> + * @ctx: the aio context
> + * @co: the coroutine
> + *
> + * Start a coroutine on a remote AioContext.
> + *
> + * The coroutine must not be entered by anyone else while aio_co_schedule()
> + * is active.  In addition the coroutine must have yielded unless ctx
> + * is the context in which the coroutine is running (i.e. the value of
> + * qemu_get_current_aio_context() from the coroutine itself).
> + */
> +void aio_co_schedule(AioContext *ctx, Coroutine *co);
> +
> +/**
> + * aio_co_reschedule_self:
> + * @new_ctx: the new context
> + *
> + * Move the currently running coroutine to new_ctx. If the coroutine is already
> + * running in new_ctx, do nothing.
> + *
> + * Note that this function cannot reschedule from iohandler_ctx to
> + * qemu_aio_context.
> + */
> +void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
> +
> +/**
> + * aio_co_wake:
> + * @co: the coroutine
> + *
> + * Restart a coroutine on the AioContext where it was running last, thus
> + * preventing coroutines from jumping from one context to another when they
> + * go to sleep.
> + *
> + * aio_co_wake may be executed either in coroutine or non-coroutine
> + * context.  The coroutine must not be entered by anyone else while
> + * aio_co_wake() is active.
> + *
> + * If `co`'s AioContext differs from the current AioContext, this will call
> + * aio_co_schedule(), which makes this safe to use even when `co` has not
> + * yielded yet.  In such a case, it will be entered once it yields.
> + *
> + * In contrast, if `co`'s AioContext is equal to the current one, it is
> + * required for `co` to currently be yielding.  This is generally the case
> + * if the caller is not in `co` (i.e. invoked by `co`), because the only
> + * other way for the caller to be running then is for `co` to currently be
> + * yielding.
> + *
> + * Therefore, if there is no way for the caller to be invoked/entered by
> + * `co`, it is generally safe to call this regardless of whether `co` is
> + * known to already be yielding or not -- it only has to yield at some
> + * point.
> + */
> +void aio_co_wake(Coroutine *co);
> +
> +/**
> + * aio_co_enter:
> + * @ctx: the context to run the coroutine
> + * @co: the coroutine to run
> + *
> + * Enter a coroutine in the specified AioContext.
> + */
> +void aio_co_enter(AioContext *ctx, Coroutine *co);
> +
> +/**
> + * Return the AioContext whose event loop runs in the current thread.
> + *
> + * If called from an IOThread this will be the IOThread's AioContext.  If
> + * called from the main thread or with the "big QEMU lock" taken it
> + * will be the main loop AioContext.
> + *
> + * Note that the return value is never the main loop's iohandler_ctx and the
> + * return value is the main loop AioContext instead.
> + */
> +AioContext *qemu_get_current_aio_context(void);
> +
> +void qemu_set_current_aio_context(AioContext *ctx);
> +
> +/**
> + * aio_context_setup:
> + * @ctx: the aio context
> + * @errp: error pointer
> + *
> + * Initialize the aio context.
> + *
> + * Returns: true on success, false otherwise
> + */
> +bool aio_context_setup(AioContext *ctx, Error **errp);
> +
> +/**
> + * aio_context_destroy:
> + * @ctx: the aio context
> + *
> + * Destroy the aio context.
> + */
> +void aio_context_destroy(AioContext *ctx);
> +
> +/**
> + * aio_context_set_poll_params:
> + * @ctx: the aio context
> + * @max_ns: how long to busy poll for, in nanoseconds
> + * @grow: polling time growth factor
> + * @shrink: polling time shrink factor
> + *
> + * Poll mode can be disabled by setting @max_ns to 0.
> + */
> +void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
> +                                 int64_t grow, int64_t shrink,
> +                                 Error **errp);
> +
> +/**
> + * aio_context_set_aio_params:
> + * @ctx: the aio context
> + * @max_batch: maximum number of requests in a batch, 0 means that the
> + *             engine will use its default
> + */
> +void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch);
> +
> +/**
> + * aio_context_set_thread_pool_params:
> + * @ctx: the aio context
> + * @min: min number of threads to have readily available in the thread pool
> + * @max: max number of threads the thread pool can contain
> + */
> +void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
> +                                        int64_t max, Error **errp);
> +
> +#ifdef CONFIG_LINUX_IO_URING
> +/**
> + * aio_has_io_uring: Return whether io_uring is available.
> + *
> + * io_uring is either available in all AioContexts or in none, so this only
> + * needs to be called once from within any thread's AioContext.
> + */
> +static inline bool aio_has_io_uring(void)
> +{
> +    AioContext *ctx = qemu_get_current_aio_context();
> +    return ctx->fdmon_ops->add_sqe;
> +}
> +
> +/**
> + * aio_add_sqe: Add an io_uring sqe for submission.
> + * @prep_sqe: invoked with an sqe that should be prepared for submission
> + * @opaque: user-defined argument to @prep_sqe()
> + * @cqe_handler: the unique cqe handler associated with this request
> + *
> + * The caller's @prep_sqe() function is invoked to fill in the details of the
> + * sqe. Do not call io_uring_sqe_set_data() on this sqe.
> + *
> + * The sqe is submitted by the current AioContext. The kernel may see the sqe
> + * as soon as @prep_sqe() returns or it may take until the next event loop
> + * iteration.
> + *
> + * When the AioContext is destroyed, pending sqes are ignored and their
> + * CqeHandlers are not invoked.
> + *
> + * This function must be called only when aio_has_io_uring() returns true.
> + */
> +void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
> +                 void *opaque, CqeHandler *cqe_handler);
> +#endif /* CONFIG_LINUX_IO_URING */
> +
> +#endif
> diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
> index 0d55c636b21..8c1241a2c11 100644
> --- a/include/qemu/main-loop.h
> +++ b/include/qemu/main-loop.h
> @@ -25,7 +25,7 @@
>  #ifndef QEMU_MAIN_LOOP_H
>  #define QEMU_MAIN_LOOP_H
>
> -#include "block/aio.h"
> +#include "qemu/aio.h"
>  #include "qom/object.h"
>  #include "system/event-loop-base.h"
>
> @@ -431,7 +431,7 @@ void qemu_cond_timedwait_bql(QemuCond *cond, int ms);
>  #define qemu_bh_new(cb, opaque) \
>      qemu_bh_new_full((cb), (opaque), (stringify(cb)), NULL)
>  QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name,
> -                         MemReentrancyGuard *reentrancy_guard);
> +                         struct MemReentrancyGuard *reentrancy_guard);
>  void qemu_bh_schedule_idle(QEMUBH *bh);
>
>  enum {
> --

Should  block/aio.h include qemu/aio.h?  User can include them both
where needed.

Otherwise looks okay.
Reviewed-by: Prasad Pandit <pjp@fedoraproject.org>

Thank you.
---
  - Prasad

Re: [PATCH] block: split qemu/aio.h out of block/aio.h

Posted by Paolo Bonzini 2 months, 1 week ago

On 11/28/25 14:14, Prasad Pandit wrote:
> Should  block/aio.h include qemu/aio.h?  User can include them both
> where needed.

block/aio.h is quite useless without functions like aio_co_wake(), so it 
includes qemu/aio.h.

Paolo

> Otherwise looks okay.
> Reviewed-by: Prasad Pandit <pjp@fedoraproject.org>
> 
> Thank you.
> ---
>    - Prasad
>