File descriptor monitoring is O(1) with epoll(7), but
aio_dispatch_handlers() still scans all AioHandlers instead of
dispatching just those that are ready. This makes aio_poll() O(n) with
respect to the total number of registered handlers.
Add a local ready_list to aio_poll() so that each nested aio_poll()
builds a list of handlers ready to be dispatched. Since file descriptor
polling is level-triggered, nested aio_poll() calls also see fds that
were ready in the parent but not yet dispatched. This guarantees that
nested aio_poll() invocations will dispatch all fds, even those that
became ready before the nested invocation.
Since only handlers ready to be dispatched are placed onto the
ready_list, the new aio_dispatch_ready_handlers() function provides O(1)
dispatch.
Note that AioContext polling is still O(n) and currently cannot be fully
disabled. This still needs to be fixed before aio_poll() is fully O(1).
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
util/aio-posix.c | 106 +++++++++++++++++++++++++++++++++--------------
1 file changed, 76 insertions(+), 30 deletions(-)
diff --git a/util/aio-posix.c b/util/aio-posix.c
index 3a98a2acb9..dc33ca08a6 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -34,6 +34,7 @@ struct AioHandler
void *opaque;
bool is_external;
QLIST_ENTRY(AioHandler) node;
+ QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
QLIST_ENTRY(AioHandler) node_deleted;
};
@@ -104,7 +105,18 @@ static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
}
}
-static int aio_epoll(AioContext *ctx, int64_t timeout)
+/* Add a handler to a ready list */
+static void add_ready_handler(AioHandlerList *ready_list,
+ AioHandler *node,
+ int revents)
+{
+ QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
+ node->pfd.revents = revents;
+ QLIST_INSERT_HEAD(ready_list, node, node_ready);
+}
+
+static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
{
GPollFD pfd = {
.fd = ctx->epollfd,
@@ -129,11 +141,13 @@ static int aio_epoll(AioContext *ctx, int64_t timeout)
}
for (i = 0; i < ret; i++) {
int ev = events[i].events;
+ int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+ (ev & EPOLLOUT ? G_IO_OUT : 0) |
+ (ev & EPOLLHUP ? G_IO_HUP : 0) |
+ (ev & EPOLLERR ? G_IO_ERR : 0);
+
node = events[i].data.ptr;
- node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
- (ev & EPOLLOUT ? G_IO_OUT : 0) |
- (ev & EPOLLHUP ? G_IO_HUP : 0) |
- (ev & EPOLLERR ? G_IO_ERR : 0);
+ add_ready_handler(ready_list, node, revents);
}
}
out:
@@ -437,36 +451,63 @@ static void aio_free_deleted_handlers(AioContext *ctx)
qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
}
-static bool aio_dispatch_handlers(AioContext *ctx)
+static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
{
- AioHandler *node, *tmp;
bool progress = false;
+ int revents;
- QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
- int revents;
+ revents = node->pfd.revents & node->pfd.events;
+ node->pfd.revents = 0;
- revents = node->pfd.revents & node->pfd.events;
- node->pfd.revents = 0;
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
+ (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
+ aio_node_check(ctx, node->is_external) &&
+ node->io_read) {
+ node->io_read(node->opaque);
- if (!QLIST_IS_INSERTED(node, node_deleted) &&
- (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
- aio_node_check(ctx, node->is_external) &&
- node->io_read) {
- node->io_read(node->opaque);
-
- /* aio_notify() does not count as progress */
- if (node->opaque != &ctx->notifier) {
- progress = true;
- }
- }
- if (!QLIST_IS_INSERTED(node, node_deleted) &&
- (revents & (G_IO_OUT | G_IO_ERR)) &&
- aio_node_check(ctx, node->is_external) &&
- node->io_write) {
- node->io_write(node->opaque);
+ /* aio_notify() does not count as progress */
+ if (node->opaque != &ctx->notifier) {
progress = true;
}
}
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
+ (revents & (G_IO_OUT | G_IO_ERR)) &&
+ aio_node_check(ctx, node->is_external) &&
+ node->io_write) {
+ node->io_write(node->opaque);
+ progress = true;
+ }
+
+ return progress;
+}
+
+/*
+ * If we have a list of ready handlers then this is more efficient than
+ * scanning all handlers with aio_dispatch_handlers().
+ */
+static bool aio_dispatch_ready_handlers(AioContext *ctx,
+ AioHandlerList *ready_list)
+{
+ bool progress = false;
+ AioHandler *node;
+
+ while ((node = QLIST_FIRST(ready_list))) {
+ QLIST_SAFE_REMOVE(node, node_ready);
+ progress = aio_dispatch_handler(ctx, node) || progress;
+ }
+
+ return progress;
+}
+
+/* Slower than aio_dispatch_ready_handlers() but only used via glib */
+static bool aio_dispatch_handlers(AioContext *ctx)
+{
+ AioHandler *node, *tmp;
+ bool progress = false;
+
+ QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
+ progress = aio_dispatch_handler(ctx, node) || progress;
+ }
return progress;
}
@@ -628,6 +669,7 @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
bool aio_poll(AioContext *ctx, bool blocking)
{
+ AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
AioHandler *node;
int i;
int ret = 0;
@@ -678,7 +720,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
/* wait until next event */
if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
npfd = 0; /* pollfds[] is not being used */
- ret = aio_epoll(ctx, timeout);
+ ret = aio_epoll(ctx, &ready_list, timeout);
} else {
ret = qemu_poll_ns(pollfds, npfd, timeout);
}
@@ -733,7 +775,11 @@ bool aio_poll(AioContext *ctx, bool blocking)
/* if we have any readable fds, dispatch event */
if (ret > 0) {
for (i = 0; i < npfd; i++) {
- nodes[i]->pfd.revents = pollfds[i].revents;
+ int revents = pollfds[i].revents;
+
+ if (revents) {
+ add_ready_handler(&ready_list, nodes[i], revents);
+ }
}
}
@@ -742,7 +788,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
progress |= aio_bh_poll(ctx);
if (ret > 0) {
- progress |= aio_dispatch_handlers(ctx);
+ progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
}
aio_free_deleted_handlers(ctx);
--
2.24.1
On Fri, Feb 14, 2020 at 05:17:12PM +0000, Stefan Hajnoczi wrote: > File descriptor monitoring is O(1) with epoll(7), but > aio_dispatch_handlers() still scans all AioHandlers instead of > dispatching just those that are ready. This makes aio_poll() O(n) with > respect to the total number of registered handlers. > > Add a local ready_list to aio_poll() so that each nested aio_poll() > builds a list of handlers ready to be dispatched. Since file descriptor > polling is level-triggered, nested aio_poll() calls also see fds that > were ready in the parent but not yet dispatched. This guarantees that > nested aio_poll() invocations will dispatch all fds, even those that > became ready before the nested invocation. > > Since only handlers ready to be dispatched are placed onto the > ready_list, the new aio_dispatch_ready_handlers() function provides O(1) > dispatch. > > Note that AioContext polling is still O(n) and currently cannot be fully > disabled. This still needs to be fixed before aio_poll() is fully O(1). > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> > --- > util/aio-posix.c | 106 +++++++++++++++++++++++++++++++++-------------- > 1 file changed, 76 insertions(+), 30 deletions(-) Reviewed-by: Sergio Lopez <slp@redhat.com>
On 14/02/20 18:17, Stefan Hajnoczi wrote:
> + while ((node = QLIST_FIRST(ready_list))) {
> + QLIST_SAFE_REMOVE(node, node_ready);
Why does this need safe remove?
Paolo
> + progress = aio_dispatch_handler(ctx, node) || progress;
> + }
On Wed, Feb 19, 2020 at 12:13:40PM +0100, Paolo Bonzini wrote:
> On 14/02/20 18:17, Stefan Hajnoczi wrote:
> > + while ((node = QLIST_FIRST(ready_list))) {
> > + QLIST_SAFE_REMOVE(node, node_ready);
>
> Why does this need safe remove?
Yes, it's necessary. QLIST_SAFE_REMOVE() has two properties that make
it "safe":
1. It doesn't crash if the node is currently not on a list.
2. It clears the node's linked list pointers so that future linked
list operations (like QLIST_SAFE_REMOVE()) aren't accidentally
performed on stale pointers.
The node has a long lifespan and will be inserted into ready_lists
multiple times. We need to safely remove it from ready_list to protect
against a corruption the next time the node is inserted into a
ready_list again:
/* Add a handler to a ready list */
static void add_ready_handler(AioHandlerList *ready_list,
AioHandler *node,
int revents)
{
QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
^---- would cause corruption if node->node_ready was stale!
Would you like me to add a comment?
Stefan
On 21/02/20 13:59, Stefan Hajnoczi wrote:
> 1. It doesn't crash if the node is currently not on a list.
> 2. It clears the node's linked list pointers so that future linked
> list operations (like QLIST_SAFE_REMOVE()) aren't accidentally
> performed on stale pointers.
>
> The node has a long lifespan and will be inserted into ready_lists
> multiple times. We need to safely remove it from ready_list to protect
> against a corruption the next time the node is inserted into a
> ready_list again:
Ah, so the one I singled out is for (2) (we know the node is currently
on a list), while the one below is for (1). Would it make sense to move
(2) to Q*_REMOVE_*? We can do it separately after this pull request.
> /* Add a handler to a ready list */
> static void add_ready_handler(AioHandlerList *ready_list,
> AioHandler *node,
> int revents)
> {
> QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
> ^---- would cause corruption if node->node_ready was stale!
>
> Would you like me to add a comment?
No, it's okay.
Paolo
On Fri, Feb 21, 2020 at 02:06:26PM +0100, Paolo Bonzini wrote: > On 21/02/20 13:59, Stefan Hajnoczi wrote: > > 1. It doesn't crash if the node is currently not on a list. > > 2. It clears the node's linked list pointers so that future linked > > list operations (like QLIST_SAFE_REMOVE()) aren't accidentally > > performed on stale pointers. > > > > The node has a long lifespan and will be inserted into ready_lists > > multiple times. We need to safely remove it from ready_list to protect > > against a corruption the next time the node is inserted into a > > ready_list again: > > Ah, so the one I singled out is for (2) (we know the node is currently > on a list), while the one below is for (1). Would it make sense to move > (2) to Q*_REMOVE_*? We can do it separately after this pull request. Extending all Q*_REMOVE*() macros to clear the linked list pointers is nice. I'll send a follow-up patch. Stefan
On Fri, Feb 21, 2020 at 02:06:26PM +0100, Paolo Bonzini wrote:
> On 21/02/20 13:59, Stefan Hajnoczi wrote:
> > /* Add a handler to a ready list */
> > static void add_ready_handler(AioHandlerList *ready_list,
> > AioHandler *node,
> > int revents)
> > {
> > QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
> > ^---- would cause corruption if node->node_ready was stale!
> >
> > Would you like me to add a comment?
> No, it's okay.
Are you happy with this series?
Shall I include it in my next pull request or do you want to merge it?
Thanks,
Stefan
On 21/02/20 15:47, Stefan Hajnoczi wrote: >>> QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */ >>> ^---- would cause corruption if node->node_ready was stale! >>> >>> Would you like me to add a comment? >> No, it's okay. > Are you happy with this series? Yes. Let's keep the Q*_REMOVE cleanup on the todo list. I'd keep Q*_SAFE_REMOVE, but clear the pointer unconditionally in Q*_REMOVE so that we can have something like Q*_IN_LIST too. > Shall I include it in my next pull request or do you want to merge it? No, it's yours. Paolo
On Fri, Feb 21, 2020 at 04:04:10PM +0100, Paolo Bonzini wrote: > On 21/02/20 15:47, Stefan Hajnoczi wrote: > >>> QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */ > >>> ^---- would cause corruption if node->node_ready was stale! > >>> > >>> Would you like me to add a comment? > >> No, it's okay. > > Are you happy with this series? > > Yes. Let's keep the Q*_REMOVE cleanup on the todo list. I'd keep > Q*_SAFE_REMOVE, but clear the pointer unconditionally in Q*_REMOVE so > that we can have something like Q*_IN_LIST too. QLIST_IS_INSERTED() is part of this patch series, although I can rename it to Q*_IN_LIST() and cover all linked list variants. :) > > Shall I include it in my next pull request or do you want to merge it? > > No, it's yours. Thanks! Stefan
On 21/02/20 16:29, Stefan Hajnoczi wrote: >> Yes. Let's keep the Q*_REMOVE cleanup on the todo list. I'd keep >> Q*_SAFE_REMOVE, but clear the pointer unconditionally in Q*_REMOVE so >> that we can have something like Q*_IN_LIST too. > QLIST_IS_INSERTED() is part of this patch series, although I can rename > it to Q*_IN_LIST() and cover all linked list variants. :) Yes I meant having it for all variants. I remembered you adding it but not the exact spelling! Paolo
© 2016 - 2026 Red Hat, Inc.