The following changes since commit 67f17e23baca5dd545fe98b01169cc351a70fe35:

  Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging (2020-03-06 17:15:36 +0000)

are available in the Git repository at:

  https://github.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to d37d0e365afb6825a90d8356fc6adcc1f58f40f3:

  aio-posix: remove idle poll handlers to improve scalability (2020-03-09 16:45:16 +0000)

----------------------------------------------------------------
Pull request

----------------------------------------------------------------

Stefan Hajnoczi (9):
  qemu/queue.h: clear linked list pointers on remove
  aio-posix: remove confusing QLIST_SAFE_REMOVE()
  aio-posix: completely stop polling when disabled
  aio-posix: move RCU_READ_LOCK() into run_poll_handlers()
  aio-posix: extract ppoll(2) and epoll(7) fd monitoring
  aio-posix: simplify FDMonOps->update() prototype
  aio-posix: add io_uring fd monitoring implementation
  aio-posix: support userspace polling of fd monitoring
  aio-posix: remove idle poll handlers to improve scalability

 MAINTAINERS           |   2 +
 configure             |   5 +
 include/block/aio.h   |  71 ++++++-
 include/qemu/queue.h  |  19 +-
 util/Makefile.objs    |   3 +
 util/aio-posix.c      | 451 ++++++++++++++----------------------------
 util/aio-posix.h      |  81 ++++++++
 util/fdmon-epoll.c    | 155 +++++++++++++++
 util/fdmon-io_uring.c | 332 +++++++++++++++++++++++++++++++
 util/fdmon-poll.c     | 107 ++++++++++
 util/trace-events     |   2 +
 11 files changed, 915 insertions(+), 313 deletions(-)
 create mode 100644 util/aio-posix.h
 create mode 100644 util/fdmon-epoll.c
 create mode 100644 util/fdmon-io_uring.c
 create mode 100644 util/fdmon-poll.c

--
2.24.1

qemu/queue.h: clear linked list pointers on remove

Do not leave stale linked list pointers around after removal.  It's
safer to set them to NULL so that use-after-removal results in an
immediate segfault.

The RCU queue removal macros are unchanged since nodes may still be
traversed after removal.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200224103406.1894923-2-stefanha@redhat.com
Message-Id: <20200224103406.1894923-2-stefanha@redhat.com>
---
 include/qemu/queue.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/include/qemu/queue.h b/include/qemu/queue.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/queue.h
+++ b/include/qemu/queue.h
@@ -XXX,XX +XXX,XX @@ struct { \
         (elm)->field.le_next->field.le_prev = \
             (elm)->field.le_prev; \
     *(elm)->field.le_prev = (elm)->field.le_next; \
+    (elm)->field.le_next = NULL; \
+    (elm)->field.le_prev = NULL; \
 } while (/*CONSTCOND*/0)
 
 /*
@@ -XXX,XX +XXX,XX @@ struct { \
 } while (/*CONSTCOND*/0)
 
 #define QSLIST_REMOVE_HEAD(head, field) do { \
-    (head)->slh_first = (head)->slh_first->field.sle_next; \
+    typeof((head)->slh_first) elm = (head)->slh_first; \
+    (head)->slh_first = elm->field.sle_next; \
+    elm->field.sle_next = NULL; \
 } while (/*CONSTCOND*/0)
 
 #define QSLIST_REMOVE_AFTER(slistelm, field) do { \
-    (slistelm)->field.sle_next = \
-        QSLIST_NEXT(QSLIST_NEXT((slistelm), field), field); \
+    typeof(slistelm) next = (slistelm)->field.sle_next; \
+    (slistelm)->field.sle_next = next->field.sle_next; \
+    next->field.sle_next = NULL; \
 } while (/*CONSTCOND*/0)
 
 #define QSLIST_REMOVE(head, elm, type, field) do { \
@@ -XXX,XX +XXX,XX @@ struct { \
         while (curelm->field.sle_next != (elm)) \
             curelm = curelm->field.sle_next; \
         curelm->field.sle_next = curelm->field.sle_next->field.sle_next; \
+        (elm)->field.sle_next = NULL; \
     } \
 } while (/*CONSTCOND*/0)
 
@@ -XXX,XX +XXX,XX @@ struct { \
 } while (/*CONSTCOND*/0)
 
 #define QSIMPLEQ_REMOVE_HEAD(head, field) do { \
-    if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL)\
+    typeof((head)->sqh_first) elm = (head)->sqh_first; \
+    if (((head)->sqh_first = elm->field.sqe_next) == NULL) \
         (head)->sqh_last = &(head)->sqh_first; \
+    elm->field.sqe_next = NULL; \
 } while (/*CONSTCOND*/0)
 
 #define QSIMPLEQ_SPLIT_AFTER(head, elm, field, removed) do { \
@@ -XXX,XX +XXX,XX @@ struct { \
         if ((curelm->field.sqe_next = \
              curelm->field.sqe_next->field.sqe_next) == NULL) \
             (head)->sqh_last = &(curelm)->field.sqe_next; \
+        (elm)->field.sqe_next = NULL; \
     } \
 } while (/*CONSTCOND*/0)
 
@@ -XXX,XX +XXX,XX @@ union { \
     (head)->tqh_circ.tql_prev = (elm)->field.tqe_circ.tql_prev; \
     (elm)->field.tqe_circ.tql_prev->tql_next = (elm)->field.tqe_next; \
     (elm)->field.tqe_circ.tql_prev = NULL; \
+    (elm)->field.tqe_circ.tql_next = NULL; \
+    (elm)->field.tqe_next = NULL; \
 } while (/*CONSTCOND*/0)
 
 /* remove @left, @right and all elements in between from @head */
--
2.24.1

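As a standalone illustration of the behaviour this patch is after, here is a
minimal self-contained C sketch; the Node type and list helpers below are
invented stand-ins for the queue.h macros, not QEMU code.  After
list_remove(), a stale iterator dereferences NULL and faults immediately
instead of silently walking unlinked memory:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

typedef struct Node {
    int value;
    struct Node *le_next;
    struct Node **le_prev; /* address of the previous next (or head) pointer */
} Node;

static void list_insert_head(Node **head, Node *elm)
{
    elm->le_next = *head;
    if (*head) {
        (*head)->le_prev = &elm->le_next;
    }
    *head = elm;
    elm->le_prev = head;
}

static void list_remove(Node *elm)
{
    if (elm->le_next) {
        elm->le_next->le_prev = elm->le_prev;
    }
    *elm->le_prev = elm->le_next;
    elm->le_next = NULL; /* like the patched QLIST_REMOVE: poison the links */
    elm->le_prev = NULL;
}

int main(void)
{
    Node a = { .value = 1 }, b = { .value = 2 };
    Node *head = NULL;

    list_insert_head(&head, &b);
    list_insert_head(&head, &a);
    list_remove(&a);

    /* A buggy caller that still follows a's links now sees NULL... */
    assert(a.le_next == NULL && a.le_prev == NULL);
    /* ...so a use-after-removal such as a.le_next->value faults right away. */
    printf("remaining head: %d\n", head->value); /* prints 2 */
    return 0;
}
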
aio-posix: remove confusing QLIST_SAFE_REMOVE()

QLIST_SAFE_REMOVE() is confusing here because the node must be on the
list.  We actually just wanted to clear the linked list pointers when
removing it from the list.  QLIST_REMOVE() now does this, so switch to
it.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200224103406.1894923-3-stefanha@redhat.com
Message-Id: <20200224103406.1894923-3-stefanha@redhat.com>
---
 util/aio-posix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
     AioHandler *node;
 
     while ((node = QLIST_FIRST(ready_list))) {
-        QLIST_SAFE_REMOVE(node, node_ready);
+        QLIST_REMOVE(node, node_ready);
         progress = aio_dispatch_handler(ctx, node) || progress;
     }
 
--
2.24.1

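The distinction the commit relies on can be sketched with hypothetical
stand-ins for the two macros (not the qemu/queue.h implementations): the
"safe" variant tolerates a node that is not on any list, while the plain
variant assumes membership.  In aio_dispatch_ready_handlers() the node is
always on ready_list, so the membership check only obscured the intent:

#include <stddef.h>

typedef struct Elem {
    struct Elem *next;
    struct Elem **prev;  /* NULL means "not on any list" */
} Elem;

static void elem_remove(Elem *e)        /* QLIST_REMOVE analogue */
{
    if (e->next) {
        e->next->prev = e->prev;
    }
    *e->prev = e->next;                 /* faults if e is not inserted */
    e->next = NULL;                     /* clear the links, as in patch 1 */
    e->prev = NULL;
}

static void elem_safe_remove(Elem *e)   /* QLIST_SAFE_REMOVE analogue */
{
    if (e->prev) {                      /* extra membership check */
        elem_remove(e);
    }
}
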
aio-posix: completely stop polling when disabled

One iteration of polling is always performed even when polling is
disabled.  This is done because:
1. Userspace polling is cheaper than making a syscall.  We might get
   lucky.
2. We must poll once more after polling has stopped in case an event
   occurred while stopping polling.

However, there are downsides:
1. Polling becomes a bottleneck when the number of event sources is very
   high.  It's more efficient to monitor fds in that case.
2. A high-frequency polling event source can starve non-polling event
   sources because ppoll(2)/epoll(7) is never invoked.

This patch removes the forced polling iteration so that poll_ns=0 really
means no polling.

IOPS increases from 10k to 60k when the guest has 100
virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1
device because the large number of event sources being polled slows down
the event loop.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-2-stefanha@redhat.com
Message-Id: <20200305170806.1313245-2-stefanha@redhat.com>
---
 util/aio-posix.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ void aio_set_event_notifier_poll(AioContext *ctx,
                 (IOHandler *)io_poll_end);
 }
 
-static void poll_set_started(AioContext *ctx, bool started)
+static bool poll_set_started(AioContext *ctx, bool started)
 {
     AioHandler *node;
+    bool progress = false;
 
     if (started == ctx->poll_started) {
-        return;
+        return false;
     }
 
     ctx->poll_started = started;
@@ -XXX,XX +XXX,XX @@ static void poll_set_started(AioContext *ctx, bool started)
         if (fn) {
             fn(node->opaque);
         }
+
+        /* Poll one last time in case ->io_poll_end() raced with the event */
+        if (!started) {
+            progress = node->io_poll(node->opaque) || progress;
+        }
     }
     qemu_lockcnt_dec(&ctx->list_lock);
+
+    return progress;
 }
 
 
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
         }
     }
 
-    poll_set_started(ctx, false);
+    if (poll_set_started(ctx, false)) {
+        *timeout = 0;
+        return true;
+    }
 
-    /* Even if we don't run busy polling, try polling once in case it can make
-     * progress and the caller will be able to avoid ppoll(2)/epoll_wait(2).
-     */
-    return run_poll_handlers_once(ctx, timeout);
+    return false;
 }
 
 bool aio_poll(AioContext *ctx, bool blocking)
--
2.24.1

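The race closed by the final ->io_poll() call can be shown with a
self-contained sketch; FakeDevice and both functions are invented for
illustration and mirror only the shape of poll_set_started():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
    atomic_bool event_pending;  /* set by the device / another thread */
} FakeDevice;

static bool io_poll(FakeDevice *dev)    /* ->io_poll() analogue */
{
    return atomic_exchange(&dev->event_pending, false);
}

/*
 * Analogue of poll_set_started(ctx, false): after ->io_poll_end() re-enables
 * fd notifications, an event that arrived while notifications were still off
 * would otherwise sit unnoticed until an unrelated wakeup.  Polling one last
 * time catches it, which is why the function now reports progress.
 */
static bool stop_polling(FakeDevice *dev)
{
    /* ... ->io_poll_end() would re-arm the fd/eventfd notifier here ... */
    return io_poll(dev);  /* the "poll one last time" from the patch */
}

int main(void)
{
    FakeDevice dev = { .event_pending = true };  /* event raced with stop */

    printf("progress=%d\n", stop_polling(&dev)); /* prints progress=1 */
    return 0;
}
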
aio-posix: move RCU_READ_LOCK() into run_poll_handlers()

Now that run_poll_handlers_once() is only called by run_poll_handlers()
we can improve the CPU time profile by moving the expensive
RCU_READ_LOCK() out of the polling loop.

This reduces the run_poll_handlers() from 40% CPU to 10% CPU in perf's
sampling profiler output.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-3-stefanha@redhat.com
Message-Id: <20200305170806.1313245-3-stefanha@redhat.com>
---
 util/aio-posix.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
     bool progress = false;
     AioHandler *node;
 
-    /*
-     * Optimization: ->io_poll() handlers often contain RCU read critical
-     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
-     * -> rcu_read_lock() -> ... sequences with expensive memory
-     * synchronization primitives.  Make the entire polling loop an RCU
-     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
-     * are cheap.
-     */
-    RCU_READ_LOCK_GUARD();
-
     QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
         if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
             aio_node_check(ctx, node->is_external) &&
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
 
     trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
 
+    /*
+     * Optimization: ->io_poll() handlers often contain RCU read critical
+     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
+     * -> rcu_read_lock() -> ... sequences with expensive memory
+     * synchronization primitives.  Make the entire polling loop an RCU
+     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
+     * are cheap.
+     */
+    RCU_READ_LOCK_GUARD();
+
     start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     do {
         progress = run_poll_handlers_once(ctx, timeout);
--
2.24.1

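The shape of this change can be sketched outside QEMU; a pthread rwlock
stands in for the RCU read lock here (the real RCU primitives are cheaper,
but the hoisting idea is the same):

#include <pthread.h>
#include <stdbool.h>

typedef bool (*PollFn)(void *opaque);

static pthread_rwlock_t handler_list_lock = PTHREAD_RWLOCK_INITIALIZER;

/* One pass over all handlers; previously each pass took the read lock. */
static bool poll_handlers_once(PollFn *fns, void **opaque, unsigned n)
{
    bool progress = false;

    for (unsigned i = 0; i < n; i++) {
        progress |= fns[i](opaque[i]);
    }
    return progress;
}

/* Analogous to run_poll_handlers() after the patch: acquire the read lock
 * once around the busy-wait loop instead of once per iteration. */
static bool run_poll_loop(PollFn *fns, void **opaque, unsigned n,
                          unsigned max_iterations)
{
    bool progress = false;

    pthread_rwlock_rdlock(&handler_list_lock);  /* hoisted out of the loop */
    for (unsigned i = 0; i < max_iterations && !progress; i++) {
        progress = poll_handlers_once(fns, opaque, n);
    }
    pthread_rwlock_unlock(&handler_list_lock);
    return progress;
}
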
aio-posix: extract ppoll(2) and epoll(7) fd monitoring

The ppoll(2) and epoll(7) file descriptor monitoring implementations are
mixed with the core util/aio-posix.c code.  Before adding another
implementation for Linux io_uring, extract out the existing
ones so there is a clear interface and the core code is simpler.

The new interface is AioContext->fdmon_ops, a pointer to a FDMonOps
struct.  See the patch for details.

Semantic changes:
1. ppoll(2) now reflects events from pollfds[] back into AioHandlers
   while we're still on the clock for adaptive polling.  This was
   already happening for epoll(7), so if it's really an issue then we'll
   need to fix both in the future.
2. epoll(7)'s fallback to ppoll(2) while external events are disabled
   was broken when the number of fds exceeded the epoll(7) upgrade
   threshold.  I guess this code path simply wasn't tested and no one
   noticed the bug.  I didn't go out of my way to fix it but the correct
   code is simpler than preserving the bug.

I also took some liberties in removing the unnecessary
AioContext->epoll_available (just check AioContext->epollfd != -1
instead) and AioContext->epoll_enabled (it's implicit if our
AioContext->fdmon_ops callbacks are being invoked) fields.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-4-stefanha@redhat.com
Message-Id: <20200305170806.1313245-4-stefanha@redhat.com>
---
 MAINTAINERS         |   2 +
 include/block/aio.h |  36 +++++-
 util/Makefile.objs  |   2 +
 util/aio-posix.c    | 286 ++------------------------------------------
 util/aio-posix.h    |  61 ++++++++++
 util/fdmon-epoll.c  | 151 +++++++++++++++++
 util/fdmon-poll.c   | 104 ++++++++++++++++
 7 files changed, 366 insertions(+), 276 deletions(-)
 create mode 100644 util/aio-posix.h
 create mode 100644 util/fdmon-epoll.c
 create mode 100644 util/fdmon-poll.c

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org
 S: Supported
 F: util/async.c
 F: util/aio-*.c
+F: util/aio-*.h
+F: util/fdmon-*.c
 F: block/io.c
 F: migration/block*
 F: include/block/aio.h
diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ struct ThreadPool;
 struct LinuxAioState;
 struct LuringState;
 
+/* Callbacks for file descriptor monitoring implementations */
+typedef struct {
+    /*
+     * update:
+     * @ctx: the AioContext
+     * @node: the handler
+     * @is_new: is the file descriptor already being monitored?
+     *
+     * Add/remove/modify a monitored file descriptor.  There are three cases:
+     * 1. node->pfd.events == 0 means remove the file descriptor.
+     * 2. !is_new means modify an already monitored file descriptor.
+     * 3. is_new means add a new file descriptor.
+     *
+     * Called with ctx->list_lock acquired.
+     */
+    void (*update)(AioContext *ctx, AioHandler *node, bool is_new);
+
+    /*
+     * wait:
+     * @ctx: the AioContext
+     * @ready_list: list for handlers that become ready
+     * @timeout: maximum duration to wait, in nanoseconds
+     *
+     * Wait for file descriptors to become ready and place them on ready_list.
+     *
+     * Called with ctx->list_lock incremented but not locked.
+     *
+     * Returns: number of ready file descriptors.
+     */
+    int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
+} FDMonOps;
+
 /*
  * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
  * scheduled BHs are not processed until the next aio_bh_poll() call.  All
@@ -XXX,XX +XXX,XX @@ struct AioContext {
 
     /* epoll(7) state used when built with CONFIG_EPOLL */
     int epollfd;
-    bool epoll_enabled;
-    bool epoll_available;
+
+    const FDMonOps *fdmon_ops;
 };
 
 /**
diff --git a/util/Makefile.objs b/util/Makefile.objs
index XXXXXXX..XXXXXXX 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -XXX,XX +XXX,XX @@ util-obj-y += aiocb.o async.o aio-wait.o thread-pool.o qemu-timer.o
 util-obj-y += main-loop.o
 util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
 util-obj-$(CONFIG_POSIX) += aio-posix.o
+util-obj-$(CONFIG_POSIX) += fdmon-poll.o
+util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/sockets.h"
 #include "qemu/cutils.h"
 #include "trace.h"
-#ifdef CONFIG_EPOLL_CREATE1
-#include <sys/epoll.h>
-#endif
+#include "aio-posix.h"
 
-struct AioHandler
-{
-    GPollFD pfd;
-    IOHandler *io_read;
-    IOHandler *io_write;
-    AioPollFn *io_poll;
-    IOHandler *io_poll_begin;
-    IOHandler *io_poll_end;
-    void *opaque;
-    bool is_external;
-    QLIST_ENTRY(AioHandler) node;
-    QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
-    QLIST_ENTRY(AioHandler) node_deleted;
-};
-
-/* Add a handler to a ready list */
-static void add_ready_handler(AioHandlerList *ready_list,
-                              AioHandler *node,
-                              int revents)
+void aio_add_ready_handler(AioHandlerList *ready_list,
+                           AioHandler *node,
+                           int revents)
 {
     QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
     node->pfd.revents = revents;
     QLIST_INSERT_HEAD(ready_list, node, node_ready);
 }
 
-#ifdef CONFIG_EPOLL_CREATE1
-
-/* The fd number threshold to switch to epoll */
-#define EPOLL_ENABLE_THRESHOLD 64
-
-static void aio_epoll_disable(AioContext *ctx)
-{
-    ctx->epoll_enabled = false;
-    if (!ctx->epoll_available) {
-        return;
-    }
-    ctx->epoll_available = false;
-    close(ctx->epollfd);
-}
-
-static inline int epoll_events_from_pfd(int pfd_events)
-{
-    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
-           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
-           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
-           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
-}
-
-static bool aio_epoll_try_enable(AioContext *ctx)
-{
-    AioHandler *node;
-    struct epoll_event event;
-
-    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
-        int r;
-        if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
-            continue;
-        }
-        event.events = epoll_events_from_pfd(node->pfd.events);
-        event.data.ptr = node;
-        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
-        if (r) {
-            return false;
-        }
-    }
-    ctx->epoll_enabled = true;
-    return true;
-}
-
-static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
-{
-    struct epoll_event event;
-    int r;
-    int ctl;
-
-    if (!ctx->epoll_enabled) {
-        return;
-    }
-    if (!node->pfd.events) {
-        ctl = EPOLL_CTL_DEL;
-    } else {
-        event.data.ptr = node;
-        event.events = epoll_events_from_pfd(node->pfd.events);
-        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
-    }
-
-    r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
-    if (r) {
-        aio_epoll_disable(ctx);
-    }
-}
-
-static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
-                     int64_t timeout)
-{
-    GPollFD pfd = {
-        .fd = ctx->epollfd,
-        .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
-    };
-    AioHandler *node;
-    int i, ret = 0;
-    struct epoll_event events[128];
-
-    if (timeout > 0) {
-        ret = qemu_poll_ns(&pfd, 1, timeout);
-        if (ret > 0) {
-            timeout = 0;
-        }
-    }
-    if (timeout <= 0 || ret > 0) {
-        ret = epoll_wait(ctx->epollfd, events,
-                         ARRAY_SIZE(events),
-                         timeout);
-        if (ret <= 0) {
-            goto out;
-        }
-        for (i = 0; i < ret; i++) {
-            int ev = events[i].events;
-            int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
-                          (ev & EPOLLOUT ? G_IO_OUT : 0) |
-                          (ev & EPOLLHUP ? G_IO_HUP : 0) |
-                          (ev & EPOLLERR ? G_IO_ERR : 0);
-
-            node = events[i].data.ptr;
-            add_ready_handler(ready_list, node, revents);
-        }
-    }
-out:
-    return ret;
-}
-
-static bool aio_epoll_enabled(AioContext *ctx)
-{
-    /* Fall back to ppoll when external clients are disabled. */
-    return !aio_external_disabled(ctx) && ctx->epoll_enabled;
-}
-
-static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
-                                 unsigned npfd, int64_t timeout)
-{
-    if (!ctx->epoll_available) {
-        return false;
-    }
-    if (aio_epoll_enabled(ctx)) {
-        return true;
-    }
-    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
-        if (aio_epoll_try_enable(ctx)) {
-            return true;
-        } else {
-            aio_epoll_disable(ctx);
-        }
-    }
-    return false;
-}
-
-#else
-
-static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
-{
-}
-
-static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
-                     int64_t timeout)
-{
-    assert(false);
-}
-
-static bool aio_epoll_enabled(AioContext *ctx)
-{
-    return false;
-}
-
-static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
-                                 unsigned npfd, int64_t timeout)
-{
-    return false;
-}
-
-#endif
-
 static AioHandler *find_aio_handler(AioContext *ctx, int fd)
 {
     AioHandler *node;
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
                atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
 
     if (new_node) {
-        aio_epoll_update(ctx, new_node, is_new);
+        ctx->fdmon_ops->update(ctx, new_node, is_new);
     } else if (node) {
         /* Unregister deleted fd_handler */
-        aio_epoll_update(ctx, node, false);
+        ctx->fdmon_ops->update(ctx, node, false);
     }
     qemu_lockcnt_unlock(&ctx->list_lock);
     aio_notify(ctx);
@@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx)
     timerlistgroup_run_timers(&ctx->tlg);
 }
 
-/* These thread-local variables are used only in a small part of aio_poll
- * around the call to the poll() system call.  In particular they are not
- * used while aio_poll is performing callbacks, which makes it much easier
- * to think about reentrancy!
- *
- * Stack-allocated arrays would be perfect but they have size limitations;
- * heap allocation is expensive enough that we want to reuse arrays across
- * calls to aio_poll().  And because poll() has to be called without holding
- * any lock, the arrays cannot be stored in AioContext.  Thread-local data
- * has none of the disadvantages of these three options.
- */
-static __thread GPollFD *pollfds;
-static __thread AioHandler **nodes;
-static __thread unsigned npfd, nalloc;
-static __thread Notifier pollfds_cleanup_notifier;
-
-static void pollfds_cleanup(Notifier *n, void *unused)
-{
-    g_assert(npfd == 0);
-    g_free(pollfds);
-    g_free(nodes);
-    nalloc = 0;
-}
-
-static void add_pollfd(AioHandler *node)
-{
-    if (npfd == nalloc) {
-        if (nalloc == 0) {
-            pollfds_cleanup_notifier.notify = pollfds_cleanup;
-            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
-            nalloc = 8;
-        } else {
-            g_assert(nalloc <= INT_MAX);
-            nalloc *= 2;
-        }
-        pollfds = g_renew(GPollFD, pollfds, nalloc);
-        nodes = g_renew(AioHandler *, nodes, nalloc);
-    }
-    nodes[npfd] = node;
-    pollfds[npfd] = (GPollFD) {
-        .fd = node->pfd.fd,
-        .events = node->pfd.events,
-    };
-    npfd++;
-}
-
 static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
 {
     bool progress = false;
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
 bool aio_poll(AioContext *ctx, bool blocking)
 {
     AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
-    AioHandler *node;
-    int i;
     int ret = 0;
     bool progress;
     int64_t timeout;
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
      * system call---a single round of run_poll_handlers_once suffices.
      */
     if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
-        assert(npfd == 0);
-
-        /* fill pollfds */
-
-        if (!aio_epoll_enabled(ctx)) {
-            QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
-                if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
-                    && aio_node_check(ctx, node->is_external)) {
-                    add_pollfd(node);
-                }
-            }
-        }
-
-        /* wait until next event */
-        if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
-            npfd = 0; /* pollfds[] is not being used */
-            ret = aio_epoll(ctx, &ready_list, timeout);
-        } else {
-            ret = qemu_poll_ns(pollfds, npfd, timeout);
-        }
+        ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
     }
 
     if (blocking) {
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         }
     }
 
-    /* if we have any readable fds, dispatch event */
-    if (ret > 0) {
-        for (i = 0; i < npfd; i++) {
-            int revents = pollfds[i].revents;
-
-            if (revents) {
-                add_ready_handler(&ready_list, nodes[i], revents);
-            }
-        }
-    }
-
-    npfd = 0;
-
     progress |= aio_bh_poll(ctx);
 
     if (ret > 0) {
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
 
 void aio_context_setup(AioContext *ctx)
 {
-#ifdef CONFIG_EPOLL_CREATE1
-    assert(!ctx->epollfd);
-    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
-    if (ctx->epollfd == -1) {
-        fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
-        ctx->epoll_available = false;
-    } else {
-        ctx->epoll_available = true;
-    }
-#endif
+    ctx->fdmon_ops = &fdmon_poll_ops;
+    ctx->epollfd = -1;
+
+    fdmon_epoll_setup(ctx);
 }
 
 void aio_context_destroy(AioContext *ctx)
 {
-#ifdef CONFIG_EPOLL_CREATE1
-    aio_epoll_disable(ctx);
-#endif
+    fdmon_epoll_disable(ctx);
 }
 
 void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
diff --git a/util/aio-posix.h b/util/aio-posix.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/util/aio-posix.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * AioContext POSIX event loop implementation internal APIs
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright Red Hat, Inc. 2020
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef AIO_POSIX_H
+#define AIO_POSIX_H
+
+#include "block/aio.h"
+
+struct AioHandler {
+    GPollFD pfd;
+    IOHandler *io_read;
+    IOHandler *io_write;
+    AioPollFn *io_poll;
+    IOHandler *io_poll_begin;
+    IOHandler *io_poll_end;
+    void *opaque;
+    bool is_external;
+    QLIST_ENTRY(AioHandler) node;
+    QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
+    QLIST_ENTRY(AioHandler) node_deleted;
+};
+
+/* Add a handler to a ready list */
+void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node,
+                           int revents);
+
+extern const FDMonOps fdmon_poll_ops;
+
+#ifdef CONFIG_EPOLL_CREATE1
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd);
+void fdmon_epoll_setup(AioContext *ctx);
+void fdmon_epoll_disable(AioContext *ctx);
+#else
+static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
+{
+    return false;
+}
+
+static inline void fdmon_epoll_setup(AioContext *ctx)
+{
+}
+
+static inline void fdmon_epoll_disable(AioContext *ctx)
+{
+}
+#endif /* !CONFIG_EPOLL_CREATE1 */
+
+#endif /* AIO_POSIX_H */
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/util/fdmon-epoll.c
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * epoll(7) file descriptor monitoring
+ */
+
+#include "qemu/osdep.h"
+#include <sys/epoll.h>
+#include "qemu/rcu_queue.h"
+#include "aio-posix.h"
+
+/* The fd number threshold to switch to epoll */
+#define EPOLL_ENABLE_THRESHOLD 64
+
+void fdmon_epoll_disable(AioContext *ctx)
+{
+    if (ctx->epollfd >= 0) {
+        close(ctx->epollfd);
+        ctx->epollfd = -1;
+    }
+
+    /* Switch back */
+    ctx->fdmon_ops = &fdmon_poll_ops;
+}
+
+static inline int epoll_events_from_pfd(int pfd_events)
+{
+    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
+           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
+           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
+           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
+}
+
+static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+    struct epoll_event event;
+    int r;
+    int ctl;
+
+    if (!node->pfd.events) {
+        ctl = EPOLL_CTL_DEL;
+    } else {
+        event.data.ptr = node;
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
+    }
+
+    r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
+    if (r) {
+        fdmon_epoll_disable(ctx);
+    }
+}
+
+static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list,
+                            int64_t timeout)
+{
+    GPollFD pfd = {
+        .fd = ctx->epollfd,
+        .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
+    };
+    AioHandler *node;
+    int i, ret = 0;
+    struct epoll_event events[128];
+
+    /* Fall back while external clients are disabled */
+    if (atomic_read(&ctx->external_disable_cnt)) {
+        return fdmon_poll_ops.wait(ctx, ready_list, timeout);
+    }
+
+    if (timeout > 0) {
+        ret = qemu_poll_ns(&pfd, 1, timeout);
+        if (ret > 0) {
+            timeout = 0;
+        }
+    }
+    if (timeout <= 0 || ret > 0) {
+        ret = epoll_wait(ctx->epollfd, events,
+                         ARRAY_SIZE(events),
+                         timeout);
+        if (ret <= 0) {
+            goto out;
+        }
+        for (i = 0; i < ret; i++) {
+            int ev = events[i].events;
+            int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+                          (ev & EPOLLOUT ? G_IO_OUT : 0) |
+                          (ev & EPOLLHUP ? G_IO_HUP : 0) |
+                          (ev & EPOLLERR ? G_IO_ERR : 0);
+
+            node = events[i].data.ptr;
+            aio_add_ready_handler(ready_list, node, revents);
+        }
+    }
+out:
+    return ret;
+}
+
+static const FDMonOps fdmon_epoll_ops = {
+    .update = fdmon_epoll_update,
+    .wait = fdmon_epoll_wait,
+};
+
+static bool fdmon_epoll_try_enable(AioContext *ctx)
+{
+    AioHandler *node;
+    struct epoll_event event;
+
+    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+        int r;
+        if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
+            continue;
+        }
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        event.data.ptr = node;
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+        if (r) {
+            return false;
+        }
+    }
+
+    ctx->fdmon_ops = &fdmon_epoll_ops;
+    return true;
+}
+
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
+{
+    if (ctx->epollfd < 0) {
+        return false;
+    }
+
+    /* Do not upgrade while external clients are disabled */
+    if (atomic_read(&ctx->external_disable_cnt)) {
+        return false;
+    }
+
+    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
+        if (fdmon_epoll_try_enable(ctx)) {
+            return true;
+        } else {
+            fdmon_epoll_disable(ctx);
+        }
+    }
+    return false;
+}
+
+void fdmon_epoll_setup(AioContext *ctx)
+{
+    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+    if (ctx->epollfd == -1) {
+        fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
+    }
+}
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/util/fdmon-poll.c
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * poll(2) file descriptor monitoring
+ *
+ * Uses ppoll(2) when available, g_poll() otherwise.
+ */
+
+#include "qemu/osdep.h"
+#include "aio-posix.h"
+#include "qemu/rcu_queue.h"
+
+/*
+ * These thread-local variables are used only in fdmon_poll_wait() around the
+ * call to the poll() system call.  In particular they are not used while
+ * aio_poll is performing callbacks, which makes it much easier to think about
+ * reentrancy!
+ *
+ * Stack-allocated arrays would be perfect but they have size limitations;
+ * heap allocation is expensive enough that we want to reuse arrays across
+ * calls to aio_poll().  And because poll() has to be called without holding
+ * any lock, the arrays cannot be stored in AioContext.  Thread-local data
+ * has none of the disadvantages of these three options.
+ */
+static __thread GPollFD *pollfds;
+static __thread AioHandler **nodes;
+static __thread unsigned npfd, nalloc;
+static __thread Notifier pollfds_cleanup_notifier;
+
+static void pollfds_cleanup(Notifier *n, void *unused)
+{
+    g_assert(npfd == 0);
+    g_free(pollfds);
+    g_free(nodes);
+    nalloc = 0;
+}
+
+static void add_pollfd(AioHandler *node)
+{
+    if (npfd == nalloc) {
+        if (nalloc == 0) {
+            pollfds_cleanup_notifier.notify = pollfds_cleanup;
+            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
+            nalloc = 8;
+        } else {
+            g_assert(nalloc <= INT_MAX);
+            nalloc *= 2;
+        }
+        pollfds = g_renew(GPollFD, pollfds, nalloc);
+        nodes = g_renew(AioHandler *, nodes, nalloc);
+    }
+    nodes[npfd] = node;
+    pollfds[npfd] = (GPollFD) {
+        .fd = node->pfd.fd,
+        .events = node->pfd.events,
+    };
+    npfd++;
+}
+
+static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
+                           int64_t timeout)
+{
+    AioHandler *node;
+    int ret;
+
+    assert(npfd == 0);
+
+    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+        if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
+            && aio_node_check(ctx, node->is_external)) {
+            add_pollfd(node);
+        }
+    }
+
+    /* epoll(7) is faster above a certain number of fds */
+    if (fdmon_epoll_try_upgrade(ctx, npfd)) {
+        return ctx->fdmon_ops->wait(ctx, ready_list, timeout);
+    }
+
+    ret = qemu_poll_ns(pollfds, npfd, timeout);
+    if (ret > 0) {
+        int i;
+
+        for (i = 0; i < npfd; i++) {
+            int revents = pollfds[i].revents;
+
+            if (revents) {
+                aio_add_ready_handler(ready_list, nodes[i], revents);
+            }
+        }
+    }
+
+    npfd = 0;
+    return ret;
+}
+
+static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+    /* Do nothing, AioHandler already contains the state we'll need */
+}
+
+const FDMonOps fdmon_poll_ops = {
+    .update = fdmon_poll_update,
+    .wait = fdmon_poll_wait,
+};
--
2.24.1

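The FDMonOps interface introduced here is a classic ops-table (strategy)
design.  A minimal self-contained sketch, with all names below invented for
the example: callers go through ctx->ops and never ask which implementation
is active, so an implementation can swap the table at runtime exactly as
fdmon_epoll_try_enable() does:

#include <poll.h>
#include <stddef.h>
#include <stdint.h>

typedef struct Ctx Ctx;

typedef struct {
    void (*update)(Ctx *ctx, int fd, short events); /* (de)register an fd */
    int (*wait)(Ctx *ctx, int64_t timeout_ms);      /* block for events */
} MonOps;

struct Ctx {
    const MonOps *ops;          /* like AioContext->fdmon_ops */
    struct pollfd fds[64];
    size_t nfds;
};

static void poll_update(Ctx *ctx, int fd, short events)
{
    if (events != 0 && ctx->nfds < 64) {
        ctx->fds[ctx->nfds++] = (struct pollfd){ .fd = fd, .events = events };
    }
    /* removal/modification elided for brevity */
}

static int poll_wait(Ctx *ctx, int64_t timeout_ms)
{
    return poll(ctx->fds, ctx->nfds, (int)timeout_ms);
}

static const MonOps poll_ops = { .update = poll_update, .wait = poll_wait };

/* The event loop is implementation-agnostic; another MonOps table (e.g. an
 * epoll-backed one) could be installed in ctx->ops without touching this. */
static int event_loop_iteration(Ctx *ctx)
{
    return ctx->ops->wait(ctx, 10 /* ms */);
}
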
aio-posix: simplify FDMonOps->update() prototype

The AioHandler *node, bool is_new arguments are more complicated to
think about than simply being given AioHandler *old_node, AioHandler
*new_node.

Furthermore, the new Linux io_uring file descriptor monitoring mechanism
added by the next patch requires access to both the old and the new
nodes.  Make this change now in preparation.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-5-stefanha@redhat.com
Message-Id: <20200305170806.1313245-5-stefanha@redhat.com>
---
 include/block/aio.h | 13 ++++++-------
 util/aio-posix.c    |  7 +------
 util/fdmon-epoll.c  | 21 ++++++++++++---------
 util/fdmon-poll.c   |  4 +++-
 4 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ typedef struct {
     /*
      * update:
      * @ctx: the AioContext
-     * @node: the handler
-     * @is_new: is the file descriptor already being monitored?
+     * @old_node: the existing handler or NULL if this file descriptor is being
+     *            monitored for the first time
+     * @new_node: the new handler or NULL if this file descriptor is being
+     *            removed
      *
-     * Add/remove/modify a monitored file descriptor.  There are three cases:
-     * 1. node->pfd.events == 0 means remove the file descriptor.
-     * 2. !is_new means modify an already monitored file descriptor.
-     * 3. is_new means add a new file descriptor.
+     * Add/remove/modify a monitored file descriptor.
      *
      * Called with ctx->list_lock acquired.
      */
-    void (*update)(AioContext *ctx, AioHandler *node, bool is_new);
+    void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
 
     /*
      * wait:
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
     atomic_set(&ctx->poll_disable_cnt,
                atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
 
-    if (new_node) {
-        ctx->fdmon_ops->update(ctx, new_node, is_new);
-    } else if (node) {
-        /* Unregister deleted fd_handler */
-        ctx->fdmon_ops->update(ctx, node, false);
-    }
+    ctx->fdmon_ops->update(ctx, node, new_node);
     qemu_lockcnt_unlock(&ctx->list_lock);
     aio_notify(ctx);
 
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
index XXXXXXX..XXXXXXX 100644
--- a/util/fdmon-epoll.c
+++ b/util/fdmon-epoll.c
@@ -XXX,XX +XXX,XX @@ static inline int epoll_events_from_pfd(int pfd_events)
            (pfd_events & G_IO_ERR ? EPOLLERR : 0);
 }
 
-static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+static void fdmon_epoll_update(AioContext *ctx,
+                               AioHandler *old_node,
+                               AioHandler *new_node)
 {
-    struct epoll_event event;
+    struct epoll_event event = {
+        .data.ptr = new_node,
+        .events = new_node ? epoll_events_from_pfd(new_node->pfd.events) : 0,
+    };
     int r;
-    int ctl;
 
-    if (!node->pfd.events) {
-        ctl = EPOLL_CTL_DEL;
+    if (!new_node) {
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, &event);
+    } else if (!old_node) {
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, new_node->pfd.fd, &event);
     } else {
-        event.data.ptr = node;
-        event.events = epoll_events_from_pfd(node->pfd.events);
-        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, new_node->pfd.fd, &event);
     }
 
-    r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
     if (r) {
         fdmon_epoll_disable(ctx);
     }
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
index XXXXXXX..XXXXXXX 100644
--- a/util/fdmon-poll.c
+++ b/util/fdmon-poll.c
@@ -XXX,XX +XXX,XX @@ static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
     return ret;
 }
 
-static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new)
+static void fdmon_poll_update(AioContext *ctx,
+                              AioHandler *old_node,
+                              AioHandler *new_node)
 {
     /* Do nothing, AioHandler already contains the state we'll need */
 }
--
2.24.1

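Why the (old_node, new_node) pair is easier to reason about: the
add/modify/remove cases fall out of NULL-ness directly.  A hypothetical
classification helper (not QEMU code) mirroring the new
fdmon_epoll_update() control flow:

#include <stddef.h>

typedef struct AioHandlerStub AioHandlerStub; /* stand-in for AioHandler */

enum UpdateAction { UPDATE_ADD, UPDATE_MOD, UPDATE_DEL };

/* Each case is identified by which pointer is NULL, with no separate
 * node->pfd.events == 0 or is_new flag conventions to remember. */
static enum UpdateAction classify_update(const AioHandlerStub *old_node,
                                         const AioHandlerStub *new_node)
{
    if (!new_node) {
        return UPDATE_DEL;   /* fd is being removed */
    } else if (!old_node) {
        return UPDATE_ADD;   /* fd monitored for the first time */
    } else {
        return UPDATE_MOD;   /* existing registration is being replaced */
    }
}
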
aio-posix: add io_uring fd monitoring implementation

The recent Linux io_uring API has several advantages over ppoll(2) and
epoll(7).  Details are given in the source code.

Add an io_uring implementation and make it the default on Linux.
Performance is the same as with epoll(7) but later patches add
optimizations that take advantage of io_uring.

It is necessary to change how aio_set_fd_handler() deals with deleting
AioHandlers since removing monitored file descriptors is asynchronous in
io_uring.  fdmon_io_uring_remove() marks the AioHandler deleted and
aio_set_fd_handler() will let it handle deletion in that case.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-6-stefanha@redhat.com
Message-Id: <20200305170806.1313245-6-stefanha@redhat.com>
---
 configure             |   5 +
 include/block/aio.h   |   9 ++
 util/Makefile.objs    |   1 +
 util/aio-posix.c      |  20 ++-
 util/aio-posix.h      |  20 ++-
 util/fdmon-io_uring.c | 326 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 376 insertions(+), 5 deletions(-)
 create mode 100644 util/fdmon-io_uring.c

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ if test "$linux_io_uring" != "no" ; then
     linux_io_uring_cflags=$($pkg_config --cflags liburing)
     linux_io_uring_libs=$($pkg_config --libs liburing)
     linux_io_uring=yes
+
+    # io_uring is used in libqemuutil.a where per-file -libs variables are not
+    # seen by programs linking the archive.  It's not ideal, but just add the
+    # library dependency globally.
+    LIBS="$linux_io_uring_libs $LIBS"
 else
     if test "$linux_io_uring" = "yes" ; then
         feature_not_found "linux io_uring" "Install liburing devel"
diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@
 #ifndef QEMU_AIO_H
 #define QEMU_AIO_H
 
+#ifdef CONFIG_LINUX_IO_URING
+#include <liburing.h>
+#endif
 #include "qemu/queue.h"
 #include "qemu/event_notifier.h"
 #include "qemu/thread.h"
@@ -XXX,XX +XXX,XX @@ struct BHListSlice {
     QSIMPLEQ_ENTRY(BHListSlice) next;
 };
 
+typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
+
 struct AioContext {
     GSource source;
 
@@ -XXX,XX +XXX,XX @@ struct AioContext {
      * locking.
      */
     struct LuringState *linux_io_uring;
+
+    /* State for file descriptor monitoring using Linux io_uring */
+    struct io_uring fdmon_io_uring;
+    AioHandlerSList submit_list;
 #endif
 
     /* TimerLists for calling timers - one per clock type.  Has its own
diff --git a/util/Makefile.objs b/util/Makefile.objs
index XXXXXXX..XXXXXXX 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -XXX,XX +XXX,XX @@ util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
 util-obj-$(CONFIG_POSIX) += aio-posix.o
 util-obj-$(CONFIG_POSIX) += fdmon-poll.o
 util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o
+util-obj-$(CONFIG_LINUX_IO_URING) += fdmon-io_uring.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
         g_source_remove_poll(&ctx->source, &node->pfd);
     }
 
+    node->pfd.revents = 0;
+
+    /* If the fd monitor has already marked it deleted, leave it alone */
+    if (QLIST_IS_INSERTED(node, node_deleted)) {
+        return false;
+    }
+
     /* If a read is in progress, just mark the node as deleted */
     if (qemu_lockcnt_count(&ctx->list_lock)) {
         QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
-        node->pfd.revents = 0;
         return false;
     }
     /* Otherwise, delete it for real.  We can't just mark it as
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
 
         QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
     }
-    if (node) {
-        deleted = aio_remove_fd_handler(ctx, node);
-    }
 
     /* No need to order poll_disable_cnt writes against other updates;
      * the counter is only used to avoid wasting time and latency on
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
                atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
 
     ctx->fdmon_ops->update(ctx, node, new_node);
+    if (node) {
+        deleted = aio_remove_fd_handler(ctx, node);
+    }
     qemu_lockcnt_unlock(&ctx->list_lock);
     aio_notify(ctx);
 
@@ -XXX,XX +XXX,XX @@ void aio_context_setup(AioContext *ctx)
     ctx->fdmon_ops = &fdmon_poll_ops;
     ctx->epollfd = -1;
 
+    /* Use the fastest fd monitoring implementation if available */
+    if (fdmon_io_uring_setup(ctx)) {
+        return;
+    }
+
     fdmon_epoll_setup(ctx);
 }
 
 void aio_context_destroy(AioContext *ctx)
 {
+    fdmon_io_uring_destroy(ctx);
     fdmon_epoll_disable(ctx);
 }
 
diff --git a/util/aio-posix.h b/util/aio-posix.h
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.h
+++ b/util/aio-posix.h
@@ -XXX,XX +XXX,XX @@ struct AioHandler {
     IOHandler *io_poll_begin;
     IOHandler *io_poll_end;
     void *opaque;
156 | - bool is_external; | ||
157 | QLIST_ENTRY(AioHandler) node; | ||
158 | QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ | ||
159 | QLIST_ENTRY(AioHandler) node_deleted; | ||
160 | +#ifdef CONFIG_LINUX_IO_URING | ||
161 | + QSLIST_ENTRY(AioHandler) node_submitted; | ||
162 | + unsigned flags; /* see fdmon-io_uring.c */ | ||
163 | +#endif | ||
164 | + bool is_external; | ||
165 | }; | ||
166 | |||
167 | /* Add a handler to a ready list */ | ||
168 | @@ -XXX,XX +XXX,XX @@ static inline void fdmon_epoll_disable(AioContext *ctx) | ||
169 | } | ||
170 | #endif /* !CONFIG_EPOLL_CREATE1 */ | ||
171 | |||
172 | +#ifdef CONFIG_LINUX_IO_URING | ||
173 | +bool fdmon_io_uring_setup(AioContext *ctx); | ||
174 | +void fdmon_io_uring_destroy(AioContext *ctx); | ||
175 | +#else | ||
176 | +static inline bool fdmon_io_uring_setup(AioContext *ctx) | ||
177 | +{ | ||
178 | + return false; | ||
179 | +} | ||
180 | + | ||
181 | +static inline void fdmon_io_uring_destroy(AioContext *ctx) | ||
182 | +{ | ||
183 | +} | ||
184 | +#endif /* !CONFIG_LINUX_IO_URING */ | ||
185 | + | ||
186 | #endif /* AIO_POSIX_H */ | ||
187 | diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c | ||
188 | new file mode 100644 | ||
189 | index XXXXXXX..XXXXXXX | ||
190 | --- /dev/null | ||
191 | +++ b/util/fdmon-io_uring.c | ||
192 | @@ -XXX,XX +XXX,XX @@ | ||
193 | +/* SPDX-License-Identifier: GPL-2.0-or-later */ | ||
194 | +/* | ||
195 | + * Linux io_uring file descriptor monitoring | ||
196 | + * | ||
197 | + * The Linux io_uring API supports file descriptor monitoring with a few | ||
198 | + * advantages over existing APIs like poll(2) and epoll(7): | ||
199 | + * | ||
200 | + * 1. Userspace polling of events is possible because the completion queue (cq | ||
201 | + * ring) is shared between the kernel and userspace. This allows | ||
202 | + * applications that rely on userspace polling to also monitor file | ||
203 | + * descriptors in the same userspace polling loop. | ||
204 | + * | ||
205 | + * 2. Submission and completion are batched and done together in a single system | ||
206 | + * call. This minimizes the number of system calls. | ||
207 | + * | ||
208 | + * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than | ||
209 | + * poll(2). | ||
210 | + * | ||
211 | + * 4. Nanosecond timeouts are supported so it requires fewer syscalls than | ||
212 | + * epoll(7). | ||
213 | + * | ||
214 | + * This code only monitors file descriptors and does not do asynchronous disk | ||
215 | + * I/O. Implementing disk I/O efficiently has other requirements and should | ||
216 | + * use a separate io_uring so it does not make sense to unify the code. | ||
217 | + * | ||
218 | + * File descriptor monitoring is implemented using the following operations: | ||
219 | + * | ||
220 | + * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored. | ||
221 | + * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When | ||
222 | + * the poll mask changes for a file descriptor it is first removed and then | ||
223 | + * re-added with the new poll mask, so this operation is also used as part | ||
224 | + * of modifying an existing monitored file descriptor. | ||
225 | + * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait | ||
226 | + * for events. This operation self-cancels if another event completes | ||
227 | + * before the timeout. | ||
228 | + * | ||
229 | + * io_uring calls the submission queue the "sq ring" and the completion queue | ||
230 | + * the "cq ring". Ring entries are called "sqe" and "cqe", respectively. | ||
231 | + * | ||
232 | + * The code is structured so that sq/cq rings are only modified within | ||
233 | + * fdmon_io_uring_wait(). Changes to AioHandlers are made by enqueuing them on | ||
234 | + * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD | ||
235 | + * and/or IORING_OP_POLL_REMOVE sqes for them. | ||
236 | + */ | ||
237 | + | ||
238 | +#include "qemu/osdep.h" | ||
239 | +#include <poll.h> | ||
240 | +#include "qemu/rcu_queue.h" | ||
241 | +#include "aio-posix.h" | ||
242 | + | ||
243 | +enum { | ||
244 | + FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */ | ||
245 | + | ||
246 | + /* AioHandler::flags */ | ||
247 | + FDMON_IO_URING_PENDING = (1 << 0), | ||
248 | + FDMON_IO_URING_ADD = (1 << 1), | ||
249 | + FDMON_IO_URING_REMOVE = (1 << 2), | ||
250 | +}; | ||
251 | + | ||
252 | +static inline int poll_events_from_pfd(int pfd_events) | ||
253 | +{ | ||
254 | + return (pfd_events & G_IO_IN ? POLLIN : 0) | | ||
255 | + (pfd_events & G_IO_OUT ? POLLOUT : 0) | | ||
256 | + (pfd_events & G_IO_HUP ? POLLHUP : 0) | | ||
257 | + (pfd_events & G_IO_ERR ? POLLERR : 0); | ||
258 | +} | ||
259 | + | ||
260 | +static inline int pfd_events_from_poll(int poll_events) | ||
261 | +{ | ||
262 | + return (poll_events & POLLIN ? G_IO_IN : 0) | | ||
263 | + (poll_events & POLLOUT ? G_IO_OUT : 0) | | ||
264 | + (poll_events & POLLHUP ? G_IO_HUP : 0) | | ||
265 | + (poll_events & POLLERR ? G_IO_ERR : 0); | ||
266 | +} | ||
267 | + | ||
268 | +/* | ||
269 | + * Returns an sqe for submitting a request. Must only be called within | ||
270 | + * fdmon_io_uring_wait(). | ||
271 | + */ | ||
272 | +static struct io_uring_sqe *get_sqe(AioContext *ctx) | ||
273 | +{ | ||
274 | + struct io_uring *ring = &ctx->fdmon_io_uring; | ||
275 | + struct io_uring_sqe *sqe = io_uring_get_sqe(ring); | ||
276 | + int ret; | ||
277 | + | ||
278 | + if (likely(sqe)) { | ||
279 | + return sqe; | ||
280 | + } | ||
281 | + | ||
282 | + /* No free sqes left, submit pending sqes first */ | ||
283 | + ret = io_uring_submit(ring); | ||
284 | + assert(ret > 1); | ||
285 | + sqe = io_uring_get_sqe(ring); | ||
286 | + assert(sqe); | ||
287 | + return sqe; | ||
288 | +} | ||
289 | + | ||
290 | +/* Atomically enqueue an AioHandler for sq ring submission */ | ||
291 | +static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags) | ||
292 | +{ | ||
293 | + unsigned old_flags; | ||
294 | + | ||
295 | + old_flags = atomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags); | ||
296 | + if (!(old_flags & FDMON_IO_URING_PENDING)) { | ||
297 | + QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted); | ||
298 | + } | ||
299 | +} | ||
300 | + | ||
301 | +/* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */ | ||
302 | +static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags) | ||
303 | +{ | ||
304 | + AioHandler *node = QSLIST_FIRST(head); | ||
305 | + | ||
306 | + if (!node) { | ||
307 | + return NULL; | ||
308 | + } | ||
309 | + | ||
310 | + /* Doesn't need to be atomic since fill_sq_ring() moves the list */ | ||
311 | + QSLIST_REMOVE_HEAD(head, node_submitted); | ||
312 | + | ||
313 | + /* | ||
314 | + * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two | ||
315 | + * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and | ||
316 | + * telling process_cqe() to delete the AioHandler when its | ||
317 | + * IORING_OP_POLL_ADD completes. | ||
318 | + */ | ||
319 | + *flags = atomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING | | ||
320 | + FDMON_IO_URING_ADD)); | ||
321 | + return node; | ||
322 | +} | ||
323 | + | ||
324 | +static void fdmon_io_uring_update(AioContext *ctx, | ||
325 | + AioHandler *old_node, | ||
326 | + AioHandler *new_node) | ||
327 | +{ | ||
328 | + if (new_node) { | ||
329 | + enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD); | ||
330 | + } | ||
331 | + | ||
332 | + if (old_node) { | ||
333 | + /* | ||
334 | + * Deletion is tricky because IORING_OP_POLL_ADD and | ||
335 | + * IORING_OP_POLL_REMOVE are async. We need to wait for the original | ||
336 | + * IORING_OP_POLL_ADD to complete before this handler can be freed | ||
337 | + * safely. | ||
338 | + * | ||
339 | + * It's possible that the file descriptor becomes ready and the | ||
340 | + * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is | ||
341 | + * submitted, too. | ||
342 | + * | ||
343 | + * Mark this handler deleted right now but don't place it on | ||
344 | + * ctx->deleted_aio_handlers yet. Instead, manually fudge the list | ||
345 | + * entry to make QLIST_IS_INSERTED() think this handler has been | ||
347 | + * inserted so that other code recognizes this AioHandler as deleted. | ||
347 | + * | ||
348 | + * Once the original IORING_OP_POLL_ADD completes we enqueue the | ||
349 | + * handler on the real ctx->deleted_aio_handlers list to be freed. | ||
350 | + */ | ||
351 | + assert(!QLIST_IS_INSERTED(old_node, node_deleted)); | ||
352 | + old_node->node_deleted.le_prev = &old_node->node_deleted.le_next; | ||
353 | + | ||
354 | + enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE); | ||
355 | + } | ||
356 | +} | ||
357 | + | ||
358 | +static void add_poll_add_sqe(AioContext *ctx, AioHandler *node) | ||
359 | +{ | ||
360 | + struct io_uring_sqe *sqe = get_sqe(ctx); | ||
361 | + int events = poll_events_from_pfd(node->pfd.events); | ||
362 | + | ||
363 | + io_uring_prep_poll_add(sqe, node->pfd.fd, events); | ||
364 | + io_uring_sqe_set_data(sqe, node); | ||
365 | +} | ||
366 | + | ||
367 | +static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node) | ||
368 | +{ | ||
369 | + struct io_uring_sqe *sqe = get_sqe(ctx); | ||
370 | + | ||
371 | + io_uring_prep_poll_remove(sqe, node); | ||
372 | +} | ||
373 | + | ||
374 | +/* Add a timeout that self-cancels when another cqe becomes ready */ | ||
375 | +static void add_timeout_sqe(AioContext *ctx, int64_t ns) | ||
376 | +{ | ||
377 | + struct io_uring_sqe *sqe; | ||
378 | + struct __kernel_timespec ts = { | ||
379 | + .tv_sec = ns / NANOSECONDS_PER_SECOND, | ||
380 | + .tv_nsec = ns % NANOSECONDS_PER_SECOND, | ||
381 | + }; | ||
382 | + | ||
383 | + sqe = get_sqe(ctx); | ||
384 | + io_uring_prep_timeout(sqe, &ts, 1, 0); | ||
385 | +} | ||
386 | + | ||
387 | +/* Add sqes from ctx->submit_list for submission */ | ||
388 | +static void fill_sq_ring(AioContext *ctx) | ||
389 | +{ | ||
390 | + AioHandlerSList submit_list; | ||
391 | + AioHandler *node; | ||
392 | + unsigned flags; | ||
393 | + | ||
394 | + QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list); | ||
395 | + | ||
396 | + while ((node = dequeue(&submit_list, &flags))) { | ||
397 | + /* Order matters, just in case both flags were set */ | ||
398 | + if (flags & FDMON_IO_URING_ADD) { | ||
399 | + add_poll_add_sqe(ctx, node); | ||
400 | + } | ||
401 | + if (flags & FDMON_IO_URING_REMOVE) { | ||
402 | + add_poll_remove_sqe(ctx, node); | ||
403 | + } | ||
404 | + } | ||
405 | +} | ||
406 | + | ||
407 | +/* Returns true if a handler became ready */ | ||
408 | +static bool process_cqe(AioContext *ctx, | ||
409 | + AioHandlerList *ready_list, | ||
410 | + struct io_uring_cqe *cqe) | ||
411 | +{ | ||
412 | + AioHandler *node = io_uring_cqe_get_data(cqe); | ||
413 | + unsigned flags; | ||
414 | + | ||
415 | + /* poll_timeout and poll_remove have a zero user_data field */ | ||
416 | + if (!node) { | ||
417 | + return false; | ||
418 | + } | ||
419 | + | ||
420 | + /* | ||
421 | + * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race | ||
422 | + * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE | ||
423 | + * bit before IORING_OP_POLL_REMOVE is submitted. | ||
424 | + */ | ||
425 | + flags = atomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE); | ||
426 | + if (flags & FDMON_IO_URING_REMOVE) { | ||
427 | + QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted); | ||
428 | + return false; | ||
429 | + } | ||
430 | + | ||
431 | + aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res)); | ||
432 | + | ||
433 | + /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */ | ||
434 | + add_poll_add_sqe(ctx, node); | ||
435 | + return true; | ||
436 | +} | ||
437 | + | ||
438 | +static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list) | ||
439 | +{ | ||
440 | + struct io_uring *ring = &ctx->fdmon_io_uring; | ||
441 | + struct io_uring_cqe *cqe; | ||
442 | + unsigned num_cqes = 0; | ||
443 | + unsigned num_ready = 0; | ||
444 | + unsigned head; | ||
445 | + | ||
446 | + io_uring_for_each_cqe(ring, head, cqe) { | ||
447 | + if (process_cqe(ctx, ready_list, cqe)) { | ||
448 | + num_ready++; | ||
449 | + } | ||
450 | + | ||
451 | + num_cqes++; | ||
452 | + } | ||
453 | + | ||
454 | + io_uring_cq_advance(ring, num_cqes); | ||
455 | + return num_ready; | ||
456 | +} | ||
457 | + | ||
458 | +static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list, | ||
459 | + int64_t timeout) | ||
460 | +{ | ||
461 | + unsigned wait_nr = 1; /* block until at least one cqe is ready */ | ||
462 | + int ret; | ||
463 | + | ||
464 | + /* Fall back while external clients are disabled */ | ||
465 | + if (atomic_read(&ctx->external_disable_cnt)) { | ||
466 | + return fdmon_poll_ops.wait(ctx, ready_list, timeout); | ||
467 | + } | ||
468 | + | ||
469 | + if (timeout == 0) { | ||
470 | + wait_nr = 0; /* non-blocking */ | ||
471 | + } else if (timeout > 0) { | ||
472 | + add_timeout_sqe(ctx, timeout); | ||
473 | + } | ||
474 | + | ||
475 | + fill_sq_ring(ctx); | ||
476 | + | ||
477 | + ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr); | ||
478 | + assert(ret >= 0); | ||
479 | + | ||
480 | + return process_cq_ring(ctx, ready_list); | ||
481 | +} | ||
482 | + | ||
483 | +static const FDMonOps fdmon_io_uring_ops = { | ||
484 | + .update = fdmon_io_uring_update, | ||
485 | + .wait = fdmon_io_uring_wait, | ||
486 | +}; | ||
487 | + | ||
488 | +bool fdmon_io_uring_setup(AioContext *ctx) | ||
489 | +{ | ||
490 | + int ret; | ||
491 | + | ||
492 | + ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0); | ||
493 | + if (ret != 0) { | ||
494 | + return false; | ||
495 | + } | ||
496 | + | ||
497 | + QSLIST_INIT(&ctx->submit_list); | ||
498 | + ctx->fdmon_ops = &fdmon_io_uring_ops; | ||
499 | + return true; | ||
500 | +} | ||
501 | + | ||
502 | +void fdmon_io_uring_destroy(AioContext *ctx) | ||
503 | +{ | ||
504 | + if (ctx->fdmon_ops == &fdmon_io_uring_ops) { | ||
505 | + AioHandler *node; | ||
506 | + | ||
507 | + io_uring_queue_exit(&ctx->fdmon_io_uring); | ||
508 | + | ||
509 | + /* No need to submit these anymore, just free them. */ | ||
510 | + while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) { | ||
511 | + QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted); | ||
512 | + QLIST_REMOVE(node, node); | ||
513 | + g_free(node); | ||
514 | + } | ||
515 | + | ||
516 | + ctx->fdmon_ops = &fdmon_poll_ops; | ||
517 | + } | ||
518 | +} | ||
519 | -- | ||
520 | 2.24.1 | ||
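
The enqueue()/dequeue() pair above is a general lock-free work-list pattern:
a PENDING bit in each node's flags word guarantees the node is inserted at
most once no matter how many producers OR in work bits, and the sticky
REMOVE bit lets a concurrent enqueue() and process_cqe() agree on who frees
the handler. For readers who want it outside the QSLIST macros, a rough
standalone sketch of the same idea with C11 atomics (all names here are
illustrative, not QEMU's):

#include <stdatomic.h>
#include <stddef.h>

enum { PENDING = 1u << 0, ADD = 1u << 1, REMOVE = 1u << 2 };

struct node {
    _Atomic unsigned flags;
    struct node *_Atomic next;          /* intrusive singly-linked list */
};

static struct node *_Atomic submit_head;

/* multi-producer: set op bits; only the 0->PENDING transition inserts */
static void enqueue(struct node *n, unsigned op)
{
    unsigned old = atomic_fetch_or(&n->flags, PENDING | op);
    if (!(old & PENDING)) {
        struct node *head = atomic_load(&submit_head);
        do {
            atomic_store(&n->next, head);
        } while (!atomic_compare_exchange_weak(&submit_head, &head, n));
    }
}

/* single consumer: grab the whole list, then drain it privately */
static void drain(void (*fn)(struct node *, unsigned))
{
    struct node *n = atomic_exchange(&submit_head, NULL);

    while (n) {
        struct node *next = atomic_load(&n->next);
        /* fn() sees the pre-clear flags; REMOVE stays sticky, as in
         * the patch's dequeue(), so it survives until completion */
        unsigned flags = atomic_fetch_and(&n->flags,
                                          ~(unsigned)(PENDING | ADD));
        fn(n, flags);
        n = next;
    }
}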
New patch | |||
1 | Unlike ppoll(2) and epoll(7), Linux io_uring completions can be polled | ||
2 | from userspace. Previously userspace polling was only allowed when all | ||
3 | AioHandler's had an ->io_poll() callback. This prevented starvation of | ||
4 | fds by userspace pollable handlers. | ||
1 | 5 | ||
6 | Add the FDMonOps->need_wait() callback that enables userspace polling | ||
7 | even when some AioHandlers lack ->io_poll(). | ||
8 | |||
9 | For example, it's now possible to do userspace polling when a TCP/IP | ||
10 | socket is monitored thanks to Linux io_uring. | ||
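
The key enabler is that checking the cq ring for completions is just a read
of shared memory, not a syscall. A rough sketch of the kind of combined
polling loop this makes possible (the helper names are hypothetical, not the
actual aio_poll() code):

#include <liburing.h>
#include <stdbool.h>
#include <stdint.h>

/* spin until a poll handler or a monitored fd makes progress */
static bool poll_until(struct io_uring *ring, int64_t deadline_ns,
                       int64_t (*now_ns)(void), bool (*io_poll_once)(void))
{
    while (now_ns() < deadline_ns) {
        if (io_poll_once()) {
            return true;   /* a userspace poll handler made progress */
        }
        if (io_uring_cq_ready(ring) > 0) {
            return true;   /* fd events ready; caller reaps the cqes */
        }
    }
    return false;          /* nothing happened: fall back to blocking wait */
}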
11 | |||
12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
13 | Link: https://lore.kernel.org/r/20200305170806.1313245-7-stefanha@redhat.com | ||
14 | Message-Id: <20200305170806.1313245-7-stefanha@redhat.com> | ||
15 | --- | ||
16 | include/block/aio.h | 19 +++++++++++++++++++ | ||
17 | util/aio-posix.c | 11 ++++++++--- | ||
18 | util/fdmon-epoll.c | 1 + | ||
19 | util/fdmon-io_uring.c | 6 ++++++ | ||
20 | util/fdmon-poll.c | 1 + | ||
21 | 5 files changed, 35 insertions(+), 3 deletions(-) | ||
22 | |||
23 | diff --git a/include/block/aio.h b/include/block/aio.h | ||
24 | index XXXXXXX..XXXXXXX 100644 | ||
25 | --- a/include/block/aio.h | ||
26 | +++ b/include/block/aio.h | ||
27 | @@ -XXX,XX +XXX,XX @@ struct ThreadPool; | ||
28 | struct LinuxAioState; | ||
29 | struct LuringState; | ||
30 | |||
31 | +/* Is polling disabled? */ | ||
32 | +bool aio_poll_disabled(AioContext *ctx); | ||
33 | + | ||
34 | /* Callbacks for file descriptor monitoring implementations */ | ||
35 | typedef struct { | ||
36 | /* | ||
37 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
38 | * Returns: number of ready file descriptors. | ||
39 | */ | ||
40 | int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout); | ||
41 | + | ||
42 | + /* | ||
43 | + * need_wait: | ||
44 | + * @ctx: the AioContext | ||
45 | + * | ||
46 | + * Tell aio_poll() when to stop userspace polling early because ->wait() | ||
47 | + * has fds ready. | ||
48 | + * | ||
49 | + * File descriptor monitoring implementations that cannot poll fd readiness | ||
50 | + * from userspace should use aio_poll_disabled() here. This ensures that | ||
51 | + * file descriptors are not starved by handlers that frequently make | ||
52 | + * progress via userspace polling. | ||
53 | + * | ||
54 | + * Returns: true if ->wait() should be called, false otherwise. | ||
55 | + */ | ||
56 | + bool (*need_wait)(AioContext *ctx); | ||
57 | } FDMonOps; | ||
58 | |||
59 | /* | ||
60 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
61 | index XXXXXXX..XXXXXXX 100644 | ||
62 | --- a/util/aio-posix.c | ||
63 | +++ b/util/aio-posix.c | ||
64 | @@ -XXX,XX +XXX,XX @@ | ||
65 | #include "trace.h" | ||
66 | #include "aio-posix.h" | ||
67 | |||
68 | +bool aio_poll_disabled(AioContext *ctx) | ||
69 | +{ | ||
70 | + return atomic_read(&ctx->poll_disable_cnt); | ||
71 | +} | ||
72 | + | ||
73 | void aio_add_ready_handler(AioHandlerList *ready_list, | ||
74 | AioHandler *node, | ||
75 | int revents) | ||
76 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
77 | elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time; | ||
78 | max_ns = qemu_soonest_timeout(*timeout, max_ns); | ||
79 | assert(!(max_ns && progress)); | ||
80 | - } while (elapsed_time < max_ns && !atomic_read(&ctx->poll_disable_cnt)); | ||
81 | + } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx)); | ||
82 | |||
83 | /* If time has passed with no successful polling, adjust *timeout to | ||
84 | * keep the same ending time. | ||
85 | @@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) | ||
86 | { | ||
87 | int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); | ||
88 | |||
89 | - if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) { | ||
90 | + if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) { | ||
91 | poll_set_started(ctx, true); | ||
92 | |||
93 | if (run_poll_handlers(ctx, max_ns, timeout)) { | ||
94 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
95 | /* If polling is allowed, non-blocking aio_poll does not need the | ||
96 | * system call---a single round of run_poll_handlers_once suffices. | ||
97 | */ | ||
98 | - if (timeout || atomic_read(&ctx->poll_disable_cnt)) { | ||
99 | + if (timeout || ctx->fdmon_ops->need_wait(ctx)) { | ||
100 | ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout); | ||
101 | } | ||
102 | |||
103 | diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c | ||
104 | index XXXXXXX..XXXXXXX 100644 | ||
105 | --- a/util/fdmon-epoll.c | ||
106 | +++ b/util/fdmon-epoll.c | ||
107 | @@ -XXX,XX +XXX,XX @@ out: | ||
108 | static const FDMonOps fdmon_epoll_ops = { | ||
109 | .update = fdmon_epoll_update, | ||
110 | .wait = fdmon_epoll_wait, | ||
111 | + .need_wait = aio_poll_disabled, | ||
112 | }; | ||
113 | |||
114 | static bool fdmon_epoll_try_enable(AioContext *ctx) | ||
115 | diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c | ||
116 | index XXXXXXX..XXXXXXX 100644 | ||
117 | --- a/util/fdmon-io_uring.c | ||
118 | +++ b/util/fdmon-io_uring.c | ||
119 | @@ -XXX,XX +XXX,XX @@ static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list, | ||
120 | return process_cq_ring(ctx, ready_list); | ||
121 | } | ||
122 | |||
123 | +static bool fdmon_io_uring_need_wait(AioContext *ctx) | ||
124 | +{ | ||
125 | + return io_uring_cq_ready(&ctx->fdmon_io_uring); | ||
126 | +} | ||
127 | + | ||
128 | static const FDMonOps fdmon_io_uring_ops = { | ||
129 | .update = fdmon_io_uring_update, | ||
130 | .wait = fdmon_io_uring_wait, | ||
131 | + .need_wait = fdmon_io_uring_need_wait, | ||
132 | }; | ||
133 | |||
134 | bool fdmon_io_uring_setup(AioContext *ctx) | ||
135 | diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c | ||
136 | index XXXXXXX..XXXXXXX 100644 | ||
137 | --- a/util/fdmon-poll.c | ||
138 | +++ b/util/fdmon-poll.c | ||
139 | @@ -XXX,XX +XXX,XX @@ static void fdmon_poll_update(AioContext *ctx, | ||
140 | const FDMonOps fdmon_poll_ops = { | ||
141 | .update = fdmon_poll_update, | ||
142 | .wait = fdmon_poll_wait, | ||
143 | + .need_wait = aio_poll_disabled, | ||
144 | }; | ||
145 | -- | ||
146 | 2.24.1 | ||
New patch | |||
1 | 1 | When there are many poll handlers it's likely that some of them are idle | |
2 | most of the time. Remove handlers that haven't had activity recently so | ||
3 | that the polling loop scales better for guests with a large number of | ||
4 | devices. | ||
5 | |||
6 | This feature only takes effect for the Linux io_uring fd monitoring | ||
7 | implementation because it is capable of combining fd monitoring with | ||
8 | userspace polling. The other implementations can't do that and risk | ||
9 | starving fds in favor of poll handlers, so don't try this optimization | ||
10 | when they are in use. | ||
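
The aging bookkeeping is just a per-handler deadline that successful polls
push forward. A simplified sketch of the demotion logic (the field and
constant names follow the patch; the helper itself is illustrative):

#include <stdbool.h>
#include <stdint.h>

#define POLL_IDLE_INTERVAL_NS (7LL * 1000000000LL)   /* 7 seconds */

/* returns true if the handler should stay on the userspace polling list */
static bool keep_polling(int64_t *poll_idle_timeout, bool made_progress,
                         int64_t now)
{
    if (made_progress) {
        *poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; /* reset deadline */
        return true;
    }
    if (*poll_idle_timeout == 0) {
        *poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; /* start the clock */
        return true;
    }
    if (now >= *poll_idle_timeout) {
        *poll_idle_timeout = 0;  /* idle too long: demote to fd monitoring */
        return false;
    }
    return true;
}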
11 | |||
12 | IOPS improves from 10k to 105k when the guest has 100 | ||
13 | virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1 | ||
14 | device for rw=randread,iodepth=1,bs=4k,ioengine=libaio on NVMe. | ||
15 | |||
16 | [Clarified aio_poll_handlers locking discipline explanation in comment | ||
17 | after discussion with Paolo Bonzini <pbonzini@redhat.com>. | ||
18 | --Stefan] | ||
19 | |||
20 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
21 | Link: https://lore.kernel.org/r/20200305170806.1313245-8-stefanha@redhat.com | ||
22 | Message-Id: <20200305170806.1313245-8-stefanha@redhat.com> | ||
23 | --- | ||
24 | include/block/aio.h | 8 ++++ | ||
25 | util/aio-posix.c | 93 +++++++++++++++++++++++++++++++++++++++++---- | ||
26 | util/aio-posix.h | 2 + | ||
27 | util/trace-events | 2 + | ||
28 | 4 files changed, 98 insertions(+), 7 deletions(-) | ||
29 | |||
30 | diff --git a/include/block/aio.h b/include/block/aio.h | ||
31 | index XXXXXXX..XXXXXXX 100644 | ||
32 | --- a/include/block/aio.h | ||
33 | +++ b/include/block/aio.h | ||
34 | @@ -XXX,XX +XXX,XX @@ struct AioContext { | ||
35 | int64_t poll_grow; /* polling time growth factor */ | ||
36 | int64_t poll_shrink; /* polling time shrink factor */ | ||
37 | |||
38 | + /* | ||
39 | + * List of handlers participating in userspace polling. Protected by | ||
40 | + * ctx->list_lock. Iterated and modified mostly by the event loop thread | ||
41 | + * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler() | ||
42 | + * only touches the list to delete nodes if ctx->list_lock's count is zero. | ||
43 | + */ | ||
44 | + AioHandlerList poll_aio_handlers; | ||
45 | + | ||
46 | /* Are we in polling mode or monitoring file descriptors? */ | ||
47 | bool poll_started; | ||
48 | |||
49 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
50 | index XXXXXXX..XXXXXXX 100644 | ||
51 | --- a/util/aio-posix.c | ||
52 | +++ b/util/aio-posix.c | ||
53 | @@ -XXX,XX +XXX,XX @@ | ||
54 | #include "trace.h" | ||
55 | #include "aio-posix.h" | ||
56 | |||
57 | +/* Stop userspace polling on a handler if it isn't active for some time */ | ||
58 | +#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND) | ||
59 | + | ||
60 | bool aio_poll_disabled(AioContext *ctx) | ||
61 | { | ||
62 | return atomic_read(&ctx->poll_disable_cnt); | ||
63 | @@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node) | ||
64 | * deleted because deleted nodes are only cleaned up while | ||
65 | * no one is walking the handlers list. | ||
66 | */ | ||
67 | + QLIST_SAFE_REMOVE(node, node_poll); | ||
68 | QLIST_REMOVE(node, node); | ||
69 | return true; | ||
70 | } | ||
71 | @@ -XXX,XX +XXX,XX @@ static bool poll_set_started(AioContext *ctx, bool started) | ||
72 | ctx->poll_started = started; | ||
73 | |||
74 | qemu_lockcnt_inc(&ctx->list_lock); | ||
75 | - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
76 | + QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) { | ||
77 | IOHandler *fn; | ||
78 | |||
79 | if (QLIST_IS_INSERTED(node, node_deleted)) { | ||
80 | @@ -XXX,XX +XXX,XX @@ static void aio_free_deleted_handlers(AioContext *ctx) | ||
81 | while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) { | ||
82 | QLIST_REMOVE(node, node); | ||
83 | QLIST_REMOVE(node, node_deleted); | ||
84 | + QLIST_SAFE_REMOVE(node, node_poll); | ||
85 | g_free(node); | ||
86 | } | ||
87 | |||
88 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node) | ||
89 | revents = node->pfd.revents & node->pfd.events; | ||
90 | node->pfd.revents = 0; | ||
91 | |||
92 | + /* | ||
93 | + * Start polling AioHandlers when they become ready because activity is | ||
94 | + * likely to continue. Note that starvation is theoretically possible when | ||
95 | + * fdmon_supports_polling(), but only until the fd fires for the first | ||
96 | + * time. | ||
97 | + */ | ||
98 | + if (!QLIST_IS_INSERTED(node, node_deleted) && | ||
99 | + !QLIST_IS_INSERTED(node, node_poll) && | ||
100 | + node->io_poll) { | ||
101 | + trace_poll_add(ctx, node, node->pfd.fd, revents); | ||
102 | + if (ctx->poll_started && node->io_poll_begin) { | ||
103 | + node->io_poll_begin(node->opaque); | ||
104 | + } | ||
105 | + QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll); | ||
106 | + } | ||
107 | + | ||
108 | if (!QLIST_IS_INSERTED(node, node_deleted) && | ||
109 | (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) && | ||
110 | aio_node_check(ctx, node->is_external) && | ||
111 | @@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx) | ||
112 | timerlistgroup_run_timers(&ctx->tlg); | ||
113 | } | ||
114 | |||
115 | -static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) | ||
116 | +static bool run_poll_handlers_once(AioContext *ctx, | ||
117 | + int64_t now, | ||
118 | + int64_t *timeout) | ||
119 | { | ||
120 | bool progress = false; | ||
121 | AioHandler *node; | ||
122 | + AioHandler *tmp; | ||
123 | |||
124 | - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
125 | - if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll && | ||
126 | - aio_node_check(ctx, node->is_external) && | ||
127 | + QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) { | ||
128 | + if (aio_node_check(ctx, node->is_external) && | ||
129 | node->io_poll(node->opaque)) { | ||
130 | + node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; | ||
131 | + | ||
132 | /* | ||
133 | * Polling was successful, exit try_poll_mode immediately | ||
134 | * to adjust the next polling time. | ||
135 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) | ||
136 | return progress; | ||
137 | } | ||
138 | |||
139 | +static bool fdmon_supports_polling(AioContext *ctx) | ||
140 | +{ | ||
141 | + return ctx->fdmon_ops->need_wait != aio_poll_disabled; | ||
142 | +} | ||
143 | + | ||
144 | +static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now) | ||
145 | +{ | ||
146 | + AioHandler *node; | ||
147 | + AioHandler *tmp; | ||
148 | + bool progress = false; | ||
149 | + | ||
150 | + /* | ||
151 | + * File descriptor monitoring implementations without userspace polling | ||
152 | + * support suffer from starvation when a subset of handlers is polled | ||
153 | + * because fds will not be processed in a timely fashion. Don't remove | ||
154 | + * idle poll handlers. | ||
155 | + */ | ||
156 | + if (!fdmon_supports_polling(ctx)) { | ||
157 | + return false; | ||
158 | + } | ||
159 | + | ||
160 | + QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) { | ||
161 | + if (node->poll_idle_timeout == 0LL) { | ||
162 | + node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; | ||
163 | + } else if (now >= node->poll_idle_timeout) { | ||
164 | + trace_poll_remove(ctx, node, node->pfd.fd); | ||
165 | + node->poll_idle_timeout = 0LL; | ||
166 | + QLIST_SAFE_REMOVE(node, node_poll); | ||
167 | + if (ctx->poll_started && node->io_poll_end) { | ||
168 | + node->io_poll_end(node->opaque); | ||
169 | + | ||
170 | + /* | ||
171 | + * Final poll in case ->io_poll_end() races with an event. | ||
172 | + * Never mind about re-adding the handler in the rare case where | ||
173 | + * this causes progress. | ||
174 | + */ | ||
175 | + progress = node->io_poll(node->opaque) || progress; | ||
176 | + } | ||
177 | + } | ||
178 | + } | ||
179 | + | ||
180 | + return progress; | ||
181 | +} | ||
182 | + | ||
183 | /* run_poll_handlers: | ||
184 | * @ctx: the AioContext | ||
185 | * @max_ns: maximum time to poll for, in nanoseconds | ||
186 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
187 | |||
188 | start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); | ||
189 | do { | ||
190 | - progress = run_poll_handlers_once(ctx, timeout); | ||
191 | + progress = run_poll_handlers_once(ctx, start_time, timeout); | ||
192 | elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time; | ||
193 | max_ns = qemu_soonest_timeout(*timeout, max_ns); | ||
194 | assert(!(max_ns && progress)); | ||
195 | } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx)); | ||
196 | |||
197 | + if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) { | ||
198 | + *timeout = 0; | ||
199 | + progress = true; | ||
200 | + } | ||
201 | + | ||
202 | /* If time has passed with no successful polling, adjust *timeout to | ||
203 | * keep the same ending time. | ||
204 | */ | ||
205 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
206 | */ | ||
207 | static bool try_poll_mode(AioContext *ctx, int64_t *timeout) | ||
208 | { | ||
209 | - int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); | ||
210 | + int64_t max_ns; | ||
211 | + | ||
212 | + if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) { | ||
213 | + return false; | ||
214 | + } | ||
215 | |||
216 | + max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); | ||
217 | if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) { | ||
218 | poll_set_started(ctx, true); | ||
219 | |||
220 | diff --git a/util/aio-posix.h b/util/aio-posix.h | ||
221 | index XXXXXXX..XXXXXXX 100644 | ||
222 | --- a/util/aio-posix.h | ||
223 | +++ b/util/aio-posix.h | ||
224 | @@ -XXX,XX +XXX,XX @@ struct AioHandler { | ||
225 | QLIST_ENTRY(AioHandler) node; | ||
226 | QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ | ||
227 | QLIST_ENTRY(AioHandler) node_deleted; | ||
228 | + QLIST_ENTRY(AioHandler) node_poll; | ||
229 | #ifdef CONFIG_LINUX_IO_URING | ||
230 | QSLIST_ENTRY(AioHandler) node_submitted; | ||
231 | unsigned flags; /* see fdmon-io_uring.c */ | ||
232 | #endif | ||
233 | + int64_t poll_idle_timeout; /* when to stop userspace polling */ | ||
234 | bool is_external; | ||
235 | }; | ||
236 | |||
237 | diff --git a/util/trace-events b/util/trace-events | ||
238 | index XXXXXXX..XXXXXXX 100644 | ||
239 | --- a/util/trace-events | ||
240 | +++ b/util/trace-events | ||
241 | @@ -XXX,XX +XXX,XX @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_ | ||
242 | run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64 | ||
243 | poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 | ||
244 | poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 | ||
245 | +poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x" | ||
246 | +poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d" | ||
247 | |||
248 | # async.c | ||
249 | aio_co_schedule(void *ctx, void *co) "ctx %p co %p" | ||
250 | -- | ||
251 | 2.24.1 | ||