The following changes since commit 67f17e23baca5dd545fe98b01169cc351a70fe35:

  Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging (2020-03-06 17:15:36 +0000)

are available in the Git repository at:

  https://github.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to d37d0e365afb6825a90d8356fc6adcc1f58f40f3:

  aio-posix: remove idle poll handlers to improve scalability (2020-03-09 16:45:16 +0000)

----------------------------------------------------------------
Pull request

Changelog: No user-visible changes.

----------------------------------------------------------------

Stefan Hajnoczi (9):
  qemu/queue.h: clear linked list pointers on remove
  aio-posix: remove confusing QLIST_SAFE_REMOVE()
  aio-posix: completely stop polling when disabled
  aio-posix: move RCU_READ_LOCK() into run_poll_handlers()
  aio-posix: extract ppoll(2) and epoll(7) fd monitoring
  aio-posix: simplify FDMonOps->update() prototype
  aio-posix: add io_uring fd monitoring implementation
  aio-posix: support userspace polling of fd monitoring
  aio-posix: remove idle poll handlers to improve scalability

 MAINTAINERS           |   2 +
 configure             |   5 +
 include/block/aio.h   |  71 ++++++-
 include/qemu/queue.h  |  19 +-
 util/Makefile.objs    |   3 +
 util/aio-posix.c      | 451 ++++++++++++++----------------------------
 util/aio-posix.h      |  81 ++++++++
 util/fdmon-epoll.c    | 155 +++++++++++++++
 util/fdmon-io_uring.c | 332 +++++++++++++++++++++++++++++++
 util/fdmon-poll.c     | 107 ++++++++++
 util/trace-events     |   2 +
 11 files changed, 915 insertions(+), 313 deletions(-)
 create mode 100644 util/aio-posix.h
 create mode 100644 util/fdmon-epoll.c
 create mode 100644 util/fdmon-io_uring.c
 create mode 100644 util/fdmon-poll.c

--
2.24.1

Do not leave stale linked list pointers around after removal.  It's
safer to set them to NULL so that use-after-removal results in an
immediate segfault.

The RCU queue removal macros are unchanged since nodes may still be
traversed after removal.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200224103406.1894923-2-stefanha@redhat.com
Message-Id: <20200224103406.1894923-2-stefanha@redhat.com>
---
 include/qemu/queue.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/include/qemu/queue.h b/include/qemu/queue.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/queue.h
+++ b/include/qemu/queue.h
@@ -XXX,XX +XXX,XX @@ struct { \
         (elm)->field.le_next->field.le_prev = \
             (elm)->field.le_prev; \
     *(elm)->field.le_prev = (elm)->field.le_next; \
+    (elm)->field.le_next = NULL; \
+    (elm)->field.le_prev = NULL; \
 } while (/*CONSTCOND*/0)

 /*
@@ -XXX,XX +XXX,XX @@ struct { \
 } while (/*CONSTCOND*/0)

 #define QSLIST_REMOVE_HEAD(head, field) do { \
-    (head)->slh_first = (head)->slh_first->field.sle_next; \
+    typeof((head)->slh_first) elm = (head)->slh_first; \
+    (head)->slh_first = elm->field.sle_next; \
+    elm->field.sle_next = NULL; \
 } while (/*CONSTCOND*/0)

 #define QSLIST_REMOVE_AFTER(slistelm, field) do { \
-    (slistelm)->field.sle_next = \
-        QSLIST_NEXT(QSLIST_NEXT((slistelm), field), field); \
+    typeof(slistelm) next = (slistelm)->field.sle_next; \
+    (slistelm)->field.sle_next = next->field.sle_next; \
+    next->field.sle_next = NULL; \
 } while (/*CONSTCOND*/0)

 #define QSLIST_REMOVE(head, elm, type, field) do { \
@@ -XXX,XX +XXX,XX @@ struct { \
         while (curelm->field.sle_next != (elm)) \
             curelm = curelm->field.sle_next; \
         curelm->field.sle_next = curelm->field.sle_next->field.sle_next; \
+        (elm)->field.sle_next = NULL; \
     } \
 } while (/*CONSTCOND*/0)

@@ -XXX,XX +XXX,XX @@ struct { \
 } while (/*CONSTCOND*/0)

 #define QSIMPLEQ_REMOVE_HEAD(head, field) do { \
-    if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL)\
+    typeof((head)->sqh_first) elm = (head)->sqh_first; \
+    if (((head)->sqh_first = elm->field.sqe_next) == NULL) \
         (head)->sqh_last = &(head)->sqh_first; \
+    elm->field.sqe_next = NULL; \
 } while (/*CONSTCOND*/0)

 #define QSIMPLEQ_SPLIT_AFTER(head, elm, field, removed) do { \
@@ -XXX,XX +XXX,XX @@ struct { \
         if ((curelm->field.sqe_next = \
             curelm->field.sqe_next->field.sqe_next) == NULL) \
             (head)->sqh_last = &(curelm)->field.sqe_next; \
+        (elm)->field.sqe_next = NULL; \
     } \
 } while (/*CONSTCOND*/0)

@@ -XXX,XX +XXX,XX @@ union { \
     (head)->tqh_circ.tql_prev = (elm)->field.tqe_circ.tql_prev; \
     (elm)->field.tqe_circ.tql_prev->tql_next = (elm)->field.tqe_next; \
     (elm)->field.tqe_circ.tql_prev = NULL; \
+    (elm)->field.tqe_circ.tql_next = NULL; \
+    (elm)->field.tqe_next = NULL; \
 } while (/*CONSTCOND*/0)

 /* remove @left, @right and all elements in between from @head */
--
2.24.1

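[Editorial aside, not part of the patch: a minimal stand-alone sketch of why poisoning the link pointer helps. It uses a hand-rolled list rather than QEMU's queue.h macros; with the pointer cleared, a caller that keeps using the removed node dereferences NULL and crashes immediately instead of silently reaching live list nodes.]

```c
/* Illustrative sketch only; mirrors the new QSLIST_REMOVE_HEAD() behaviour. */
#include <stddef.h>
#include <stdio.h>

struct node {
    struct node *next;
    int value;
};

/* Remove the head and clear its link so use-after-removal faults at once. */
static struct node *remove_head(struct node **head)
{
    struct node *elm = *head;

    *head = elm->next;
    elm->next = NULL;   /* a stale pointer would otherwise still reach the list */
    return elm;
}

int main(void)
{
    struct node b = { NULL, 2 };
    struct node a = { &b, 1 };
    struct node *head = &a;
    struct node *removed = remove_head(&head);

    printf("removed %d, new head %d\n", removed->value, head->value);

    /* A buggy caller that keeps iterating from the removed node, e.g.
     *     printf("%d\n", removed->next->value);
     * now segfaults immediately instead of corrupting or re-walking the list.
     */
    return 0;
}
```
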
QLIST_SAFE_REMOVE() is confusing here because the node must be on the
list.  We actually just wanted to clear the linked list pointers when
removing it from the list.  QLIST_REMOVE() now does this, so switch to
it.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200224103406.1894923-3-stefanha@redhat.com
Message-Id: <20200224103406.1894923-3-stefanha@redhat.com>
---
 util/aio-posix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
     AioHandler *node;

     while ((node = QLIST_FIRST(ready_list))) {
-        QLIST_SAFE_REMOVE(node, node_ready);
+        QLIST_REMOVE(node, node_ready);
         progress = aio_dispatch_handler(ctx, node) || progress;
     }

--
2.24.1

One iteration of polling is always performed even when polling is
disabled.  This is done because:
1. Userspace polling is cheaper than making a syscall.  We might get
   lucky.
2. We must poll once more after polling has stopped in case an event
   occurred while stopping polling.

However, there are downsides:
1. Polling becomes a bottleneck when the number of event sources is very
   high.  It's more efficient to monitor fds in that case.
2. A high-frequency polling event source can starve non-polling event
   sources because ppoll(2)/epoll(7) is never invoked.

This patch removes the forced polling iteration so that poll_ns=0 really
means no polling.

IOPS increases from 10k to 60k when the guest has 100
virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1
device because the large number of event sources being polled slows down
the event loop.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-2-stefanha@redhat.com
Message-Id: <20200305170806.1313245-2-stefanha@redhat.com>
---
 util/aio-posix.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ void aio_set_event_notifier_poll(AioContext *ctx,
                                (IOHandler *)io_poll_end);
 }

-static void poll_set_started(AioContext *ctx, bool started)
+static bool poll_set_started(AioContext *ctx, bool started)
 {
     AioHandler *node;
+    bool progress = false;

     if (started == ctx->poll_started) {
-        return;
+        return false;
     }

     ctx->poll_started = started;
@@ -XXX,XX +XXX,XX @@ static void poll_set_started(AioContext *ctx, bool started)
         if (fn) {
             fn(node->opaque);
         }
+
+        /* Poll one last time in case ->io_poll_end() raced with the event */
+        if (!started) {
+            progress = node->io_poll(node->opaque) || progress;
+        }
     }
     qemu_lockcnt_dec(&ctx->list_lock);
+
+    return progress;
 }


@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
         }
     }

-    poll_set_started(ctx, false);
+    if (poll_set_started(ctx, false)) {
+        *timeout = 0;
+        return true;
+    }

-    /* Even if we don't run busy polling, try polling once in case it can make
-     * progress and the caller will be able to avoid ppoll(2)/epoll_wait(2).
-     */
-    return run_poll_handlers_once(ctx, timeout);
+    return false;
 }

 bool aio_poll(AioContext *ctx, bool blocking)
--
2.24.1

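[Editorial aside, not part of the patch: a hedged sketch of the "check once more after disabling polling" pattern that reason 2 above describes and that the new poll_set_started() return value implements. All names below are hypothetical, not QEMU APIs.]

```c
/* Sketch: after turning polling off, check one final time so an event that
 * arrived while polling was being stopped is not left waiting for the next
 * wakeup from ppoll(2)/epoll(7). */
#include <stdatomic.h>
#include <stdbool.h>

struct source {
    atomic_bool event_pending;
    bool polling_enabled;
};

/* Returns true if the final check found work, in which case the caller
 * should dispatch immediately instead of blocking in the fd monitor. */
static bool stop_polling(struct source *s)
{
    s->polling_enabled = false;

    /* This is the race window the patch closes by making poll_set_started()
     * report progress from its last ->io_poll() call. */
    return atomic_load(&s->event_pending);
}
```
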
Now that run_poll_handlers_once() is only called by run_poll_handlers()
we can improve the CPU time profile by moving the expensive
RCU_READ_LOCK() out of the polling loop.

This reduces run_poll_handlers() from 40% CPU to 10% CPU in perf's
sampling profiler output.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-3-stefanha@redhat.com
Message-Id: <20200305170806.1313245-3-stefanha@redhat.com>
---
 util/aio-posix.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
     bool progress = false;
     AioHandler *node;

-    /*
-     * Optimization: ->io_poll() handlers often contain RCU read critical
-     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
-     * -> rcu_read_lock() -> ... sequences with expensive memory
-     * synchronization primitives.  Make the entire polling loop an RCU
-     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
-     * are cheap.
-     */
-    RCU_READ_LOCK_GUARD();
-
     QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
         if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
             aio_node_check(ctx, node->is_external) &&
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)

     trace_run_poll_handlers_begin(ctx, max_ns, *timeout);

+    /*
+     * Optimization: ->io_poll() handlers often contain RCU read critical
+     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
+     * -> rcu_read_lock() -> ... sequences with expensive memory
+     * synchronization primitives.  Make the entire polling loop an RCU
+     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
+     * are cheap.
+     */
+    RCU_READ_LOCK_GUARD();
+
     start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     do {
         progress = run_poll_handlers_once(ctx, timeout);
--
2.24.1

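[Editorial aside, not part of the patch: the same hoisting pattern in plain C, using a pthread read-write lock as a stand-in for the RCU read lock. The point is only that the acquire/release cost is paid once per scan instead of once per handler.]

```c
/* Sketch: hoist lock acquisition out of a hot loop (rwlock stands in for RCU). */
#include <pthread.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

void scan(const int *items, int n, void (*handle)(int))
{
    /* Previously the read lock was taken and released inside the loop, once
     * per item; now the whole scan is a single read-side critical section. */
    pthread_rwlock_rdlock(&lock);
    for (int i = 0; i < n; i++) {
        handle(items[i]);
    }
    pthread_rwlock_unlock(&lock);
}
```
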
The ppoll(2) and epoll(7) file descriptor monitoring implementations are
mixed with the core util/aio-posix.c code.  Before adding another
implementation for Linux io_uring, extract out the existing ones so there
is a clear interface and the core code is simpler.

The new interface is AioContext->fdmon_ops, a pointer to a FDMonOps
struct.  See the patch for details.

Semantic changes:
1. ppoll(2) now reflects events from pollfds[] back into AioHandlers
   while we're still on the clock for adaptive polling.  This was
   already happening for epoll(7), so if it's really an issue then we'll
   need to fix both in the future.
2. epoll(7)'s fallback to ppoll(2) while external events are disabled
   was broken when the number of fds exceeded the epoll(7) upgrade
   threshold.  I guess this code path simply wasn't tested and no one
   noticed the bug.  I didn't go out of my way to fix it but the correct
   code is simpler than preserving the bug.

I also took some liberties in removing the unnecessary
AioContext->epoll_available (just check AioContext->epollfd != -1
instead) and AioContext->epoll_enabled (it's implicit if our
AioContext->fdmon_ops callbacks are being invoked) fields.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-4-stefanha@redhat.com
Message-Id: <20200305170806.1313245-4-stefanha@redhat.com>
---
 MAINTAINERS         |   2 +
 include/block/aio.h |  36 +++++-
 util/Makefile.objs  |   2 +
 util/aio-posix.c    | 286 ++------------------------------------------
 util/aio-posix.h    |  61 ++++++++
 util/fdmon-epoll.c  | 151 +++++++++++++++++++++++
 util/fdmon-poll.c   | 104 ++++++++++++++++
 7 files changed, 366 insertions(+), 276 deletions(-)
 create mode 100644 util/aio-posix.h
 create mode 100644 util/fdmon-epoll.c
 create mode 100644 util/fdmon-poll.c

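[Editorial aside before the diff, not part of the patch: a stand-alone sketch of how an event loop core dispatches through an fdmon-style ops table. The two callbacks mirror the FDMonOps update/wait interface described above, but every other name here is made up for the example.]

```c
/* Hypothetical sketch of an ops-table dispatch like AioContext->fdmon_ops. */
#include <stdint.h>
#include <stdio.h>

typedef struct Ctx Ctx;

typedef struct {
    void (*update)(Ctx *ctx, int fd, int events);   /* add/remove/modify an fd */
    int (*wait)(Ctx *ctx, int64_t timeout_ns);      /* block until fds are ready */
} Ops;

static void poll_update(Ctx *ctx, int fd, int events)
{
    /* nothing to cache for a poll(2)-style backend */
    (void)ctx; (void)fd; (void)events;
}

static int poll_wait(Ctx *ctx, int64_t timeout_ns)
{
    (void)ctx; (void)timeout_ns;
    return 0;   /* a real backend would call ppoll(2) or epoll_wait(2) here */
}

static const Ops poll_ops = { .update = poll_update, .wait = poll_wait };

struct Ctx {
    const Ops *ops;   /* the core no longer cares which backend is active */
};

int main(void)
{
    Ctx ctx = { .ops = &poll_ops };

    ctx.ops->update(&ctx, 0 /* fd */, 1 /* events */);
    printf("ready fds: %d\n", ctx.ops->wait(&ctx, 0));
    return 0;
}
```
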
diff --git a/MAINTAINERS b/MAINTAINERS
45
index XXXXXXX..XXXXXXX 100644
42
index XXXXXXX..XXXXXXX 100644
46
--- a/include/block/throttle-groups.h
43
--- a/MAINTAINERS
47
+++ b/include/block/throttle-groups.h
44
+++ b/MAINTAINERS
48
@@ -XXX,XX +XXX,XX @@ typedef struct ThrottleGroupMember {
45
@@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org
46
S: Supported
47
F: util/async.c
48
F: util/aio-*.c
49
+F: util/aio-*.h
50
+F: util/fdmon-*.c
51
F: block/io.c
52
F: migration/block*
53
F: include/block/aio.h
54
diff --git a/include/block/aio.h b/include/block/aio.h
55
index XXXXXXX..XXXXXXX 100644
56
--- a/include/block/aio.h
57
+++ b/include/block/aio.h
58
@@ -XXX,XX +XXX,XX @@ struct ThreadPool;
59
struct LinuxAioState;
60
struct LuringState;
61
62
+/* Callbacks for file descriptor monitoring implementations */
63
+typedef struct {
64
+ /*
65
+ * update:
66
+ * @ctx: the AioContext
67
+ * @node: the handler
68
+ * @is_new: is the file descriptor already being monitored?
69
+ *
70
+ * Add/remove/modify a monitored file descriptor. There are three cases:
71
+ * 1. node->pfd.events == 0 means remove the file descriptor.
72
+ * 2. !is_new means modify an already monitored file descriptor.
73
+ * 3. is_new means add a new file descriptor.
74
+ *
75
+ * Called with ctx->list_lock acquired.
76
+ */
77
+ void (*update)(AioContext *ctx, AioHandler *node, bool is_new);
78
+
79
+ /*
80
+ * wait:
81
+ * @ctx: the AioContext
82
+ * @ready_list: list for handlers that become ready
83
+ * @timeout: maximum duration to wait, in nanoseconds
84
+ *
85
+ * Wait for file descriptors to become ready and place them on ready_list.
86
+ *
87
+ * Called with ctx->list_lock incremented but not locked.
88
+ *
89
+ * Returns: number of ready file descriptors.
90
+ */
91
+ int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
92
+} FDMonOps;
93
+
94
/*
95
* Each aio_bh_poll() call carves off a slice of the BH list, so that newly
96
* scheduled BHs are not processed until the next aio_bh_poll() call. All
97
@@ -XXX,XX +XXX,XX @@ struct AioContext {
98
99
/* epoll(7) state used when built with CONFIG_EPOLL */
100
int epollfd;
101
- bool epoll_enabled;
102
- bool epoll_available;
103
+
104
+ const FDMonOps *fdmon_ops;
105
};
106
107
/**
108
diff --git a/util/Makefile.objs b/util/Makefile.objs
109
index XXXXXXX..XXXXXXX 100644
110
--- a/util/Makefile.objs
111
+++ b/util/Makefile.objs
112
@@ -XXX,XX +XXX,XX @@ util-obj-y += aiocb.o async.o aio-wait.o thread-pool.o qemu-timer.o
113
util-obj-y += main-loop.o
114
util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
115
util-obj-$(CONFIG_POSIX) += aio-posix.o
116
+util-obj-$(CONFIG_POSIX) += fdmon-poll.o
117
+util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o
118
util-obj-$(CONFIG_POSIX) += compatfd.o
119
util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
120
util-obj-$(CONFIG_POSIX) += mmap-alloc.o
121
diff --git a/util/aio-posix.c b/util/aio-posix.c
122
index XXXXXXX..XXXXXXX 100644
123
--- a/util/aio-posix.c
124
+++ b/util/aio-posix.c
125
@@ -XXX,XX +XXX,XX @@
126
#include "qemu/sockets.h"
127
#include "qemu/cutils.h"
128
#include "trace.h"
129
-#ifdef CONFIG_EPOLL_CREATE1
130
-#include <sys/epoll.h>
131
-#endif
132
+#include "aio-posix.h"
133
134
-struct AioHandler
135
-{
136
- GPollFD pfd;
137
- IOHandler *io_read;
138
- IOHandler *io_write;
139
- AioPollFn *io_poll;
140
- IOHandler *io_poll_begin;
141
- IOHandler *io_poll_end;
142
- void *opaque;
143
- bool is_external;
144
- QLIST_ENTRY(AioHandler) node;
145
- QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
146
- QLIST_ENTRY(AioHandler) node_deleted;
147
-};
148
-
149
-/* Add a handler to a ready list */
150
-static void add_ready_handler(AioHandlerList *ready_list,
151
- AioHandler *node,
152
- int revents)
153
+void aio_add_ready_handler(AioHandlerList *ready_list,
154
+ AioHandler *node,
155
+ int revents)
156
{
157
QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
158
node->pfd.revents = revents;
159
QLIST_INSERT_HEAD(ready_list, node, node_ready);
160
}
161
162
-#ifdef CONFIG_EPOLL_CREATE1
163
-
164
-/* The fd number threshold to switch to epoll */
165
-#define EPOLL_ENABLE_THRESHOLD 64
166
-
167
-static void aio_epoll_disable(AioContext *ctx)
168
-{
169
- ctx->epoll_enabled = false;
170
- if (!ctx->epoll_available) {
171
- return;
172
- }
173
- ctx->epoll_available = false;
174
- close(ctx->epollfd);
175
-}
176
-
177
-static inline int epoll_events_from_pfd(int pfd_events)
178
-{
179
- return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
180
- (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
181
- (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
182
- (pfd_events & G_IO_ERR ? EPOLLERR : 0);
183
-}
184
-
185
-static bool aio_epoll_try_enable(AioContext *ctx)
186
-{
187
- AioHandler *node;
188
- struct epoll_event event;
189
-
190
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
191
- int r;
192
- if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
193
- continue;
194
- }
195
- event.events = epoll_events_from_pfd(node->pfd.events);
196
- event.data.ptr = node;
197
- r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
198
- if (r) {
199
- return false;
200
- }
201
- }
202
- ctx->epoll_enabled = true;
203
- return true;
204
-}
205
-
206
-static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
207
-{
208
- struct epoll_event event;
209
- int r;
210
- int ctl;
211
-
212
- if (!ctx->epoll_enabled) {
213
- return;
214
- }
215
- if (!node->pfd.events) {
216
- ctl = EPOLL_CTL_DEL;
217
- } else {
218
- event.data.ptr = node;
219
- event.events = epoll_events_from_pfd(node->pfd.events);
220
- ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
221
- }
222
-
223
- r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
224
- if (r) {
225
- aio_epoll_disable(ctx);
226
- }
227
-}
228
-
229
-static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
230
- int64_t timeout)
231
-{
232
- GPollFD pfd = {
233
- .fd = ctx->epollfd,
234
- .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
235
- };
236
- AioHandler *node;
237
- int i, ret = 0;
238
- struct epoll_event events[128];
239
-
240
- if (timeout > 0) {
241
- ret = qemu_poll_ns(&pfd, 1, timeout);
242
- if (ret > 0) {
243
- timeout = 0;
244
- }
245
- }
246
- if (timeout <= 0 || ret > 0) {
247
- ret = epoll_wait(ctx->epollfd, events,
248
- ARRAY_SIZE(events),
249
- timeout);
250
- if (ret <= 0) {
251
- goto out;
252
- }
253
- for (i = 0; i < ret; i++) {
254
- int ev = events[i].events;
255
- int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
256
- (ev & EPOLLOUT ? G_IO_OUT : 0) |
257
- (ev & EPOLLHUP ? G_IO_HUP : 0) |
258
- (ev & EPOLLERR ? G_IO_ERR : 0);
259
-
260
- node = events[i].data.ptr;
261
- add_ready_handler(ready_list, node, revents);
262
- }
263
- }
264
-out:
265
- return ret;
266
-}
267
-
268
-static bool aio_epoll_enabled(AioContext *ctx)
269
-{
270
- /* Fall back to ppoll when external clients are disabled. */
271
- return !aio_external_disabled(ctx) && ctx->epoll_enabled;
272
-}
273
-
274
-static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
275
- unsigned npfd, int64_t timeout)
276
-{
277
- if (!ctx->epoll_available) {
278
- return false;
279
- }
280
- if (aio_epoll_enabled(ctx)) {
281
- return true;
282
- }
283
- if (npfd >= EPOLL_ENABLE_THRESHOLD) {
284
- if (aio_epoll_try_enable(ctx)) {
285
- return true;
286
- } else {
287
- aio_epoll_disable(ctx);
288
- }
289
- }
290
- return false;
291
-}
292
-
293
-#else
294
-
295
-static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
296
-{
297
-}
298
-
299
-static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
300
- int64_t timeout)
301
-{
302
- assert(false);
303
-}
304
-
305
-static bool aio_epoll_enabled(AioContext *ctx)
306
-{
307
- return false;
308
-}
309
-
310
-static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
311
- unsigned npfd, int64_t timeout)
312
-{
313
- return false;
314
-}
315
-
316
-#endif
317
-
318
static AioHandler *find_aio_handler(AioContext *ctx, int fd)
319
{
320
AioHandler *node;
321
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
322
atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
323
324
if (new_node) {
325
- aio_epoll_update(ctx, new_node, is_new);
326
+ ctx->fdmon_ops->update(ctx, new_node, is_new);
327
} else if (node) {
328
/* Unregister deleted fd_handler */
329
- aio_epoll_update(ctx, node, false);
330
+ ctx->fdmon_ops->update(ctx, node, false);
331
}
332
qemu_lockcnt_unlock(&ctx->list_lock);
333
aio_notify(ctx);
334
@@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx)
335
timerlistgroup_run_timers(&ctx->tlg);
336
}
337
338
-/* These thread-local variables are used only in a small part of aio_poll
339
- * around the call to the poll() system call. In particular they are not
340
- * used while aio_poll is performing callbacks, which makes it much easier
341
- * to think about reentrancy!
342
- *
343
- * Stack-allocated arrays would be perfect but they have size limitations;
344
- * heap allocation is expensive enough that we want to reuse arrays across
345
- * calls to aio_poll(). And because poll() has to be called without holding
346
- * any lock, the arrays cannot be stored in AioContext. Thread-local data
347
- * has none of the disadvantages of these three options.
348
- */
349
-static __thread GPollFD *pollfds;
350
-static __thread AioHandler **nodes;
351
-static __thread unsigned npfd, nalloc;
352
-static __thread Notifier pollfds_cleanup_notifier;
353
-
354
-static void pollfds_cleanup(Notifier *n, void *unused)
355
-{
356
- g_assert(npfd == 0);
357
- g_free(pollfds);
358
- g_free(nodes);
359
- nalloc = 0;
360
-}
361
-
362
-static void add_pollfd(AioHandler *node)
363
-{
364
- if (npfd == nalloc) {
365
- if (nalloc == 0) {
366
- pollfds_cleanup_notifier.notify = pollfds_cleanup;
367
- qemu_thread_atexit_add(&pollfds_cleanup_notifier);
368
- nalloc = 8;
369
- } else {
370
- g_assert(nalloc <= INT_MAX);
371
- nalloc *= 2;
372
- }
373
- pollfds = g_renew(GPollFD, pollfds, nalloc);
374
- nodes = g_renew(AioHandler *, nodes, nalloc);
375
- }
376
- nodes[npfd] = node;
377
- pollfds[npfd] = (GPollFD) {
378
- .fd = node->pfd.fd,
379
- .events = node->pfd.events,
380
- };
381
- npfd++;
382
-}
383
-
384
static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
385
{
386
bool progress = false;
387
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
388
bool aio_poll(AioContext *ctx, bool blocking)
389
{
390
AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
391
- AioHandler *node;
392
- int i;
393
int ret = 0;
394
bool progress;
395
int64_t timeout;
396
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
397
* system call---a single round of run_poll_handlers_once suffices.
49
*/
398
*/
50
unsigned int io_limits_disabled;
399
if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
51
400
- assert(npfd == 0);
52
+ /* Number of pending throttle_group_restart_queue_entry() coroutines.
401
-
53
+ * Accessed with atomic operations.
402
- /* fill pollfds */
54
+ */
403
-
55
+ unsigned int restart_pending;
404
- if (!aio_epoll_enabled(ctx)) {
56
+
405
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
57
/* The following fields are protected by the ThrottleGroup lock.
406
- if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
58
* See the ThrottleGroup documentation for details.
407
- && aio_node_check(ctx, node->is_external)) {
59
* throttle_state tells us if I/O limits are configured. */
408
- add_pollfd(node);
60
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
409
- }
61
index XXXXXXX..XXXXXXX 100644
410
- }
62
--- a/block/throttle-groups.c
411
- }
63
+++ b/block/throttle-groups.c
412
-
64
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn throttle_group_restart_queue_entry(void *opaque)
413
- /* wait until next event */
414
- if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
415
- npfd = 0; /* pollfds[] is not being used */
416
- ret = aio_epoll(ctx, &ready_list, timeout);
417
- } else {
418
- ret = qemu_poll_ns(pollfds, npfd, timeout);
419
- }
420
+ ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
65
}
421
}
66
422
67
g_free(data);
423
if (blocking) {
68
+
424
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
69
+ atomic_dec(&tgm->restart_pending);
425
}
70
+ aio_wait_kick();
426
}
427
428
- /* if we have any readable fds, dispatch event */
429
- if (ret > 0) {
430
- for (i = 0; i < npfd; i++) {
431
- int revents = pollfds[i].revents;
432
-
433
- if (revents) {
434
- add_ready_handler(&ready_list, nodes[i], revents);
435
- }
436
- }
437
- }
438
-
439
- npfd = 0;
440
-
441
progress |= aio_bh_poll(ctx);
442
443
if (ret > 0) {
444
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
445
446
void aio_context_setup(AioContext *ctx)
447
{
448
-#ifdef CONFIG_EPOLL_CREATE1
449
- assert(!ctx->epollfd);
450
- ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
451
- if (ctx->epollfd == -1) {
452
- fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
453
- ctx->epoll_available = false;
454
- } else {
455
- ctx->epoll_available = true;
456
- }
457
-#endif
458
+ ctx->fdmon_ops = &fdmon_poll_ops;
459
+ ctx->epollfd = -1;
460
+
461
+ fdmon_epoll_setup(ctx);
71
}
462
}
72
463
73
static void throttle_group_restart_queue(ThrottleGroupMember *tgm, bool is_write)
464
void aio_context_destroy(AioContext *ctx)
74
@@ -XXX,XX +XXX,XX @@ static void throttle_group_restart_queue(ThrottleGroupMember *tgm, bool is_write
465
{
75
* be no timer pending on this tgm at this point */
466
-#ifdef CONFIG_EPOLL_CREATE1
76
assert(!timer_pending(tgm->throttle_timers.timers[is_write]));
467
- aio_epoll_disable(ctx);
77
468
-#endif
78
+ atomic_inc(&tgm->restart_pending);
469
+ fdmon_epoll_disable(ctx);
79
+
80
co = qemu_coroutine_create(throttle_group_restart_queue_entry, rd);
81
aio_co_enter(tgm->aio_context, co);
82
}
470
}
83
@@ -XXX,XX +XXX,XX @@ void throttle_group_register_tgm(ThrottleGroupMember *tgm,
471
84
472
void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
85
tgm->throttle_state = ts;
473
diff --git a/util/aio-posix.h b/util/aio-posix.h
86
tgm->aio_context = ctx;
474
new file mode 100644
87
+ atomic_set(&tgm->restart_pending, 0);
475
index XXXXXXX..XXXXXXX
88
476
--- /dev/null
89
qemu_mutex_lock(&tg->lock);
477
+++ b/util/aio-posix.h
90
/* If the ThrottleGroup is new set this ThrottleGroupMember as the token */
478
@@ -XXX,XX +XXX,XX @@
91
@@ -XXX,XX +XXX,XX @@ void throttle_group_unregister_tgm(ThrottleGroupMember *tgm)
479
+/*
92
return;
480
+ * AioContext POSIX event loop implementation internal APIs
93
}
481
+ *
94
482
+ * Copyright IBM, Corp. 2008
95
+ /* Wait for throttle_group_restart_queue_entry() coroutines to finish */
483
+ * Copyright Red Hat, Inc. 2020
96
+ AIO_WAIT_WHILE(tgm->aio_context, atomic_read(&tgm->restart_pending) > 0);
484
+ *
97
+
485
+ * Authors:
98
qemu_mutex_lock(&tg->lock);
486
+ * Anthony Liguori <aliguori@us.ibm.com>
99
for (i = 0; i < 2; i++) {
487
+ *
100
assert(tgm->pending_reqs[i] == 0);
488
+ * This work is licensed under the terms of the GNU GPL, version 2. See
489
+ * the COPYING file in the top-level directory.
490
+ *
491
+ * Contributions after 2012-01-13 are licensed under the terms of the
492
+ * GNU GPL, version 2 or (at your option) any later version.
493
+ */
494
+
495
+#ifndef AIO_POSIX_H
496
+#define AIO_POSIX_H
497
+
498
+#include "block/aio.h"
499
+
500
+struct AioHandler {
501
+ GPollFD pfd;
502
+ IOHandler *io_read;
503
+ IOHandler *io_write;
504
+ AioPollFn *io_poll;
505
+ IOHandler *io_poll_begin;
506
+ IOHandler *io_poll_end;
507
+ void *opaque;
508
+ bool is_external;
509
+ QLIST_ENTRY(AioHandler) node;
510
+ QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
511
+ QLIST_ENTRY(AioHandler) node_deleted;
512
+};
513
+
514
+/* Add a handler to a ready list */
515
+void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node,
516
+ int revents);
517
+
518
+extern const FDMonOps fdmon_poll_ops;
519
+
520
+#ifdef CONFIG_EPOLL_CREATE1
521
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd);
522
+void fdmon_epoll_setup(AioContext *ctx);
523
+void fdmon_epoll_disable(AioContext *ctx);
524
+#else
525
+static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
526
+{
527
+ return false;
528
+}
529
+
530
+static inline void fdmon_epoll_setup(AioContext *ctx)
531
+{
532
+}
533
+
534
+static inline void fdmon_epoll_disable(AioContext *ctx)
535
+{
536
+}
537
+#endif /* !CONFIG_EPOLL_CREATE1 */
538
+
539
+#endif /* AIO_POSIX_H */
540
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
541
new file mode 100644
542
index XXXXXXX..XXXXXXX
543
--- /dev/null
544
+++ b/util/fdmon-epoll.c
545
@@ -XXX,XX +XXX,XX @@
546
+/* SPDX-License-Identifier: GPL-2.0-or-later */
547
+/*
548
+ * epoll(7) file descriptor monitoring
549
+ */
550
+
551
+#include "qemu/osdep.h"
552
+#include <sys/epoll.h>
553
+#include "qemu/rcu_queue.h"
554
+#include "aio-posix.h"
555
+
556
+/* The fd number threshold to switch to epoll */
557
+#define EPOLL_ENABLE_THRESHOLD 64
558
+
559
+void fdmon_epoll_disable(AioContext *ctx)
560
+{
561
+ if (ctx->epollfd >= 0) {
562
+ close(ctx->epollfd);
563
+ ctx->epollfd = -1;
564
+ }
565
+
566
+ /* Switch back */
567
+ ctx->fdmon_ops = &fdmon_poll_ops;
568
+}
569
+
570
+static inline int epoll_events_from_pfd(int pfd_events)
571
+{
572
+ return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
573
+ (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
574
+ (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
575
+ (pfd_events & G_IO_ERR ? EPOLLERR : 0);
576
+}
577
+
578
+static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
579
+{
580
+ struct epoll_event event;
581
+ int r;
582
+ int ctl;
583
+
584
+ if (!node->pfd.events) {
585
+ ctl = EPOLL_CTL_DEL;
586
+ } else {
587
+ event.data.ptr = node;
588
+ event.events = epoll_events_from_pfd(node->pfd.events);
589
+ ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
590
+ }
591
+
592
+ r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
593
+ if (r) {
594
+ fdmon_epoll_disable(ctx);
595
+ }
596
+}
597
+
598
+static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list,
599
+ int64_t timeout)
600
+{
601
+ GPollFD pfd = {
602
+ .fd = ctx->epollfd,
603
+ .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
604
+ };
605
+ AioHandler *node;
606
+ int i, ret = 0;
607
+ struct epoll_event events[128];
608
+
609
+ /* Fall back while external clients are disabled */
610
+ if (atomic_read(&ctx->external_disable_cnt)) {
611
+ return fdmon_poll_ops.wait(ctx, ready_list, timeout);
612
+ }
613
+
614
+ if (timeout > 0) {
615
+ ret = qemu_poll_ns(&pfd, 1, timeout);
616
+ if (ret > 0) {
617
+ timeout = 0;
618
+ }
619
+ }
620
+ if (timeout <= 0 || ret > 0) {
621
+ ret = epoll_wait(ctx->epollfd, events,
622
+ ARRAY_SIZE(events),
623
+ timeout);
624
+ if (ret <= 0) {
625
+ goto out;
626
+ }
627
+ for (i = 0; i < ret; i++) {
628
+ int ev = events[i].events;
629
+ int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
630
+ (ev & EPOLLOUT ? G_IO_OUT : 0) |
631
+ (ev & EPOLLHUP ? G_IO_HUP : 0) |
632
+ (ev & EPOLLERR ? G_IO_ERR : 0);
633
+
634
+ node = events[i].data.ptr;
635
+ aio_add_ready_handler(ready_list, node, revents);
636
+ }
637
+ }
638
+out:
639
+ return ret;
640
+}
641
+
642
+static const FDMonOps fdmon_epoll_ops = {
643
+ .update = fdmon_epoll_update,
644
+ .wait = fdmon_epoll_wait,
645
+};
646
+
647
+static bool fdmon_epoll_try_enable(AioContext *ctx)
648
+{
649
+ AioHandler *node;
650
+ struct epoll_event event;
651
+
652
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
653
+ int r;
654
+ if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
655
+ continue;
656
+ }
657
+ event.events = epoll_events_from_pfd(node->pfd.events);
658
+ event.data.ptr = node;
659
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
660
+ if (r) {
661
+ return false;
662
+ }
663
+ }
664
+
665
+ ctx->fdmon_ops = &fdmon_epoll_ops;
666
+ return true;
667
+}
668
+
669
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
670
+{
671
+ if (ctx->epollfd < 0) {
672
+ return false;
673
+ }
674
+
675
+ /* Do not upgrade while external clients are disabled */
676
+ if (atomic_read(&ctx->external_disable_cnt)) {
677
+ return false;
678
+ }
679
+
680
+ if (npfd >= EPOLL_ENABLE_THRESHOLD) {
681
+ if (fdmon_epoll_try_enable(ctx)) {
682
+ return true;
683
+ } else {
684
+ fdmon_epoll_disable(ctx);
685
+ }
686
+ }
687
+ return false;
688
+}
689
+
690
+void fdmon_epoll_setup(AioContext *ctx)
691
+{
692
+ ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
693
+ if (ctx->epollfd == -1) {
694
+ fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
695
+ }
696
+}
697
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
698
new file mode 100644
699
index XXXXXXX..XXXXXXX
700
--- /dev/null
701
+++ b/util/fdmon-poll.c
702
@@ -XXX,XX +XXX,XX @@
703
+/* SPDX-License-Identifier: GPL-2.0-or-later */
704
+/*
705
+ * poll(2) file descriptor monitoring
706
+ *
707
+ * Uses ppoll(2) when available, g_poll() otherwise.
708
+ */
709
+
710
+#include "qemu/osdep.h"
711
+#include "aio-posix.h"
712
+#include "qemu/rcu_queue.h"
713
+
714
+/*
715
+ * These thread-local variables are used only in fdmon_poll_wait() around the
716
+ * call to the poll() system call. In particular they are not used while
717
+ * aio_poll is performing callbacks, which makes it much easier to think about
718
+ * reentrancy!
719
+ *
720
+ * Stack-allocated arrays would be perfect but they have size limitations;
721
+ * heap allocation is expensive enough that we want to reuse arrays across
722
+ * calls to aio_poll(). And because poll() has to be called without holding
723
+ * any lock, the arrays cannot be stored in AioContext. Thread-local data
724
+ * has none of the disadvantages of these three options.
725
+ */
726
+static __thread GPollFD *pollfds;
727
+static __thread AioHandler **nodes;
728
+static __thread unsigned npfd, nalloc;
729
+static __thread Notifier pollfds_cleanup_notifier;
730
+
731
+static void pollfds_cleanup(Notifier *n, void *unused)
732
+{
733
+ g_assert(npfd == 0);
734
+ g_free(pollfds);
735
+ g_free(nodes);
736
+ nalloc = 0;
737
+}
738
+
739
+static void add_pollfd(AioHandler *node)
740
+{
741
+ if (npfd == nalloc) {
742
+ if (nalloc == 0) {
743
+ pollfds_cleanup_notifier.notify = pollfds_cleanup;
744
+ qemu_thread_atexit_add(&pollfds_cleanup_notifier);
745
+ nalloc = 8;
746
+ } else {
747
+ g_assert(nalloc <= INT_MAX);
748
+ nalloc *= 2;
749
+ }
750
+ pollfds = g_renew(GPollFD, pollfds, nalloc);
751
+ nodes = g_renew(AioHandler *, nodes, nalloc);
752
+ }
753
+ nodes[npfd] = node;
754
+ pollfds[npfd] = (GPollFD) {
755
+ .fd = node->pfd.fd,
756
+ .events = node->pfd.events,
757
+ };
758
+ npfd++;
759
+}
760
+
761
+static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
762
+ int64_t timeout)
763
+{
764
+ AioHandler *node;
765
+ int ret;
766
+
767
+ assert(npfd == 0);
768
+
769
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
770
+ if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
771
+ && aio_node_check(ctx, node->is_external)) {
772
+ add_pollfd(node);
773
+ }
774
+ }
775
+
776
+ /* epoll(7) is faster above a certain number of fds */
777
+ if (fdmon_epoll_try_upgrade(ctx, npfd)) {
778
+ return ctx->fdmon_ops->wait(ctx, ready_list, timeout);
779
+ }
780
+
781
+ ret = qemu_poll_ns(pollfds, npfd, timeout);
782
+ if (ret > 0) {
783
+ int i;
784
+
785
+ for (i = 0; i < npfd; i++) {
786
+ int revents = pollfds[i].revents;
787
+
788
+ if (revents) {
789
+ aio_add_ready_handler(ready_list, nodes[i], revents);
790
+ }
791
+ }
792
+ }
793
+
794
+ npfd = 0;
795
+ return ret;
796
+}
797
+
798
+static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new)
799
+{
800
+ /* Do nothing, AioHandler already contains the state we'll need */
801
+}
802
+
803
+const FDMonOps fdmon_poll_ops = {
804
+ .update = fdmon_poll_update,
805
+ .wait = fdmon_poll_wait,
806
+};
--
2.24.1

The AioHandler *node, bool is_new arguments are more complicated to
think about than simply being given AioHandler *old_node, AioHandler
*new_node.

Furthermore, the new Linux io_uring file descriptor monitoring mechanism
added by the next patch requires access to both the old and the new
nodes.  Make this change now in preparation.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-5-stefanha@redhat.com
Message-Id: <20200305170806.1313245-5-stefanha@redhat.com>
---
 include/block/aio.h | 13 ++++++-------
 util/aio-posix.c    |  7 +------
 util/fdmon-epoll.c  | 21 ++++++++++++---------
 util/fdmon-poll.c   |  4 +++-
 4 files changed, 22 insertions(+), 23 deletions(-)

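[Editorial aside before the diff, not part of the patch: a short sketch of how the three update cases map onto the new (old_node, new_node) prototype. The classification matches what fdmon_epoll_update() does in the diff below; the helper itself is hypothetical.]

```c
/* Sketch: old_node/new_node encode add, modify and remove without an is_new flag. */
#include <stddef.h>

enum op { OP_ADD, OP_MOD, OP_DEL };

static enum op classify(const void *old_node, const void *new_node)
{
    if (!new_node) {
        return OP_DEL;   /* the fd is being removed */
    } else if (!old_node) {
        return OP_ADD;   /* the fd is monitored for the first time */
    } else {
        return OP_MOD;   /* the poll mask of an existing fd changed */
    }
}
```
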
diff --git a/include/block/aio.h b/include/block/aio.h
13
index XXXXXXX..XXXXXXX 100644
20
index XXXXXXX..XXXXXXX 100644
14
--- a/util/qemu-coroutine-sleep.c
21
--- a/include/block/aio.h
15
+++ b/util/qemu-coroutine-sleep.c
22
+++ b/include/block/aio.h
16
@@ -XXX,XX +XXX,XX @@
23
@@ -XXX,XX +XXX,XX @@ typedef struct {
17
#include "qemu/timer.h"
24
/*
18
#include "block/aio.h"
25
* update:
19
26
* @ctx: the AioContext
20
-typedef struct CoSleepCB {
27
- * @node: the handler
21
- QEMUTimer *ts;
28
- * @is_new: is the file descriptor already being monitored?
22
- Coroutine *co;
29
+ * @old_node: the existing handler or NULL if this file descriptor is being
23
-} CoSleepCB;
30
+ * monitored for the first time
24
-
31
+ * @new_node: the new handler or NULL if this file descriptor is being
25
static void co_sleep_cb(void *opaque)
32
+ * removed
33
*
34
- * Add/remove/modify a monitored file descriptor. There are three cases:
35
- * 1. node->pfd.events == 0 means remove the file descriptor.
36
- * 2. !is_new means modify an already monitored file descriptor.
37
- * 3. is_new means add a new file descriptor.
38
+ * Add/remove/modify a monitored file descriptor.
39
*
40
* Called with ctx->list_lock acquired.
41
*/
42
- void (*update)(AioContext *ctx, AioHandler *node, bool is_new);
43
+ void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
44
45
/*
46
* wait:
47
diff --git a/util/aio-posix.c b/util/aio-posix.c
48
index XXXXXXX..XXXXXXX 100644
49
--- a/util/aio-posix.c
50
+++ b/util/aio-posix.c
51
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
52
atomic_set(&ctx->poll_disable_cnt,
53
atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
54
55
- if (new_node) {
56
- ctx->fdmon_ops->update(ctx, new_node, is_new);
57
- } else if (node) {
58
- /* Unregister deleted fd_handler */
59
- ctx->fdmon_ops->update(ctx, node, false);
60
- }
61
+ ctx->fdmon_ops->update(ctx, node, new_node);
62
qemu_lockcnt_unlock(&ctx->list_lock);
63
aio_notify(ctx);
64
65
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
66
index XXXXXXX..XXXXXXX 100644
67
--- a/util/fdmon-epoll.c
68
+++ b/util/fdmon-epoll.c
69
@@ -XXX,XX +XXX,XX @@ static inline int epoll_events_from_pfd(int pfd_events)
70
(pfd_events & G_IO_ERR ? EPOLLERR : 0);
71
}
72
73
-static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
74
+static void fdmon_epoll_update(AioContext *ctx,
75
+ AioHandler *old_node,
76
+ AioHandler *new_node)
26
{
77
{
27
- CoSleepCB *sleep_cb = opaque;
78
- struct epoll_event event;
28
+ Coroutine *co = opaque;
79
+ struct epoll_event event = {
29
80
+ .data.ptr = new_node,
30
/* Write of schedule protected by barrier write in aio_co_schedule */
81
+ .events = new_node ? epoll_events_from_pfd(new_node->pfd.events) : 0,
31
- atomic_set(&sleep_cb->co->scheduled, NULL);
82
+ };
32
- aio_co_wake(sleep_cb->co);
83
int r;
33
+ atomic_set(&co->scheduled, NULL);
84
- int ctl;
34
+ aio_co_wake(co);
85
86
- if (!node->pfd.events) {
87
- ctl = EPOLL_CTL_DEL;
88
+ if (!new_node) {
89
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, &event);
90
+ } else if (!old_node) {
91
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, new_node->pfd.fd, &event);
92
} else {
93
- event.data.ptr = node;
94
- event.events = epoll_events_from_pfd(node->pfd.events);
95
- ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
96
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, new_node->pfd.fd, &event);
97
}
98
99
- r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
100
if (r) {
101
fdmon_epoll_disable(ctx);
102
}
103
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
104
index XXXXXXX..XXXXXXX 100644
105
--- a/util/fdmon-poll.c
106
+++ b/util/fdmon-poll.c
107
@@ -XXX,XX +XXX,XX @@ static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
108
return ret;
35
}
109
}
36
110
37
void coroutine_fn qemu_co_sleep_ns(QEMUClockType type, int64_t ns)
111
-static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new)
112
+static void fdmon_poll_update(AioContext *ctx,
113
+ AioHandler *old_node,
114
+ AioHandler *new_node)
38
{
115
{
39
AioContext *ctx = qemu_get_current_aio_context();
116
/* Do nothing, AioHandler already contains the state we'll need */
40
- CoSleepCB sleep_cb = {
41
- .co = qemu_coroutine_self(),
42
- };
43
+ QEMUTimer *ts;
44
+ Coroutine *co = qemu_coroutine_self();
45
46
- const char *scheduled = atomic_cmpxchg(&sleep_cb.co->scheduled, NULL,
47
- __func__);
48
+ const char *scheduled = atomic_cmpxchg(&co->scheduled, NULL, __func__);
49
if (scheduled) {
50
fprintf(stderr,
51
"%s: Co-routine was already scheduled in '%s'\n",
52
__func__, scheduled);
53
abort();
54
}
55
- sleep_cb.ts = aio_timer_new(ctx, type, SCALE_NS, co_sleep_cb, &sleep_cb);
56
- timer_mod(sleep_cb.ts, qemu_clock_get_ns(type) + ns);
57
+ ts = aio_timer_new(ctx, type, SCALE_NS, co_sleep_cb, co);
58
+ timer_mod(ts, qemu_clock_get_ns(type) + ns);
59
qemu_coroutine_yield();
60
- timer_del(sleep_cb.ts);
61
- timer_free(sleep_cb.ts);
62
+ timer_del(ts);
63
+ timer_free(ts);
64
}
117
}
65
--
118
--
66
2.20.1
119
2.24.1
67
120
68
diff view generated by jsdifflib
1
The recent Linux io_uring API has several advantages over ppoll(2) and
epoll(7).  Details are given in the source code.

Add an io_uring implementation and make it the default on Linux.
Performance is the same as with epoll(7) but later patches add
optimizations that take advantage of io_uring.

It is necessary to change how aio_set_fd_handler() deals with deleting
AioHandlers since removing monitored file descriptors is asynchronous in
io_uring.  fdmon_io_uring_remove() marks the AioHandler deleted and
aio_set_fd_handler() will let it handle deletion in that case.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-6-stefanha@redhat.com
Message-Id: <20200305170806.1313245-6-stefanha@redhat.com>
---
 configure             |   5 +
 include/block/aio.h   |   9 ++
 util/Makefile.objs    |   1 +
 util/aio-posix.c      |  20 ++-
 util/aio-posix.h      |  20 ++-
 util/fdmon-io_uring.c | 326 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 376 insertions(+), 5 deletions(-)
 create mode 100644 util/fdmon-io_uring.c

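[Editorial aside before the diff, not part of the patch: a minimal stand-alone liburing sketch of the IORING_OP_POLL_ADD mechanism the commit message refers to. One fd is registered for POLLIN and a single completion is harvested; error handling is abbreviated. Build with -luring.]

```c
/* Minimal poll-add sketch using liburing; monitors stdin for readability. */
#include <liburing.h>
#include <poll.h>
#include <stdio.h>

int main(void)
{
    struct io_uring ring;
    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;

    if (io_uring_queue_init(8, &ring, 0) < 0) {
        return 1;
    }

    /* IORING_OP_POLL_ADD: ask the kernel to report when fd 0 becomes readable */
    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_poll_add(sqe, 0 /* stdin */, POLLIN);
    io_uring_sqe_set_data(sqe, NULL);

    io_uring_submit(&ring);   /* submission is batched into a single syscall */

    if (io_uring_wait_cqe(&ring, &cqe) == 0) {
        /* cqe->res carries the returned poll events, like pollfd.revents */
        printf("poll events: 0x%x\n", cqe->res);
        io_uring_cqe_seen(&ring, cqe);
    }

    io_uring_queue_exit(&ring);
    return 0;
}
```
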
diff --git a/configure b/configure
21
new file mode 100755
27
index XXXXXXX..XXXXXXX 100755
22
index XXXXXXX..XXXXXXX
28
--- a/configure
23
--- /dev/null
29
+++ b/configure
24
+++ b/tests/qemu-iotests/238
30
@@ -XXX,XX +XXX,XX @@ if test "$linux_io_uring" != "no" ; then
31
linux_io_uring_cflags=$($pkg_config --cflags liburing)
32
linux_io_uring_libs=$($pkg_config --libs liburing)
33
linux_io_uring=yes
34
+
35
+ # io_uring is used in libqemuutil.a where per-file -libs variables are not
36
+ # seen by programs linking the archive. It's not ideal, but just add the
37
+ # library dependency globally.
38
+ LIBS="$linux_io_uring_libs $LIBS"
39
else
40
if test "$linux_io_uring" = "yes" ; then
41
feature_not_found "linux io_uring" "Install liburing devel"
42
diff --git a/include/block/aio.h b/include/block/aio.h
43
index XXXXXXX..XXXXXXX 100644
44
--- a/include/block/aio.h
45
+++ b/include/block/aio.h
25
@@ -XXX,XX +XXX,XX @@
46
@@ -XXX,XX +XXX,XX @@
26
+#!/usr/bin/env python
47
#ifndef QEMU_AIO_H
27
+#
48
#define QEMU_AIO_H
28
+# Regression test for throttle group member unregister segfault with iothread
49
29
+#
50
+#ifdef CONFIG_LINUX_IO_URING
30
+# Copyright (c) 2019 Red Hat, Inc.
51
+#include <liburing.h>
31
+#
52
+#endif
32
+# This program is free software; you can redistribute it and/or modify
53
#include "qemu/queue.h"
33
+# it under the terms of the GNU General Public License as published by
54
#include "qemu/event_notifier.h"
34
+# the Free Software Foundation; either version 2 of the License, or
55
#include "qemu/thread.h"
35
+# (at your option) any later version.
56
@@ -XXX,XX +XXX,XX @@ struct BHListSlice {
36
+#
57
QSIMPLEQ_ENTRY(BHListSlice) next;
37
+# This program is distributed in the hope that it will be useful,
58
};
38
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
59
39
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
60
+typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
40
+# GNU General Public License for more details.
61
+
41
+#
62
struct AioContext {
42
+# You should have received a copy of the GNU General Public License
63
GSource source;
43
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
64
44
+#
65
@@ -XXX,XX +XXX,XX @@ struct AioContext {
45
+
66
* locking.
46
+import sys
67
*/
47
+import os
68
struct LuringState *linux_io_uring;
48
+import iotests
69
+
49
+from iotests import log
70
+ /* State for file descriptor monitoring using Linux io_uring */
50
+
71
+ struct io_uring fdmon_io_uring;
51
+sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'scripts'))
72
+ AioHandlerSList submit_list;
52
+
73
#endif
53
+from qemu import QEMUMachine
74
54
+
75
/* TimerLists for calling timers - one per clock type. Has its own
55
+if iotests.qemu_default_machine == 's390-ccw-virtio':
76
diff --git a/util/Makefile.objs b/util/Makefile.objs
56
+ virtio_scsi_device = 'virtio-scsi-ccw'
77
index XXXXXXX..XXXXXXX 100644
57
+else:
78
--- a/util/Makefile.objs
58
+ virtio_scsi_device = 'virtio-scsi-pci'
79
+++ b/util/Makefile.objs
59
+
80
@@ -XXX,XX +XXX,XX @@ util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
60
+vm = QEMUMachine(iotests.qemu_prog)
81
util-obj-$(CONFIG_POSIX) += aio-posix.o
61
+vm.add_args('-machine', 'accel=kvm')
82
util-obj-$(CONFIG_POSIX) += fdmon-poll.o
62
+vm.launch()
83
util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o
63
+
84
+util-obj-$(CONFIG_LINUX_IO_URING) += fdmon-io_uring.o
64
+log(vm.qmp('blockdev-add', node_name='hd0', driver='null-co'))
85
util-obj-$(CONFIG_POSIX) += compatfd.o
65
+log(vm.qmp('object-add', qom_type='iothread', id='iothread0'))
86
util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
66
+log(vm.qmp('device_add', id='scsi0', driver=virtio_scsi_device, iothread='iothread0'))
87
util-obj-$(CONFIG_POSIX) += mmap-alloc.o
67
+log(vm.qmp('device_add', id='scsi-hd0', driver='scsi-hd', drive='hd0'))
88
diff --git a/util/aio-posix.c b/util/aio-posix.c
68
+log(vm.qmp('block_set_io_throttle', id='scsi-hd0', bps=0, bps_rd=0, bps_wr=0,
89
index XXXXXXX..XXXXXXX 100644
69
+ iops=1000, iops_rd=0, iops_wr=0, conv_keys=False))
90
--- a/util/aio-posix.c
70
+log(vm.qmp('device_del', id='scsi-hd0'))
91
+++ b/util/aio-posix.c
71
+
92
@@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
72
+vm.shutdown()
93
g_source_remove_poll(&ctx->source, &node->pfd);
73
diff --git a/tests/qemu-iotests/238.out b/tests/qemu-iotests/238.out
94
}
95
96
+ node->pfd.revents = 0;
97
+
98
+ /* If the fd monitor has already marked it deleted, leave it alone */
99
+ if (QLIST_IS_INSERTED(node, node_deleted)) {
100
+ return false;
101
+ }
102
+
103
/* If a read is in progress, just mark the node as deleted */
104
if (qemu_lockcnt_count(&ctx->list_lock)) {
105
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
106
- node->pfd.revents = 0;
107
return false;
108
}
109
/* Otherwise, delete it for real. We can't just mark it as
110
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
111
112
QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
113
}
114
- if (node) {
115
- deleted = aio_remove_fd_handler(ctx, node);
116
- }
117
118
/* No need to order poll_disable_cnt writes against other updates;
119
* the counter is only used to avoid wasting time and latency on
120
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
121
atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
122
123
ctx->fdmon_ops->update(ctx, node, new_node);
124
+ if (node) {
125
+ deleted = aio_remove_fd_handler(ctx, node);
126
+ }
127
qemu_lockcnt_unlock(&ctx->list_lock);
128
aio_notify(ctx);
129
130
@@ -XXX,XX +XXX,XX @@ void aio_context_setup(AioContext *ctx)
131
ctx->fdmon_ops = &fdmon_poll_ops;
132
ctx->epollfd = -1;
133
134
+ /* Use the fastest fd monitoring implementation if available */
135
+ if (fdmon_io_uring_setup(ctx)) {
136
+ return;
137
+ }
138
+
139
fdmon_epoll_setup(ctx);
140
}
141
142
void aio_context_destroy(AioContext *ctx)
143
{
144
+ fdmon_io_uring_destroy(ctx);
145
fdmon_epoll_disable(ctx);
146
}
147
148
diff --git a/util/aio-posix.h b/util/aio-posix.h
149
index XXXXXXX..XXXXXXX 100644
150
--- a/util/aio-posix.h
151
+++ b/util/aio-posix.h
152
@@ -XXX,XX +XXX,XX @@ struct AioHandler {
153
IOHandler *io_poll_begin;
154
IOHandler *io_poll_end;
155
void *opaque;
156
- bool is_external;
157
QLIST_ENTRY(AioHandler) node;
158
QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
159
QLIST_ENTRY(AioHandler) node_deleted;
160
+#ifdef CONFIG_LINUX_IO_URING
161
+ QSLIST_ENTRY(AioHandler) node_submitted;
162
+ unsigned flags; /* see fdmon-io_uring.c */
163
+#endif
164
+ bool is_external;
165
};
166
167
/* Add a handler to a ready list */
168
@@ -XXX,XX +XXX,XX @@ static inline void fdmon_epoll_disable(AioContext *ctx)
169
}
170
#endif /* !CONFIG_EPOLL_CREATE1 */
171
172
+#ifdef CONFIG_LINUX_IO_URING
173
+bool fdmon_io_uring_setup(AioContext *ctx);
174
+void fdmon_io_uring_destroy(AioContext *ctx);
175
+#else
176
+static inline bool fdmon_io_uring_setup(AioContext *ctx)
177
+{
178
+ return false;
179
+}
180
+
181
+static inline void fdmon_io_uring_destroy(AioContext *ctx)
182
+{
183
+}
184
+#endif /* !CONFIG_LINUX_IO_URING */
185
+
186
#endif /* AIO_POSIX_H */
187
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/util/fdmon-io_uring.c
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Linux io_uring file descriptor monitoring
+ *
+ * The Linux io_uring API supports file descriptor monitoring with a few
+ * advantages over existing APIs like poll(2) and epoll(7):
+ *
+ * 1. Userspace polling of events is possible because the completion queue (cq
+ *    ring) is shared between the kernel and userspace. This allows
+ *    applications that rely on userspace polling to also monitor file
+ *    descriptors in the same userspace polling loop.
+ *
+ * 2. Submission and completion is batched and done together in a single system
+ *    call. This minimizes the number of system calls.
+ *
+ * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than
+ *    poll(2).
+ *
+ * 4. Nanosecond timeouts are supported so it requires fewer syscalls than
+ *    epoll(7).
+ *
+ * This code only monitors file descriptors and does not do asynchronous disk
+ * I/O. Implementing disk I/O efficiently has other requirements and should
+ * use a separate io_uring so it does not make sense to unify the code.
+ *
+ * File descriptor monitoring is implemented using the following operations:
+ *
+ * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored.
+ * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When
+ *    the poll mask changes for a file descriptor it is first removed and then
+ *    re-added with the new poll mask, so this operation is also used as part
+ *    of modifying an existing monitored file descriptor.
+ * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait
+ *    for events. This operation self-cancels if another event completes
+ *    before the timeout.
+ *
+ * io_uring calls the submission queue the "sq ring" and the completion queue
+ * the "cq ring". Ring entries are called "sqe" and "cqe", respectively.
+ *
+ * The code is structured so that sq/cq rings are only modified within
+ * fdmon_io_uring_wait(). Changes to AioHandlers are made by enqueuing them on
+ * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD
+ * and/or IORING_OP_POLL_REMOVE sqes for them.
+ */
+
238
+#include "qemu/osdep.h"
239
+#include <poll.h>
240
+#include "qemu/rcu_queue.h"
241
+#include "aio-posix.h"
242
+
243
+enum {
244
+ FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */
245
+
246
+ /* AioHandler::flags */
247
+ FDMON_IO_URING_PENDING = (1 << 0),
248
+ FDMON_IO_URING_ADD = (1 << 1),
249
+ FDMON_IO_URING_REMOVE = (1 << 2),
250
+};
251
+
252
+static inline int poll_events_from_pfd(int pfd_events)
253
+{
254
+ return (pfd_events & G_IO_IN ? POLLIN : 0) |
255
+ (pfd_events & G_IO_OUT ? POLLOUT : 0) |
256
+ (pfd_events & G_IO_HUP ? POLLHUP : 0) |
257
+ (pfd_events & G_IO_ERR ? POLLERR : 0);
258
+}
259
+
260
+static inline int pfd_events_from_poll(int poll_events)
261
+{
262
+ return (poll_events & POLLIN ? G_IO_IN : 0) |
263
+ (poll_events & POLLOUT ? G_IO_OUT : 0) |
264
+ (poll_events & POLLHUP ? G_IO_HUP : 0) |
265
+ (poll_events & POLLERR ? G_IO_ERR : 0);
266
+}
267
+
268
+/*
269
+ * Returns an sqe for submitting a request. Must only be called within
270
+ * fdmon_io_uring_wait().
271
+ */
272
+static struct io_uring_sqe *get_sqe(AioContext *ctx)
273
+{
274
+ struct io_uring *ring = &ctx->fdmon_io_uring;
275
+ struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
276
+ int ret;
277
+
278
+ if (likely(sqe)) {
279
+ return sqe;
280
+ }
281
+
282
+ /* No free sqes left, submit pending sqes first */
283
+ ret = io_uring_submit(ring);
284
+ assert(ret > 1);
285
+ sqe = io_uring_get_sqe(ring);
286
+ assert(sqe);
287
+ return sqe;
288
+}
289
+
290
+/* Atomically enqueue an AioHandler for sq ring submission */
291
+static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags)
292
+{
293
+ unsigned old_flags;
294
+
295
+ old_flags = atomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags);
296
+ if (!(old_flags & FDMON_IO_URING_PENDING)) {
297
+ QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted);
298
+ }
299
+}
300
+
301
+/* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */
302
+static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags)
303
+{
304
+ AioHandler *node = QSLIST_FIRST(head);
305
+
306
+ if (!node) {
307
+ return NULL;
308
+ }
309
+
310
+ /* Doesn't need to be atomic since fill_sq_ring() moves the list */
311
+ QSLIST_REMOVE_HEAD(head, node_submitted);
312
+
313
+ /*
314
+ * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two
315
+ * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
316
+ * telling process_cqe() to delete the AioHandler when its
317
+ * IORING_OP_POLL_ADD completes.
318
+ */
319
+ *flags = atomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING |
320
+ FDMON_IO_URING_ADD));
321
+ return node;
322
+}
323
+
324
+static void fdmon_io_uring_update(AioContext *ctx,
325
+ AioHandler *old_node,
326
+ AioHandler *new_node)
327
+{
328
+ if (new_node) {
329
+ enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD);
330
+ }
331
+
332
+ if (old_node) {
333
+ /*
334
+ * Deletion is tricky because IORING_OP_POLL_ADD and
335
+ * IORING_OP_POLL_REMOVE are async. We need to wait for the original
336
+ * IORING_OP_POLL_ADD to complete before this handler can be freed
337
+ * safely.
338
+ *
339
+ * It's possible that the file descriptor becomes ready and the
340
+ * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is
341
+ * submitted, too.
342
+ *
343
+ * Mark this handler deleted right now but don't place it on
344
+ * ctx->deleted_aio_handlers yet. Instead, manually fudge the list
345
+ * entry to make QLIST_IS_INSERTED() think this handler has been
346
+ * inserted and other code recognizes this AioHandler as deleted.
347
+ *
348
+ * Once the original IORING_OP_POLL_ADD completes we enqueue the
349
+ * handler on the real ctx->deleted_aio_handlers list to be freed.
350
+ */
351
+ assert(!QLIST_IS_INSERTED(old_node, node_deleted));
352
+ old_node->node_deleted.le_prev = &old_node->node_deleted.le_next;
353
+
354
+ enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE);
355
+ }
356
+}
357
+
358
+static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
359
+{
360
+ struct io_uring_sqe *sqe = get_sqe(ctx);
361
+ int events = poll_events_from_pfd(node->pfd.events);
362
+
363
+ io_uring_prep_poll_add(sqe, node->pfd.fd, events);
364
+ io_uring_sqe_set_data(sqe, node);
365
+}
366
+
367
+static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
368
+{
369
+ struct io_uring_sqe *sqe = get_sqe(ctx);
370
+
371
+ io_uring_prep_poll_remove(sqe, node);
372
+}
373
+
374
+/* Add a timeout that self-cancels when another cqe becomes ready */
375
+static void add_timeout_sqe(AioContext *ctx, int64_t ns)
376
+{
377
+ struct io_uring_sqe *sqe;
378
+ struct __kernel_timespec ts = {
379
+ .tv_sec = ns / NANOSECONDS_PER_SECOND,
380
+ .tv_nsec = ns % NANOSECONDS_PER_SECOND,
381
+ };
382
+
383
+ sqe = get_sqe(ctx);
384
+ io_uring_prep_timeout(sqe, &ts, 1, 0);
385
+}
386
+
387
+/* Add sqes from ctx->submit_list for submission */
388
+static void fill_sq_ring(AioContext *ctx)
389
+{
390
+ AioHandlerSList submit_list;
391
+ AioHandler *node;
392
+ unsigned flags;
393
+
394
+ QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list);
395
+
396
+ while ((node = dequeue(&submit_list, &flags))) {
397
+ /* Order matters, just in case both flags were set */
398
+ if (flags & FDMON_IO_URING_ADD) {
399
+ add_poll_add_sqe(ctx, node);
400
+ }
401
+ if (flags & FDMON_IO_URING_REMOVE) {
402
+ add_poll_remove_sqe(ctx, node);
403
+ }
404
+ }
405
+}
406
+
407
+/* Returns true if a handler became ready */
408
+static bool process_cqe(AioContext *ctx,
409
+ AioHandlerList *ready_list,
410
+ struct io_uring_cqe *cqe)
411
+{
412
+ AioHandler *node = io_uring_cqe_get_data(cqe);
413
+ unsigned flags;
414
+
415
+ /* poll_timeout and poll_remove have a zero user_data field */
416
+ if (!node) {
417
+ return false;
418
+ }
419
+
420
+ /*
421
+ * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
422
+ * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
423
+ * bit before IORING_OP_POLL_REMOVE is submitted.
424
+ */
425
+ flags = atomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
426
+ if (flags & FDMON_IO_URING_REMOVE) {
427
+ QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
428
+ return false;
429
+ }
430
+
431
+ aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));
432
+
433
+ /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
434
+ add_poll_add_sqe(ctx, node);
435
+ return true;
436
+}
437
+
438
+static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
439
+{
440
+ struct io_uring *ring = &ctx->fdmon_io_uring;
441
+ struct io_uring_cqe *cqe;
442
+ unsigned num_cqes = 0;
443
+ unsigned num_ready = 0;
444
+ unsigned head;
445
+
446
+ io_uring_for_each_cqe(ring, head, cqe) {
447
+ if (process_cqe(ctx, ready_list, cqe)) {
448
+ num_ready++;
449
+ }
450
+
451
+ num_cqes++;
452
+ }
453
+
454
+ io_uring_cq_advance(ring, num_cqes);
455
+ return num_ready;
456
+}
457
+
458
+static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
459
+ int64_t timeout)
460
+{
461
+ unsigned wait_nr = 1; /* block until at least one cqe is ready */
462
+ int ret;
463
+
464
+ /* Fall back while external clients are disabled */
465
+ if (atomic_read(&ctx->external_disable_cnt)) {
466
+ return fdmon_poll_ops.wait(ctx, ready_list, timeout);
467
+ }
468
+
469
+ if (timeout == 0) {
470
+ wait_nr = 0; /* non-blocking */
471
+ } else if (timeout > 0) {
472
+ add_timeout_sqe(ctx, timeout);
473
+ }
474
+
475
+ fill_sq_ring(ctx);
476
+
477
+ ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
478
+ assert(ret >= 0);
479
+
480
+ return process_cq_ring(ctx, ready_list);
481
+}
482
+
483
+static const FDMonOps fdmon_io_uring_ops = {
484
+ .update = fdmon_io_uring_update,
485
+ .wait = fdmon_io_uring_wait,
486
+};
487
+
488
+bool fdmon_io_uring_setup(AioContext *ctx)
489
+{
490
+ int ret;
491
+
492
+ ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
493
+ if (ret != 0) {
494
+ return false;
495
+ }
496
+
497
+ QSLIST_INIT(&ctx->submit_list);
498
+ ctx->fdmon_ops = &fdmon_io_uring_ops;
499
+ return true;
500
+}
501
+
502
+void fdmon_io_uring_destroy(AioContext *ctx)
503
+{
504
+ if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
505
+ AioHandler *node;
506
+
507
+ io_uring_queue_exit(&ctx->fdmon_io_uring);
508
+
509
+ /* No need to submit these anymore, just free them. */
510
+ while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
511
+ QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
512
+ QLIST_REMOVE(node, node);
513
+ g_free(node);
514
+ }
515
+
516
+ ctx->fdmon_ops = &fdmon_poll_ops;
517
+ }
518
+}
--
2.24.1
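
(For reviewers unfamiliar with io_uring's poll semantics: IORING_OP_POLL_ADD completes at most once, which is why process_cqe() above re-arms the request for every ready handler. The standalone sketch below demonstrates that one-shot behaviour with plain liburing. It is not part of this series; the file name and build command are only illustrative and assume liburing is installed.)

/* poll_once.c -- standalone demo of IORING_OP_POLL_ADD's one-shot semantics.
 * Illustrative only, not part of this series.  Assumes liburing is installed:
 *   gcc poll_once.c -o poll_once -luring
 */
#include <liburing.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    struct io_uring ring;
    char buf[256];
    int i;

    if (io_uring_queue_init(8, &ring, 0) < 0) {
        return 1;
    }

    /* Each IORING_OP_POLL_ADD completes at most once, so it must be
     * re-submitted after every completion -- the same reason process_cqe()
     * in fdmon-io_uring.c calls add_poll_add_sqe() again for ready handlers.
     */
    for (i = 0; i < 3; i++) {
        struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
        struct io_uring_cqe *cqe;

        io_uring_prep_poll_add(sqe, STDIN_FILENO, POLLIN);
        io_uring_sqe_set_data(sqe, NULL);
        io_uring_submit(&ring);

        if (io_uring_wait_cqe(&ring, &cqe) == 0) {
            printf("stdin ready, revents=0x%x\n", (unsigned)cqe->res);
            io_uring_cqe_seen(&ring, cqe);
        }

        /* Drain stdin so the next poll blocks until new input arrives */
        if (read(STDIN_FILENO, buf, sizeof(buf)) <= 0) {
            break;
        }
    }

    io_uring_queue_exit(&ring);
    return 0;
}
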
New patch
1
Unlike ppoll(2) and epoll(7), Linux io_uring completions can be polled
from userspace. Previously userspace polling was only allowed when all
AioHandlers had an ->io_poll() callback. This prevented starvation of
fds by userspace pollable handlers.
1
5
6
Add the FDMonOps->need_wait() callback that enables userspace polling
7
even when some AioHandlers lack ->io_poll().
8
9
For example, it's now possible to do userspace polling when a TCP/IP
10
socket is monitored thanks to Linux io_uring.
11
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Link: https://lore.kernel.org/r/20200305170806.1313245-7-stefanha@redhat.com
14
Message-Id: <20200305170806.1313245-7-stefanha@redhat.com>
15
---
16
include/block/aio.h | 19 +++++++++++++++++++
17
util/aio-posix.c | 11 ++++++++---
18
util/fdmon-epoll.c | 1 +
19
util/fdmon-io_uring.c | 6 ++++++
20
util/fdmon-poll.c | 1 +
21
5 files changed, 35 insertions(+), 3 deletions(-)
22
23
diff --git a/include/block/aio.h b/include/block/aio.h
24
index XXXXXXX..XXXXXXX 100644
25
--- a/include/block/aio.h
26
+++ b/include/block/aio.h
27
@@ -XXX,XX +XXX,XX @@ struct ThreadPool;
28
struct LinuxAioState;
29
struct LuringState;
30
31
+/* Is polling disabled? */
32
+bool aio_poll_disabled(AioContext *ctx);
33
+
34
/* Callbacks for file descriptor monitoring implementations */
35
typedef struct {
36
/*
37
@@ -XXX,XX +XXX,XX @@ typedef struct {
38
* Returns: number of ready file descriptors.
39
*/
40
int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
41
+
42
+ /*
43
+ * need_wait:
44
+ * @ctx: the AioContext
45
+ *
46
+ * Tell aio_poll() when to stop userspace polling early because ->wait()
47
+ * has fds ready.
48
+ *
49
+ * File descriptor monitoring implementations that cannot poll fd readiness
50
+ * from userspace should use aio_poll_disabled() here. This ensures that
51
+ * file descriptors are not starved by handlers that frequently make
52
+ * progress via userspace polling.
53
+ *
54
+ * Returns: true if ->wait() should be called, false otherwise.
55
+ */
56
+ bool (*need_wait)(AioContext *ctx);
57
} FDMonOps;
58
59
/*
60
diff --git a/util/aio-posix.c b/util/aio-posix.c
61
index XXXXXXX..XXXXXXX 100644
62
--- a/util/aio-posix.c
63
+++ b/util/aio-posix.c
64
@@ -XXX,XX +XXX,XX @@
65
#include "trace.h"
66
#include "aio-posix.h"
67
68
+bool aio_poll_disabled(AioContext *ctx)
69
+{
70
+ return atomic_read(&ctx->poll_disable_cnt);
71
+}
72
+
73
void aio_add_ready_handler(AioHandlerList *ready_list,
74
AioHandler *node,
75
int revents)
76
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
77
elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
78
max_ns = qemu_soonest_timeout(*timeout, max_ns);
79
assert(!(max_ns && progress));
80
- } while (elapsed_time < max_ns && !atomic_read(&ctx->poll_disable_cnt));
81
+ } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
82
83
/* If time has passed with no successful polling, adjust *timeout to
84
* keep the same ending time.
85
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
86
{
87
int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
88
89
- if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) {
90
+ if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
91
poll_set_started(ctx, true);
92
93
if (run_poll_handlers(ctx, max_ns, timeout)) {
94
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
95
/* If polling is allowed, non-blocking aio_poll does not need the
96
* system call---a single round of run_poll_handlers_once suffices.
97
*/
98
- if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
99
+ if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
100
ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
101
}
102
103
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
104
index XXXXXXX..XXXXXXX 100644
105
--- a/util/fdmon-epoll.c
106
+++ b/util/fdmon-epoll.c
107
@@ -XXX,XX +XXX,XX @@ out:
108
static const FDMonOps fdmon_epoll_ops = {
109
.update = fdmon_epoll_update,
110
.wait = fdmon_epoll_wait,
111
+ .need_wait = aio_poll_disabled,
112
};
113
114
static bool fdmon_epoll_try_enable(AioContext *ctx)
115
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
116
index XXXXXXX..XXXXXXX 100644
117
--- a/util/fdmon-io_uring.c
118
+++ b/util/fdmon-io_uring.c
119
@@ -XXX,XX +XXX,XX @@ static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
120
return process_cq_ring(ctx, ready_list);
121
}
122
123
+static bool fdmon_io_uring_need_wait(AioContext *ctx)
124
+{
125
+ return io_uring_cq_ready(&ctx->fdmon_io_uring);
126
+}
127
+
128
static const FDMonOps fdmon_io_uring_ops = {
129
.update = fdmon_io_uring_update,
130
.wait = fdmon_io_uring_wait,
131
+ .need_wait = fdmon_io_uring_need_wait,
132
};
133
134
bool fdmon_io_uring_setup(AioContext *ctx)
135
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
136
index XXXXXXX..XXXXXXX 100644
137
--- a/util/fdmon-poll.c
138
+++ b/util/fdmon-poll.c
139
@@ -XXX,XX +XXX,XX @@ static void fdmon_poll_update(AioContext *ctx,
140
const FDMonOps fdmon_poll_ops = {
141
.update = fdmon_poll_update,
142
.wait = fdmon_poll_wait,
143
+ .need_wait = aio_poll_disabled,
144
};
--
2.24.1
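
(The property that makes ->need_wait() worthwhile for io_uring is that completions are visible from userspace: io_uring_cq_ready() only reads the shared cq ring without entering the kernel, so the polling loop can cheaply check whether ->wait() has work to hand back. The standalone sketch below shows that property; it is not part of this series and assumes liburing is installed.)

/* cq_ready_demo.c -- userspace-visible completions, the basis of
 * fdmon_io_uring_need_wait().  Illustrative only; assumes liburing:
 *   gcc cq_ready_demo.c -o cq_ready_demo -luring
 */
#include <liburing.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    struct io_uring ring;
    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;

    if (io_uring_queue_init(8, &ring, 0) < 0) {
        return 1;
    }

    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_poll_add(sqe, STDIN_FILENO, POLLIN);
    io_uring_submit(&ring);

    /* Busy-wait in userspace: io_uring_cq_ready() just reads the shared cq
     * ring, no syscall is made.  This is what lets the AioContext polling
     * loop notice fd activity while it keeps running ->io_poll() handlers.
     */
    while (io_uring_cq_ready(&ring) == 0) {
        /* other userspace poll handlers would run here */
    }

    if (io_uring_peek_cqe(&ring, &cqe) == 0) {
        printf("fd became ready, revents=0x%x\n", (unsigned)cqe->res);
        io_uring_cqe_seen(&ring, cqe);
    }

    io_uring_queue_exit(&ring);
    return 0;
}
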
New patch
1
1
When there are many poll handlers it's likely that some of them are idle
most of the time. Remove handlers that haven't had activity recently so
that the polling loop scales better for guests with a large number of
devices.
5
6
This feature only takes effect for the Linux io_uring fd monitoring
7
implementation because it is capable of combining fd monitoring with
8
userspace polling. The other implementations can't do that and risk
9
starving fds in favor of poll handlers, so don't try this optimization
10
when they are in use.
11
12
IOPS improves from 10k to 105k when the guest has 100
13
virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1
14
device for rw=randread,iodepth=1,bs=4k,ioengine=libaio on NVMe.
15
16
[Clarified aio_poll_handlers locking discipline explanation in comment
17
after discussion with Paolo Bonzini <pbonzini@redhat.com>.
18
--Stefan]
19
20
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
21
Link: https://lore.kernel.org/r/20200305170806.1313245-8-stefanha@redhat.com
22
Message-Id: <20200305170806.1313245-8-stefanha@redhat.com>
23
---
24
include/block/aio.h | 8 ++++
25
util/aio-posix.c | 93 +++++++++++++++++++++++++++++++++++++++++----
26
util/aio-posix.h | 2 +
27
util/trace-events | 2 +
28
4 files changed, 98 insertions(+), 7 deletions(-)
29
30
diff --git a/include/block/aio.h b/include/block/aio.h
31
index XXXXXXX..XXXXXXX 100644
32
--- a/include/block/aio.h
33
+++ b/include/block/aio.h
34
@@ -XXX,XX +XXX,XX @@ struct AioContext {
35
int64_t poll_grow; /* polling time growth factor */
36
int64_t poll_shrink; /* polling time shrink factor */
37
38
+ /*
39
+ * List of handlers participating in userspace polling. Protected by
40
+ * ctx->list_lock. Iterated and modified mostly by the event loop thread
41
+ * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler()
42
+ * only touches the list to delete nodes if ctx->list_lock's count is zero.
43
+ */
44
+ AioHandlerList poll_aio_handlers;
45
+
46
/* Are we in polling mode or monitoring file descriptors? */
47
bool poll_started;
48
49
diff --git a/util/aio-posix.c b/util/aio-posix.c
50
index XXXXXXX..XXXXXXX 100644
51
--- a/util/aio-posix.c
52
+++ b/util/aio-posix.c
53
@@ -XXX,XX +XXX,XX @@
54
#include "trace.h"
55
#include "aio-posix.h"
56
57
+/* Stop userspace polling on a handler if it isn't active for some time */
58
+#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
59
+
60
bool aio_poll_disabled(AioContext *ctx)
61
{
62
return atomic_read(&ctx->poll_disable_cnt);
63
@@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
64
* deleted because deleted nodes are only cleaned up while
65
* no one is walking the handlers list.
66
*/
67
+ QLIST_SAFE_REMOVE(node, node_poll);
68
QLIST_REMOVE(node, node);
69
return true;
70
}
71
@@ -XXX,XX +XXX,XX @@ static bool poll_set_started(AioContext *ctx, bool started)
72
ctx->poll_started = started;
73
74
qemu_lockcnt_inc(&ctx->list_lock);
75
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
76
+ QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
77
IOHandler *fn;
78
79
if (QLIST_IS_INSERTED(node, node_deleted)) {
80
@@ -XXX,XX +XXX,XX @@ static void aio_free_deleted_handlers(AioContext *ctx)
81
while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
82
QLIST_REMOVE(node, node);
83
QLIST_REMOVE(node, node_deleted);
84
+ QLIST_SAFE_REMOVE(node, node_poll);
85
g_free(node);
86
}
87
88
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
89
revents = node->pfd.revents & node->pfd.events;
90
node->pfd.revents = 0;
91
92
+ /*
93
+ * Start polling AioHandlers when they become ready because activity is
94
+ * likely to continue. Note that starvation is theoretically possible when
95
+ * fdmon_supports_polling(), but only until the fd fires for the first
96
+ * time.
97
+ */
98
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
99
+ !QLIST_IS_INSERTED(node, node_poll) &&
100
+ node->io_poll) {
101
+ trace_poll_add(ctx, node, node->pfd.fd, revents);
102
+ if (ctx->poll_started && node->io_poll_begin) {
103
+ node->io_poll_begin(node->opaque);
104
+ }
105
+ QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
106
+ }
107
+
108
if (!QLIST_IS_INSERTED(node, node_deleted) &&
109
(revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
110
aio_node_check(ctx, node->is_external) &&
111
@@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx)
112
timerlistgroup_run_timers(&ctx->tlg);
113
}
114
115
-static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
116
+static bool run_poll_handlers_once(AioContext *ctx,
117
+ int64_t now,
118
+ int64_t *timeout)
119
{
120
bool progress = false;
121
AioHandler *node;
122
+ AioHandler *tmp;
123
124
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
125
- if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
126
- aio_node_check(ctx, node->is_external) &&
127
+ QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
128
+ if (aio_node_check(ctx, node->is_external) &&
129
node->io_poll(node->opaque)) {
130
+ node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
131
+
132
/*
133
* Polling was successful, exit try_poll_mode immediately
134
* to adjust the next polling time.
135
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
136
return progress;
137
}
138
139
+static bool fdmon_supports_polling(AioContext *ctx)
140
+{
141
+ return ctx->fdmon_ops->need_wait != aio_poll_disabled;
142
+}
143
+
144
+static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now)
145
+{
146
+ AioHandler *node;
147
+ AioHandler *tmp;
148
+ bool progress = false;
149
+
150
+ /*
151
+ * File descriptor monitoring implementations without userspace polling
152
+ * support suffer from starvation when a subset of handlers is polled
153
+ * because fds will not be processed in a timely fashion. Don't remove
154
+ * idle poll handlers.
155
+ */
156
+ if (!fdmon_supports_polling(ctx)) {
157
+ return false;
158
+ }
159
+
160
+ QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
161
+ if (node->poll_idle_timeout == 0LL) {
162
+ node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
163
+ } else if (now >= node->poll_idle_timeout) {
164
+ trace_poll_remove(ctx, node, node->pfd.fd);
165
+ node->poll_idle_timeout = 0LL;
166
+ QLIST_SAFE_REMOVE(node, node_poll);
167
+ if (ctx->poll_started && node->io_poll_end) {
168
+ node->io_poll_end(node->opaque);
169
+
170
+ /*
171
+ * Final poll in case ->io_poll_end() races with an event.
172
+ * Nevermind about re-adding the handler in the rare case where
173
+ * this causes progress.
174
+ */
175
+ progress = node->io_poll(node->opaque) || progress;
176
+ }
177
+ }
178
+ }
179
+
180
+ return progress;
181
+}
182
+
183
/* run_poll_handlers:
184
* @ctx: the AioContext
185
* @max_ns: maximum time to poll for, in nanoseconds
186
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
187
188
start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
189
do {
190
- progress = run_poll_handlers_once(ctx, timeout);
191
+ progress = run_poll_handlers_once(ctx, start_time, timeout);
192
elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
193
max_ns = qemu_soonest_timeout(*timeout, max_ns);
194
assert(!(max_ns && progress));
195
} while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
196
197
+ if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) {
198
+ *timeout = 0;
199
+ progress = true;
200
+ }
201
+
202
/* If time has passed with no successful polling, adjust *timeout to
203
* keep the same ending time.
204
*/
205
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
206
*/
207
static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
208
{
209
- int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
210
+ int64_t max_ns;
211
+
212
+ if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
213
+ return false;
214
+ }
215
216
+ max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
217
if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
218
poll_set_started(ctx, true);
219
220
diff --git a/util/aio-posix.h b/util/aio-posix.h
221
index XXXXXXX..XXXXXXX 100644
222
--- a/util/aio-posix.h
223
+++ b/util/aio-posix.h
224
@@ -XXX,XX +XXX,XX @@ struct AioHandler {
225
QLIST_ENTRY(AioHandler) node;
226
QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
227
QLIST_ENTRY(AioHandler) node_deleted;
228
+ QLIST_ENTRY(AioHandler) node_poll;
229
#ifdef CONFIG_LINUX_IO_URING
230
QSLIST_ENTRY(AioHandler) node_submitted;
231
unsigned flags; /* see fdmon-io_uring.c */
232
#endif
233
+ int64_t poll_idle_timeout; /* when to stop userspace polling */
234
bool is_external;
235
};
236
237
diff --git a/util/trace-events b/util/trace-events
238
index XXXXXXX..XXXXXXX 100644
239
--- a/util/trace-events
240
+++ b/util/trace-events
241
@@ -XXX,XX +XXX,XX @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_
242
run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64
243
poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
244
poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
245
+poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x"
246
+poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d"
247
248
# async.c
249
aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
--
2.24.1
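
(To make the idle-handler expiry rule concrete, here is a toy model of the poll_idle_timeout bookkeeping that remove_idle_poll_handlers() performs: a handler that makes progress keeps pushing its deadline out, and a handler that stays idle past POLL_IDLE_INTERVAL_NS is dropped from the userspace polling set. This is not QEMU code; the type and function names below are invented for illustration.)

/* idle_poll_demo.c -- toy model of the poll_idle_timeout bookkeeping used by
 * remove_idle_poll_handlers().  Not QEMU code; names are illustrative.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define POLL_IDLE_INTERVAL_NS (7LL * 1000000000LL)

typedef struct {
    const char *name;
    int64_t poll_idle_timeout;   /* 0 means no deadline recorded yet */
    bool polled;                 /* still in the userspace polling set? */
} ToyHandler;

/* Call when the handler's ->io_poll() made progress: push the deadline out */
static void toy_mark_active(ToyHandler *h, int64_t now)
{
    h->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
}

/* Call from the polling loop: drop handlers that stayed idle too long */
static void toy_expire_idle(ToyHandler *h, int64_t now)
{
    if (h->poll_idle_timeout == 0) {
        h->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
    } else if (now >= h->poll_idle_timeout) {
        h->polled = false;       /* fall back to fd monitoring for this fd */
    }
}

int main(void)
{
    ToyHandler h = { "virtio-blk vq", 0, true };
    int64_t now = 0;

    toy_expire_idle(&h, now);                 /* records the first deadline */
    toy_mark_active(&h, now);                 /* activity pushes it out */
    now += POLL_IDLE_INTERVAL_NS + 1;
    toy_expire_idle(&h, now);                 /* idle too long: dropped */
    printf("%s still polled: %s\n", h.name, h.polled ? "yes" : "no");
    return 0;
}
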