1 | The following changes since commit d0ed6a69d399ae193959225cdeaa9382746c91cc: | 1 | The following changes since commit 3521ade3510eb5cefb2e27a101667f25dad89935: |
---|---|---|---|
2 | 2 | ||
3 | Update version for v5.1.0 release (2020-08-11 17:07:03 +0100) | 3 | Merge remote-tracking branch 'remotes/thuth-gitlab/tags/pull-request-2021-07-29' into staging (2021-07-29 13:17:20 +0100) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the Git repository at: |
6 | 6 | ||
7 | https://github.com/stefanha/qemu.git tags/block-pull-request | 7 | https://gitlab.com/stefanha/qemu.git tags/block-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to 44277bf914471962c9e88e09c859aae65ae109c4: | 9 | for you to fetch changes up to cc8eecd7f105a1dff5876adeb238a14696061a4a: |
10 | 10 | ||
11 | aio-posix: keep aio_notify_me disabled during polling (2020-08-13 13:34:14 = | 11 | MAINTAINERS: Added myself as a reviewer for the NVMe Block Driver (2021-07-29 17:17:34 +0100) |
12 | +0100) | ||
13 | 12 | ||
14 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
15 | Pull request | 14 | Pull request |
16 | 15 | ||
16 | The main fix here is for io_uring. Spurious -EAGAIN errors can happen and the | ||
17 | request needs to be resubmitted. | ||
18 | |||
19 | The MAINTAINERS changes carry no risk and we might as well include them in QEMU | ||
20 | 6.1. | ||
21 | |||
17 | ---------------------------------------------------------------- | 22 | ---------------------------------------------------------------- |
18 | 23 | ||
19 | Stefan Hajnoczi (3): | 24 | Fabian Ebner (1): |
20 | async: rename event_notifier_dummy_cb/poll() | 25 | block/io_uring: resubmit when result is -EAGAIN |
21 | async: always set ctx->notified in aio_notify() | ||
22 | aio-posix: keep aio_notify_me disabled during polling | ||
23 | 26 | ||
24 | util/aio-posix.c | 47 +++++++++++++++++++++++++---------------------- | 27 | Philippe Mathieu-Daudé (1): |
25 | util/async.c | 36 +++++++++++++++++++++++------------- | 28 | MAINTAINERS: Added myself as a reviewer for the NVMe Block Driver |
26 | 2 files changed, 48 insertions(+), 35 deletions(-) | ||
27 | 29 | ||
28 | --=20 | 30 | Stefano Garzarella (1): |
29 | 2.26.2 | 31 | MAINTAINERS: add Stefano Garzarella as io_uring reviewer |
30 | 32 | ||
33 | MAINTAINERS | 2 ++ | ||
34 | block/io_uring.c | 16 +++++++++++++++- | ||
35 | 2 files changed, 17 insertions(+), 1 deletion(-) | ||
36 | |||
37 | -- | ||
38 | 2.31.1 | ||
39 | diff view generated by jsdifflib |
1 | Polling only monitors the ctx->notified field and does not need the | 1 | From: Stefano Garzarella <sgarzare@redhat.com> |
---|---|---|---|
2 | ctx->notifier EventNotifier to be signalled. Keep ctx->aio_notify_me | ||
3 | disabled while polling to avoid unnecessary EventNotifier syscalls. | ||
4 | 2 | ||
5 | This optimization improves virtio-blk 4KB random read performance by | 3 | I've been working with io_uring for a while so I'd like to help |
6 | 18%. The following results are with an IOThread and the null-co block | 4 | with reviews. |
7 | driver: | ||
8 | 5 | ||
9 | Test IOPS Error | 6 | Signed-off-by: Stefano Garzarella <sgarzare@redhat.com> |
10 | Before 244518.62 ± 1.20% | 7 | Message-Id: <20210728131515.131045-1-sgarzare@redhat.com> |
11 | After 290706.11 ± 0.44% | ||
12 | |||
13 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
14 | Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> | ||
15 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
16 | Message-id: 20200806131802.569478-4-stefanha@redhat.com | ||
17 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
18 | --- | 9 | --- |
19 | util/aio-posix.c | 47 +++++++++++++++++++++++++---------------------- | 10 | MAINTAINERS | 1 + |
20 | 1 file changed, 25 insertions(+), 22 deletions(-) | 11 | 1 file changed, 1 insertion(+) |
21 | 12 | ||
22 | diff --git a/util/aio-posix.c b/util/aio-posix.c | 13 | diff --git a/MAINTAINERS b/MAINTAINERS |
23 | index XXXXXXX..XXXXXXX 100644 | 14 | index XXXXXXX..XXXXXXX 100644 |
24 | --- a/util/aio-posix.c | 15 | --- a/MAINTAINERS |
25 | +++ b/util/aio-posix.c | 16 | +++ b/MAINTAINERS |
26 | @@ -XXX,XX +XXX,XX @@ static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now) | 17 | @@ -XXX,XX +XXX,XX @@ Linux io_uring |
27 | * | 18 | M: Aarushi Mehta <mehta.aaru20@gmail.com> |
28 | * Polls for a given time. | 19 | M: Julia Suvorova <jusual@redhat.com> |
29 | * | 20 | M: Stefan Hajnoczi <stefanha@redhat.com> |
30 | - * Note that ctx->notify_me must be non-zero so this function can detect | 21 | +R: Stefano Garzarella <sgarzare@redhat.com> |
31 | - * aio_notify(). | 22 | L: qemu-block@nongnu.org |
32 | - * | 23 | S: Maintained |
33 | * Note that the caller must have incremented ctx->list_lock. | 24 | F: block/io_uring.c |
34 | * | ||
35 | * Returns: true if progress was made, false otherwise | ||
36 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
37 | bool progress; | ||
38 | int64_t start_time, elapsed_time; | ||
39 | |||
40 | - assert(ctx->notify_me); | ||
41 | assert(qemu_lockcnt_count(&ctx->list_lock) > 0); | ||
42 | |||
43 | trace_run_poll_handlers_begin(ctx, max_ns, *timeout); | ||
44 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
45 | * @timeout: timeout for blocking wait, computed by the caller and updated if | ||
46 | * polling succeeds. | ||
47 | * | ||
48 | - * ctx->notify_me must be non-zero so this function can detect aio_notify(). | ||
49 | - * | ||
50 | * Note that the caller must have incremented ctx->list_lock. | ||
51 | * | ||
52 | * Returns: true if progress was made, false otherwise | ||
53 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
54 | AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list); | ||
55 | int ret = 0; | ||
56 | bool progress; | ||
57 | + bool use_notify_me; | ||
58 | int64_t timeout; | ||
59 | int64_t start = 0; | ||
60 | |||
61 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
62 | */ | ||
63 | assert(in_aio_context_home_thread(ctx)); | ||
64 | |||
65 | - /* aio_notify can avoid the expensive event_notifier_set if | ||
66 | + qemu_lockcnt_inc(&ctx->list_lock); | ||
67 | + | ||
68 | + if (ctx->poll_max_ns) { | ||
69 | + start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); | ||
70 | + } | ||
71 | + | ||
72 | + timeout = blocking ? aio_compute_timeout(ctx) : 0; | ||
73 | + progress = try_poll_mode(ctx, &timeout); | ||
74 | + assert(!(timeout && progress)); | ||
75 | + | ||
76 | + /* | ||
77 | + * aio_notify can avoid the expensive event_notifier_set if | ||
78 | * everything (file descriptors, bottom halves, timers) will | ||
79 | * be re-evaluated before the next blocking poll(). This is | ||
80 | * already true when aio_poll is called with blocking == false; | ||
81 | * if blocking == true, it is only true after poll() returns, | ||
82 | * so disable the optimization now. | ||
83 | */ | ||
84 | - if (blocking) { | ||
85 | + use_notify_me = timeout != 0; | ||
86 | + if (use_notify_me) { | ||
87 | atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2); | ||
88 | /* | ||
89 | - * Write ctx->notify_me before computing the timeout | ||
90 | - * (reading bottom half flags, etc.). Pairs with | ||
91 | + * Write ctx->notify_me before reading ctx->notified. Pairs with | ||
92 | * smp_mb in aio_notify(). | ||
93 | */ | ||
94 | smp_mb(); | ||
95 | - } | ||
96 | - | ||
97 | - qemu_lockcnt_inc(&ctx->list_lock); | ||
98 | |||
99 | - if (ctx->poll_max_ns) { | ||
100 | - start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); | ||
101 | + /* Don't block if aio_notify() was called */ | ||
102 | + if (atomic_read(&ctx->notified)) { | ||
103 | + timeout = 0; | ||
104 | + } | ||
105 | } | ||
106 | |||
107 | - timeout = blocking ? aio_compute_timeout(ctx) : 0; | ||
108 | - progress = try_poll_mode(ctx, &timeout); | ||
109 | - assert(!(timeout && progress)); | ||
110 | - | ||
111 | /* If polling is allowed, non-blocking aio_poll does not need the | ||
112 | * system call---a single round of run_poll_handlers_once suffices. | ||
113 | */ | ||
114 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
115 | ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout); | ||
116 | } | ||
117 | |||
118 | - if (blocking) { | ||
119 | + if (use_notify_me) { | ||
120 | /* Finish the poll before clearing the flag. */ | ||
121 | - atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2); | ||
122 | - aio_notify_accept(ctx); | ||
123 | + atomic_store_release(&ctx->notify_me, | ||
124 | + atomic_read(&ctx->notify_me) - 2); | ||
125 | } | ||
126 | |||
127 | + aio_notify_accept(ctx); | ||
128 | + | ||
129 | /* Adjust polling time */ | ||
130 | if (ctx->poll_max_ns) { | ||
131 | int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start; | ||
132 | -- | 25 | -- |
133 | 2.26.2 | 26 | 2.31.1 |
134 | 27 | diff view generated by jsdifflib |
1 | aio_notify() does not set ctx->notified when called with | 1 | From: Fabian Ebner <f.ebner@proxmox.com> |
---|---|---|---|
2 | ctx->aio_notify_me disabled. Therefore aio_notify_me needs to be enabled | ||
3 | during polling. | ||
4 | 2 | ||
5 | This is suboptimal since expensive event_notifier_set(&ctx->notifier) | 3 | Linux SCSI can throw spurious -EAGAIN in some corner cases in its |
6 | and event_notifier_test_and_clear(&ctx->notifier) calls are required | 4 | completion path, which will end up being the result in the completed |
7 | when ctx->aio_notify_me is enabled. | 5 | io_uring request. |
8 | 6 | ||
9 | Change aio_notify() so that aio->notified is always set, regardless of | 7 | Resubmitting such requests should allow block jobs to complete, even |
10 | ctx->aio_notify_me. This will make polling cheaper since | 8 | if such spurious errors are encountered. |
11 | ctx->aio_notify_me can remain disabled. Move the | ||
12 | event_notifier_test_and_clear() to the fd handler function (which is now | ||
13 | no longer an empty function so "dummy" has been dropped from its name). | ||
14 | 9 | ||
15 | The next patch takes advantage of this by optimizing polling in | 10 | Co-authored-by: Stefan Hajnoczi <stefanha@gmail.com> |
16 | util/aio-posix.c. | 11 | Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> |
17 | 12 | Signed-off-by: Fabian Ebner <f.ebner@proxmox.com> | |
18 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 13 | Message-id: 20210729091029.65369-1-f.ebner@proxmox.com |
19 | Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> | ||
20 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
21 | Message-id: 20200806131802.569478-3-stefanha@redhat.com | ||
22 | |||
23 | [Paolo Bonzini pointed out that the smp_wmb() in aio_notify_accept() | ||
24 | should be smp_wb() but the comment should be smp_wmb() instead of | ||
25 | smp_wb(). Fixed. | ||
26 | --Stefan] | ||
27 | |||
28 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
29 | --- | 15 | --- |
30 | util/async.c | 32 +++++++++++++++++++++----------- | 16 | block/io_uring.c | 16 +++++++++++++++- |
31 | 1 file changed, 21 insertions(+), 11 deletions(-) | 17 | 1 file changed, 15 insertions(+), 1 deletion(-) |
32 | 18 | ||
33 | diff --git a/util/async.c b/util/async.c | 19 | diff --git a/block/io_uring.c b/block/io_uring.c |
34 | index XXXXXXX..XXXXXXX 100644 | 20 | index XXXXXXX..XXXXXXX 100644 |
35 | --- a/util/async.c | 21 | --- a/block/io_uring.c |
36 | +++ b/util/async.c | 22 | +++ b/block/io_uring.c |
37 | @@ -XXX,XX +XXX,XX @@ LuringState *aio_get_linux_io_uring(AioContext *ctx) | 23 | @@ -XXX,XX +XXX,XX @@ static void luring_process_completions(LuringState *s) |
38 | 24 | total_bytes = ret + luringcb->total_read; | |
39 | void aio_notify(AioContext *ctx) | 25 | |
40 | { | 26 | if (ret < 0) { |
41 | - /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs | 27 | - if (ret == -EINTR) { |
42 | + /* | 28 | + /* |
43 | + * Write e.g. bh->flags before writing ctx->notified. Pairs with smp_mb in | 29 | + * Only writev/readv/fsync requests on regular files or host block |
44 | + * aio_notify_accept. | 30 | + * devices are submitted. Therefore -EAGAIN is not expected but it's |
45 | + */ | 31 | + * known to happen sometimes with Linux SCSI. Submit again and hope |
46 | + smp_wmb(); | 32 | + * the request completes successfully. |
47 | + atomic_set(&ctx->notified, true); | 33 | + * |
48 | + | 34 | + * For more information, see: |
49 | + /* | 35 | + * https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u |
50 | + * Write ctx->notified before reading ctx->notify_me. Pairs | 36 | + * |
51 | * with smp_mb in aio_ctx_prepare or aio_poll. | 37 | + * If the code is changed to submit other types of requests in the |
52 | */ | 38 | + * future, then this workaround may need to be extended to deal with |
53 | smp_mb(); | 39 | + * genuine -EAGAIN results that should not be resubmitted |
54 | if (atomic_read(&ctx->notify_me)) { | 40 | + * immediately. |
55 | event_notifier_set(&ctx->notifier); | 41 | + */ |
56 | - atomic_mb_set(&ctx->notified, true); | 42 | + if (ret == -EINTR || ret == -EAGAIN) { |
57 | } | 43 | luring_resubmit(s, luringcb); |
58 | } | 44 | continue; |
59 | 45 | } | |
60 | void aio_notify_accept(AioContext *ctx) | ||
61 | { | ||
62 | - if (atomic_xchg(&ctx->notified, false) | ||
63 | -#ifdef WIN32 | ||
64 | - || true | ||
65 | -#endif | ||
66 | - ) { | ||
67 | - event_notifier_test_and_clear(&ctx->notifier); | ||
68 | - } | ||
69 | + atomic_set(&ctx->notified, false); | ||
70 | + | ||
71 | + /* | ||
72 | + * Write ctx->notified before reading e.g. bh->flags. Pairs with smp_wmb | ||
73 | + * in aio_notify. | ||
74 | + */ | ||
75 | + smp_mb(); | ||
76 | } | ||
77 | |||
78 | static void aio_timerlist_notify(void *opaque, QEMUClockType type) | ||
79 | @@ -XXX,XX +XXX,XX @@ static void aio_timerlist_notify(void *opaque, QEMUClockType type) | ||
80 | aio_notify(opaque); | ||
81 | } | ||
82 | |||
83 | -static void aio_context_notifier_dummy_cb(EventNotifier *e) | ||
84 | +static void aio_context_notifier_cb(EventNotifier *e) | ||
85 | { | ||
86 | + AioContext *ctx = container_of(e, AioContext, notifier); | ||
87 | + | ||
88 | + event_notifier_test_and_clear(&ctx->notifier); | ||
89 | } | ||
90 | |||
91 | /* Returns true if aio_notify() was called (e.g. a BH was scheduled) */ | ||
92 | @@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp) | ||
93 | |||
94 | aio_set_event_notifier(ctx, &ctx->notifier, | ||
95 | false, | ||
96 | - aio_context_notifier_dummy_cb, | ||
97 | + aio_context_notifier_cb, | ||
98 | aio_context_notifier_poll); | ||
99 | #ifdef CONFIG_LINUX_AIO | ||
100 | ctx->linux_aio = NULL; | ||
101 | -- | 46 | -- |
102 | 2.26.2 | 47 | 2.31.1 |
103 | 48 | diff view generated by jsdifflib |
1 | The event_notifier_*() prefix can be confused with the EventNotifier | 1 | From: Philippe Mathieu-Daudé <philmd@redhat.com> |
---|---|---|---|
2 | APIs that are also called event_notifier_*(). | ||
3 | 2 | ||
4 | Rename the functions to aio_context_notifier_*() to make it clear that | 3 | I'm interested in following the activity around the NVMe bdrv. |
5 | they relate to the AioContext::notifier field. | ||
6 | 4 | ||
7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 5 | Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com> |
8 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 6 | Message-id: 20210728183340.2018313-1-philmd@redhat.com |
9 | Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> | ||
10 | Message-id: 20200806131802.569478-2-stefanha@redhat.com | ||
11 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
12 | --- | 8 | --- |
13 | util/async.c | 8 ++++---- | 9 | MAINTAINERS | 1 + |
14 | 1 file changed, 4 insertions(+), 4 deletions(-) | 10 | 1 file changed, 1 insertion(+) |
15 | 11 | ||
16 | diff --git a/util/async.c b/util/async.c | 12 | diff --git a/MAINTAINERS b/MAINTAINERS |
17 | index XXXXXXX..XXXXXXX 100644 | 13 | index XXXXXXX..XXXXXXX 100644 |
18 | --- a/util/async.c | 14 | --- a/MAINTAINERS |
19 | +++ b/util/async.c | 15 | +++ b/MAINTAINERS |
20 | @@ -XXX,XX +XXX,XX @@ static void aio_timerlist_notify(void *opaque, QEMUClockType type) | 16 | @@ -XXX,XX +XXX,XX @@ F: block/null.c |
21 | aio_notify(opaque); | 17 | NVMe Block Driver |
22 | } | 18 | M: Stefan Hajnoczi <stefanha@redhat.com> |
23 | 19 | R: Fam Zheng <fam@euphon.net> | |
24 | -static void event_notifier_dummy_cb(EventNotifier *e) | 20 | +R: Philippe Mathieu-Daudé <philmd@redhat.com> |
25 | +static void aio_context_notifier_dummy_cb(EventNotifier *e) | 21 | L: qemu-block@nongnu.org |
26 | { | 22 | S: Supported |
27 | } | 23 | F: block/nvme* |
28 | |||
29 | /* Returns true if aio_notify() was called (e.g. a BH was scheduled) */ | ||
30 | -static bool event_notifier_poll(void *opaque) | ||
31 | +static bool aio_context_notifier_poll(void *opaque) | ||
32 | { | ||
33 | EventNotifier *e = opaque; | ||
34 | AioContext *ctx = container_of(e, AioContext, notifier); | ||
35 | @@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp) | ||
36 | |||
37 | aio_set_event_notifier(ctx, &ctx->notifier, | ||
38 | false, | ||
39 | - event_notifier_dummy_cb, | ||
40 | - event_notifier_poll); | ||
41 | + aio_context_notifier_dummy_cb, | ||
42 | + aio_context_notifier_poll); | ||
43 | #ifdef CONFIG_LINUX_AIO | ||
44 | ctx->linux_aio = NULL; | ||
45 | #endif | ||
46 | -- | 24 | -- |
47 | 2.26.2 | 25 | 2.31.1 |
48 | 26 | diff view generated by jsdifflib |