1 | The following changes since commit 8e9398e3b1a860b8c29c670c1b6c36afe8d87849: | 1 | The following changes since commit 887cba855bb6ff4775256f7968409281350b568c: |
---|---|---|---|
2 | 2 | ||
3 | Merge tag 'pull-ppc-20220706' of https://gitlab.com/danielhb/qemu into staging (2022-07-07 06:21:05 +0530) | 3 | configure: Fix cross-building for RISCV host (v5) (2023-07-11 17:56:09 +0100) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the Git repository at: |
6 | 6 | ||
7 | https://gitlab.com/stefanha/qemu.git tags/block-pull-request | 7 | https://gitlab.com/stefanha/qemu.git tags/block-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to be6a166fde652589761cf70471bcde623e9bd72a: | 9 | for you to fetch changes up to 75dcb4d790bbe5327169fd72b185960ca58e2fa6: |
10 | 10 | ||
11 | block/io_uring: clarify that short reads can happen (2022-07-07 09:04:15 +0100) | 11 | virtio-blk: fix host notifier issues during dataplane start/stop (2023-07-12 15:20:32 -0400) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Pull request | 14 | Pull request |
15 | 15 | ||
16 | ---------------------------------------------------------------- | 16 | ---------------------------------------------------------------- |
17 | 17 | ||
18 | Dominique Martinet (1): | 18 | Stefan Hajnoczi (1): |
19 | io_uring: fix short read slow path | 19 | virtio-blk: fix host notifier issues during dataplane start/stop |
20 | 20 | ||
21 | Stefan Hajnoczi (1): | 21 | hw/block/dataplane/virtio-blk.c | 67 +++++++++++++++++++-------------- |
22 | block/io_uring: clarify that short reads can happen | 22 | 1 file changed, 38 insertions(+), 29 deletions(-) |
23 | |||
24 | block/io_uring.c | 12 ++++-------- | ||
25 | 1 file changed, 4 insertions(+), 8 deletions(-) | ||
26 | 23 | ||
27 | -- | 24 | -- |
28 | 2.36.1 | 25 | 2.40.1 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Dominique Martinet <dominique.martinet@atmark-techno.com> | ||
2 | 1 | ||
3 | sqeq.off here is the offset to read within the disk image, so it is obviously | ||
4 | not 'nread' (the amount we just read) but, as the author meant to write, | ||
5 | its current value incremented by the amount we just read. | ||
6 | |||
7 | Normally recent versions of Linux will not issue short reads, | ||
8 | but it can happen so we should fix this. | ||
9 | |||
10 | This led to weird image corruptions when short reads happened. | ||
11 | |||
12 | Fixes: 6663a0a33764 ("block/io_uring: implements interfaces for io_uring") | ||
13 | Link: https://lkml.kernel.org/r/YrrFGO4A1jS0GI0G@atmark-techno.com | ||
14 | Signed-off-by: Dominique Martinet <dominique.martinet@atmark-techno.com> | ||
15 | Message-Id: <20220630010137.2518851-1-dominique.martinet@atmark-techno.com> | ||
16 | Reviewed-by: Hanna Reitz <hreitz@redhat.com> | ||
17 | Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> | ||
18 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
19 | --- | ||
20 | block/io_uring.c | 4 ++-- | ||
21 | 1 file changed, 2 insertions(+), 2 deletions(-) | ||
22 | |||
23 | diff --git a/block/io_uring.c b/block/io_uring.c | ||
24 | index XXXXXXX..XXXXXXX 100644 | ||
25 | --- a/block/io_uring.c | ||
26 | +++ b/block/io_uring.c | ||
27 | @@ -XXX,XX +XXX,XX @@ static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, | ||
28 | trace_luring_resubmit_short_read(s, luringcb, nread); | ||
29 | |||
30 | /* Update read position */ | ||
31 | - luringcb->total_read = nread; | ||
32 | + luringcb->total_read += nread; | ||
33 | remaining = luringcb->qiov->size - luringcb->total_read; | ||
34 | |||
35 | /* Shorten qiov */ | ||
36 | @@ -XXX,XX +XXX,XX @@ static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, | ||
37 | remaining); | ||
38 | |||
39 | /* Update sqe */ | ||
40 | - luringcb->sqeq.off = nread; | ||
41 | + luringcb->sqeq.off += nread; | ||
42 | luringcb->sqeq.addr = (__u64)(uintptr_t)luringcb->resubmit_qiov.iov; | ||
43 | luringcb->sqeq.len = luringcb->resubmit_qiov.niov; | ||
44 | |||
45 | -- | ||
46 | 2.36.1 | diff view generated by jsdifflib |
1 | Jens Axboe has confirmed that short reads are rare but can happen: | 1 | The main loop thread can consume 100% CPU when using --device |
---|---|---|---|
2 | https://lore.kernel.org/io-uring/YsU%2FCGkl9ZXUI+Tj@stefanha-x1.localdomain/T/#m729963dc577d709b709c191922e98ec79d7eef54 | 2 | virtio-blk-pci,iothread=<iothread>. ppoll() constantly returns but |
3 | reading virtqueue host notifiers fails with EAGAIN. The file descriptors | ||
4 | are stale and remain registered with the AioContext because of bugs in | ||
5 | the virtio-blk dataplane start/stop code. | ||
3 | 6 | ||
4 | The luring_resubmit_short_read() comment claimed they were only due to a | 7 | The problem is that the dataplane start/stop code involves drain |
5 | specific io_uring bug that was fixed in Linux commit 9d93a3f5a0c | 8 | operations, which call virtio_blk_drained_begin() and |
6 | ("io_uring: punt short reads to async context"), which is wrong. | 9 | virtio_blk_drained_end() at points where the host notifier is not |
7 | Dominique Martinet found that a btrfs bug also causes short reads. There | 10 | operational: |
8 | may be more kernel code paths that result in short reads. | 11 | - In virtio_blk_data_plane_start(), blk_set_aio_context() drains after |
12 | vblk->dataplane_started has been set to true but the host notifier has | ||
13 | not been attached yet. | ||
14 | - In virtio_blk_data_plane_stop(), blk_drain() and blk_set_aio_context() | ||
15 | drain after the host notifier has already been detached but with | ||
16 | vblk->dataplane_started still set to true. | ||
9 | 17 | ||
10 | Let's consider short reads fair game. | 18 | I would like to simplify ->ioeventfd_start/stop() to avoid interactions |
19 | with drain entirely, but couldn't find a way to do that. Instead, this | ||
20 | patch accepts the fragile nature of the code and reorders it so that | ||
21 | vblk->dataplane_started is false during drain operations. This way the | ||
22 | virtio_blk_drained_begin() and virtio_blk_drained_end() calls don't | ||
23 | touch the host notifier. The result is that | ||
24 | virtio_blk_data_plane_start() and virtio_blk_data_plane_stop() have | ||
25 | complete control over the host notifier and stale file descriptors are | ||
26 | no longer left in the AioContext. | ||
11 | 27 | ||
12 | Cc: Dominique Martinet <dominique.martinet@atmark-techno.com> | 28 | This patch fixes the 100% CPU consumption in the main loop thread and |
13 | Based-on: <20220630010137.2518851-1-dominique.martinet@atmark-techno.com> | 29 | correctly moves host notifier processing to the IOThread. |
30 | |||
31 | Fixes: 1665d9326fd2 ("virtio-blk: implement BlockDevOps->drained_begin()") | ||
32 | Reported-by: Lukáš Doktor <ldoktor@redhat.com> | ||
14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 33 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
15 | Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> | 34 | Tested-by: Lukas Doktor <ldoktor@redhat.com> |
16 | Message-id: 20220706080341.1206476-1-stefanha@redhat.com | 35 | Message-id: 20230704151527.193586-1-stefanha@redhat.com |
17 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 36 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
18 | --- | 37 | --- |
19 | block/io_uring.c | 8 ++------ | 38 | hw/block/dataplane/virtio-blk.c | 67 +++++++++++++++++++-------------- |
20 | 1 file changed, 2 insertions(+), 6 deletions(-) | 39 | 1 file changed, 38 insertions(+), 29 deletions(-) |
21 | 40 | ||
22 | diff --git a/block/io_uring.c b/block/io_uring.c | 41 | diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c |
23 | index XXXXXXX..XXXXXXX 100644 | 42 | index XXXXXXX..XXXXXXX 100644 |
24 | --- a/block/io_uring.c | 43 | --- a/hw/block/dataplane/virtio-blk.c |
25 | +++ b/block/io_uring.c | 44 | +++ b/hw/block/dataplane/virtio-blk.c |
26 | @@ -XXX,XX +XXX,XX @@ static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb) | 45 | @@ -XXX,XX +XXX,XX @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) |
27 | /** | 46 | |
28 | * luring_resubmit_short_read: | 47 | memory_region_transaction_commit(); |
29 | * | 48 | |
30 | - * Before Linux commit 9d93a3f5a0c ("io_uring: punt short reads to async | 49 | - /* |
31 | - * context") a buffered I/O request with the start of the file range in the | 50 | - * These fields are visible to the IOThread so we rely on implicit barriers |
32 | - * page cache could result in a short read. Applications need to resubmit the | 51 | - * in aio_context_acquire() on the write side and aio_notify_accept() on |
33 | - * remaining read request. | 52 | - * the read side. |
34 | - * | 53 | - */ |
35 | - * This is a slow path but recent kernels never take it. | 54 | - s->starting = false; |
36 | + * Short reads are rare but may occur. The remaining read request needs to be | 55 | - vblk->dataplane_started = true; |
37 | + * resubmitted. | 56 | trace_virtio_blk_data_plane_start(s); |
38 | */ | 57 | |
39 | static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, | 58 | old_context = blk_get_aio_context(s->conf->conf.blk); |
40 | int nread) | 59 | @@ -XXX,XX +XXX,XX @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) |
60 | event_notifier_set(virtio_queue_get_host_notifier(vq)); | ||
61 | } | ||
62 | |||
63 | + /* | ||
64 | + * These fields must be visible to the IOThread when it processes the | ||
65 | + * virtqueue, otherwise it will think dataplane has not started yet. | ||
66 | + * | ||
67 | + * Make sure ->dataplane_started is false when blk_set_aio_context() is | ||
68 | + * called above so that draining does not cause the host notifier to be | ||
69 | + * detached/attached prematurely. | ||
70 | + */ | ||
71 | + s->starting = false; | ||
72 | + vblk->dataplane_started = true; | ||
73 | + smp_wmb(); /* paired with aio_notify_accept() on the read side */ | ||
74 | + | ||
75 | /* Get this show started by hooking up our callbacks */ | ||
76 | if (!blk_in_drain(s->conf->conf.blk)) { | ||
77 | aio_context_acquire(s->ctx); | ||
78 | @@ -XXX,XX +XXX,XX @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) | ||
79 | fail_guest_notifiers: | ||
80 | vblk->dataplane_disabled = true; | ||
81 | s->starting = false; | ||
82 | - vblk->dataplane_started = true; | ||
83 | return -ENOSYS; | ||
84 | } | ||
85 | |||
86 | @@ -XXX,XX +XXX,XX @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev) | ||
87 | aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s); | ||
88 | } | ||
89 | |||
90 | + /* | ||
91 | + * Batch all the host notifiers in a single transaction to avoid | ||
92 | + * quadratic time complexity in address_space_update_ioeventfds(). | ||
93 | + */ | ||
94 | + memory_region_transaction_begin(); | ||
95 | + | ||
96 | + for (i = 0; i < nvqs; i++) { | ||
97 | + virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false); | ||
98 | + } | ||
99 | + | ||
100 | + /* | ||
101 | + * The transaction expects the ioeventfds to be open when it | ||
102 | + * commits. Do it now, before the cleanup loop. | ||
103 | + */ | ||
104 | + memory_region_transaction_commit(); | ||
105 | + | ||
106 | + for (i = 0; i < nvqs; i++) { | ||
107 | + virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i); | ||
108 | + } | ||
109 | + | ||
110 | + /* | ||
111 | + * Set ->dataplane_started to false before draining so that host notifiers | ||
112 | + * are not detached/attached anymore. | ||
113 | + */ | ||
114 | + vblk->dataplane_started = false; | ||
115 | + | ||
116 | aio_context_acquire(s->ctx); | ||
117 | |||
118 | /* Wait for virtio_blk_dma_restart_bh() and in flight I/O to complete */ | ||
119 | @@ -XXX,XX +XXX,XX @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev) | ||
120 | |||
121 | aio_context_release(s->ctx); | ||
122 | |||
123 | - /* | ||
124 | - * Batch all the host notifiers in a single transaction to avoid | ||
125 | - * quadratic time complexity in address_space_update_ioeventfds(). | ||
126 | - */ | ||
127 | - memory_region_transaction_begin(); | ||
128 | - | ||
129 | - for (i = 0; i < nvqs; i++) { | ||
130 | - virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false); | ||
131 | - } | ||
132 | - | ||
133 | - /* | ||
134 | - * The transaction expects the ioeventfds to be open when it | ||
135 | - * commits. Do it now, before the cleanup loop. | ||
136 | - */ | ||
137 | - memory_region_transaction_commit(); | ||
138 | - | ||
139 | - for (i = 0; i < nvqs; i++) { | ||
140 | - virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i); | ||
141 | - } | ||
142 | - | ||
143 | qemu_bh_cancel(s->bh); | ||
144 | notify_guest_bh(s); /* final chance to notify guest */ | ||
145 | |||
146 | /* Clean up guest notifier (irq) */ | ||
147 | k->set_guest_notifiers(qbus->parent, nvqs, false); | ||
148 | |||
149 | - vblk->dataplane_started = false; | ||
150 | s->stopping = false; | ||
151 | } | ||
41 | -- | 152 | -- |
42 | 2.36.1 | 153 | 2.40.1 |
154 | |||
155 | diff view generated by jsdifflib |