1 | The following changes since commit dd25f97c66a75d1508f1d4c6478ed2c95bec428f: | 1 | The following changes since commit 67f17e23baca5dd545fe98b01169cc351a70fe35: |
---|---|---|---|
2 | 2 | ||
3 | Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20190913' into staging (2019-09-16 10:15:15 +0100) | 3 | Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging (2020-03-06 17:15:36 +0000) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the Git repository at: |
6 | 6 | ||
7 | https://github.com/XanClic/qemu.git tags/pull-block-2019-09-16 | 7 | https://github.com/stefanha/qemu.git tags/block-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to 1825cc0783ccf0ec5d9f0b225a99b340bdd4c68f: | 9 | for you to fetch changes up to d37d0e365afb6825a90d8356fc6adcc1f58f40f3: |
10 | 10 | ||
11 | qemu-iotests: Add test for bz #1745922 (2019-09-16 15:37:12 +0200) | 11 | aio-posix: remove idle poll handlers to improve scalability (2020-03-09 16:45:16 +0000) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Block patches: | 14 | Pull request |
15 | - Fix for block jobs when used with I/O threads | ||
16 | - Fix for a corruption when using qcow2's LUKS encryption mode | ||
17 | - cURL fix | ||
18 | - check-block.sh cleanups (for make check) | ||
19 | - Refactoring | ||
20 | 15 | ||
21 | ---------------------------------------------------------------- | 16 | ---------------------------------------------------------------- |
22 | Max Reitz (7): | ||
23 | curl: Keep pointer to the CURLState in CURLSocket | ||
24 | curl: Keep *socket until the end of curl_sock_cb() | ||
25 | curl: Check completion in curl_multi_do() | ||
26 | curl: Pass CURLSocket to curl_multi_do() | ||
27 | curl: Report only ready sockets | ||
28 | curl: Handle success in multi_check_completion | ||
29 | curl: Check curl_multi_add_handle()'s return code | ||
30 | 17 | ||
31 | Maxim Levitsky (3): | 18 | Stefan Hajnoczi (9): |
32 | block/qcow2: Fix corruption introduced by commit 8ac0f15f335 | 19 | qemu/queue.h: clear linked list pointers on remove |
33 | block/qcow2: refactor encryption code | 20 | aio-posix: remove confusing QLIST_SAFE_REMOVE() |
34 | qemu-iotests: Add test for bz #1745922 | 21 | aio-posix: completely stop polling when disabled |
22 | aio-posix: move RCU_READ_LOCK() into run_poll_handlers() | ||
23 | aio-posix: extract ppoll(2) and epoll(7) fd monitoring | ||
24 | aio-posix: simplify FDMonOps->update() prototype | ||
25 | aio-posix: add io_uring fd monitoring implementation | ||
26 | aio-posix: support userspace polling of fd monitoring | ||
27 | aio-posix: remove idle poll handlers to improve scalability | ||
35 | 28 | ||
36 | Nir Soffer (2): | 29 | MAINTAINERS | 2 + |
37 | block: Use QEMU_IS_ALIGNED | 30 | configure | 5 + |
38 | block: Remove unused masks | 31 | include/block/aio.h | 71 ++++++- |
39 | 32 | include/qemu/queue.h | 19 +- | |
40 | Sergio Lopez (1): | 33 | util/Makefile.objs | 3 + |
41 | blockjob: update nodes head while removing all bdrv | 34 | util/aio-posix.c | 451 ++++++++++++++---------------------------- |
42 | 35 | util/aio-posix.h | 81 ++++++++ | |
43 | Thomas Huth (2): | 36 | util/fdmon-epoll.c | 155 +++++++++++++++ |
44 | tests/qemu-iotests/check: Replace "tests" with "iotests" in final | 37 | util/fdmon-io_uring.c | 332 +++++++++++++++++++++++++++++++ |
45 | status text | 38 | util/fdmon-poll.c | 107 ++++++++++ |
46 | tests/Makefile: Do not print the name of the check-block.sh shell | 39 | util/trace-events | 2 + |
47 | script | 40 | 11 files changed, 915 insertions(+), 313 deletions(-) |
48 | 41 | create mode 100644 util/aio-posix.h | |
49 | Vladimir Sementsov-Ogievskiy (1): | 42 | create mode 100644 util/fdmon-epoll.c |
50 | tests/qemu-iotests: Fix qemu-io related output in 026.out.nocache | 43 | create mode 100644 util/fdmon-io_uring.c |
51 | 44 | create mode 100644 util/fdmon-poll.c | |
52 | tests/Makefile.include | 2 +- | ||
53 | block/qcow2.h | 8 +- | ||
54 | include/block/block.h | 2 - | ||
55 | block/bochs.c | 4 +- | ||
56 | block/cloop.c | 4 +- | ||
57 | block/curl.c | 133 ++++++++++------------- | ||
58 | block/dmg.c | 4 +- | ||
59 | block/io.c | 8 +- | ||
60 | block/qcow2-cluster.c | 40 +++---- | ||
61 | block/qcow2-threads.c | 63 ++++++++--- | ||
62 | block/qcow2.c | 9 +- | ||
63 | block/vvfat.c | 8 +- | ||
64 | blockjob.c | 17 ++- | ||
65 | migration/block.c | 2 +- | ||
66 | qemu-img.c | 2 +- | ||
67 | tests/qemu-iotests/026.out.nocache | 168 ++++++++++++++--------------- | ||
68 | tests/qemu-iotests/263 | 91 ++++++++++++++++ | ||
69 | tests/qemu-iotests/263.out | 40 +++++++ | ||
70 | tests/qemu-iotests/check | 8 +- | ||
71 | tests/qemu-iotests/group | 1 + | ||
72 | 20 files changed, 380 insertions(+), 234 deletions(-) | ||
73 | create mode 100755 tests/qemu-iotests/263 | ||
74 | create mode 100644 tests/qemu-iotests/263.out | ||
75 | 45 | ||
76 | -- | 46 | -- |
77 | 2.21.0 | 47 | 2.24.1 |
78 | 48 | ||
79 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Nir Soffer <nirsof@gmail.com> | ||
2 | 1 | ||
3 | Replace instances of: | ||
4 | |||
5 | (n & (BDRV_SECTOR_SIZE - 1)) == 0 | ||
6 | |||
7 | And: | ||
8 | |||
9 | (n & ~BDRV_SECTOR_MASK) == 0 | ||
10 | |||
11 | With: | ||
12 | |||
13 | QEMU_IS_ALIGNED(n, BDRV_SECTOR_SIZE) | ||
14 | |||
15 | Which reveals the intent of the code better, and makes it easier to | ||
16 | locate the code checking alignment. | ||
17 | |||
18 | Signed-off-by: Nir Soffer <nsoffer@redhat.com> | ||
19 | Message-id: 20190827185913.27427-2-nsoffer@redhat.com | ||
20 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
21 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
22 | --- | ||
23 | block/bochs.c | 4 ++-- | ||
24 | block/cloop.c | 4 ++-- | ||
25 | block/dmg.c | 4 ++-- | ||
26 | block/io.c | 8 ++++---- | ||
27 | block/qcow2-cluster.c | 4 ++-- | ||
28 | block/qcow2.c | 4 ++-- | ||
29 | block/vvfat.c | 8 ++++---- | ||
30 | qemu-img.c | 2 +- | ||
31 | 8 files changed, 19 insertions(+), 19 deletions(-) | ||
32 | |||
33 | diff --git a/block/bochs.c b/block/bochs.c | ||
34 | index XXXXXXX..XXXXXXX 100644 | ||
35 | --- a/block/bochs.c | ||
36 | +++ b/block/bochs.c | ||
37 | @@ -XXX,XX +XXX,XX @@ bochs_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, | ||
38 | QEMUIOVector local_qiov; | ||
39 | int ret; | ||
40 | |||
41 | - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
42 | - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
43 | + assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); | ||
44 | + assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); | ||
45 | |||
46 | qemu_iovec_init(&local_qiov, qiov->niov); | ||
47 | qemu_co_mutex_lock(&s->lock); | ||
48 | diff --git a/block/cloop.c b/block/cloop.c | ||
49 | index XXXXXXX..XXXXXXX 100644 | ||
50 | --- a/block/cloop.c | ||
51 | +++ b/block/cloop.c | ||
52 | @@ -XXX,XX +XXX,XX @@ cloop_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, | ||
53 | int nb_sectors = bytes >> BDRV_SECTOR_BITS; | ||
54 | int ret, i; | ||
55 | |||
56 | - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
57 | - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
58 | + assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); | ||
59 | + assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); | ||
60 | |||
61 | qemu_co_mutex_lock(&s->lock); | ||
62 | |||
63 | diff --git a/block/dmg.c b/block/dmg.c | ||
64 | index XXXXXXX..XXXXXXX 100644 | ||
65 | --- a/block/dmg.c | ||
66 | +++ b/block/dmg.c | ||
67 | @@ -XXX,XX +XXX,XX @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, | ||
68 | int nb_sectors = bytes >> BDRV_SECTOR_BITS; | ||
69 | int ret, i; | ||
70 | |||
71 | - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
72 | - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
73 | + assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); | ||
74 | + assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); | ||
75 | |||
76 | qemu_co_mutex_lock(&s->lock); | ||
77 | |||
78 | diff --git a/block/io.c b/block/io.c | ||
79 | index XXXXXXX..XXXXXXX 100644 | ||
80 | --- a/block/io.c | ||
81 | +++ b/block/io.c | ||
82 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, | ||
83 | sector_num = offset >> BDRV_SECTOR_BITS; | ||
84 | nb_sectors = bytes >> BDRV_SECTOR_BITS; | ||
85 | |||
86 | - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
87 | - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
88 | + assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); | ||
89 | + assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); | ||
90 | assert(bytes <= BDRV_REQUEST_MAX_BYTES); | ||
91 | assert(drv->bdrv_co_readv); | ||
92 | |||
93 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, | ||
94 | sector_num = offset >> BDRV_SECTOR_BITS; | ||
95 | nb_sectors = bytes >> BDRV_SECTOR_BITS; | ||
96 | |||
97 | - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
98 | - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
99 | + assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); | ||
100 | + assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); | ||
101 | assert(bytes <= BDRV_REQUEST_MAX_BYTES); | ||
102 | |||
103 | assert(drv->bdrv_co_writev); | ||
104 | diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c | ||
105 | index XXXXXXX..XXXXXXX 100644 | ||
106 | --- a/block/qcow2-cluster.c | ||
107 | +++ b/block/qcow2-cluster.c | ||
108 | @@ -XXX,XX +XXX,XX @@ static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs, | ||
109 | { | ||
110 | if (bytes && bs->encrypted) { | ||
111 | BDRVQcow2State *s = bs->opaque; | ||
112 | - assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0); | ||
113 | - assert((bytes & ~BDRV_SECTOR_MASK) == 0); | ||
114 | + assert(QEMU_IS_ALIGNED(offset_in_cluster, BDRV_SECTOR_SIZE)); | ||
115 | + assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); | ||
116 | assert(s->crypto); | ||
117 | if (qcow2_co_encrypt(bs, cluster_offset, | ||
118 | src_cluster_offset + offset_in_cluster, | ||
119 | diff --git a/block/qcow2.c b/block/qcow2.c | ||
120 | index XXXXXXX..XXXXXXX 100644 | ||
121 | --- a/block/qcow2.c | ||
122 | +++ b/block/qcow2.c | ||
123 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs, | ||
124 | goto fail; | ||
125 | } | ||
126 | |||
127 | - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
128 | - assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
129 | + assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); | ||
130 | + assert(QEMU_IS_ALIGNED(cur_bytes, BDRV_SECTOR_SIZE)); | ||
131 | if (qcow2_co_decrypt(bs, cluster_offset, offset, | ||
132 | cluster_data, cur_bytes) < 0) { | ||
133 | ret = -EIO; | ||
134 | diff --git a/block/vvfat.c b/block/vvfat.c | ||
135 | index XXXXXXX..XXXXXXX 100644 | ||
136 | --- a/block/vvfat.c | ||
137 | +++ b/block/vvfat.c | ||
138 | @@ -XXX,XX +XXX,XX @@ vvfat_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, | ||
139 | int nb_sectors = bytes >> BDRV_SECTOR_BITS; | ||
140 | void *buf; | ||
141 | |||
142 | - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
143 | - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
144 | + assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); | ||
145 | + assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); | ||
146 | |||
147 | buf = g_try_malloc(bytes); | ||
148 | if (bytes && buf == NULL) { | ||
149 | @@ -XXX,XX +XXX,XX @@ vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, | ||
150 | int nb_sectors = bytes >> BDRV_SECTOR_BITS; | ||
151 | void *buf; | ||
152 | |||
153 | - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
154 | - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); | ||
155 | + assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); | ||
156 | + assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); | ||
157 | |||
158 | buf = g_try_malloc(bytes); | ||
159 | if (bytes && buf == NULL) { | ||
160 | diff --git a/qemu-img.c b/qemu-img.c | ||
161 | index XXXXXXX..XXXXXXX 100644 | ||
162 | --- a/qemu-img.c | ||
163 | +++ b/qemu-img.c | ||
164 | @@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv) | ||
165 | int64_t sval; | ||
166 | |||
167 | sval = cvtnum(optarg); | ||
168 | - if (sval < 0 || sval & (BDRV_SECTOR_SIZE - 1) || | ||
169 | + if (sval < 0 || !QEMU_IS_ALIGNED(sval, BDRV_SECTOR_SIZE) || | ||
170 | sval / BDRV_SECTOR_SIZE > MAX_BUF_SECTORS) { | ||
171 | error_report("Invalid buffer size for sparse output specified. " | ||
172 | "Valid sizes are multiples of %llu up to %llu. Select " | ||
173 | -- | ||
174 | 2.21.0 | ||
175 | |||
176 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Nir Soffer <nirsof@gmail.com> | ||
2 | 1 | ||
3 | Replace confusing usage: | ||
4 | |||
5 | ~BDRV_SECTOR_MASK | ||
6 | |||
7 | With more clear: | ||
8 | |||
9 | (BDRV_SECTOR_SIZE - 1) | ||
10 | |||
11 | Remove BDRV_SECTOR_MASK and the unused BDRV_BLOCK_OFFSET_MASK which was | ||
12 | it's last user. | ||
13 | |||
14 | Signed-off-by: Nir Soffer <nsoffer@redhat.com> | ||
15 | Message-id: 20190827185913.27427-3-nsoffer@redhat.com | ||
16 | Reviewed-by: Juan Quintela <quintela@redhat.com> | ||
17 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
18 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
19 | --- | ||
20 | include/block/block.h | 2 -- | ||
21 | migration/block.c | 2 +- | ||
22 | 2 files changed, 1 insertion(+), 3 deletions(-) | ||
23 | |||
24 | diff --git a/include/block/block.h b/include/block/block.h | ||
25 | index XXXXXXX..XXXXXXX 100644 | ||
26 | --- a/include/block/block.h | ||
27 | +++ b/include/block/block.h | ||
28 | @@ -XXX,XX +XXX,XX @@ typedef struct HDGeometry { | ||
29 | |||
30 | #define BDRV_SECTOR_BITS 9 | ||
31 | #define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) | ||
32 | -#define BDRV_SECTOR_MASK ~(BDRV_SECTOR_SIZE - 1) | ||
33 | |||
34 | #define BDRV_REQUEST_MAX_SECTORS MIN(SIZE_MAX >> BDRV_SECTOR_BITS, \ | ||
35 | INT_MAX >> BDRV_SECTOR_BITS) | ||
36 | @@ -XXX,XX +XXX,XX @@ typedef struct HDGeometry { | ||
37 | #define BDRV_BLOCK_ALLOCATED 0x10 | ||
38 | #define BDRV_BLOCK_EOF 0x20 | ||
39 | #define BDRV_BLOCK_RECURSE 0x40 | ||
40 | -#define BDRV_BLOCK_OFFSET_MASK BDRV_SECTOR_MASK | ||
41 | |||
42 | typedef QSIMPLEQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue; | ||
43 | |||
44 | diff --git a/migration/block.c b/migration/block.c | ||
45 | index XXXXXXX..XXXXXXX 100644 | ||
46 | --- a/migration/block.c | ||
47 | +++ b/migration/block.c | ||
48 | @@ -XXX,XX +XXX,XX @@ static int block_load(QEMUFile *f, void *opaque, int version_id) | ||
49 | do { | ||
50 | addr = qemu_get_be64(f); | ||
51 | |||
52 | - flags = addr & ~BDRV_SECTOR_MASK; | ||
53 | + flags = addr & (BDRV_SECTOR_SIZE - 1); | ||
54 | addr >>= BDRV_SECTOR_BITS; | ||
55 | |||
56 | if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) { | ||
57 | -- | ||
58 | 2.21.0 | ||
59 | |||
60 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Thomas Huth <thuth@redhat.com> | ||
2 | 1 | ||
3 | When running "make check -j8" or something similar, the iotests are | ||
4 | running in parallel with the other tests. So when they are printing | ||
5 | out "Passed all xx tests" or a similar status message at the end, | ||
6 | it might not be quite clear that this message belongs to the iotests, | ||
7 | since the output might be mixed with the other tests. Thus change the | ||
8 | word "tests" here to "iotests" instead to avoid confusion. | ||
9 | |||
10 | Signed-off-by: Thomas Huth <thuth@redhat.com> | ||
11 | Message-id: 20190906113920.11271-1-thuth@redhat.com | ||
12 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
13 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
14 | --- | ||
15 | tests/qemu-iotests/check | 8 ++++---- | ||
16 | 1 file changed, 4 insertions(+), 4 deletions(-) | ||
17 | |||
18 | diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check | ||
19 | index XXXXXXX..XXXXXXX 100755 | ||
20 | --- a/tests/qemu-iotests/check | ||
21 | +++ b/tests/qemu-iotests/check | ||
22 | @@ -XXX,XX +XXX,XX @@ END { if (NR > 0) { | ||
23 | if [ ! -z "$n_bad" -a $n_bad != 0 ] | ||
24 | then | ||
25 | echo "Failures:$bad" | ||
26 | - echo "Failed $n_bad of $try tests" | ||
27 | + echo "Failed $n_bad of $try iotests" | ||
28 | echo "Failures:$bad" | fmt >>check.log | ||
29 | - echo "Failed $n_bad of $try tests" >>check.log | ||
30 | + echo "Failed $n_bad of $try iotests" >>check.log | ||
31 | else | ||
32 | - echo "Passed all $try tests" | ||
33 | - echo "Passed all $try tests" >>check.log | ||
34 | + echo "Passed all $try iotests" | ||
35 | + echo "Passed all $try iotests" >>check.log | ||
36 | fi | ||
37 | needwrap=false | ||
38 | fi | ||
39 | -- | ||
40 | 2.21.0 | ||
41 | |||
42 | diff view generated by jsdifflib |
1 | From: Maxim Levitsky <mlevitsk@redhat.com> | 1 | Do not leave stale linked list pointers around after removal. It's |
---|---|---|---|
2 | safer to set them to NULL so that use-after-removal results in an | ||
3 | immediate segfault. | ||
2 | 4 | ||
3 | This fixes subtle corruption introduced by luks threaded encryption | 5 | The RCU queue removal macros are unchanged since nodes may still be |
4 | in commit 8ac0f15f335 | 6 | traversed after removal. |
5 | 7 | ||
6 | Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1745922 | 8 | Suggested-by: Paolo Bonzini <pbonzini@redhat.com> |
9 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
10 | Link: https://lore.kernel.org/r/20200224103406.1894923-2-stefanha@redhat.com | ||
11 | Message-Id: <20200224103406.1894923-2-stefanha@redhat.com> | ||
12 | --- | ||
13 | include/qemu/queue.h | 19 +++++++++++++++---- | ||
14 | 1 file changed, 15 insertions(+), 4 deletions(-) | ||
7 | 15 | ||
8 | The corruption happens when we do a write that | 16 | diff --git a/include/qemu/queue.h b/include/qemu/queue.h |
9 | * writes to two or more unallocated clusters at once | 17 | index XXXXXXX..XXXXXXX 100644 |
10 | * doesn't fully cover the first sector | 18 | --- a/include/qemu/queue.h |
11 | * doesn't fully cover the last sector | 19 | +++ b/include/qemu/queue.h |
12 | * uses luks encryption | 20 | @@ -XXX,XX +XXX,XX @@ struct { \ |
21 | (elm)->field.le_next->field.le_prev = \ | ||
22 | (elm)->field.le_prev; \ | ||
23 | *(elm)->field.le_prev = (elm)->field.le_next; \ | ||
24 | + (elm)->field.le_next = NULL; \ | ||
25 | + (elm)->field.le_prev = NULL; \ | ||
26 | } while (/*CONSTCOND*/0) | ||
27 | |||
28 | /* | ||
29 | @@ -XXX,XX +XXX,XX @@ struct { \ | ||
30 | } while (/*CONSTCOND*/0) | ||
31 | |||
32 | #define QSLIST_REMOVE_HEAD(head, field) do { \ | ||
33 | - (head)->slh_first = (head)->slh_first->field.sle_next; \ | ||
34 | + typeof((head)->slh_first) elm = (head)->slh_first; \ | ||
35 | + (head)->slh_first = elm->field.sle_next; \ | ||
36 | + elm->field.sle_next = NULL; \ | ||
37 | } while (/*CONSTCOND*/0) | ||
38 | |||
39 | #define QSLIST_REMOVE_AFTER(slistelm, field) do { \ | ||
40 | - (slistelm)->field.sle_next = \ | ||
41 | - QSLIST_NEXT(QSLIST_NEXT((slistelm), field), field); \ | ||
42 | + typeof(slistelm) next = (slistelm)->field.sle_next; \ | ||
43 | + (slistelm)->field.sle_next = next->field.sle_next; \ | ||
44 | + next->field.sle_next = NULL; \ | ||
45 | } while (/*CONSTCOND*/0) | ||
46 | |||
47 | #define QSLIST_REMOVE(head, elm, type, field) do { \ | ||
48 | @@ -XXX,XX +XXX,XX @@ struct { \ | ||
49 | while (curelm->field.sle_next != (elm)) \ | ||
50 | curelm = curelm->field.sle_next; \ | ||
51 | curelm->field.sle_next = curelm->field.sle_next->field.sle_next; \ | ||
52 | + (elm)->field.sle_next = NULL; \ | ||
53 | } \ | ||
54 | } while (/*CONSTCOND*/0) | ||
55 | |||
56 | @@ -XXX,XX +XXX,XX @@ struct { \ | ||
57 | } while (/*CONSTCOND*/0) | ||
58 | |||
59 | #define QSIMPLEQ_REMOVE_HEAD(head, field) do { \ | ||
60 | - if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL)\ | ||
61 | + typeof((head)->sqh_first) elm = (head)->sqh_first; \ | ||
62 | + if (((head)->sqh_first = elm->field.sqe_next) == NULL) \ | ||
63 | (head)->sqh_last = &(head)->sqh_first; \ | ||
64 | + elm->field.sqe_next = NULL; \ | ||
65 | } while (/*CONSTCOND*/0) | ||
66 | |||
67 | #define QSIMPLEQ_SPLIT_AFTER(head, elm, field, removed) do { \ | ||
68 | @@ -XXX,XX +XXX,XX @@ struct { \ | ||
69 | if ((curelm->field.sqe_next = \ | ||
70 | curelm->field.sqe_next->field.sqe_next) == NULL) \ | ||
71 | (head)->sqh_last = &(curelm)->field.sqe_next; \ | ||
72 | + (elm)->field.sqe_next = NULL; \ | ||
73 | } \ | ||
74 | } while (/*CONSTCOND*/0) | ||
75 | |||
76 | @@ -XXX,XX +XXX,XX @@ union { \ | ||
77 | (head)->tqh_circ.tql_prev = (elm)->field.tqe_circ.tql_prev; \ | ||
78 | (elm)->field.tqe_circ.tql_prev->tql_next = (elm)->field.tqe_next; \ | ||
79 | (elm)->field.tqe_circ.tql_prev = NULL; \ | ||
80 | + (elm)->field.tqe_circ.tql_next = NULL; \ | ||
81 | + (elm)->field.tqe_next = NULL; \ | ||
82 | } while (/*CONSTCOND*/0) | ||
83 | |||
84 | /* remove @left, @right and all elements in between from @head */ | ||
85 | -- | ||
86 | 2.24.1 | ||
13 | 87 | ||
14 | In this case, when allocating the new clusters we COW both areas | ||
15 | prior to the write and after the write, and we encrypt them. | ||
16 | |||
17 | The above mentioned commit accidentally made it so we encrypt the | ||
18 | second COW area using the physical cluster offset of the first area. | ||
19 | |||
20 | The problem is that offset_in_cluster in do_perform_cow_encrypt | ||
21 | can be larger that the cluster size, thus cluster_offset | ||
22 | will no longer point to the start of the cluster at which encrypted | ||
23 | area starts. | ||
24 | |||
25 | Next patch in this series will refactor the code to avoid all these | ||
26 | assumptions. | ||
27 | |||
28 | In the bugreport that was triggered by rebasing a luks image to new, | ||
29 | zero filled base, which lot of such writes, and causes some files | ||
30 | with zero areas to contain garbage there instead. | ||
31 | But as described above it can happen elsewhere as well | ||
32 | |||
33 | Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> | ||
34 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
35 | Message-id: 20190915203655.21638-2-mlevitsk@redhat.com | ||
36 | Reviewed-by: Max Reitz <mreitz@redhat.com> | ||
37 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
38 | --- | ||
39 | block/qcow2-cluster.c | 7 ++++--- | ||
40 | 1 file changed, 4 insertions(+), 3 deletions(-) | ||
41 | |||
42 | diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c | ||
43 | index XXXXXXX..XXXXXXX 100644 | ||
44 | --- a/block/qcow2-cluster.c | ||
45 | +++ b/block/qcow2-cluster.c | ||
46 | @@ -XXX,XX +XXX,XX @@ static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs, | ||
47 | assert(QEMU_IS_ALIGNED(offset_in_cluster, BDRV_SECTOR_SIZE)); | ||
48 | assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); | ||
49 | assert(s->crypto); | ||
50 | - if (qcow2_co_encrypt(bs, cluster_offset, | ||
51 | - src_cluster_offset + offset_in_cluster, | ||
52 | - buffer, bytes) < 0) { | ||
53 | + if (qcow2_co_encrypt(bs, | ||
54 | + start_of_cluster(s, cluster_offset + offset_in_cluster), | ||
55 | + src_cluster_offset + offset_in_cluster, | ||
56 | + buffer, bytes) < 0) { | ||
57 | return false; | ||
58 | } | ||
59 | } | ||
60 | -- | ||
61 | 2.21.0 | ||
62 | |||
63 | diff view generated by jsdifflib |
1 | From: Thomas Huth <thuth@redhat.com> | 1 | QLIST_SAFE_REMOVE() is confusing here because the node must be on the |
---|---|---|---|
2 | list. We actually just wanted to clear the linked list pointers when | ||
3 | removing it from the list. QLIST_REMOVE() now does this, so switch to | ||
4 | it. | ||
2 | 5 | ||
3 | The check script is already printing out which iotest is currently | 6 | Suggested-by: Paolo Bonzini <pbonzini@redhat.com> |
4 | running, so printing out the name of the check-block.sh shell script | 7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
5 | looks superfluous here. | 8 | Link: https://lore.kernel.org/r/20200224103406.1894923-3-stefanha@redhat.com |
6 | 9 | Message-Id: <20200224103406.1894923-3-stefanha@redhat.com> | |
7 | Signed-off-by: Thomas Huth <thuth@redhat.com> | ||
8 | Message-id: 20190906113534.10907-1-thuth@redhat.com | ||
9 | Acked-by: John Snow <jsnow@redhat.com> | ||
10 | Reviewed-by: Max Reitz <mreitz@redhat.com> | ||
11 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
12 | --- | 10 | --- |
13 | tests/Makefile.include | 2 +- | 11 | util/aio-posix.c | 2 +- |
14 | 1 file changed, 1 insertion(+), 1 deletion(-) | 12 | 1 file changed, 1 insertion(+), 1 deletion(-) |
15 | 13 | ||
16 | diff --git a/tests/Makefile.include b/tests/Makefile.include | 14 | diff --git a/util/aio-posix.c b/util/aio-posix.c |
17 | index XXXXXXX..XXXXXXX 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
18 | --- a/tests/Makefile.include | 16 | --- a/util/aio-posix.c |
19 | +++ b/tests/Makefile.include | 17 | +++ b/util/aio-posix.c |
20 | @@ -XXX,XX +XXX,XX @@ QEMU_IOTESTS_HELPERS-$(call land,$(CONFIG_SOFTMMU),$(CONFIG_LINUX)) = tests/qemu | 18 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_ready_handlers(AioContext *ctx, |
21 | check-tests/check-block.sh: tests/check-block.sh qemu-img$(EXESUF) \ | 19 | AioHandler *node; |
22 | qemu-io$(EXESUF) qemu-nbd$(EXESUF) $(QEMU_IOTESTS_HELPERS-y) \ | 20 | |
23 | $(patsubst %,%/all,$(filter %-softmmu,$(TARGET_DIRS))) | 21 | while ((node = QLIST_FIRST(ready_list))) { |
24 | - $< | 22 | - QLIST_SAFE_REMOVE(node, node_ready); |
25 | + @$< | 23 | + QLIST_REMOVE(node, node_ready); |
26 | 24 | progress = aio_dispatch_handler(ctx, node) || progress; | |
27 | .PHONY: $(patsubst %, check-%, $(check-qapi-schema-y)) | 25 | } |
28 | $(patsubst %, check-%, $(check-qapi-schema-y)): check-%.json: $(SRC_PATH)/%.json | 26 | |
29 | -- | 27 | -- |
30 | 2.21.0 | 28 | 2.24.1 |
31 | 29 | ||
32 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
2 | 1 | ||
3 | qemu-io now prefixes its error and warnings with "qemu-io:". | ||
4 | 36b9986b08787019e fixed a lot of iotests output but forget about | ||
5 | 026.out.nocache. Fix it too. | ||
6 | |||
7 | Fixes: 99e98d7c9fc1a1639fad ("qemu-io: Use error_[gs]et_progname()") | ||
8 | Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | ||
9 | Message-id: 20190816153015.447957-2-vsementsov@virtuozzo.com | ||
10 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
11 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
12 | --- | ||
13 | tests/qemu-iotests/026.out.nocache | 168 ++++++++++++++--------------- | ||
14 | 1 file changed, 84 insertions(+), 84 deletions(-) | ||
15 | |||
16 | diff --git a/tests/qemu-iotests/026.out.nocache b/tests/qemu-iotests/026.out.nocache | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/tests/qemu-iotests/026.out.nocache | ||
19 | +++ b/tests/qemu-iotests/026.out.nocache | ||
20 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
21 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
22 | |||
23 | Event: l1_update; errno: 5; imm: off; once: off; write | ||
24 | -Failed to flush the L2 table cache: Input/output error | ||
25 | -Failed to flush the refcount block cache: Input/output error | ||
26 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
27 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
28 | write failed: Input/output error | ||
29 | |||
30 | 1 leaked clusters were found on the image. | ||
31 | @@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data. | ||
32 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
33 | |||
34 | Event: l1_update; errno: 5; imm: off; once: off; write -b | ||
35 | -Failed to flush the L2 table cache: Input/output error | ||
36 | -Failed to flush the refcount block cache: Input/output error | ||
37 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
38 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
39 | write failed: Input/output error | ||
40 | |||
41 | 1 leaked clusters were found on the image. | ||
42 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
43 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
44 | |||
45 | Event: l1_update; errno: 28; imm: off; once: off; write | ||
46 | -Failed to flush the L2 table cache: No space left on device | ||
47 | -Failed to flush the refcount block cache: No space left on device | ||
48 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
49 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
50 | write failed: No space left on device | ||
51 | |||
52 | 1 leaked clusters were found on the image. | ||
53 | @@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data. | ||
54 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
55 | |||
56 | Event: l1_update; errno: 28; imm: off; once: off; write -b | ||
57 | -Failed to flush the L2 table cache: No space left on device | ||
58 | -Failed to flush the refcount block cache: No space left on device | ||
59 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
60 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
61 | write failed: No space left on device | ||
62 | |||
63 | 1 leaked clusters were found on the image. | ||
64 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
65 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
66 | |||
67 | Event: l2_update; errno: 5; imm: off; once: off; write | ||
68 | -Failed to flush the L2 table cache: Input/output error | ||
69 | -Failed to flush the refcount block cache: Input/output error | ||
70 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
71 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
72 | wrote 131072/131072 bytes at offset 0 | ||
73 | 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | ||
74 | |||
75 | @@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data. | ||
76 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
77 | |||
78 | Event: l2_update; errno: 5; imm: off; once: off; write -b | ||
79 | -Failed to flush the L2 table cache: Input/output error | ||
80 | -Failed to flush the refcount block cache: Input/output error | ||
81 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
82 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
83 | wrote 131072/131072 bytes at offset 0 | ||
84 | 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | ||
85 | |||
86 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
87 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
88 | |||
89 | Event: l2_update; errno: 28; imm: off; once: off; write | ||
90 | -Failed to flush the L2 table cache: No space left on device | ||
91 | -Failed to flush the refcount block cache: No space left on device | ||
92 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
93 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
94 | wrote 131072/131072 bytes at offset 0 | ||
95 | 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | ||
96 | |||
97 | @@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data. | ||
98 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
99 | |||
100 | Event: l2_update; errno: 28; imm: off; once: off; write -b | ||
101 | -Failed to flush the L2 table cache: No space left on device | ||
102 | -Failed to flush the refcount block cache: No space left on device | ||
103 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
104 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
105 | wrote 131072/131072 bytes at offset 0 | ||
106 | 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | ||
107 | |||
108 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
109 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
110 | |||
111 | Event: l2_alloc_write; errno: 5; imm: off; once: off; write | ||
112 | -Failed to flush the L2 table cache: Input/output error | ||
113 | -Failed to flush the refcount block cache: Input/output error | ||
114 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
115 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
116 | write failed: Input/output error | ||
117 | No errors were found on the image. | ||
118 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
119 | |||
120 | Event: l2_alloc_write; errno: 5; imm: off; once: off; write -b | ||
121 | -Failed to flush the L2 table cache: Input/output error | ||
122 | -Failed to flush the refcount block cache: Input/output error | ||
123 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
124 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
125 | write failed: Input/output error | ||
126 | |||
127 | 1 leaked clusters were found on the image. | ||
128 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
129 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
130 | |||
131 | Event: l2_alloc_write; errno: 28; imm: off; once: off; write | ||
132 | -Failed to flush the L2 table cache: No space left on device | ||
133 | -Failed to flush the refcount block cache: No space left on device | ||
134 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
135 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
136 | write failed: No space left on device | ||
137 | No errors were found on the image. | ||
138 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
139 | |||
140 | Event: l2_alloc_write; errno: 28; imm: off; once: off; write -b | ||
141 | -Failed to flush the L2 table cache: No space left on device | ||
142 | -Failed to flush the refcount block cache: No space left on device | ||
143 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
144 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
145 | write failed: No space left on device | ||
146 | |||
147 | 1 leaked clusters were found on the image. | ||
148 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
149 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
150 | |||
151 | Event: write_aio; errno: 5; imm: off; once: off; write | ||
152 | -Failed to flush the L2 table cache: Input/output error | ||
153 | -Failed to flush the refcount block cache: Input/output error | ||
154 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
155 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
156 | write failed: Input/output error | ||
157 | No errors were found on the image. | ||
158 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
159 | |||
160 | Event: write_aio; errno: 5; imm: off; once: off; write -b | ||
161 | -Failed to flush the L2 table cache: Input/output error | ||
162 | -Failed to flush the refcount block cache: Input/output error | ||
163 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
164 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
165 | write failed: Input/output error | ||
166 | No errors were found on the image. | ||
167 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
168 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
169 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
170 | |||
171 | Event: write_aio; errno: 28; imm: off; once: off; write | ||
172 | -Failed to flush the L2 table cache: No space left on device | ||
173 | -Failed to flush the refcount block cache: No space left on device | ||
174 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
175 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
176 | write failed: No space left on device | ||
177 | No errors were found on the image. | ||
178 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
179 | |||
180 | Event: write_aio; errno: 28; imm: off; once: off; write -b | ||
181 | -Failed to flush the L2 table cache: No space left on device | ||
182 | -Failed to flush the refcount block cache: No space left on device | ||
183 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
184 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
185 | write failed: No space left on device | ||
186 | No errors were found on the image. | ||
187 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
188 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
189 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
190 | |||
191 | Event: refblock_load; errno: 5; imm: off; once: off; write | ||
192 | -Failed to flush the L2 table cache: Input/output error | ||
193 | -Failed to flush the refcount block cache: Input/output error | ||
194 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
195 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
196 | write failed: Input/output error | ||
197 | No errors were found on the image. | ||
198 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
199 | |||
200 | Event: refblock_load; errno: 5; imm: off; once: off; write -b | ||
201 | -Failed to flush the L2 table cache: Input/output error | ||
202 | -Failed to flush the refcount block cache: Input/output error | ||
203 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
204 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
205 | write failed: Input/output error | ||
206 | No errors were found on the image. | ||
207 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
208 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
209 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
210 | |||
211 | Event: refblock_load; errno: 28; imm: off; once: off; write | ||
212 | -Failed to flush the L2 table cache: No space left on device | ||
213 | -Failed to flush the refcount block cache: No space left on device | ||
214 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
215 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
216 | write failed: No space left on device | ||
217 | No errors were found on the image. | ||
218 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
219 | |||
220 | Event: refblock_load; errno: 28; imm: off; once: off; write -b | ||
221 | -Failed to flush the L2 table cache: No space left on device | ||
222 | -Failed to flush the refcount block cache: No space left on device | ||
223 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
224 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
225 | write failed: No space left on device | ||
226 | No errors were found on the image. | ||
227 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
228 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
229 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
230 | |||
231 | Event: refblock_update_part; errno: 5; imm: off; once: off; write | ||
232 | -Failed to flush the L2 table cache: Input/output error | ||
233 | -Failed to flush the refcount block cache: Input/output error | ||
234 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
235 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
236 | write failed: Input/output error | ||
237 | No errors were found on the image. | ||
238 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
239 | |||
240 | Event: refblock_update_part; errno: 5; imm: off; once: off; write -b | ||
241 | -Failed to flush the L2 table cache: Input/output error | ||
242 | -Failed to flush the refcount block cache: Input/output error | ||
243 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
244 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
245 | write failed: Input/output error | ||
246 | No errors were found on the image. | ||
247 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
248 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
249 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
250 | |||
251 | Event: refblock_update_part; errno: 28; imm: off; once: off; write | ||
252 | -Failed to flush the L2 table cache: No space left on device | ||
253 | -Failed to flush the refcount block cache: No space left on device | ||
254 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
255 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
256 | write failed: No space left on device | ||
257 | No errors were found on the image. | ||
258 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
259 | |||
260 | Event: refblock_update_part; errno: 28; imm: off; once: off; write -b | ||
261 | -Failed to flush the L2 table cache: No space left on device | ||
262 | -Failed to flush the refcount block cache: No space left on device | ||
263 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
264 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
265 | write failed: No space left on device | ||
266 | No errors were found on the image. | ||
267 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
268 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
269 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
270 | |||
271 | Event: refblock_alloc; errno: 5; imm: off; once: off; write | ||
272 | -Failed to flush the L2 table cache: Input/output error | ||
273 | -Failed to flush the refcount block cache: Input/output error | ||
274 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
275 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
276 | write failed: Input/output error | ||
277 | No errors were found on the image. | ||
278 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
279 | |||
280 | Event: refblock_alloc; errno: 5; imm: off; once: off; write -b | ||
281 | -Failed to flush the L2 table cache: Input/output error | ||
282 | -Failed to flush the refcount block cache: Input/output error | ||
283 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
284 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
285 | write failed: Input/output error | ||
286 | No errors were found on the image. | ||
287 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
288 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
289 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
290 | |||
291 | Event: refblock_alloc; errno: 28; imm: off; once: off; write | ||
292 | -Failed to flush the L2 table cache: No space left on device | ||
293 | -Failed to flush the refcount block cache: No space left on device | ||
294 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
295 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
296 | write failed: No space left on device | ||
297 | No errors were found on the image. | ||
298 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
299 | |||
300 | Event: refblock_alloc; errno: 28; imm: off; once: off; write -b | ||
301 | -Failed to flush the L2 table cache: No space left on device | ||
302 | -Failed to flush the refcount block cache: No space left on device | ||
303 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
304 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
305 | write failed: No space left on device | ||
306 | No errors were found on the image. | ||
307 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
308 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
309 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
310 | |||
311 | Event: refblock_alloc_hookup; errno: 28; imm: off; once: off; write | ||
312 | -Failed to flush the L2 table cache: No space left on device | ||
313 | -Failed to flush the refcount block cache: No space left on device | ||
314 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
315 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
316 | write failed: No space left on device | ||
317 | |||
318 | 55 leaked clusters were found on the image. | ||
319 | @@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data. | ||
320 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
321 | |||
322 | Event: refblock_alloc_hookup; errno: 28; imm: off; once: off; write -b | ||
323 | -Failed to flush the L2 table cache: No space left on device | ||
324 | -Failed to flush the refcount block cache: No space left on device | ||
325 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
326 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
327 | write failed: No space left on device | ||
328 | |||
329 | 251 leaked clusters were found on the image. | ||
330 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
331 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
332 | |||
333 | Event: refblock_alloc_write; errno: 28; imm: off; once: off; write | ||
334 | -Failed to flush the L2 table cache: No space left on device | ||
335 | -Failed to flush the refcount block cache: No space left on device | ||
336 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
337 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
338 | write failed: No space left on device | ||
339 | No errors were found on the image. | ||
340 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
341 | |||
342 | Event: refblock_alloc_write; errno: 28; imm: off; once: off; write -b | ||
343 | -Failed to flush the L2 table cache: No space left on device | ||
344 | -Failed to flush the refcount block cache: No space left on device | ||
345 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
346 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
347 | write failed: No space left on device | ||
348 | No errors were found on the image. | ||
349 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
350 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
351 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
352 | |||
353 | Event: refblock_alloc_write_blocks; errno: 28; imm: off; once: off; write | ||
354 | -Failed to flush the L2 table cache: No space left on device | ||
355 | -Failed to flush the refcount block cache: No space left on device | ||
356 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
357 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
358 | write failed: No space left on device | ||
359 | |||
360 | 10 leaked clusters were found on the image. | ||
361 | @@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data. | ||
362 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
363 | |||
364 | Event: refblock_alloc_write_blocks; errno: 28; imm: off; once: off; write -b | ||
365 | -Failed to flush the L2 table cache: No space left on device | ||
366 | -Failed to flush the refcount block cache: No space left on device | ||
367 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
368 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
369 | write failed: No space left on device | ||
370 | |||
371 | 23 leaked clusters were found on the image. | ||
372 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
373 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
374 | |||
375 | Event: refblock_alloc_write_table; errno: 28; imm: off; once: off; write | ||
376 | -Failed to flush the L2 table cache: No space left on device | ||
377 | -Failed to flush the refcount block cache: No space left on device | ||
378 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
379 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
380 | write failed: No space left on device | ||
381 | |||
382 | 10 leaked clusters were found on the image. | ||
383 | @@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data. | ||
384 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
385 | |||
386 | Event: refblock_alloc_write_table; errno: 28; imm: off; once: off; write -b | ||
387 | -Failed to flush the L2 table cache: No space left on device | ||
388 | -Failed to flush the refcount block cache: No space left on device | ||
389 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
390 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
391 | write failed: No space left on device | ||
392 | |||
393 | 23 leaked clusters were found on the image. | ||
394 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
395 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
396 | |||
397 | Event: refblock_alloc_switch_table; errno: 28; imm: off; once: off; write | ||
398 | -Failed to flush the L2 table cache: No space left on device | ||
399 | -Failed to flush the refcount block cache: No space left on device | ||
400 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
401 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
402 | write failed: No space left on device | ||
403 | |||
404 | 10 leaked clusters were found on the image. | ||
405 | @@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data. | ||
406 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
407 | |||
408 | Event: refblock_alloc_switch_table; errno: 28; imm: off; once: off; write -b | ||
409 | -Failed to flush the L2 table cache: No space left on device | ||
410 | -Failed to flush the refcount block cache: No space left on device | ||
411 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
412 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
413 | write failed: No space left on device | ||
414 | |||
415 | 23 leaked clusters were found on the image. | ||
416 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
417 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
418 | |||
419 | Event: l1_grow_write_table; errno: 5; imm: off; once: off | ||
420 | -Failed to flush the L2 table cache: Input/output error | ||
421 | -Failed to flush the refcount block cache: Input/output error | ||
422 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
423 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
424 | write failed: Input/output error | ||
425 | No errors were found on the image. | ||
426 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
427 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
428 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
429 | |||
430 | Event: l1_grow_write_table; errno: 28; imm: off; once: off | ||
431 | -Failed to flush the L2 table cache: No space left on device | ||
432 | -Failed to flush the refcount block cache: No space left on device | ||
433 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
434 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
435 | write failed: No space left on device | ||
436 | No errors were found on the image. | ||
437 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
438 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
439 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
440 | |||
441 | Event: l1_grow_activate_table; errno: 5; imm: off; once: off | ||
442 | -Failed to flush the L2 table cache: Input/output error | ||
443 | -Failed to flush the refcount block cache: Input/output error | ||
444 | +qemu-io: Failed to flush the L2 table cache: Input/output error | ||
445 | +qemu-io: Failed to flush the refcount block cache: Input/output error | ||
446 | write failed: Input/output error | ||
447 | |||
448 | 96 leaked clusters were found on the image. | ||
449 | @@ -XXX,XX +XXX,XX @@ No errors were found on the image. | ||
450 | Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 | ||
451 | |||
452 | Event: l1_grow_activate_table; errno: 28; imm: off; once: off | ||
453 | -Failed to flush the L2 table cache: No space left on device | ||
454 | -Failed to flush the refcount block cache: No space left on device | ||
455 | +qemu-io: Failed to flush the L2 table cache: No space left on device | ||
456 | +qemu-io: Failed to flush the refcount block cache: No space left on device | ||
457 | write failed: No space left on device | ||
458 | |||
459 | 96 leaked clusters were found on the image. | ||
460 | -- | ||
461 | 2.21.0 | ||
462 | |||
463 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | A follow-up patch will make curl_multi_do() and curl_multi_read() take a | ||
2 | CURLSocket instead of the CURLState. They still need the latter, | ||
3 | though, so add a pointer to it to the former. | ||
4 | 1 | ||
5 | Cc: qemu-stable@nongnu.org | ||
6 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
7 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
8 | Message-id: 20190910124136.10565-2-mreitz@redhat.com | ||
9 | Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> | ||
10 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
11 | --- | ||
12 | block/curl.c | 3 +++ | ||
13 | 1 file changed, 3 insertions(+) | ||
14 | |||
15 | diff --git a/block/curl.c b/block/curl.c | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/block/curl.c | ||
18 | +++ b/block/curl.c | ||
19 | @@ -XXX,XX +XXX,XX @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle, | ||
20 | #define CURL_BLOCK_OPT_TIMEOUT_DEFAULT 5 | ||
21 | |||
22 | struct BDRVCURLState; | ||
23 | +struct CURLState; | ||
24 | |||
25 | static bool libcurl_initialized; | ||
26 | |||
27 | @@ -XXX,XX +XXX,XX @@ typedef struct CURLAIOCB { | ||
28 | |||
29 | typedef struct CURLSocket { | ||
30 | int fd; | ||
31 | + struct CURLState *state; | ||
32 | QLIST_ENTRY(CURLSocket) next; | ||
33 | } CURLSocket; | ||
34 | |||
35 | @@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, | ||
36 | if (!socket) { | ||
37 | socket = g_new0(CURLSocket, 1); | ||
38 | socket->fd = fd; | ||
39 | + socket->state = state; | ||
40 | QLIST_INSERT_HEAD(&state->sockets, socket, next); | ||
41 | } | ||
42 | socket = NULL; | ||
43 | -- | ||
44 | 2.21.0 | ||
45 | |||
46 | diff view generated by jsdifflib |
1 | This does not really change anything, but it makes the code a bit easier | 1 | One iteration of polling is always performed even when polling is |
---|---|---|---|
2 | to follow once we use @socket as the opaque pointer for | 2 | disabled. This is done because: |
3 | aio_set_fd_handler(). | 3 | 1. Userspace polling is cheaper than making a syscall. We might get |
4 | lucky. | ||
5 | 2. We must poll once more after polling has stopped in case an event | ||
6 | occurred while stopping polling. | ||
4 | 7 | ||
5 | Cc: qemu-stable@nongnu.org | 8 | However, there are downsides: |
6 | Signed-off-by: Max Reitz <mreitz@redhat.com> | 9 | 1. Polling becomes a bottleneck when the number of event sources is very |
7 | Message-id: 20190910124136.10565-3-mreitz@redhat.com | 10 | high. It's more efficient to monitor fds in that case. |
8 | Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> | 11 | 2. A high-frequency polling event source can starve non-polling event |
9 | Reviewed-by: John Snow <jsnow@redhat.com> | 12 | sources because ppoll(2)/epoll(7) is never invoked. |
10 | Signed-off-by: Max Reitz <mreitz@redhat.com> | 13 | |
14 | This patch removes the forced polling iteration so that poll_ns=0 really | ||
15 | means no polling. | ||
16 | |||
17 | IOPS increases from 10k to 60k when the guest has 100 | ||
18 | virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1 | ||
19 | device because the large number of event sources being polled slows down | ||
20 | the event loop. | ||
21 | |||
22 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
23 | Link: https://lore.kernel.org/r/20200305170806.1313245-2-stefanha@redhat.com | ||
24 | Message-Id: <20200305170806.1313245-2-stefanha@redhat.com> | ||
11 | --- | 25 | --- |
12 | block/curl.c | 10 +++++----- | 26 | util/aio-posix.c | 22 +++++++++++++++------- |
13 | 1 file changed, 5 insertions(+), 5 deletions(-) | 27 | 1 file changed, 15 insertions(+), 7 deletions(-) |
14 | 28 | ||
15 | diff --git a/block/curl.c b/block/curl.c | 29 | diff --git a/util/aio-posix.c b/util/aio-posix.c |
16 | index XXXXXXX..XXXXXXX 100644 | 30 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/curl.c | 31 | --- a/util/aio-posix.c |
18 | +++ b/block/curl.c | 32 | +++ b/util/aio-posix.c |
19 | @@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, | 33 | @@ -XXX,XX +XXX,XX @@ void aio_set_event_notifier_poll(AioContext *ctx, |
20 | 34 | (IOHandler *)io_poll_end); | |
21 | QLIST_FOREACH(socket, &state->sockets, next) { | 35 | } |
22 | if (socket->fd == fd) { | 36 | |
23 | - if (action == CURL_POLL_REMOVE) { | 37 | -static void poll_set_started(AioContext *ctx, bool started) |
24 | - QLIST_REMOVE(socket, next); | 38 | +static bool poll_set_started(AioContext *ctx, bool started) |
25 | - g_free(socket); | 39 | { |
26 | - } | 40 | AioHandler *node; |
27 | break; | 41 | + bool progress = false; |
42 | |||
43 | if (started == ctx->poll_started) { | ||
44 | - return; | ||
45 | + return false; | ||
46 | } | ||
47 | |||
48 | ctx->poll_started = started; | ||
49 | @@ -XXX,XX +XXX,XX @@ static void poll_set_started(AioContext *ctx, bool started) | ||
50 | if (fn) { | ||
51 | fn(node->opaque); | ||
52 | } | ||
53 | + | ||
54 | + /* Poll one last time in case ->io_poll_end() raced with the event */ | ||
55 | + if (!started) { | ||
56 | + progress = node->io_poll(node->opaque) || progress; | ||
57 | + } | ||
58 | } | ||
59 | qemu_lockcnt_dec(&ctx->list_lock); | ||
60 | + | ||
61 | + return progress; | ||
62 | } | ||
63 | |||
64 | |||
65 | @@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) | ||
28 | } | 66 | } |
29 | } | 67 | } |
30 | @@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, | 68 | |
31 | socket->state = state; | 69 | - poll_set_started(ctx, false); |
32 | QLIST_INSERT_HEAD(&state->sockets, socket, next); | 70 | + if (poll_set_started(ctx, false)) { |
33 | } | 71 | + *timeout = 0; |
34 | - socket = NULL; | 72 | + return true; |
35 | |||
36 | trace_curl_sock_cb(action, (int)fd); | ||
37 | switch (action) { | ||
38 | @@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, | ||
39 | break; | ||
40 | } | ||
41 | |||
42 | + if (action == CURL_POLL_REMOVE) { | ||
43 | + QLIST_REMOVE(socket, next); | ||
44 | + g_free(socket); | ||
45 | + } | 73 | + } |
46 | + | 74 | |
47 | return 0; | 75 | - /* Even if we don't run busy polling, try polling once in case it can make |
76 | - * progress and the caller will be able to avoid ppoll(2)/epoll_wait(2). | ||
77 | - */ | ||
78 | - return run_poll_handlers_once(ctx, timeout); | ||
79 | + return false; | ||
48 | } | 80 | } |
49 | 81 | ||
82 | bool aio_poll(AioContext *ctx, bool blocking) | ||
50 | -- | 83 | -- |
51 | 2.21.0 | 84 | 2.24.1 |
52 | 85 | ||
53 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | While it is more likely that transfers complete after some file | ||
2 | descriptor has data ready to read, we probably should not rely on it. | ||
3 | Better be safe than sorry and call curl_multi_check_completion() in | ||
4 | curl_multi_do(), too, just like it is done in curl_multi_read(). | ||
5 | 1 | ||
6 | With this change, curl_multi_do() and curl_multi_read() are actually the | ||
7 | same, so drop curl_multi_read() and use curl_multi_do() as the sole FD | ||
8 | handler. | ||
9 | |||
10 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
11 | Message-id: 20190910124136.10565-4-mreitz@redhat.com | ||
12 | Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> | ||
13 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
14 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
15 | --- | ||
16 | block/curl.c | 14 ++------------ | ||
17 | 1 file changed, 2 insertions(+), 12 deletions(-) | ||
18 | |||
19 | diff --git a/block/curl.c b/block/curl.c | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/block/curl.c | ||
22 | +++ b/block/curl.c | ||
23 | @@ -XXX,XX +XXX,XX @@ typedef struct BDRVCURLState { | ||
24 | |||
25 | static void curl_clean_state(CURLState *s); | ||
26 | static void curl_multi_do(void *arg); | ||
27 | -static void curl_multi_read(void *arg); | ||
28 | |||
29 | #ifdef NEED_CURL_TIMER_CALLBACK | ||
30 | /* Called from curl_multi_do_locked, with s->mutex held. */ | ||
31 | @@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, | ||
32 | switch (action) { | ||
33 | case CURL_POLL_IN: | ||
34 | aio_set_fd_handler(s->aio_context, fd, false, | ||
35 | - curl_multi_read, NULL, NULL, state); | ||
36 | + curl_multi_do, NULL, NULL, state); | ||
37 | break; | ||
38 | case CURL_POLL_OUT: | ||
39 | aio_set_fd_handler(s->aio_context, fd, false, | ||
40 | @@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, | ||
41 | break; | ||
42 | case CURL_POLL_INOUT: | ||
43 | aio_set_fd_handler(s->aio_context, fd, false, | ||
44 | - curl_multi_read, curl_multi_do, NULL, state); | ||
45 | + curl_multi_do, curl_multi_do, NULL, state); | ||
46 | break; | ||
47 | case CURL_POLL_REMOVE: | ||
48 | aio_set_fd_handler(s->aio_context, fd, false, | ||
49 | @@ -XXX,XX +XXX,XX @@ static void curl_multi_do(void *arg) | ||
50 | { | ||
51 | CURLState *s = (CURLState *)arg; | ||
52 | |||
53 | - qemu_mutex_lock(&s->s->mutex); | ||
54 | - curl_multi_do_locked(s); | ||
55 | - qemu_mutex_unlock(&s->s->mutex); | ||
56 | -} | ||
57 | - | ||
58 | -static void curl_multi_read(void *arg) | ||
59 | -{ | ||
60 | - CURLState *s = (CURLState *)arg; | ||
61 | - | ||
62 | qemu_mutex_lock(&s->s->mutex); | ||
63 | curl_multi_do_locked(s); | ||
64 | curl_multi_check_completion(s->s); | ||
65 | -- | ||
66 | 2.21.0 | ||
67 | |||
68 | diff view generated by jsdifflib |
1 | If we had done that all along, debugging would have been much simpler. | 1 | Now that run_poll_handlers_once() is only called by run_poll_handlers() |
---|---|---|---|
2 | (Also, I/O errors are better than hangs.) | 2 | we can improve the CPU time profile by moving the expensive |
3 | RCU_READ_LOCK() out of the polling loop. | ||
3 | 4 | ||
4 | Signed-off-by: Max Reitz <mreitz@redhat.com> | 5 | This reduces the run_poll_handlers() from 40% CPU to 10% CPU in perf's |
5 | Message-id: 20190910124136.10565-8-mreitz@redhat.com | 6 | sampling profiler output. |
6 | Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> | 7 | |
7 | Reviewed-by: John Snow <jsnow@redhat.com> | 8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
8 | Signed-off-by: Max Reitz <mreitz@redhat.com> | 9 | Link: https://lore.kernel.org/r/20200305170806.1313245-3-stefanha@redhat.com |
10 | Message-Id: <20200305170806.1313245-3-stefanha@redhat.com> | ||
9 | --- | 11 | --- |
10 | block/curl.c | 8 +++++++- | 12 | util/aio-posix.c | 20 ++++++++++---------- |
11 | 1 file changed, 7 insertions(+), 1 deletion(-) | 13 | 1 file changed, 10 insertions(+), 10 deletions(-) |
12 | 14 | ||
13 | diff --git a/block/curl.c b/block/curl.c | 15 | diff --git a/util/aio-posix.c b/util/aio-posix.c |
14 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/block/curl.c | 17 | --- a/util/aio-posix.c |
16 | +++ b/block/curl.c | 18 | +++ b/util/aio-posix.c |
17 | @@ -XXX,XX +XXX,XX @@ static void curl_setup_preadv(BlockDriverState *bs, CURLAIOCB *acb) | 19 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) |
18 | trace_curl_setup_preadv(acb->bytes, start, state->range); | 20 | bool progress = false; |
19 | curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range); | 21 | AioHandler *node; |
20 | 22 | ||
21 | - curl_multi_add_handle(s->multi, state->curl); | 23 | - /* |
22 | + if (curl_multi_add_handle(s->multi, state->curl) != CURLM_OK) { | 24 | - * Optimization: ->io_poll() handlers often contain RCU read critical |
23 | + state->acb[0] = NULL; | 25 | - * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock() |
24 | + acb->ret = -EIO; | 26 | - * -> rcu_read_lock() -> ... sequences with expensive memory |
27 | - * synchronization primitives. Make the entire polling loop an RCU | ||
28 | - * critical section because nested rcu_read_lock()/rcu_read_unlock() calls | ||
29 | - * are cheap. | ||
30 | - */ | ||
31 | - RCU_READ_LOCK_GUARD(); | ||
32 | - | ||
33 | QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
34 | if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll && | ||
35 | aio_node_check(ctx, node->is_external) && | ||
36 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
37 | |||
38 | trace_run_poll_handlers_begin(ctx, max_ns, *timeout); | ||
39 | |||
40 | + /* | ||
41 | + * Optimization: ->io_poll() handlers often contain RCU read critical | ||
42 | + * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock() | ||
43 | + * -> rcu_read_lock() -> ... sequences with expensive memory | ||
44 | + * synchronization primitives. Make the entire polling loop an RCU | ||
45 | + * critical section because nested rcu_read_lock()/rcu_read_unlock() calls | ||
46 | + * are cheap. | ||
47 | + */ | ||
48 | + RCU_READ_LOCK_GUARD(); | ||
25 | + | 49 | + |
26 | + curl_clean_state(state); | 50 | start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); |
27 | + goto out; | 51 | do { |
28 | + } | 52 | progress = run_poll_handlers_once(ctx, timeout); |
29 | |||
30 | /* Tell curl it needs to kick things off */ | ||
31 | curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); | ||
32 | -- | 53 | -- |
33 | 2.21.0 | 54 | 2.24.1 |
34 | 55 | ||
35 | diff view generated by jsdifflib |
1 | From: Maxim Levitsky <mlevitsk@redhat.com> | 1 | The ppoll(2) and epoll(7) file descriptor monitoring implementations are |
---|---|---|---|
2 | mixed with the core util/aio-posix.c code. Before adding another | ||
3 | implementation for Linux io_uring, extract out the existing | ||
4 | ones so there is a clear interface and the core code is simpler. | ||
2 | 5 | ||
3 | * Change the qcow2_co_{encrypt|decrypt} to just receive full host and | 6 | The new interface is AioContext->fdmon_ops, a pointer to a FDMonOps |
4 | guest offsets and use this function directly instead of calling | 7 | struct. See the patch for details. |
5 | do_perform_cow_encrypt (which is removed by that patch). | ||
6 | 8 | ||
7 | * Adjust qcow2_co_encdec to take full host and guest offsets as well. | 9 | Semantic changes: |
10 | 1. ppoll(2) now reflects events from pollfds[] back into AioHandlers | ||
11 | while we're still on the clock for adaptive polling. This was | ||
12 | already happening for epoll(7), so if it's really an issue then we'll | ||
13 | need to fix both in the future. | ||
14 | 2. epoll(7)'s fallback to ppoll(2) while external events are disabled | ||
15 | was broken when the number of fds exceeded the epoll(7) upgrade | ||
16 | threshold. I guess this code path simply wasn't tested and no one | ||
17 | noticed the bug. I didn't go out of my way to fix it but the correct | ||
18 | code is simpler than preserving the bug. | ||
8 | 19 | ||
9 | * Document the qcow2_co_{encrypt|decrypt} arguments | 20 | I also took some liberties in removing the unnecessary |
10 | to prevent the bug fixed in former commit from hopefully | 21 | AioContext->epoll_available (just check AioContext->epollfd != -1 |
11 | happening again. | 22 | instead) and AioContext->epoll_enabled (it's implicit if our |
23 | AioContext->fdmon_ops callbacks are being invoked) fields. | ||
12 | 24 | ||
13 | Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> | 25 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
14 | Message-id: 20190915203655.21638-3-mlevitsk@redhat.com | 26 | Link: https://lore.kernel.org/r/20200305170806.1313245-4-stefanha@redhat.com |
15 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 27 | Message-Id: <20200305170806.1313245-4-stefanha@redhat.com> |
16 | [mreitz: Let perform_cow() return the error value returned by | ||
17 | qcow2_co_encrypt(), as proposed by Vladimir] | ||
18 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
19 | --- | 28 | --- |
20 | block/qcow2.h | 8 +++--- | 29 | MAINTAINERS | 2 + |
21 | block/qcow2-cluster.c | 41 +++++++++------------------- | 30 | include/block/aio.h | 36 +++++- |
22 | block/qcow2-threads.c | 63 +++++++++++++++++++++++++++++++++---------- | 31 | util/Makefile.objs | 2 + |
23 | block/qcow2.c | 5 ++-- | 32 | util/aio-posix.c | 286 ++------------------------------------------ |
24 | 4 files changed, 69 insertions(+), 48 deletions(-) | 33 | util/aio-posix.h | 61 ++++++++++ |
34 | util/fdmon-epoll.c | 151 +++++++++++++++++++++++ | ||
35 | util/fdmon-poll.c | 104 ++++++++++++++++ | ||
36 | 7 files changed, 366 insertions(+), 276 deletions(-) | ||
37 | create mode 100644 util/aio-posix.h | ||
38 | create mode 100644 util/fdmon-epoll.c | ||
39 | create mode 100644 util/fdmon-poll.c | ||
25 | 40 | ||
26 | diff --git a/block/qcow2.h b/block/qcow2.h | 41 | diff --git a/MAINTAINERS b/MAINTAINERS |
27 | index XXXXXXX..XXXXXXX 100644 | 42 | index XXXXXXX..XXXXXXX 100644 |
28 | --- a/block/qcow2.h | 43 | --- a/MAINTAINERS |
29 | +++ b/block/qcow2.h | 44 | +++ b/MAINTAINERS |
30 | @@ -XXX,XX +XXX,XX @@ ssize_t coroutine_fn | 45 | @@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org |
31 | qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size, | 46 | S: Supported |
32 | const void *src, size_t src_size); | 47 | F: util/async.c |
33 | int coroutine_fn | 48 | F: util/aio-*.c |
34 | -qcow2_co_encrypt(BlockDriverState *bs, uint64_t file_cluster_offset, | 49 | +F: util/aio-*.h |
35 | - uint64_t offset, void *buf, size_t len); | 50 | +F: util/fdmon-*.c |
36 | +qcow2_co_encrypt(BlockDriverState *bs, uint64_t host_offset, | 51 | F: block/io.c |
37 | + uint64_t guest_offset, void *buf, size_t len); | 52 | F: migration/block* |
38 | int coroutine_fn | 53 | F: include/block/aio.h |
39 | -qcow2_co_decrypt(BlockDriverState *bs, uint64_t file_cluster_offset, | 54 | diff --git a/include/block/aio.h b/include/block/aio.h |
40 | - uint64_t offset, void *buf, size_t len); | ||
41 | +qcow2_co_decrypt(BlockDriverState *bs, uint64_t host_offset, | ||
42 | + uint64_t guest_offset, void *buf, size_t len); | ||
43 | |||
44 | #endif | ||
45 | diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c | ||
46 | index XXXXXXX..XXXXXXX 100644 | 55 | index XXXXXXX..XXXXXXX 100644 |
47 | --- a/block/qcow2-cluster.c | 56 | --- a/include/block/aio.h |
48 | +++ b/block/qcow2-cluster.c | 57 | +++ b/include/block/aio.h |
49 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn do_perform_cow_read(BlockDriverState *bs, | 58 | @@ -XXX,XX +XXX,XX @@ struct ThreadPool; |
50 | return 0; | 59 | struct LinuxAioState; |
60 | struct LuringState; | ||
61 | |||
62 | +/* Callbacks for file descriptor monitoring implementations */ | ||
63 | +typedef struct { | ||
64 | + /* | ||
65 | + * update: | ||
66 | + * @ctx: the AioContext | ||
67 | + * @node: the handler | ||
68 | + * @is_new: is the file descriptor already being monitored? | ||
69 | + * | ||
70 | + * Add/remove/modify a monitored file descriptor. There are three cases: | ||
71 | + * 1. node->pfd.events == 0 means remove the file descriptor. | ||
72 | + * 2. !is_new means modify an already monitored file descriptor. | ||
73 | + * 3. is_new means add a new file descriptor. | ||
74 | + * | ||
75 | + * Called with ctx->list_lock acquired. | ||
76 | + */ | ||
77 | + void (*update)(AioContext *ctx, AioHandler *node, bool is_new); | ||
78 | + | ||
79 | + /* | ||
80 | + * wait: | ||
81 | + * @ctx: the AioContext | ||
82 | + * @ready_list: list for handlers that become ready | ||
83 | + * @timeout: maximum duration to wait, in nanoseconds | ||
84 | + * | ||
85 | + * Wait for file descriptors to become ready and place them on ready_list. | ||
86 | + * | ||
87 | + * Called with ctx->list_lock incremented but not locked. | ||
88 | + * | ||
89 | + * Returns: number of ready file descriptors. | ||
90 | + */ | ||
91 | + int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout); | ||
92 | +} FDMonOps; | ||
93 | + | ||
94 | /* | ||
95 | * Each aio_bh_poll() call carves off a slice of the BH list, so that newly | ||
96 | * scheduled BHs are not processed until the next aio_bh_poll() call. All | ||
97 | @@ -XXX,XX +XXX,XX @@ struct AioContext { | ||
98 | |||
99 | /* epoll(7) state used when built with CONFIG_EPOLL */ | ||
100 | int epollfd; | ||
101 | - bool epoll_enabled; | ||
102 | - bool epoll_available; | ||
103 | + | ||
104 | + const FDMonOps *fdmon_ops; | ||
105 | }; | ||
106 | |||
107 | /** | ||
108 | diff --git a/util/Makefile.objs b/util/Makefile.objs | ||
109 | index XXXXXXX..XXXXXXX 100644 | ||
110 | --- a/util/Makefile.objs | ||
111 | +++ b/util/Makefile.objs | ||
112 | @@ -XXX,XX +XXX,XX @@ util-obj-y += aiocb.o async.o aio-wait.o thread-pool.o qemu-timer.o | ||
113 | util-obj-y += main-loop.o | ||
114 | util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o | ||
115 | util-obj-$(CONFIG_POSIX) += aio-posix.o | ||
116 | +util-obj-$(CONFIG_POSIX) += fdmon-poll.o | ||
117 | +util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o | ||
118 | util-obj-$(CONFIG_POSIX) += compatfd.o | ||
119 | util-obj-$(CONFIG_POSIX) += event_notifier-posix.o | ||
120 | util-obj-$(CONFIG_POSIX) += mmap-alloc.o | ||
121 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
122 | index XXXXXXX..XXXXXXX 100644 | ||
123 | --- a/util/aio-posix.c | ||
124 | +++ b/util/aio-posix.c | ||
125 | @@ -XXX,XX +XXX,XX @@ | ||
126 | #include "qemu/sockets.h" | ||
127 | #include "qemu/cutils.h" | ||
128 | #include "trace.h" | ||
129 | -#ifdef CONFIG_EPOLL_CREATE1 | ||
130 | -#include <sys/epoll.h> | ||
131 | -#endif | ||
132 | +#include "aio-posix.h" | ||
133 | |||
134 | -struct AioHandler | ||
135 | -{ | ||
136 | - GPollFD pfd; | ||
137 | - IOHandler *io_read; | ||
138 | - IOHandler *io_write; | ||
139 | - AioPollFn *io_poll; | ||
140 | - IOHandler *io_poll_begin; | ||
141 | - IOHandler *io_poll_end; | ||
142 | - void *opaque; | ||
143 | - bool is_external; | ||
144 | - QLIST_ENTRY(AioHandler) node; | ||
145 | - QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ | ||
146 | - QLIST_ENTRY(AioHandler) node_deleted; | ||
147 | -}; | ||
148 | - | ||
149 | -/* Add a handler to a ready list */ | ||
150 | -static void add_ready_handler(AioHandlerList *ready_list, | ||
151 | - AioHandler *node, | ||
152 | - int revents) | ||
153 | +void aio_add_ready_handler(AioHandlerList *ready_list, | ||
154 | + AioHandler *node, | ||
155 | + int revents) | ||
156 | { | ||
157 | QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */ | ||
158 | node->pfd.revents = revents; | ||
159 | QLIST_INSERT_HEAD(ready_list, node, node_ready); | ||
51 | } | 160 | } |
52 | 161 | ||
53 | -static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs, | 162 | -#ifdef CONFIG_EPOLL_CREATE1 |
54 | - uint64_t src_cluster_offset, | 163 | - |
55 | - uint64_t cluster_offset, | 164 | -/* The fd number threshold to switch to epoll */ |
56 | - unsigned offset_in_cluster, | 165 | -#define EPOLL_ENABLE_THRESHOLD 64 |
57 | - uint8_t *buffer, | 166 | - |
58 | - unsigned bytes) | 167 | -static void aio_epoll_disable(AioContext *ctx) |
59 | -{ | 168 | -{ |
60 | - if (bytes && bs->encrypted) { | 169 | - ctx->epoll_enabled = false; |
61 | - BDRVQcow2State *s = bs->opaque; | 170 | - if (!ctx->epoll_available) { |
62 | - assert(QEMU_IS_ALIGNED(offset_in_cluster, BDRV_SECTOR_SIZE)); | 171 | - return; |
63 | - assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); | 172 | - } |
64 | - assert(s->crypto); | 173 | - ctx->epoll_available = false; |
65 | - if (qcow2_co_encrypt(bs, | 174 | - close(ctx->epollfd); |
66 | - start_of_cluster(s, cluster_offset + offset_in_cluster), | 175 | -} |
67 | - src_cluster_offset + offset_in_cluster, | 176 | - |
68 | - buffer, bytes) < 0) { | 177 | -static inline int epoll_events_from_pfd(int pfd_events) |
178 | -{ | ||
179 | - return (pfd_events & G_IO_IN ? EPOLLIN : 0) | | ||
180 | - (pfd_events & G_IO_OUT ? EPOLLOUT : 0) | | ||
181 | - (pfd_events & G_IO_HUP ? EPOLLHUP : 0) | | ||
182 | - (pfd_events & G_IO_ERR ? EPOLLERR : 0); | ||
183 | -} | ||
184 | - | ||
185 | -static bool aio_epoll_try_enable(AioContext *ctx) | ||
186 | -{ | ||
187 | - AioHandler *node; | ||
188 | - struct epoll_event event; | ||
189 | - | ||
190 | - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
191 | - int r; | ||
192 | - if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) { | ||
193 | - continue; | ||
194 | - } | ||
195 | - event.events = epoll_events_from_pfd(node->pfd.events); | ||
196 | - event.data.ptr = node; | ||
197 | - r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event); | ||
198 | - if (r) { | ||
69 | - return false; | 199 | - return false; |
70 | - } | 200 | - } |
71 | - } | 201 | - } |
202 | - ctx->epoll_enabled = true; | ||
72 | - return true; | 203 | - return true; |
73 | -} | 204 | -} |
74 | - | 205 | - |
75 | static int coroutine_fn do_perform_cow_write(BlockDriverState *bs, | 206 | -static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) |
76 | uint64_t cluster_offset, | 207 | -{ |
77 | unsigned offset_in_cluster, | 208 | - struct epoll_event event; |
78 | @@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) | 209 | - int r; |
79 | 210 | - int ctl; | |
80 | /* Encrypt the data if necessary before writing it */ | 211 | - |
81 | if (bs->encrypted) { | 212 | - if (!ctx->epoll_enabled) { |
82 | - if (!do_perform_cow_encrypt(bs, m->offset, m->alloc_offset, | 213 | - return; |
83 | - start->offset, start_buffer, | 214 | - } |
84 | - start->nb_bytes) || | 215 | - if (!node->pfd.events) { |
85 | - !do_perform_cow_encrypt(bs, m->offset, m->alloc_offset, | 216 | - ctl = EPOLL_CTL_DEL; |
86 | - end->offset, end_buffer, end->nb_bytes)) { | 217 | - } else { |
87 | - ret = -EIO; | 218 | - event.data.ptr = node; |
88 | + ret = qcow2_co_encrypt(bs, | 219 | - event.events = epoll_events_from_pfd(node->pfd.events); |
89 | + m->alloc_offset + start->offset, | 220 | - ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD; |
90 | + m->offset + start->offset, | 221 | - } |
91 | + start_buffer, start->nb_bytes); | 222 | - |
92 | + if (ret < 0) { | 223 | - r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event); |
93 | + goto fail; | 224 | - if (r) { |
94 | + } | 225 | - aio_epoll_disable(ctx); |
95 | + | 226 | - } |
96 | + ret = qcow2_co_encrypt(bs, | 227 | -} |
97 | + m->alloc_offset + end->offset, | 228 | - |
98 | + m->offset + end->offset, | 229 | -static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list, |
99 | + end_buffer, end->nb_bytes); | 230 | - int64_t timeout) |
100 | + if (ret < 0) { | 231 | -{ |
101 | goto fail; | 232 | - GPollFD pfd = { |
233 | - .fd = ctx->epollfd, | ||
234 | - .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR, | ||
235 | - }; | ||
236 | - AioHandler *node; | ||
237 | - int i, ret = 0; | ||
238 | - struct epoll_event events[128]; | ||
239 | - | ||
240 | - if (timeout > 0) { | ||
241 | - ret = qemu_poll_ns(&pfd, 1, timeout); | ||
242 | - if (ret > 0) { | ||
243 | - timeout = 0; | ||
244 | - } | ||
245 | - } | ||
246 | - if (timeout <= 0 || ret > 0) { | ||
247 | - ret = epoll_wait(ctx->epollfd, events, | ||
248 | - ARRAY_SIZE(events), | ||
249 | - timeout); | ||
250 | - if (ret <= 0) { | ||
251 | - goto out; | ||
252 | - } | ||
253 | - for (i = 0; i < ret; i++) { | ||
254 | - int ev = events[i].events; | ||
255 | - int revents = (ev & EPOLLIN ? G_IO_IN : 0) | | ||
256 | - (ev & EPOLLOUT ? G_IO_OUT : 0) | | ||
257 | - (ev & EPOLLHUP ? G_IO_HUP : 0) | | ||
258 | - (ev & EPOLLERR ? G_IO_ERR : 0); | ||
259 | - | ||
260 | - node = events[i].data.ptr; | ||
261 | - add_ready_handler(ready_list, node, revents); | ||
262 | - } | ||
263 | - } | ||
264 | -out: | ||
265 | - return ret; | ||
266 | -} | ||
267 | - | ||
268 | -static bool aio_epoll_enabled(AioContext *ctx) | ||
269 | -{ | ||
270 | - /* Fall back to ppoll when external clients are disabled. */ | ||
271 | - return !aio_external_disabled(ctx) && ctx->epoll_enabled; | ||
272 | -} | ||
273 | - | ||
274 | -static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds, | ||
275 | - unsigned npfd, int64_t timeout) | ||
276 | -{ | ||
277 | - if (!ctx->epoll_available) { | ||
278 | - return false; | ||
279 | - } | ||
280 | - if (aio_epoll_enabled(ctx)) { | ||
281 | - return true; | ||
282 | - } | ||
283 | - if (npfd >= EPOLL_ENABLE_THRESHOLD) { | ||
284 | - if (aio_epoll_try_enable(ctx)) { | ||
285 | - return true; | ||
286 | - } else { | ||
287 | - aio_epoll_disable(ctx); | ||
288 | - } | ||
289 | - } | ||
290 | - return false; | ||
291 | -} | ||
292 | - | ||
293 | -#else | ||
294 | - | ||
295 | -static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) | ||
296 | -{ | ||
297 | -} | ||
298 | - | ||
299 | -static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list, | ||
300 | - int64_t timeout) | ||
301 | -{ | ||
302 | - assert(false); | ||
303 | -} | ||
304 | - | ||
305 | -static bool aio_epoll_enabled(AioContext *ctx) | ||
306 | -{ | ||
307 | - return false; | ||
308 | -} | ||
309 | - | ||
310 | -static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds, | ||
311 | - unsigned npfd, int64_t timeout) | ||
312 | -{ | ||
313 | - return false; | ||
314 | -} | ||
315 | - | ||
316 | -#endif | ||
317 | - | ||
318 | static AioHandler *find_aio_handler(AioContext *ctx, int fd) | ||
319 | { | ||
320 | AioHandler *node; | ||
321 | @@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx, | ||
322 | atomic_read(&ctx->poll_disable_cnt) + poll_disable_change); | ||
323 | |||
324 | if (new_node) { | ||
325 | - aio_epoll_update(ctx, new_node, is_new); | ||
326 | + ctx->fdmon_ops->update(ctx, new_node, is_new); | ||
327 | } else if (node) { | ||
328 | /* Unregister deleted fd_handler */ | ||
329 | - aio_epoll_update(ctx, node, false); | ||
330 | + ctx->fdmon_ops->update(ctx, node, false); | ||
331 | } | ||
332 | qemu_lockcnt_unlock(&ctx->list_lock); | ||
333 | aio_notify(ctx); | ||
334 | @@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx) | ||
335 | timerlistgroup_run_timers(&ctx->tlg); | ||
336 | } | ||
337 | |||
338 | -/* These thread-local variables are used only in a small part of aio_poll | ||
339 | - * around the call to the poll() system call. In particular they are not | ||
340 | - * used while aio_poll is performing callbacks, which makes it much easier | ||
341 | - * to think about reentrancy! | ||
342 | - * | ||
343 | - * Stack-allocated arrays would be perfect but they have size limitations; | ||
344 | - * heap allocation is expensive enough that we want to reuse arrays across | ||
345 | - * calls to aio_poll(). And because poll() has to be called without holding | ||
346 | - * any lock, the arrays cannot be stored in AioContext. Thread-local data | ||
347 | - * has none of the disadvantages of these three options. | ||
348 | - */ | ||
349 | -static __thread GPollFD *pollfds; | ||
350 | -static __thread AioHandler **nodes; | ||
351 | -static __thread unsigned npfd, nalloc; | ||
352 | -static __thread Notifier pollfds_cleanup_notifier; | ||
353 | - | ||
354 | -static void pollfds_cleanup(Notifier *n, void *unused) | ||
355 | -{ | ||
356 | - g_assert(npfd == 0); | ||
357 | - g_free(pollfds); | ||
358 | - g_free(nodes); | ||
359 | - nalloc = 0; | ||
360 | -} | ||
361 | - | ||
362 | -static void add_pollfd(AioHandler *node) | ||
363 | -{ | ||
364 | - if (npfd == nalloc) { | ||
365 | - if (nalloc == 0) { | ||
366 | - pollfds_cleanup_notifier.notify = pollfds_cleanup; | ||
367 | - qemu_thread_atexit_add(&pollfds_cleanup_notifier); | ||
368 | - nalloc = 8; | ||
369 | - } else { | ||
370 | - g_assert(nalloc <= INT_MAX); | ||
371 | - nalloc *= 2; | ||
372 | - } | ||
373 | - pollfds = g_renew(GPollFD, pollfds, nalloc); | ||
374 | - nodes = g_renew(AioHandler *, nodes, nalloc); | ||
375 | - } | ||
376 | - nodes[npfd] = node; | ||
377 | - pollfds[npfd] = (GPollFD) { | ||
378 | - .fd = node->pfd.fd, | ||
379 | - .events = node->pfd.events, | ||
380 | - }; | ||
381 | - npfd++; | ||
382 | -} | ||
383 | - | ||
384 | static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) | ||
385 | { | ||
386 | bool progress = false; | ||
387 | @@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) | ||
388 | bool aio_poll(AioContext *ctx, bool blocking) | ||
389 | { | ||
390 | AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list); | ||
391 | - AioHandler *node; | ||
392 | - int i; | ||
393 | int ret = 0; | ||
394 | bool progress; | ||
395 | int64_t timeout; | ||
396 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
397 | * system call---a single round of run_poll_handlers_once suffices. | ||
398 | */ | ||
399 | if (timeout || atomic_read(&ctx->poll_disable_cnt)) { | ||
400 | - assert(npfd == 0); | ||
401 | - | ||
402 | - /* fill pollfds */ | ||
403 | - | ||
404 | - if (!aio_epoll_enabled(ctx)) { | ||
405 | - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
406 | - if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events | ||
407 | - && aio_node_check(ctx, node->is_external)) { | ||
408 | - add_pollfd(node); | ||
409 | - } | ||
410 | - } | ||
411 | - } | ||
412 | - | ||
413 | - /* wait until next event */ | ||
414 | - if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) { | ||
415 | - npfd = 0; /* pollfds[] is not being used */ | ||
416 | - ret = aio_epoll(ctx, &ready_list, timeout); | ||
417 | - } else { | ||
418 | - ret = qemu_poll_ns(pollfds, npfd, timeout); | ||
419 | - } | ||
420 | + ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout); | ||
421 | } | ||
422 | |||
423 | if (blocking) { | ||
424 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
102 | } | 425 | } |
103 | } | 426 | } |
104 | diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c | 427 | |
105 | index XXXXXXX..XXXXXXX 100644 | 428 | - /* if we have any readable fds, dispatch event */ |
106 | --- a/block/qcow2-threads.c | 429 | - if (ret > 0) { |
107 | +++ b/block/qcow2-threads.c | 430 | - for (i = 0; i < npfd; i++) { |
108 | @@ -XXX,XX +XXX,XX @@ static int qcow2_encdec_pool_func(void *opaque) | 431 | - int revents = pollfds[i].revents; |
432 | - | ||
433 | - if (revents) { | ||
434 | - add_ready_handler(&ready_list, nodes[i], revents); | ||
435 | - } | ||
436 | - } | ||
437 | - } | ||
438 | - | ||
439 | - npfd = 0; | ||
440 | - | ||
441 | progress |= aio_bh_poll(ctx); | ||
442 | |||
443 | if (ret > 0) { | ||
444 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
445 | |||
446 | void aio_context_setup(AioContext *ctx) | ||
447 | { | ||
448 | -#ifdef CONFIG_EPOLL_CREATE1 | ||
449 | - assert(!ctx->epollfd); | ||
450 | - ctx->epollfd = epoll_create1(EPOLL_CLOEXEC); | ||
451 | - if (ctx->epollfd == -1) { | ||
452 | - fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno)); | ||
453 | - ctx->epoll_available = false; | ||
454 | - } else { | ||
455 | - ctx->epoll_available = true; | ||
456 | - } | ||
457 | -#endif | ||
458 | + ctx->fdmon_ops = &fdmon_poll_ops; | ||
459 | + ctx->epollfd = -1; | ||
460 | + | ||
461 | + fdmon_epoll_setup(ctx); | ||
109 | } | 462 | } |
110 | 463 | ||
111 | static int coroutine_fn | 464 | void aio_context_destroy(AioContext *ctx) |
112 | -qcow2_co_encdec(BlockDriverState *bs, uint64_t file_cluster_offset, | ||
113 | - uint64_t offset, void *buf, size_t len, Qcow2EncDecFunc func) | ||
114 | +qcow2_co_encdec(BlockDriverState *bs, uint64_t host_offset, | ||
115 | + uint64_t guest_offset, void *buf, size_t len, | ||
116 | + Qcow2EncDecFunc func) | ||
117 | { | 465 | { |
118 | BDRVQcow2State *s = bs->opaque; | 466 | -#ifdef CONFIG_EPOLL_CREATE1 |
119 | Qcow2EncDecData arg = { | 467 | - aio_epoll_disable(ctx); |
120 | .block = s->crypto, | 468 | -#endif |
121 | - .offset = s->crypt_physical_offset ? | 469 | + fdmon_epoll_disable(ctx); |
122 | - file_cluster_offset + offset_into_cluster(s, offset) : | ||
123 | - offset, | ||
124 | + .offset = s->crypt_physical_offset ? host_offset : guest_offset, | ||
125 | .buf = buf, | ||
126 | .len = len, | ||
127 | .func = func, | ||
128 | }; | ||
129 | |||
130 | - return qcow2_co_process(bs, qcow2_encdec_pool_func, &arg); | ||
131 | + assert(QEMU_IS_ALIGNED(guest_offset, BDRV_SECTOR_SIZE)); | ||
132 | + assert(QEMU_IS_ALIGNED(host_offset, BDRV_SECTOR_SIZE)); | ||
133 | + assert(QEMU_IS_ALIGNED(len, BDRV_SECTOR_SIZE)); | ||
134 | + assert(s->crypto); | ||
135 | + | ||
136 | + return len == 0 ? 0 : qcow2_co_process(bs, qcow2_encdec_pool_func, &arg); | ||
137 | } | 470 | } |
138 | 471 | ||
472 | void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, | ||
473 | diff --git a/util/aio-posix.h b/util/aio-posix.h | ||
474 | new file mode 100644 | ||
475 | index XXXXXXX..XXXXXXX | ||
476 | --- /dev/null | ||
477 | +++ b/util/aio-posix.h | ||
478 | @@ -XXX,XX +XXX,XX @@ | ||
139 | +/* | 479 | +/* |
140 | + * qcow2_co_encrypt() | 480 | + * AioContext POSIX event loop implementation internal APIs |
141 | + * | 481 | + * |
142 | + * Encrypts one or more contiguous aligned sectors | 482 | + * Copyright IBM, Corp. 2008 |
483 | + * Copyright Red Hat, Inc. 2020 | ||
143 | + * | 484 | + * |
144 | + * @host_offset - underlying storage offset of the first sector of the | 485 | + * Authors: |
145 | + * data to be encrypted | 486 | + * Anthony Liguori <aliguori@us.ibm.com> |
146 | + * | 487 | + * |
147 | + * @guest_offset - guest (virtual) offset of the first sector of the | 488 | + * This work is licensed under the terms of the GNU GPL, version 2. See |
148 | + * data to be encrypted | 489 | + * the COPYING file in the top-level directory. |
149 | + * | 490 | + * |
150 | + * @buf - buffer with the data to encrypt, that after encryption | 491 | + * Contributions after 2012-01-13 are licensed under the terms of the |
151 | + * will be written to the underlying storage device at | 492 | + * GNU GPL, version 2 or (at your option) any later version. |
152 | + * @host_offset | 493 | + */ |
494 | + | ||
495 | +#ifndef AIO_POSIX_H | ||
496 | +#define AIO_POSIX_H | ||
497 | + | ||
498 | +#include "block/aio.h" | ||
499 | + | ||
500 | +struct AioHandler { | ||
501 | + GPollFD pfd; | ||
502 | + IOHandler *io_read; | ||
503 | + IOHandler *io_write; | ||
504 | + AioPollFn *io_poll; | ||
505 | + IOHandler *io_poll_begin; | ||
506 | + IOHandler *io_poll_end; | ||
507 | + void *opaque; | ||
508 | + bool is_external; | ||
509 | + QLIST_ENTRY(AioHandler) node; | ||
510 | + QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ | ||
511 | + QLIST_ENTRY(AioHandler) node_deleted; | ||
512 | +}; | ||
513 | + | ||
514 | +/* Add a handler to a ready list */ | ||
515 | +void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node, | ||
516 | + int revents); | ||
517 | + | ||
518 | +extern const FDMonOps fdmon_poll_ops; | ||
519 | + | ||
520 | +#ifdef CONFIG_EPOLL_CREATE1 | ||
521 | +bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd); | ||
522 | +void fdmon_epoll_setup(AioContext *ctx); | ||
523 | +void fdmon_epoll_disable(AioContext *ctx); | ||
524 | +#else | ||
525 | +static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd) | ||
526 | +{ | ||
527 | + return false; | ||
528 | +} | ||
529 | + | ||
530 | +static inline void fdmon_epoll_setup(AioContext *ctx) | ||
531 | +{ | ||
532 | +} | ||
533 | + | ||
534 | +static inline void fdmon_epoll_disable(AioContext *ctx) | ||
535 | +{ | ||
536 | +} | ||
537 | +#endif /* !CONFIG_EPOLL_CREATE1 */ | ||
538 | + | ||
539 | +#endif /* AIO_POSIX_H */ | ||
540 | diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c | ||
541 | new file mode 100644 | ||
542 | index XXXXXXX..XXXXXXX | ||
543 | --- /dev/null | ||
544 | +++ b/util/fdmon-epoll.c | ||
545 | @@ -XXX,XX +XXX,XX @@ | ||
546 | +/* SPDX-License-Identifier: GPL-2.0-or-later */ | ||
547 | +/* | ||
548 | + * epoll(7) file descriptor monitoring | ||
549 | + */ | ||
550 | + | ||
551 | +#include "qemu/osdep.h" | ||
552 | +#include <sys/epoll.h> | ||
553 | +#include "qemu/rcu_queue.h" | ||
554 | +#include "aio-posix.h" | ||
555 | + | ||
556 | +/* The fd number threshold to switch to epoll */ | ||
557 | +#define EPOLL_ENABLE_THRESHOLD 64 | ||
558 | + | ||
559 | +void fdmon_epoll_disable(AioContext *ctx) | ||
560 | +{ | ||
561 | + if (ctx->epollfd >= 0) { | ||
562 | + close(ctx->epollfd); | ||
563 | + ctx->epollfd = -1; | ||
564 | + } | ||
565 | + | ||
566 | + /* Switch back */ | ||
567 | + ctx->fdmon_ops = &fdmon_poll_ops; | ||
568 | +} | ||
569 | + | ||
570 | +static inline int epoll_events_from_pfd(int pfd_events) | ||
571 | +{ | ||
572 | + return (pfd_events & G_IO_IN ? EPOLLIN : 0) | | ||
573 | + (pfd_events & G_IO_OUT ? EPOLLOUT : 0) | | ||
574 | + (pfd_events & G_IO_HUP ? EPOLLHUP : 0) | | ||
575 | + (pfd_events & G_IO_ERR ? EPOLLERR : 0); | ||
576 | +} | ||
577 | + | ||
578 | +static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) | ||
579 | +{ | ||
580 | + struct epoll_event event; | ||
581 | + int r; | ||
582 | + int ctl; | ||
583 | + | ||
584 | + if (!node->pfd.events) { | ||
585 | + ctl = EPOLL_CTL_DEL; | ||
586 | + } else { | ||
587 | + event.data.ptr = node; | ||
588 | + event.events = epoll_events_from_pfd(node->pfd.events); | ||
589 | + ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD; | ||
590 | + } | ||
591 | + | ||
592 | + r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event); | ||
593 | + if (r) { | ||
594 | + fdmon_epoll_disable(ctx); | ||
595 | + } | ||
596 | +} | ||
597 | + | ||
598 | +static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list, | ||
599 | + int64_t timeout) | ||
600 | +{ | ||
601 | + GPollFD pfd = { | ||
602 | + .fd = ctx->epollfd, | ||
603 | + .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR, | ||
604 | + }; | ||
605 | + AioHandler *node; | ||
606 | + int i, ret = 0; | ||
607 | + struct epoll_event events[128]; | ||
608 | + | ||
609 | + /* Fall back while external clients are disabled */ | ||
610 | + if (atomic_read(&ctx->external_disable_cnt)) { | ||
611 | + return fdmon_poll_ops.wait(ctx, ready_list, timeout); | ||
612 | + } | ||
613 | + | ||
614 | + if (timeout > 0) { | ||
615 | + ret = qemu_poll_ns(&pfd, 1, timeout); | ||
616 | + if (ret > 0) { | ||
617 | + timeout = 0; | ||
618 | + } | ||
619 | + } | ||
620 | + if (timeout <= 0 || ret > 0) { | ||
621 | + ret = epoll_wait(ctx->epollfd, events, | ||
622 | + ARRAY_SIZE(events), | ||
623 | + timeout); | ||
624 | + if (ret <= 0) { | ||
625 | + goto out; | ||
626 | + } | ||
627 | + for (i = 0; i < ret; i++) { | ||
628 | + int ev = events[i].events; | ||
629 | + int revents = (ev & EPOLLIN ? G_IO_IN : 0) | | ||
630 | + (ev & EPOLLOUT ? G_IO_OUT : 0) | | ||
631 | + (ev & EPOLLHUP ? G_IO_HUP : 0) | | ||
632 | + (ev & EPOLLERR ? G_IO_ERR : 0); | ||
633 | + | ||
634 | + node = events[i].data.ptr; | ||
635 | + aio_add_ready_handler(ready_list, node, revents); | ||
636 | + } | ||
637 | + } | ||
638 | +out: | ||
639 | + return ret; | ||
640 | +} | ||
641 | + | ||
642 | +static const FDMonOps fdmon_epoll_ops = { | ||
643 | + .update = fdmon_epoll_update, | ||
644 | + .wait = fdmon_epoll_wait, | ||
645 | +}; | ||
646 | + | ||
647 | +static bool fdmon_epoll_try_enable(AioContext *ctx) | ||
648 | +{ | ||
649 | + AioHandler *node; | ||
650 | + struct epoll_event event; | ||
651 | + | ||
652 | + QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
653 | + int r; | ||
654 | + if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) { | ||
655 | + continue; | ||
656 | + } | ||
657 | + event.events = epoll_events_from_pfd(node->pfd.events); | ||
658 | + event.data.ptr = node; | ||
659 | + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event); | ||
660 | + if (r) { | ||
661 | + return false; | ||
662 | + } | ||
663 | + } | ||
664 | + | ||
665 | + ctx->fdmon_ops = &fdmon_epoll_ops; | ||
666 | + return true; | ||
667 | +} | ||
668 | + | ||
669 | +bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd) | ||
670 | +{ | ||
671 | + if (ctx->epollfd < 0) { | ||
672 | + return false; | ||
673 | + } | ||
674 | + | ||
675 | + /* Do not upgrade while external clients are disabled */ | ||
676 | + if (atomic_read(&ctx->external_disable_cnt)) { | ||
677 | + return false; | ||
678 | + } | ||
679 | + | ||
680 | + if (npfd >= EPOLL_ENABLE_THRESHOLD) { | ||
681 | + if (fdmon_epoll_try_enable(ctx)) { | ||
682 | + return true; | ||
683 | + } else { | ||
684 | + fdmon_epoll_disable(ctx); | ||
685 | + } | ||
686 | + } | ||
687 | + return false; | ||
688 | +} | ||
689 | + | ||
690 | +void fdmon_epoll_setup(AioContext *ctx) | ||
691 | +{ | ||
692 | + ctx->epollfd = epoll_create1(EPOLL_CLOEXEC); | ||
693 | + if (ctx->epollfd == -1) { | ||
694 | + fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno)); | ||
695 | + } | ||
696 | +} | ||
697 | diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c | ||
698 | new file mode 100644 | ||
699 | index XXXXXXX..XXXXXXX | ||
700 | --- /dev/null | ||
701 | +++ b/util/fdmon-poll.c | ||
702 | @@ -XXX,XX +XXX,XX @@ | ||
703 | +/* SPDX-License-Identifier: GPL-2.0-or-later */ | ||
704 | +/* | ||
705 | + * poll(2) file descriptor monitoring | ||
153 | + * | 706 | + * |
154 | + * @len - length of the buffer (must be a BDRV_SECTOR_SIZE multiple) | 707 | + * Uses ppoll(2) when available, g_poll() otherwise. |
708 | + */ | ||
709 | + | ||
710 | +#include "qemu/osdep.h" | ||
711 | +#include "aio-posix.h" | ||
712 | +#include "qemu/rcu_queue.h" | ||
713 | + | ||
714 | +/* | ||
715 | + * These thread-local variables are used only in fdmon_poll_wait() around the | ||
716 | + * call to the poll() system call. In particular they are not used while | ||
717 | + * aio_poll is performing callbacks, which makes it much easier to think about | ||
718 | + * reentrancy! | ||
155 | + * | 719 | + * |
156 | + * Depending on the encryption method, @host_offset and/or @guest_offset | 720 | + * Stack-allocated arrays would be perfect but they have size limitations; |
157 | + * may be used for generating the initialization vector for | 721 | + * heap allocation is expensive enough that we want to reuse arrays across |
158 | + * encryption. | 722 | + * calls to aio_poll(). And because poll() has to be called without holding |
159 | + * | 723 | + * any lock, the arrays cannot be stored in AioContext. Thread-local data |
160 | + * Note that while the whole range must be aligned on sectors, it | 724 | + * has none of the disadvantages of these three options. |
161 | + * does not have to be aligned on clusters and can also cross cluster | ||
162 | + * boundaries | ||
163 | + */ | 725 | + */ |
164 | int coroutine_fn | 726 | +static __thread GPollFD *pollfds; |
165 | -qcow2_co_encrypt(BlockDriverState *bs, uint64_t file_cluster_offset, | 727 | +static __thread AioHandler **nodes; |
166 | - uint64_t offset, void *buf, size_t len) | 728 | +static __thread unsigned npfd, nalloc; |
167 | +qcow2_co_encrypt(BlockDriverState *bs, uint64_t host_offset, | 729 | +static __thread Notifier pollfds_cleanup_notifier; |
168 | + uint64_t guest_offset, void *buf, size_t len) | 730 | + |
169 | { | 731 | +static void pollfds_cleanup(Notifier *n, void *unused) |
170 | - return qcow2_co_encdec(bs, file_cluster_offset, offset, buf, len, | 732 | +{ |
171 | - qcrypto_block_encrypt); | 733 | + g_assert(npfd == 0); |
172 | + return qcow2_co_encdec(bs, host_offset, guest_offset, buf, len, | 734 | + g_free(pollfds); |
173 | + qcrypto_block_encrypt); | 735 | + g_free(nodes); |
174 | } | 736 | + nalloc = 0; |
175 | 737 | +} | |
176 | +/* | 738 | + |
177 | + * qcow2_co_decrypt() | 739 | +static void add_pollfd(AioHandler *node) |
178 | + * | 740 | +{ |
179 | + * Decrypts one or more contiguous aligned sectors | 741 | + if (npfd == nalloc) { |
180 | + * Similar to qcow2_co_encrypt | 742 | + if (nalloc == 0) { |
181 | + */ | 743 | + pollfds_cleanup_notifier.notify = pollfds_cleanup; |
182 | int coroutine_fn | 744 | + qemu_thread_atexit_add(&pollfds_cleanup_notifier); |
183 | -qcow2_co_decrypt(BlockDriverState *bs, uint64_t file_cluster_offset, | 745 | + nalloc = 8; |
184 | - uint64_t offset, void *buf, size_t len) | 746 | + } else { |
185 | +qcow2_co_decrypt(BlockDriverState *bs, uint64_t host_offset, | 747 | + g_assert(nalloc <= INT_MAX); |
186 | + uint64_t guest_offset, void *buf, size_t len) | 748 | + nalloc *= 2; |
187 | { | 749 | + } |
188 | - return qcow2_co_encdec(bs, file_cluster_offset, offset, buf, len, | 750 | + pollfds = g_renew(GPollFD, pollfds, nalloc); |
189 | - qcrypto_block_decrypt); | 751 | + nodes = g_renew(AioHandler *, nodes, nalloc); |
190 | + return qcow2_co_encdec(bs, host_offset, guest_offset, buf, len, | 752 | + } |
191 | + qcrypto_block_decrypt); | 753 | + nodes[npfd] = node; |
192 | } | 754 | + pollfds[npfd] = (GPollFD) { |
193 | diff --git a/block/qcow2.c b/block/qcow2.c | 755 | + .fd = node->pfd.fd, |
194 | index XXXXXXX..XXXXXXX 100644 | 756 | + .events = node->pfd.events, |
195 | --- a/block/qcow2.c | 757 | + }; |
196 | +++ b/block/qcow2.c | 758 | + npfd++; |
197 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs, | 759 | +} |
198 | 760 | + | |
199 | assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); | 761 | +static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list, |
200 | assert(QEMU_IS_ALIGNED(cur_bytes, BDRV_SECTOR_SIZE)); | 762 | + int64_t timeout) |
201 | - if (qcow2_co_decrypt(bs, cluster_offset, offset, | 763 | +{ |
202 | + if (qcow2_co_decrypt(bs, cluster_offset + offset_in_cluster, | 764 | + AioHandler *node; |
203 | + offset, | 765 | + int ret; |
204 | cluster_data, cur_bytes) < 0) { | 766 | + |
205 | ret = -EIO; | 767 | + assert(npfd == 0); |
206 | goto fail; | 768 | + |
207 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_pwritev_part( | 769 | + QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { |
208 | qemu_iovec_to_buf(qiov, qiov_offset + bytes_done, | 770 | + if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events |
209 | cluster_data, cur_bytes); | 771 | + && aio_node_check(ctx, node->is_external)) { |
210 | 772 | + add_pollfd(node); | |
211 | - if (qcow2_co_encrypt(bs, cluster_offset, offset, | 773 | + } |
212 | + if (qcow2_co_encrypt(bs, cluster_offset + offset_in_cluster, offset, | 774 | + } |
213 | cluster_data, cur_bytes) < 0) { | 775 | + |
214 | ret = -EIO; | 776 | + /* epoll(7) is faster above a certain number of fds */ |
215 | goto out_unlocked; | 777 | + if (fdmon_epoll_try_upgrade(ctx, npfd)) { |
778 | + return ctx->fdmon_ops->wait(ctx, ready_list, timeout); | ||
779 | + } | ||
780 | + | ||
781 | + ret = qemu_poll_ns(pollfds, npfd, timeout); | ||
782 | + if (ret > 0) { | ||
783 | + int i; | ||
784 | + | ||
785 | + for (i = 0; i < npfd; i++) { | ||
786 | + int revents = pollfds[i].revents; | ||
787 | + | ||
788 | + if (revents) { | ||
789 | + aio_add_ready_handler(ready_list, nodes[i], revents); | ||
790 | + } | ||
791 | + } | ||
792 | + } | ||
793 | + | ||
794 | + npfd = 0; | ||
795 | + return ret; | ||
796 | +} | ||
797 | + | ||
798 | +static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new) | ||
799 | +{ | ||
800 | + /* Do nothing, AioHandler already contains the state we'll need */ | ||
801 | +} | ||
802 | + | ||
803 | +const FDMonOps fdmon_poll_ops = { | ||
804 | + .update = fdmon_poll_update, | ||
805 | + .wait = fdmon_poll_wait, | ||
806 | +}; | ||
216 | -- | 807 | -- |
217 | 2.21.0 | 808 | 2.24.1 |
218 | 809 | ||
219 | diff view generated by jsdifflib |
1 | Instead of reporting all sockets to cURL, only report the one that has | 1 | The AioHandler *node, bool is_new arguments are more complicated to |
---|---|---|---|
2 | caused curl_multi_do_locked() to be called. This lets us get rid of the | 2 | think about than simply being given AioHandler *old_node, AioHandler |
3 | QLIST_FOREACH_SAFE() list, which was actually wrong: SAFE foreaches are | 3 | *new_node. |
4 | only safe when the current element is removed in each iteration. If it | ||
5 | possible for the list to be concurrently modified, we cannot guarantee | ||
6 | that only the current element will be removed. Therefore, we must not | ||
7 | use QLIST_FOREACH_SAFE() here. | ||
8 | 4 | ||
9 | Fixes: ff5ca1664af85b24a4180d595ea6873fd3deac57 | 5 | Furthermore, the new Linux io_uring file descriptor monitoring mechanism |
10 | Cc: qemu-stable@nongnu.org | 6 | added by the new patch requires access to both the old and the new |
11 | Signed-off-by: Max Reitz <mreitz@redhat.com> | 7 | nodes. Make this change now in preparation. |
12 | Message-id: 20190910124136.10565-6-mreitz@redhat.com | 8 | |
13 | Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> | 9 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
14 | Reviewed-by: John Snow <jsnow@redhat.com> | 10 | Link: https://lore.kernel.org/r/20200305170806.1313245-5-stefanha@redhat.com |
15 | Signed-off-by: Max Reitz <mreitz@redhat.com> | 11 | Message-Id: <20200305170806.1313245-5-stefanha@redhat.com> |
16 | --- | 12 | --- |
17 | block/curl.c | 17 ++++++----------- | 13 | include/block/aio.h | 13 ++++++------- |
18 | 1 file changed, 6 insertions(+), 11 deletions(-) | 14 | util/aio-posix.c | 7 +------ |
15 | util/fdmon-epoll.c | 21 ++++++++++++--------- | ||
16 | util/fdmon-poll.c | 4 +++- | ||
17 | 4 files changed, 22 insertions(+), 23 deletions(-) | ||
19 | 18 | ||
20 | diff --git a/block/curl.c b/block/curl.c | 19 | diff --git a/include/block/aio.h b/include/block/aio.h |
21 | index XXXXXXX..XXXXXXX 100644 | 20 | index XXXXXXX..XXXXXXX 100644 |
22 | --- a/block/curl.c | 21 | --- a/include/block/aio.h |
23 | +++ b/block/curl.c | 22 | +++ b/include/block/aio.h |
24 | @@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s) | 23 | @@ -XXX,XX +XXX,XX @@ typedef struct { |
24 | /* | ||
25 | * update: | ||
26 | * @ctx: the AioContext | ||
27 | - * @node: the handler | ||
28 | - * @is_new: is the file descriptor already being monitored? | ||
29 | + * @old_node: the existing handler or NULL if this file descriptor is being | ||
30 | + * monitored for the first time | ||
31 | + * @new_node: the new handler or NULL if this file descriptor is being | ||
32 | + * removed | ||
33 | * | ||
34 | - * Add/remove/modify a monitored file descriptor. There are three cases: | ||
35 | - * 1. node->pfd.events == 0 means remove the file descriptor. | ||
36 | - * 2. !is_new means modify an already monitored file descriptor. | ||
37 | - * 3. is_new means add a new file descriptor. | ||
38 | + * Add/remove/modify a monitored file descriptor. | ||
39 | * | ||
40 | * Called with ctx->list_lock acquired. | ||
41 | */ | ||
42 | - void (*update)(AioContext *ctx, AioHandler *node, bool is_new); | ||
43 | + void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node); | ||
44 | |||
45 | /* | ||
46 | * wait: | ||
47 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
48 | index XXXXXXX..XXXXXXX 100644 | ||
49 | --- a/util/aio-posix.c | ||
50 | +++ b/util/aio-posix.c | ||
51 | @@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx, | ||
52 | atomic_set(&ctx->poll_disable_cnt, | ||
53 | atomic_read(&ctx->poll_disable_cnt) + poll_disable_change); | ||
54 | |||
55 | - if (new_node) { | ||
56 | - ctx->fdmon_ops->update(ctx, new_node, is_new); | ||
57 | - } else if (node) { | ||
58 | - /* Unregister deleted fd_handler */ | ||
59 | - ctx->fdmon_ops->update(ctx, node, false); | ||
60 | - } | ||
61 | + ctx->fdmon_ops->update(ctx, node, new_node); | ||
62 | qemu_lockcnt_unlock(&ctx->list_lock); | ||
63 | aio_notify(ctx); | ||
64 | |||
65 | diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c | ||
66 | index XXXXXXX..XXXXXXX 100644 | ||
67 | --- a/util/fdmon-epoll.c | ||
68 | +++ b/util/fdmon-epoll.c | ||
69 | @@ -XXX,XX +XXX,XX @@ static inline int epoll_events_from_pfd(int pfd_events) | ||
70 | (pfd_events & G_IO_ERR ? EPOLLERR : 0); | ||
25 | } | 71 | } |
26 | 72 | ||
27 | /* Called with s->mutex held. */ | 73 | -static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) |
28 | -static void curl_multi_do_locked(CURLSocket *ready_socket) | 74 | +static void fdmon_epoll_update(AioContext *ctx, |
29 | +static void curl_multi_do_locked(CURLSocket *socket) | 75 | + AioHandler *old_node, |
76 | + AioHandler *new_node) | ||
30 | { | 77 | { |
31 | - CURLSocket *socket, *next_socket; | 78 | - struct epoll_event event; |
32 | - CURLState *s = ready_socket->state; | 79 | + struct epoll_event event = { |
33 | + BDRVCURLState *s = socket->state->s; | 80 | + .data.ptr = new_node, |
34 | int running; | 81 | + .events = new_node ? epoll_events_from_pfd(new_node->pfd.events) : 0, |
82 | + }; | ||
35 | int r; | 83 | int r; |
36 | 84 | - int ctl; | |
37 | - if (!s->s->multi) { | 85 | |
38 | + if (!s->multi) { | 86 | - if (!node->pfd.events) { |
39 | return; | 87 | - ctl = EPOLL_CTL_DEL; |
88 | + if (!new_node) { | ||
89 | + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, &event); | ||
90 | + } else if (!old_node) { | ||
91 | + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, new_node->pfd.fd, &event); | ||
92 | } else { | ||
93 | - event.data.ptr = node; | ||
94 | - event.events = epoll_events_from_pfd(node->pfd.events); | ||
95 | - ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD; | ||
96 | + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, new_node->pfd.fd, &event); | ||
40 | } | 97 | } |
41 | 98 | ||
42 | - /* Need to use _SAFE because curl_multi_socket_action() may trigger | 99 | - r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event); |
43 | - * curl_sock_cb() which might modify this list */ | 100 | if (r) { |
44 | - QLIST_FOREACH_SAFE(socket, &s->sockets, next, next_socket) { | 101 | fdmon_epoll_disable(ctx); |
45 | - do { | 102 | } |
46 | - r = curl_multi_socket_action(s->s->multi, socket->fd, 0, &running); | 103 | diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c |
47 | - } while (r == CURLM_CALL_MULTI_PERFORM); | 104 | index XXXXXXX..XXXXXXX 100644 |
48 | - } | 105 | --- a/util/fdmon-poll.c |
49 | + do { | 106 | +++ b/util/fdmon-poll.c |
50 | + r = curl_multi_socket_action(s->multi, socket->fd, 0, &running); | 107 | @@ -XXX,XX +XXX,XX @@ static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list, |
51 | + } while (r == CURLM_CALL_MULTI_PERFORM); | 108 | return ret; |
52 | } | 109 | } |
53 | 110 | ||
54 | static void curl_multi_do(void *arg) | 111 | -static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new) |
112 | +static void fdmon_poll_update(AioContext *ctx, | ||
113 | + AioHandler *old_node, | ||
114 | + AioHandler *new_node) | ||
115 | { | ||
116 | /* Do nothing, AioHandler already contains the state we'll need */ | ||
117 | } | ||
55 | -- | 118 | -- |
56 | 2.21.0 | 119 | 2.24.1 |
57 | 120 | ||
58 | diff view generated by jsdifflib |
1 | From: Maxim Levitsky <mlevitsk@redhat.com> | 1 | The recent Linux io_uring API has several advantages over ppoll(2) and |
---|---|---|---|
2 | epoll(2). Details are given in the source code. | ||
2 | 3 | ||
3 | Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> | 4 | Add an io_uring implementation and make it the default on Linux. |
4 | Tested-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> | 5 | Performance is the same as with epoll(7) but later patches add |
5 | Message-id: 20190915203655.21638-4-mlevitsk@redhat.com | 6 | optimizations that take advantage of io_uring. |
6 | Reviewed-by: Max Reitz <mreitz@redhat.com> | 7 | |
7 | Signed-off-by: Max Reitz <mreitz@redhat.com> | 8 | It is necessary to change how aio_set_fd_handler() deals with deleting |
9 | AioHandlers since removing monitored file descriptors is asynchronous in | ||
10 | io_uring. fdmon_io_uring_remove() marks the AioHandler deleted and | ||
11 | aio_set_fd_handler() will let it handle deletion in that case. | ||
12 | |||
13 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
14 | Link: https://lore.kernel.org/r/20200305170806.1313245-6-stefanha@redhat.com | ||
15 | Message-Id: <20200305170806.1313245-6-stefanha@redhat.com> | ||
8 | --- | 16 | --- |
9 | tests/qemu-iotests/263 | 91 ++++++++++++++++++++++++++++++++++++++ | 17 | configure | 5 + |
10 | tests/qemu-iotests/263.out | 40 +++++++++++++++++ | 18 | include/block/aio.h | 9 ++ |
11 | tests/qemu-iotests/group | 1 + | 19 | util/Makefile.objs | 1 + |
12 | 3 files changed, 132 insertions(+) | 20 | util/aio-posix.c | 20 ++- |
13 | create mode 100755 tests/qemu-iotests/263 | 21 | util/aio-posix.h | 20 ++- |
14 | create mode 100644 tests/qemu-iotests/263.out | 22 | util/fdmon-io_uring.c | 326 ++++++++++++++++++++++++++++++++++++++++++ |
23 | 6 files changed, 376 insertions(+), 5 deletions(-) | ||
24 | create mode 100644 util/fdmon-io_uring.c | ||
15 | 25 | ||
16 | diff --git a/tests/qemu-iotests/263 b/tests/qemu-iotests/263 | 26 | diff --git a/configure b/configure |
17 | new file mode 100755 | 27 | index XXXXXXX..XXXXXXX 100755 |
18 | index XXXXXXX..XXXXXXX | 28 | --- a/configure |
19 | --- /dev/null | 29 | +++ b/configure |
20 | +++ b/tests/qemu-iotests/263 | 30 | @@ -XXX,XX +XXX,XX @@ if test "$linux_io_uring" != "no" ; then |
31 | linux_io_uring_cflags=$($pkg_config --cflags liburing) | ||
32 | linux_io_uring_libs=$($pkg_config --libs liburing) | ||
33 | linux_io_uring=yes | ||
34 | + | ||
35 | + # io_uring is used in libqemuutil.a where per-file -libs variables are not | ||
36 | + # seen by programs linking the archive. It's not ideal, but just add the | ||
37 | + # library dependency globally. | ||
38 | + LIBS="$linux_io_uring_libs $LIBS" | ||
39 | else | ||
40 | if test "$linux_io_uring" = "yes" ; then | ||
41 | feature_not_found "linux io_uring" "Install liburing devel" | ||
42 | diff --git a/include/block/aio.h b/include/block/aio.h | ||
43 | index XXXXXXX..XXXXXXX 100644 | ||
44 | --- a/include/block/aio.h | ||
45 | +++ b/include/block/aio.h | ||
21 | @@ -XXX,XX +XXX,XX @@ | 46 | @@ -XXX,XX +XXX,XX @@ |
22 | +#!/usr/bin/env bash | 47 | #ifndef QEMU_AIO_H |
23 | +# | 48 | #define QEMU_AIO_H |
24 | +# Test encrypted write that crosses cluster boundary of two unallocated clusters | 49 | |
25 | +# Based on 188 | 50 | +#ifdef CONFIG_LINUX_IO_URING |
26 | +# | 51 | +#include <liburing.h> |
27 | +# Copyright (C) 2019 Red Hat, Inc. | 52 | +#endif |
28 | +# | 53 | #include "qemu/queue.h" |
29 | +# This program is free software; you can redistribute it and/or modify | 54 | #include "qemu/event_notifier.h" |
30 | +# it under the terms of the GNU General Public License as published by | 55 | #include "qemu/thread.h" |
31 | +# the Free Software Foundation; either version 2 of the License, or | 56 | @@ -XXX,XX +XXX,XX @@ struct BHListSlice { |
32 | +# (at your option) any later version. | 57 | QSIMPLEQ_ENTRY(BHListSlice) next; |
33 | +# | 58 | }; |
34 | +# This program is distributed in the hope that it will be useful, | 59 | |
35 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | 60 | +typedef QSLIST_HEAD(, AioHandler) AioHandlerSList; |
36 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 61 | + |
37 | +# GNU General Public License for more details. | 62 | struct AioContext { |
38 | +# | 63 | GSource source; |
39 | +# You should have received a copy of the GNU General Public License | 64 | |
40 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | 65 | @@ -XXX,XX +XXX,XX @@ struct AioContext { |
41 | +# | 66 | * locking. |
42 | + | 67 | */ |
43 | +# creator | 68 | struct LuringState *linux_io_uring; |
44 | +owner=mlevitsk@redhat.com | 69 | + |
45 | + | 70 | + /* State for file descriptor monitoring using Linux io_uring */ |
46 | +seq=`basename $0` | 71 | + struct io_uring fdmon_io_uring; |
47 | +echo "QA output created by $seq" | 72 | + AioHandlerSList submit_list; |
48 | + | 73 | #endif |
49 | +status=1 # failure is the default! | 74 | |
50 | + | 75 | /* TimerLists for calling timers - one per clock type. Has its own |
51 | +_cleanup() | 76 | diff --git a/util/Makefile.objs b/util/Makefile.objs |
52 | +{ | 77 | index XXXXXXX..XXXXXXX 100644 |
53 | + _cleanup_test_img | 78 | --- a/util/Makefile.objs |
54 | +} | 79 | +++ b/util/Makefile.objs |
55 | +trap "_cleanup; exit \$status" 0 1 2 3 15 | 80 | @@ -XXX,XX +XXX,XX @@ util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o |
56 | + | 81 | util-obj-$(CONFIG_POSIX) += aio-posix.o |
57 | +# get standard environment, filters and checks | 82 | util-obj-$(CONFIG_POSIX) += fdmon-poll.o |
58 | +. ./common.rc | 83 | util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o |
59 | +. ./common.filter | 84 | +util-obj-$(CONFIG_LINUX_IO_URING) += fdmon-io_uring.o |
60 | + | 85 | util-obj-$(CONFIG_POSIX) += compatfd.o |
61 | +_supported_fmt qcow2 | 86 | util-obj-$(CONFIG_POSIX) += event_notifier-posix.o |
62 | +_supported_proto generic | 87 | util-obj-$(CONFIG_POSIX) += mmap-alloc.o |
63 | +_supported_os Linux | 88 | diff --git a/util/aio-posix.c b/util/aio-posix.c |
64 | + | 89 | index XXXXXXX..XXXXXXX 100644 |
65 | + | 90 | --- a/util/aio-posix.c |
66 | +size=1M | 91 | +++ b/util/aio-posix.c |
67 | + | 92 | @@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node) |
68 | +SECRET="secret,id=sec0,data=astrochicken" | 93 | g_source_remove_poll(&ctx->source, &node->pfd); |
69 | +QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT | 94 | } |
70 | + | 95 | |
71 | + | 96 | + node->pfd.revents = 0; |
72 | +_run_test() | 97 | + |
73 | +{ | 98 | + /* If the fd monitor has already marked it deleted, leave it alone */ |
74 | + echo "== reading the whole image ==" | 99 | + if (QLIST_IS_INSERTED(node, node_deleted)) { |
75 | + $QEMU_IO --object $SECRET -c "read -P 0 0 $size" --image-opts "$1" | _filter_qemu_io | _filter_testdir | 100 | + return false; |
76 | + | 101 | + } |
77 | + echo | 102 | + |
78 | + echo "== write two 512 byte sectors on a cluster boundary ==" | 103 | /* If a read is in progress, just mark the node as deleted */ |
79 | + $QEMU_IO --object $SECRET -c "write -P 0xAA 0xFE00 0x400" --image-opts "$1" | _filter_qemu_io | _filter_testdir | 104 | if (qemu_lockcnt_count(&ctx->list_lock)) { |
80 | + | 105 | QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted); |
81 | + echo | 106 | - node->pfd.revents = 0; |
82 | + echo "== verify that the rest of the image is not changed ==" | 107 | return false; |
83 | + $QEMU_IO --object $SECRET -c "read -P 0x00 0x00000 0xFE00" --image-opts "$1" | _filter_qemu_io | _filter_testdir | 108 | } |
84 | + $QEMU_IO --object $SECRET -c "read -P 0xAA 0x0FE00 0x400" --image-opts "$1" | _filter_qemu_io | _filter_testdir | 109 | /* Otherwise, delete it for real. We can't just mark it as |
85 | + $QEMU_IO --object $SECRET -c "read -P 0x00 0x10200 0xEFE00" --image-opts "$1" | _filter_qemu_io | _filter_testdir | 110 | @@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx, |
86 | + | 111 | |
87 | +} | 112 | QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node); |
88 | + | 113 | } |
89 | + | 114 | - if (node) { |
90 | +echo | 115 | - deleted = aio_remove_fd_handler(ctx, node); |
91 | +echo "testing LUKS qcow2 encryption" | 116 | - } |
92 | +echo | 117 | |
93 | + | 118 | /* No need to order poll_disable_cnt writes against other updates; |
94 | +_make_test_img --object $SECRET -o "encrypt.format=luks,encrypt.key-secret=sec0,encrypt.iter-time=10,cluster_size=64K" $size | 119 | * the counter is only used to avoid wasting time and latency on |
95 | +_run_test "driver=$IMGFMT,encrypt.key-secret=sec0,file.filename=$TEST_IMG" | 120 | @@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx, |
96 | +_cleanup_test_img | 121 | atomic_read(&ctx->poll_disable_cnt) + poll_disable_change); |
97 | + | 122 | |
98 | +echo | 123 | ctx->fdmon_ops->update(ctx, node, new_node); |
99 | +echo "testing legacy AES qcow2 encryption" | 124 | + if (node) { |
100 | +echo | 125 | + deleted = aio_remove_fd_handler(ctx, node); |
101 | + | 126 | + } |
102 | + | 127 | qemu_lockcnt_unlock(&ctx->list_lock); |
103 | +_make_test_img --object $SECRET -o "encrypt.format=aes,encrypt.key-secret=sec0,cluster_size=64K" $size | 128 | aio_notify(ctx); |
104 | +_run_test "driver=$IMGFMT,encrypt.key-secret=sec0,file.filename=$TEST_IMG" | 129 | |
105 | +_cleanup_test_img | 130 | @@ -XXX,XX +XXX,XX @@ void aio_context_setup(AioContext *ctx) |
106 | + | 131 | ctx->fdmon_ops = &fdmon_poll_ops; |
107 | + | 132 | ctx->epollfd = -1; |
108 | + | 133 | |
109 | +# success, all done | 134 | + /* Use the fastest fd monitoring implementation if available */ |
110 | +echo "*** done" | 135 | + if (fdmon_io_uring_setup(ctx)) { |
111 | +rm -f $seq.full | 136 | + return; |
112 | +status=0 | 137 | + } |
113 | diff --git a/tests/qemu-iotests/263.out b/tests/qemu-iotests/263.out | 138 | + |
139 | fdmon_epoll_setup(ctx); | ||
140 | } | ||
141 | |||
142 | void aio_context_destroy(AioContext *ctx) | ||
143 | { | ||
144 | + fdmon_io_uring_destroy(ctx); | ||
145 | fdmon_epoll_disable(ctx); | ||
146 | } | ||
147 | |||
148 | diff --git a/util/aio-posix.h b/util/aio-posix.h | ||
149 | index XXXXXXX..XXXXXXX 100644 | ||
150 | --- a/util/aio-posix.h | ||
151 | +++ b/util/aio-posix.h | ||
152 | @@ -XXX,XX +XXX,XX @@ struct AioHandler { | ||
153 | IOHandler *io_poll_begin; | ||
154 | IOHandler *io_poll_end; | ||
155 | void *opaque; | ||
156 | - bool is_external; | ||
157 | QLIST_ENTRY(AioHandler) node; | ||
158 | QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ | ||
159 | QLIST_ENTRY(AioHandler) node_deleted; | ||
160 | +#ifdef CONFIG_LINUX_IO_URING | ||
161 | + QSLIST_ENTRY(AioHandler) node_submitted; | ||
162 | + unsigned flags; /* see fdmon-io_uring.c */ | ||
163 | +#endif | ||
164 | + bool is_external; | ||
165 | }; | ||
166 | |||
167 | /* Add a handler to a ready list */ | ||
168 | @@ -XXX,XX +XXX,XX @@ static inline void fdmon_epoll_disable(AioContext *ctx) | ||
169 | } | ||
170 | #endif /* !CONFIG_EPOLL_CREATE1 */ | ||
171 | |||
172 | +#ifdef CONFIG_LINUX_IO_URING | ||
173 | +bool fdmon_io_uring_setup(AioContext *ctx); | ||
174 | +void fdmon_io_uring_destroy(AioContext *ctx); | ||
175 | +#else | ||
176 | +static inline bool fdmon_io_uring_setup(AioContext *ctx) | ||
177 | +{ | ||
178 | + return false; | ||
179 | +} | ||
180 | + | ||
181 | +static inline void fdmon_io_uring_destroy(AioContext *ctx) | ||
182 | +{ | ||
183 | +} | ||
184 | +#endif /* !CONFIG_LINUX_IO_URING */ | ||
185 | + | ||
186 | #endif /* AIO_POSIX_H */ | ||
187 | diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c | ||
114 | new file mode 100644 | 188 | new file mode 100644 |
115 | index XXXXXXX..XXXXXXX | 189 | index XXXXXXX..XXXXXXX |
116 | --- /dev/null | 190 | --- /dev/null |
117 | +++ b/tests/qemu-iotests/263.out | 191 | +++ b/util/fdmon-io_uring.c |
118 | @@ -XXX,XX +XXX,XX @@ | 192 | @@ -XXX,XX +XXX,XX @@ |
119 | +QA output created by 263 | 193 | +/* SPDX-License-Identifier: GPL-2.0-or-later */ |
120 | + | 194 | +/* |
121 | +testing LUKS qcow2 encryption | 195 | + * Linux io_uring file descriptor monitoring |
122 | + | 196 | + * |
123 | +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=luks encrypt.key-secret=sec0 encrypt.iter-time=10 | 197 | + * The Linux io_uring API supports file descriptor monitoring with a few |
124 | +== reading the whole image == | 198 | + * advantages over existing APIs like poll(2) and epoll(7): |
125 | +read 1048576/1048576 bytes at offset 0 | 199 | + * |
126 | +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 200 | + * 1. Userspace polling of events is possible because the completion queue (cq |
127 | + | 201 | + * ring) is shared between the kernel and userspace. This allows |
128 | +== write two 512 byte sectors on a cluster boundary == | 202 | + * applications that rely on userspace polling to also monitor file |
129 | +wrote 1024/1024 bytes at offset 65024 | 203 | + * descriptors in the same userspace polling loop. |
130 | +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 204 | + * |
131 | + | 205 | + * 2. Submission and completion is batched and done together in a single system |
132 | +== verify that the rest of the image is not changed == | 206 | + * call. This minimizes the number of system calls. |
133 | +read 65024/65024 bytes at offset 0 | 207 | + * |
134 | +63.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 208 | + * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than |
135 | +read 1024/1024 bytes at offset 65024 | 209 | + * poll(2). |
136 | +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 210 | + * |
137 | +read 982528/982528 bytes at offset 66048 | 211 | + * 4. Nanosecond timeouts are supported so it requires fewer syscalls than |
138 | +959.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 212 | + * epoll(7). |
139 | + | 213 | + * |
140 | +testing legacy AES qcow2 encryption | 214 | + * This code only monitors file descriptors and does not do asynchronous disk |
141 | + | 215 | + * I/O. Implementing disk I/O efficiently has other requirements and should |
142 | +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=aes encrypt.key-secret=sec0 | 216 | + * use a separate io_uring so it does not make sense to unify the code. |
143 | +== reading the whole image == | 217 | + * |
144 | +read 1048576/1048576 bytes at offset 0 | 218 | + * File descriptor monitoring is implemented using the following operations: |
145 | +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 219 | + * |
146 | + | 220 | + * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored. |
147 | +== write two 512 byte sectors on a cluster boundary == | 221 | + * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When |
148 | +wrote 1024/1024 bytes at offset 65024 | 222 | + * the poll mask changes for a file descriptor it is first removed and then |
149 | +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 223 | + * re-added with the new poll mask, so this operation is also used as part |
150 | + | 224 | + * of modifying an existing monitored file descriptor. |
151 | +== verify that the rest of the image is not changed == | 225 | + * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait |
152 | +read 65024/65024 bytes at offset 0 | 226 | + * for events. This operation self-cancels if another event completes |
153 | +63.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 227 | + * before the timeout. |
154 | +read 1024/1024 bytes at offset 65024 | 228 | + * |
155 | +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 229 | + * io_uring calls the submission queue the "sq ring" and the completion queue |
156 | +read 982528/982528 bytes at offset 66048 | 230 | + * the "cq ring". Ring entries are called "sqe" and "cqe", respectively. |
157 | +959.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | 231 | + * |
158 | +*** done | 232 | + * The code is structured so that sq/cq rings are only modified within |
159 | diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group | 233 | + * fdmon_io_uring_wait(). Changes to AioHandlers are made by enqueuing them on |
160 | index XXXXXXX..XXXXXXX 100644 | 234 | + * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD |
161 | --- a/tests/qemu-iotests/group | 235 | + * and/or IORING_OP_POLL_REMOVE sqes for them. |
162 | +++ b/tests/qemu-iotests/group | 236 | + */ |
163 | @@ -XXX,XX +XXX,XX @@ | 237 | + |
164 | 257 rw | 238 | +#include "qemu/osdep.h" |
165 | 258 rw quick | 239 | +#include <poll.h> |
166 | 262 rw quick migration | 240 | +#include "qemu/rcu_queue.h" |
167 | +263 rw quick | 241 | +#include "aio-posix.h" |
168 | 265 rw auto quick | 242 | + |
169 | 266 rw quick | 243 | +enum { |
244 | + FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */ | ||
245 | + | ||
246 | + /* AioHandler::flags */ | ||
247 | + FDMON_IO_URING_PENDING = (1 << 0), | ||
248 | + FDMON_IO_URING_ADD = (1 << 1), | ||
249 | + FDMON_IO_URING_REMOVE = (1 << 2), | ||
250 | +}; | ||
251 | + | ||
252 | +static inline int poll_events_from_pfd(int pfd_events) | ||
253 | +{ | ||
254 | + return (pfd_events & G_IO_IN ? POLLIN : 0) | | ||
255 | + (pfd_events & G_IO_OUT ? POLLOUT : 0) | | ||
256 | + (pfd_events & G_IO_HUP ? POLLHUP : 0) | | ||
257 | + (pfd_events & G_IO_ERR ? POLLERR : 0); | ||
258 | +} | ||
259 | + | ||
260 | +static inline int pfd_events_from_poll(int poll_events) | ||
261 | +{ | ||
262 | + return (poll_events & POLLIN ? G_IO_IN : 0) | | ||
263 | + (poll_events & POLLOUT ? G_IO_OUT : 0) | | ||
264 | + (poll_events & POLLHUP ? G_IO_HUP : 0) | | ||
265 | + (poll_events & POLLERR ? G_IO_ERR : 0); | ||
266 | +} | ||
267 | + | ||
268 | +/* | ||
269 | + * Returns an sqe for submitting a request. Only be called within | ||
270 | + * fdmon_io_uring_wait(). | ||
271 | + */ | ||
272 | +static struct io_uring_sqe *get_sqe(AioContext *ctx) | ||
273 | +{ | ||
274 | + struct io_uring *ring = &ctx->fdmon_io_uring; | ||
275 | + struct io_uring_sqe *sqe = io_uring_get_sqe(ring); | ||
276 | + int ret; | ||
277 | + | ||
278 | + if (likely(sqe)) { | ||
279 | + return sqe; | ||
280 | + } | ||
281 | + | ||
282 | + /* No free sqes left, submit pending sqes first */ | ||
283 | + ret = io_uring_submit(ring); | ||
284 | + assert(ret > 1); | ||
285 | + sqe = io_uring_get_sqe(ring); | ||
286 | + assert(sqe); | ||
287 | + return sqe; | ||
288 | +} | ||
289 | + | ||
290 | +/* Atomically enqueue an AioHandler for sq ring submission */ | ||
291 | +static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags) | ||
292 | +{ | ||
293 | + unsigned old_flags; | ||
294 | + | ||
295 | + old_flags = atomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags); | ||
296 | + if (!(old_flags & FDMON_IO_URING_PENDING)) { | ||
297 | + QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted); | ||
298 | + } | ||
299 | +} | ||
300 | + | ||
301 | +/* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */ | ||
302 | +static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags) | ||
303 | +{ | ||
304 | + AioHandler *node = QSLIST_FIRST(head); | ||
305 | + | ||
306 | + if (!node) { | ||
307 | + return NULL; | ||
308 | + } | ||
309 | + | ||
310 | + /* Doesn't need to be atomic since fill_sq_ring() moves the list */ | ||
311 | + QSLIST_REMOVE_HEAD(head, node_submitted); | ||
312 | + | ||
313 | + /* | ||
314 | + * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two | ||
315 | + * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and | ||
316 | + * telling process_cqe() to delete the AioHandler when its | ||
317 | + * IORING_OP_POLL_ADD completes. | ||
318 | + */ | ||
319 | + *flags = atomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING | | ||
320 | + FDMON_IO_URING_ADD)); | ||
321 | + return node; | ||
322 | +} | ||
323 | + | ||
324 | +static void fdmon_io_uring_update(AioContext *ctx, | ||
325 | + AioHandler *old_node, | ||
326 | + AioHandler *new_node) | ||
327 | +{ | ||
328 | + if (new_node) { | ||
329 | + enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD); | ||
330 | + } | ||
331 | + | ||
332 | + if (old_node) { | ||
333 | + /* | ||
334 | + * Deletion is tricky because IORING_OP_POLL_ADD and | ||
335 | + * IORING_OP_POLL_REMOVE are async. We need to wait for the original | ||
336 | + * IORING_OP_POLL_ADD to complete before this handler can be freed | ||
337 | + * safely. | ||
338 | + * | ||
339 | + * It's possible that the file descriptor becomes ready and the | ||
340 | + * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is | ||
341 | + * submitted, too. | ||
342 | + * | ||
343 | + * Mark this handler deleted right now but don't place it on | ||
344 | + * ctx->deleted_aio_handlers yet. Instead, manually fudge the list | ||
345 | + * entry to make QLIST_IS_INSERTED() think this handler has been | ||
346 | + * inserted and other code recognizes this AioHandler as deleted. | ||
347 | + * | ||
348 | + * Once the original IORING_OP_POLL_ADD completes we enqueue the | ||
349 | + * handler on the real ctx->deleted_aio_handlers list to be freed. | ||
350 | + */ | ||
351 | + assert(!QLIST_IS_INSERTED(old_node, node_deleted)); | ||
352 | + old_node->node_deleted.le_prev = &old_node->node_deleted.le_next; | ||
353 | + | ||
354 | + enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE); | ||
355 | + } | ||
356 | +} | ||
357 | + | ||
358 | +static void add_poll_add_sqe(AioContext *ctx, AioHandler *node) | ||
359 | +{ | ||
360 | + struct io_uring_sqe *sqe = get_sqe(ctx); | ||
361 | + int events = poll_events_from_pfd(node->pfd.events); | ||
362 | + | ||
363 | + io_uring_prep_poll_add(sqe, node->pfd.fd, events); | ||
364 | + io_uring_sqe_set_data(sqe, node); | ||
365 | +} | ||
366 | + | ||
367 | +static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node) | ||
368 | +{ | ||
369 | + struct io_uring_sqe *sqe = get_sqe(ctx); | ||
370 | + | ||
371 | + io_uring_prep_poll_remove(sqe, node); | ||
372 | +} | ||
373 | + | ||
374 | +/* Add a timeout that self-cancels when another cqe becomes ready */ | ||
375 | +static void add_timeout_sqe(AioContext *ctx, int64_t ns) | ||
376 | +{ | ||
377 | + struct io_uring_sqe *sqe; | ||
378 | + struct __kernel_timespec ts = { | ||
379 | + .tv_sec = ns / NANOSECONDS_PER_SECOND, | ||
380 | + .tv_nsec = ns % NANOSECONDS_PER_SECOND, | ||
381 | + }; | ||
382 | + | ||
383 | + sqe = get_sqe(ctx); | ||
384 | + io_uring_prep_timeout(sqe, &ts, 1, 0); | ||
385 | +} | ||
386 | + | ||
387 | +/* Add sqes from ctx->submit_list for submission */ | ||
388 | +static void fill_sq_ring(AioContext *ctx) | ||
389 | +{ | ||
390 | + AioHandlerSList submit_list; | ||
391 | + AioHandler *node; | ||
392 | + unsigned flags; | ||
393 | + | ||
394 | + QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list); | ||
395 | + | ||
396 | + while ((node = dequeue(&submit_list, &flags))) { | ||
397 | + /* Order matters, just in case both flags were set */ | ||
398 | + if (flags & FDMON_IO_URING_ADD) { | ||
399 | + add_poll_add_sqe(ctx, node); | ||
400 | + } | ||
401 | + if (flags & FDMON_IO_URING_REMOVE) { | ||
402 | + add_poll_remove_sqe(ctx, node); | ||
403 | + } | ||
404 | + } | ||
405 | +} | ||
406 | + | ||
407 | +/* Returns true if a handler became ready */ | ||
408 | +static bool process_cqe(AioContext *ctx, | ||
409 | + AioHandlerList *ready_list, | ||
410 | + struct io_uring_cqe *cqe) | ||
411 | +{ | ||
412 | + AioHandler *node = io_uring_cqe_get_data(cqe); | ||
413 | + unsigned flags; | ||
414 | + | ||
415 | + /* poll_timeout and poll_remove have a zero user_data field */ | ||
416 | + if (!node) { | ||
417 | + return false; | ||
418 | + } | ||
419 | + | ||
420 | + /* | ||
421 | + * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race | ||
422 | + * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE | ||
423 | + * bit before IORING_OP_POLL_REMOVE is submitted. | ||
424 | + */ | ||
425 | + flags = atomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE); | ||
426 | + if (flags & FDMON_IO_URING_REMOVE) { | ||
427 | + QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted); | ||
428 | + return false; | ||
429 | + } | ||
430 | + | ||
431 | + aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res)); | ||
432 | + | ||
433 | + /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */ | ||
434 | + add_poll_add_sqe(ctx, node); | ||
435 | + return true; | ||
436 | +} | ||
437 | + | ||
438 | +static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list) | ||
439 | +{ | ||
440 | + struct io_uring *ring = &ctx->fdmon_io_uring; | ||
441 | + struct io_uring_cqe *cqe; | ||
442 | + unsigned num_cqes = 0; | ||
443 | + unsigned num_ready = 0; | ||
444 | + unsigned head; | ||
445 | + | ||
446 | + io_uring_for_each_cqe(ring, head, cqe) { | ||
447 | + if (process_cqe(ctx, ready_list, cqe)) { | ||
448 | + num_ready++; | ||
449 | + } | ||
450 | + | ||
451 | + num_cqes++; | ||
452 | + } | ||
453 | + | ||
454 | + io_uring_cq_advance(ring, num_cqes); | ||
455 | + return num_ready; | ||
456 | +} | ||
457 | + | ||
458 | +static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list, | ||
459 | + int64_t timeout) | ||
460 | +{ | ||
461 | + unsigned wait_nr = 1; /* block until at least one cqe is ready */ | ||
462 | + int ret; | ||
463 | + | ||
464 | + /* Fall back while external clients are disabled */ | ||
465 | + if (atomic_read(&ctx->external_disable_cnt)) { | ||
466 | + return fdmon_poll_ops.wait(ctx, ready_list, timeout); | ||
467 | + } | ||
468 | + | ||
469 | + if (timeout == 0) { | ||
470 | + wait_nr = 0; /* non-blocking */ | ||
471 | + } else if (timeout > 0) { | ||
472 | + add_timeout_sqe(ctx, timeout); | ||
473 | + } | ||
474 | + | ||
475 | + fill_sq_ring(ctx); | ||
476 | + | ||
477 | + ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr); | ||
478 | + assert(ret >= 0); | ||
479 | + | ||
480 | + return process_cq_ring(ctx, ready_list); | ||
481 | +} | ||
482 | + | ||
483 | +static const FDMonOps fdmon_io_uring_ops = { | ||
484 | + .update = fdmon_io_uring_update, | ||
485 | + .wait = fdmon_io_uring_wait, | ||
486 | +}; | ||
487 | + | ||
488 | +bool fdmon_io_uring_setup(AioContext *ctx) | ||
489 | +{ | ||
490 | + int ret; | ||
491 | + | ||
492 | + ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0); | ||
493 | + if (ret != 0) { | ||
494 | + return false; | ||
495 | + } | ||
496 | + | ||
497 | + QSLIST_INIT(&ctx->submit_list); | ||
498 | + ctx->fdmon_ops = &fdmon_io_uring_ops; | ||
499 | + return true; | ||
500 | +} | ||
501 | + | ||
502 | +void fdmon_io_uring_destroy(AioContext *ctx) | ||
503 | +{ | ||
504 | + if (ctx->fdmon_ops == &fdmon_io_uring_ops) { | ||
505 | + AioHandler *node; | ||
506 | + | ||
507 | + io_uring_queue_exit(&ctx->fdmon_io_uring); | ||
508 | + | ||
509 | + /* No need to submit these anymore, just free them. */ | ||
510 | + while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) { | ||
511 | + QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted); | ||
512 | + QLIST_REMOVE(node, node); | ||
513 | + g_free(node); | ||
514 | + } | ||
515 | + | ||
516 | + ctx->fdmon_ops = &fdmon_poll_ops; | ||
517 | + } | ||
518 | +} | ||
170 | -- | 519 | -- |
171 | 2.21.0 | 520 | 2.24.1 |
172 | 521 | ||
173 | diff view generated by jsdifflib |
1 | curl_multi_do_locked() currently marks all sockets as ready. That is | 1 | Unlike ppoll(2) and epoll(7), Linux io_uring completions can be polled |
---|---|---|---|
2 | not only inefficient, but in fact unsafe (the loop is). A follow-up | 2 | from userspace. Previously userspace polling was only allowed when all |
3 | patch will change that, but to do so, curl_multi_do_locked() needs to | 3 | AioHandler's had an ->io_poll() callback. This prevented starvation of |
4 | know exactly which socket is ready; and that is accomplished by this | 4 | fds by userspace pollable handlers. |
5 | patch here. | ||
6 | 5 | ||
7 | Cc: qemu-stable@nongnu.org | 6 | Add the FDMonOps->need_wait() callback that enables userspace polling |
8 | Signed-off-by: Max Reitz <mreitz@redhat.com> | 7 | even when some AioHandlers lack ->io_poll(). |
9 | Message-id: 20190910124136.10565-5-mreitz@redhat.com | 8 | |
10 | Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> | 9 | For example, it's now possible to do userspace polling when a TCP/IP |
11 | Reviewed-by: John Snow <jsnow@redhat.com> | 10 | socket is monitored thanks to Linux io_uring. |
12 | Signed-off-by: Max Reitz <mreitz@redhat.com> | 11 | |
12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
13 | Link: https://lore.kernel.org/r/20200305170806.1313245-7-stefanha@redhat.com | ||
14 | Message-Id: <20200305170806.1313245-7-stefanha@redhat.com> | ||
13 | --- | 15 | --- |
14 | block/curl.c | 20 +++++++++++--------- | 16 | include/block/aio.h | 19 +++++++++++++++++++ |
15 | 1 file changed, 11 insertions(+), 9 deletions(-) | 17 | util/aio-posix.c | 11 ++++++++--- |
18 | util/fdmon-epoll.c | 1 + | ||
19 | util/fdmon-io_uring.c | 6 ++++++ | ||
20 | util/fdmon-poll.c | 1 + | ||
21 | 5 files changed, 35 insertions(+), 3 deletions(-) | ||
16 | 22 | ||
17 | diff --git a/block/curl.c b/block/curl.c | 23 | diff --git a/include/block/aio.h b/include/block/aio.h |
18 | index XXXXXXX..XXXXXXX 100644 | 24 | index XXXXXXX..XXXXXXX 100644 |
19 | --- a/block/curl.c | 25 | --- a/include/block/aio.h |
20 | +++ b/block/curl.c | 26 | +++ b/include/block/aio.h |
21 | @@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, | 27 | @@ -XXX,XX +XXX,XX @@ struct ThreadPool; |
22 | switch (action) { | 28 | struct LinuxAioState; |
23 | case CURL_POLL_IN: | 29 | struct LuringState; |
24 | aio_set_fd_handler(s->aio_context, fd, false, | 30 | |
25 | - curl_multi_do, NULL, NULL, state); | 31 | +/* Is polling disabled? */ |
26 | + curl_multi_do, NULL, NULL, socket); | 32 | +bool aio_poll_disabled(AioContext *ctx); |
27 | break; | 33 | + |
28 | case CURL_POLL_OUT: | 34 | /* Callbacks for file descriptor monitoring implementations */ |
29 | aio_set_fd_handler(s->aio_context, fd, false, | 35 | typedef struct { |
30 | - NULL, curl_multi_do, NULL, state); | 36 | /* |
31 | + NULL, curl_multi_do, NULL, socket); | 37 | @@ -XXX,XX +XXX,XX @@ typedef struct { |
32 | break; | 38 | * Returns: number of ready file descriptors. |
33 | case CURL_POLL_INOUT: | 39 | */ |
34 | aio_set_fd_handler(s->aio_context, fd, false, | 40 | int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout); |
35 | - curl_multi_do, curl_multi_do, NULL, state); | 41 | + |
36 | + curl_multi_do, curl_multi_do, NULL, socket); | 42 | + /* |
37 | break; | 43 | + * need_wait: |
38 | case CURL_POLL_REMOVE: | 44 | + * @ctx: the AioContext |
39 | aio_set_fd_handler(s->aio_context, fd, false, | 45 | + * |
40 | @@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s) | 46 | + * Tell aio_poll() when to stop userspace polling early because ->wait() |
47 | + * has fds ready. | ||
48 | + * | ||
49 | + * File descriptor monitoring implementations that cannot poll fd readiness | ||
50 | + * from userspace should use aio_poll_disabled() here. This ensures that | ||
51 | + * file descriptors are not starved by handlers that frequently make | ||
52 | + * progress via userspace polling. | ||
53 | + * | ||
54 | + * Returns: true if ->wait() should be called, false otherwise. | ||
55 | + */ | ||
56 | + bool (*need_wait)(AioContext *ctx); | ||
57 | } FDMonOps; | ||
58 | |||
59 | /* | ||
60 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
61 | index XXXXXXX..XXXXXXX 100644 | ||
62 | --- a/util/aio-posix.c | ||
63 | +++ b/util/aio-posix.c | ||
64 | @@ -XXX,XX +XXX,XX @@ | ||
65 | #include "trace.h" | ||
66 | #include "aio-posix.h" | ||
67 | |||
68 | +bool aio_poll_disabled(AioContext *ctx) | ||
69 | +{ | ||
70 | + return atomic_read(&ctx->poll_disable_cnt); | ||
71 | +} | ||
72 | + | ||
73 | void aio_add_ready_handler(AioHandlerList *ready_list, | ||
74 | AioHandler *node, | ||
75 | int revents) | ||
76 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
77 | elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time; | ||
78 | max_ns = qemu_soonest_timeout(*timeout, max_ns); | ||
79 | assert(!(max_ns && progress)); | ||
80 | - } while (elapsed_time < max_ns && !atomic_read(&ctx->poll_disable_cnt)); | ||
81 | + } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx)); | ||
82 | |||
83 | /* If time has passed with no successful polling, adjust *timeout to | ||
84 | * keep the same ending time. | ||
85 | @@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) | ||
86 | { | ||
87 | int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); | ||
88 | |||
89 | - if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) { | ||
90 | + if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) { | ||
91 | poll_set_started(ctx, true); | ||
92 | |||
93 | if (run_poll_handlers(ctx, max_ns, timeout)) { | ||
94 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
95 | /* If polling is allowed, non-blocking aio_poll does not need the | ||
96 | * system call---a single round of run_poll_handlers_once suffices. | ||
97 | */ | ||
98 | - if (timeout || atomic_read(&ctx->poll_disable_cnt)) { | ||
99 | + if (timeout || ctx->fdmon_ops->need_wait(ctx)) { | ||
100 | ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout); | ||
101 | } | ||
102 | |||
103 | diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c | ||
104 | index XXXXXXX..XXXXXXX 100644 | ||
105 | --- a/util/fdmon-epoll.c | ||
106 | +++ b/util/fdmon-epoll.c | ||
107 | @@ -XXX,XX +XXX,XX @@ out: | ||
108 | static const FDMonOps fdmon_epoll_ops = { | ||
109 | .update = fdmon_epoll_update, | ||
110 | .wait = fdmon_epoll_wait, | ||
111 | + .need_wait = aio_poll_disabled, | ||
112 | }; | ||
113 | |||
114 | static bool fdmon_epoll_try_enable(AioContext *ctx) | ||
115 | diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c | ||
116 | index XXXXXXX..XXXXXXX 100644 | ||
117 | --- a/util/fdmon-io_uring.c | ||
118 | +++ b/util/fdmon-io_uring.c | ||
119 | @@ -XXX,XX +XXX,XX @@ static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list, | ||
120 | return process_cq_ring(ctx, ready_list); | ||
41 | } | 121 | } |
42 | 122 | ||
43 | /* Called with s->mutex held. */ | 123 | +static bool fdmon_io_uring_need_wait(AioContext *ctx) |
44 | -static void curl_multi_do_locked(CURLState *s) | 124 | +{ |
45 | +static void curl_multi_do_locked(CURLSocket *ready_socket) | 125 | + return io_uring_cq_ready(&ctx->fdmon_io_uring); |
46 | { | 126 | +} |
47 | CURLSocket *socket, *next_socket; | 127 | + |
48 | + CURLState *s = ready_socket->state; | 128 | static const FDMonOps fdmon_io_uring_ops = { |
49 | int running; | 129 | .update = fdmon_io_uring_update, |
50 | int r; | 130 | .wait = fdmon_io_uring_wait, |
51 | 131 | + .need_wait = fdmon_io_uring_need_wait, | |
52 | @@ -XXX,XX +XXX,XX @@ static void curl_multi_do_locked(CURLState *s) | 132 | }; |
53 | 133 | ||
54 | static void curl_multi_do(void *arg) | 134 | bool fdmon_io_uring_setup(AioContext *ctx) |
55 | { | 135 | diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c |
56 | - CURLState *s = (CURLState *)arg; | 136 | index XXXXXXX..XXXXXXX 100644 |
57 | + CURLSocket *socket = arg; | 137 | --- a/util/fdmon-poll.c |
58 | + BDRVCURLState *s = socket->state->s; | 138 | +++ b/util/fdmon-poll.c |
59 | 139 | @@ -XXX,XX +XXX,XX @@ static void fdmon_poll_update(AioContext *ctx, | |
60 | - qemu_mutex_lock(&s->s->mutex); | 140 | const FDMonOps fdmon_poll_ops = { |
61 | - curl_multi_do_locked(s); | 141 | .update = fdmon_poll_update, |
62 | - curl_multi_check_completion(s->s); | 142 | .wait = fdmon_poll_wait, |
63 | - qemu_mutex_unlock(&s->s->mutex); | 143 | + .need_wait = aio_poll_disabled, |
64 | + qemu_mutex_lock(&s->mutex); | 144 | }; |
65 | + curl_multi_do_locked(socket); | ||
66 | + curl_multi_check_completion(s); | ||
67 | + qemu_mutex_unlock(&s->mutex); | ||
68 | } | ||
69 | |||
70 | static void curl_multi_timeout_do(void *arg) | ||
71 | -- | 145 | -- |
72 | 2.21.0 | 146 | 2.24.1 |
73 | 147 | ||
74 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | Background: As of cURL 7.59.0, it verifies that several functions are | ||
2 | not called from within a callback. Among these functions is | ||
3 | curl_multi_add_handle(). | ||
4 | 1 | ||
5 | curl_read_cb() is a callback from cURL and not a coroutine. Waking up | ||
6 | acb->co will lead to entering it then and there, which means the current | ||
7 | request will settle and the caller (if it runs in the same coroutine) | ||
8 | may then issue the next request. In such a case, we will enter | ||
9 | curl_setup_preadv() effectively from within curl_read_cb(). | ||
10 | |||
11 | Calling curl_multi_add_handle() will then fail and the new request will | ||
12 | not be processed. | ||
13 | |||
14 | Fix this by not letting curl_read_cb() wake up acb->co. Instead, leave | ||
15 | the whole business of settling the AIOCB objects to | ||
16 | curl_multi_check_completion() (which is called from our timer callback | ||
17 | and our FD handler, so not from any cURL callbacks). | ||
18 | |||
19 | Reported-by: Natalie Gavrielov <ngavrilo@redhat.com> | ||
20 | Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1740193 | ||
21 | Cc: qemu-stable@nongnu.org | ||
22 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
23 | Message-id: 20190910124136.10565-7-mreitz@redhat.com | ||
24 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
25 | Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> | ||
26 | Signed-off-by: Max Reitz <mreitz@redhat.com> | ||
27 | --- | ||
28 | block/curl.c | 69 ++++++++++++++++++++++------------------------------ | ||
29 | 1 file changed, 29 insertions(+), 40 deletions(-) | ||
30 | |||
31 | diff --git a/block/curl.c b/block/curl.c | ||
32 | index XXXXXXX..XXXXXXX 100644 | ||
33 | --- a/block/curl.c | ||
34 | +++ b/block/curl.c | ||
35 | @@ -XXX,XX +XXX,XX @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) | ||
36 | { | ||
37 | CURLState *s = ((CURLState*)opaque); | ||
38 | size_t realsize = size * nmemb; | ||
39 | - int i; | ||
40 | |||
41 | trace_curl_read_cb(realsize); | ||
42 | |||
43 | @@ -XXX,XX +XXX,XX @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) | ||
44 | memcpy(s->orig_buf + s->buf_off, ptr, realsize); | ||
45 | s->buf_off += realsize; | ||
46 | |||
47 | - for(i=0; i<CURL_NUM_ACB; i++) { | ||
48 | - CURLAIOCB *acb = s->acb[i]; | ||
49 | - | ||
50 | - if (!acb) | ||
51 | - continue; | ||
52 | - | ||
53 | - if ((s->buf_off >= acb->end)) { | ||
54 | - size_t request_length = acb->bytes; | ||
55 | - | ||
56 | - qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start, | ||
57 | - acb->end - acb->start); | ||
58 | - | ||
59 | - if (acb->end - acb->start < request_length) { | ||
60 | - size_t offset = acb->end - acb->start; | ||
61 | - qemu_iovec_memset(acb->qiov, offset, 0, | ||
62 | - request_length - offset); | ||
63 | - } | ||
64 | - | ||
65 | - acb->ret = 0; | ||
66 | - s->acb[i] = NULL; | ||
67 | - qemu_mutex_unlock(&s->s->mutex); | ||
68 | - aio_co_wake(acb->co); | ||
69 | - qemu_mutex_lock(&s->s->mutex); | ||
70 | - } | ||
71 | - } | ||
72 | - | ||
73 | read_end: | ||
74 | /* curl will error out if we do not return this value */ | ||
75 | return size * nmemb; | ||
76 | @@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s) | ||
77 | break; | ||
78 | |||
79 | if (msg->msg == CURLMSG_DONE) { | ||
80 | + int i; | ||
81 | CURLState *state = NULL; | ||
82 | + bool error = msg->data.result != CURLE_OK; | ||
83 | + | ||
84 | curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, | ||
85 | (char **)&state); | ||
86 | |||
87 | - /* ACBs for successful messages get completed in curl_read_cb */ | ||
88 | - if (msg->data.result != CURLE_OK) { | ||
89 | - int i; | ||
90 | + if (error) { | ||
91 | static int errcount = 100; | ||
92 | |||
93 | /* Don't lose the original error message from curl, since | ||
94 | @@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s) | ||
95 | error_report("curl: further errors suppressed"); | ||
96 | } | ||
97 | } | ||
98 | + } | ||
99 | |||
100 | - for (i = 0; i < CURL_NUM_ACB; i++) { | ||
101 | - CURLAIOCB *acb = state->acb[i]; | ||
102 | + for (i = 0; i < CURL_NUM_ACB; i++) { | ||
103 | + CURLAIOCB *acb = state->acb[i]; | ||
104 | |||
105 | - if (acb == NULL) { | ||
106 | - continue; | ||
107 | - } | ||
108 | + if (acb == NULL) { | ||
109 | + continue; | ||
110 | + } | ||
111 | + | ||
112 | + if (!error) { | ||
113 | + /* Assert that we have read all data */ | ||
114 | + assert(state->buf_off >= acb->end); | ||
115 | + | ||
116 | + qemu_iovec_from_buf(acb->qiov, 0, | ||
117 | + state->orig_buf + acb->start, | ||
118 | + acb->end - acb->start); | ||
119 | |||
120 | - acb->ret = -EIO; | ||
121 | - state->acb[i] = NULL; | ||
122 | - qemu_mutex_unlock(&s->mutex); | ||
123 | - aio_co_wake(acb->co); | ||
124 | - qemu_mutex_lock(&s->mutex); | ||
125 | + if (acb->end - acb->start < acb->bytes) { | ||
126 | + size_t offset = acb->end - acb->start; | ||
127 | + qemu_iovec_memset(acb->qiov, offset, 0, | ||
128 | + acb->bytes - offset); | ||
129 | + } | ||
130 | } | ||
131 | + | ||
132 | + acb->ret = error ? -EIO : 0; | ||
133 | + state->acb[i] = NULL; | ||
134 | + qemu_mutex_unlock(&s->mutex); | ||
135 | + aio_co_wake(acb->co); | ||
136 | + qemu_mutex_lock(&s->mutex); | ||
137 | } | ||
138 | |||
139 | curl_clean_state(state); | ||
140 | -- | ||
141 | 2.21.0 | ||
142 | |||
143 | diff view generated by jsdifflib |
1 | From: Sergio Lopez <slp@redhat.com> | 1 | When there are many poll handlers it's likely that some of them are idle |
---|---|---|---|
2 | 2 | most of the time. Remove handlers that haven't had activity recently so | |
3 | block_job_remove_all_bdrv() iterates through job->nodes, calling | 3 | that the polling loop scales better for guests with a large number of |
4 | bdrv_root_unref_child() for each entry. The call to the latter may | 4 | devices. |
5 | reach child_job_[can_]set_aio_ctx(), which will also attempt to | 5 | |
6 | traverse job->nodes, potentially finding entries that where freed | 6 | This feature only takes effect for the Linux io_uring fd monitoring |
7 | on previous iterations. | 7 | implementation because it is capable of combining fd monitoring with |
8 | 8 | userspace polling. The other implementations can't do that and risk | |
9 | To avoid this situation, update job->nodes head on each iteration to | 9 | starving fds in favor of poll handlers, so don't try this optimization |
10 | ensure that already freed entries are no longer linked to the list. | 10 | when they are in use. |
11 | 11 | ||
12 | RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1746631 | 12 | IOPS improves from 10k to 105k when the guest has 100 |
13 | Signed-off-by: Sergio Lopez <slp@redhat.com> | 13 | virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1 |
14 | Cc: qemu-stable@nongnu.org | 14 | device for rw=randread,iodepth=1,bs=4k,ioengine=libaio on NVMe. |
15 | Signed-off-by: Max Reitz <mreitz@redhat.com> | 15 | |
16 | Message-id: 20190911100316.32282-1-mreitz@redhat.com | 16 | [Clarified aio_poll_handlers locking discipline explanation in comment |
17 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 17 | after discussion with Paolo Bonzini <pbonzini@redhat.com>. |
18 | Signed-off-by: Max Reitz <mreitz@redhat.com> | 18 | --Stefan] |
19 | |||
20 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
21 | Link: https://lore.kernel.org/r/20200305170806.1313245-8-stefanha@redhat.com | ||
22 | Message-Id: <20200305170806.1313245-8-stefanha@redhat.com> | ||
19 | --- | 23 | --- |
20 | blockjob.c | 17 +++++++++++++---- | 24 | include/block/aio.h | 8 ++++ |
21 | 1 file changed, 13 insertions(+), 4 deletions(-) | 25 | util/aio-posix.c | 93 +++++++++++++++++++++++++++++++++++++++++---- |
22 | 26 | util/aio-posix.h | 2 + | |
23 | diff --git a/blockjob.c b/blockjob.c | 27 | util/trace-events | 2 + |
24 | index XXXXXXX..XXXXXXX 100644 | 28 | 4 files changed, 98 insertions(+), 7 deletions(-) |
25 | --- a/blockjob.c | 29 | |
26 | +++ b/blockjob.c | 30 | diff --git a/include/block/aio.h b/include/block/aio.h |
27 | @@ -XXX,XX +XXX,XX @@ static const BdrvChildRole child_job = { | 31 | index XXXXXXX..XXXXXXX 100644 |
28 | 32 | --- a/include/block/aio.h | |
29 | void block_job_remove_all_bdrv(BlockJob *job) | 33 | +++ b/include/block/aio.h |
34 | @@ -XXX,XX +XXX,XX @@ struct AioContext { | ||
35 | int64_t poll_grow; /* polling time growth factor */ | ||
36 | int64_t poll_shrink; /* polling time shrink factor */ | ||
37 | |||
38 | + /* | ||
39 | + * List of handlers participating in userspace polling. Protected by | ||
40 | + * ctx->list_lock. Iterated and modified mostly by the event loop thread | ||
41 | + * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler() | ||
42 | + * only touches the list to delete nodes if ctx->list_lock's count is zero. | ||
43 | + */ | ||
44 | + AioHandlerList poll_aio_handlers; | ||
45 | + | ||
46 | /* Are we in polling mode or monitoring file descriptors? */ | ||
47 | bool poll_started; | ||
48 | |||
49 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
50 | index XXXXXXX..XXXXXXX 100644 | ||
51 | --- a/util/aio-posix.c | ||
52 | +++ b/util/aio-posix.c | ||
53 | @@ -XXX,XX +XXX,XX @@ | ||
54 | #include "trace.h" | ||
55 | #include "aio-posix.h" | ||
56 | |||
57 | +/* Stop userspace polling on a handler if it isn't active for some time */ | ||
58 | +#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND) | ||
59 | + | ||
60 | bool aio_poll_disabled(AioContext *ctx) | ||
30 | { | 61 | { |
31 | - GSList *l; | 62 | return atomic_read(&ctx->poll_disable_cnt); |
32 | - for (l = job->nodes; l; l = l->next) { | 63 | @@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node) |
64 | * deleted because deleted nodes are only cleaned up while | ||
65 | * no one is walking the handlers list. | ||
66 | */ | ||
67 | + QLIST_SAFE_REMOVE(node, node_poll); | ||
68 | QLIST_REMOVE(node, node); | ||
69 | return true; | ||
70 | } | ||
71 | @@ -XXX,XX +XXX,XX @@ static bool poll_set_started(AioContext *ctx, bool started) | ||
72 | ctx->poll_started = started; | ||
73 | |||
74 | qemu_lockcnt_inc(&ctx->list_lock); | ||
75 | - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
76 | + QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) { | ||
77 | IOHandler *fn; | ||
78 | |||
79 | if (QLIST_IS_INSERTED(node, node_deleted)) { | ||
80 | @@ -XXX,XX +XXX,XX @@ static void aio_free_deleted_handlers(AioContext *ctx) | ||
81 | while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) { | ||
82 | QLIST_REMOVE(node, node); | ||
83 | QLIST_REMOVE(node, node_deleted); | ||
84 | + QLIST_SAFE_REMOVE(node, node_poll); | ||
85 | g_free(node); | ||
86 | } | ||
87 | |||
88 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node) | ||
89 | revents = node->pfd.revents & node->pfd.events; | ||
90 | node->pfd.revents = 0; | ||
91 | |||
33 | + /* | 92 | + /* |
34 | + * bdrv_root_unref_child() may reach child_job_[can_]set_aio_ctx(), | 93 | + * Start polling AioHandlers when they become ready because activity is |
35 | + * which will also traverse job->nodes, so consume the list one by | 94 | + * likely to continue. Note that starvation is theoretically possible when |
36 | + * one to make sure that such a concurrent access does not attempt | 95 | + * fdmon_supports_polling(), but only until the fd fires for the first |
37 | + * to process an already freed BdrvChild. | 96 | + * time. |
38 | + */ | 97 | + */ |
39 | + while (job->nodes) { | 98 | + if (!QLIST_IS_INSERTED(node, node_deleted) && |
40 | + GSList *l = job->nodes; | 99 | + !QLIST_IS_INSERTED(node, node_poll) && |
41 | BdrvChild *c = l->data; | 100 | + node->io_poll) { |
42 | + | 101 | + trace_poll_add(ctx, node, node->pfd.fd, revents); |
43 | + job->nodes = l->next; | 102 | + if (ctx->poll_started && node->io_poll_begin) { |
44 | + | 103 | + node->io_poll_begin(node->opaque); |
45 | bdrv_op_unblock_all(c->bs, job->blocker); | 104 | + } |
46 | bdrv_root_unref_child(c); | 105 | + QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll); |
47 | + | 106 | + } |
48 | + g_slist_free_1(l); | 107 | + |
49 | } | 108 | if (!QLIST_IS_INSERTED(node, node_deleted) && |
50 | - g_slist_free(job->nodes); | 109 | (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) && |
51 | - job->nodes = NULL; | 110 | aio_node_check(ctx, node->is_external) && |
111 | @@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx) | ||
112 | timerlistgroup_run_timers(&ctx->tlg); | ||
52 | } | 113 | } |
53 | 114 | ||
54 | bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs) | 115 | -static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) |
116 | +static bool run_poll_handlers_once(AioContext *ctx, | ||
117 | + int64_t now, | ||
118 | + int64_t *timeout) | ||
119 | { | ||
120 | bool progress = false; | ||
121 | AioHandler *node; | ||
122 | + AioHandler *tmp; | ||
123 | |||
124 | - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
125 | - if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll && | ||
126 | - aio_node_check(ctx, node->is_external) && | ||
127 | + QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) { | ||
128 | + if (aio_node_check(ctx, node->is_external) && | ||
129 | node->io_poll(node->opaque)) { | ||
130 | + node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; | ||
131 | + | ||
132 | /* | ||
133 | * Polling was successful, exit try_poll_mode immediately | ||
134 | * to adjust the next polling time. | ||
135 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) | ||
136 | return progress; | ||
137 | } | ||
138 | |||
139 | +static bool fdmon_supports_polling(AioContext *ctx) | ||
140 | +{ | ||
141 | + return ctx->fdmon_ops->need_wait != aio_poll_disabled; | ||
142 | +} | ||
143 | + | ||
144 | +static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now) | ||
145 | +{ | ||
146 | + AioHandler *node; | ||
147 | + AioHandler *tmp; | ||
148 | + bool progress = false; | ||
149 | + | ||
150 | + /* | ||
151 | + * File descriptor monitoring implementations without userspace polling | ||
152 | + * support suffer from starvation when a subset of handlers is polled | ||
153 | + * because fds will not be processed in a timely fashion. Don't remove | ||
154 | + * idle poll handlers. | ||
155 | + */ | ||
156 | + if (!fdmon_supports_polling(ctx)) { | ||
157 | + return false; | ||
158 | + } | ||
159 | + | ||
160 | + QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) { | ||
161 | + if (node->poll_idle_timeout == 0LL) { | ||
162 | + node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; | ||
163 | + } else if (now >= node->poll_idle_timeout) { | ||
164 | + trace_poll_remove(ctx, node, node->pfd.fd); | ||
165 | + node->poll_idle_timeout = 0LL; | ||
166 | + QLIST_SAFE_REMOVE(node, node_poll); | ||
167 | + if (ctx->poll_started && node->io_poll_end) { | ||
168 | + node->io_poll_end(node->opaque); | ||
169 | + | ||
170 | + /* | ||
171 | + * Final poll in case ->io_poll_end() races with an event. | ||
172 | + * Nevermind about re-adding the handler in the rare case where | ||
173 | + * this causes progress. | ||
174 | + */ | ||
175 | + progress = node->io_poll(node->opaque) || progress; | ||
176 | + } | ||
177 | + } | ||
178 | + } | ||
179 | + | ||
180 | + return progress; | ||
181 | +} | ||
182 | + | ||
183 | /* run_poll_handlers: | ||
184 | * @ctx: the AioContext | ||
185 | * @max_ns: maximum time to poll for, in nanoseconds | ||
186 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
187 | |||
188 | start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); | ||
189 | do { | ||
190 | - progress = run_poll_handlers_once(ctx, timeout); | ||
191 | + progress = run_poll_handlers_once(ctx, start_time, timeout); | ||
192 | elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time; | ||
193 | max_ns = qemu_soonest_timeout(*timeout, max_ns); | ||
194 | assert(!(max_ns && progress)); | ||
195 | } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx)); | ||
196 | |||
197 | + if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) { | ||
198 | + *timeout = 0; | ||
199 | + progress = true; | ||
200 | + } | ||
201 | + | ||
202 | /* If time has passed with no successful polling, adjust *timeout to | ||
203 | * keep the same ending time. | ||
204 | */ | ||
205 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
206 | */ | ||
207 | static bool try_poll_mode(AioContext *ctx, int64_t *timeout) | ||
208 | { | ||
209 | - int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); | ||
210 | + int64_t max_ns; | ||
211 | + | ||
212 | + if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) { | ||
213 | + return false; | ||
214 | + } | ||
215 | |||
216 | + max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); | ||
217 | if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) { | ||
218 | poll_set_started(ctx, true); | ||
219 | |||
220 | diff --git a/util/aio-posix.h b/util/aio-posix.h | ||
221 | index XXXXXXX..XXXXXXX 100644 | ||
222 | --- a/util/aio-posix.h | ||
223 | +++ b/util/aio-posix.h | ||
224 | @@ -XXX,XX +XXX,XX @@ struct AioHandler { | ||
225 | QLIST_ENTRY(AioHandler) node; | ||
226 | QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ | ||
227 | QLIST_ENTRY(AioHandler) node_deleted; | ||
228 | + QLIST_ENTRY(AioHandler) node_poll; | ||
229 | #ifdef CONFIG_LINUX_IO_URING | ||
230 | QSLIST_ENTRY(AioHandler) node_submitted; | ||
231 | unsigned flags; /* see fdmon-io_uring.c */ | ||
232 | #endif | ||
233 | + int64_t poll_idle_timeout; /* when to stop userspace polling */ | ||
234 | bool is_external; | ||
235 | }; | ||
236 | |||
237 | diff --git a/util/trace-events b/util/trace-events | ||
238 | index XXXXXXX..XXXXXXX 100644 | ||
239 | --- a/util/trace-events | ||
240 | +++ b/util/trace-events | ||
241 | @@ -XXX,XX +XXX,XX @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_ | ||
242 | run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64 | ||
243 | poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 | ||
244 | poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 | ||
245 | +poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x" | ||
246 | +poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d" | ||
247 | |||
248 | # async.c | ||
249 | aio_co_schedule(void *ctx, void *co) "ctx %p co %p" | ||
55 | -- | 250 | -- |
56 | 2.21.0 | 251 | 2.24.1 |
57 | 252 | ||
58 | diff view generated by jsdifflib |