The following changes since commit dd25f97c66a75d1508f1d4c6478ed2c95bec428f:

  Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20190913' into staging (2019-09-16 10:15:15 +0100)

are available in the Git repository at:

  https://github.com/XanClic/qemu.git tags/pull-block-2019-09-16

for you to fetch changes up to 1825cc0783ccf0ec5d9f0b225a99b340bdd4c68f:

  qemu-iotests: Add test for bz #1745922 (2019-09-16 15:37:12 +0200)

----------------------------------------------------------------
Block patches:
- Fix for block jobs when used with I/O threads
- Fix for a corruption when using qcow2's LUKS encryption mode
- cURL fix
- check-block.sh cleanups (for make check)
- Refactoring

----------------------------------------------------------------
Max Reitz (7):
      curl: Keep pointer to the CURLState in CURLSocket
      curl: Keep *socket until the end of curl_sock_cb()
      curl: Check completion in curl_multi_do()
      curl: Pass CURLSocket to curl_multi_do()
      curl: Report only ready sockets
      curl: Handle success in multi_check_completion
      curl: Check curl_multi_add_handle()'s return code

Maxim Levitsky (3):
      block/qcow2: Fix corruption introduced by commit 8ac0f15f335
      block/qcow2: refactor encryption code
      qemu-iotests: Add test for bz #1745922

Nir Soffer (2):
      block: Use QEMU_IS_ALIGNED
      block: Remove unused masks

Sergio Lopez (1):
      blockjob: update nodes head while removing all bdrv

Thomas Huth (2):
      tests/qemu-iotests/check: Replace "tests" with "iotests" in final status text
      tests/Makefile: Do not print the name of the check-block.sh shell script

Vladimir Sementsov-Ogievskiy (1):
      tests/qemu-iotests: Fix qemu-io related output in 026.out.nocache

 tests/Makefile.include             |   2 +-
 block/qcow2.h                      |   8 +-
 include/block/block.h              |   2 -
 block/bochs.c                      |   4 +-
 block/cloop.c                      |   4 +-
 block/curl.c                       | 133 ++++++++++-------------
 block/dmg.c                        |   4 +-
 block/io.c                         |   8 +-
 block/qcow2-cluster.c              |  40 +++----
 block/qcow2-threads.c              |  63 ++++++++---
 block/qcow2.c                      |   9 +-
 block/vvfat.c                      |   8 +-
 blockjob.c                         |  17 ++-
 migration/block.c                  |   2 +-
 qemu-img.c                         |   2 +-
 tests/qemu-iotests/026.out.nocache | 168 ++++++++++++++---------------
 tests/qemu-iotests/263             |  91 ++++++++++++++++
 tests/qemu-iotests/263.out         |  40 +++++++
 tests/qemu-iotests/check           |   8 +-
 tests/qemu-iotests/group           |   1 +
 20 files changed, 380 insertions(+), 234 deletions(-)
 create mode 100755 tests/qemu-iotests/263
 create mode 100644 tests/qemu-iotests/263.out

--
2.21.0

The following changes since commit 67f17e23baca5dd545fe98b01169cc351a70fe35:

  Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging (2020-03-06 17:15:36 +0000)

are available in the Git repository at:

  https://github.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to d37d0e365afb6825a90d8356fc6adcc1f58f40f3:

  aio-posix: remove idle poll handlers to improve scalability (2020-03-09 16:45:16 +0000)

----------------------------------------------------------------
Pull request

----------------------------------------------------------------

Stefan Hajnoczi (9):
      qemu/queue.h: clear linked list pointers on remove
      aio-posix: remove confusing QLIST_SAFE_REMOVE()
      aio-posix: completely stop polling when disabled
      aio-posix: move RCU_READ_LOCK() into run_poll_handlers()
      aio-posix: extract ppoll(2) and epoll(7) fd monitoring
      aio-posix: simplify FDMonOps->update() prototype
      aio-posix: add io_uring fd monitoring implementation
      aio-posix: support userspace polling of fd monitoring
      aio-posix: remove idle poll handlers to improve scalability

 MAINTAINERS           |   2 +
 configure             |   5 +
 include/block/aio.h   |  71 ++++++-
 include/qemu/queue.h  |  19 +-
 util/Makefile.objs    |   3 +
 util/aio-posix.c      | 451 ++++++++++++++----------------------------
 util/aio-posix.h      |  81 ++++++++
 util/fdmon-epoll.c    | 155 +++++++++++++++
 util/fdmon-io_uring.c | 332 +++++++++++++++++++++++++++++++
 util/fdmon-poll.c     | 107 ++++++++++
 util/trace-events     |   2 +
 11 files changed, 915 insertions(+), 313 deletions(-)
 create mode 100644 util/aio-posix.h
 create mode 100644 util/fdmon-epoll.c
 create mode 100644 util/fdmon-io_uring.c
 create mode 100644 util/fdmon-poll.c

--
2.24.1
Deleted patch
From: Nir Soffer <nirsof@gmail.com>

Replace instances of:

    (n & (BDRV_SECTOR_SIZE - 1)) == 0

And:

    (n & ~BDRV_SECTOR_MASK) == 0

With:

    QEMU_IS_ALIGNED(n, BDRV_SECTOR_SIZE)

Which reveals the intent of the code better, and makes it easier to
locate the code checking alignment.
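As a quick illustration (my own sketch, not part of the patch; the
QEMU_IS_ALIGNED definition below is paraphrased from osdep.h), all
three forms agree for power-of-two sizes:

    #include <assert.h>
    #include <stdint.h>

    #define BDRV_SECTOR_SIZE      512ULL                /* power of two */
    #define BDRV_SECTOR_MASK      (~(BDRV_SECTOR_SIZE - 1))
    #define QEMU_IS_ALIGNED(n, m) (((n) % (m)) == 0)    /* paraphrased */

    int main(void)
    {
        for (uint64_t n = 0; n < 4096; n++) {
            int a = (n & (BDRV_SECTOR_SIZE - 1)) == 0;
            int b = (n & ~BDRV_SECTOR_MASK) == 0;
            int c = QEMU_IS_ALIGNED(n, BDRV_SECTOR_SIZE);
            assert(a == b && b == c);   /* all three checks match */
        }
        return 0;
    }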
Signed-off-by: Nir Soffer <nsoffer@redhat.com>
Message-id: 20190827185913.27427-2-nsoffer@redhat.com
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/bochs.c         | 4 ++--
 block/cloop.c         | 4 ++--
 block/dmg.c           | 4 ++--
 block/io.c            | 8 ++++----
 block/qcow2-cluster.c | 4 ++--
 block/qcow2.c         | 4 ++--
 block/vvfat.c         | 8 ++++----
 qemu-img.c            | 2 +-
 8 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/block/bochs.c b/block/bochs.c
index XXXXXXX..XXXXXXX 100644
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -XXX,XX +XXX,XX @@ bochs_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     QEMUIOVector local_qiov;
     int ret;

-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));

     qemu_iovec_init(&local_qiov, qiov->niov);
     qemu_co_mutex_lock(&s->lock);
diff --git a/block/cloop.c b/block/cloop.c
index XXXXXXX..XXXXXXX 100644
--- a/block/cloop.c
+++ b/block/cloop.c
@@ -XXX,XX +XXX,XX @@ cloop_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
     int ret, i;

-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));

     qemu_co_mutex_lock(&s->lock);

diff --git a/block/dmg.c b/block/dmg.c
index XXXXXXX..XXXXXXX 100644
--- a/block/dmg.c
+++ b/block/dmg.c
@@ -XXX,XX +XXX,XX @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
     int ret, i;

-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));

     qemu_co_mutex_lock(&s->lock);

diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
     sector_num = offset >> BDRV_SECTOR_BITS;
     nb_sectors = bytes >> BDRV_SECTOR_BITS;

-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
     assert(drv->bdrv_co_readv);

@@ -XXX,XX +XXX,XX @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
     sector_num = offset >> BDRV_SECTOR_BITS;
     nb_sectors = bytes >> BDRV_SECTOR_BITS;

-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
     assert(bytes <= BDRV_REQUEST_MAX_BYTES);

     assert(drv->bdrv_co_writev);
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
 {
     if (bytes && bs->encrypted) {
         BDRVQcow2State *s = bs->opaque;
-        assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
-        assert((bytes & ~BDRV_SECTOR_MASK) == 0);
+        assert(QEMU_IS_ALIGNED(offset_in_cluster, BDRV_SECTOR_SIZE));
+        assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
         assert(s->crypto);
         if (qcow2_co_encrypt(bs, cluster_offset,
                              src_cluster_offset + offset_in_cluster,
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs,
                     goto fail;
                 }

-                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+                assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+                assert(QEMU_IS_ALIGNED(cur_bytes, BDRV_SECTOR_SIZE));
                 if (qcow2_co_decrypt(bs, cluster_offset, offset,
                                      cluster_data, cur_bytes) < 0) {
                     ret = -EIO;
diff --git a/block/vvfat.c b/block/vvfat.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -XXX,XX +XXX,XX @@ vvfat_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
     void *buf;

-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));

     buf = g_try_malloc(bytes);
     if (bytes && buf == NULL) {
@@ -XXX,XX +XXX,XX @@ vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
     void *buf;

-    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
+    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));

     buf = g_try_malloc(bytes);
     if (bytes && buf == NULL) {
diff --git a/qemu-img.c b/qemu-img.c
index XXXXXXX..XXXXXXX 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -XXX,XX +XXX,XX @@ static int img_convert(int argc, char **argv)
                 int64_t sval;

                 sval = cvtnum(optarg);
-                if (sval < 0 || sval & (BDRV_SECTOR_SIZE - 1) ||
+                if (sval < 0 || !QEMU_IS_ALIGNED(sval, BDRV_SECTOR_SIZE) ||
                     sval / BDRV_SECTOR_SIZE > MAX_BUF_SECTORS) {
                     error_report("Invalid buffer size for sparse output specified. "
                         "Valid sizes are multiples of %llu up to %llu. Select "
--
2.21.0
Deleted patch
From: Nir Soffer <nirsof@gmail.com>

Replace the confusing usage:

    ~BDRV_SECTOR_MASK

With the clearer:

    (BDRV_SECTOR_SIZE - 1)

Remove BDRV_SECTOR_MASK and the unused BDRV_BLOCK_OFFSET_MASK, which was
its last user.
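A small worked example (mine, not from the patch) of the identity this
relies on, with BDRV_SECTOR_SIZE = 512:

    BDRV_SECTOR_MASK  = ~(512 - 1) = ...fffffe00
    ~BDRV_SECTOR_MASK =   512 - 1  = 0x1ff

so, for any addr:

    addr & ~BDRV_SECTOR_MASK  ==  addr & (BDRV_SECTOR_SIZE - 1)

Both expressions extract the low 9 bits (the offset within the sector),
which is exactly what the migration/block.c flags field below needs.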
Signed-off-by: Nir Soffer <nsoffer@redhat.com>
Message-id: 20190827185913.27427-3-nsoffer@redhat.com
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 include/block/block.h | 2 --
 migration/block.c     | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -XXX,XX +XXX,XX @@ typedef struct HDGeometry {

 #define BDRV_SECTOR_BITS   9
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
-#define BDRV_SECTOR_MASK   ~(BDRV_SECTOR_SIZE - 1)

 #define BDRV_REQUEST_MAX_SECTORS MIN(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                      INT_MAX >> BDRV_SECTOR_BITS)
@@ -XXX,XX +XXX,XX @@ typedef struct HDGeometry {
 #define BDRV_BLOCK_ALLOCATED    0x10
 #define BDRV_BLOCK_EOF          0x20
 #define BDRV_BLOCK_RECURSE      0x40
-#define BDRV_BLOCK_OFFSET_MASK  BDRV_SECTOR_MASK

 typedef QSIMPLEQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;

diff --git a/migration/block.c b/migration/block.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -XXX,XX +XXX,XX @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
     do {
         addr = qemu_get_be64(f);

-        flags = addr & ~BDRV_SECTOR_MASK;
+        flags = addr & (BDRV_SECTOR_SIZE - 1);
         addr >>= BDRV_SECTOR_BITS;

         if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
--
2.21.0
Deleted patch
From: Thomas Huth <thuth@redhat.com>

When running "make check -j8" or something similar, the iotests are
running in parallel with the other tests. So when they are printing
out "Passed all xx tests" or a similar status message at the end,
it might not be quite clear that this message belongs to the iotests,
since the output might be mixed with the other tests. Thus change the
word "tests" here to "iotests" instead to avoid confusion.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-id: 20190906113920.11271-1-thuth@redhat.com
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/check | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/check
+++ b/tests/qemu-iotests/check
@@ -XXX,XX +XXX,XX @@ END { if (NR > 0) {
     if [ ! -z "$n_bad" -a $n_bad != 0 ]
     then
         echo "Failures:$bad"
-        echo "Failed $n_bad of $try tests"
+        echo "Failed $n_bad of $try iotests"
         echo "Failures:$bad" | fmt >>check.log
-        echo "Failed $n_bad of $try tests" >>check.log
+        echo "Failed $n_bad of $try iotests" >>check.log
     else
-        echo "Passed all $try tests"
-        echo "Passed all $try tests" >>check.log
+        echo "Passed all $try iotests"
+        echo "Passed all $try iotests" >>check.log
     fi
     needwrap=false
 fi
--
2.21.0
From: Maxim Levitsky <mlevitsk@redhat.com>

This fixes subtle corruption introduced by LUKS threaded encryption
in commit 8ac0f15f335.

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1745922

The corruption happens when we do a write that
* writes to two or more unallocated clusters at once
* doesn't fully cover the first sector
* doesn't fully cover the last sector
* uses LUKS encryption

In this case, when allocating the new clusters we COW both areas
prior to the write and after the write, and we encrypt them.

The above-mentioned commit accidentally made it so we encrypt the
second COW area using the physical cluster offset of the first area.

The problem is that offset_in_cluster in do_perform_cow_encrypt can be
larger than the cluster size, so cluster_offset no longer points to the
start of the cluster at which the encrypted area starts.

The next patch in this series will refactor the code to avoid all these
assumptions.

In the bug report, this was triggered by rebasing a LUKS image onto a
new, zero-filled base, which does a lot of such writes and caused some
files with zero areas to contain garbage instead. As described above,
though, it can happen elsewhere as well.
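To make the offset arithmetic concrete, here is a worked example (mine,
with illustrative numbers, assuming a 64 KiB cluster size and a
contiguous two-cluster allocation at host offset H):

    cluster_size      = 65536
    guest write       = [300, 130000)   (partial first and last sector)
    end COW area      = [130000, 131072)
    offset_in_cluster = 130000          relative to the allocation start,
                                        i.e. larger than cluster_size

    buggy base passed to qcow2_co_encrypt:    cluster_offset = H
    correct base:  start_of_cluster(s, H + 130000) = H + 65536

So the end COW area was encrypted as if it lived in the first cluster,
one full cluster away from where the data actually is.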
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20190915203655.21638-2-mlevitsk@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
         assert(QEMU_IS_ALIGNED(offset_in_cluster, BDRV_SECTOR_SIZE));
         assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
         assert(s->crypto);
-        if (qcow2_co_encrypt(bs, cluster_offset,
-                             src_cluster_offset + offset_in_cluster,
-                             buffer, bytes) < 0) {
+        if (qcow2_co_encrypt(bs,
+                start_of_cluster(s, cluster_offset + offset_in_cluster),
+                src_cluster_offset + offset_in_cluster,
+                buffer, bytes) < 0) {
             return false;
         }
     }
--
2.21.0

Do not leave stale linked list pointers around after removal. It's
safer to set them to NULL so that use-after-removal results in an
immediate segfault.

The RCU queue removal macros are unchanged since nodes may still be
traversed after removal.
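As a quick illustration (my own sketch, not part of the patch), the new
behaviour turns a silent use-after-removal into an immediate, debuggable
crash:

    typedef struct Item {
        int value;
        QLIST_ENTRY(Item) entries;
    } Item;

    void remove_item(Item *item)
    {
        QLIST_REMOVE(item, entries);
        /*
         * Previously item->entries.le_next still pointed into the list,
         * so buggy code that kept iterating from "item" silently walked
         * live nodes.  With this patch the pointers are NULL, so:
         */
        assert(QLIST_NEXT(item, entries) == NULL);
    }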
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200224103406.1894923-2-stefanha@redhat.com
Message-Id: <20200224103406.1894923-2-stefanha@redhat.com>
---
 include/qemu/queue.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/include/qemu/queue.h b/include/qemu/queue.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/queue.h
+++ b/include/qemu/queue.h
@@ -XXX,XX +XXX,XX @@ struct { \
         (elm)->field.le_next->field.le_prev = \
             (elm)->field.le_prev; \
     *(elm)->field.le_prev = (elm)->field.le_next; \
+    (elm)->field.le_next = NULL; \
+    (elm)->field.le_prev = NULL; \
 } while (/*CONSTCOND*/0)

 /*
@@ -XXX,XX +XXX,XX @@ struct { \
 } while (/*CONSTCOND*/0)

 #define QSLIST_REMOVE_HEAD(head, field) do { \
-    (head)->slh_first = (head)->slh_first->field.sle_next; \
+    typeof((head)->slh_first) elm = (head)->slh_first; \
+    (head)->slh_first = elm->field.sle_next; \
+    elm->field.sle_next = NULL; \
 } while (/*CONSTCOND*/0)

 #define QSLIST_REMOVE_AFTER(slistelm, field) do { \
-    (slistelm)->field.sle_next = \
-        QSLIST_NEXT(QSLIST_NEXT((slistelm), field), field); \
+    typeof(slistelm) next = (slistelm)->field.sle_next; \
+    (slistelm)->field.sle_next = next->field.sle_next; \
+    next->field.sle_next = NULL; \
 } while (/*CONSTCOND*/0)

 #define QSLIST_REMOVE(head, elm, type, field) do { \
@@ -XXX,XX +XXX,XX @@ struct { \
         while (curelm->field.sle_next != (elm)) \
             curelm = curelm->field.sle_next; \
         curelm->field.sle_next = curelm->field.sle_next->field.sle_next; \
+        (elm)->field.sle_next = NULL; \
     } \
 } while (/*CONSTCOND*/0)

@@ -XXX,XX +XXX,XX @@ struct { \
 } while (/*CONSTCOND*/0)

 #define QSIMPLEQ_REMOVE_HEAD(head, field) do { \
-    if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL)\
+    typeof((head)->sqh_first) elm = (head)->sqh_first; \
+    if (((head)->sqh_first = elm->field.sqe_next) == NULL) \
         (head)->sqh_last = &(head)->sqh_first; \
+    elm->field.sqe_next = NULL; \
 } while (/*CONSTCOND*/0)

 #define QSIMPLEQ_SPLIT_AFTER(head, elm, field, removed) do { \
@@ -XXX,XX +XXX,XX @@ struct { \
         if ((curelm->field.sqe_next = \
             curelm->field.sqe_next->field.sqe_next) == NULL) \
             (head)->sqh_last = &(curelm)->field.sqe_next; \
+        (elm)->field.sqe_next = NULL; \
     } \
 } while (/*CONSTCOND*/0)

@@ -XXX,XX +XXX,XX @@ union { \
     (head)->tqh_circ.tql_prev = (elm)->field.tqe_circ.tql_prev; \
     (elm)->field.tqe_circ.tql_prev->tql_next = (elm)->field.tqe_next; \
     (elm)->field.tqe_circ.tql_prev = NULL; \
+    (elm)->field.tqe_circ.tql_next = NULL; \
+    (elm)->field.tqe_next = NULL; \
 } while (/*CONSTCOND*/0)

 /* remove @left, @right and all elements in between from @head */
--
2.24.1
From: Thomas Huth <thuth@redhat.com>

The check script is already printing out which iotest is currently
running, so printing out the name of the check-block.sh shell script
looks superfluous here.

Signed-off-by: Thomas Huth <thuth@redhat.com>
Message-id: 20190906113534.10907-1-thuth@redhat.com
Acked-by: John Snow <jsnow@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/Makefile.include | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/Makefile.include b/tests/Makefile.include
index XXXXXXX..XXXXXXX 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -XXX,XX +XXX,XX @@ QEMU_IOTESTS_HELPERS-$(call land,$(CONFIG_SOFTMMU),$(CONFIG_LINUX)) = tests/qemu
 check-tests/check-block.sh: tests/check-block.sh qemu-img$(EXESUF) \
         qemu-io$(EXESUF) qemu-nbd$(EXESUF) $(QEMU_IOTESTS_HELPERS-y) \
         $(patsubst %,%/all,$(filter %-softmmu,$(TARGET_DIRS)))
-    $<
+    @$<

 .PHONY: $(patsubst %, check-%, $(check-qapi-schema-y))
 $(patsubst %, check-%, $(check-qapi-schema-y)): check-%.json: $(SRC_PATH)/%.json
--
2.21.0

QLIST_SAFE_REMOVE() is confusing here because the node must be on the
list. We actually just wanted to clear the linked list pointers when
removing it from the list. QLIST_REMOVE() now does this, so switch to
it.
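For context, a paraphrased sketch of the two macros' contracts (the
exact definitions live in include/qemu/queue.h):

    /*
     * QLIST_SAFE_REMOVE(elm, field): tolerates an element that was
     * never inserted (it checks le_prev before unlinking).
     *
     * QLIST_REMOVE(elm, field): the element must be on a list; since
     * the previous patch it also clears le_next/le_prev afterwards.
     *
     * Every node on ready_list is inserted by construction, so the
     * "safe" variant only obscured that invariant here.
     */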
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200224103406.1894923-3-stefanha@redhat.com
Message-Id: <20200224103406.1894923-3-stefanha@redhat.com>
---
 util/aio-posix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
     AioHandler *node;

     while ((node = QLIST_FIRST(ready_list))) {
-        QLIST_SAFE_REMOVE(node, node_ready);
+        QLIST_REMOVE(node, node_ready);
         progress = aio_dispatch_handler(ctx, node) || progress;
     }
--
2.24.1
Deleted patch
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>

qemu-io now prefixes its errors and warnings with "qemu-io:".
Commit 36b9986b08787019e fixed a lot of iotests output but forgot about
026.out.nocache. Fix it too.

Fixes: 99e98d7c9fc1a1639fad ("qemu-io: Use error_[gs]et_progname()")
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20190816153015.447957-2-vsementsov@virtuozzo.com
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/026.out.nocache | 168 ++++++++++++++---------------
 1 file changed, 84 insertions(+), 84 deletions(-)

diff --git a/tests/qemu-iotests/026.out.nocache b/tests/qemu-iotests/026.out.nocache
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/026.out.nocache
+++ b/tests/qemu-iotests/026.out.nocache
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l1_update; errno: 5; imm: off; once: off; write
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error

 1 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l1_update; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error

 1 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l1_update; errno: 28; imm: off; once: off; write
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device

 1 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l1_update; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device

 1 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l2_update; errno: 5; imm: off; once: off; write
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 wrote 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l2_update; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 wrote 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l2_update; errno: 28; imm: off; once: off; write
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 wrote 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l2_update; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 wrote 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l2_alloc_write; errno: 5; imm: off; once: off; write
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l2_alloc_write; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error

 1 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l2_alloc_write; errno: 28; imm: off; once: off; write
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l2_alloc_write; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device

 1 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: write_aio; errno: 5; imm: off; once: off; write
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: write_aio; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: write_aio; errno: 28; imm: off; once: off; write
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: write_aio; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_load; errno: 5; imm: off; once: off; write
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_load; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_load; errno: 28; imm: off; once: off; write
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_load; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_update_part; errno: 5; imm: off; once: off; write
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_update_part; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_update_part; errno: 28; imm: off; once: off; write
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_update_part; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc; errno: 5; imm: off; once: off; write
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc; errno: 5; imm: off; once: off; write -b
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc; errno: 28; imm: off; once: off; write
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc_hookup; errno: 28; imm: off; once: off; write
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device

 55 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc_hookup; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device

 251 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc_write; errno: 28; imm: off; once: off; write
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc_write; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc_write_blocks; errno: 28; imm: off; once: off; write
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device

 10 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc_write_blocks; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device

 23 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc_write_table; errno: 28; imm: off; once: off; write
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device

 10 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc_write_table; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device

 23 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc_switch_table; errno: 28; imm: off; once: off; write
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device

 10 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: refblock_alloc_switch_table; errno: 28; imm: off; once: off; write -b
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device

 23 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l1_grow_write_table; errno: 5; imm: off; once: off
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l1_grow_write_table; errno: 28; imm: off; once: off
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l1_grow_activate_table; errno: 5; imm: off; once: off
-Failed to flush the L2 table cache: Input/output error
-Failed to flush the refcount block cache: Input/output error
+qemu-io: Failed to flush the L2 table cache: Input/output error
+qemu-io: Failed to flush the refcount block cache: Input/output error
 write failed: Input/output error

 96 leaked clusters were found on the image.
@@ -XXX,XX +XXX,XX @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

 Event: l1_grow_activate_table; errno: 28; imm: off; once: off
-Failed to flush the L2 table cache: No space left on device
-Failed to flush the refcount block cache: No space left on device
+qemu-io: Failed to flush the L2 table cache: No space left on device
+qemu-io: Failed to flush the refcount block cache: No space left on device
 write failed: No space left on device

 96 leaked clusters were found on the image.
--
2.21.0
Deleted patch
A follow-up patch will make curl_multi_do() and curl_multi_read() take a
CURLSocket instead of the CURLState. They still need the latter, though,
so add a pointer to it to the former.

Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 20190910124136.10565-2-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/curl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
 #define CURL_BLOCK_OPT_TIMEOUT_DEFAULT 5

 struct BDRVCURLState;
+struct CURLState;

 static bool libcurl_initialized;

@@ -XXX,XX +XXX,XX @@ typedef struct CURLAIOCB {

 typedef struct CURLSocket {
     int fd;
+    struct CURLState *state;
     QLIST_ENTRY(CURLSocket) next;
 } CURLSocket;

@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
     if (!socket) {
         socket = g_new0(CURLSocket, 1);
         socket->fd = fd;
+        socket->state = state;
         QLIST_INSERT_HEAD(&state->sockets, socket, next);
     }
     socket = NULL;
--
2.21.0
This does not really change anything, but it makes the code a bit easier
to follow once we use @socket as the opaque pointer for
aio_set_fd_handler().

Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190910124136.10565-3-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/curl.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,

     QLIST_FOREACH(socket, &state->sockets, next) {
         if (socket->fd == fd) {
-            if (action == CURL_POLL_REMOVE) {
-                QLIST_REMOVE(socket, next);
-                g_free(socket);
-            }
             break;
         }
     }
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
         socket->state = state;
         QLIST_INSERT_HEAD(&state->sockets, socket, next);
     }
-    socket = NULL;

     trace_curl_sock_cb(action, (int)fd);
     switch (action) {
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
         break;
     }

+    if (action == CURL_POLL_REMOVE) {
+        QLIST_REMOVE(socket, next);
+        g_free(socket);
+    }
+
     return 0;
 }
--
2.21.0

One iteration of polling is always performed even when polling is
disabled. This is done because:
1. Userspace polling is cheaper than making a syscall. We might get
   lucky.
2. We must poll once more after polling has stopped in case an event
   occurred while stopping polling.

However, there are downsides:
1. Polling becomes a bottleneck when the number of event sources is very
   high. It's more efficient to monitor fds in that case.
2. A high-frequency polling event source can starve non-polling event
   sources because ppoll(2)/epoll(7) is never invoked.

This patch removes the forced polling iteration so that poll_ns=0 really
means no polling.

IOPS increases from 10k to 60k when the guest has 100
virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1
device because the large number of event sources being polled slows down
the event loop.
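For readers following along: the polling discussed here is the
per-IOThread adaptive busy-wait, configured through the iothread
object's poll-max-ns property (the command lines below are illustrative,
not from the patch):

    # adaptive polling, capped at the default 32768 ns
    qemu-system-x86_64 -object iothread,id=io1 ...

    # after this patch, poll-max-ns=0 really means "never busy-poll"
    qemu-system-x86_64 -object iothread,id=io1,poll-max-ns=0 ...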
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-2-stefanha@redhat.com
Message-Id: <20200305170806.1313245-2-stefanha@redhat.com>
---
 util/aio-posix.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ void aio_set_event_notifier_poll(AioContext *ctx,
                                  (IOHandler *)io_poll_end);
 }

-static void poll_set_started(AioContext *ctx, bool started)
+static bool poll_set_started(AioContext *ctx, bool started)
 {
     AioHandler *node;
+    bool progress = false;

     if (started == ctx->poll_started) {
-        return;
+        return false;
     }

     ctx->poll_started = started;
@@ -XXX,XX +XXX,XX @@ static void poll_set_started(AioContext *ctx, bool started)
         if (fn) {
             fn(node->opaque);
         }
+
+        /* Poll one last time in case ->io_poll_end() raced with the event */
+        if (!started) {
+            progress = node->io_poll(node->opaque) || progress;
+        }
     }
     qemu_lockcnt_dec(&ctx->list_lock);
+
+    return progress;
 }


@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
         }
     }

-    poll_set_started(ctx, false);
+    if (poll_set_started(ctx, false)) {
+        *timeout = 0;
+        return true;
+    }

-    /* Even if we don't run busy polling, try polling once in case it can make
-     * progress and the caller will be able to avoid ppoll(2)/epoll_wait(2).
-     */
-    return run_poll_handlers_once(ctx, timeout);
+    return false;
 }

 bool aio_poll(AioContext *ctx, bool blocking)
--
2.24.1
Deleted patch
While it is more likely that transfers complete after some file
descriptor has data ready to read, we probably should not rely on it.
Better safe than sorry: call curl_multi_check_completion() in
curl_multi_do(), too, just like it is done in curl_multi_read().

With this change, curl_multi_do() and curl_multi_read() are actually the
same, so drop curl_multi_read() and use curl_multi_do() as the sole FD
handler.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190910124136.10565-4-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/curl.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ typedef struct BDRVCURLState {

 static void curl_clean_state(CURLState *s);
 static void curl_multi_do(void *arg);
-static void curl_multi_read(void *arg);

 #ifdef NEED_CURL_TIMER_CALLBACK
 /* Called from curl_multi_do_locked, with s->mutex held. */
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
     switch (action) {
     case CURL_POLL_IN:
         aio_set_fd_handler(s->aio_context, fd, false,
-                           curl_multi_read, NULL, NULL, state);
+                           curl_multi_do, NULL, NULL, state);
         break;
     case CURL_POLL_OUT:
         aio_set_fd_handler(s->aio_context, fd, false,
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
         break;
     case CURL_POLL_INOUT:
         aio_set_fd_handler(s->aio_context, fd, false,
-                           curl_multi_read, curl_multi_do, NULL, state);
+                           curl_multi_do, curl_multi_do, NULL, state);
         break;
     case CURL_POLL_REMOVE:
         aio_set_fd_handler(s->aio_context, fd, false,
@@ -XXX,XX +XXX,XX @@ static void curl_multi_do(void *arg)
 {
     CURLState *s = (CURLState *)arg;

-    qemu_mutex_lock(&s->s->mutex);
-    curl_multi_do_locked(s);
-    qemu_mutex_unlock(&s->s->mutex);
-}
-
-static void curl_multi_read(void *arg)
-{
-    CURLState *s = (CURLState *)arg;
-
     qemu_mutex_lock(&s->s->mutex);
     curl_multi_do_locked(s);
     curl_multi_check_completion(s->s);
--
2.21.0
If we had done that all along, debugging would have been much simpler.
(Also, I/O errors are better than hangs.)

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190910124136.10565-8-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/curl.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_setup_preadv(BlockDriverState *bs, CURLAIOCB *acb)
     trace_curl_setup_preadv(acb->bytes, start, state->range);
     curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range);

-    curl_multi_add_handle(s->multi, state->curl);
+    if (curl_multi_add_handle(s->multi, state->curl) != CURLM_OK) {
+        state->acb[0] = NULL;
+        acb->ret = -EIO;
+
+        curl_clean_state(state);
+        goto out;
+    }

     /* Tell curl it needs to kick things off */
     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
--
2.21.0

Now that run_poll_handlers_once() is only called by run_poll_handlers(),
we can improve the CPU time profile by moving the expensive
RCU_READ_LOCK() out of the polling loop.

This reduces run_poll_handlers() from 40% CPU to 10% CPU in perf's
sampling profiler output.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://lore.kernel.org/r/20200305170806.1313245-3-stefanha@redhat.com
Message-Id: <20200305170806.1313245-3-stefanha@redhat.com>
---
 util/aio-posix.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
     bool progress = false;
     AioHandler *node;

-    /*
-     * Optimization: ->io_poll() handlers often contain RCU read critical
-     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
-     * -> rcu_read_lock() -> ... sequences with expensive memory
-     * synchronization primitives.  Make the entire polling loop an RCU
-     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
-     * are cheap.
-     */
-    RCU_READ_LOCK_GUARD();
-
     QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
         if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
             aio_node_check(ctx, node->is_external) &&
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)

     trace_run_poll_handlers_begin(ctx, max_ns, *timeout);

+    /*
+     * Optimization: ->io_poll() handlers often contain RCU read critical
+     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
+     * -> rcu_read_lock() -> ... sequences with expensive memory
+     * synchronization primitives.  Make the entire polling loop an RCU
+     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
+     * are cheap.
+     */
+    RCU_READ_LOCK_GUARD();
+
     start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     do {
         progress = run_poll_handlers_once(ctx, timeout);
--
2.24.1
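The RCU_READ_LOCK_GUARD() hoisting in the aio-posix patch above follows a general pattern: when per-iteration lock acquire/release dominates a hot loop's profile, take the lock once around the whole loop. A minimal sketch, using a pthread rwlock purely as a stand-in for QEMU's RCU guard (RCU read-side critical sections nest cheaply, which is what makes guarding the entire polling loop safe):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static int ready[64];

/* Before: every iteration pays an acquire/release pair. */
static int poll_once_per_item_locking(void)
{
    int progress = 0;

    for (int i = 0; i < 64; i++) {
        pthread_rwlock_rdlock(&lock);
        progress |= ready[i];
        pthread_rwlock_unlock(&lock);
    }
    return progress;
}

/* After: one acquisition around the whole polling loop, analogous to
 * moving RCU_READ_LOCK_GUARD() into run_poll_handlers(). */
static int poll_once_hoisted(void)
{
    int progress = 0;

    pthread_rwlock_rdlock(&lock);
    for (int i = 0; i < 64; i++) {
        progress |= ready[i];
    }
    pthread_rwlock_unlock(&lock);
    return progress;
}

int main(void)
{
    printf("%d %d\n", poll_once_per_item_locking(), poll_once_hoisted());
    return 0;
}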
1
From: Maxim Levitsky <mlevitsk@redhat.com>
1
The ppoll(2) and epoll(7) file descriptor monitoring implementations are
2
mixed with the core util/aio-posix.c code. Before adding another
3
implementation for Linux io_uring, extract out the existing
4
ones so there is a clear interface and the core code is simpler.
2
5
3
* Change qcow2_co_{encrypt|decrypt} to receive full host and
6
The new interface is AioContext->fdmon_ops, a pointer to an FDMonOps
4
guest offsets, and call them directly instead of going through
7
struct. See the patch for details.
5
do_perform_cow_encrypt() (which this patch removes).
6
8
7
* Adjust qcow2_co_encdec to take full host and guest offsets as well.
9
Semantic changes:
10
1. ppoll(2) now reflects events from pollfds[] back into AioHandlers
11
while we're still on the clock for adaptive polling. This was
12
already happening for epoll(7), so if it's really an issue then we'll
13
need to fix both in the future.
14
2. epoll(7)'s fallback to ppoll(2) while external events are disabled
15
was broken when the number of fds exceeded the epoll(7) upgrade
16
threshold. I guess this code path simply wasn't tested and no one
17
noticed the bug. I didn't go out of my way to fix it but the correct
18
code is simpler than preserving the bug.
8
19
9
* Document the qcow2_co_{encrypt|decrypt} arguments
20
I also took some liberties in removing the unnecessary
10
to hopefully prevent the bug fixed in the previous commit from
21
AioContext->epoll_available (just check AioContext->epollfd != -1
11
happening again.
22
instead) and AioContext->epoll_enabled (it's implicit if our
23
AioContext->fdmon_ops callbacks are being invoked) fields.
12
24
13
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
25
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
14
Message-id: 20190915203655.21638-3-mlevitsk@redhat.com
26
Link: https://lore.kernel.org/r/20200305170806.1313245-4-stefanha@redhat.com
15
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
27
Message-Id: <20200305170806.1313245-4-stefanha@redhat.com>
16
[mreitz: Let perform_cow() return the error value returned by
17
qcow2_co_encrypt(), as proposed by Vladimir]
18
Signed-off-by: Max Reitz <mreitz@redhat.com>
19
---
28
---
20
block/qcow2.h | 8 +++---
29
MAINTAINERS | 2 +
21
block/qcow2-cluster.c | 41 +++++++++-------------------
30
include/block/aio.h | 36 +++++-
22
block/qcow2-threads.c | 63 +++++++++++++++++++++++++++++++++----------
31
util/Makefile.objs | 2 +
23
block/qcow2.c | 5 ++--
32
util/aio-posix.c | 286 ++------------------------------------------
24
4 files changed, 69 insertions(+), 48 deletions(-)
33
util/aio-posix.h | 61 ++++++++++
34
util/fdmon-epoll.c | 151 +++++++++++++++++++++++
35
util/fdmon-poll.c | 104 ++++++++++++++++
36
7 files changed, 366 insertions(+), 276 deletions(-)
37
create mode 100644 util/aio-posix.h
38
create mode 100644 util/fdmon-epoll.c
39
create mode 100644 util/fdmon-poll.c
25
40
26
diff --git a/block/qcow2.h b/block/qcow2.h
41
diff --git a/MAINTAINERS b/MAINTAINERS
27
index XXXXXXX..XXXXXXX 100644
42
index XXXXXXX..XXXXXXX 100644
28
--- a/block/qcow2.h
43
--- a/MAINTAINERS
29
+++ b/block/qcow2.h
44
+++ b/MAINTAINERS
30
@@ -XXX,XX +XXX,XX @@ ssize_t coroutine_fn
45
@@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org
31
qcow2_co_decompress(BlockDriverState *bs, void *dest, size_t dest_size,
46
S: Supported
32
const void *src, size_t src_size);
47
F: util/async.c
33
int coroutine_fn
48
F: util/aio-*.c
34
-qcow2_co_encrypt(BlockDriverState *bs, uint64_t file_cluster_offset,
49
+F: util/aio-*.h
35
- uint64_t offset, void *buf, size_t len);
50
+F: util/fdmon-*.c
36
+qcow2_co_encrypt(BlockDriverState *bs, uint64_t host_offset,
51
F: block/io.c
37
+ uint64_t guest_offset, void *buf, size_t len);
52
F: migration/block*
38
int coroutine_fn
53
F: include/block/aio.h
39
-qcow2_co_decrypt(BlockDriverState *bs, uint64_t file_cluster_offset,
54
diff --git a/include/block/aio.h b/include/block/aio.h
40
- uint64_t offset, void *buf, size_t len);
41
+qcow2_co_decrypt(BlockDriverState *bs, uint64_t host_offset,
42
+ uint64_t guest_offset, void *buf, size_t len);
43
44
#endif
45
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
46
index XXXXXXX..XXXXXXX 100644
55
index XXXXXXX..XXXXXXX 100644
47
--- a/block/qcow2-cluster.c
56
--- a/include/block/aio.h
48
+++ b/block/qcow2-cluster.c
57
+++ b/include/block/aio.h
49
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
58
@@ -XXX,XX +XXX,XX @@ struct ThreadPool;
50
return 0;
59
struct LinuxAioState;
60
struct LuringState;
61
62
+/* Callbacks for file descriptor monitoring implementations */
63
+typedef struct {
64
+ /*
65
+ * update:
66
+ * @ctx: the AioContext
67
+ * @node: the handler
68
+ * @is_new: is the file descriptor already being monitored?
69
+ *
70
+ * Add/remove/modify a monitored file descriptor. There are three cases:
71
+ * 1. node->pfd.events == 0 means remove the file descriptor.
72
+ * 2. !is_new means modify an already monitored file descriptor.
73
+ * 3. is_new means add a new file descriptor.
74
+ *
75
+ * Called with ctx->list_lock acquired.
76
+ */
77
+ void (*update)(AioContext *ctx, AioHandler *node, bool is_new);
78
+
79
+ /*
80
+ * wait:
81
+ * @ctx: the AioContext
82
+ * @ready_list: list for handlers that become ready
83
+ * @timeout: maximum duration to wait, in nanoseconds
84
+ *
85
+ * Wait for file descriptors to become ready and place them on ready_list.
86
+ *
87
+ * Called with ctx->list_lock incremented but not locked.
88
+ *
89
+ * Returns: number of ready file descriptors.
90
+ */
91
+ int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
92
+} FDMonOps;
93
+
94
/*
95
* Each aio_bh_poll() call carves off a slice of the BH list, so that newly
96
* scheduled BHs are not processed until the next aio_bh_poll() call. All
97
@@ -XXX,XX +XXX,XX @@ struct AioContext {
98
99
/* epoll(7) state used when built with CONFIG_EPOLL */
100
int epollfd;
101
- bool epoll_enabled;
102
- bool epoll_available;
103
+
104
+ const FDMonOps *fdmon_ops;
105
};
106
107
/**
108
diff --git a/util/Makefile.objs b/util/Makefile.objs
109
index XXXXXXX..XXXXXXX 100644
110
--- a/util/Makefile.objs
111
+++ b/util/Makefile.objs
112
@@ -XXX,XX +XXX,XX @@ util-obj-y += aiocb.o async.o aio-wait.o thread-pool.o qemu-timer.o
113
util-obj-y += main-loop.o
114
util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
115
util-obj-$(CONFIG_POSIX) += aio-posix.o
116
+util-obj-$(CONFIG_POSIX) += fdmon-poll.o
117
+util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o
118
util-obj-$(CONFIG_POSIX) += compatfd.o
119
util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
120
util-obj-$(CONFIG_POSIX) += mmap-alloc.o
121
diff --git a/util/aio-posix.c b/util/aio-posix.c
122
index XXXXXXX..XXXXXXX 100644
123
--- a/util/aio-posix.c
124
+++ b/util/aio-posix.c
125
@@ -XXX,XX +XXX,XX @@
126
#include "qemu/sockets.h"
127
#include "qemu/cutils.h"
128
#include "trace.h"
129
-#ifdef CONFIG_EPOLL_CREATE1
130
-#include <sys/epoll.h>
131
-#endif
132
+#include "aio-posix.h"
133
134
-struct AioHandler
135
-{
136
- GPollFD pfd;
137
- IOHandler *io_read;
138
- IOHandler *io_write;
139
- AioPollFn *io_poll;
140
- IOHandler *io_poll_begin;
141
- IOHandler *io_poll_end;
142
- void *opaque;
143
- bool is_external;
144
- QLIST_ENTRY(AioHandler) node;
145
- QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
146
- QLIST_ENTRY(AioHandler) node_deleted;
147
-};
148
-
149
-/* Add a handler to a ready list */
150
-static void add_ready_handler(AioHandlerList *ready_list,
151
- AioHandler *node,
152
- int revents)
153
+void aio_add_ready_handler(AioHandlerList *ready_list,
154
+ AioHandler *node,
155
+ int revents)
156
{
157
QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
158
node->pfd.revents = revents;
159
QLIST_INSERT_HEAD(ready_list, node, node_ready);
51
}
160
}
52
161
53
-static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
162
-#ifdef CONFIG_EPOLL_CREATE1
54
- uint64_t src_cluster_offset,
163
-
55
- uint64_t cluster_offset,
164
-/* The fd number threshold to switch to epoll */
56
- unsigned offset_in_cluster,
165
-#define EPOLL_ENABLE_THRESHOLD 64
57
- uint8_t *buffer,
166
-
58
- unsigned bytes)
167
-static void aio_epoll_disable(AioContext *ctx)
59
-{
168
-{
60
- if (bytes && bs->encrypted) {
169
- ctx->epoll_enabled = false;
61
- BDRVQcow2State *s = bs->opaque;
170
- if (!ctx->epoll_available) {
62
- assert(QEMU_IS_ALIGNED(offset_in_cluster, BDRV_SECTOR_SIZE));
171
- return;
63
- assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
172
- }
64
- assert(s->crypto);
173
- ctx->epoll_available = false;
65
- if (qcow2_co_encrypt(bs,
174
- close(ctx->epollfd);
66
- start_of_cluster(s, cluster_offset + offset_in_cluster),
175
-}
67
- src_cluster_offset + offset_in_cluster,
176
-
68
- buffer, bytes) < 0) {
177
-static inline int epoll_events_from_pfd(int pfd_events)
178
-{
179
- return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
180
- (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
181
- (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
182
- (pfd_events & G_IO_ERR ? EPOLLERR : 0);
183
-}
184
-
185
-static bool aio_epoll_try_enable(AioContext *ctx)
186
-{
187
- AioHandler *node;
188
- struct epoll_event event;
189
-
190
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
191
- int r;
192
- if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
193
- continue;
194
- }
195
- event.events = epoll_events_from_pfd(node->pfd.events);
196
- event.data.ptr = node;
197
- r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
198
- if (r) {
69
- return false;
199
- return false;
70
- }
200
- }
71
- }
201
- }
202
- ctx->epoll_enabled = true;
72
- return true;
203
- return true;
73
-}
204
-}
74
-
205
-
75
static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
206
-static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
76
uint64_t cluster_offset,
207
-{
77
unsigned offset_in_cluster,
208
- struct epoll_event event;
78
@@ -XXX,XX +XXX,XX @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
209
- int r;
79
210
- int ctl;
80
/* Encrypt the data if necessary before writing it */
211
-
81
if (bs->encrypted) {
212
- if (!ctx->epoll_enabled) {
82
- if (!do_perform_cow_encrypt(bs, m->offset, m->alloc_offset,
213
- return;
83
- start->offset, start_buffer,
214
- }
84
- start->nb_bytes) ||
215
- if (!node->pfd.events) {
85
- !do_perform_cow_encrypt(bs, m->offset, m->alloc_offset,
216
- ctl = EPOLL_CTL_DEL;
86
- end->offset, end_buffer, end->nb_bytes)) {
217
- } else {
87
- ret = -EIO;
218
- event.data.ptr = node;
88
+ ret = qcow2_co_encrypt(bs,
219
- event.events = epoll_events_from_pfd(node->pfd.events);
89
+ m->alloc_offset + start->offset,
220
- ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
90
+ m->offset + start->offset,
221
- }
91
+ start_buffer, start->nb_bytes);
222
-
92
+ if (ret < 0) {
223
- r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
93
+ goto fail;
224
- if (r) {
94
+ }
225
- aio_epoll_disable(ctx);
95
+
226
- }
96
+ ret = qcow2_co_encrypt(bs,
227
-}
97
+ m->alloc_offset + end->offset,
228
-
98
+ m->offset + end->offset,
229
-static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
99
+ end_buffer, end->nb_bytes);
230
- int64_t timeout)
100
+ if (ret < 0) {
231
-{
101
goto fail;
232
- GPollFD pfd = {
233
- .fd = ctx->epollfd,
234
- .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
235
- };
236
- AioHandler *node;
237
- int i, ret = 0;
238
- struct epoll_event events[128];
239
-
240
- if (timeout > 0) {
241
- ret = qemu_poll_ns(&pfd, 1, timeout);
242
- if (ret > 0) {
243
- timeout = 0;
244
- }
245
- }
246
- if (timeout <= 0 || ret > 0) {
247
- ret = epoll_wait(ctx->epollfd, events,
248
- ARRAY_SIZE(events),
249
- timeout);
250
- if (ret <= 0) {
251
- goto out;
252
- }
253
- for (i = 0; i < ret; i++) {
254
- int ev = events[i].events;
255
- int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
256
- (ev & EPOLLOUT ? G_IO_OUT : 0) |
257
- (ev & EPOLLHUP ? G_IO_HUP : 0) |
258
- (ev & EPOLLERR ? G_IO_ERR : 0);
259
-
260
- node = events[i].data.ptr;
261
- add_ready_handler(ready_list, node, revents);
262
- }
263
- }
264
-out:
265
- return ret;
266
-}
267
-
268
-static bool aio_epoll_enabled(AioContext *ctx)
269
-{
270
- /* Fall back to ppoll when external clients are disabled. */
271
- return !aio_external_disabled(ctx) && ctx->epoll_enabled;
272
-}
273
-
274
-static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
275
- unsigned npfd, int64_t timeout)
276
-{
277
- if (!ctx->epoll_available) {
278
- return false;
279
- }
280
- if (aio_epoll_enabled(ctx)) {
281
- return true;
282
- }
283
- if (npfd >= EPOLL_ENABLE_THRESHOLD) {
284
- if (aio_epoll_try_enable(ctx)) {
285
- return true;
286
- } else {
287
- aio_epoll_disable(ctx);
288
- }
289
- }
290
- return false;
291
-}
292
-
293
-#else
294
-
295
-static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
296
-{
297
-}
298
-
299
-static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
300
- int64_t timeout)
301
-{
302
- assert(false);
303
-}
304
-
305
-static bool aio_epoll_enabled(AioContext *ctx)
306
-{
307
- return false;
308
-}
309
-
310
-static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
311
- unsigned npfd, int64_t timeout)
312
-{
313
- return false;
314
-}
315
-
316
-#endif
317
-
318
static AioHandler *find_aio_handler(AioContext *ctx, int fd)
319
{
320
AioHandler *node;
321
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
322
atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
323
324
if (new_node) {
325
- aio_epoll_update(ctx, new_node, is_new);
326
+ ctx->fdmon_ops->update(ctx, new_node, is_new);
327
} else if (node) {
328
/* Unregister deleted fd_handler */
329
- aio_epoll_update(ctx, node, false);
330
+ ctx->fdmon_ops->update(ctx, node, false);
331
}
332
qemu_lockcnt_unlock(&ctx->list_lock);
333
aio_notify(ctx);
334
@@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx)
335
timerlistgroup_run_timers(&ctx->tlg);
336
}
337
338
-/* These thread-local variables are used only in a small part of aio_poll
339
- * around the call to the poll() system call. In particular they are not
340
- * used while aio_poll is performing callbacks, which makes it much easier
341
- * to think about reentrancy!
342
- *
343
- * Stack-allocated arrays would be perfect but they have size limitations;
344
- * heap allocation is expensive enough that we want to reuse arrays across
345
- * calls to aio_poll(). And because poll() has to be called without holding
346
- * any lock, the arrays cannot be stored in AioContext. Thread-local data
347
- * has none of the disadvantages of these three options.
348
- */
349
-static __thread GPollFD *pollfds;
350
-static __thread AioHandler **nodes;
351
-static __thread unsigned npfd, nalloc;
352
-static __thread Notifier pollfds_cleanup_notifier;
353
-
354
-static void pollfds_cleanup(Notifier *n, void *unused)
355
-{
356
- g_assert(npfd == 0);
357
- g_free(pollfds);
358
- g_free(nodes);
359
- nalloc = 0;
360
-}
361
-
362
-static void add_pollfd(AioHandler *node)
363
-{
364
- if (npfd == nalloc) {
365
- if (nalloc == 0) {
366
- pollfds_cleanup_notifier.notify = pollfds_cleanup;
367
- qemu_thread_atexit_add(&pollfds_cleanup_notifier);
368
- nalloc = 8;
369
- } else {
370
- g_assert(nalloc <= INT_MAX);
371
- nalloc *= 2;
372
- }
373
- pollfds = g_renew(GPollFD, pollfds, nalloc);
374
- nodes = g_renew(AioHandler *, nodes, nalloc);
375
- }
376
- nodes[npfd] = node;
377
- pollfds[npfd] = (GPollFD) {
378
- .fd = node->pfd.fd,
379
- .events = node->pfd.events,
380
- };
381
- npfd++;
382
-}
383
-
384
static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
385
{
386
bool progress = false;
387
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
388
bool aio_poll(AioContext *ctx, bool blocking)
389
{
390
AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
391
- AioHandler *node;
392
- int i;
393
int ret = 0;
394
bool progress;
395
int64_t timeout;
396
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
397
* system call---a single round of run_poll_handlers_once suffices.
398
*/
399
if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
400
- assert(npfd == 0);
401
-
402
- /* fill pollfds */
403
-
404
- if (!aio_epoll_enabled(ctx)) {
405
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
406
- if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
407
- && aio_node_check(ctx, node->is_external)) {
408
- add_pollfd(node);
409
- }
410
- }
411
- }
412
-
413
- /* wait until next event */
414
- if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
415
- npfd = 0; /* pollfds[] is not being used */
416
- ret = aio_epoll(ctx, &ready_list, timeout);
417
- } else {
418
- ret = qemu_poll_ns(pollfds, npfd, timeout);
419
- }
420
+ ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
421
}
422
423
if (blocking) {
424
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
102
}
425
}
103
}
426
}
104
diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c
427
105
index XXXXXXX..XXXXXXX 100644
428
- /* if we have any readable fds, dispatch event */
106
--- a/block/qcow2-threads.c
429
- if (ret > 0) {
107
+++ b/block/qcow2-threads.c
430
- for (i = 0; i < npfd; i++) {
108
@@ -XXX,XX +XXX,XX @@ static int qcow2_encdec_pool_func(void *opaque)
431
- int revents = pollfds[i].revents;
432
-
433
- if (revents) {
434
- add_ready_handler(&ready_list, nodes[i], revents);
435
- }
436
- }
437
- }
438
-
439
- npfd = 0;
440
-
441
progress |= aio_bh_poll(ctx);
442
443
if (ret > 0) {
444
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
445
446
void aio_context_setup(AioContext *ctx)
447
{
448
-#ifdef CONFIG_EPOLL_CREATE1
449
- assert(!ctx->epollfd);
450
- ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
451
- if (ctx->epollfd == -1) {
452
- fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
453
- ctx->epoll_available = false;
454
- } else {
455
- ctx->epoll_available = true;
456
- }
457
-#endif
458
+ ctx->fdmon_ops = &fdmon_poll_ops;
459
+ ctx->epollfd = -1;
460
+
461
+ fdmon_epoll_setup(ctx);
109
}
462
}
110
463
111
static int coroutine_fn
464
void aio_context_destroy(AioContext *ctx)
112
-qcow2_co_encdec(BlockDriverState *bs, uint64_t file_cluster_offset,
113
- uint64_t offset, void *buf, size_t len, Qcow2EncDecFunc func)
114
+qcow2_co_encdec(BlockDriverState *bs, uint64_t host_offset,
115
+ uint64_t guest_offset, void *buf, size_t len,
116
+ Qcow2EncDecFunc func)
117
{
465
{
118
BDRVQcow2State *s = bs->opaque;
466
-#ifdef CONFIG_EPOLL_CREATE1
119
Qcow2EncDecData arg = {
467
- aio_epoll_disable(ctx);
120
.block = s->crypto,
468
-#endif
121
- .offset = s->crypt_physical_offset ?
469
+ fdmon_epoll_disable(ctx);
122
- file_cluster_offset + offset_into_cluster(s, offset) :
123
- offset,
124
+ .offset = s->crypt_physical_offset ? host_offset : guest_offset,
125
.buf = buf,
126
.len = len,
127
.func = func,
128
};
129
130
- return qcow2_co_process(bs, qcow2_encdec_pool_func, &arg);
131
+ assert(QEMU_IS_ALIGNED(guest_offset, BDRV_SECTOR_SIZE));
132
+ assert(QEMU_IS_ALIGNED(host_offset, BDRV_SECTOR_SIZE));
133
+ assert(QEMU_IS_ALIGNED(len, BDRV_SECTOR_SIZE));
134
+ assert(s->crypto);
135
+
136
+ return len == 0 ? 0 : qcow2_co_process(bs, qcow2_encdec_pool_func, &arg);
137
}
470
}
138
471
472
void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
473
diff --git a/util/aio-posix.h b/util/aio-posix.h
474
new file mode 100644
475
index XXXXXXX..XXXXXXX
476
--- /dev/null
477
+++ b/util/aio-posix.h
478
@@ -XXX,XX +XXX,XX @@
139
+/*
479
+/*
140
+ * qcow2_co_encrypt()
480
+ * AioContext POSIX event loop implementation internal APIs
141
+ *
481
+ *
142
+ * Encrypts one or more contiguous aligned sectors
482
+ * Copyright IBM, Corp. 2008
483
+ * Copyright Red Hat, Inc. 2020
143
+ *
484
+ *
144
+ * @host_offset - underlying storage offset of the first sector of the
485
+ * Authors:
145
+ * data to be encrypted
486
+ * Anthony Liguori <aliguori@us.ibm.com>
146
+ *
487
+ *
147
+ * @guest_offset - guest (virtual) offset of the first sector of the
488
+ * This work is licensed under the terms of the GNU GPL, version 2. See
148
+ * data to be encrypted
489
+ * the COPYING file in the top-level directory.
149
+ *
490
+ *
150
+ * @buf - buffer with the data to encrypt, that after encryption
491
+ * Contributions after 2012-01-13 are licensed under the terms of the
151
+ * will be written to the underlying storage device at
492
+ * GNU GPL, version 2 or (at your option) any later version.
152
+ * @host_offset
493
+ */
494
+
495
+#ifndef AIO_POSIX_H
496
+#define AIO_POSIX_H
497
+
498
+#include "block/aio.h"
499
+
500
+struct AioHandler {
501
+ GPollFD pfd;
502
+ IOHandler *io_read;
503
+ IOHandler *io_write;
504
+ AioPollFn *io_poll;
505
+ IOHandler *io_poll_begin;
506
+ IOHandler *io_poll_end;
507
+ void *opaque;
508
+ bool is_external;
509
+ QLIST_ENTRY(AioHandler) node;
510
+ QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
511
+ QLIST_ENTRY(AioHandler) node_deleted;
512
+};
513
+
514
+/* Add a handler to a ready list */
515
+void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node,
516
+ int revents);
517
+
518
+extern const FDMonOps fdmon_poll_ops;
519
+
520
+#ifdef CONFIG_EPOLL_CREATE1
521
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd);
522
+void fdmon_epoll_setup(AioContext *ctx);
523
+void fdmon_epoll_disable(AioContext *ctx);
524
+#else
525
+static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
526
+{
527
+ return false;
528
+}
529
+
530
+static inline void fdmon_epoll_setup(AioContext *ctx)
531
+{
532
+}
533
+
534
+static inline void fdmon_epoll_disable(AioContext *ctx)
535
+{
536
+}
537
+#endif /* !CONFIG_EPOLL_CREATE1 */
538
+
539
+#endif /* AIO_POSIX_H */
540
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
541
new file mode 100644
542
index XXXXXXX..XXXXXXX
543
--- /dev/null
544
+++ b/util/fdmon-epoll.c
545
@@ -XXX,XX +XXX,XX @@
546
+/* SPDX-License-Identifier: GPL-2.0-or-later */
547
+/*
548
+ * epoll(7) file descriptor monitoring
549
+ */
550
+
551
+#include "qemu/osdep.h"
552
+#include <sys/epoll.h>
553
+#include "qemu/rcu_queue.h"
554
+#include "aio-posix.h"
555
+
556
+/* The fd number threshold to switch to epoll */
557
+#define EPOLL_ENABLE_THRESHOLD 64
558
+
559
+void fdmon_epoll_disable(AioContext *ctx)
560
+{
561
+ if (ctx->epollfd >= 0) {
562
+ close(ctx->epollfd);
563
+ ctx->epollfd = -1;
564
+ }
565
+
566
+ /* Switch back */
567
+ ctx->fdmon_ops = &fdmon_poll_ops;
568
+}
569
+
570
+static inline int epoll_events_from_pfd(int pfd_events)
571
+{
572
+ return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
573
+ (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
574
+ (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
575
+ (pfd_events & G_IO_ERR ? EPOLLERR : 0);
576
+}
577
+
578
+static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
579
+{
580
+ struct epoll_event event;
581
+ int r;
582
+ int ctl;
583
+
584
+ if (!node->pfd.events) {
585
+ ctl = EPOLL_CTL_DEL;
586
+ } else {
587
+ event.data.ptr = node;
588
+ event.events = epoll_events_from_pfd(node->pfd.events);
589
+ ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
590
+ }
591
+
592
+ r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
593
+ if (r) {
594
+ fdmon_epoll_disable(ctx);
595
+ }
596
+}
597
+
598
+static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list,
599
+ int64_t timeout)
600
+{
601
+ GPollFD pfd = {
602
+ .fd = ctx->epollfd,
603
+ .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
604
+ };
605
+ AioHandler *node;
606
+ int i, ret = 0;
607
+ struct epoll_event events[128];
608
+
609
+ /* Fall back while external clients are disabled */
610
+ if (atomic_read(&ctx->external_disable_cnt)) {
611
+ return fdmon_poll_ops.wait(ctx, ready_list, timeout);
612
+ }
613
+
614
+ if (timeout > 0) {
615
+ ret = qemu_poll_ns(&pfd, 1, timeout);
616
+ if (ret > 0) {
617
+ timeout = 0;
618
+ }
619
+ }
620
+ if (timeout <= 0 || ret > 0) {
621
+ ret = epoll_wait(ctx->epollfd, events,
622
+ ARRAY_SIZE(events),
623
+ timeout);
624
+ if (ret <= 0) {
625
+ goto out;
626
+ }
627
+ for (i = 0; i < ret; i++) {
628
+ int ev = events[i].events;
629
+ int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
630
+ (ev & EPOLLOUT ? G_IO_OUT : 0) |
631
+ (ev & EPOLLHUP ? G_IO_HUP : 0) |
632
+ (ev & EPOLLERR ? G_IO_ERR : 0);
633
+
634
+ node = events[i].data.ptr;
635
+ aio_add_ready_handler(ready_list, node, revents);
636
+ }
637
+ }
638
+out:
639
+ return ret;
640
+}
641
+
642
+static const FDMonOps fdmon_epoll_ops = {
643
+ .update = fdmon_epoll_update,
644
+ .wait = fdmon_epoll_wait,
645
+};
646
+
647
+static bool fdmon_epoll_try_enable(AioContext *ctx)
648
+{
649
+ AioHandler *node;
650
+ struct epoll_event event;
651
+
652
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
653
+ int r;
654
+ if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
655
+ continue;
656
+ }
657
+ event.events = epoll_events_from_pfd(node->pfd.events);
658
+ event.data.ptr = node;
659
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
660
+ if (r) {
661
+ return false;
662
+ }
663
+ }
664
+
665
+ ctx->fdmon_ops = &fdmon_epoll_ops;
666
+ return true;
667
+}
668
+
669
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
670
+{
671
+ if (ctx->epollfd < 0) {
672
+ return false;
673
+ }
674
+
675
+ /* Do not upgrade while external clients are disabled */
676
+ if (atomic_read(&ctx->external_disable_cnt)) {
677
+ return false;
678
+ }
679
+
680
+ if (npfd >= EPOLL_ENABLE_THRESHOLD) {
681
+ if (fdmon_epoll_try_enable(ctx)) {
682
+ return true;
683
+ } else {
684
+ fdmon_epoll_disable(ctx);
685
+ }
686
+ }
687
+ return false;
688
+}
689
+
690
+void fdmon_epoll_setup(AioContext *ctx)
691
+{
692
+ ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
693
+ if (ctx->epollfd == -1) {
694
+ fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
695
+ }
696
+}
697
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
698
new file mode 100644
699
index XXXXXXX..XXXXXXX
700
--- /dev/null
701
+++ b/util/fdmon-poll.c
702
@@ -XXX,XX +XXX,XX @@
703
+/* SPDX-License-Identifier: GPL-2.0-or-later */
704
+/*
705
+ * poll(2) file descriptor monitoring
153
+ *
706
+ *
154
+ * @len - length of the buffer (must be a BDRV_SECTOR_SIZE multiple)
707
+ * Uses ppoll(2) when available, g_poll() otherwise.
708
+ */
709
+
710
+#include "qemu/osdep.h"
711
+#include "aio-posix.h"
712
+#include "qemu/rcu_queue.h"
713
+
714
+/*
715
+ * These thread-local variables are used only in fdmon_poll_wait() around the
716
+ * call to the poll() system call. In particular they are not used while
717
+ * aio_poll is performing callbacks, which makes it much easier to think about
718
+ * reentrancy!
155
+ *
719
+ *
156
+ * Depending on the encryption method, @host_offset and/or @guest_offset
720
+ * Stack-allocated arrays would be perfect but they have size limitations;
157
+ * may be used for generating the initialization vector for
721
+ * heap allocation is expensive enough that we want to reuse arrays across
158
+ * encryption.
722
+ * calls to aio_poll(). And because poll() has to be called without holding
159
+ *
723
+ * any lock, the arrays cannot be stored in AioContext. Thread-local data
160
+ * Note that while the whole range must be aligned on sectors, it
724
+ * has none of the disadvantages of these three options.
161
+ * does not have to be aligned on clusters and can also cross cluster
162
+ * boundaries
163
+ */
725
+ */
164
int coroutine_fn
726
+static __thread GPollFD *pollfds;
165
-qcow2_co_encrypt(BlockDriverState *bs, uint64_t file_cluster_offset,
727
+static __thread AioHandler **nodes;
166
- uint64_t offset, void *buf, size_t len)
728
+static __thread unsigned npfd, nalloc;
167
+qcow2_co_encrypt(BlockDriverState *bs, uint64_t host_offset,
729
+static __thread Notifier pollfds_cleanup_notifier;
168
+ uint64_t guest_offset, void *buf, size_t len)
730
+
169
{
731
+static void pollfds_cleanup(Notifier *n, void *unused)
170
- return qcow2_co_encdec(bs, file_cluster_offset, offset, buf, len,
732
+{
171
- qcrypto_block_encrypt);
733
+ g_assert(npfd == 0);
172
+ return qcow2_co_encdec(bs, host_offset, guest_offset, buf, len,
734
+ g_free(pollfds);
173
+ qcrypto_block_encrypt);
735
+ g_free(nodes);
174
}
736
+ nalloc = 0;
175
737
+}
176
+/*
738
+
177
+ * qcow2_co_decrypt()
739
+static void add_pollfd(AioHandler *node)
178
+ *
740
+{
179
+ * Decrypts one or more contiguous aligned sectors
741
+ if (npfd == nalloc) {
180
+ * Similar to qcow2_co_encrypt
742
+ if (nalloc == 0) {
181
+ */
743
+ pollfds_cleanup_notifier.notify = pollfds_cleanup;
182
int coroutine_fn
744
+ qemu_thread_atexit_add(&pollfds_cleanup_notifier);
183
-qcow2_co_decrypt(BlockDriverState *bs, uint64_t file_cluster_offset,
745
+ nalloc = 8;
184
- uint64_t offset, void *buf, size_t len)
746
+ } else {
185
+qcow2_co_decrypt(BlockDriverState *bs, uint64_t host_offset,
747
+ g_assert(nalloc <= INT_MAX);
186
+ uint64_t guest_offset, void *buf, size_t len)
748
+ nalloc *= 2;
187
{
749
+ }
188
- return qcow2_co_encdec(bs, file_cluster_offset, offset, buf, len,
750
+ pollfds = g_renew(GPollFD, pollfds, nalloc);
189
- qcrypto_block_decrypt);
751
+ nodes = g_renew(AioHandler *, nodes, nalloc);
190
+ return qcow2_co_encdec(bs, host_offset, guest_offset, buf, len,
752
+ }
191
+ qcrypto_block_decrypt);
753
+ nodes[npfd] = node;
192
}
754
+ pollfds[npfd] = (GPollFD) {
193
diff --git a/block/qcow2.c b/block/qcow2.c
755
+ .fd = node->pfd.fd,
194
index XXXXXXX..XXXXXXX 100644
756
+ .events = node->pfd.events,
195
--- a/block/qcow2.c
757
+ };
196
+++ b/block/qcow2.c
758
+ npfd++;
197
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs,
759
+}
198
760
+
199
assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
761
+static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
200
assert(QEMU_IS_ALIGNED(cur_bytes, BDRV_SECTOR_SIZE));
762
+ int64_t timeout)
201
- if (qcow2_co_decrypt(bs, cluster_offset, offset,
763
+{
202
+ if (qcow2_co_decrypt(bs, cluster_offset + offset_in_cluster,
764
+ AioHandler *node;
203
+ offset,
765
+ int ret;
204
cluster_data, cur_bytes) < 0) {
766
+
205
ret = -EIO;
767
+ assert(npfd == 0);
206
goto fail;
768
+
207
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int qcow2_co_pwritev_part(
769
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
208
qemu_iovec_to_buf(qiov, qiov_offset + bytes_done,
770
+ if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
209
cluster_data, cur_bytes);
771
+ && aio_node_check(ctx, node->is_external)) {
210
772
+ add_pollfd(node);
211
- if (qcow2_co_encrypt(bs, cluster_offset, offset,
773
+ }
212
+ if (qcow2_co_encrypt(bs, cluster_offset + offset_in_cluster, offset,
774
+ }
213
cluster_data, cur_bytes) < 0) {
775
+
214
ret = -EIO;
776
+ /* epoll(7) is faster above a certain number of fds */
215
goto out_unlocked;
777
+ if (fdmon_epoll_try_upgrade(ctx, npfd)) {
778
+ return ctx->fdmon_ops->wait(ctx, ready_list, timeout);
779
+ }
780
+
781
+ ret = qemu_poll_ns(pollfds, npfd, timeout);
782
+ if (ret > 0) {
783
+ int i;
784
+
785
+ for (i = 0; i < npfd; i++) {
786
+ int revents = pollfds[i].revents;
787
+
788
+ if (revents) {
789
+ aio_add_ready_handler(ready_list, nodes[i], revents);
790
+ }
791
+ }
792
+ }
793
+
794
+ npfd = 0;
795
+ return ret;
796
+}
797
+
798
+static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new)
799
+{
800
+ /* Do nothing, AioHandler already contains the state we'll need */
801
+}
802
+
803
+const FDMonOps fdmon_poll_ops = {
804
+ .update = fdmon_poll_update,
805
+ .wait = fdmon_poll_wait,
806
+};
216
--
807
--
217
2.21.0
808
2.24.1
218
809
219
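A minimal sketch of the ops-table dispatch this patch introduces, with simplified signatures (the real update() and wait() take AioHandler and AioHandlerList arguments, as documented in the FDMonOps comment above):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct AioContext AioContext;
typedef struct AioHandler AioHandler;   /* opaque in this sketch */

/* Simplified stand-in for the FDMonOps table. */
typedef struct {
    void (*update)(AioContext *ctx, AioHandler *node, bool is_new);
    int (*wait)(AioContext *ctx, int64_t timeout);
} FDMonOps;

struct AioContext {
    const FDMonOps *fdmon_ops;   /* active monitoring backend */
};

static void poll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
    /* fdmon-poll.c style: nothing to do, state lives in the handler */
}

static int poll_wait(AioContext *ctx, int64_t timeout)
{
    printf("ppoll-style wait, timeout=%lld ns\n", (long long)timeout);
    return 0;   /* number of ready file descriptors */
}

static const FDMonOps fdmon_poll_ops = {
    .update = poll_update,
    .wait = poll_wait,
};

int main(void)
{
    AioContext ctx = { .fdmon_ops = &fdmon_poll_ops };

    /* Core code dispatches through the table instead of hard-coding
     * epoll vs. ppoll; switching backends is a pointer assignment. */
    ctx.fdmon_ops->update(&ctx, NULL, true);
    return ctx.fdmon_ops->wait(&ctx, -1);
}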
1
Instead of reporting all sockets to cURL, only report the one that has
1
The AioHandler *node, bool is_new arguments are more complicated to
2
caused curl_multi_do_locked() to be called. This lets us get rid of the
2
think about than simply being given AioHandler *old_node, AioHandler
3
QLIST_FOREACH_SAFE() loop, which was actually wrong: SAFE foreaches are
3
*new_node.
4
only safe when the current element is removed in each iteration. If it is
5
possible for the list to be concurrently modified, we cannot guarantee
6
that only the current element will be removed. Therefore, we must not
7
use QLIST_FOREACH_SAFE() here.
8
4
9
Fixes: ff5ca1664af85b24a4180d595ea6873fd3deac57
5
Furthermore, the new Linux io_uring file descriptor monitoring mechanism
10
Cc: qemu-stable@nongnu.org
6
added by a later patch requires access to both the old and the new
11
Signed-off-by: Max Reitz <mreitz@redhat.com>
7
nodes. Make this change now in preparation.
12
Message-id: 20190910124136.10565-6-mreitz@redhat.com
8
13
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
9
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
14
Reviewed-by: John Snow <jsnow@redhat.com>
10
Link: https://lore.kernel.org/r/20200305170806.1313245-5-stefanha@redhat.com
15
Signed-off-by: Max Reitz <mreitz@redhat.com>
11
Message-Id: <20200305170806.1313245-5-stefanha@redhat.com>
16
---
12
---
17
block/curl.c | 17 ++++++-----------
13
include/block/aio.h | 13 ++++++-------
18
1 file changed, 6 insertions(+), 11 deletions(-)
14
util/aio-posix.c | 7 +------
15
util/fdmon-epoll.c | 21 ++++++++++++---------
16
util/fdmon-poll.c | 4 +++-
17
4 files changed, 22 insertions(+), 23 deletions(-)
19
18
20
diff --git a/block/curl.c b/block/curl.c
19
diff --git a/include/block/aio.h b/include/block/aio.h
21
index XXXXXXX..XXXXXXX 100644
20
index XXXXXXX..XXXXXXX 100644
22
--- a/block/curl.c
21
--- a/include/block/aio.h
23
+++ b/block/curl.c
22
+++ b/include/block/aio.h
24
@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
23
@@ -XXX,XX +XXX,XX @@ typedef struct {
24
/*
25
* update:
26
* @ctx: the AioContext
27
- * @node: the handler
28
- * @is_new: is the file descriptor already being monitored?
29
+ * @old_node: the existing handler or NULL if this file descriptor is being
30
+ * monitored for the first time
31
+ * @new_node: the new handler or NULL if this file descriptor is being
32
+ * removed
33
*
34
- * Add/remove/modify a monitored file descriptor. There are three cases:
35
- * 1. node->pfd.events == 0 means remove the file descriptor.
36
- * 2. !is_new means modify an already monitored file descriptor.
37
- * 3. is_new means add a new file descriptor.
38
+ * Add/remove/modify a monitored file descriptor.
39
*
40
* Called with ctx->list_lock acquired.
41
*/
42
- void (*update)(AioContext *ctx, AioHandler *node, bool is_new);
43
+ void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
44
45
/*
46
* wait:
47
diff --git a/util/aio-posix.c b/util/aio-posix.c
48
index XXXXXXX..XXXXXXX 100644
49
--- a/util/aio-posix.c
50
+++ b/util/aio-posix.c
51
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
52
atomic_set(&ctx->poll_disable_cnt,
53
atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
54
55
- if (new_node) {
56
- ctx->fdmon_ops->update(ctx, new_node, is_new);
57
- } else if (node) {
58
- /* Unregister deleted fd_handler */
59
- ctx->fdmon_ops->update(ctx, node, false);
60
- }
61
+ ctx->fdmon_ops->update(ctx, node, new_node);
62
qemu_lockcnt_unlock(&ctx->list_lock);
63
aio_notify(ctx);
64
65
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
66
index XXXXXXX..XXXXXXX 100644
67
--- a/util/fdmon-epoll.c
68
+++ b/util/fdmon-epoll.c
69
@@ -XXX,XX +XXX,XX @@ static inline int epoll_events_from_pfd(int pfd_events)
70
(pfd_events & G_IO_ERR ? EPOLLERR : 0);
25
}
71
}
26
72
27
/* Called with s->mutex held. */
73
-static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
28
-static void curl_multi_do_locked(CURLSocket *ready_socket)
74
+static void fdmon_epoll_update(AioContext *ctx,
29
+static void curl_multi_do_locked(CURLSocket *socket)
75
+ AioHandler *old_node,
76
+ AioHandler *new_node)
30
{
77
{
31
- CURLSocket *socket, *next_socket;
78
- struct epoll_event event;
32
- CURLState *s = ready_socket->state;
79
+ struct epoll_event event = {
33
+ BDRVCURLState *s = socket->state->s;
80
+ .data.ptr = new_node,
34
int running;
81
+ .events = new_node ? epoll_events_from_pfd(new_node->pfd.events) : 0,
82
+ };
35
int r;
83
int r;
36
84
- int ctl;
37
- if (!s->s->multi) {
85
38
+ if (!s->multi) {
86
- if (!node->pfd.events) {
39
return;
87
- ctl = EPOLL_CTL_DEL;
88
+ if (!new_node) {
89
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, &event);
90
+ } else if (!old_node) {
91
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, new_node->pfd.fd, &event);
92
} else {
93
- event.data.ptr = node;
94
- event.events = epoll_events_from_pfd(node->pfd.events);
95
- ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
96
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, new_node->pfd.fd, &event);
40
}
97
}
41
98
42
- /* Need to use _SAFE because curl_multi_socket_action() may trigger
99
- r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
43
- * curl_sock_cb() which might modify this list */
100
if (r) {
44
- QLIST_FOREACH_SAFE(socket, &s->sockets, next, next_socket) {
101
fdmon_epoll_disable(ctx);
45
- do {
102
}
46
- r = curl_multi_socket_action(s->s->multi, socket->fd, 0, &running);
103
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
47
- } while (r == CURLM_CALL_MULTI_PERFORM);
104
index XXXXXXX..XXXXXXX 100644
48
- }
105
--- a/util/fdmon-poll.c
49
+ do {
106
+++ b/util/fdmon-poll.c
50
+ r = curl_multi_socket_action(s->multi, socket->fd, 0, &running);
107
@@ -XXX,XX +XXX,XX @@ static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
51
+ } while (r == CURLM_CALL_MULTI_PERFORM);
108
return ret;
52
}
109
}
53
110
54
static void curl_multi_do(void *arg)
111
-static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new)
112
+static void fdmon_poll_update(AioContext *ctx,
113
+ AioHandler *old_node,
114
+ AioHandler *new_node)
115
{
116
/* Do nothing, AioHandler already contains the state we'll need */
117
}
55
--
118
--
56
2.21.0
119
2.24.1
57
120
58
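To see why the SAFE variant is insufficient in the curl case above, consider a minimal hand-rolled list; FOREACH_SAFE below mirrors the shape of QLIST_FOREACH_SAFE. The iterator caches only the next pointer, so it tolerates removal of the current element and nothing else.

#include <stdlib.h>

struct sock {
    int fd;
    struct sock *next;
};

/* Same shape as QLIST_FOREACH_SAFE: cache the next pointer up front. */
#define FOREACH_SAFE(var, head, tvar) \
    for ((var) = (head); (var) && ((tvar) = (var)->next, 1); (var) = (tvar))

int main(void)
{
    struct sock *head = NULL, *s, *tmp;

    for (int i = 0; i < 3; i++) {
        s = malloc(sizeof(*s));
        s->fd = i;
        s->next = head;
        head = s;
    }

    /*
     * Safe only because each iteration removes the *current* element
     * (always the head here). If the loop body freed some other element
     * (as curl_multi_socket_action() can via curl_sock_cb()), the cached
     * tmp pointer could already be dangling.
     */
    FOREACH_SAFE(s, head, tmp) {
        head = s->next;
        free(s);
    }
    return 0;
}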
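On the aio-posix side, the (old_node, new_node) pair decodes directly into the add/modify/remove cases, as the reworked fdmon_epoll_update() shows. A standalone sketch of just that decoding (epoll_op() is a hypothetical helper that only names the operation and never calls epoll_ctl(2)):

#include <stdio.h>

typedef struct {
    int fd;
} AioHandler;

/* The four (old, new) combinations map onto add/modify/remove/no-op. */
static const char *epoll_op(const AioHandler *old_node,
                            const AioHandler *new_node)
{
    if (!old_node && new_node) {
        return "EPOLL_CTL_ADD";
    } else if (old_node && new_node) {
        return "EPOLL_CTL_MOD";
    } else if (old_node && !new_node) {
        return "EPOLL_CTL_DEL";
    }
    return "nothing to do";
}

int main(void)
{
    AioHandler a = { .fd = 3 };

    printf("%s\n", epoll_op(NULL, &a)); /* EPOLL_CTL_ADD */
    printf("%s\n", epoll_op(&a, &a));   /* EPOLL_CTL_MOD */
    printf("%s\n", epoll_op(&a, NULL)); /* EPOLL_CTL_DEL */
    return 0;
}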
1
From: Maxim Levitsky <mlevitsk@redhat.com>
1
The recent Linux io_uring API has several advantages over ppoll(2) and
2
epoll(2). Details are given in the source code.
2
3
3
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
4
Add an io_uring implementation and make it the default on Linux.
4
Tested-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
5
Performance is the same as with epoll(7) but later patches add
5
Message-id: 20190915203655.21638-4-mlevitsk@redhat.com
6
optimizations that take advantage of io_uring.
6
Reviewed-by: Max Reitz <mreitz@redhat.com>
7
7
Signed-off-by: Max Reitz <mreitz@redhat.com>
8
It is necessary to change how aio_set_fd_handler() deals with deleting
9
AioHandlers since removing monitored file descriptors is asynchronous in
10
io_uring. fdmon_io_uring_remove() marks the AioHandler deleted and
11
aio_set_fd_handler() will let it handle deletion in that case.
12
13
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
14
Link: https://lore.kernel.org/r/20200305170806.1313245-6-stefanha@redhat.com
15
Message-Id: <20200305170806.1313245-6-stefanha@redhat.com>
8
---
16
---
9
tests/qemu-iotests/263 | 91 ++++++++++++++++++++++++++++++++++++++
17
configure | 5 +
10
tests/qemu-iotests/263.out | 40 +++++++++++++++++
18
include/block/aio.h | 9 ++
11
tests/qemu-iotests/group | 1 +
19
util/Makefile.objs | 1 +
12
3 files changed, 132 insertions(+)
20
util/aio-posix.c | 20 ++-
13
create mode 100755 tests/qemu-iotests/263
21
util/aio-posix.h | 20 ++-
14
create mode 100644 tests/qemu-iotests/263.out
22
util/fdmon-io_uring.c | 326 ++++++++++++++++++++++++++++++++++++++++++
23
6 files changed, 376 insertions(+), 5 deletions(-)
24
create mode 100644 util/fdmon-io_uring.c
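For readers unfamiliar with the io_uring poll API that the new fdmon-io_uring.c builds on, here is a minimal standalone liburing sketch of IORING_OP_POLL_ADD on a pipe. It is illustrative only, not the QEMU code; it assumes liburing development headers are installed and links with -luring.

#include <liburing.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    struct io_uring ring;
    struct io_uring_sqe *sqe;
    struct io_uring_cqe *cqe;
    int fds[2];

    if (pipe(fds) != 0 || io_uring_queue_init(8, &ring, 0) != 0) {
        return 1;
    }

    /* One IORING_OP_POLL_ADD per monitored fd, as fdmon-io_uring does
     * per AioHandler; user_data identifies the handler in the cqe. */
    sqe = io_uring_get_sqe(&ring);
    io_uring_prep_poll_add(sqe, fds[0], POLLIN);
    io_uring_sqe_set_data(sqe, &fds[0]);

    write(fds[1], "x", 1);          /* make the read end ready */
    io_uring_submit(&ring);

    if (io_uring_wait_cqe(&ring, &cqe) == 0) {
        printf("poll events: 0x%x\n", (unsigned)cqe->res);
        io_uring_cqe_seen(&ring, cqe);
    }

    io_uring_queue_exit(&ring);
    return 0;
}

Note that IORING_OP_POLL_ADD is one-shot, so a long-running monitor must re-arm the poll request after each completion; that bookkeeping is a large part of what fdmon-io_uring.c implements.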
15
25
16
diff --git a/tests/qemu-iotests/263 b/tests/qemu-iotests/263
26
diff --git a/configure b/configure
17
new file mode 100755
27
index XXXXXXX..XXXXXXX 100755
18
index XXXXXXX..XXXXXXX
28
--- a/configure
19
--- /dev/null
29
+++ b/configure
20
+++ b/tests/qemu-iotests/263
30
@@ -XXX,XX +XXX,XX @@ if test "$linux_io_uring" != "no" ; then
31
linux_io_uring_cflags=$($pkg_config --cflags liburing)
32
linux_io_uring_libs=$($pkg_config --libs liburing)
33
linux_io_uring=yes
34
+
35
+ # io_uring is used in libqemuutil.a where per-file -libs variables are not
36
+ # seen by programs linking the archive. It's not ideal, but just add the
37
+ # library dependency globally.
38
+ LIBS="$linux_io_uring_libs $LIBS"
39
else
40
if test "$linux_io_uring" = "yes" ; then
41
feature_not_found "linux io_uring" "Install liburing devel"
42
diff --git a/include/block/aio.h b/include/block/aio.h
43
index XXXXXXX..XXXXXXX 100644
44
--- a/include/block/aio.h
45
+++ b/include/block/aio.h
21
@@ -XXX,XX +XXX,XX @@
46
@@ -XXX,XX +XXX,XX @@
22
+#!/usr/bin/env bash
47
#ifndef QEMU_AIO_H
23
+#
48
#define QEMU_AIO_H
24
+# Test encrypted write that crosses cluster boundary of two unallocated clusters
49
25
+# Based on 188
50
+#ifdef CONFIG_LINUX_IO_URING
26
+#
51
+#include <liburing.h>
27
+# Copyright (C) 2019 Red Hat, Inc.
52
+#endif
28
+#
53
#include "qemu/queue.h"
29
+# This program is free software; you can redistribute it and/or modify
54
#include "qemu/event_notifier.h"
30
+# it under the terms of the GNU General Public License as published by
55
#include "qemu/thread.h"
31
+# the Free Software Foundation; either version 2 of the License, or
56
@@ -XXX,XX +XXX,XX @@ struct BHListSlice {
32
+# (at your option) any later version.
57
QSIMPLEQ_ENTRY(BHListSlice) next;
33
+#
58
};
34
+# This program is distributed in the hope that it will be useful,
59
35
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
60
+typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
36
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
61
+
37
+# GNU General Public License for more details.
62
struct AioContext {
38
+#
63
GSource source;
39
+# You should have received a copy of the GNU General Public License
64
40
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
65
@@ -XXX,XX +XXX,XX @@ struct AioContext {
41
+#
66
* locking.
42
+
67
*/
43
+# creator
68
struct LuringState *linux_io_uring;
44
+owner=mlevitsk@redhat.com
69
+
45
+
70
+ /* State for file descriptor monitoring using Linux io_uring */
46
+seq=`basename $0`
71
+ struct io_uring fdmon_io_uring;
47
+echo "QA output created by $seq"
72
+ AioHandlerSList submit_list;
48
+
73
#endif
49
+status=1    # failure is the default!
74
50
+
75
/* TimerLists for calling timers - one per clock type. Has its own
51
+_cleanup()
76
diff --git a/util/Makefile.objs b/util/Makefile.objs
52
+{
77
index XXXXXXX..XXXXXXX 100644
53
+    _cleanup_test_img
78
--- a/util/Makefile.objs
54
+}
79
+++ b/util/Makefile.objs
55
+trap "_cleanup; exit \$status" 0 1 2 3 15
80
@@ -XXX,XX +XXX,XX @@ util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
56
+
81
util-obj-$(CONFIG_POSIX) += aio-posix.o
57
+# get standard environment, filters and checks
82
util-obj-$(CONFIG_POSIX) += fdmon-poll.o
58
+. ./common.rc
83
util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o
59
+. ./common.filter
84
+util-obj-$(CONFIG_LINUX_IO_URING) += fdmon-io_uring.o
60
+
85
util-obj-$(CONFIG_POSIX) += compatfd.o
61
+_supported_fmt qcow2
86
util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
62
+_supported_proto generic
87
util-obj-$(CONFIG_POSIX) += mmap-alloc.o
63
+_supported_os Linux
88
diff --git a/util/aio-posix.c b/util/aio-posix.c
64
+
89
index XXXXXXX..XXXXXXX 100644
65
+
90
--- a/util/aio-posix.c
66
+size=1M
91
+++ b/util/aio-posix.c
67
+
92
@@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
68
+SECRET="secret,id=sec0,data=astrochicken"
93
g_source_remove_poll(&ctx->source, &node->pfd);
69
+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
94
}
70
+
95
71
+
96
+ node->pfd.revents = 0;
72
+_run_test()
97
+
73
+{
98
+ /* If the fd monitor has already marked it deleted, leave it alone */
74
+    echo "== reading the whole image =="
99
+ if (QLIST_IS_INSERTED(node, node_deleted)) {
75
+    $QEMU_IO --object $SECRET -c "read -P 0 0 $size" --image-opts "$1" | _filter_qemu_io | _filter_testdir
100
+ return false;
76
+
101
+ }
77
+    echo
102
+
78
+    echo "== write two 512 byte sectors on a cluster boundary =="
103
/* If a read is in progress, just mark the node as deleted */
79
+    $QEMU_IO --object $SECRET -c "write -P 0xAA 0xFE00 0x400" --image-opts "$1" | _filter_qemu_io | _filter_testdir
104
if (qemu_lockcnt_count(&ctx->list_lock)) {
80
+
105
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
81
+    echo
106
- node->pfd.revents = 0;
82
+    echo "== verify that the rest of the image is not changed =="
107
return false;
83
+    $QEMU_IO --object $SECRET -c "read -P 0x00 0x00000 0xFE00" --image-opts "$1" | _filter_qemu_io | _filter_testdir
108
}
84
+    $QEMU_IO --object $SECRET -c "read -P 0xAA 0x0FE00 0x400" --image-opts "$1" | _filter_qemu_io | _filter_testdir
109
/* Otherwise, delete it for real. We can't just mark it as
85
+    $QEMU_IO --object $SECRET -c "read -P 0x00 0x10200 0xEFE00" --image-opts "$1" | _filter_qemu_io | _filter_testdir
110
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
86
+
111
87
+}
112
QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
88
+
113
}
89
+
114
- if (node) {
90
+echo
115
- deleted = aio_remove_fd_handler(ctx, node);
91
+echo "testing LUKS qcow2 encryption"
116
- }
92
+echo
117
93
+
118
/* No need to order poll_disable_cnt writes against other updates;
94
+_make_test_img --object $SECRET -o "encrypt.format=luks,encrypt.key-secret=sec0,encrypt.iter-time=10,cluster_size=64K" $size
119
* the counter is only used to avoid wasting time and latency on
95
+_run_test "driver=$IMGFMT,encrypt.key-secret=sec0,file.filename=$TEST_IMG"
120
@@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx,
96
+_cleanup_test_img
121
atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
97
+
122
98
+echo
123
ctx->fdmon_ops->update(ctx, node, new_node);
99
+echo "testing legacy AES qcow2 encryption"
124
+ if (node) {
100
+echo
125
+ deleted = aio_remove_fd_handler(ctx, node);
101
+
126
+ }
102
+
127
qemu_lockcnt_unlock(&ctx->list_lock);
103
+_make_test_img --object $SECRET -o "encrypt.format=aes,encrypt.key-secret=sec0,cluster_size=64K" $size
128
aio_notify(ctx);
104
+_run_test "driver=$IMGFMT,encrypt.key-secret=sec0,file.filename=$TEST_IMG"
129
105
+_cleanup_test_img
130
@@ -XXX,XX +XXX,XX @@ void aio_context_setup(AioContext *ctx)
106
+
131
ctx->fdmon_ops = &fdmon_poll_ops;
107
+
132
ctx->epollfd = -1;
108
+
133
109
+# success, all done
134
+ /* Use the fastest fd monitoring implementation if available */
110
+echo "*** done"
135
+ if (fdmon_io_uring_setup(ctx)) {
111
+rm -f $seq.full
136
+ return;
112
+status=0
137
+ }
113
diff --git a/tests/qemu-iotests/263.out b/tests/qemu-iotests/263.out
138
+
139
fdmon_epoll_setup(ctx);
140
}
141
142
void aio_context_destroy(AioContext *ctx)
143
{
144
+ fdmon_io_uring_destroy(ctx);
145
fdmon_epoll_disable(ctx);
146
}
147
148
diff --git a/util/aio-posix.h b/util/aio-posix.h
149
index XXXXXXX..XXXXXXX 100644
150
--- a/util/aio-posix.h
151
+++ b/util/aio-posix.h
152
@@ -XXX,XX +XXX,XX @@ struct AioHandler {
153
IOHandler *io_poll_begin;
154
IOHandler *io_poll_end;
155
void *opaque;
156
- bool is_external;
157
QLIST_ENTRY(AioHandler) node;
158
QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
159
QLIST_ENTRY(AioHandler) node_deleted;
160
+#ifdef CONFIG_LINUX_IO_URING
161
+ QSLIST_ENTRY(AioHandler) node_submitted;
162
+ unsigned flags; /* see fdmon-io_uring.c */
163
+#endif
164
+ bool is_external;
165
};
166
167
/* Add a handler to a ready list */
168
@@ -XXX,XX +XXX,XX @@ static inline void fdmon_epoll_disable(AioContext *ctx)
169
}
170
#endif /* !CONFIG_EPOLL_CREATE1 */
171
172
+#ifdef CONFIG_LINUX_IO_URING
173
+bool fdmon_io_uring_setup(AioContext *ctx);
174
+void fdmon_io_uring_destroy(AioContext *ctx);
175
+#else
176
+static inline bool fdmon_io_uring_setup(AioContext *ctx)
177
+{
178
+ return false;
179
+}
180
+
181
+static inline void fdmon_io_uring_destroy(AioContext *ctx)
182
+{
183
+}
184
+#endif /* !CONFIG_LINUX_IO_URING */
185
+
186
#endif /* AIO_POSIX_H */
187
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
114
new file mode 100644
188
new file mode 100644
115
index XXXXXXX..XXXXXXX
189
index XXXXXXX..XXXXXXX
116
--- /dev/null
190
--- /dev/null
117
+++ b/tests/qemu-iotests/263.out
191
+++ b/util/fdmon-io_uring.c
118
@@ -XXX,XX +XXX,XX @@
192
@@ -XXX,XX +XXX,XX @@
119
+QA output created by 263
193
+/* SPDX-License-Identifier: GPL-2.0-or-later */
120
+
194
+/*
121
+testing LUKS qcow2 encryption
195
+ * Linux io_uring file descriptor monitoring
122
+
196
+ *
123
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=luks encrypt.key-secret=sec0 encrypt.iter-time=10
197
+ * The Linux io_uring API supports file descriptor monitoring with a few
124
+== reading the whole image ==
198
+ * advantages over existing APIs like poll(2) and epoll(7):
125
+read 1048576/1048576 bytes at offset 0
199
+ *
126
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
200
+ * 1. Userspace polling of events is possible because the completion queue (cq
127
+
201
+ * ring) is shared between the kernel and userspace. This allows
128
+== write two 512 byte sectors on a cluster boundary ==
202
+ * applications that rely on userspace polling to also monitor file
129
+wrote 1024/1024 bytes at offset 65024
203
+ * descriptors in the same userspace polling loop.
130
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
204
+ *
131
+
205
+ * 2. Submission and completion is batched and done together in a single system
132
+== verify that the rest of the image is not changed ==
206
+ * call. This minimizes the number of system calls.
133
+read 65024/65024 bytes at offset 0
207
+ *
134
+63.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
208
+ * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than
135
+read 1024/1024 bytes at offset 65024
209
+ * poll(2).
136
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
210
+ *
137
+read 982528/982528 bytes at offset 66048
211
+ * 4. Nanosecond timeouts are supported so it requires fewer syscalls than
138
+959.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
212
+ * epoll(7).
139
+
213
+ *
140
+testing legacy AES qcow2 encryption
214
+ * This code only monitors file descriptors and does not do asynchronous disk
141
+
215
+ * I/O. Implementing disk I/O efficiently has other requirements and should
142
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 encrypt.format=aes encrypt.key-secret=sec0
216
+ * use a separate io_uring so it does not make sense to unify the code.
143
+== reading the whole image ==
217
+ *
144
+read 1048576/1048576 bytes at offset 0
218
+ * File descriptor monitoring is implemented using the following operations:
145
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
219
+ *
146
+
220
+ * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored.
147
+== write two 512 byte sectors on a cluster boundary ==
221
+ * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When
148
+wrote 1024/1024 bytes at offset 65024
222
+ * the poll mask changes for a file descriptor it is first removed and then
149
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
223
+ * re-added with the new poll mask, so this operation is also used as part
150
+
224
+ * of modifying an existing monitored file descriptor.
151
+== verify that the rest of the image is not changed ==
225
+ * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait
152
+read 65024/65024 bytes at offset 0
226
+ * for events. This operation self-cancels if another event completes
153
+63.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
227
+ * before the timeout.
154
+read 1024/1024 bytes at offset 65024
228
+ *
155
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
229
+ * io_uring calls the submission queue the "sq ring" and the completion queue
156
+read 982528/982528 bytes at offset 66048
230
+ * the "cq ring". Ring entries are called "sqe" and "cqe", respectively.
157
+959.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
231
+ *
158
+*** done
232
+ * The code is structured so that sq/cq rings are only modified within
159
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
233
+ * fdmon_io_uring_wait(). Changes to AioHandlers are made by enqueuing them on
160
index XXXXXXX..XXXXXXX 100644
234
+ * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD
161
--- a/tests/qemu-iotests/group
235
+ * and/or IORING_OP_POLL_REMOVE sqes for them.
162
+++ b/tests/qemu-iotests/group
236
+ */
163
@@ -XXX,XX +XXX,XX @@
237
+
164
257 rw
238
+#include "qemu/osdep.h"
165
258 rw quick
239
+#include <poll.h>
166
262 rw quick migration
240
+#include "qemu/rcu_queue.h"
167
+263 rw quick
241
+#include "aio-posix.h"
168
265 rw auto quick
242
+
169
266 rw quick
243
+enum {
244
+ FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */
245
+
246
+ /* AioHandler::flags */
247
+ FDMON_IO_URING_PENDING = (1 << 0),
248
+ FDMON_IO_URING_ADD = (1 << 1),
249
+ FDMON_IO_URING_REMOVE = (1 << 2),
250
+};
251
+
252
+static inline int poll_events_from_pfd(int pfd_events)
253
+{
254
+ return (pfd_events & G_IO_IN ? POLLIN : 0) |
255
+ (pfd_events & G_IO_OUT ? POLLOUT : 0) |
256
+ (pfd_events & G_IO_HUP ? POLLHUP : 0) |
257
+ (pfd_events & G_IO_ERR ? POLLERR : 0);
258
+}
259
+
260
+static inline int pfd_events_from_poll(int poll_events)
261
+{
262
+ return (poll_events & POLLIN ? G_IO_IN : 0) |
263
+ (poll_events & POLLOUT ? G_IO_OUT : 0) |
264
+ (poll_events & POLLHUP ? G_IO_HUP : 0) |
265
+ (poll_events & POLLERR ? G_IO_ERR : 0);
266
+}
267
+
268
+/*
269
+ * Returns an sqe for submitting a request. Must only be called within
270
+ * fdmon_io_uring_wait().
271
+ */
272
+static struct io_uring_sqe *get_sqe(AioContext *ctx)
273
+{
274
+ struct io_uring *ring = &ctx->fdmon_io_uring;
275
+ struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
276
+ int ret;
277
+
278
+ if (likely(sqe)) {
279
+ return sqe;
280
+ }
281
+
282
+ /* No free sqes left, submit pending sqes first */
283
+ ret = io_uring_submit(ring);
284
+ assert(ret > 1);
285
+ sqe = io_uring_get_sqe(ring);
286
+ assert(sqe);
287
+ return sqe;
288
+}
289
+
290
+/* Atomically enqueue an AioHandler for sq ring submission */
291
+static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags)
292
+{
293
+ unsigned old_flags;
294
+
295
+ old_flags = atomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags);
296
+ if (!(old_flags & FDMON_IO_URING_PENDING)) {
297
+ QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted);
298
+ }
299
+}
300
+
301
+/* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */
302
+static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags)
303
+{
304
+ AioHandler *node = QSLIST_FIRST(head);
305
+
306
+ if (!node) {
307
+ return NULL;
308
+ }
309
+
310
+ /* Doesn't need to be atomic since fill_sq_ring() moves the list */
311
+ QSLIST_REMOVE_HEAD(head, node_submitted);
312
+
313
+ /*
314
+ * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two
315
+ * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
316
+ * telling process_cqe() to delete the AioHandler when its
317
+ * IORING_OP_POLL_ADD completes.
318
+ */
319
+ *flags = atomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING |
320
+ FDMON_IO_URING_ADD));
321
+ return node;
322
+}
323
+
324
+static void fdmon_io_uring_update(AioContext *ctx,
325
+ AioHandler *old_node,
326
+ AioHandler *new_node)
327
+{
328
+ if (new_node) {
329
+ enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD);
330
+ }
331
+
332
+ if (old_node) {
333
+ /*
334
+ * Deletion is tricky because IORING_OP_POLL_ADD and
335
+ * IORING_OP_POLL_REMOVE are async. We need to wait for the original
336
+ * IORING_OP_POLL_ADD to complete before this handler can be freed
337
+ * safely.
338
+ *
339
+ * It's possible that the file descriptor becomes ready and the
340
+ * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is
341
+ * submitted, too.
342
+ *
343
+ * Mark this handler deleted right now but don't place it on
344
+ * ctx->deleted_aio_handlers yet. Instead, manually fudge the list
345
+ * entry to make QLIST_IS_INSERTED() think this handler has been
346
+ * inserted and other code recognizes this AioHandler as deleted.
347
+ *
348
+ * Once the original IORING_OP_POLL_ADD completes we enqueue the
349
+ * handler on the real ctx->deleted_aio_handlers list to be freed.
350
+ */
351
+ assert(!QLIST_IS_INSERTED(old_node, node_deleted));
352
+ old_node->node_deleted.le_prev = &old_node->node_deleted.le_next;
353
+
354
+ enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE);
355
+ }
356
+}
357
+
358
+static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
359
+{
360
+ struct io_uring_sqe *sqe = get_sqe(ctx);
361
+ int events = poll_events_from_pfd(node->pfd.events);
362
+
363
+ io_uring_prep_poll_add(sqe, node->pfd.fd, events);
364
+ io_uring_sqe_set_data(sqe, node);
365
+}
366
+
367
+static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
368
+{
369
+ struct io_uring_sqe *sqe = get_sqe(ctx);
370
+
371
+ io_uring_prep_poll_remove(sqe, node);
372
+}
373
+
374
+/* Add a timeout that self-cancels when another cqe becomes ready */
375
+static void add_timeout_sqe(AioContext *ctx, int64_t ns)
376
+{
377
+ struct io_uring_sqe *sqe;
378
+ struct __kernel_timespec ts = {
379
+ .tv_sec = ns / NANOSECONDS_PER_SECOND,
380
+ .tv_nsec = ns % NANOSECONDS_PER_SECOND,
381
+ };
382
+
383
+ sqe = get_sqe(ctx);
384
+ io_uring_prep_timeout(sqe, &ts, 1, 0);
385
+}
386
+
387
+/* Add sqes from ctx->submit_list for submission */
388
+static void fill_sq_ring(AioContext *ctx)
389
+{
390
+ AioHandlerSList submit_list;
391
+ AioHandler *node;
392
+ unsigned flags;
393
+
394
+ QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list);
395
+
396
+ while ((node = dequeue(&submit_list, &flags))) {
397
+ /* Order matters, just in case both flags were set */
398
+ if (flags & FDMON_IO_URING_ADD) {
399
+ add_poll_add_sqe(ctx, node);
400
+ }
401
+ if (flags & FDMON_IO_URING_REMOVE) {
402
+ add_poll_remove_sqe(ctx, node);
403
+ }
404
+ }
405
+}
406
+
407
+/* Returns true if a handler became ready */
408
+static bool process_cqe(AioContext *ctx,
409
+ AioHandlerList *ready_list,
410
+ struct io_uring_cqe *cqe)
411
+{
412
+ AioHandler *node = io_uring_cqe_get_data(cqe);
413
+ unsigned flags;
414
+
415
+ /* poll_timeout and poll_remove have a zero user_data field */
416
+ if (!node) {
417
+ return false;
418
+ }
419
+
420
+ /*
421
+ * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
422
+ * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
423
+ * bit before IORING_OP_POLL_REMOVE is submitted.
424
+ */
425
+ flags = atomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
426
+ if (flags & FDMON_IO_URING_REMOVE) {
427
+ QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
428
+ return false;
429
+ }
430
+
431
+ aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));
432
+
433
+ /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
434
+ add_poll_add_sqe(ctx, node);
435
+ return true;
436
+}
437
+
438
+static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
439
+{
440
+ struct io_uring *ring = &ctx->fdmon_io_uring;
441
+ struct io_uring_cqe *cqe;
442
+ unsigned num_cqes = 0;
443
+ unsigned num_ready = 0;
444
+ unsigned head;
445
+
446
+ io_uring_for_each_cqe(ring, head, cqe) {
447
+ if (process_cqe(ctx, ready_list, cqe)) {
448
+ num_ready++;
449
+ }
450
+
451
+ num_cqes++;
452
+ }
453
+
454
+ io_uring_cq_advance(ring, num_cqes);
455
+ return num_ready;
456
+}
457
+
458
+static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
459
+ int64_t timeout)
460
+{
461
+ unsigned wait_nr = 1; /* block until at least one cqe is ready */
462
+ int ret;
463
+
464
+ /* Fall back while external clients are disabled */
465
+ if (atomic_read(&ctx->external_disable_cnt)) {
466
+ return fdmon_poll_ops.wait(ctx, ready_list, timeout);
467
+ }
468
+
469
+ if (timeout == 0) {
470
+ wait_nr = 0; /* non-blocking */
471
+ } else if (timeout > 0) {
472
+ add_timeout_sqe(ctx, timeout);
473
+ }
474
+
475
+ fill_sq_ring(ctx);
476
+
477
+ ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
478
+ assert(ret >= 0);
479
+
480
+ return process_cq_ring(ctx, ready_list);
481
+}
482
+
483
+static const FDMonOps fdmon_io_uring_ops = {
484
+ .update = fdmon_io_uring_update,
485
+ .wait = fdmon_io_uring_wait,
486
+};
487
+
488
+bool fdmon_io_uring_setup(AioContext *ctx)
489
+{
490
+ int ret;
491
+
492
+ ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
493
+ if (ret != 0) {
494
+ return false;
495
+ }
496
+
497
+ QSLIST_INIT(&ctx->submit_list);
498
+ ctx->fdmon_ops = &fdmon_io_uring_ops;
499
+ return true;
500
+}
501
+
502
+void fdmon_io_uring_destroy(AioContext *ctx)
503
+{
504
+ if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
505
+ AioHandler *node;
506
+
507
+ io_uring_queue_exit(&ctx->fdmon_io_uring);
508
+
509
+ /* No need to submit these anymore, just free them. */
510
+ while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
511
+ QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
512
+ QLIST_REMOVE(node, node);
513
+ g_free(node);
514
+ }
515
+
516
+ ctx->fdmon_ops = &fdmon_poll_ops;
517
+ }
518
+}
170
--
519
--
171
2.21.0
520
2.24.1
172
521
173
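The one-shot behavior that process_cqe() compensates for above can be
reproduced with plain liburing. The following is a minimal standalone
sketch, not QEMU code; polling stdin and looping three times are
arbitrary choices for the demo:

    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <liburing.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_cqe *cqe;
        char buf[256];

        if (io_uring_queue_init(8, &ring, 0) < 0) {
            return 1;
        }

        for (int i = 0; i < 3; i++) {
            /* One-shot: this poll request completes at most once */
            struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
            io_uring_prep_poll_add(sqe, STDIN_FILENO, POLLIN);
            io_uring_submit(&ring);

            io_uring_wait_cqe(&ring, &cqe);
            printf("stdin ready, revents=0x%x\n", cqe->res);
            io_uring_cqe_seen(&ring, cqe);

            /* Drain the input so the next poll waits for new data */
            if (read(STDIN_FILENO, buf, sizeof(buf)) <= 0) {
                break;
            }
        }

        io_uring_queue_exit(&ring);
        return 0;
    }

Each iteration must submit a fresh IORING_OP_POLL_ADD; without the
re-arm at the top of the loop, input arriving after the first completion
would never be reported. fdmon-io_uring.c performs the equivalent re-arm
inside process_cqe().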
1
curl_multi_do_locked() currently marks all sockets as ready. That is
1
Unlike ppoll(2) and epoll(7), Linux io_uring completions can be polled
2
not only inefficient, but in fact unsafe (the loop over all sockets is). A follow-up
2
from userspace. Previously userspace polling was only allowed when all
3
patch will change that, but to do so, curl_multi_do_locked() needs to
3
AioHandlers had an ->io_poll() callback. This prevented starvation of
4
know exactly which socket is ready; and that is accomplished by this
4
fds by userspace pollable handlers.
5
patch.
6
5
7
Cc: qemu-stable@nongnu.org
6
Add the FDMonOps->need_wait() callback that enables userspace polling
8
Signed-off-by: Max Reitz <mreitz@redhat.com>
7
even when some AioHandlers lack ->io_poll().
9
Message-id: 20190910124136.10565-5-mreitz@redhat.com
8
10
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
9
For example, it's now possible to do userspace polling when a TCP/IP
11
Reviewed-by: John Snow <jsnow@redhat.com>
10
socket is monitored thanks to Linux io_uring.
12
Signed-off-by: Max Reitz <mreitz@redhat.com>
11
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Link: https://lore.kernel.org/r/20200305170806.1313245-7-stefanha@redhat.com
14
Message-Id: <20200305170806.1313245-7-stefanha@redhat.com>
13
---
15
---
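The shape of the curl-side change can be sketched with a toy fd-handler
registry. Everything below (Socket, set_fd_handler(), multi_do()) is
illustrative only and is not the QEMU or cURL API:

    #include <stdio.h>

    typedef struct State State;      /* shared state, details irrelevant */

    typedef struct {
        int fd;
        State *state;                /* back-pointer kept for completeness */
    } Socket;

    typedef void IOHandler(void *opaque);

    /* Toy stand-in for aio_set_fd_handler(): remembers one fd handler */
    static struct {
        int fd;
        IOHandler *cb;
        void *opaque;
    } handler;

    static void set_fd_handler(int fd, IOHandler *cb, void *opaque)
    {
        handler.fd = fd;
        handler.cb = cb;
        handler.opaque = opaque;
    }

    static void multi_do(void *opaque)
    {
        Socket *ready = opaque;      /* exactly one socket, no guessing */
        printf("fd %d is ready\n", ready->fd);
    }

    int main(void)
    {
        Socket sock = { .fd = 42, .state = NULL };
        set_fd_handler(sock.fd, multi_do, &sock);
        handler.cb(handler.opaque);  /* simulate fd readiness */
        return 0;
    }

Registering the per-socket object rather than the shared state as the
opaque pointer is what lets the callback know exactly which socket
fired, which the follow-up patch relies on.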
14
block/curl.c | 20 +++++++++++---------
16
include/block/aio.h | 19 +++++++++++++++++++
15
1 file changed, 11 insertions(+), 9 deletions(-)
17
util/aio-posix.c | 11 ++++++++---
18
util/fdmon-epoll.c | 1 +
19
util/fdmon-io_uring.c | 6 ++++++
20
util/fdmon-poll.c | 1 +
21
5 files changed, 35 insertions(+), 3 deletions(-)
16
22
17
diff --git a/block/curl.c b/block/curl.c
23
diff --git a/include/block/aio.h b/include/block/aio.h
18
index XXXXXXX..XXXXXXX 100644
24
index XXXXXXX..XXXXXXX 100644
19
--- a/block/curl.c
25
--- a/include/block/aio.h
20
+++ b/block/curl.c
26
+++ b/include/block/aio.h
21
@@ -XXX,XX +XXX,XX @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
27
@@ -XXX,XX +XXX,XX @@ struct ThreadPool;
22
switch (action) {
28
struct LinuxAioState;
23
case CURL_POLL_IN:
29
struct LuringState;
24
aio_set_fd_handler(s->aio_context, fd, false,
30
25
- curl_multi_do, NULL, NULL, state);
31
+/* Is polling disabled? */
26
+ curl_multi_do, NULL, NULL, socket);
32
+bool aio_poll_disabled(AioContext *ctx);
27
break;
33
+
28
case CURL_POLL_OUT:
34
/* Callbacks for file descriptor monitoring implementations */
29
aio_set_fd_handler(s->aio_context, fd, false,
35
typedef struct {
30
- NULL, curl_multi_do, NULL, state);
36
/*
31
+ NULL, curl_multi_do, NULL, socket);
37
@@ -XXX,XX +XXX,XX @@ typedef struct {
32
break;
38
* Returns: number of ready file descriptors.
33
case CURL_POLL_INOUT:
39
*/
34
aio_set_fd_handler(s->aio_context, fd, false,
40
int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
35
- curl_multi_do, curl_multi_do, NULL, state);
41
+
36
+ curl_multi_do, curl_multi_do, NULL, socket);
42
+ /*
37
break;
43
+ * need_wait:
38
case CURL_POLL_REMOVE:
44
+ * @ctx: the AioContext
39
aio_set_fd_handler(s->aio_context, fd, false,
45
+ *
40
@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
46
+ * Tell aio_poll() when to stop userspace polling early because ->wait()
47
+ * has fds ready.
48
+ *
49
+ * File descriptor monitoring implementations that cannot poll fd readiness
50
+ * from userspace should use aio_poll_disabled() here. This ensures that
51
+ * file descriptors are not starved by handlers that frequently make
52
+ * progress via userspace polling.
53
+ *
54
+ * Returns: true if ->wait() should be called, false otherwise.
55
+ */
56
+ bool (*need_wait)(AioContext *ctx);
57
} FDMonOps;
58
59
/*
60
diff --git a/util/aio-posix.c b/util/aio-posix.c
61
index XXXXXXX..XXXXXXX 100644
62
--- a/util/aio-posix.c
63
+++ b/util/aio-posix.c
64
@@ -XXX,XX +XXX,XX @@
65
#include "trace.h"
66
#include "aio-posix.h"
67
68
+bool aio_poll_disabled(AioContext *ctx)
69
+{
70
+ return atomic_read(&ctx->poll_disable_cnt);
71
+}
72
+
73
void aio_add_ready_handler(AioHandlerList *ready_list,
74
AioHandler *node,
75
int revents)
76
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
77
elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
78
max_ns = qemu_soonest_timeout(*timeout, max_ns);
79
assert(!(max_ns && progress));
80
- } while (elapsed_time < max_ns && !atomic_read(&ctx->poll_disable_cnt));
81
+ } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
82
83
/* If time has passed with no successful polling, adjust *timeout to
84
* keep the same ending time.
85
@@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
86
{
87
int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
88
89
- if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) {
90
+ if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
91
poll_set_started(ctx, true);
92
93
if (run_poll_handlers(ctx, max_ns, timeout)) {
94
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
95
/* If polling is allowed, non-blocking aio_poll does not need the
96
* system call---a single round of run_poll_handlers_once suffices.
97
*/
98
- if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
99
+ if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
100
ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
101
}
102
103
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
104
index XXXXXXX..XXXXXXX 100644
105
--- a/util/fdmon-epoll.c
106
+++ b/util/fdmon-epoll.c
107
@@ -XXX,XX +XXX,XX @@ out:
108
static const FDMonOps fdmon_epoll_ops = {
109
.update = fdmon_epoll_update,
110
.wait = fdmon_epoll_wait,
111
+ .need_wait = aio_poll_disabled,
112
};
113
114
static bool fdmon_epoll_try_enable(AioContext *ctx)
115
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
116
index XXXXXXX..XXXXXXX 100644
117
--- a/util/fdmon-io_uring.c
118
+++ b/util/fdmon-io_uring.c
119
@@ -XXX,XX +XXX,XX @@ static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
120
return process_cq_ring(ctx, ready_list);
41
}
121
}
42
122
43
/* Called with s->mutex held. */
123
+static bool fdmon_io_uring_need_wait(AioContext *ctx)
44
-static void curl_multi_do_locked(CURLState *s)
124
+{
45
+static void curl_multi_do_locked(CURLSocket *ready_socket)
125
+ return io_uring_cq_ready(&ctx->fdmon_io_uring);
46
{
126
+}
47
CURLSocket *socket, *next_socket;
127
+
48
+ CURLState *s = ready_socket->state;
128
static const FDMonOps fdmon_io_uring_ops = {
49
int running;
129
.update = fdmon_io_uring_update,
50
int r;
130
.wait = fdmon_io_uring_wait,
51
131
+ .need_wait = fdmon_io_uring_need_wait,
52
@@ -XXX,XX +XXX,XX @@ static void curl_multi_do_locked(CURLState *s)
132
};
53
133
54
static void curl_multi_do(void *arg)
134
bool fdmon_io_uring_setup(AioContext *ctx)
55
{
135
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
56
- CURLState *s = (CURLState *)arg;
136
index XXXXXXX..XXXXXXX 100644
57
+ CURLSocket *socket = arg;
137
--- a/util/fdmon-poll.c
58
+ BDRVCURLState *s = socket->state->s;
138
+++ b/util/fdmon-poll.c
59
139
@@ -XXX,XX +XXX,XX @@ static void fdmon_poll_update(AioContext *ctx,
60
- qemu_mutex_lock(&s->s->mutex);
140
const FDMonOps fdmon_poll_ops = {
61
- curl_multi_do_locked(s);
141
.update = fdmon_poll_update,
62
- curl_multi_check_completion(s->s);
142
.wait = fdmon_poll_wait,
63
- qemu_mutex_unlock(&s->s->mutex);
143
+ .need_wait = aio_poll_disabled,
64
+ qemu_mutex_lock(&s->mutex);
144
};
65
+ curl_multi_do_locked(socket);
66
+ curl_multi_check_completion(s);
67
+ qemu_mutex_unlock(&s->mutex);
68
}
69
70
static void curl_multi_timeout_do(void *arg)
71
--
145
--
72
2.21.0
146
2.24.1
73
147
74
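The contract of the new callback can be modeled with simplified stand-in
types; Ctx, FdMonOps and cq_ready below are illustrative, not the real
QEMU definitions:

    #include <stdbool.h>
    #include <stdint.h>

    typedef struct Ctx Ctx;

    typedef struct {
        bool (*need_wait)(Ctx *ctx);    /* stop userspace polling early? */
    } FdMonOps;

    struct Ctx {
        const FdMonOps *fdmon_ops;
        int cq_ready;                   /* stand-in for io_uring_cq_ready() */
    };

    /* io_uring-style: completions are visible from userspace, so the
     * poll loop can stop as soon as ->wait() would have work to report */
    static bool uring_need_wait(Ctx *ctx)
    {
        return ctx->cq_ready > 0;
    }

    static bool run_poll(Ctx *ctx, int64_t max_iterations)
    {
        for (int64_t i = 0; i < max_iterations; i++) {
            if (ctx->fdmon_ops->need_wait(ctx)) {
                return true;            /* fall through to ->wait() */
            }
            /* ... invoke the AioHandler io_poll() callbacks here ... */
        }
        return false;
    }

    int main(void)
    {
        FdMonOps ops = { .need_wait = uring_need_wait };
        Ctx ctx = { .fdmon_ops = &ops, .cq_ready = 1 };
        return run_poll(&ctx, 1000) ? 0 : 1;
    }

fdmon-poll and fdmon-epoll plug aio_poll_disabled() into the same slot,
so for them the loop still exits early only when polling has been
explicitly disabled.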
Deleted patch
1
Background: As of version 7.59.0, cURL verifies that several functions are
2
not called from within a callback. Among these functions is
3
curl_multi_add_handle().
4
1
5
curl_read_cb() is a callback from cURL and not a coroutine. Waking up
6
acb->co will lead to entering it then and there, which means the current
7
request will settle and the caller (if it runs in the same coroutine)
8
may then issue the next request. In such a case, we will enter
9
curl_setup_preadv() effectively from within curl_read_cb().
10
11
Calling curl_multi_add_handle() will then fail and the new request will
12
not be processed.
13
14
Fix this by not letting curl_read_cb() wake up acb->co. Instead, leave
15
the whole business of settling the AIOCB objects to
16
curl_multi_check_completion() (which is called from our timer callback
17
and our FD handler, so not from any cURL callbacks).
18
19
Reported-by: Natalie Gavrielov <ngavrilo@redhat.com>
20
Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1740193
21
Cc: qemu-stable@nongnu.org
22
Signed-off-by: Max Reitz <mreitz@redhat.com>
23
Message-id: 20190910124136.10565-7-mreitz@redhat.com
24
Reviewed-by: John Snow <jsnow@redhat.com>
25
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
26
Signed-off-by: Max Reitz <mreitz@redhat.com>
27
---
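A reduced sketch of the deferral pattern described above; Request,
read_cb() and check_completion() are illustrative stand-ins, not the
actual block/curl.c types:

    #include <stdbool.h>
    #include <string.h>

    typedef struct {
        char buf[4096];
        size_t off;
        bool done;                  /* settled outside the callback */
    } Request;

    /* cURL-style write callback: only copies data, never settles anything */
    static size_t read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
    {
        Request *req = opaque;
        size_t realsize = size * nmemb;

        if (req->off + realsize <= sizeof(req->buf)) {
            memcpy(req->buf + req->off, ptr, realsize);
            req->off += realsize;
        }
        /* Crucially, the waiter is NOT woken here: it might submit the
         * next request and call curl_multi_add_handle() from inside this
         * callback, which cURL >= 7.59.0 rejects. */
        return realsize;
    }

    /* Runs from the fd handler/timer, i.e. outside any cURL callback */
    static void check_completion(Request *req, bool transfer_finished)
    {
        if (transfer_finished && !req->done) {
            req->done = true;
            /* safe point to wake the waiter and accept new requests */
        }
    }

    int main(void)
    {
        Request req = { .off = 0 };
        char data[] = "hello";

        read_cb(data, 1, sizeof(data) - 1, &req);
        check_completion(&req, true);
        return req.done ? 0 : 1;
    }

Because the waiter is only woken from check_completion(), which runs
from the fd handler or timer, any follow-up request reaches
curl_multi_add_handle() outside of any cURL callback.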
28
block/curl.c | 69 ++++++++++++++++++++++------------------------------
29
1 file changed, 29 insertions(+), 40 deletions(-)
30
31
diff --git a/block/curl.c b/block/curl.c
32
index XXXXXXX..XXXXXXX 100644
33
--- a/block/curl.c
34
+++ b/block/curl.c
35
@@ -XXX,XX +XXX,XX @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
36
{
37
CURLState *s = ((CURLState*)opaque);
38
size_t realsize = size * nmemb;
39
- int i;
40
41
trace_curl_read_cb(realsize);
42
43
@@ -XXX,XX +XXX,XX @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
44
memcpy(s->orig_buf + s->buf_off, ptr, realsize);
45
s->buf_off += realsize;
46
47
- for(i=0; i<CURL_NUM_ACB; i++) {
48
- CURLAIOCB *acb = s->acb[i];
49
-
50
- if (!acb)
51
- continue;
52
-
53
- if ((s->buf_off >= acb->end)) {
54
- size_t request_length = acb->bytes;
55
-
56
- qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start,
57
- acb->end - acb->start);
58
-
59
- if (acb->end - acb->start < request_length) {
60
- size_t offset = acb->end - acb->start;
61
- qemu_iovec_memset(acb->qiov, offset, 0,
62
- request_length - offset);
63
- }
64
-
65
- acb->ret = 0;
66
- s->acb[i] = NULL;
67
- qemu_mutex_unlock(&s->s->mutex);
68
- aio_co_wake(acb->co);
69
- qemu_mutex_lock(&s->s->mutex);
70
- }
71
- }
72
-
73
read_end:
74
/* curl will error out if we do not return this value */
75
return size * nmemb;
76
@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
77
break;
78
79
if (msg->msg == CURLMSG_DONE) {
80
+ int i;
81
CURLState *state = NULL;
82
+ bool error = msg->data.result != CURLE_OK;
83
+
84
curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE,
85
(char **)&state);
86
87
- /* ACBs for successful messages get completed in curl_read_cb */
88
- if (msg->data.result != CURLE_OK) {
89
- int i;
90
+ if (error) {
91
static int errcount = 100;
92
93
/* Don't lose the original error message from curl, since
94
@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
95
error_report("curl: further errors suppressed");
96
}
97
}
98
+ }
99
100
- for (i = 0; i < CURL_NUM_ACB; i++) {
101
- CURLAIOCB *acb = state->acb[i];
102
+ for (i = 0; i < CURL_NUM_ACB; i++) {
103
+ CURLAIOCB *acb = state->acb[i];
104
105
- if (acb == NULL) {
106
- continue;
107
- }
108
+ if (acb == NULL) {
109
+ continue;
110
+ }
111
+
112
+ if (!error) {
113
+ /* Assert that we have read all data */
114
+ assert(state->buf_off >= acb->end);
115
+
116
+ qemu_iovec_from_buf(acb->qiov, 0,
117
+ state->orig_buf + acb->start,
118
+ acb->end - acb->start);
119
120
- acb->ret = -EIO;
121
- state->acb[i] = NULL;
122
- qemu_mutex_unlock(&s->mutex);
123
- aio_co_wake(acb->co);
124
- qemu_mutex_lock(&s->mutex);
125
+ if (acb->end - acb->start < acb->bytes) {
126
+ size_t offset = acb->end - acb->start;
127
+ qemu_iovec_memset(acb->qiov, offset, 0,
128
+ acb->bytes - offset);
129
+ }
130
}
131
+
132
+ acb->ret = error ? -EIO : 0;
133
+ state->acb[i] = NULL;
134
+ qemu_mutex_unlock(&s->mutex);
135
+ aio_co_wake(acb->co);
136
+ qemu_mutex_lock(&s->mutex);
137
}
138
139
curl_clean_state(state);
140
--
141
2.21.0
142
143
1
From: Sergio Lopez <slp@redhat.com>
1
When there are many poll handlers it's likely that some of them are idle
2
2
most of the time. Remove handlers that haven't had activity recently so
3
block_job_remove_all_bdrv() iterates through job->nodes, calling
3
that the polling loop scales better for guests with a large number of
4
bdrv_root_unref_child() for each entry. The call to the latter may
4
devices.
5
reach child_job_[can_]set_aio_ctx(), which will also attempt to
5
6
traverse job->nodes, potentially finding entries that were freed
6
This feature only takes effect for the Linux io_uring fd monitoring
7
on previous iterations.
7
implementation because it is capable of combining fd monitoring with
8
8
userspace polling. The other implementations can't do that and risk
9
To avoid this situation, update job->nodes head on each iteration to
9
starving fds in favor of poll handlers, so don't try this optimization
10
ensure that already freed entries are no longer linked to the list.
10
when they are in use.
11
11
12
RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1746631
12
IOPS improves from 10k to 105k when the guest has 100
13
Signed-off-by: Sergio Lopez <slp@redhat.com>
13
virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1
14
Cc: qemu-stable@nongnu.org
14
device for rw=randread,iodepth=1,bs=4k,ioengine=libaio on NVMe.
15
Signed-off-by: Max Reitz <mreitz@redhat.com>
15
16
Message-id: 20190911100316.32282-1-mreitz@redhat.com
16
[Clarified poll_aio_handlers locking discipline explanation in comment
17
Reviewed-by: Sergio Lopez <slp@redhat.com>
17
after discussion with Paolo Bonzini <pbonzini@redhat.com>.
18
Signed-off-by: Max Reitz <mreitz@redhat.com>
18
--Stefan]
19
20
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
21
Link: https://lore.kernel.org/r/20200305170806.1313245-8-stefanha@redhat.com
22
Message-Id: <20200305170806.1313245-8-stefanha@redhat.com>
19
---
23
---
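The consume-one-by-one pattern from the blockjob fix, reduced to plain
GLib as a sketch; unref_child() stands in for bdrv_root_unref_child(),
which may itself re-walk the list:

    #include <glib.h>

    /* Stand-in for bdrv_root_unref_child(); may re-walk *nodes itself */
    static void unref_child(gpointer data, GSList **nodes)
    {
        (void)nodes;
        g_free(data);
    }

    int main(void)
    {
        GSList *nodes = NULL;
        nodes = g_slist_prepend(nodes, g_strdup("child0"));
        nodes = g_slist_prepend(nodes, g_strdup("child1"));

        while (nodes) {
            GSList *l = nodes;
            nodes = l->next;             /* detach the head first... */
            unref_child(l->data, &nodes);
            g_slist_free_1(l);           /* ...then free the stale link */
        }
        return 0;
    }

Advancing the head before the callback runs means a concurrent traversal
starting from the head can only ever see entries that are still live.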
20
blockjob.c | 17 +++++++++++++----
24
include/block/aio.h | 8 ++++
21
1 file changed, 13 insertions(+), 4 deletions(-)
25
util/aio-posix.c | 93 +++++++++++++++++++++++++++++++++++++++++----
22
26
util/aio-posix.h | 2 +
23
diff --git a/blockjob.c b/blockjob.c
27
util/trace-events | 2 +
24
index XXXXXXX..XXXXXXX 100644
28
4 files changed, 98 insertions(+), 7 deletions(-)
25
--- a/blockjob.c
29
26
+++ b/blockjob.c
30
diff --git a/include/block/aio.h b/include/block/aio.h
27
@@ -XXX,XX +XXX,XX @@ static const BdrvChildRole child_job = {
31
index XXXXXXX..XXXXXXX 100644
28
32
--- a/include/block/aio.h
29
void block_job_remove_all_bdrv(BlockJob *job)
33
+++ b/include/block/aio.h
34
@@ -XXX,XX +XXX,XX @@ struct AioContext {
35
int64_t poll_grow; /* polling time growth factor */
36
int64_t poll_shrink; /* polling time shrink factor */
37
38
+ /*
39
+ * List of handlers participating in userspace polling. Protected by
40
+ * ctx->list_lock. Iterated and modified mostly by the event loop thread
41
+ * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler()
42
+ * only touches the list to delete nodes if ctx->list_lock's count is zero.
43
+ */
44
+ AioHandlerList poll_aio_handlers;
45
+
46
/* Are we in polling mode or monitoring file descriptors? */
47
bool poll_started;
48
49
diff --git a/util/aio-posix.c b/util/aio-posix.c
50
index XXXXXXX..XXXXXXX 100644
51
--- a/util/aio-posix.c
52
+++ b/util/aio-posix.c
53
@@ -XXX,XX +XXX,XX @@
54
#include "trace.h"
55
#include "aio-posix.h"
56
57
+/* Stop userspace polling on a handler if it isn't active for some time */
58
+#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
59
+
60
bool aio_poll_disabled(AioContext *ctx)
30
{
61
{
31
- GSList *l;
62
return atomic_read(&ctx->poll_disable_cnt);
32
- for (l = job->nodes; l; l = l->next) {
63
@@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
64
* deleted because deleted nodes are only cleaned up while
65
* no one is walking the handlers list.
66
*/
67
+ QLIST_SAFE_REMOVE(node, node_poll);
68
QLIST_REMOVE(node, node);
69
return true;
70
}
71
@@ -XXX,XX +XXX,XX @@ static bool poll_set_started(AioContext *ctx, bool started)
72
ctx->poll_started = started;
73
74
qemu_lockcnt_inc(&ctx->list_lock);
75
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
76
+ QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
77
IOHandler *fn;
78
79
if (QLIST_IS_INSERTED(node, node_deleted)) {
80
@@ -XXX,XX +XXX,XX @@ static void aio_free_deleted_handlers(AioContext *ctx)
81
while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
82
QLIST_REMOVE(node, node);
83
QLIST_REMOVE(node, node_deleted);
84
+ QLIST_SAFE_REMOVE(node, node_poll);
85
g_free(node);
86
}
87
88
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
89
revents = node->pfd.revents & node->pfd.events;
90
node->pfd.revents = 0;
91
33
+ /*
92
+ /*
34
+ * bdrv_root_unref_child() may reach child_job_[can_]set_aio_ctx(),
93
+ * Start polling AioHandlers when they become ready because activity is
35
+ * which will also traverse job->nodes, so consume the list one by
94
+ * likely to continue. Note that starvation is theoretically possible when
36
+ * one to make sure that such a concurrent access does not attempt
95
+ * fdmon_supports_polling() returns true, but only until the fd fires
37
+ * to process an already freed BdrvChild.
96
+ * for the first time.
38
+ */
97
+ */
39
+ while (job->nodes) {
98
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
40
+ GSList *l = job->nodes;
99
+ !QLIST_IS_INSERTED(node, node_poll) &&
41
BdrvChild *c = l->data;
100
+ node->io_poll) {
42
+
101
+ trace_poll_add(ctx, node, node->pfd.fd, revents);
43
+ job->nodes = l->next;
102
+ if (ctx->poll_started && node->io_poll_begin) {
44
+
103
+ node->io_poll_begin(node->opaque);
45
bdrv_op_unblock_all(c->bs, job->blocker);
104
+ }
46
bdrv_root_unref_child(c);
105
+ QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
47
+
106
+ }
48
+ g_slist_free_1(l);
107
+
49
}
108
if (!QLIST_IS_INSERTED(node, node_deleted) &&
50
- g_slist_free(job->nodes);
109
(revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
51
- job->nodes = NULL;
110
aio_node_check(ctx, node->is_external) &&
111
@@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx)
112
timerlistgroup_run_timers(&ctx->tlg);
52
}
113
}
53
114
54
bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs)
115
-static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
116
+static bool run_poll_handlers_once(AioContext *ctx,
117
+ int64_t now,
118
+ int64_t *timeout)
119
{
120
bool progress = false;
121
AioHandler *node;
122
+ AioHandler *tmp;
123
124
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
125
- if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
126
- aio_node_check(ctx, node->is_external) &&
127
+ QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
128
+ if (aio_node_check(ctx, node->is_external) &&
129
node->io_poll(node->opaque)) {
130
+ node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
131
+
132
/*
133
* Polling was successful, exit try_poll_mode immediately
134
* to adjust the next polling time.
135
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
136
return progress;
137
}
138
139
+static bool fdmon_supports_polling(AioContext *ctx)
140
+{
141
+ return ctx->fdmon_ops->need_wait != aio_poll_disabled;
142
+}
143
+
144
+static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now)
145
+{
146
+ AioHandler *node;
147
+ AioHandler *tmp;
148
+ bool progress = false;
149
+
150
+ /*
151
+ * File descriptor monitoring implementations without userspace polling
152
+ * support suffer from starvation when a subset of handlers is polled
153
+ * because fds will not be processed in a timely fashion. Don't remove
154
+ * idle poll handlers.
155
+ */
156
+ if (!fdmon_supports_polling(ctx)) {
157
+ return false;
158
+ }
159
+
160
+ QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
161
+ if (node->poll_idle_timeout == 0LL) {
162
+ node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
163
+ } else if (now >= node->poll_idle_timeout) {
164
+ trace_poll_remove(ctx, node, node->pfd.fd);
165
+ node->poll_idle_timeout = 0LL;
166
+ QLIST_SAFE_REMOVE(node, node_poll);
167
+ if (ctx->poll_started && node->io_poll_end) {
168
+ node->io_poll_end(node->opaque);
169
+
170
+ /*
171
+ * Final poll in case ->io_poll_end() races with an event.
172
+ * Never mind re-adding the handler in the rare case where
173
+ * this causes progress.
174
+ */
175
+ progress = node->io_poll(node->opaque) || progress;
176
+ }
177
+ }
178
+ }
179
+
180
+ return progress;
181
+}
182
+
183
/* run_poll_handlers:
184
* @ctx: the AioContext
185
* @max_ns: maximum time to poll for, in nanoseconds
186
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
187
188
start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
189
do {
190
- progress = run_poll_handlers_once(ctx, timeout);
191
+ progress = run_poll_handlers_once(ctx, start_time, timeout);
192
elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
193
max_ns = qemu_soonest_timeout(*timeout, max_ns);
194
assert(!(max_ns && progress));
195
} while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
196
197
+ if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) {
198
+ *timeout = 0;
199
+ progress = true;
200
+ }
201
+
202
/* If time has passed with no successful polling, adjust *timeout to
203
* keep the same ending time.
204
*/
205
@@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
206
*/
207
static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
208
{
209
- int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
210
+ int64_t max_ns;
211
+
212
+ if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
213
+ return false;
214
+ }
215
216
+ max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
217
if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
218
poll_set_started(ctx, true);
219
220
diff --git a/util/aio-posix.h b/util/aio-posix.h
221
index XXXXXXX..XXXXXXX 100644
222
--- a/util/aio-posix.h
223
+++ b/util/aio-posix.h
224
@@ -XXX,XX +XXX,XX @@ struct AioHandler {
225
QLIST_ENTRY(AioHandler) node;
226
QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
227
QLIST_ENTRY(AioHandler) node_deleted;
228
+ QLIST_ENTRY(AioHandler) node_poll;
229
#ifdef CONFIG_LINUX_IO_URING
230
QSLIST_ENTRY(AioHandler) node_submitted;
231
unsigned flags; /* see fdmon-io_uring.c */
232
#endif
233
+ int64_t poll_idle_timeout; /* when to stop userspace polling */
234
bool is_external;
235
};
236
237
diff --git a/util/trace-events b/util/trace-events
238
index XXXXXXX..XXXXXXX 100644
239
--- a/util/trace-events
240
+++ b/util/trace-events
241
@@ -XXX,XX +XXX,XX @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_
242
run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64
243
poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
244
poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
245
+poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x"
246
+poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d"
247
248
# async.c
249
aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
55
--
250
--
56
2.21.0
251
2.24.1
57
252
58
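The deadline bookkeeping behind POLL_IDLE_INTERVAL_NS, reduced to a
self-contained sketch; Handler, on_poll_progress() and sweep() are
simplified stand-ins for the aio-posix types:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define POLL_IDLE_INTERVAL_NS (7LL * 1000000000LL)

    typedef struct {
        int fd;
        int64_t poll_idle_timeout;  /* 0 means "not scheduled yet" */
        bool polled;                /* still on the userspace poll list? */
    } Handler;

    /* Called whenever h's io_poll() makes progress at time @now */
    static void on_poll_progress(Handler *h, int64_t now)
    {
        h->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
    }

    /* Sweep pass, mirroring remove_idle_poll_handlers() */
    static void sweep(Handler *hs, int n, int64_t now)
    {
        for (int i = 0; i < n; i++) {
            Handler *h = &hs[i];

            if (!h->polled) {
                continue;
            }
            if (h->poll_idle_timeout == 0) {
                h->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
            } else if (now >= h->poll_idle_timeout) {
                h->polled = false;  /* stop userspace polling this fd */
                h->poll_idle_timeout = 0;
            }
        }
    }

    int main(void)
    {
        Handler hs[2] = { { .fd = 3, .polled = true },
                          { .fd = 4, .polled = true } };

        sweep(hs, 2, 0);                        /* schedule both deadlines */
        on_poll_progress(&hs[0], 1);            /* fd 3 stays active */
        sweep(hs, 2, POLL_IDLE_INTERVAL_NS);    /* fd 4 idles out */
        printf("fd3 polled=%d fd4 polled=%d\n", hs[0].polled, hs[1].polled);
        return 0;
    }

A handler that keeps making poll progress pushes its deadline ahead of
the clock; a quiet one eventually crosses it and falls back to plain
->wait() monitoring until its fd fires again.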