Old pull request

The following changes since commit 6d40ce00c1166c317e298ad82ecf10e650c4f87d:

  Update version for v6.0.0-rc1 release (2021-03-30 18:19:07 +0100)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to b6489ac06695e257ea0a9841364577e247fdee30:

  test-coroutine: Add rwlock downgrade test (2021-03-31 10:44:21 +0100)

----------------------------------------------------------------
Pull request

A fix for VDI image files and more generally for CoRwlock.

----------------------------------------------------------------

David Edmondson (4):
  block/vdi: When writing new bmap entry fails, don't leak the buffer
  block/vdi: Don't assume that blocks are larger than VdiHeader
  coroutine-lock: Store the coroutine in the CoWaitRecord only once
  test-coroutine: Add rwlock downgrade test

Paolo Bonzini (2):
  coroutine-lock: Reimplement CoRwlock to fix downgrade bug
  test-coroutine: Add rwlock upgrade test

 include/qemu/coroutine.h    |  17 ++--
 block/vdi.c                 |  11 ++-
 tests/unit/test-coroutine.c | 161 +++++++++++++++++++++++++++++++++++
 util/qemu-coroutine-lock.c  | 165 +++++++++++++++++++++++-------------
 4 files changed, 282 insertions(+), 72 deletions(-)

--
2.30.2

New pull request

The following changes since commit a2376507f615495b1d16685449ce0ea78c2caf9d:

  Merge remote-tracking branch 'remotes/bonzini-gitlab/tags/for-upstream' into staging (2021-07-24 11:04:57 +0100)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 15a730e7a3aaac180df72cd5730e0617bcf44a5a:

  block/nvme: Fix VFIO_MAP_DMA failed: No space left on device (2021-07-26 09:38:12 +0100)

----------------------------------------------------------------
Pull request

Phil's block/nvme.c ENOSPC fix for newer Linux kernels that return this errno.

----------------------------------------------------------------

Philippe Mathieu-Daudé (1):
  block/nvme: Fix VFIO_MAP_DMA failed: No space left on device

 block/nvme.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

--
2.31.1
Deleted patch

From: David Edmondson <david.edmondson@oracle.com>

If a new bitmap entry is allocated, requiring the entire block to be
written, avoid leaking the buffer allocated for the block should the
write fail.

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Message-id: 20210325112941.365238-2-pbonzini@redhat.com
Message-Id: <20210309144015.557477-2-david.edmondson@oracle.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vdi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/vdi.c b/block/vdi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -XXX,XX +XXX,XX @@ nonallocating_write:
 
     logout("finished data write\n");
     if (ret < 0) {
+        g_free(block);
         return ret;
     }
 
--
2.30.2
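For context, a condensed sketch of the error path being fixed, with the
surrounding control flow paraphrased from block/vdi.c (allocation details
and intervening code elided):

    block = g_malloc0(s->block_size);  /* buffer for the newly allocated block */
    ...
    ret = bdrv_pwrite(...);            /* write the whole block */
    logout("finished data write\n");
    if (ret < 0) {
        g_free(block);                 /* the line the patch adds; previously
                                          this early return leaked the buffer */
        return ret;
    }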
Deleted patch

From: David Edmondson <david.edmondson@oracle.com>

Given that the block size is read from the header of the VDI file, a
wide variety of sizes might be seen. Rather than re-using a block-sized
memory region when writing the VDI header, allocate an appropriately
sized buffer.

Signed-off-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Message-id: 20210325112941.365238-3-pbonzini@redhat.com
Message-Id: <20210309144015.557477-3-david.edmondson@oracle.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vdi.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/vdi.c b/block/vdi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -XXX,XX +XXX,XX @@ nonallocating_write:
 
     if (block) {
         /* One or more new blocks were allocated. */
-        VdiHeader *header = (VdiHeader *) block;
+        VdiHeader *header;
         uint8_t *base;
         uint64_t offset;
         uint32_t n_sectors;
 
+        g_free(block);
+        header = g_malloc(sizeof(*header));
+
         logout("now writing modified header\n");
         assert(VDI_IS_ALLOCATED(bmap_first));
         *header = s->header;
         vdi_header_to_le(header);
-        ret = bdrv_pwrite(bs->file, 0, block, sizeof(VdiHeader));
-        g_free(block);
-        block = NULL;
+        ret = bdrv_pwrite(bs->file, 0, header, sizeof(*header));
+        g_free(header);
 
         if (ret < 0) {
             return ret;
--
2.30.2
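To see why reusing the block buffer was unsafe, consider an image whose
header declares a block size smaller than the header structure itself
(the 256-byte figure below is hypothetical, chosen only for illustration):

    /* sizeof(VdiHeader) is fixed at compile time; block_size comes
     * from the (untrusted) image file. */
    uint8_t *block = g_malloc(s->block_size);           /* e.g. 256 bytes */
    ...
    bdrv_pwrite(bs->file, 0, block, sizeof(VdiHeader)); /* reads past the end
                                                           of block whenever
                                                           block_size <
                                                           sizeof(VdiHeader) */

Allocating a dedicated sizeof(*header) buffer removes that assumption.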
Deleted patch

From: David Edmondson <david.edmondson@oracle.com>

When taking the slow path for mutex acquisition, set the coroutine
value in the CoWaitRecord in push_waiter(), rather than both there and
in the caller.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20210325112941.365238-4-pbonzini@redhat.com
Message-Id: <20210309144015.557477-4-david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 util/qemu-coroutine-lock.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
     unsigned old_handoff;
 
     trace_qemu_co_mutex_lock_entry(mutex, self);
-    w.co = self;
    push_waiter(mutex, &w);
 
    /* This is the "Responsibility Hand-Off" protocol; a lock() picks from
--
2.30.2
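In other words, the assignment already happens inside the callee, so the
caller's copy was redundant. A sketch of the relevant part of push_waiter()
(per the commit message; the lock-free list handling is elided):

    static void push_waiter(CoMutex *mutex, CoWaitRecord *w)
    {
        w->co = qemu_coroutine_self();  /* the only assignment needed */
        /* ... push w onto the mutex's waiter list ... */
    }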
Deleted patch

From: Paolo Bonzini <pbonzini@redhat.com>

An invariant of the current rwlock is that if multiple coroutines hold a
reader lock, all must be runnable. The unlock implementation relies on
this, choosing to wake a single coroutine when the final read lock
holder exits the critical section, assuming that it will wake a
coroutine attempting to acquire a write lock.

The downgrade implementation violates this assumption by creating a
read-lock-owning coroutine that is exclusively runnable - any other
coroutines that are waiting to acquire a read lock are *not* made
runnable when the write lock holder converts its ownership to read
only.

More generally, the old implementation had many other fairness bugs.
The root cause of the bugs was that CoQueue would wake up readers even
if there were pending writers, and would wake up writers even if there
were readers. In that case, the coroutine would go back to sleep *at
the end* of the CoQueue, losing its place at the head of the line.

To fix this, keep the queue of waiters explicitly in the CoRwlock
instead of using CoQueue, and store for each whether it is a
potential reader or a writer. This way, downgrade can look at the
first queued coroutine and wake it only if it is a reader, causing
all other readers in line to be released in turn.

Reported-by: David Edmondson <david.edmondson@oracle.com>
Reviewed-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20210325112941.365238-5-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h   |  17 ++--
 util/qemu-coroutine-lock.c | 164 +++++++++++++++++++++++--------------
 2 files changed, 114 insertions(+), 67 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock);
 bool qemu_co_queue_empty(CoQueue *queue);
 
 
+typedef struct CoRwTicket CoRwTicket;
 typedef struct CoRwlock {
-    int pending_writer;
-    int reader;
     CoMutex mutex;
-    CoQueue queue;
+
+    /* Number of readers, or -1 if owned for writing. */
+    int owners;
+
+    /* Waiting coroutines. */
+    QSIMPLEQ_HEAD(, CoRwTicket) tickets;
 } CoRwlock;
 
 /**
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock);
 /**
  * Write Locks the CoRwlock from a reader. This is a bit more efficient than
  * @qemu_co_rwlock_unlock followed by a separate @qemu_co_rwlock_wrlock.
- * However, if the lock cannot be upgraded immediately, control is transferred
- * to the caller of the current coroutine. Also, @qemu_co_rwlock_upgrade
- * only overrides CoRwlock fairness if there are no concurrent readers, so
- * another writer might run while @qemu_co_rwlock_upgrade blocks.
+ * Note that if the lock cannot be upgraded immediately, control is transferred
+ * to the caller of the current coroutine; another writer might run while
+ * @qemu_co_rwlock_upgrade blocks.
  */
 void qemu_co_rwlock_upgrade(CoRwlock *lock);
 
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
     trace_qemu_co_mutex_unlock_return(mutex, self);
 }
 
+struct CoRwTicket {
+    bool read;
+    Coroutine *co;
+    QSIMPLEQ_ENTRY(CoRwTicket) next;
+};
+
 void qemu_co_rwlock_init(CoRwlock *lock)
 {
-    memset(lock, 0, sizeof(*lock));
-    qemu_co_queue_init(&lock->queue);
     qemu_co_mutex_init(&lock->mutex);
+    lock->owners = 0;
+    QSIMPLEQ_INIT(&lock->tickets);
+}
+
+/* Releases the internal CoMutex. */
+static void qemu_co_rwlock_maybe_wake_one(CoRwlock *lock)
+{
+    CoRwTicket *tkt = QSIMPLEQ_FIRST(&lock->tickets);
+    Coroutine *co = NULL;
+
+    /*
+     * Setting lock->owners here prevents rdlock and wrlock from
+     * sneaking in between unlock and wake.
+     */
+
+    if (tkt) {
+        if (tkt->read) {
+            if (lock->owners >= 0) {
+                lock->owners++;
+                co = tkt->co;
+            }
+        } else {
+            if (lock->owners == 0) {
+                lock->owners = -1;
+                co = tkt->co;
+            }
+        }
+    }
+
+    if (co) {
+        QSIMPLEQ_REMOVE_HEAD(&lock->tickets, next);
+        qemu_co_mutex_unlock(&lock->mutex);
+        aio_co_wake(co);
+    } else {
+        qemu_co_mutex_unlock(&lock->mutex);
+    }
 }
 
 void qemu_co_rwlock_rdlock(CoRwlock *lock)
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock)
 
     qemu_co_mutex_lock(&lock->mutex);
     /* For fairness, wait if a writer is in line. */
-    while (lock->pending_writer) {
-        qemu_co_queue_wait(&lock->queue, &lock->mutex);
-    }
-    lock->reader++;
-    qemu_co_mutex_unlock(&lock->mutex);
-
-    /* The rest of the read-side critical section is run without the mutex. */
-    self->locks_held++;
-}
-
-void qemu_co_rwlock_unlock(CoRwlock *lock)
-{
-    Coroutine *self = qemu_coroutine_self();
-
-    assert(qemu_in_coroutine());
-    if (!lock->reader) {
-        /* The critical section started in qemu_co_rwlock_wrlock. */
-        qemu_co_queue_restart_all(&lock->queue);
+    if (lock->owners == 0 || (lock->owners > 0 && QSIMPLEQ_EMPTY(&lock->tickets))) {
+        lock->owners++;
+        qemu_co_mutex_unlock(&lock->mutex);
     } else {
-        self->locks_held--;
+        CoRwTicket my_ticket = { true, self };
 
+        QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
+        qemu_co_mutex_unlock(&lock->mutex);
+        qemu_coroutine_yield();
+        assert(lock->owners >= 1);
+
+        /* Possibly wake another reader, which will wake the next in line. */
         qemu_co_mutex_lock(&lock->mutex);
-        lock->reader--;
-        assert(lock->reader >= 0);
-        /* Wakeup only one waiting writer */
-        if (!lock->reader) {
-            qemu_co_queue_next(&lock->queue);
-        }
+        qemu_co_rwlock_maybe_wake_one(lock);
     }
-    qemu_co_mutex_unlock(&lock->mutex);
+
+    self->locks_held++;
+}
+
+void qemu_co_rwlock_unlock(CoRwlock *lock)
+{
+    Coroutine *self = qemu_coroutine_self();
+
+    assert(qemu_in_coroutine());
+    self->locks_held--;
+
+    qemu_co_mutex_lock(&lock->mutex);
+    if (lock->owners > 0) {
+        lock->owners--;
+    } else {
+        assert(lock->owners == -1);
+        lock->owners = 0;
+    }
+
+    qemu_co_rwlock_maybe_wake_one(lock);
 }
 
 void qemu_co_rwlock_downgrade(CoRwlock *lock)
 {
-    Coroutine *self = qemu_coroutine_self();
+    qemu_co_mutex_lock(&lock->mutex);
+    assert(lock->owners == -1);
+    lock->owners = 1;
 
-    /* lock->mutex critical section started in qemu_co_rwlock_wrlock or
-     * qemu_co_rwlock_upgrade.
-     */
-    assert(lock->reader == 0);
-    lock->reader++;
-    qemu_co_mutex_unlock(&lock->mutex);
-
-    /* The rest of the read-side critical section is run without the mutex. */
-    self->locks_held++;
+    /* Possibly wake another reader, which will wake the next in line. */
+    qemu_co_rwlock_maybe_wake_one(lock);
 }
 
 void qemu_co_rwlock_wrlock(CoRwlock *lock)
 {
+    Coroutine *self = qemu_coroutine_self();
+
     qemu_co_mutex_lock(&lock->mutex);
-    lock->pending_writer++;
-    while (lock->reader) {
-        qemu_co_queue_wait(&lock->queue, &lock->mutex);
+    if (lock->owners == 0) {
+        lock->owners = -1;
+        qemu_co_mutex_unlock(&lock->mutex);
+    } else {
+        CoRwTicket my_ticket = { false, qemu_coroutine_self() };
+
+        QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
+        qemu_co_mutex_unlock(&lock->mutex);
+        qemu_coroutine_yield();
+        assert(lock->owners == -1);
     }
-    lock->pending_writer--;
 
-    /* The rest of the write-side critical section is run with
-     * the mutex taken, so that lock->reader remains zero.
-     * There is no need to update self->locks_held.
-     */
+    self->locks_held++;
 }
 
 void qemu_co_rwlock_upgrade(CoRwlock *lock)
 {
-    Coroutine *self = qemu_coroutine_self();
-
     qemu_co_mutex_lock(&lock->mutex);
-    assert(lock->reader > 0);
-    lock->reader--;
-    lock->pending_writer++;
-    while (lock->reader) {
-        qemu_co_queue_wait(&lock->queue, &lock->mutex);
+    assert(lock->owners > 0);
+    /* For fairness, wait if a writer is in line. */
+    if (lock->owners == 1 && QSIMPLEQ_EMPTY(&lock->tickets)) {
+        lock->owners = -1;
+        qemu_co_mutex_unlock(&lock->mutex);
+    } else {
+        CoRwTicket my_ticket = { false, qemu_coroutine_self() };
+
+        lock->owners--;
+        QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
+        qemu_co_rwlock_maybe_wake_one(lock);
+        qemu_coroutine_yield();
+        assert(lock->owners == -1);
     }
-    lock->pending_writer--;
-
-    /* The rest of the write-side critical section is run with
-     * the mutex taken, similar to qemu_co_rwlock_wrlock. Do
-     * not account for the lock twice in self->locks_held.
-     */
-    self->locks_held--;
 }
--
2.30.2
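To make the downgrade semantics concrete, a minimal caller-side sketch
(not part of the patch) of how the reimplemented lock is used:

    qemu_co_rwlock_wrlock(&lock);      /* exclusive: owners == -1 */
    /* ... modify shared state ... */
    qemu_co_rwlock_downgrade(&lock);   /* become one reader (owners == 1);
                                          queued readers at the head of the
                                          ticket list are woken in turn */
    /* ... keep reading shared state ... */
    qemu_co_rwlock_unlock(&lock);      /* may hand the lock to a queued writer */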
Deleted patch

From: Paolo Bonzini <pbonzini@redhat.com>

Test that rwlock upgrade is fair, and that readers go back to sleep if
a writer is in line.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20210325112941.365238-6-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/unit/test-coroutine.c | 62 +++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/tests/unit/test-coroutine.c b/tests/unit/test-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/test-coroutine.c
+++ b/tests/unit/test-coroutine.c
@@ -XXX,XX +XXX,XX @@ static void test_co_mutex_lockable(void)
     g_assert(QEMU_MAKE_LOCKABLE(null_pointer) == NULL);
 }
 
+static CoRwlock rwlock;
+
+/* Test that readers are properly sent back to the queue when upgrading,
+ * even if they are the sole readers. The test scenario is as follows:
+ *
+ * | c1         | c2         |
+ * |------------+------------|
+ * | rdlock     |            |
+ * | yield      |            |
+ * |            | wrlock     |
+ * |            | <queued>   |
+ * | upgrade    |            |
+ * | <queued>   | <dequeued> |
+ * |            | unlock     |
+ * | <dequeued> |            |
+ * | unlock     |            |
+ */
+
+static void coroutine_fn rwlock_yield_upgrade(void *opaque)
+{
+    qemu_co_rwlock_rdlock(&rwlock);
+    qemu_coroutine_yield();
+
+    qemu_co_rwlock_upgrade(&rwlock);
+    qemu_co_rwlock_unlock(&rwlock);
+
+    *(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_wrlock_yield(void *opaque)
+{
+    qemu_co_rwlock_wrlock(&rwlock);
+    qemu_coroutine_yield();
+
+    qemu_co_rwlock_unlock(&rwlock);
+    *(bool *)opaque = true;
+}
+
+static void test_co_rwlock_upgrade(void)
+{
+    bool c1_done = false;
+    bool c2_done = false;
+    Coroutine *c1, *c2;
+
+    qemu_co_rwlock_init(&rwlock);
+    c1 = qemu_coroutine_create(rwlock_yield_upgrade, &c1_done);
+    c2 = qemu_coroutine_create(rwlock_wrlock_yield, &c2_done);
+
+    qemu_coroutine_enter(c1);
+    qemu_coroutine_enter(c2);
+
+    /* c1 now should go to sleep. */
+    qemu_coroutine_enter(c1);
+    g_assert(!c1_done);
+
+    qemu_coroutine_enter(c2);
+    g_assert(c1_done);
+    g_assert(c2_done);
+}
+
 /*
  * Check that creation, enter, and return work
  */
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/basic/order", test_order);
     g_test_add_func("/locking/co-mutex", test_co_mutex);
     g_test_add_func("/locking/co-mutex/lockable", test_co_mutex_lockable);
+    g_test_add_func("/locking/co-rwlock/upgrade", test_co_rwlock_upgrade);
     if (g_test_perf()) {
         g_test_add_func("/perf/lifecycle", perf_lifecycle);
         g_test_add_func("/perf/nesting", perf_nesting);
--
2.30.2
Deleted patch

From: David Edmondson <david.edmondson@oracle.com>

Test that downgrading an rwlock does not result in a failure to
schedule coroutines queued on the rwlock.

The diagram associated with test_co_rwlock_downgrade() describes the
intended behaviour, but what was observed previously corresponds to:

  | c1     | c2         | c3         | c4       |
  |--------+------------+------------+----------|
  | rdlock |            |            |          |
  | yield  |            |            |          |
  |        | wrlock     |            |          |
  |        | <queued>   |            |          |
  |        |            | rdlock     |          |
  |        |            | <queued>   |          |
  |        |            |            | wrlock   |
  |        |            |            | <queued> |
  | unlock |            |            |          |
  | yield  |            |            |          |
  |        | <dequeued> |            |          |
  |        | downgrade  |            |          |
  |        | ...        |            |          |
  |        | unlock     |            |          |
  |        |            | <dequeued> |          |
  |        |            | <queued>   |          |

This results in a failure...

  ERROR:../tests/test-coroutine.c:369:test_co_rwlock_downgrade: assertion failed: (c3_done)
  Bail out! ERROR:../tests/test-coroutine.c:369:test_co_rwlock_downgrade: assertion failed: (c3_done)

...as a result of the c3 coroutine failing to run to completion.

Signed-off-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20210325112941.365238-7-pbonzini@redhat.com
Message-Id: <20210309144015.557477-5-david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/unit/test-coroutine.c | 99 +++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

diff --git a/tests/unit/test-coroutine.c b/tests/unit/test-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/test-coroutine.c
+++ b/tests/unit/test-coroutine.c
@@ -XXX,XX +XXX,XX @@ static void test_co_rwlock_upgrade(void)
     g_assert(c2_done);
 }
 
+static void coroutine_fn rwlock_rdlock_yield(void *opaque)
+{
+    qemu_co_rwlock_rdlock(&rwlock);
+    qemu_coroutine_yield();
+
+    qemu_co_rwlock_unlock(&rwlock);
+    qemu_coroutine_yield();
+
+    *(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_wrlock_downgrade(void *opaque)
+{
+    qemu_co_rwlock_wrlock(&rwlock);
+
+    qemu_co_rwlock_downgrade(&rwlock);
+    qemu_co_rwlock_unlock(&rwlock);
+    *(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_rdlock(void *opaque)
+{
+    qemu_co_rwlock_rdlock(&rwlock);
+
+    qemu_co_rwlock_unlock(&rwlock);
+    *(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_wrlock(void *opaque)
+{
+    qemu_co_rwlock_wrlock(&rwlock);
+
+    qemu_co_rwlock_unlock(&rwlock);
+    *(bool *)opaque = true;
+}
+
+/*
+ * Check that downgrading a reader-writer lock does not cause a hang.
+ *
+ * Four coroutines are used to produce a situation where there are
+ * both reader and writer hopefuls waiting to acquire an rwlock that
+ * is held by a reader.
+ *
+ * The correct sequence of operations we aim to provoke can be
+ * represented as:
+ *
+ * | c1     | c2         | c3         | c4         |
+ * |--------+------------+------------+------------|
+ * | rdlock |            |            |            |
+ * | yield  |            |            |            |
+ * |        | wrlock     |            |            |
+ * |        | <queued>   |            |            |
+ * |        |            | rdlock     |            |
+ * |        |            | <queued>   |            |
+ * |        |            |            | wrlock     |
+ * |        |            |            | <queued>   |
+ * | unlock |            |            |            |
+ * | yield  |            |            |            |
+ * |        | <dequeued> |            |            |
+ * |        | downgrade  |            |            |
+ * |        |            | <dequeued> |            |
+ * |        |            | unlock     |            |
+ * |        | ...        |            |            |
+ * |        | unlock     |            |            |
+ * |        |            |            | <dequeued> |
+ * |        |            |            | unlock     |
+ */
+static void test_co_rwlock_downgrade(void)
+{
+    bool c1_done = false;
+    bool c2_done = false;
+    bool c3_done = false;
+    bool c4_done = false;
+    Coroutine *c1, *c2, *c3, *c4;
+
+    qemu_co_rwlock_init(&rwlock);
+
+    c1 = qemu_coroutine_create(rwlock_rdlock_yield, &c1_done);
+    c2 = qemu_coroutine_create(rwlock_wrlock_downgrade, &c2_done);
+    c3 = qemu_coroutine_create(rwlock_rdlock, &c3_done);
+    c4 = qemu_coroutine_create(rwlock_wrlock, &c4_done);
+
+    qemu_coroutine_enter(c1);
+    qemu_coroutine_enter(c2);
+    qemu_coroutine_enter(c3);
+    qemu_coroutine_enter(c4);
+
+    qemu_coroutine_enter(c1);
+
+    g_assert(c2_done);
+    g_assert(c3_done);
+    g_assert(c4_done);
+
+    qemu_coroutine_enter(c1);
+
+    g_assert(c1_done);
+}
+
 /*
  * Check that creation, enter, and return work
  */
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/locking/co-mutex", test_co_mutex);
     g_test_add_func("/locking/co-mutex/lockable", test_co_mutex_lockable);
     g_test_add_func("/locking/co-rwlock/upgrade", test_co_rwlock_upgrade);
+    g_test_add_func("/locking/co-rwlock/downgrade", test_co_rwlock_downgrade);
     if (g_test_perf()) {
         g_test_add_func("/perf/lifecycle", perf_lifecycle);
         g_test_add_func("/perf/nesting", perf_nesting);
--
2.30.2

New patch

From: Philippe Mathieu-Daudé <philmd@redhat.com>

When the NVMe block driver was introduced (see commit bdd6a90a9e5,
January 2018), the Linux VFIO_IOMMU_MAP_DMA ioctl returned only
-ENOMEM in case of error. The driver handled that error path
correctly, recycling its volatile IOVA mappings.

To fix CVE-2019-3882, Linux commit 492855939bdb ("vfio/type1: Limit
DMA mappings per container", April 2019) added the -ENOSPC error to
signal that the user has exhausted the DMA mappings available for a
container.

The block driver then started to misbehave:

  qemu-system-x86_64: VFIO_MAP_DMA failed: No space left on device
  (qemu)
  (qemu) info status
  VM status: paused (io-error)
  (qemu) c
  VFIO_MAP_DMA failed: No space left on device
  (qemu) c
  VFIO_MAP_DMA failed: No space left on device

(The VM is not resumable from here, hence stuck.)

Fix this by handling the new -ENOSPC error (returned when DMA mappings
are exhausted) exactly like the existing -ENOMEM error, so that the
behavior is unchanged on old kernels where the CVE-2019-3882 fix is
not present.

An easy way to reproduce this bug is to restrict the DMA mapping
limit (65535 by default) when loading the VFIO IOMMU module:

  # modprobe vfio_iommu_type1 dma_entry_limit=666

Cc: qemu-stable@nongnu.org
Cc: Fam Zheng <fam@euphon.net>
Cc: Maxim Levitsky <mlevitsk@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Reported-by: Michal Prívozník <mprivozn@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20210723195843.1032825-1-philmd@redhat.com
Fixes: bdd6a90a9e5 ("block: Add VFIO based NVMe driver")
Buglink: https://bugs.launchpad.net/qemu/+bug/1863333
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/65
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ try_map:
     r = qemu_vfio_dma_map(s->vfio,
                           qiov->iov[i].iov_base,
                           len, true, &iova);
+    if (r == -ENOSPC) {
+        /*
+         * In addition to the -ENOMEM error, the VFIO_IOMMU_MAP_DMA
+         * ioctl returns -ENOSPC to signal that the user has exhausted
+         * the DMA mappings available for a container, since Linux kernel
+         * commit 492855939bdb ("vfio/type1: Limit DMA mappings per
+         * container", April 2019, see CVE-2019-3882).
+         *
+         * This block driver already handles this error path by checking
+         * for the -ENOMEM error, so we directly replace -ENOSPC with
+         * -ENOMEM. Besides, -ENOSPC has a specific meaning for blockdev
+         * coroutines: it triggers BLOCKDEV_ON_ERROR_ENOSPC and
+         * BLOCK_ERROR_ACTION_STOP which stops the VM, asking the operator
+         * to add more storage to the blockdev. Not something we can do
+         * easily with an IOMMU :)
+         */
+        r = -ENOMEM;
+    }
     if (r == -ENOMEM && retry) {
+        /*
+         * We exhausted the DMA mappings available for our container:
+         * recycle the volatile IOVA mappings.
+         */
         retry = false;
         trace_nvme_dma_flush_queue_wait(s);
         if (s->dma_map_count) {
--
2.31.1
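For reference, the errno in question originates in the VFIO type1 IOMMU
backend. A minimal userspace sketch of the call that can fail this way,
using the standard API from <linux/vfio.h> (the container setup is elided,
and fd/buffer/IOVA values are placeholders passed in by the caller):

    #include <errno.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Map one page-aligned buffer at a chosen I/O virtual address. */
    static int map_one(int container_fd, void *buf, uint64_t iova, uint64_t len)
    {
        struct vfio_iommu_type1_dma_map map = {
            .argsz = sizeof(map),
            .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
            .vaddr = (uintptr_t)buf,
            .iova  = iova,
            .size  = len,
        };

        if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map) < 0) {
            /* With the CVE-2019-3882 fix, errno is ENOSPC once
             * dma_entry_limit mappings exist in the container;
             * older kernels reported ENOMEM for all failures. */
            return -errno;
        }
        return 0;
    }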