Old cover letter:

The following changes since commit 6d40ce00c1166c317e298ad82ecf10e650c4f87d:

  Update version for v6.0.0-rc1 release (2021-03-30 18:19:07 +0100)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to b6489ac06695e257ea0a9841364577e247fdee30:

  test-coroutine: Add rwlock downgrade test (2021-03-31 10:44:21 +0100)

----------------------------------------------------------------
Pull request

A fix for VDI image files and more generally for CoRwlock.

----------------------------------------------------------------

David Edmondson (4):
  block/vdi: When writing new bmap entry fails, don't leak the buffer
  block/vdi: Don't assume that blocks are larger than VdiHeader
  coroutine-lock: Store the coroutine in the CoWaitRecord only once
  test-coroutine: Add rwlock downgrade test

Paolo Bonzini (2):
  coroutine-lock: Reimplement CoRwlock to fix downgrade bug
  test-coroutine: Add rwlock upgrade test

 include/qemu/coroutine.h    |  17 ++--
 block/vdi.c                 |  11 ++-
 tests/unit/test-coroutine.c | 161 +++++++++++++++++++++++++++++++++++
 util/qemu-coroutine-lock.c  | 165 +++++++++++++++++++++++-------------
 4 files changed, 282 insertions(+), 72 deletions(-)

--
2.30.2

New cover letter:

The following changes since commit a2376507f615495b1d16685449ce0ea78c2caf9d:

  Merge remote-tracking branch 'remotes/bonzini-gitlab/tags/for-upstream' into staging (2021-07-24 11:04:57 +0100)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 15a730e7a3aaac180df72cd5730e0617bcf44a5a:

  block/nvme: Fix VFIO_MAP_DMA failed: No space left on device (2021-07-26 09:38:12 +0100)

----------------------------------------------------------------
Pull request

Phil's block/nvme.c ENOSPC fix for newer Linux kernels that return this errno.

----------------------------------------------------------------

Philippe Mathieu-Daudé (1):
  block/nvme: Fix VFIO_MAP_DMA failed: No space left on device

 block/nvme.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

--
2.31.1
Deleted patch

From: David Edmondson <david.edmondson@oracle.com>

If a new bitmap entry is allocated, requiring the entire block to be
written, avoid leaking the buffer allocated for the block should the
write fail.

Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Message-id: 20210325112941.365238-2-pbonzini@redhat.com
Message-Id: <20210309144015.557477-2-david.edmondson@oracle.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vdi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/vdi.c b/block/vdi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -XXX,XX +XXX,XX @@ nonallocating_write:

     logout("finished data write\n");
     if (ret < 0) {
+        g_free(block);
         return ret;
     }

--
2.30.2
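
The fix above is a single g_free() call, but it illustrates a general
error-path rule: every early return must release what the function has
already allocated. Below is a standalone sketch of the same pattern in
plain C rather than QEMU's g_malloc()/g_free(); write_block() and
update_block() are hypothetical names, not QEMU APIs.

    #include <stdlib.h>

    /* Assumed to return a negative value on error. */
    extern int write_block(void *buf, size_t len);

    int update_block(size_t block_size)
    {
        char *block = malloc(block_size);
        if (block == NULL) {
            return -1;
        }

        int ret = write_block(block, block_size);
        if (ret < 0) {
            free(block);    /* the fix: release the buffer before bailing out */
            return ret;
        }

        free(block);
        return 0;
    }
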
Deleted patch

From: David Edmondson <david.edmondson@oracle.com>

Given that the block size is read from the header of the VDI file, a
wide variety of sizes might be seen. Rather than re-using a block-sized
memory region when writing the VDI header, allocate an appropriately
sized buffer.

Signed-off-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Message-id: 20210325112941.365238-3-pbonzini@redhat.com
Message-Id: <20210309144015.557477-3-david.edmondson@oracle.com>
Acked-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vdi.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/vdi.c b/block/vdi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -XXX,XX +XXX,XX @@ nonallocating_write:

     if (block) {
         /* One or more new blocks were allocated. */
-        VdiHeader *header = (VdiHeader *) block;
+        VdiHeader *header;
         uint8_t *base;
         uint64_t offset;
         uint32_t n_sectors;

+        g_free(block);
+        header = g_malloc(sizeof(*header));
+
         logout("now writing modified header\n");
         assert(VDI_IS_ALLOCATED(bmap_first));
         *header = s->header;
         vdi_header_to_le(header);
-        ret = bdrv_pwrite(bs->file, 0, block, sizeof(VdiHeader));
-        g_free(block);
-        block = NULL;
+        ret = bdrv_pwrite(bs->file, 0, header, sizeof(*header));
+        g_free(header);

         if (ret < 0) {
             return ret;
--
2.30.2
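
The danger in the old code was not just wasted memory: the block
buffer's size comes from the guest image header, so nothing guarantees
it is at least sizeof(VdiHeader). A hedged standalone sketch of the
hazard and the fix follows; the Header type and the sizes are invented
for illustration, not the real VdiHeader layout.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct {
        uint8_t bytes[512];     /* stand-in for the real header layout */
    } Header;

    int main(void)
    {
        size_t block_size = 256;    /* guest-controlled, may be < sizeof(Header) */
        uint8_t *block = calloc(1, block_size);
        Header src = { { 0 } };

        /* Old pattern: treating "block" as a Header and writing
         * sizeof(Header) bytes through it would overrun the 256-byte
         * allocation. */

        /* New pattern: a buffer sized for the header itself. */
        Header *header = malloc(sizeof(*header));
        memcpy(header, &src, sizeof(*header));
        printf("header buffer: %zu bytes, block buffer: %zu bytes\n",
               sizeof(*header), block_size);

        free(header);
        free(block);
        return 0;
    }
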
Deleted patch

From: David Edmondson <david.edmondson@oracle.com>

When taking the slow path for mutex acquisition, set the coroutine
value in the CoWaitRecord in push_waiter(), rather than both there and
in the caller.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20210325112941.365238-4-pbonzini@redhat.com
Message-Id: <20210309144015.557477-4-david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 util/qemu-coroutine-lock.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
     unsigned old_handoff;

     trace_qemu_co_mutex_lock_entry(mutex, self);
-    w.co = self;
     push_waiter(mutex, &w);

     /* This is the "Responsibility Hand-Off" protocol; a lock() picks from
--
2.30.2
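
For context, push_waiter() itself records the waiting coroutine, which
is why the caller's store was redundant. A simplified outline of the
helper, close to but not necessarily verbatim the QEMU code:

    static void push_waiter(CoMutex *mutex, CoWaitRecord *w)
    {
        w->co = qemu_coroutine_self();    /* the single store of w->co */
        QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next);
    }
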
Deleted patch

From: Paolo Bonzini <pbonzini@redhat.com>

An invariant of the current rwlock is that if multiple coroutines hold a
reader lock, all must be runnable. The unlock implementation relies on
this, choosing to wake a single coroutine when the final read lock
holder exits the critical section, assuming that it will wake a
coroutine attempting to acquire a write lock.

The downgrade implementation violates this assumption by creating a
read-lock-owning coroutine that is exclusively runnable - any other
coroutines that are waiting to acquire a read lock are *not* made
runnable when the write lock holder converts its ownership to read
only.

More generally, the old implementation had lots of other fairness bugs.
The root cause of the bugs was that CoQueue would wake up readers even
if there were pending writers, and would wake up writers even if there
were readers. In that case, the coroutine would go back to sleep *at
the end* of the CoQueue, losing its place at the head of the line.

To fix this, keep the queue of waiters explicitly in the CoRwlock
instead of using CoQueue, and store for each whether it is a
potential reader or a writer. This way, downgrade can look at the
first queued coroutine and wake it only if it is a reader, causing
all other readers in line to be released in turn.

Reported-by: David Edmondson <david.edmondson@oracle.com>
Reviewed-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20210325112941.365238-5-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h   |  17 ++--
 util/qemu-coroutine-lock.c | 164 +++++++++++++++++++++++--------------
 2 files changed, 114 insertions(+), 67 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock);
 bool qemu_co_queue_empty(CoQueue *queue);


+typedef struct CoRwTicket CoRwTicket;
 typedef struct CoRwlock {
-    int pending_writer;
-    int reader;
     CoMutex mutex;
-    CoQueue queue;
+
+    /* Number of readers, or -1 if owned for writing. */
+    int owners;
+
+    /* Waiting coroutines. */
+    QSIMPLEQ_HEAD(, CoRwTicket) tickets;
 } CoRwlock;

 /**
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock);
 /**
  * Write Locks the CoRwlock from a reader. This is a bit more efficient than
  * @qemu_co_rwlock_unlock followed by a separate @qemu_co_rwlock_wrlock.
- * However, if the lock cannot be upgraded immediately, control is transferred
- * to the caller of the current coroutine. Also, @qemu_co_rwlock_upgrade
- * only overrides CoRwlock fairness if there are no concurrent readers, so
- * another writer might run while @qemu_co_rwlock_upgrade blocks.
+ * Note that if the lock cannot be upgraded immediately, control is transferred
+ * to the caller of the current coroutine; another writer might run while
+ * @qemu_co_rwlock_upgrade blocks.
  */
 void qemu_co_rwlock_upgrade(CoRwlock *lock);

diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
     trace_qemu_co_mutex_unlock_return(mutex, self);
 }

+struct CoRwTicket {
+    bool read;
+    Coroutine *co;
+    QSIMPLEQ_ENTRY(CoRwTicket) next;
+};
+
 void qemu_co_rwlock_init(CoRwlock *lock)
 {
-    memset(lock, 0, sizeof(*lock));
-    qemu_co_queue_init(&lock->queue);
     qemu_co_mutex_init(&lock->mutex);
+    lock->owners = 0;
+    QSIMPLEQ_INIT(&lock->tickets);
+}
+
+/* Releases the internal CoMutex. */
+static void qemu_co_rwlock_maybe_wake_one(CoRwlock *lock)
+{
+    CoRwTicket *tkt = QSIMPLEQ_FIRST(&lock->tickets);
+    Coroutine *co = NULL;
+
+    /*
+     * Setting lock->owners here prevents rdlock and wrlock from
+     * sneaking in between unlock and wake.
+     */
+
+    if (tkt) {
+        if (tkt->read) {
+            if (lock->owners >= 0) {
+                lock->owners++;
+                co = tkt->co;
+            }
+        } else {
+            if (lock->owners == 0) {
+                lock->owners = -1;
+                co = tkt->co;
+            }
+        }
+    }
+
+    if (co) {
+        QSIMPLEQ_REMOVE_HEAD(&lock->tickets, next);
+        qemu_co_mutex_unlock(&lock->mutex);
+        aio_co_wake(co);
+    } else {
+        qemu_co_mutex_unlock(&lock->mutex);
+    }
 }

 void qemu_co_rwlock_rdlock(CoRwlock *lock)
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock)

     qemu_co_mutex_lock(&lock->mutex);
     /* For fairness, wait if a writer is in line. */
-    while (lock->pending_writer) {
-        qemu_co_queue_wait(&lock->queue, &lock->mutex);
-    }
-    lock->reader++;
-    qemu_co_mutex_unlock(&lock->mutex);
-
-    /* The rest of the read-side critical section is run without the mutex. */
-    self->locks_held++;
-}
-
-void qemu_co_rwlock_unlock(CoRwlock *lock)
-{
-    Coroutine *self = qemu_coroutine_self();
-
-    assert(qemu_in_coroutine());
-    if (!lock->reader) {
-        /* The critical section started in qemu_co_rwlock_wrlock. */
-        qemu_co_queue_restart_all(&lock->queue);
+    if (lock->owners == 0 || (lock->owners > 0 && QSIMPLEQ_EMPTY(&lock->tickets))) {
+        lock->owners++;
+        qemu_co_mutex_unlock(&lock->mutex);
     } else {
-        self->locks_held--;
+        CoRwTicket my_ticket = { true, self };

+        QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
+        qemu_co_mutex_unlock(&lock->mutex);
+        qemu_coroutine_yield();
+        assert(lock->owners >= 1);
+
+        /* Possibly wake another reader, which will wake the next in line. */
         qemu_co_mutex_lock(&lock->mutex);
-        lock->reader--;
-        assert(lock->reader >= 0);
-        /* Wakeup only one waiting writer */
-        if (!lock->reader) {
-            qemu_co_queue_next(&lock->queue);
-        }
+        qemu_co_rwlock_maybe_wake_one(lock);
     }
-    qemu_co_mutex_unlock(&lock->mutex);
+
+    self->locks_held++;
+}
+
+void qemu_co_rwlock_unlock(CoRwlock *lock)
+{
+    Coroutine *self = qemu_coroutine_self();
+
+    assert(qemu_in_coroutine());
+    self->locks_held--;
+
+    qemu_co_mutex_lock(&lock->mutex);
+    if (lock->owners > 0) {
+        lock->owners--;
+    } else {
+        assert(lock->owners == -1);
+        lock->owners = 0;
+    }
+
+    qemu_co_rwlock_maybe_wake_one(lock);
 }

 void qemu_co_rwlock_downgrade(CoRwlock *lock)
 {
-    Coroutine *self = qemu_coroutine_self();
+    qemu_co_mutex_lock(&lock->mutex);
+    assert(lock->owners == -1);
+    lock->owners = 1;

-    /* lock->mutex critical section started in qemu_co_rwlock_wrlock or
-     * qemu_co_rwlock_upgrade.
-     */
-    assert(lock->reader == 0);
-    lock->reader++;
-    qemu_co_mutex_unlock(&lock->mutex);
-
-    /* The rest of the read-side critical section is run without the mutex. */
-    self->locks_held++;
+    /* Possibly wake another reader, which will wake the next in line. */
+    qemu_co_rwlock_maybe_wake_one(lock);
 }

 void qemu_co_rwlock_wrlock(CoRwlock *lock)
 {
+    Coroutine *self = qemu_coroutine_self();
+
     qemu_co_mutex_lock(&lock->mutex);
-    lock->pending_writer++;
-    while (lock->reader) {
-        qemu_co_queue_wait(&lock->queue, &lock->mutex);
+    if (lock->owners == 0) {
+        lock->owners = -1;
+        qemu_co_mutex_unlock(&lock->mutex);
+    } else {
+        CoRwTicket my_ticket = { false, qemu_coroutine_self() };
+
+        QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
+        qemu_co_mutex_unlock(&lock->mutex);
+        qemu_coroutine_yield();
+        assert(lock->owners == -1);
     }
-    lock->pending_writer--;

-    /* The rest of the write-side critical section is run with
-     * the mutex taken, so that lock->reader remains zero.
-     * There is no need to update self->locks_held.
-     */
+    self->locks_held++;
 }

 void qemu_co_rwlock_upgrade(CoRwlock *lock)
 {
-    Coroutine *self = qemu_coroutine_self();
-
     qemu_co_mutex_lock(&lock->mutex);
-    assert(lock->reader > 0);
-    lock->reader--;
-    lock->pending_writer++;
-    while (lock->reader) {
-        qemu_co_queue_wait(&lock->queue, &lock->mutex);
+    assert(lock->owners > 0);
+    /* For fairness, wait if a writer is in line. */
+    if (lock->owners == 1 && QSIMPLEQ_EMPTY(&lock->tickets)) {
+        lock->owners = -1;
+        qemu_co_mutex_unlock(&lock->mutex);
+    } else {
+        CoRwTicket my_ticket = { false, qemu_coroutine_self() };
+
+        lock->owners--;
+        QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
+        qemu_co_rwlock_maybe_wake_one(lock);
+        qemu_coroutine_yield();
+        assert(lock->owners == -1);
     }
-    lock->pending_writer--;
-
-    /* The rest of the write-side critical section is run with
-     * the mutex taken, similar to qemu_co_rwlock_wrlock. Do
-     * not account for the lock twice in self->locks_held.
-     */
-    self->locks_held--;
 }
--
2.30.2
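
At the API level nothing changes for callers. A hedged usage sketch of
the functions reworked here, against QEMU's coroutine API; the shared
counter and the two coroutine bodies are invented for illustration:

    #include "qemu/osdep.h"
    #include "qemu/coroutine.h"

    static CoRwlock table_lock;     /* qemu_co_rwlock_init(&table_lock) at setup */
    static int shared_value;

    static void coroutine_fn reader_path(void *opaque)
    {
        (void)opaque;
        qemu_co_rwlock_rdlock(&table_lock);     /* owners > 0: shared */
        int v = shared_value;                   /* read-side critical section */
        (void)v;
        qemu_co_rwlock_unlock(&table_lock);
    }

    static void coroutine_fn writer_path(void *opaque)
    {
        (void)opaque;
        qemu_co_rwlock_wrlock(&table_lock);     /* owners == -1: exclusive */
        shared_value++;                         /* write-side critical section */

        /* Keep read access while letting queued readers run in FIFO order. */
        qemu_co_rwlock_downgrade(&table_lock);  /* owners becomes 1 */
        int v = shared_value;
        (void)v;
        qemu_co_rwlock_unlock(&table_lock);
    }
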
Deleted patch

From: Paolo Bonzini <pbonzini@redhat.com>

Test that rwlock upgrade is fair, and that readers go back to sleep if
a writer is in line.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20210325112941.365238-6-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/unit/test-coroutine.c | 62 +++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/tests/unit/test-coroutine.c b/tests/unit/test-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/test-coroutine.c
+++ b/tests/unit/test-coroutine.c
@@ -XXX,XX +XXX,XX @@ static void test_co_mutex_lockable(void)
     g_assert(QEMU_MAKE_LOCKABLE(null_pointer) == NULL);
 }

+static CoRwlock rwlock;
+
+/* Test that readers are properly sent back to the queue when upgrading,
+ * even if they are the sole readers. The test scenario is as follows:
+ *
+ *
+ * | c1           | c2         |
+ * |--------------+------------+
+ * | rdlock       |            |
+ * | yield        |            |
+ * |              | wrlock     |
+ * |              | <queued>   |
+ * | upgrade      |            |
+ * | <queued>     | <dequeued> |
+ * |              | unlock     |
+ * | <dequeued>   |            |
+ * | unlock       |            |
+ */
+
+static void coroutine_fn rwlock_yield_upgrade(void *opaque)
+{
+    qemu_co_rwlock_rdlock(&rwlock);
+    qemu_coroutine_yield();
+
+    qemu_co_rwlock_upgrade(&rwlock);
+    qemu_co_rwlock_unlock(&rwlock);
+
+    *(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_wrlock_yield(void *opaque)
+{
+    qemu_co_rwlock_wrlock(&rwlock);
+    qemu_coroutine_yield();
+
+    qemu_co_rwlock_unlock(&rwlock);
+    *(bool *)opaque = true;
+}
+
+static void test_co_rwlock_upgrade(void)
+{
+    bool c1_done = false;
+    bool c2_done = false;
+    Coroutine *c1, *c2;
+
+    qemu_co_rwlock_init(&rwlock);
+    c1 = qemu_coroutine_create(rwlock_yield_upgrade, &c1_done);
+    c2 = qemu_coroutine_create(rwlock_wrlock_yield, &c2_done);
+
+    qemu_coroutine_enter(c1);
+    qemu_coroutine_enter(c2);
+
+    /* c1 now should go to sleep. */
+    qemu_coroutine_enter(c1);
+    g_assert(!c1_done);
+
+    qemu_coroutine_enter(c2);
+    g_assert(c1_done);
+    g_assert(c2_done);
+}
+
 /*
  * Check that creation, enter, and return work
  */
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/basic/order", test_order);
     g_test_add_func("/locking/co-mutex", test_co_mutex);
     g_test_add_func("/locking/co-mutex/lockable", test_co_mutex_lockable);
+    g_test_add_func("/locking/co-rwlock/upgrade", test_co_rwlock_upgrade);
     if (g_test_perf()) {
         g_test_add_func("/perf/lifecycle", perf_lifecycle);
         g_test_add_func("/perf/nesting", perf_nesting);
--
2.30.2
Deleted patch

From: David Edmondson <david.edmondson@oracle.com>

Test that downgrading an rwlock does not result in a failure to
schedule coroutines queued on the rwlock.

The diagram associated with test_co_rwlock_downgrade() describes the
intended behaviour, but what was observed previously corresponds to:

| c1     | c2         | c3         | c4       |
|--------+------------+------------+----------|
| rdlock |            |            |          |
| yield  |            |            |          |
|        | wrlock     |            |          |
|        | <queued>   |            |          |
|        |            | rdlock     |          |
|        |            | <queued>   |          |
|        |            |            | wrlock   |
|        |            |            | <queued> |
| unlock |            |            |          |
| yield  |            |            |          |
|        | <dequeued> |            |          |
|        | downgrade  |            |          |
|        | ...        |            |          |
|        | unlock     |            |          |
|        |            | <dequeued> |          |
|        |            | <queued>   |          |

This results in a failure...

  ERROR:../tests/test-coroutine.c:369:test_co_rwlock_downgrade: assertion failed: (c3_done)
  Bail out! ERROR:../tests/test-coroutine.c:369:test_co_rwlock_downgrade: assertion failed: (c3_done)

...as a result of the c3 coroutine failing to run to completion.

Signed-off-by: David Edmondson <david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 20210325112941.365238-7-pbonzini@redhat.com
Message-Id: <20210309144015.557477-5-david.edmondson@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/unit/test-coroutine.c | 99 +++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

diff --git a/tests/unit/test-coroutine.c b/tests/unit/test-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/test-coroutine.c
+++ b/tests/unit/test-coroutine.c
@@ -XXX,XX +XXX,XX @@ static void test_co_rwlock_upgrade(void)
     g_assert(c2_done);
 }

+static void coroutine_fn rwlock_rdlock_yield(void *opaque)
+{
+    qemu_co_rwlock_rdlock(&rwlock);
+    qemu_coroutine_yield();
+
+    qemu_co_rwlock_unlock(&rwlock);
+    qemu_coroutine_yield();
+
+    *(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_wrlock_downgrade(void *opaque)
+{
+    qemu_co_rwlock_wrlock(&rwlock);
+
+    qemu_co_rwlock_downgrade(&rwlock);
+    qemu_co_rwlock_unlock(&rwlock);
+    *(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_rdlock(void *opaque)
+{
+    qemu_co_rwlock_rdlock(&rwlock);
+
+    qemu_co_rwlock_unlock(&rwlock);
+    *(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_wrlock(void *opaque)
+{
+    qemu_co_rwlock_wrlock(&rwlock);
+
+    qemu_co_rwlock_unlock(&rwlock);
+    *(bool *)opaque = true;
+}
+
+/*
+ * Check that downgrading a reader-writer lock does not cause a hang.
+ *
+ * Four coroutines are used to produce a situation where there are
+ * both reader and writer hopefuls waiting to acquire an rwlock that
+ * is held by a reader.
+ *
+ * The correct sequence of operations we aim to provoke can be
+ * represented as:
+ *
+ * | c1     | c2         | c3         | c4         |
+ * |--------+------------+------------+------------|
+ * | rdlock |            |            |            |
+ * | yield  |            |            |            |
+ * |        | wrlock     |            |            |
+ * |        | <queued>   |            |            |
+ * |        |            | rdlock     |            |
+ * |        |            | <queued>   |            |
+ * |        |            |            | wrlock     |
+ * |        |            |            | <queued>   |
+ * | unlock |            |            |            |
+ * | yield  |            |            |            |
+ * |        | <dequeued> |            |            |
+ * |        | downgrade  |            |            |
+ * |        |            | <dequeued> |            |
+ * |        |            | unlock     |            |
+ * |        | ...        |            |            |
+ * |        | unlock     |            |            |
+ * |        |            |            | <dequeued> |
+ * |        |            |            | unlock     |
+ */
+static void test_co_rwlock_downgrade(void)
+{
+    bool c1_done = false;
+    bool c2_done = false;
+    bool c3_done = false;
+    bool c4_done = false;
+    Coroutine *c1, *c2, *c3, *c4;
+
+    qemu_co_rwlock_init(&rwlock);
+
+    c1 = qemu_coroutine_create(rwlock_rdlock_yield, &c1_done);
+    c2 = qemu_coroutine_create(rwlock_wrlock_downgrade, &c2_done);
+    c3 = qemu_coroutine_create(rwlock_rdlock, &c3_done);
+    c4 = qemu_coroutine_create(rwlock_wrlock, &c4_done);
+
+    qemu_coroutine_enter(c1);
+    qemu_coroutine_enter(c2);
+    qemu_coroutine_enter(c3);
+    qemu_coroutine_enter(c4);
+
+    qemu_coroutine_enter(c1);
+
+    g_assert(c2_done);
+    g_assert(c3_done);
+    g_assert(c4_done);
+
+    qemu_coroutine_enter(c1);
+
+    g_assert(c1_done);
+}
+
 /*
  * Check that creation, enter, and return work
  */
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     g_test_add_func("/locking/co-mutex", test_co_mutex);
     g_test_add_func("/locking/co-mutex/lockable", test_co_mutex_lockable);
     g_test_add_func("/locking/co-rwlock/upgrade", test_co_rwlock_upgrade);
+    g_test_add_func("/locking/co-rwlock/downgrade", test_co_rwlock_downgrade);
     if (g_test_perf()) {
         g_test_add_func("/perf/lifecycle", perf_lifecycle);
         g_test_add_func("/perf/nesting", perf_nesting);
--
2.30.2

From: Philippe Mathieu-Daudé <philmd@redhat.com>

When the NVMe block driver was introduced (see commit bdd6a90a9e5,
January 2018), Linux VFIO_IOMMU_MAP_DMA ioctl was only returning
-ENOMEM in case of error. The driver was correctly handling the
error path to recycle its volatile IOVA mappings.

To fix CVE-2019-3882, Linux commit 492855939bdb ("vfio/type1: Limit
DMA mappings per container", April 2019) added the -ENOSPC error to
signal the user exhausted the DMA mappings available for a container.

The block driver started to misbehave:

  qemu-system-x86_64: VFIO_MAP_DMA failed: No space left on device
  (qemu)
  (qemu) info status
  VM status: paused (io-error)
  (qemu) c
  VFIO_MAP_DMA failed: No space left on device
  (qemu) c
  VFIO_MAP_DMA failed: No space left on device

(The VM is not resumable from here, hence stuck.)

Fix by handling the new -ENOSPC error (when DMA mappings are
exhausted) without any distinction from the current -ENOMEM error,
so we don't change the behavior on old kernels where the CVE-2019-3882
fix is not present.

An easy way to reproduce this bug is to restrict the DMA mapping
limit (65535 by default) when loading the VFIO IOMMU module:

  # modprobe vfio_iommu_type1 dma_entry_limit=666

Cc: qemu-stable@nongnu.org
Cc: Fam Zheng <fam@euphon.net>
Cc: Maxim Levitsky <mlevitsk@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Reported-by: Michal Prívozník <mprivozn@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20210723195843.1032825-1-philmd@redhat.com
Fixes: bdd6a90a9e5 ("block: Add VFIO based NVMe driver")
Buglink: https://bugs.launchpad.net/qemu/+bug/1863333
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/65
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ try_map:
         r = qemu_vfio_dma_map(s->vfio,
                               qiov->iov[i].iov_base,
                               len, true, &iova);
+        if (r == -ENOSPC) {
+            /*
+             * In addition to the -ENOMEM error, the VFIO_IOMMU_MAP_DMA
+             * ioctl returns -ENOSPC to signal the user exhausted the DMA
+             * mappings available for a container since Linux kernel commit
+             * 492855939bdb ("vfio/type1: Limit DMA mappings per container",
+             * April 2019, see CVE-2019-3882).
+             *
+             * This block driver already handles this error path by checking
+             * for the -ENOMEM error, so we directly replace -ENOSPC by
+             * -ENOMEM. Beside, -ENOSPC has a specific meaning for blockdev
+             * coroutines: it triggers BLOCKDEV_ON_ERROR_ENOSPC and
+             * BLOCK_ERROR_ACTION_STOP which stops the VM, asking the operator
+             * to add more storage to the blockdev. Not something we can do
+             * easily with an IOMMU :)
+             */
+            r = -ENOMEM;
+        }
         if (r == -ENOMEM && retry) {
+            /*
+             * We exhausted the DMA mappings available for our container:
+             * recycle the volatile IOVA mappings.
+             */
             retry = false;
             trace_nvme_dma_flush_queue_wait(s);
             if (s->dma_map_count) {
--
2.31.1
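
The shape of the fix is a small errno-normalization step placed in
front of an existing retry path. A standalone sketch of that shape
follows; map_dma(), recycle_mappings() and map_with_recycle() are
invented stand-ins, and only the ENOSPC-to-ENOMEM folding mirrors the
patch:

    #include <errno.h>
    #include <stdbool.h>
    #include <stddef.h>

    /* Hypothetical stand-in for qemu_vfio_dma_map(); returns -errno. */
    extern int map_dma(void *buf, size_t len);
    extern void recycle_mappings(void);

    int map_with_recycle(void *buf, size_t len)
    {
        bool retry = true;
        int r;

    try_map:
        r = map_dma(buf, len);
        if (r == -ENOSPC) {
            /* Newer kernels report DMA-mapping exhaustion as -ENOSPC;
             * fold it into -ENOMEM so a single recovery path covers
             * both old and new kernels. */
            r = -ENOMEM;
        }
        if (r == -ENOMEM && retry) {
            retry = false;
            recycle_mappings();     /* drop volatile IOVA mappings */
            goto try_map;
        }
        return r;
    }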