The following changes since commit 171199f56f5f9bdf1e5d670d09ef1351d8f01bae:

  Merge remote-tracking branch 'remotes/alistair/tags/pull-riscv-to-apply-20200619-3' into staging (2020-06-22 14:45:25 +0100)

are available in the Git repository at:

  https://github.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 7838c67f22a81fcf669785cd6c0876438422071a:

  block/nvme: support nested aio_poll() (2020-06-23 15:46:08 +0100)

----------------------------------------------------------------
Pull request

----------------------------------------------------------------

Daniele Buono (4):
  coroutine: support SafeStack in ucontext backend
  coroutine: add check for SafeStack in sigaltstack
  configure: add flags to support SafeStack
  check-block: enable iotests with SafeStack

Stefan Hajnoczi (8):
  minikconf: explicitly set encoding to UTF-8
  block/nvme: poll queues without q->lock
  block/nvme: drop tautologous assertion
  block/nvme: don't access CQE after moving cq.head
  block/nvme: switch to a NVMeRequest freelist
  block/nvme: clarify that free_req_queue is protected by q->lock
  block/nvme: keep BDRVNVMeState pointer in NVMeQueuePair
  block/nvme: support nested aio_poll()

 configure                    |  73 ++++++++++++
 include/qemu/coroutine_int.h |   5 +
 block/nvme.c                 | 220 +++++++++++++++++++++++++----------
 util/coroutine-sigaltstack.c |   4 +
 util/coroutine-ucontext.c    |  28 +++++
 block/trace-events           |   2 +-
 scripts/minikconf.py         |   6 +-
 tests/check-block.sh         |  12 +-
 8 files changed, 284 insertions(+), 66 deletions(-)

--
2.26.2

The following changes since commit 813bac3d8d70d85cb7835f7945eb9eed84c2d8d0:

  Merge tag '2023q3-bsd-user-pull-request' of https://gitlab.com/bsdimp/qemu into staging (2023-08-29 08:58:00 -0400)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 87ec6f55af38e29be5b2b65a8acf84da73e06d06:

  aio-posix: zero out io_uring sqe user_data (2023-08-30 07:39:59 -0400)

----------------------------------------------------------------
Pull request

v3:
- Drop UFS emulation due to CI failures
- Add "aio-posix: zero out io_uring sqe user_data"

----------------------------------------------------------------

Andrey Drobyshev (3):
  block: add subcluster_size field to BlockDriverInfo
  block/io: align requests to subcluster_size
  tests/qemu-iotests/197: add testcase for CoR with subclusters

Fabiano Rosas (1):
  block-migration: Ensure we don't crash during migration cleanup

Stefan Hajnoczi (1):
  aio-posix: zero out io_uring sqe user_data

 include/block/block-common.h |  5 ++++
 include/block/block-io.h     |  8 +++---
 block.c                      |  7 +++++
 block/io.c                   | 50 ++++++++++++++++++------------------
 block/mirror.c               |  8 +++---
 block/qcow2.c                |  1 +
 migration/block.c            | 11 ++++++--
 util/fdmon-io_uring.c        |  2 ++
 tests/qemu-iotests/197       | 29 +++++++++++++++++++++
 tests/qemu-iotests/197.out   | 24 +++++++++++++++++
 10 files changed, 110 insertions(+), 35 deletions(-)

--
2.41.0
Deleted patch
QEMU currently only has ASCII Kconfig files but Linux actually uses
UTF-8. Explicitly specify the encoding and that we're doing text file
I/O.

It's unclear whether or not QEMU will ever need Unicode in its Kconfig
files. If we start using the help text then it will become an issue
sooner or later. Make this change now for consistency with Linux
Kconfig.

Reported-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200521153616.307100-1-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 scripts/minikconf.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/minikconf.py b/scripts/minikconf.py
index XXXXXXX..XXXXXXX 100755
--- a/scripts/minikconf.py
+++ b/scripts/minikconf.py
@@ -XXX,XX +XXX,XX @@ class KconfigParser:
         if incl_abs_fname in self.data.previously_included:
             return
         try:
-            fp = open(incl_abs_fname, 'r')
+            fp = open(incl_abs_fname, 'rt', encoding='utf-8')
         except IOError as e:
             raise KconfigParserError(self,
                 '%s: %s' % (e.strerror, include))
@@ -XXX,XX +XXX,XX @@ if __name__ == '__main__':
             parser.do_assignment(name, value == 'y')
             external_vars.add(name[7:])
         else:
-            fp = open(arg, 'r')
+            fp = open(arg, 'rt', encoding='utf-8')
             parser.parse_file(fp)
             fp.close()

@@ -XXX,XX +XXX,XX @@ if __name__ == '__main__':
         if key not in external_vars and config[key]:
             print ('CONFIG_%s=y' % key)

-    deps = open(argv[2], 'w')
+    deps = open(argv[2], 'wt', encoding='utf-8')
     for fname in data.previously_included:
         print ('%s: %s' % (argv[1], fname), file=deps)
     deps.close()
--
2.26.2
Passing around both BDRVNVMeState and NVMeQueuePair is unwieldy. Reduce
the number of function arguments by keeping the BDRVNVMeState pointer in
NVMeQueuePair. This will come in handy when a BH is introduced in a
later patch and only one argument can be passed to it.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200617132201.1832152-7-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 70 ++++++++++++++++++++++++++++------------------------
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@
  */
 #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)

+typedef struct BDRVNVMeState BDRVNVMeState;
+
 typedef struct {
     int32_t head, tail;
     uint8_t *queue;
@@ -XXX,XX +XXX,XX @@ typedef struct {
 typedef struct {
     QemuMutex lock;

+    /* Read from I/O code path, initialized under BQL */
+    BDRVNVMeState *s;
+    int index;
+
     /* Fields protected by BQL */
-    int index;
     uint8_t *prp_list_pages;

     /* Fields protected by @lock */
@@ -XXX,XX +XXX,XX @@ typedef volatile struct {

 QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);

-typedef struct {
+struct BDRVNVMeState {
     AioContext *aio_context;
     QEMUVFIOState *vfio;
     NVMeRegs *regs;
@@ -XXX,XX +XXX,XX @@ typedef struct {

     /* PCI address (required for nvme_refresh_filename()) */
     char *device;
-} BDRVNVMeState;
+};

 #define NVME_BLOCK_OPT_DEVICE "device"
 #define NVME_BLOCK_OPT_NAMESPACE "namespace"
@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
     }
 }

-static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q)
+static void nvme_free_queue_pair(NVMeQueuePair *q)
 {
     qemu_vfree(q->prp_list_pages);
     qemu_vfree(q->sq.queue);
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
     uint64_t prp_list_iova;

     qemu_mutex_init(&q->lock);
+    q->s = s;
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
     q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,

     return q;
 fail:
-    nvme_free_queue_pair(bs, q);
+    nvme_free_queue_pair(q);
     return NULL;
 }

 /* With q->lock */
-static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
+static void nvme_kick(NVMeQueuePair *q)
 {
+    BDRVNVMeState *s = q->s;
+
     if (s->plugged || !q->need_kick) {
         return;
     }
@@ -XXX,XX +XXX,XX @@ static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
 }

 /* With q->lock */
-static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
+static void nvme_wake_free_req_locked(NVMeQueuePair *q)
 {
     if (!qemu_co_queue_empty(&q->free_req_queue)) {
-        replay_bh_schedule_oneshot_event(s->aio_context,
+        replay_bh_schedule_oneshot_event(q->s->aio_context,
                                          nvme_free_req_queue_cb, q);
     }
 }

 /* Insert a request in the freelist and wake waiters */
-static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q,
-                                       NVMeRequest *req)
+static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
 {
     qemu_mutex_lock(&q->lock);
     nvme_put_free_req_locked(q, req);
-    nvme_wake_free_req_locked(s, q);
+    nvme_wake_free_req_locked(q);
     qemu_mutex_unlock(&q->lock);
 }

@@ -XXX,XX +XXX,XX @@ static inline int nvme_translate_error(const NvmeCqe *c)
 }

 /* With q->lock */
-static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
+static bool nvme_process_completion(NVMeQueuePair *q)
 {
+    BDRVNVMeState *s = q->s;
     bool progress = false;
     NVMeRequest *preq;
     NVMeRequest req;
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         /* Notify the device so it can post more completions. */
         smp_mb_release();
         *q->cq.doorbell = cpu_to_le32(q->cq.head);
-        nvme_wake_free_req_locked(s, q);
+        nvme_wake_free_req_locked(q);
     }
     q->busy = false;
     return progress;
@@ -XXX,XX +XXX,XX @@ static void nvme_trace_command(const NvmeCmd *cmd)
     }
 }

-static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
-                                NVMeRequest *req,
+static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
                                 NvmeCmd *cmd, BlockCompletionFunc cb,
                                 void *opaque)
 {
@@ -XXX,XX +XXX,XX @@ static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
     req->opaque = opaque;
     cmd->cid = cpu_to_le32(req->cid);

-    trace_nvme_submit_command(s, q->index, req->cid);
+    trace_nvme_submit_command(q->s, q->index, req->cid);
     nvme_trace_command(cmd);
     qemu_mutex_lock(&q->lock);
     memcpy((uint8_t *)q->sq.queue +
            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
     q->need_kick++;
-    nvme_kick(s, q);
-    nvme_process_completion(s, q);
+    nvme_kick(q);
+    nvme_process_completion(q);
     qemu_mutex_unlock(&q->lock);
 }

@@ -XXX,XX +XXX,XX @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
                          NvmeCmd *cmd)
 {
     NVMeRequest *req;
-    BDRVNVMeState *s = bs->opaque;
     int ret = -EINPROGRESS;
     req = nvme_get_free_req(q);
     if (!req) {
         return -EBUSY;
     }
-    nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret);
+    nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);

     BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
     return ret;
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s)
         }

         qemu_mutex_lock(&q->lock);
-        while (nvme_process_completion(s, q)) {
+        while (nvme_process_completion(q)) {
             /* Keep polling */
             progress = true;
         }
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     };
     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
         error_setg(errp, "Failed to create io queue [%d]", n);
-        nvme_free_queue_pair(bs, q);
+        nvme_free_queue_pair(q);
         return false;
     }
     cmd = (NvmeCmd) {
@@ -XXX,XX +XXX,XX @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     };
     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
         error_setg(errp, "Failed to create io queue [%d]", n);
-        nvme_free_queue_pair(bs, q);
+        nvme_free_queue_pair(q);
         return false;
     }
     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
@@ -XXX,XX +XXX,XX @@ static void nvme_close(BlockDriverState *bs)
     BDRVNVMeState *s = bs->opaque;

     for (i = 0; i < s->nr_queues; ++i) {
-        nvme_free_queue_pair(bs, s->queues[i]);
+        nvme_free_queue_pair(s->queues[i]);
     }
     g_free(s->queues);
     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
     qemu_co_mutex_unlock(&s->dma_map_lock);
     if (r) {
-        nvme_put_free_req_and_wake(s, ioq, req);
+        nvme_put_free_req_and_wake(ioq, req);
         return r;
     }
-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

     data.co = qemu_coroutine_self();
     while (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
     assert(s->nr_queues > 1);
     req = nvme_get_free_req(ioq);
     assert(req);
-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

     data.co = qemu_coroutine_self();
     if (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
     req = nvme_get_free_req(ioq);
     assert(req);

-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

     data.co = qemu_coroutine_self();
     while (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
     qemu_co_mutex_unlock(&s->dma_map_lock);

     if (ret) {
-        nvme_put_free_req_and_wake(s, ioq, req);
+        nvme_put_free_req_and_wake(ioq, req);
         goto out;
     }

     trace_nvme_dsm(s, offset, bytes);

-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

     data.co = qemu_coroutine_self();
     while (data.ret == -EINPROGRESS) {
@@ -XXX,XX +XXX,XX @@ static void nvme_aio_unplug(BlockDriverState *bs)
     for (i = 1; i < s->nr_queues; i++) {
         NVMeQueuePair *q = s->queues[i];
         qemu_mutex_lock(&q->lock);
-        nvme_kick(s, q);
-        nvme_process_completion(s, q);
+        nvme_kick(q);
+        nvme_process_completion(q);
         qemu_mutex_unlock(&q->lock);
     }
 }
--
2.26.2
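
For reference, the back-pointer idea in the patch above boils down to the
following standalone sketch (illustrative names only, not the QEMU code):

    #include <stdio.h>

    typedef struct State State;      /* forward declaration, as in the patch */

    typedef struct {
        State *s;                    /* back-pointer, set once at creation */
        int index;
    } Queue;

    struct State {
        const char *name;
        Queue *q;
    };

    static void queue_kick(Queue *q) /* was queue_kick(State *s, Queue *q) */
    {
        printf("kick %s queue %d\n", q->s->name, q->index);
    }

    int main(void)
    {
        State s = { .name = "nvme0" };
        Queue q = { .s = &s, .index = 1 };
        s.q = &q;
        queue_kick(&q);              /* callers no longer pass the state */
        return 0;
    }

The payoff is exactly what the commit message states: helpers that used to
take two arguments now take one, which matters once a BH (whose callback
receives a single opaque pointer) needs to reach both structures.
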
From: Fabiano Rosas <farosas@suse.de>

We can fail the blk_insert_bs() at init_blk_migration(), leaving the
BlkMigDevState without a dirty_bitmap and BlockDriverState. Account
for the possibly missing elements when doing cleanup.

Fix the following crashes:

Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault.
0x0000555555ec83ef in bdrv_release_dirty_bitmap (bitmap=0x0) at ../block/dirty-bitmap.c:359
359         BlockDriverState *bs = bitmap->bs;
#0  0x0000555555ec83ef in bdrv_release_dirty_bitmap (bitmap=0x0) at ../block/dirty-bitmap.c:359
#1  0x0000555555bba331 in unset_dirty_tracking () at ../migration/block.c:371
#2  0x0000555555bbad98 in block_migration_cleanup_bmds () at ../migration/block.c:681

Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault.
0x0000555555e971ff in bdrv_op_unblock (bs=0x0, op=BLOCK_OP_TYPE_BACKUP_SOURCE, reason=0x0) at ../block.c:7073
7073        QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
#0  0x0000555555e971ff in bdrv_op_unblock (bs=0x0, op=BLOCK_OP_TYPE_BACKUP_SOURCE, reason=0x0) at ../block.c:7073
#1  0x0000555555e9734a in bdrv_op_unblock_all (bs=0x0, reason=0x0) at ../block.c:7095
#2  0x0000555555bbae13 in block_migration_cleanup_bmds () at ../migration/block.c:690

Signed-off-by: Fabiano Rosas <farosas@suse.de>
Message-id: 20230731203338.27581-1-farosas@suse.de
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 migration/block.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/migration/block.c b/migration/block.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -XXX,XX +XXX,XX @@ static void unset_dirty_tracking(void)
     BlkMigDevState *bmds;

     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
+        if (bmds->dirty_bitmap) {
+            bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
+        }
     }
 }

@@ -XXX,XX +XXX,XX @@ static int64_t get_remaining_dirty(void)
 static void block_migration_cleanup_bmds(void)
 {
     BlkMigDevState *bmds;
+    BlockDriverState *bs;
     AioContext *ctx;

     unset_dirty_tracking();

     while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
-        bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker);
+
+        bs = blk_bs(bmds->blk);
+        if (bs) {
+            bdrv_op_unblock_all(bs, bmds->blocker);
+        }
         error_free(bmds->blocker);

         /* Save ctx, because bmds->blk can disappear during blk_unref. */
--
2.41.0
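
The cleanup pattern in the patch above reduces to this minimal standalone
sketch (hypothetical names, standard C only): guard each member that a
failed initialization may have left unset before releasing it.

    #include <stdlib.h>

    typedef struct {
        void *dirty_bitmap;   /* may be NULL if setup failed midway */
        void *blocker;
    } DevState;

    static void release_bitmap(void *bitmap) { free(bitmap); }

    static void dev_state_cleanup(DevState *d)
    {
        /* A failed init leaves some fields NULL; check before releasing */
        if (d->dirty_bitmap) {
            release_bitmap(d->dirty_bitmap);
            d->dirty_bitmap = NULL;
        }
        free(d->blocker);     /* free(NULL) is defined, no guard needed */
        d->blocker = NULL;
    }

    int main(void)
    {
        DevState d = { 0 };    /* simulate init that failed early */
        dev_state_cleanup(&d); /* must not crash */
        return 0;
    }
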
QEMU block drivers are supposed to support aio_poll() from I/O
completion callback functions. This means completion processing must be
re-entrant.

The standard approach is to schedule a BH during completion processing
and cancel it at the end of processing. If aio_poll() is invoked by a
callback function then the BH will run. The BH continues the suspended
completion processing.

All of this means that request A's cb() can synchronously wait for
request B to complete. Previously the nvme block driver would hang
because it didn't process completions from nested aio_poll().

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Message-id: 20200617132201.1832152-8-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c       | 67 ++++++++++++++++++++++++++++++++++++++++------
 block/trace-events |  2 +-
 2 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
     int         cq_phase;
     int         free_req_head;
     NVMeRequest reqs[NVME_NUM_REQS];
-    bool        busy;
     int         need_kick;
     int         inflight;
+
+    /* Thread-safe, no lock necessary */
+    QEMUBH      *completion_bh;
 } NVMeQueuePair;

 /* Memory mapped registers */
@@ -XXX,XX +XXX,XX @@ struct BDRVNVMeState {
 #define NVME_BLOCK_OPT_DEVICE "device"
 #define NVME_BLOCK_OPT_NAMESPACE "namespace"

+static void nvme_process_completion_bh(void *opaque);
+
 static QemuOptsList runtime_opts = {
     .name = "nvme",
     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
@@ -XXX,XX +XXX,XX @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,

 static void nvme_free_queue_pair(NVMeQueuePair *q)
 {
+    if (q->completion_bh) {
+        qemu_bh_delete(q->completion_bh);
+    }
     qemu_vfree(q->prp_list_pages);
     qemu_vfree(q->sq.queue);
     qemu_vfree(q->cq.queue);
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
     q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
+    q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs),
+                                  nvme_process_completion_bh, q);
     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
                           s->page_size * NVME_NUM_REQS,
                           false, &prp_list_iova);
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
     NvmeCqe *c;

     trace_nvme_process_completion(s, q->index, q->inflight);
-    if (q->busy || s->plugged) {
-        trace_nvme_process_completion_queue_busy(s, q->index);
+    if (s->plugged) {
+        trace_nvme_process_completion_queue_plugged(s, q->index);
         return false;
     }
-    q->busy = true;
+
+    /*
+     * Support re-entrancy when a request cb() function invokes aio_poll().
+     * Pending completions must be visible to aio_poll() so that a cb()
+     * function can wait for the completion of another request.
+     *
+     * The aio_poll() loop will execute our BH and we'll resume completion
+     * processing there.
+     */
+    qemu_bh_schedule(q->completion_bh);
+
     assert(q->inflight >= 0);
     while (q->inflight) {
         int ret;
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
         assert(req.cb);
         nvme_put_free_req_locked(q, preq);
         preq->cb = preq->opaque = NULL;
-        qemu_mutex_unlock(&q->lock);
-        req.cb(req.opaque, ret);
-        qemu_mutex_lock(&q->lock);
         q->inflight--;
+        qemu_mutex_unlock(&q->lock);
+        req.cb(req.opaque, ret);
+        qemu_mutex_lock(&q->lock);
         progress = true;
     }
     if (progress) {
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(NVMeQueuePair *q)
         *q->cq.doorbell = cpu_to_le32(q->cq.head);
         nvme_wake_free_req_locked(q);
     }
-    q->busy = false;
+
+    qemu_bh_cancel(q->completion_bh);
+
     return progress;
 }

+static void nvme_process_completion_bh(void *opaque)
+{
+    NVMeQueuePair *q = opaque;
+
+    /*
+     * We're being invoked because a nvme_process_completion() cb() function
+     * called aio_poll(). The callback may be waiting for further completions
+     * so notify the device that it has space to fill in more completions now.
+     */
+    smp_mb_release();
+    *q->cq.doorbell = cpu_to_le32(q->cq.head);
+    nvme_wake_free_req_locked(q);
+
+    nvme_process_completion(q);
+}
+
 static void nvme_trace_command(const NvmeCmd *cmd)
 {
     int i;
@@ -XXX,XX +XXX,XX @@ static void nvme_detach_aio_context(BlockDriverState *bs)
 {
     BDRVNVMeState *s = bs->opaque;

+    for (int i = 0; i < s->nr_queues; i++) {
+        NVMeQueuePair *q = s->queues[i];
+
+        qemu_bh_delete(q->completion_bh);
+        q->completion_bh = NULL;
+    }
+
     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
                            false, NULL, NULL);
 }
@@ -XXX,XX +XXX,XX @@ static void nvme_attach_aio_context(BlockDriverState *bs,
     s->aio_context = new_context;
     aio_set_event_notifier(new_context, &s->irq_notifier,
                            false, nvme_handle_event, nvme_poll_cb);
+
+    for (int i = 0; i < s->nr_queues; i++) {
+        NVMeQueuePair *q = s->queues[i];
+
+        q->completion_bh =
+            aio_bh_new(new_context, nvme_process_completion_bh, q);
+    }
 }

 static void nvme_aio_plug(BlockDriverState *bs)
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ nvme_kick(void *s, int queue) "s %p queue %d"
 nvme_dma_flush_queue_wait(void *s) "s %p"
 nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
 nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
-nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d"
+nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d"
 nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
 nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
--
2.26.2
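
The schedule-a-BH-then-cancel trick above can be distilled into this
standalone sketch. The event-loop API here is invented for illustration
(a flag plus a poll function); only the shape matches the patch:

    #include <stdbool.h>
    #include <stdio.h>

    static bool bh_scheduled;

    static void process_completions(void);

    static void completion_bh(void)
    {
        /* Runs only if a completion callback re-entered the event loop */
        process_completions();
    }

    static void event_loop_poll(void)
    {
        if (bh_scheduled) {
            bh_scheduled = false;
            completion_bh();
        }
    }

    static int pending = 2;

    static void request_cb(int id)
    {
        printf("request %d done\n", id);
        if (id == 0) {
            event_loop_poll();  /* nested poll: hangs without the BH */
        }
    }

    static void process_completions(void)
    {
        bh_scheduled = true;    /* like qemu_bh_schedule(): nested polls
                                   can resume processing through the BH */
        while (pending > 0) {
            int id = --pending;
            request_cb(id);
        }
        bh_scheduled = false;   /* like qemu_bh_cancel() at the end */
    }

    int main(void)
    {
        process_completions();
        return 0;
    }
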
From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>

This is going to be used in the subsequent commit as requests alignment
(in particular, during copy-on-read). This value only makes sense for
the formats which support subclusters (currently QCOW2 only). If this
field isn't set by driver's own bdrv_get_info() implementation, we
simply set it equal to the cluster size thus treating each cluster as
having a single subcluster.

Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230711172553.234055-2-andrey.drobyshev@virtuozzo.com>
---
 include/block/block-common.h | 5 +++++
 block.c                      | 7 +++++++
 block/qcow2.c                | 1 +
 3 files changed, 13 insertions(+)

diff --git a/include/block/block-common.h b/include/block/block-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -XXX,XX +XXX,XX @@ typedef struct BlockZoneWps {
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
+    /*
+     * A fraction of cluster_size, if supported (currently QCOW2 only); if
+     * disabled or unsupported, set equal to cluster_size.
+     */
+    int subcluster_size;
     /* offset at which the VM state can be saved (0 if not possible) */
     int64_t vm_state_offset;
     bool is_dirty;
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
     }
     memset(bdi, 0, sizeof(*bdi));
     ret = drv->bdrv_co_get_info(bs, bdi);
+    if (bdi->subcluster_size == 0) {
+        /*
+         * If the driver left this unset, subclusters are not supported.
+         * Then it is safe to treat each cluster as having only one subcluster.
+         */
+        bdi->subcluster_size = bdi->cluster_size;
+    }
     if (ret < 0) {
         return ret;
     }
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ qcow2_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
     BDRVQcow2State *s = bs->opaque;
     bdi->cluster_size = s->cluster_size;
+    bdi->subcluster_size = s->subcluster_size;
     bdi->vm_state_offset = qcow2_vm_state_offset(s);
     bdi->is_dirty = s->incompatible_features & QCOW2_INCOMPAT_DIRTY;
     return 0;
--
2.41.0
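
The fallback rule above, in isolation: a driver that reports no
subcluster size gets subcluster_size == cluster_size, so generic code can
always divide by subcluster_size. A sketch with hypothetical types:

    #include <assert.h>

    typedef struct {
        int cluster_size;
        int subcluster_size;   /* 0 when the driver leaves it unset */
    } DriverInfo;

    static void fill_info_defaults(DriverInfo *bdi)
    {
        if (bdi->subcluster_size == 0) {
            /* No subcluster support: one subcluster per cluster */
            bdi->subcluster_size = bdi->cluster_size;
        }
    }

    int main(void)
    {
        DriverInfo plain = { .cluster_size = 65536 };
        DriverInfo extl2 = { .cluster_size = 65536,
                             .subcluster_size = 65536 / 32 };
        fill_info_defaults(&plain);
        fill_info_defaults(&extl2);
        assert(plain.subcluster_size == 65536);
        assert(extl2.subcluster_size == 2048);  /* qcow2 extended_l2 case */
        return 0;
    }
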
From: Daniele Buono <dbuono@linux.vnet.ibm.com>

LLVM's SafeStack instrumentation does not yet support programs that make
use of the APIs in ucontext.h
With the current implementation of coroutine-ucontext, the resulting
binary is incorrect, with different coroutines sharing the same unsafe
stack and producing undefined behavior at runtime.
This fix allocates an additional unsafe stack area for each coroutine,
and sets the new unsafe stack pointer before calling swapcontext() in
qemu_coroutine_new.
This is the only place where the pointer needs to be manually updated,
since sigsetjmp/siglongjmp are already instrumented by LLVM to properly
support SafeStack.
The additional stack is then freed in qemu_coroutine_delete.

Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
Message-id: 20200529205122.714-2-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine_int.h |  5 +++++
 util/coroutine-ucontext.c    | 28 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine_int.h
+++ b/include/qemu/coroutine_int.h
@@ -XXX,XX +XXX,XX @@
 #include "qemu/queue.h"
 #include "qemu/coroutine.h"

+#ifdef CONFIG_SAFESTACK
+/* Pointer to the unsafe stack, defined by the compiler */
+extern __thread void *__safestack_unsafe_stack_ptr;
+#endif
+
 #define COROUTINE_STACK_SIZE (1 << 20)

 typedef enum {
diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c
index XXXXXXX..XXXXXXX 100644
--- a/util/coroutine-ucontext.c
+++ b/util/coroutine-ucontext.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
     Coroutine base;
     void *stack;
     size_t stack_size;
+#ifdef CONFIG_SAFESTACK
+    /* Need an unsafe stack for each coroutine */
+    void *unsafe_stack;
+    size_t unsafe_stack_size;
+#endif
     sigjmp_buf env;

     void *tsan_co_fiber;
@@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void)
     co = g_malloc0(sizeof(*co));
     co->stack_size = COROUTINE_STACK_SIZE;
     co->stack = qemu_alloc_stack(&co->stack_size);
+#ifdef CONFIG_SAFESTACK
+    co->unsafe_stack_size = COROUTINE_STACK_SIZE;
+    co->unsafe_stack = qemu_alloc_stack(&co->unsafe_stack_size);
+#endif
     co->base.entry_arg = &old_env; /* stash away our jmp_buf */

     uc.uc_link = &old_uc;
@@ -XXX,XX +XXX,XX @@ Coroutine *qemu_coroutine_new(void)
                            COROUTINE_YIELD,
                            &fake_stack_save,
                            co->stack, co->stack_size, co->tsan_co_fiber);
+
+#ifdef CONFIG_SAFESTACK
+        /*
+         * Before we swap the context, set the new unsafe stack
+         * The unsafe stack grows just like the normal stack, so start from
+         * the last usable location of the memory area.
+         * NOTE: we don't have to re-set the usp afterwards because we are
+         * coming back to this context through a siglongjmp.
+         * The compiler already wrapped the corresponding sigsetjmp call with
+         * code that saves the usp on the (safe) stack before the call, and
+         * restores it right after (which is where we return with siglongjmp).
+         */
+        void *usp = co->unsafe_stack + co->unsafe_stack_size;
+        __safestack_unsafe_stack_ptr = usp;
+#endif
+
         swapcontext(&old_uc, &uc);
     }

@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_delete(Coroutine *co_)
 #endif

     qemu_free_stack(co->stack, co->stack_size);
+#ifdef CONFIG_SAFESTACK
+    qemu_free_stack(co->unsafe_stack, co->unsafe_stack_size);
+#endif
     g_free(co);
 }

--
2.26.2
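
A minimal sketch of the idea, outside QEMU: each context owns a second
(unsafe) stack, and the thread-local unsafe stack pointer is redirected
before switching into it. __safestack_unsafe_stack_ptr is the symbol
provided by LLVM's compiler-rt; everything else here is illustrative and
only meaningful when built with clang -fsanitize=safe-stack:

    #include <stdlib.h>

    #define STACK_SIZE (1 << 20)

    #if defined(__has_feature)
    #if __has_feature(safe_stack)
    extern __thread void *__safestack_unsafe_stack_ptr;
    #define HAVE_SAFE_STACK 1
    #endif
    #endif

    typedef struct {
        void *unsafe_stack;
        size_t unsafe_stack_size;
    } Ctx;

    static void ctx_prepare_switch(Ctx *to)
    {
    #ifdef HAVE_SAFE_STACK
        /* The unsafe stack grows downwards: point at the top of the area */
        __safestack_unsafe_stack_ptr = (char *)to->unsafe_stack +
                                       to->unsafe_stack_size;
    #else
        (void)to;
    #endif
    }

    int main(void)
    {
        Ctx c = { .unsafe_stack = malloc(STACK_SIZE),
                  .unsafe_stack_size = STACK_SIZE };
        ctx_prepare_switch(&c);  /* ...then swapcontext()/longjmp into c */
        free(c.unsafe_stack);
        return 0;
    }
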
From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>

When target image is using subclusters, and we align the request during
copy-on-read, it makes sense to align to subcluster_size rather than
cluster_size. Otherwise we end up with unnecessary allocations.

This commit renames bdrv_round_to_clusters() to bdrv_round_to_subclusters()
and utilizes subcluster_size field of BlockDriverInfo to make necessary
alignments. It affects copy-on-read as well as mirror job (which is
using bdrv_round_to_clusters()).

This change also fixes the following bug with failing assert (covered by
the test in the subsequent commit):

qemu-img create -f qcow2 base.qcow2 64K
qemu-img create -f qcow2 -o extended_l2=on,backing_file=base.qcow2,backing_fmt=qcow2 img.qcow2 64K
qemu-io -c "write -P 0xaa 0 2K" img.qcow2
qemu-io -C -c "read -P 0x00 2K 62K" img.qcow2

qemu-io: ../block/io.c:1236: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed.

Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230711172553.234055-3-andrey.drobyshev@virtuozzo.com>
---
 include/block/block-io.h |  8 +++----
 block/io.c               | 50 ++++++++++++++++++++--------------------
 block/mirror.c           |  8 +++----
 3 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -XXX,XX +XXX,XX @@ bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
                                           Error **errp);
 BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs);
-void bdrv_round_to_clusters(BlockDriverState *bs,
-                            int64_t offset, int64_t bytes,
-                            int64_t *cluster_offset,
-                            int64_t *cluster_bytes);
+void bdrv_round_to_subclusters(BlockDriverState *bs,
+                               int64_t offset, int64_t bytes,
+                               int64_t *cluster_offset,
+                               int64_t *cluster_bytes);

 void bdrv_get_backing_filename(BlockDriverState *bs,
                                char *filename, int filename_size);
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
 }

 /**
- * Round a region to cluster boundaries
+ * Round a region to subcluster (if supported) or cluster boundaries
  */
 void coroutine_fn GRAPH_RDLOCK
-bdrv_round_to_clusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
-                       int64_t *cluster_offset, int64_t *cluster_bytes)
+bdrv_round_to_subclusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
+                          int64_t *align_offset, int64_t *align_bytes)
 {
     BlockDriverInfo bdi;
     IO_CODE();
-    if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
-        *cluster_offset = offset;
-        *cluster_bytes = bytes;
+    if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.subcluster_size == 0) {
+        *align_offset = offset;
+        *align_bytes = bytes;
     } else {
-        int64_t c = bdi.cluster_size;
-        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
-        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
+        int64_t c = bdi.subcluster_size;
+        *align_offset = QEMU_ALIGN_DOWN(offset, c);
+        *align_bytes = QEMU_ALIGN_UP(offset - *align_offset + bytes, c);
     }
 }

@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
     void *bounce_buffer = NULL;

     BlockDriver *drv = bs->drv;
-    int64_t cluster_offset;
-    int64_t cluster_bytes;
+    int64_t align_offset;
+    int64_t align_bytes;
     int64_t skip_bytes;
     int ret;
     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
      * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
      * is one reason we loop rather than doing it all at once.
      */
-    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
-    skip_bytes = offset - cluster_offset;
+    bdrv_round_to_subclusters(bs, offset, bytes, &align_offset, &align_bytes);
+    skip_bytes = offset - align_offset;

     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
-                                   cluster_offset, cluster_bytes);
+                                   align_offset, align_bytes);

-    while (cluster_bytes) {
+    while (align_bytes) {
         int64_t pnum;

         if (skip_write) {
             ret = 1; /* "already allocated", so nothing will be copied */
-            pnum = MIN(cluster_bytes, max_transfer);
+            pnum = MIN(align_bytes, max_transfer);
         } else {
-            ret = bdrv_is_allocated(bs, cluster_offset,
-                                    MIN(cluster_bytes, max_transfer), &pnum);
+            ret = bdrv_is_allocated(bs, align_offset,
+                                    MIN(align_bytes, max_transfer), &pnum);
             if (ret < 0) {
                 /*
                  * Safe to treat errors in querying allocation as if
                  * unallocated; we'll probably fail again soon on the
                  * read, but at least that will set a decent errno.
                  */
-                pnum = MIN(cluster_bytes, max_transfer);
+                pnum = MIN(align_bytes, max_transfer);
             }

             /* Stop at EOF if the image ends in the middle of the cluster */
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
             /* Must copy-on-read; use the bounce buffer */
             pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
             if (!bounce_buffer) {
-                int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
+                int64_t max_we_need = MAX(pnum, align_bytes - pnum);
                 int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
                 int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);

@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
             }
             qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);

-            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
+            ret = bdrv_driver_preadv(bs, align_offset, pnum,
                                      &local_qiov, 0, 0);
             if (ret < 0) {
                 goto err;
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
                 /* FIXME: Should we (perhaps conditionally) be setting
                  * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                  * that still correctly reads as zero? */
-                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
+                ret = bdrv_co_do_pwrite_zeroes(bs, align_offset, pnum,
                                                BDRV_REQ_WRITE_UNCHANGED);
             } else {
                 /* This does not change the data on the disk, it is not
                  * necessary to flush even in cache=writethrough mode.
                  */
-                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
+                ret = bdrv_driver_pwritev(bs, align_offset, pnum,
                                           &local_qiov, 0,
                                           BDRV_REQ_WRITE_UNCHANGED);
             }
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
             }
         }

-        cluster_offset += pnum;
-        cluster_bytes -= pnum;
+        align_offset += pnum;
+        align_bytes -= pnum;
         progress += pnum - skip_bytes;
         skip_bytes = 0;
     }
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
     need_cow |= !test_bit((*offset + *bytes - 1) / s->granularity,
                           s->cow_bitmap);
     if (need_cow) {
-        bdrv_round_to_clusters(blk_bs(s->target), *offset, *bytes,
-                               &align_offset, &align_bytes);
+        bdrv_round_to_subclusters(blk_bs(s->target), *offset, *bytes,
+                                  &align_offset, &align_bytes);
     }

     if (align_bytes > max_bytes) {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
     int64_t target_offset;
     int64_t target_bytes;
     WITH_GRAPH_RDLOCK_GUARD() {
-        bdrv_round_to_clusters(blk_bs(s->target), offset, io_bytes,
-                               &target_offset, &target_bytes);
+        bdrv_round_to_subclusters(blk_bs(s->target), offset, io_bytes,
+                                  &target_offset, &target_bytes);
     }
     if (target_offset == offset &&
         target_bytes == io_bytes) {
--
2.41.0
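
The rounding math above, worked through by hand (QEMU_ALIGN_DOWN/UP
expanded; values mirror the commit message's 64K cluster with 2K
subclusters):

    #include <assert.h>
    #include <stdint.h>

    static void round_to_granularity(int64_t offset, int64_t bytes, int64_t g,
                                     int64_t *align_offset, int64_t *align_bytes)
    {
        *align_offset = offset / g * g;                       /* align down */
        *align_bytes = (offset - *align_offset + bytes + g - 1)
                       / g * g;                               /* align up */
    }

    int main(void)
    {
        int64_t off, len;

        /* read 4K at 30K: with 2K subclusters only 30K..34K is touched... */
        round_to_granularity(30 * 1024, 4 * 1024, 2 * 1024, &off, &len);
        assert(off == 30 * 1024 && len == 4 * 1024);

        /* ...whereas 64K cluster rounding would copy the whole cluster */
        round_to_granularity(30 * 1024, 4 * 1024, 64 * 1024, &off, &len);
        assert(off == 0 && len == 64 * 1024);
        return 0;
    }
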
Deleted patch
From: Daniele Buono <dbuono@linux.vnet.ibm.com>

Current implementation of LLVM's SafeStack is not compatible with
code that uses an alternate stack created with sigaltstack().
Since coroutine-sigaltstack relies on sigaltstack(), it is not
compatible with SafeStack. The resulting binary is incorrect, with
different coroutines sharing the same unsafe stack and producing
undefined behavior at runtime.

In the future LLVM may provide a SafeStack implementation compatible with
sigaltstack(). In the meantime, if SafeStack is desired, the coroutine
implementation from coroutine-ucontext should be used.
As a safety check, add a control in coroutine-sigaltstack to throw a
preprocessor #error if SafeStack is enabled and we are trying to
use coroutine-sigaltstack to implement coroutines.

Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
Message-id: 20200529205122.714-3-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 util/coroutine-sigaltstack.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/util/coroutine-sigaltstack.c b/util/coroutine-sigaltstack.c
index XXXXXXX..XXXXXXX 100644
--- a/util/coroutine-sigaltstack.c
+++ b/util/coroutine-sigaltstack.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu-common.h"
 #include "qemu/coroutine_int.h"

+#ifdef CONFIG_SAFESTACK
+#error "SafeStack is not compatible with code run in alternate signal stacks"
+#endif
+
 typedef struct {
     Coroutine base;
     void *stack;
--
2.26.2
Deleted patch
From: Daniele Buono <dbuono@linux.vnet.ibm.com>

This patch adds a flag to enable/disable the SafeStack instrumentation
provided by LLVM.

On enable, make sure that the compiler supports the flags, and that we
are using the proper coroutine implementation (coroutine-ucontext).
On disable, explicitly disable the option if it was enabled by default.

While SafeStack is supported only on Linux, NetBSD, FreeBSD and macOS,
we are not checking for the O.S. since this is already done by LLVM.

Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
Message-id: 20200529205122.714-4-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 configure | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ audio_win_int=""
 libs_qga=""
 debug_info="yes"
 stack_protector=""
+safe_stack=""
 use_containers="yes"
 gdb_bin=$(command -v "gdb-multiarch" || command -v "gdb")

@@ -XXX,XX +XXX,XX @@ for opt do
   ;;
   --disable-stack-protector) stack_protector="no"
   ;;
+  --enable-safe-stack) safe_stack="yes"
+  ;;
+  --disable-safe-stack) safe_stack="no"
+  ;;
   --disable-curses) curses="no"
   ;;
   --enable-curses) curses="yes"
@@ -XXX,XX +XXX,XX @@ disabled with --disable-FEATURE, default is enabled if available:
   debug-tcg       TCG debugging (default is disabled)
   debug-info      debugging information
   sparse          sparse checker
+  safe-stack      SafeStack Stack Smash Protection. Depends on
+                  clang/llvm >= 3.7 and requires coroutine backend ucontext.

   gnutls          GNUTLS cryptography support
   nettle          nettle cryptography support
@@ -XXX,XX +XXX,XX @@ if test "$debug_stack_usage" = "yes"; then
   fi
 fi

+##################################################
+# SafeStack
+
+
+if test "$safe_stack" = "yes"; then
+cat > $TMPC << EOF
+int main(int argc, char *argv[])
+{
+#if ! __has_feature(safe_stack)
+#error SafeStack Disabled
+#endif
+    return 0;
+}
+EOF
+  flag="-fsanitize=safe-stack"
+  # Check that safe-stack is supported and enabled.
+  if compile_prog "-Werror $flag" "$flag"; then
+    # Flag needed both at compilation and at linking
+    QEMU_CFLAGS="$QEMU_CFLAGS $flag"
+    QEMU_LDFLAGS="$QEMU_LDFLAGS $flag"
+  else
+    error_exit "SafeStack not supported by your compiler"
+  fi
+  if test "$coroutine" != "ucontext"; then
+    error_exit "SafeStack is only supported by the coroutine backend ucontext"
+  fi
+else
+cat > $TMPC << EOF
+int main(int argc, char *argv[])
+{
+#if defined(__has_feature)
+#if __has_feature(safe_stack)
+#error SafeStack Enabled
+#endif
+#endif
+    return 0;
+}
+EOF
+if test "$safe_stack" = "no"; then
+  # Make sure that safe-stack is disabled
+  if ! compile_prog "-Werror" ""; then
+    # SafeStack was already enabled, try to explicitly remove the feature
+    flag="-fno-sanitize=safe-stack"
+    if ! compile_prog "-Werror $flag" "$flag"; then
+      error_exit "Configure cannot disable SafeStack"
+    fi
+    QEMU_CFLAGS="$QEMU_CFLAGS $flag"
+    QEMU_LDFLAGS="$QEMU_LDFLAGS $flag"
+  fi
+else # "$safe_stack" = ""
+  # Set safe_stack to yes or no based on pre-existing flags
+  if compile_prog "-Werror" ""; then
+    safe_stack="no"
+  else
+    safe_stack="yes"
+    if test "$coroutine" != "ucontext"; then
+      error_exit "SafeStack is only supported by the coroutine backend ucontext"
+    fi
+  fi
+fi
+fi

 ##########################################
 # check if we have open_by_handle_at
@@ -XXX,XX +XXX,XX @@ echo "sparse enabled    $sparse"
 echo "strip binaries    $strip_opt"
 echo "profiler          $profiler"
 echo "static build      $static"
+echo "safe stack        $safe_stack"
 if test "$darwin" = "yes" ; then
     echo "Cocoa support     $cocoa"
 fi
@@ -XXX,XX +XXX,XX @@ if test "$ccache_cpp2" = "yes"; then
   echo "export CCACHE_CPP2=y" >> $config_host_mak
 fi

+if test "$safe_stack" = "yes"; then
+  echo "CONFIG_SAFESTACK=y" >> $config_host_mak
+fi
+
 # If we're using a separate build tree, set it up now.
 # DIRS are directories which we simply mkdir in the build tree;
 # LINKS are things to symlink back into the source tree
--
2.26.2
Deleted patch
From: Daniele Buono <dbuono@linux.vnet.ibm.com>

SafeStack is a stack protection technique implemented in llvm. It is
enabled with a -fsanitize flag.
iotests are currently disabled when any -fsanitize option is used,
because such options tend to produce additional warnings and false
positives.

While common -fsanitize options are used to verify the code and not
added in production, SafeStack's main use is in production environments
to protect against stack smashing.

Since SafeStack does not print any warning or false positive, enable
iotests when SafeStack is the only -fsanitize option used.
This is likely going to be a production binary and we want to make sure
it works correctly.

Signed-off-by: Daniele Buono <dbuono@linux.vnet.ibm.com>
Message-id: 20200529205122.714-5-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/check-block.sh | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/check-block.sh b/tests/check-block.sh
index XXXXXXX..XXXXXXX 100755
--- a/tests/check-block.sh
+++ b/tests/check-block.sh
@@ -XXX,XX +XXX,XX @@ if grep -q "CONFIG_GPROF=y" config-host.mak 2>/dev/null ; then
     exit 0
 fi

-if grep -q "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null ; then
+# Disable tests with any sanitizer except for SafeStack
+CFLAGS=$( grep "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null )
+SANITIZE_FLAGS=""
+#Remove all occurrencies of -fsanitize=safe-stack
+for i in ${CFLAGS}; do
+    if [ "${i}" != "-fsanitize=safe-stack" ]; then
+        SANITIZE_FLAGS="${SANITIZE_FLAGS} ${i}"
+    fi
+done
+if echo ${SANITIZE_FLAGS} | grep -q "\-fsanitize" 2>/dev/null; then
+    # Have a sanitize flag that is not allowed, stop
     echo "Sanitizers are enabled ==> Not running the qemu-iotests."
     exit 0
 fi
--
2.26.2
Deleted patch
A lot of CPU time is spent simply locking/unlocking q->lock during
polling. Check for completion outside the lock to make q->lock disappear
from the profile.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Message-id: 20200617132201.1832152-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_poll_queues(BDRVNVMeState *s)

     for (i = 0; i < s->nr_queues; i++) {
         NVMeQueuePair *q = s->queues[i];
+        const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
+        NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
+
+        /*
+         * Do an early check for completions. q->lock isn't needed because
+         * nvme_process_completion() only runs in the event loop thread and
+         * cannot race with itself.
+         */
+        if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
+            continue;
+        }
+
         qemu_mutex_lock(&q->lock);
         while (nvme_process_completion(s, q)) {
             /* Keep polling */
--
2.26.2
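
The NVMe phase-bit check used above is simple enough to demonstrate in a
standalone, single-threaded sketch (illustrative ring-buffer code, not
the driver): a completion entry is new only when its phase bit differs
from the phase of entries already consumed, so "nothing new" is decided
with one read and no lock.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define QUEUE_SIZE 4

    typedef struct {
        uint16_t status;             /* bit 0 carries the phase tag */
    } Cqe;

    typedef struct {
        Cqe cq[QUEUE_SIZE];
        unsigned head;
        int cq_phase;                /* phase of entries already consumed */
    } Queue;

    /* Lock-free peek: a slot is new only when its phase bit has flipped */
    static bool cqe_pending(const Queue *q)
    {
        return (q->cq[q->head].status & 0x1) != q->cq_phase;
    }

    int main(void)
    {
        Queue q = { .cq_phase = 0 };

        assert(!cqe_pending(&q));    /* ring starts empty */

        q.cq[0].status = 0x1;        /* device posts with flipped phase */
        assert(cqe_pending(&q));

        q.head = (q.head + 1) % QUEUE_SIZE;  /* consume the entry */
        assert(!cqe_pending(&q));    /* nothing new at the next slot */
        return 0;
    }
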
Deleted patch
nvme_process_completion() explicitly checks cid so the assertion that
follows is always true:

  if (cid == 0 || cid > NVME_QUEUE_SIZE) {
      ...
      continue;
  }
  assert(cid <= NVME_QUEUE_SIZE);

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200617132201.1832152-3-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
                 cid);
             continue;
         }
-        assert(cid <= NVME_QUEUE_SIZE);
         trace_nvme_complete_command(s, q->index, cid);
         preq = &q->reqs[cid - 1];
         req = *preq;
--
2.26.2
Deleted patch
Do not access a CQE after incrementing q->cq.head and releasing q->lock.
It is unlikely that this causes problems in practice but it's a latent
bug.

The reason why it should be safe at the moment is that completion
processing is not re-entrant and the CQ doorbell isn't written until the
end of nvme_process_completion().

Make this change now because QEMU expects completion processing to be
re-entrant and later patches will do that.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200617132201.1832152-4-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
     q->busy = true;
     assert(q->inflight >= 0);
     while (q->inflight) {
+        int ret;
         int16_t cid;
+
         c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
         if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
             break;
         }
+        ret = nvme_translate_error(c);
         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
         if (!q->cq.head) {
             q->cq_phase = !q->cq_phase;
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         preq->busy = false;
         preq->cb = preq->opaque = NULL;
         qemu_mutex_unlock(&q->lock);
-        req.cb(req.opaque, nvme_translate_error(c));
+        req.cb(req.opaque, ret);
         qemu_mutex_lock(&q->lock);
         q->inflight--;
         progress = true;
--
2.26.2
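
The hazard fixed above generalizes to any ring buffer: once the head
moves (and certainly once the lock is dropped), the slot may be reused,
so extract everything you need from the entry first. A standalone sketch
(illustrative code, not the driver):

    #include <assert.h>
    #include <stdint.h>

    #define RING_SIZE 8

    typedef struct {
        int32_t result;
    } Entry;

    static Entry ring[RING_SIZE];
    static unsigned head;

    static int32_t consume_one(void)
    {
        Entry *e = &ring[head];
        int32_t result = e->result;    /* copy BEFORE advancing the head */

        head = (head + 1) % RING_SIZE; /* slot may be reused from here on */
        /* ...unlock, run callbacks, etc.; 'e' must not be dereferenced... */
        return result;
    }

    int main(void)
    {
        ring[0].result = 42;
        assert(consume_one() == 42);
        return 0;
    }
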
Existing users access free_req_queue under q->lock. Document this.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20200617132201.1832152-6-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@ typedef struct {
 } NVMeRequest;

 typedef struct {
-    CoQueue     free_req_queue;
     QemuMutex   lock;

     /* Fields protected by BQL */
@@ -XXX,XX +XXX,XX @@ typedef struct {
     uint8_t *prp_list_pages;

     /* Fields protected by @lock */
+    CoQueue     free_req_queue;
     NVMeQueue   sq, cq;
     int         cq_phase;
     int         free_req_head;
--
2.26.2

From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>

Add testcase which checks that allocations during copy-on-read are
performed on the subcluster basis when subclusters are enabled in target
image.

This testcase also triggers the following assert with previous commit
not being applied, so we check that as well:

qemu-io: ../block/io.c:1236: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed.

Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230711172553.234055-4-andrey.drobyshev@virtuozzo.com>
---
 tests/qemu-iotests/197     | 29 +++++++++++++++++++++++++++++
 tests/qemu-iotests/197.out | 24 ++++++++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197
index XXXXXXX..XXXXXXX 100755
--- a/tests/qemu-iotests/197
+++ b/tests/qemu-iotests/197
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -f qcow2 -C -c 'read 0 1024' "$TEST_WRAP" | _filter_qemu_io
 $QEMU_IO -f qcow2 -c map "$TEST_WRAP"
 _check_test_img

+echo
+echo '=== Copy-on-read with subclusters ==='
+echo
+
+# Create base and top images 64K (1 cluster) each. Make subclusters enabled
+# for the top image
+_make_test_img 64K
+IMGPROTO=file IMGFMT=qcow2 TEST_IMG_FILE="$TEST_WRAP" \
+    _make_test_img --no-opts -o extended_l2=true -F "$IMGFMT" -b "$TEST_IMG" \
+    64K | _filter_img_create
+
+$QEMU_IO -c "write -P 0xaa 0 64k" "$TEST_IMG" | _filter_qemu_io
+
+# Allocate individual subclusters in the top image, and not the whole cluster
+$QEMU_IO -c "write -P 0xbb 28K 2K" -c "write -P 0xcc 34K 2K" "$TEST_WRAP" \
+    | _filter_qemu_io
+
+# Only 2 subclusters should be allocated in the top image at this point
+$QEMU_IMG map "$TEST_WRAP" | _filter_qemu_img_map
+
+# Actual copy-on-read operation
+$QEMU_IO -C -c "read -P 0xaa 30K 4K" "$TEST_WRAP" | _filter_qemu_io
+
+# And here we should have 4 subclusters allocated right in the middle of the
+# top image. Make sure the whole cluster remains unallocated
+$QEMU_IMG map "$TEST_WRAP" | _filter_qemu_img_map
+
+_check_test_img
+
 # success, all done
 echo '*** done'
 status=0
diff --git a/tests/qemu-iotests/197.out b/tests/qemu-iotests/197.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/197.out
+++ b/tests/qemu-iotests/197.out
@@ -XXX,XX +XXX,XX @@ read 1024/1024 bytes at offset 0
 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 1 KiB (0x400) bytes allocated at offset 0 bytes (0x0)
 No errors were found on the image.
+
+=== Copy-on-read with subclusters ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536
+Formatting 'TEST_DIR/t.wrap.IMGFMT', fmt=IMGFMT size=65536 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT
+wrote 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 28672
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 2048/2048 bytes at offset 34816
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Offset          Length          File
+0               0x7000          TEST_DIR/t.IMGFMT
+0x7000          0x800           TEST_DIR/t.wrap.IMGFMT
+0x7800          0x1000          TEST_DIR/t.IMGFMT
+0x8800          0x800           TEST_DIR/t.wrap.IMGFMT
+0x9000          0x7000          TEST_DIR/t.IMGFMT
+read 4096/4096 bytes at offset 30720
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Offset          Length          File
+0               0x7000          TEST_DIR/t.IMGFMT
+0x7000          0x2000          TEST_DIR/t.wrap.IMGFMT
+0x9000          0x7000          TEST_DIR/t.IMGFMT
+No errors were found on the image.
 *** done
--
2.41.0
There are three issues with the current NVMeRequest->busy field:
1. The busy field is accidentally accessed outside q->lock when request
   submission fails.
2. Waiters on free_req_queue are not woken when a request is returned
   early due to submission failure.
3. Finding a free request involves scanning all requests. This makes
   request submission O(n^2).

Switch to an O(1) freelist that is always accessed under the lock.

Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and
NVME_NUM_REQS, the number of usable requests. This makes the code
simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind
that one slot is reserved.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Message-id: 20200617132201.1832152-5-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nvme.c | 81 ++++++++++++++++++++++++++++------------------------
 1 file changed, 54 insertions(+), 27 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -XXX,XX +XXX,XX @@
 #define NVME_QUEUE_SIZE 128
 #define NVME_BAR_SIZE 8192

+/*
+ * We have to leave one slot empty as that is the full queue case where
+ * head == tail + 1.
+ */
+#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
+
 typedef struct {
     int32_t head, tail;
     uint8_t *queue;
@@ -XXX,XX +XXX,XX @@ typedef struct {
     int cid;
     void *prp_list_page;
    uint64_t prp_list_iova;
-    bool busy;
+    int free_req_next; /* q->reqs[] index of next free req */
 } NVMeRequest;

 typedef struct {
@@ -XXX,XX +XXX,XX @@ typedef struct {
     /* Fields protected by @lock */
     NVMeQueue   sq, cq;
     int         cq_phase;
-    NVMeRequest reqs[NVME_QUEUE_SIZE];
+    int         free_req_head;
+    NVMeRequest reqs[NVME_NUM_REQS];
     bool        busy;
     int         need_kick;
     int         inflight;
@@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
     qemu_mutex_init(&q->lock);
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
-    q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
+    q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
-                          s->page_size * NVME_QUEUE_SIZE,
+                          s->page_size * NVME_NUM_REQS,
                           false, &prp_list_iova);
     if (r) {
         goto fail;
     }
-    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
+    q->free_req_head = -1;
+    for (i = 0; i < NVME_NUM_REQS; i++) {
         NVMeRequest *req = &q->reqs[i];
         req->cid = i + 1;
+        req->free_req_next = q->free_req_head;
+        q->free_req_head = i;
         req->prp_list_page = q->prp_list_pages + i * s->page_size;
         req->prp_list_iova = prp_list_iova + i * s->page_size;
     }
+
     nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
@@ -XXX,XX +XXX,XX @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
  */
 static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
 {
-    int i;
-    NVMeRequest *req = NULL;
+    NVMeRequest *req;

     qemu_mutex_lock(&q->lock);
-    while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
-        /* We have to leave one slot empty as that is the full queue case (head
-         * == tail + 1). */
+
+    while (q->free_req_head == -1) {
         if (qemu_in_coroutine()) {
             trace_nvme_free_req_queue_wait(q);
             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
@@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
             return NULL;
         }
     }
-    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
-        if (!q->reqs[i].busy) {
-            q->reqs[i].busy = true;
-            req = &q->reqs[i];
-            break;
-        }
-    }
-    /* We have checked inflight and need_kick while holding q->lock, so one
-     * free req must be available. */
-    assert(req);
+
+    req = &q->reqs[q->free_req_head];
+    q->free_req_head = req->free_req_next;
+    req->free_req_next = -1;
+
     qemu_mutex_unlock(&q->lock);
     return req;
 }

+/* With q->lock */
+static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
+{
+    req->free_req_next = q->free_req_head;
+    q->free_req_head = req - q->reqs;
+}
+
+/* With q->lock */
+static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
+{
+    if (!qemu_co_queue_empty(&q->free_req_queue)) {
+        replay_bh_schedule_oneshot_event(s->aio_context,
+                                         nvme_free_req_queue_cb, q);
+    }
+}
+
+/* Insert a request in the freelist and wake waiters */
+static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q,
+                                       NVMeRequest *req)
+{
+    qemu_mutex_lock(&q->lock);
+    nvme_put_free_req_locked(q, req);
+    nvme_wake_free_req_locked(s, q);
+    qemu_mutex_unlock(&q->lock);
+}
+
 static inline int nvme_translate_error(const NvmeCqe *c)
 {
     uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         req = *preq;
         assert(req.cid == cid);
         assert(req.cb);
-        preq->busy = false;
+        nvme_put_free_req_locked(q, preq);
         preq->cb = preq->opaque = NULL;
         qemu_mutex_unlock(&q->lock);
         req.cb(req.opaque, ret);
@@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         /* Notify the device so it can post more completions. */
         smp_mb_release();
         *q->cq.doorbell = cpu_to_le32(q->cq.head);
-        if (!qemu_co_queue_empty(&q->free_req_queue)) {
-            replay_bh_schedule_oneshot_event(s->aio_context,
-                                             nvme_free_req_queue_cb, q);
-        }
+        nvme_wake_free_req_locked(s, q);
     }
     q->busy = false;
     return progress;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
     qemu_co_mutex_unlock(&s->dma_map_lock);
     if (r) {
-        req->busy = false;
+        nvme_put_free_req_and_wake(s, ioq, req);
         return r;
     }
     nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
     qemu_co_mutex_unlock(&s->dma_map_lock);

     if (ret) {
-        req->busy = false;
+        nvme_put_free_req_and_wake(s, ioq, req);
         goto out;
     }

--
2.26.2

liburing does not clear sqe->user_data. We must do it ourselves to avoid
undefined behavior in process_cqe() when user_data is used.

Note that fdmon-io_uring is currently disabled, so this is a latent bug
that does not affect users. Let's merge this fix now to make it easier
to enable fdmon-io_uring in the future (and I'm working on that).

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230426212639.82310-1-stefanha@redhat.com>
---
 util/fdmon-io_uring.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
index XXXXXXX..XXXXXXX 100644
--- a/util/fdmon-io_uring.c
+++ b/util/fdmon-io_uring.c
@@ -XXX,XX +XXX,XX @@ static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
 #else
     io_uring_prep_poll_remove(sqe, node);
 #endif
+    io_uring_sqe_set_data(sqe, NULL);
 }

 /* Add a timeout that self-cancels when another cqe becomes ready */
@@ -XXX,XX +XXX,XX @@ static void add_timeout_sqe(AioContext *ctx, int64_t ns)

     sqe = get_sqe(ctx);
     io_uring_prep_timeout(sqe, &ts, 1, 0);
+    io_uring_sqe_set_data(sqe, NULL);
 }

 /* Add sqes from ctx->submit_list for submission */
--
2.41.0
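
The rule behind the fdmon-io_uring fix above, shown standalone with
liburing: always set user_data (even to NULL) on every sqe you prepare,
so the value read back from the cqe is well-defined. A minimal program
(build with -luring; error handling kept deliberately thin):

    #include <liburing.h>
    #include <stdio.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct __kernel_timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 };

        if (io_uring_queue_init(8, &ring, 0) < 0) {
            return 1;
        }

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_timeout(sqe, &ts, 0, 0);
        io_uring_sqe_set_data(sqe, NULL);  /* don't leave stale user_data */

        io_uring_submit(&ring);
        if (io_uring_wait_cqe(&ring, &cqe) == 0) {
            /* NULL tells us this cqe carries no per-request state */
            printf("user_data: %p\n", io_uring_cqe_get_data(cqe));
            io_uring_cqe_seen(&ring, cqe);
        }

        io_uring_queue_exit(&ring);
        return 0;
    }
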
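The index-based O(1) freelist from the NVMeRequest patch above also
distills nicely: free slots form a singly linked list threaded through
the request array itself, with -1 as the end marker, so both allocation
and release are constant time. Illustrative code, not the QEMU driver:

    #include <assert.h>

    #define NUM_REQS 4

    typedef struct {
        int free_req_next;   /* reqs[] index of the next free request, or -1 */
        int payload;
    } Request;

    typedef struct {
        Request reqs[NUM_REQS];
        int free_req_head;   /* index of the first free request, or -1 */
    } Queue;

    static void queue_init(Queue *q)
    {
        q->free_req_head = -1;
        for (int i = 0; i < NUM_REQS; i++) {         /* push every slot */
            q->reqs[i].free_req_next = q->free_req_head;
            q->free_req_head = i;
        }
    }

    static Request *get_free_req(Queue *q)           /* O(1) pop */
    {
        if (q->free_req_head == -1) {
            return NULL;                             /* caller must wait */
        }
        Request *req = &q->reqs[q->free_req_head];
        q->free_req_head = req->free_req_next;
        req->free_req_next = -1;
        return req;
    }

    static void put_free_req(Queue *q, Request *req) /* O(1) push */
    {
        req->free_req_next = q->free_req_head;
        q->free_req_head = (int)(req - q->reqs);
    }

    int main(void)
    {
        Queue q;
        queue_init(&q);
        Request *a = get_free_req(&q);
        Request *b = get_free_req(&q);
        assert(a && b && a != b);
        put_free_req(&q, a);
        assert(get_free_req(&q) == a);               /* LIFO reuse */
        return 0;
    }

Storing indices rather than pointers keeps the nodes inside the
preallocated array, which is why the real patch needs no extra memory
beyond one int per request and one int per queue.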