The following changes since commit 813bac3d8d70d85cb7835f7945eb9eed84c2d8d0:

  Merge tag '2023q3-bsd-user-pull-request' of https://gitlab.com/bsdimp/qemu into staging (2023-08-29 08:58:00 -0400)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 87ec6f55af38e29be5b2b65a8acf84da73e06d06:

  aio-posix: zero out io_uring sqe user_data (2023-08-30 07:39:59 -0400)

----------------------------------------------------------------
Pull request

v3:
- Drop UFS emulation due to CI failures
- Add "aio-posix: zero out io_uring sqe user_data"

----------------------------------------------------------------

Andrey Drobyshev (3):
  block: add subcluster_size field to BlockDriverInfo
  block/io: align requests to subcluster_size
  tests/qemu-iotests/197: add testcase for CoR with subclusters

Fabiano Rosas (1):
  block-migration: Ensure we don't crash during migration cleanup

Stefan Hajnoczi (1):
  aio-posix: zero out io_uring sqe user_data

 include/block/block-common.h |  5 ++++
 include/block/block-io.h     |  8 +++---
 block.c                      |  7 +++++
 block/io.c                   | 50 ++++++++++++++++++------------------
 block/mirror.c               |  8 +++---
 block/qcow2.c                |  1 +
 migration/block.c            | 11 ++++++--
 util/fdmon-io_uring.c        |  2 ++
 tests/qemu-iotests/197       | 29 +++++++++++++++++++++
 tests/qemu-iotests/197.out   | 24 +++++++++++++++++
 10 files changed, 110 insertions(+), 35 deletions(-)

--
2.41.0
From: Fabiano Rosas <farosas@suse.de>

We can fail the blk_insert_bs() at init_blk_migration(), leaving the
BlkMigDevState without a dirty_bitmap and BlockDriverState. Account
for the possibly missing elements when doing cleanup.

Fix the following crashes:

Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault.
0x0000555555ec83ef in bdrv_release_dirty_bitmap (bitmap=0x0) at ../block/dirty-bitmap.c:359
359         BlockDriverState *bs = bitmap->bs;
#0  0x0000555555ec83ef in bdrv_release_dirty_bitmap (bitmap=0x0) at ../block/dirty-bitmap.c:359
#1  0x0000555555bba331 in unset_dirty_tracking () at ../migration/block.c:371
#2  0x0000555555bbad98 in block_migration_cleanup_bmds () at ../migration/block.c:681

Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault.
0x0000555555e971ff in bdrv_op_unblock (bs=0x0, op=BLOCK_OP_TYPE_BACKUP_SOURCE, reason=0x0) at ../block.c:7073
7073        QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
#0  0x0000555555e971ff in bdrv_op_unblock (bs=0x0, op=BLOCK_OP_TYPE_BACKUP_SOURCE, reason=0x0) at ../block.c:7073
#1  0x0000555555e9734a in bdrv_op_unblock_all (bs=0x0, reason=0x0) at ../block.c:7095
#2  0x0000555555bbae13 in block_migration_cleanup_bmds () at ../migration/block.c:690

Signed-off-by: Fabiano Rosas <farosas@suse.de>
Message-id: 20230731203338.27581-1-farosas@suse.de
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 migration/block.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/migration/block.c b/migration/block.c
index XXXXXXX..XXXXXXX 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -XXX,XX +XXX,XX @@ static void unset_dirty_tracking(void)
     BlkMigDevState *bmds;
 
     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
+        if (bmds->dirty_bitmap) {
+            bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
+        }
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static int64_t get_remaining_dirty(void)
 static void block_migration_cleanup_bmds(void)
 {
     BlkMigDevState *bmds;
+    BlockDriverState *bs;
     AioContext *ctx;
 
     unset_dirty_tracking();
 
     while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
-        bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker);
+
+        bs = blk_bs(bmds->blk);
+        if (bs) {
+            bdrv_op_unblock_all(bs, bmds->blocker);
+        }
         error_free(bmds->blocker);
 
         /* Save ctx, because bmds->blk can disappear during blk_unref. */
--
2.41.0
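The fix above is an instance of a general rule: a teardown path must
tolerate whatever state a failed setup path can leave behind. A minimal
standalone sketch of the same guard-before-release idea, using
hypothetical stand-in types (DevState and friends) rather than the real
QEMU structures:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the QEMU objects involved, not the real API. */
typedef struct { int id; } DirtyBitmap;
typedef struct { int id; } BlockDriverState;

typedef struct {
    DirtyBitmap *dirty_bitmap;  /* stays NULL if dirty tracking never started */
    BlockDriverState *bs;       /* stays NULL if attaching the device failed */
} DevState;

/* Guard every member that a failed setup may have left unset, just as the
 * patch does for bmds->dirty_bitmap and blk_bs(bmds->blk). */
static void dev_cleanup(DevState *dev)
{
    if (dev->dirty_bitmap) {
        free(dev->dirty_bitmap);
        dev->dirty_bitmap = NULL;
    }
    if (dev->bs) {
        free(dev->bs);
        dev->bs = NULL;
    }
}

int main(void)
{
    DevState dev = {0};  /* setup failed before either field was assigned */
    dev_cleanup(&dev);   /* must not dereference NULL */
    puts("cleanup survived partial initialization");
    return 0;
}

Resetting the pointers after freeing also makes the cleanup idempotent,
which matters when it is reachable from more than one error path.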
From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>

This is going to be used in the subsequent commit as the request
alignment (in particular, during copy-on-read). This value only makes
sense for formats which support subclusters (currently QCOW2 only). If
this field isn't set by the driver's own bdrv_get_info() implementation,
we simply set it equal to the cluster size, thus treating each cluster
as having a single subcluster.

Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230711172553.234055-2-andrey.drobyshev@virtuozzo.com>
---
 include/block/block-common.h | 5 +++++
 block.c                      | 7 +++++++
 block/qcow2.c                | 1 +
 3 files changed, 13 insertions(+)

diff --git a/include/block/block-common.h b/include/block/block-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -XXX,XX +XXX,XX @@ typedef struct BlockZoneWps {
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
+    /*
+     * A fraction of cluster_size, if supported (currently QCOW2 only); if
+     * disabled or unsupported, set equal to cluster_size.
+     */
+    int subcluster_size;
     /* offset at which the VM state can be saved (0 if not possible) */
     int64_t vm_state_offset;
     bool is_dirty;
diff --git a/block.c b/block.c
index XXXXXXX..XXXXXXX 100644
--- a/block.c
+++ b/block.c
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
     }
     memset(bdi, 0, sizeof(*bdi));
     ret = drv->bdrv_co_get_info(bs, bdi);
+    if (bdi->subcluster_size == 0) {
+        /*
+         * If the driver left this unset, subclusters are not supported.
+         * Then it is safe to treat each cluster as having only one subcluster.
+         */
+        bdi->subcluster_size = bdi->cluster_size;
+    }
     if (ret < 0) {
         return ret;
     }
diff --git a/block/qcow2.c b/block/qcow2.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -XXX,XX +XXX,XX @@ qcow2_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
     BDRVQcow2State *s = bs->opaque;
     bdi->cluster_size = s->cluster_size;
+    bdi->subcluster_size = s->subcluster_size;
     bdi->vm_state_offset = qcow2_vm_state_offset(s);
     bdi->is_dirty = s->incompatible_features & QCOW2_INCOMPAT_DIRTY;
     return 0;
--
2.41.0
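The defaulting rule added to bdrv_co_get_info() can be illustrated with
a small self-contained sketch. The Info struct below is a stand-in, not
the real BlockDriverInfo; the 32-subclusters-per-cluster figure for
qcow2 with extended_l2=on is the format's fixed layout, so a 64K cluster
yields 2K subclusters:

#include <assert.h>
#include <stdio.h>

/* Stand-in for the two BlockDriverInfo fields used here. */
typedef struct {
    int cluster_size;
    int subcluster_size;  /* 0 means the driver did not report it */
} Info;

/* Mirrors the fallback added to bdrv_co_get_info(): if the driver knows
 * nothing about subclusters, treat each cluster as one subcluster. */
static void fill_info(Info *bdi, int cluster_size, int reported_subcluster)
{
    bdi->cluster_size = cluster_size;
    bdi->subcluster_size = reported_subcluster;
    if (bdi->subcluster_size == 0) {
        bdi->subcluster_size = bdi->cluster_size;
    }
}

int main(void)
{
    Info plain, ext_l2;

    /* A format with no subcluster support reports 0. */
    fill_info(&plain, 64 * 1024, 0);
    assert(plain.subcluster_size == 64 * 1024);

    /* qcow2 with extended_l2=on: 32 subclusters per cluster. */
    fill_info(&ext_l2, 64 * 1024, (64 * 1024) / 32);
    assert(ext_l2.subcluster_size == 2 * 1024);

    printf("plain: %d, extended_l2: %d\n",
           plain.subcluster_size, ext_l2.subcluster_size);
    return 0;
}

With this default in place, callers such as the rounding helper in the
next patch never need to special-case drivers that predate the field.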
From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>

When the target image is using subclusters and we align the request
during copy-on-read, it makes sense to align to subcluster_size rather
than cluster_size. Otherwise we end up with unnecessary allocations.

This commit renames bdrv_round_to_clusters() to
bdrv_round_to_subclusters() and uses the subcluster_size field of
BlockDriverInfo to make the necessary alignments. It affects
copy-on-read as well as the mirror job (which was using
bdrv_round_to_clusters()).

This change also fixes the following bug with a failing assert (covered
by the test in the subsequent commit):

qemu-img create -f qcow2 base.qcow2 64K
qemu-img create -f qcow2 -o extended_l2=on,backing_file=base.qcow2,backing_fmt=qcow2 img.qcow2 64K
qemu-io -c "write -P 0xaa 0 2K" img.qcow2
qemu-io -C -c "read -P 0x00 2K 62K" img.qcow2

qemu-io: ../block/io.c:1236: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed.

Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230711172553.234055-3-andrey.drobyshev@virtuozzo.com>
---
 include/block/block-io.h |  8 +++---
 block/io.c               | 50 ++++++++++++++++++------------------
 block/mirror.c           |  8 +++---
 3 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -XXX,XX +XXX,XX @@ bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
                                           Error **errp);
 BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs);
-void bdrv_round_to_clusters(BlockDriverState *bs,
-                            int64_t offset, int64_t bytes,
-                            int64_t *cluster_offset,
-                            int64_t *cluster_bytes);
+void bdrv_round_to_subclusters(BlockDriverState *bs,
+                               int64_t offset, int64_t bytes,
+                               int64_t *cluster_offset,
+                               int64_t *cluster_bytes);
 
 void bdrv_get_backing_filename(BlockDriverState *bs,
                                char *filename, int filename_size);
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
 }
 
 /**
- * Round a region to cluster boundaries
+ * Round a region to subcluster (if supported) or cluster boundaries
  */
 void coroutine_fn GRAPH_RDLOCK
-bdrv_round_to_clusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
-                       int64_t *cluster_offset, int64_t *cluster_bytes)
+bdrv_round_to_subclusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
+                          int64_t *align_offset, int64_t *align_bytes)
 {
     BlockDriverInfo bdi;
     IO_CODE();
-    if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
-        *cluster_offset = offset;
-        *cluster_bytes = bytes;
+    if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.subcluster_size == 0) {
+        *align_offset = offset;
+        *align_bytes = bytes;
     } else {
-        int64_t c = bdi.cluster_size;
-        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
-        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
+        int64_t c = bdi.subcluster_size;
+        *align_offset = QEMU_ALIGN_DOWN(offset, c);
+        *align_bytes = QEMU_ALIGN_UP(offset - *align_offset + bytes, c);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
     void *bounce_buffer = NULL;
 
     BlockDriver *drv = bs->drv;
-    int64_t cluster_offset;
-    int64_t cluster_bytes;
+    int64_t align_offset;
+    int64_t align_bytes;
     int64_t skip_bytes;
     int ret;
     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
      * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
      * is one reason we loop rather than doing it all at once.
      */
-    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
-    skip_bytes = offset - cluster_offset;
+    bdrv_round_to_subclusters(bs, offset, bytes, &align_offset, &align_bytes);
+    skip_bytes = offset - align_offset;
 
     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
-                                   cluster_offset, cluster_bytes);
+                                   align_offset, align_bytes);
 
-    while (cluster_bytes) {
+    while (align_bytes) {
         int64_t pnum;
 
         if (skip_write) {
             ret = 1; /* "already allocated", so nothing will be copied */
-            pnum = MIN(cluster_bytes, max_transfer);
+            pnum = MIN(align_bytes, max_transfer);
         } else {
-            ret = bdrv_is_allocated(bs, cluster_offset,
-                                    MIN(cluster_bytes, max_transfer), &pnum);
+            ret = bdrv_is_allocated(bs, align_offset,
+                                    MIN(align_bytes, max_transfer), &pnum);
             if (ret < 0) {
                 /*
                  * Safe to treat errors in querying allocation as if
                  * unallocated; we'll probably fail again soon on the
                  * read, but at least that will set a decent errno.
                  */
-                pnum = MIN(cluster_bytes, max_transfer);
+                pnum = MIN(align_bytes, max_transfer);
             }
 
             /* Stop at EOF if the image ends in the middle of the cluster */
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
             /* Must copy-on-read; use the bounce buffer */
             pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
             if (!bounce_buffer) {
-                int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
+                int64_t max_we_need = MAX(pnum, align_bytes - pnum);
                 int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
                 int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);
 
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
             }
             qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
 
-            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
+            ret = bdrv_driver_preadv(bs, align_offset, pnum,
                                      &local_qiov, 0, 0);
             if (ret < 0) {
                 goto err;
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
                 /* FIXME: Should we (perhaps conditionally) be setting
                  * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                  * that still correctly reads as zero? */
-                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
+                ret = bdrv_co_do_pwrite_zeroes(bs, align_offset, pnum,
                                                BDRV_REQ_WRITE_UNCHANGED);
             } else {
                 /* This does not change the data on the disk, it is not
                  * necessary to flush even in cache=writethrough mode.
                  */
-                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
+                ret = bdrv_driver_pwritev(bs, align_offset, pnum,
                                           &local_qiov, 0,
                                           BDRV_REQ_WRITE_UNCHANGED);
             }
@@ -XXX,XX +XXX,XX @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes,
             }
         }
 
-        cluster_offset += pnum;
-        cluster_bytes -= pnum;
+        align_offset += pnum;
+        align_bytes -= pnum;
         progress += pnum - skip_bytes;
         skip_bytes = 0;
     }
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
     need_cow |= !test_bit((*offset + *bytes - 1) / s->granularity,
                           s->cow_bitmap);
     if (need_cow) {
-        bdrv_round_to_clusters(blk_bs(s->target), *offset, *bytes,
-                               &align_offset, &align_bytes);
+        bdrv_round_to_subclusters(blk_bs(s->target), *offset, *bytes,
+                                  &align_offset, &align_bytes);
     }
 
     if (align_bytes > max_bytes) {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
         int64_t target_offset;
         int64_t target_bytes;
         WITH_GRAPH_RDLOCK_GUARD() {
-            bdrv_round_to_clusters(blk_bs(s->target), offset, io_bytes,
-                                   &target_offset, &target_bytes);
+            bdrv_round_to_subclusters(blk_bs(s->target), offset, io_bytes,
+                                      &target_offset, &target_bytes);
         }
         if (target_offset == offset &&
             target_bytes == io_bytes) {
--
2.41.0
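Plugging the reproducer's request (a 62K copy-on-read starting at
offset 2K) into the rounding arithmetic shows the difference. Below is
a standalone sketch with local macros standing in for
QEMU_ALIGN_DOWN()/QEMU_ALIGN_UP(). With 64K (cluster) rounding the
window is widened to [0, 64K) and skip_bytes becomes 2048, the size of
the already-allocated head, which lines up with the quoted
`skip_bytes < pnum' failure; with 2K (subcluster) rounding skip_bytes
stays 0:

#include <inttypes.h>
#include <stdio.h>

/* Local equivalents of QEMU's alignment macros. */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
#define ALIGN_UP(n, m)   ALIGN_DOWN((n) + (m) - 1, (m))

/* Same arithmetic as bdrv_round_to_subclusters() in the patch above. */
static void round_request(int64_t offset, int64_t bytes, int64_t c,
                          int64_t *align_offset, int64_t *align_bytes)
{
    *align_offset = ALIGN_DOWN(offset, c);
    *align_bytes = ALIGN_UP(offset - *align_offset + bytes, c);
}

int main(void)
{
    int64_t offset = 2 * 1024;   /* the reproducer reads 62K at 2K */
    int64_t bytes = 62 * 1024;
    int64_t ao, ab;

    round_request(offset, bytes, 64 * 1024, &ao, &ab);
    printf("cluster rounding:    [%" PRId64 ", +%" PRId64 "), skip_bytes=%" PRId64 "\n",
           ao, ab, offset - ao);   /* [0, +65536), skip_bytes=2048 */

    round_request(offset, bytes, 2 * 1024, &ao, &ab);
    printf("subcluster rounding: [%" PRId64 ", +%" PRId64 "), skip_bytes=%" PRId64 "\n",
           ao, ab, offset - ao);   /* [2048, +63488), skip_bytes=0 */
    return 0;
}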
Deleted patch | |||
---|---|---|---|
1 | nvme_process_completion() explicitly checks cid so the assertion that | ||
2 | follows is always true: | ||
3 | 1 | ||
4 | if (cid == 0 || cid > NVME_QUEUE_SIZE) { | ||
5 | ... | ||
6 | continue; | ||
7 | } | ||
8 | assert(cid <= NVME_QUEUE_SIZE); | ||
9 | |||
10 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
11 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
12 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
13 | Message-id: 20200617132201.1832152-3-stefanha@redhat.com | ||
14 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
15 | --- | ||
16 | block/nvme.c | 1 - | ||
17 | 1 file changed, 1 deletion(-) | ||
18 | |||
19 | diff --git a/block/nvme.c b/block/nvme.c | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/block/nvme.c | ||
22 | +++ b/block/nvme.c | ||
23 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
24 | cid); | ||
25 | continue; | ||
26 | } | ||
27 | - assert(cid <= NVME_QUEUE_SIZE); | ||
28 | trace_nvme_complete_command(s, q->index, cid); | ||
29 | preq = &q->reqs[cid - 1]; | ||
30 | req = *preq; | ||
31 | -- | ||
32 | 2.26.2 | ||
Deleted patch | |||
---|---|---|---|
1 | Do not access a CQE after incrementing q->cq.head and releasing q->lock. | ||
2 | It is unlikely that this causes problems in practice but it's a latent | ||
3 | bug. | ||
4 | 1 | ||
5 | The reason why it should be safe at the moment is that completion | ||
6 | processing is not re-entrant and the CQ doorbell isn't written until the | ||
7 | end of nvme_process_completion(). | ||
8 | |||
9 | Make this change now because later patches will make completion | ||
10 | processing re-entrant, invalidating the assumption above. | ||
11 | |||
12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
13 | Reviewed-by: Sergio Lopez <slp@redhat.com> | ||
14 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
15 | Message-id: 20200617132201.1832152-4-stefanha@redhat.com | ||
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
17 | --- | ||
18 | block/nvme.c | 5 ++++- | ||
19 | 1 file changed, 4 insertions(+), 1 deletion(-) | ||
20 | |||
21 | diff --git a/block/nvme.c b/block/nvme.c | ||
22 | index XXXXXXX..XXXXXXX 100644 | ||
23 | --- a/block/nvme.c | ||
24 | +++ b/block/nvme.c | ||
25 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
26 | q->busy = true; | ||
27 | assert(q->inflight >= 0); | ||
28 | while (q->inflight) { | ||
29 | + int ret; | ||
30 | int16_t cid; | ||
31 | + | ||
32 | c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES]; | ||
33 | if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) { | ||
34 | break; | ||
35 | } | ||
36 | + ret = nvme_translate_error(c); | ||
37 | q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE; | ||
38 | if (!q->cq.head) { | ||
39 | q->cq_phase = !q->cq_phase; | ||
40 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
41 | preq->busy = false; | ||
42 | preq->cb = preq->opaque = NULL; | ||
43 | qemu_mutex_unlock(&q->lock); | ||
44 | - req.cb(req.opaque, nvme_translate_error(c)); | ||
45 | + req.cb(req.opaque, ret); | ||
46 | qemu_mutex_lock(&q->lock); | ||
47 | q->inflight--; | ||
48 | progress = true; | ||
49 | -- | ||
50 | 2.26.2 | ||
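The pattern above generalizes to any ring buffer with a single consumer: copy everything you need out of a slot before publishing the new head, because once the new head is visible the slot may be overwritten. A minimal sketch under that assumption (illustrative C, not the QEMU code):

    #include <stdint.h>

    #define RING_SIZE 128

    struct ring_slot {
        int value;
    };

    struct ring {
        struct ring_slot slots[RING_SIZE];
        uint32_t head;
    };

    /*
     * Snapshot the slot before advancing head: after the new head is
     * published, the producer may legitimately reuse slots[old_head],
     * so all later work must operate on the local copy.
     */
    static int ring_consume(struct ring *r)
    {
        struct ring_slot copy = r->slots[r->head];
        r->head = (r->head + 1) % RING_SIZE;
        return copy.value;
    }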
1 | Existing users access free_req_queue under q->lock. Document this. | 1 | From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> |
---|---|---|---|
2 | 2 | ||
3 | Add a testcase which checks that allocations during copy-on-read are | ||
4 | performed on a subcluster basis when subclusters are enabled in the | ||
5 | target image. | ||
6 | |||
7 | Without the previous commit applied, this testcase also triggers the | ||
8 | following assertion, so we check for that as well: | ||
9 | |||
10 | qemu-io: ../block/io.c:1236: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed. | ||
11 | |||
12 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
13 | Reviewed-by: Denis V. Lunev <den@openvz.org> | ||
14 | Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com> | ||
15 | Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru> | ||
3 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
4 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 17 | Message-ID: <20230711172553.234055-4-andrey.drobyshev@virtuozzo.com> |
5 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
6 | Message-id: 20200617132201.1832152-6-stefanha@redhat.com | ||
7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
8 | --- | 18 | --- |
9 | block/nvme.c | 2 +- | 19 | tests/qemu-iotests/197 | 29 +++++++++++++++++++++++++++++ |
10 | 1 file changed, 1 insertion(+), 1 deletion(-) | 20 | tests/qemu-iotests/197.out | 24 ++++++++++++++++++++++++ |
21 | 2 files changed, 53 insertions(+) | ||
11 | 22 | ||
12 | diff --git a/block/nvme.c b/block/nvme.c | 23 | diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197 |
24 | index XXXXXXX..XXXXXXX 100755 | ||
25 | --- a/tests/qemu-iotests/197 | ||
26 | +++ b/tests/qemu-iotests/197 | ||
27 | @@ -XXX,XX +XXX,XX @@ $QEMU_IO -f qcow2 -C -c 'read 0 1024' "$TEST_WRAP" | _filter_qemu_io | ||
28 | $QEMU_IO -f qcow2 -c map "$TEST_WRAP" | ||
29 | _check_test_img | ||
30 | |||
31 | +echo | ||
32 | +echo '=== Copy-on-read with subclusters ===' | ||
33 | +echo | ||
34 | + | ||
35 | +# Create base and top images of 64K (1 cluster) each. Enable subclusters | ||
36 | +# for the top image. | ||
37 | +_make_test_img 64K | ||
38 | +IMGPROTO=file IMGFMT=qcow2 TEST_IMG_FILE="$TEST_WRAP" \ | ||
39 | + _make_test_img --no-opts -o extended_l2=true -F "$IMGFMT" -b "$TEST_IMG" \ | ||
40 | + 64K | _filter_img_create | ||
41 | + | ||
42 | +$QEMU_IO -c "write -P 0xaa 0 64k" "$TEST_IMG" | _filter_qemu_io | ||
43 | + | ||
44 | +# Allocate individual subclusters in the top image, rather than whole clusters | ||
45 | +$QEMU_IO -c "write -P 0xbb 28K 2K" -c "write -P 0xcc 34K 2K" "$TEST_WRAP" \ | ||
46 | + | _filter_qemu_io | ||
47 | + | ||
48 | +# Only 2 subclusters should be allocated in the top image at this point | ||
49 | +$QEMU_IMG map "$TEST_WRAP" | _filter_qemu_img_map | ||
50 | + | ||
51 | +# Actual copy-on-read operation | ||
52 | +$QEMU_IO -C -c "read -P 0xaa 30K 4K" "$TEST_WRAP" | _filter_qemu_io | ||
53 | + | ||
54 | +# And here we should have 4 subclusters allocated right in the middle of the | ||
56 | +# top image. Make sure the rest of the cluster remains unallocated | ||
56 | +$QEMU_IMG map "$TEST_WRAP" | _filter_qemu_img_map | ||
57 | + | ||
58 | +_check_test_img | ||
59 | + | ||
60 | # success, all done | ||
61 | echo '*** done' | ||
62 | status=0 | ||
63 | diff --git a/tests/qemu-iotests/197.out b/tests/qemu-iotests/197.out | ||
13 | index XXXXXXX..XXXXXXX 100644 | 64 | index XXXXXXX..XXXXXXX 100644 |
14 | --- a/block/nvme.c | 65 | --- a/tests/qemu-iotests/197.out |
15 | +++ b/block/nvme.c | 66 | +++ b/tests/qemu-iotests/197.out |
16 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 67 | @@ -XXX,XX +XXX,XX @@ read 1024/1024 bytes at offset 0 |
17 | } NVMeRequest; | 68 | 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) |
18 | 69 | 1 KiB (0x400) bytes allocated at offset 0 bytes (0x0) | |
19 | typedef struct { | 70 | No errors were found on the image. |
20 | - CoQueue free_req_queue; | 71 | + |
21 | QemuMutex lock; | 72 | +=== Copy-on-read with subclusters === |
22 | 73 | + | |
23 | /* Fields protected by BQL */ | 74 | +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536 |
24 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 75 | +Formatting 'TEST_DIR/t.wrap.IMGFMT', fmt=IMGFMT size=65536 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT |
25 | uint8_t *prp_list_pages; | 76 | +wrote 65536/65536 bytes at offset 0 |
26 | 77 | +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | |
27 | /* Fields protected by @lock */ | 78 | +wrote 2048/2048 bytes at offset 28672 |
28 | + CoQueue free_req_queue; | 79 | +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) |
29 | NVMeQueue sq, cq; | 80 | +wrote 2048/2048 bytes at offset 34816 |
30 | int cq_phase; | 81 | +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) |
31 | int free_req_head; | 82 | +Offset Length File |
83 | +0 0x7000 TEST_DIR/t.IMGFMT | ||
84 | +0x7000 0x800 TEST_DIR/t.wrap.IMGFMT | ||
85 | +0x7800 0x1000 TEST_DIR/t.IMGFMT | ||
86 | +0x8800 0x800 TEST_DIR/t.wrap.IMGFMT | ||
87 | +0x9000 0x7000 TEST_DIR/t.IMGFMT | ||
88 | +read 4096/4096 bytes at offset 30720 | ||
89 | +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) | ||
90 | +Offset Length File | ||
91 | +0 0x7000 TEST_DIR/t.IMGFMT | ||
92 | +0x7000 0x2000 TEST_DIR/t.wrap.IMGFMT | ||
93 | +0x9000 0x7000 TEST_DIR/t.IMGFMT | ||
94 | +No errors were found on the image. | ||
95 | *** done | ||
32 | -- | 96 | -- |
33 | 2.26.2 | 97 | 2.41.0 |
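The offsets in the expected map output above follow directly from the subcluster geometry: a 64 KiB cluster with extended_l2=true is split by qcow2 into 32 subclusters of 2 KiB each. A small self-contained sketch of that arithmetic (illustrative C only, not part of the test):

    #include <inttypes.h>
    #include <stdio.h>

    enum {
        CLUSTER_SIZE    = 64 * 1024,
        SUBCLUSTERS     = 32,                        /* extended_l2=true */
        SUBCLUSTER_SIZE = CLUSTER_SIZE / SUBCLUSTERS /* 2 KiB */
    };

    /* Print the range of subclusters touched by a byte range */
    static void subclusters(const char *label, uint64_t offset,
                            uint64_t bytes)
    {
        printf("%-10s -> subclusters %" PRIu64 "..%" PRIu64 "\n", label,
               offset / SUBCLUSTER_SIZE,
               (offset + bytes - 1) / SUBCLUSTER_SIZE);
    }

    int main(void)
    {
        subclusters("0xbb write", 28 * 1024, 2 * 1024); /* 14 (0x7000) */
        subclusters("0xcc write", 34 * 1024, 2 * 1024); /* 17 (0x8800) */
        subclusters("COR read",   30 * 1024, 4 * 1024); /* 15..16 */
        return 0;
    }

Before the read, only subclusters 14 and 17 live in the top image (the 0x7000-0x7800 and 0x8800-0x9000 extents); copy-on-read fills subclusters 15 and 16, which is why the second map shows one merged 0x7000-0x9000 extent while the rest of the cluster still comes from the base image.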
1 | There are three issues with the current NVMeRequest->busy field: | 1 | liburing does not clear sqe->user_data. We must do it ourselves to avoid |
---|---|---|---|
2 | 1. The busy field is accidentally accessed outside q->lock when request | 2 | undefined behavior in process_cqe() when user_data is used. |
3 | submission fails. | ||
4 | 2. Waiters on free_req_queue are not woken when a request is returned | ||
5 | early due to submission failure. | ||
6 | 3. Finding a free request involves scanning all requests. This makes | 5 | that does not affect users. Let's merge this fix now to make it easier
7 | request submission O(n^2). | ||
8 | 3 | ||
9 | Switch to an O(1) freelist that is always accessed under the lock. | 4 | Note that fdmon-io_uring is currently disabled, so this is a latent bug |
10 | 5 | that does not affect users. Let's merge this fix now to make it easier | |
11 | Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and | 6 | to enable fdmon-io_uring in the future (and I'm working on that). |
12 | NVME_NUM_REQS, the number of usable requests. This makes the code | ||
13 | simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind | ||
14 | that one slot is reserved. | ||
15 | 7 | ||
16 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
17 | Reviewed-by: Sergio Lopez <slp@redhat.com> | 9 | Message-ID: <20230426212639.82310-1-stefanha@redhat.com> |
18 | Message-id: 20200617132201.1832152-5-stefanha@redhat.com | ||
19 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
20 | --- | 10 | --- |
21 | block/nvme.c | 81 ++++++++++++++++++++++++++++++++++------------------ | 11 | util/fdmon-io_uring.c | 2 ++ |
22 | 1 file changed, 54 insertions(+), 27 deletions(-) | 12 | 1 file changed, 2 insertions(+) |
23 | 13 | ||
24 | diff --git a/block/nvme.c b/block/nvme.c | 14 | diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c |
25 | index XXXXXXX..XXXXXXX 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/block/nvme.c | 16 | --- a/util/fdmon-io_uring.c |
27 | +++ b/block/nvme.c | 17 | +++ b/util/fdmon-io_uring.c |
28 | @@ -XXX,XX +XXX,XX @@ | 18 | @@ -XXX,XX +XXX,XX @@ static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node) |
29 | #define NVME_QUEUE_SIZE 128 | 19 | #else |
30 | #define NVME_BAR_SIZE 8192 | 20 | io_uring_prep_poll_remove(sqe, node); |
31 | 21 | #endif | |
32 | +/* | 22 | + io_uring_sqe_set_data(sqe, NULL); |
33 | + * We have to leave one slot empty as that is the full queue case where | ||
34 | + * head == tail + 1. | ||
35 | + */ | ||
36 | +#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1) | ||
37 | + | ||
38 | typedef struct { | ||
39 | int32_t head, tail; | ||
40 | uint8_t *queue; | ||
41 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
42 | int cid; | ||
43 | void *prp_list_page; | ||
44 | uint64_t prp_list_iova; | ||
45 | - bool busy; | ||
46 | + int free_req_next; /* q->reqs[] index of next free req */ | ||
47 | } NVMeRequest; | ||
48 | |||
49 | typedef struct { | ||
50 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
51 | /* Fields protected by @lock */ | ||
52 | NVMeQueue sq, cq; | ||
53 | int cq_phase; | ||
54 | - NVMeRequest reqs[NVME_QUEUE_SIZE]; | ||
55 | + int free_req_head; | ||
56 | + NVMeRequest reqs[NVME_NUM_REQS]; | ||
57 | bool busy; | ||
58 | int need_kick; | ||
59 | int inflight; | ||
60 | @@ -XXX,XX +XXX,XX @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, | ||
61 | qemu_mutex_init(&q->lock); | ||
62 | q->index = idx; | ||
63 | qemu_co_queue_init(&q->free_req_queue); | ||
64 | - q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE); | ||
65 | + q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); | ||
66 | r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, | ||
67 | - s->page_size * NVME_QUEUE_SIZE, | ||
68 | + s->page_size * NVME_NUM_REQS, | ||
69 | false, &prp_list_iova); | ||
70 | if (r) { | ||
71 | goto fail; | ||
72 | } | ||
73 | - for (i = 0; i < NVME_QUEUE_SIZE; i++) { | ||
74 | + q->free_req_head = -1; | ||
75 | + for (i = 0; i < NVME_NUM_REQS; i++) { | ||
76 | NVMeRequest *req = &q->reqs[i]; | ||
77 | req->cid = i + 1; | ||
78 | + req->free_req_next = q->free_req_head; | ||
79 | + q->free_req_head = i; | ||
80 | req->prp_list_page = q->prp_list_pages + i * s->page_size; | ||
81 | req->prp_list_iova = prp_list_iova + i * s->page_size; | ||
82 | } | ||
83 | + | ||
84 | nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err); | ||
85 | if (local_err) { | ||
86 | error_propagate(errp, local_err); | ||
87 | @@ -XXX,XX +XXX,XX @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q) | ||
88 | */ | ||
89 | static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) | ||
90 | { | ||
91 | - int i; | ||
92 | - NVMeRequest *req = NULL; | ||
93 | + NVMeRequest *req; | ||
94 | |||
95 | qemu_mutex_lock(&q->lock); | ||
96 | - while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) { | ||
97 | - /* We have to leave one slot empty as that is the full queue case (head | ||
98 | - * == tail + 1). */ | ||
99 | + | ||
100 | + while (q->free_req_head == -1) { | ||
101 | if (qemu_in_coroutine()) { | ||
102 | trace_nvme_free_req_queue_wait(q); | ||
103 | qemu_co_queue_wait(&q->free_req_queue, &q->lock); | ||
104 | @@ -XXX,XX +XXX,XX @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) | ||
105 | return NULL; | ||
106 | } | ||
107 | } | ||
108 | - for (i = 0; i < NVME_QUEUE_SIZE; i++) { | ||
109 | - if (!q->reqs[i].busy) { | ||
110 | - q->reqs[i].busy = true; | ||
111 | - req = &q->reqs[i]; | ||
112 | - break; | ||
113 | - } | ||
114 | - } | ||
115 | - /* We have checked inflight and need_kick while holding q->lock, so one | ||
116 | - * free req must be available. */ | ||
117 | - assert(req); | ||
118 | + | ||
119 | + req = &q->reqs[q->free_req_head]; | ||
120 | + q->free_req_head = req->free_req_next; | ||
121 | + req->free_req_next = -1; | ||
122 | + | ||
123 | qemu_mutex_unlock(&q->lock); | ||
124 | return req; | ||
125 | } | 23 | } |
126 | 24 | ||
127 | +/* With q->lock */ | 25 | /* Add a timeout that self-cancels when another cqe becomes ready */ |
128 | +static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req) | 26 | @@ -XXX,XX +XXX,XX @@ static void add_timeout_sqe(AioContext *ctx, int64_t ns) |
129 | +{ | 27 | |
130 | + req->free_req_next = q->free_req_head; | 28 | sqe = get_sqe(ctx); |
131 | + q->free_req_head = req - q->reqs; | 29 | io_uring_prep_timeout(sqe, &ts, 1, 0); |
132 | +} | 30 | + io_uring_sqe_set_data(sqe, NULL); |
133 | + | 31 | } |
134 | +/* With q->lock */ | 32 | |
135 | +static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q) | 33 | /* Add sqes from ctx->submit_list for submission */ |
136 | +{ | ||
137 | + if (!qemu_co_queue_empty(&q->free_req_queue)) { | ||
138 | + replay_bh_schedule_oneshot_event(s->aio_context, | ||
139 | + nvme_free_req_queue_cb, q); | ||
140 | + } | ||
141 | +} | ||
142 | + | ||
143 | +/* Insert a request in the freelist and wake waiters */ | ||
144 | +static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q, | ||
145 | + NVMeRequest *req) | ||
146 | +{ | ||
147 | + qemu_mutex_lock(&q->lock); | ||
148 | + nvme_put_free_req_locked(q, req); | ||
149 | + nvme_wake_free_req_locked(s, q); | ||
150 | + qemu_mutex_unlock(&q->lock); | ||
151 | +} | ||
152 | + | ||
153 | static inline int nvme_translate_error(const NvmeCqe *c) | ||
154 | { | ||
155 | uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF; | ||
156 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
157 | req = *preq; | ||
158 | assert(req.cid == cid); | ||
159 | assert(req.cb); | ||
160 | - preq->busy = false; | ||
161 | + nvme_put_free_req_locked(q, preq); | ||
162 | preq->cb = preq->opaque = NULL; | ||
163 | qemu_mutex_unlock(&q->lock); | ||
164 | req.cb(req.opaque, ret); | ||
165 | @@ -XXX,XX +XXX,XX @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) | ||
166 | /* Notify the device so it can post more completions. */ | ||
167 | smp_mb_release(); | ||
168 | *q->cq.doorbell = cpu_to_le32(q->cq.head); | ||
169 | - if (!qemu_co_queue_empty(&q->free_req_queue)) { | ||
170 | - replay_bh_schedule_oneshot_event(s->aio_context, | ||
171 | - nvme_free_req_queue_cb, q); | ||
172 | - } | ||
173 | + nvme_wake_free_req_locked(s, q); | ||
174 | } | ||
175 | q->busy = false; | ||
176 | return progress; | ||
177 | @@ -XXX,XX +XXX,XX @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, | ||
178 | r = nvme_cmd_map_qiov(bs, &cmd, req, qiov); | ||
179 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
180 | if (r) { | ||
181 | - req->busy = false; | ||
182 | + nvme_put_free_req_and_wake(s, ioq, req); | ||
183 | return r; | ||
184 | } | ||
185 | nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); | ||
186 | @@ -XXX,XX +XXX,XX @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, | ||
187 | qemu_co_mutex_unlock(&s->dma_map_lock); | ||
188 | |||
189 | if (ret) { | ||
190 | - req->busy = false; | ||
191 | + nvme_put_free_req_and_wake(s, ioq, req); | ||
192 | goto out; | ||
193 | } | ||
194 | |||
195 | -- | 34 | -- |
196 | 2.26.2 | 35 | 2.41.0 |
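The freelist on the left-hand side of this diff is a classic intrusive, index-based free list: every free element stores the index of the next free element, so both allocation and release are O(1) with no scanning. A standalone sketch of the idea (illustrative C, not the QEMU code; locking and the coroutine wakeups are omitted):

    #include <assert.h>

    #define NUM_REQS 127 /* like NVME_NUM_REQS: one queue slot stays empty */

    struct req {
        int free_next; /* index of the next free req, or -1 at list end */
        /* ... request payload would live here ... */
    };

    struct req_pool {
        struct req reqs[NUM_REQS];
        int free_head; /* index of the first free req, or -1 if exhausted */
    };

    static void pool_init(struct req_pool *p)
    {
        p->free_head = -1;
        for (int i = 0; i < NUM_REQS; i++) {
            p->reqs[i].free_next = p->free_head; /* push slot i */
            p->free_head = i;
        }
    }

    /* O(1) allocate: pop the list head; callers must handle exhaustion */
    static struct req *pool_get(struct req_pool *p)
    {
        assert(p->free_head != -1);
        struct req *r = &p->reqs[p->free_head];
        p->free_head = r->free_next;
        r->free_next = -1;
        return r;
    }

    /* O(1) release: push the req back; the index is recovered from the
     * pointer, which is why reqs[] must be one contiguous array */
    static void pool_put(struct req_pool *p, struct req *r)
    {
        r->free_next = p->free_head;
        p->free_head = (int)(r - p->reqs);
    }

In the patch the same push/pop runs under q->lock, and releasing a request additionally wakes coroutines waiting on free_req_queue. The io_uring fix on the right-hand side needs no such machinery: the io_uring_prep_*() helpers leave sqe->user_data untouched, so sqes whose completions should be ignored must clear it explicitly via io_uring_sqe_set_data(sqe, NULL).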