The following changes since commit 8844bb8d896595ee1d25d21c770e6e6f29803097:

  Merge tag 'or1k-pull-request-20230513' of https://github.com/stffrdhrn/qemu into staging (2023-05-13 11:23:14 +0100)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 01562fee5f3ad4506d57dbcf4b1903b565eceec7:

  docs/zoned-storage:add zoned emulation use case (2023-05-15 08:19:04 -0400)

----------------------------------------------------------------
Pull request

This pull request contains Sam Li's zoned storage support in the QEMU block
layer and virtio-blk emulation.

v2:
- Sam fixed the CI failures. CI passes for me now. [Richard]

----------------------------------------------------------------

Sam Li (16):
  block/block-common: add zoned device structs
  block/file-posix: introduce helper functions for sysfs attributes
  block/block-backend: add block layer APIs resembling Linux
    ZonedBlockDevice ioctls
  block/raw-format: add zone operations to pass through requests
  block: add zoned BlockDriver check to block layer
  iotests: test new zone operations
  block: add some trace events for new block layer APIs
  docs/zoned-storage: add zoned device documentation
  file-posix: add tracking of the zone write pointers
  block: introduce zone append write for zoned devices
  qemu-iotests: test zone append operation
  block: add some trace events for zone append
  virtio-blk: add zoned storage emulation for zoned devices
  block: add accounting for zone append operation
  virtio-blk: add some trace events for zoned emulation
  docs/zoned-storage:add zoned emulation use case

 docs/devel/index-api.rst               |   1 +
 docs/devel/zoned-storage.rst           |  62 +++
 qapi/block-core.json                   |  68 ++-
 qapi/block.json                        |   4 +
 meson.build                            |   5 +
 include/block/accounting.h             |   1 +
 include/block/block-common.h           |  57 ++
 include/block/block-io.h               |  13 +
 include/block/block_int-common.h       |  37 ++
 include/block/raw-aio.h                |   8 +-
 include/sysemu/block-backend-io.h      |  27 +
 block.c                                |  19 +
 block/block-backend.c                  | 198 +++++++
 block/file-posix.c                     | 692 +++++++++++++++++++++++--
 block/io.c                             |  68 +++
 block/io_uring.c                       |   4 +
 block/linux-aio.c                      |   3 +
 block/qapi-sysemu.c                    |  11 +
 block/qapi.c                           |  18 +
 block/raw-format.c                     |  26 +
 hw/block/virtio-blk-common.c           |   2 +
 hw/block/virtio-blk.c                  | 405 +++++++++++++++
 hw/virtio/virtio-qmp.c                 |   2 +
 qemu-io-cmds.c                         | 224 ++++++++
 block/trace-events                     |   4 +
 docs/system/qemu-block-drivers.rst.inc |   6 +
 hw/block/trace-events                  |   7 +
 tests/qemu-iotests/227.out             |  18 +
 tests/qemu-iotests/tests/zoned         | 105 ++++
 tests/qemu-iotests/tests/zoned.out     |  69 +++
 30 files changed, 2106 insertions(+), 58 deletions(-)
 create mode 100644 docs/devel/zoned-storage.rst
 create mode 100755 tests/qemu-iotests/tests/zoned
 create mode 100644 tests/qemu-iotests/tests/zoned.out

--
2.40.1

From: Sam Li <faithilikerun@gmail.com>

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20230508045533.175575-2-faithilikerun@gmail.com
Message-id: 20230324090605.28361-2-faithilikerun@gmail.com
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
<philmd@linaro.org>.
--Stefan]
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/block/block-common.h | 43 ++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/include/block/block-common.h b/include/block/block-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -XXX,XX +XXX,XX @@ typedef struct BlockDriver BlockDriver;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildClass BdrvChildClass;
 
+typedef enum BlockZoneOp {
+    BLK_ZO_OPEN,
+    BLK_ZO_CLOSE,
+    BLK_ZO_FINISH,
+    BLK_ZO_RESET,
+} BlockZoneOp;
+
+typedef enum BlockZoneModel {
+    BLK_Z_NONE = 0x0, /* Regular block device */
+    BLK_Z_HM = 0x1, /* Host-managed zoned block device */
+    BLK_Z_HA = 0x2, /* Host-aware zoned block device */
+} BlockZoneModel;
+
+typedef enum BlockZoneState {
+    BLK_ZS_NOT_WP = 0x0,
+    BLK_ZS_EMPTY = 0x1,
+    BLK_ZS_IOPEN = 0x2,
+    BLK_ZS_EOPEN = 0x3,
+    BLK_ZS_CLOSED = 0x4,
+    BLK_ZS_RDONLY = 0xD,
+    BLK_ZS_FULL = 0xE,
+    BLK_ZS_OFFLINE = 0xF,
+} BlockZoneState;
+
+typedef enum BlockZoneType {
+    BLK_ZT_CONV = 0x1, /* Conventional random writes supported */
+    BLK_ZT_SWR = 0x2, /* Sequential writes required */
+    BLK_ZT_SWP = 0x3, /* Sequential writes preferred */
+} BlockZoneType;
+
+/*
+ * Zone descriptor data structure.
+ * Provides information on a zone with all position and size values in bytes.
+ */
+typedef struct BlockZoneDescriptor {
+    uint64_t start;
+    uint64_t length;
+    uint64_t cap;
+    uint64_t wp;
+    BlockZoneType type;
+    BlockZoneState state;
+} BlockZoneDescriptor;
+
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
--
2.40.1

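A note on how these structs are meant to be consumed: a zone report is simply
an array of BlockZoneDescriptor entries. The following is a minimal sketch of
a consumer, assuming the header added by this patch; it is illustrative only
and not code from the series:

    /* Sketch: walk a BlockZoneDescriptor array filled in by a zone report.
     * Assumes the types from include/block/block-common.h above. */
    #include <inttypes.h>
    #include <stdio.h>

    static void summarize_zones(const BlockZoneDescriptor *zones,
                                unsigned int nr_zones)
    {
        for (unsigned int i = 0; i < nr_zones; i++) {
            const BlockZoneDescriptor *z = &zones[i];
            /* All positions and sizes are byte values, per the struct
             * comment. wp - start is the amount written into the zone
             * (not meaningful for BLK_ZS_NOT_WP zones). */
            printf("zone %u: start=%" PRIu64 " cap=%" PRIu64
                   " written=%" PRIu64 " type=%d state=%d\n",
                   i, z->start, z->cap, z->wp - z->start,
                   (int)z->type, (int)z->state);
        }
    }

The hexadecimal values of BlockZoneState and BlockZoneType mirror the zone
condition and zone type codes used by Linux and the ZBC/ZNS specifications,
which keeps the conversion from kernel report structures, added later in this
series, a direct mapping.
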
From: Sam Li <faithilikerun@gmail.com>

Use get_sysfs_str_val() to get the string value of the device's
zoned model. Then get_sysfs_zoned_model() can convert it to QEMU's
BlockZoneModel type.

Use get_sysfs_long_val() to get the long value of a zoned device
attribute.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20230508045533.175575-3-faithilikerun@gmail.com
Message-id: 20230324090605.28361-3-faithilikerun@gmail.com
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
<philmd@linaro.org>.
--Stefan]
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/block/block_int-common.h |   3 +
 block/file-posix.c               | 135 ++++++++++++++++++++++---------
 2 files changed, 100 insertions(+), 38 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -XXX,XX +XXX,XX @@ typedef struct BlockLimits {
      * an explicit monitor command to load the disk inside the guest).
      */
     bool has_variable_length;
+
+    /* device zone model */
+    BlockZoneModel zoned;
 } BlockLimits;
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;
diff --git a/block/file-posix.c b/block/file-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
 #endif
 }
 
-static int hdev_get_max_segments(int fd, struct stat *st)
+/*
+ * Get a sysfs attribute value as character string.
+ */
+#ifdef CONFIG_LINUX
+static int get_sysfs_str_val(struct stat *st, const char *attribute,
+                             char **val) {
+    g_autofree char *sysfspath = NULL;
+    int ret;
+    size_t len;
+
+    if (!S_ISBLK(st->st_mode)) {
+        return -ENOTSUP;
+    }
+
+    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
+                                major(st->st_rdev), minor(st->st_rdev),
+                                attribute);
+    ret = g_file_get_contents(sysfspath, val, &len, NULL);
+    if (ret == -1) {
+        return -ENOENT;
+    }
+
+    /* The file is ended with '\n' */
+    char *p;
+    p = *val;
+    if (*(p + len - 1) == '\n') {
+        *(p + len - 1) = '\0';
+    }
+    return ret;
+}
+#endif
+
+static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
 {
+    g_autofree char *val = NULL;
+    int ret;
+
+    ret = get_sysfs_str_val(st, "zoned", &val);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (strcmp(val, "host-managed") == 0) {
+        *zoned = BLK_Z_HM;
+    } else if (strcmp(val, "host-aware") == 0) {
+        *zoned = BLK_Z_HA;
+    } else if (strcmp(val, "none") == 0) {
+        *zoned = BLK_Z_NONE;
+    } else {
+        return -ENOTSUP;
+    }
+    return 0;
+}
+
+/*
+ * Get a sysfs attribute value as a long integer.
+ */
 #ifdef CONFIG_LINUX
-    char buf[32];
+static long get_sysfs_long_val(struct stat *st, const char *attribute)
+{
+    g_autofree char *str = NULL;
     const char *end;
-    char *sysfspath = NULL;
+    long val;
+    int ret;
+
+    ret = get_sysfs_str_val(st, attribute, &str);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* The file is ended with '\n', pass 'end' to accept that. */
+    ret = qemu_strtol(str, &end, 10, &val);
+    if (ret == 0 && end && *end == '\0') {
+        ret = val;
+    }
+    return ret;
+}
+#endif
+
+static int hdev_get_max_segments(int fd, struct stat *st)
+{
+#ifdef CONFIG_LINUX
     int ret;
-    int sysfd = -1;
-    long max_segments;
 
     if (S_ISCHR(st->st_mode)) {
         if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
@@ -XXX,XX +XXX,XX @@ static int hdev_get_max_segments(int fd, struct stat *st)
         }
         return -ENOTSUP;
     }
-
-    if (!S_ISBLK(st->st_mode)) {
-        return -ENOTSUP;
-    }
-
-    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
-                                major(st->st_rdev), minor(st->st_rdev));
-    sysfd = open(sysfspath, O_RDONLY);
-    if (sysfd == -1) {
-        ret = -errno;
-        goto out;
-    }
-    ret = RETRY_ON_EINTR(read(sysfd, buf, sizeof(buf) - 1));
-    if (ret < 0) {
-        ret = -errno;
-        goto out;
-    } else if (ret == 0) {
-        ret = -EIO;
-        goto out;
-    }
-    buf[ret] = 0;
-    /* The file is ended with '\n', pass 'end' to accept that. */
-    ret = qemu_strtol(buf, &end, 10, &max_segments);
-    if (ret == 0 && end && *end == '\n') {
-        ret = max_segments;
-    }
-
-out:
-    if (sysfd != -1) {
-        close(sysfd);
-    }
-    g_free(sysfspath);
-    return ret;
+    return get_sysfs_long_val(st, "max_segments");
 #else
     return -ENOTSUP;
 #endif
 }
 
+static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
+                                     Error **errp)
+{
+    BlockZoneModel zoned;
+    int ret;
+
+    bs->bl.zoned = BLK_Z_NONE;
+
+    ret = get_sysfs_zoned_model(st, &zoned);
+    if (ret < 0 || zoned == BLK_Z_NONE) {
+        return;
+    }
+    bs->bl.zoned = zoned;
+}
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
@@ -XXX,XX +XXX,XX @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
             bs->bl.max_hw_iov = ret;
         }
     }
+
+    raw_refresh_zoned_limits(bs, &st, errp);
 }
 
 static int check_for_dasd(int fd)
--
2.40.1

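The sysfs layout that these helpers parse is a stable kernel interface
(/sys/dev/block/<major>:<minor>/queue/<attribute>), so the lookup is easy to
reproduce outside QEMU. Here is a standalone sketch of the same idea,
simplified and hypothetical; the in-tree helpers use glib and QEMU's error
conventions instead:

    #include <stdio.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <sys/sysmacros.h>

    /* Read /sys/dev/block/<maj>:<min>/queue/<attr> for a block device node,
     * stripping the trailing newline, like get_sysfs_str_val() does. */
    static int read_queue_attr(const char *dev, const char *attr,
                               char *buf, size_t len)
    {
        struct stat st;
        char path[256];
        FILE *f;

        if (stat(dev, &st) < 0 || !S_ISBLK(st.st_mode)) {
            return -1;
        }
        snprintf(path, sizeof(path), "/sys/dev/block/%u:%u/queue/%s",
                 major(st.st_rdev), minor(st.st_rdev), attr);
        f = fopen(path, "r");
        if (!f) {
            return -1;
        }
        if (!fgets(buf, (int)len, f)) {
            fclose(f);
            return -1;
        }
        fclose(f);
        buf[strcspn(buf, "\n")] = '\0';
        return 0;
    }

    int main(void)
    {
        char model[32];

        /* "/dev/nullb0" is a placeholder; a null_blk device created with
         * zoned=1 reports "host-managed" here, which the patch maps to
         * BLK_Z_HM. */
        if (read_queue_attr("/dev/nullb0", "zoned", model, sizeof(model)) == 0) {
            printf("zoned model: %s\n", model);
        }
        return 0;
    }

Numeric attributes such as max_segments come from the same directory, which is
what get_sysfs_long_val() generalizes; later patches in this series reuse it
for chunk_sectors, nr_zones, and the zone limits.
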
From: Sam Li <faithilikerun@gmail.com>

Add a zoned device option to the host_device BlockDriver. It will be presented
only for zoned host block devices. By adding zone management operations to the
host_block_device BlockDriver, users can use the new block layer APIs
including Report Zone and the zone management operations
(open, close, finish, reset, reset_all).

Qemu-io uses the new APIs to perform zoned storage commands on the device:
zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs),
zone_finish(zf).

For example, to test zone_report, use the following command:
$ ./build/qemu-io --image-opts -n driver=host_device,filename=/dev/nullb0
-c "zrp offset nr_zones"

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20230508045533.175575-4-faithilikerun@gmail.com
Message-id: 20230324090605.28361-4-faithilikerun@gmail.com
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
<philmd@linaro.org> and remove spurious ret = -errno in
raw_co_zone_mgmt().
--Stefan]
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 meson.build                       |   5 +
 include/block/block-io.h          |   9 +
 include/block/block_int-common.h  |  21 ++
 include/block/raw-aio.h           |   6 +-
 include/sysemu/block-backend-io.h |  18 ++
 block/block-backend.c             | 137 +++++++++++++
 block/file-posix.c                | 313 +++++++++++++++++++++++++++++-
 block/io.c                        |  41 ++++
 qemu-io-cmds.c                    | 149 ++++++++++++++
 9 files changed, 696 insertions(+), 3 deletions(-)

diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ if rdma.found()
 endif
 
 # has_header_symbol
+config_host_data.set('CONFIG_BLKZONED',
+                     cc.has_header_symbol('linux/blkzoned.h', 'BLKOPENZONE'))
 config_host_data.set('CONFIG_EPOLL_CREATE1',
                      cc.has_header_symbol('sys/epoll.h', 'epoll_create1'))
 config_host_data.set('CONFIG_FALLOCATE_PUNCH_HOLE',
@@ -XXX,XX +XXX,XX @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID',
 config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM',
                      cc.has_member('struct stat', 'st_atim',
                                    prefix: '#include <sys/stat.h>'))
+config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY',
+                     cc.has_member('struct blk_zone', 'capacity',
+                                   prefix: '#include <linux/blkzoned.h>'))
 
 # has_type
 config_host_data.set('CONFIG_IOVEC',
diff --git a/include/block/block-io.h b/include/block/block-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -XXX,XX +XXX,XX @@ int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs);
 int coroutine_fn GRAPH_RDLOCK bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
                                                int64_t bytes);
 
+/* Report zone information of zone block device. */
+int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_report(BlockDriverState *bs,
+                                                  int64_t offset,
+                                                  unsigned int *nr_zones,
+                                                  BlockZoneDescriptor *zones);
+int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_mgmt(BlockDriverState *bs,
+                                                BlockZoneOp op,
+                                                int64_t offset, int64_t len);
+
 bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
 int bdrv_block_status(BlockDriverState *bs, int64_t offset,
                       int64_t bytes, int64_t *pnum, int64_t *map,
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
     int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_load_vmstate)(
         BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 
+    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
+            int64_t offset, unsigned int *nr_zones,
+            BlockZoneDescriptor *zones);
+    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
+            int64_t offset, int64_t len);
+
     /* removable device specific */
     bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_is_inserted)(
         BlockDriverState *bs);
@@ -XXX,XX +XXX,XX @@ typedef struct BlockLimits {
 
     /* device zone model */
     BlockZoneModel zoned;
+
+    /* zone size expressed in bytes */
+    uint32_t zone_size;
+
+    /* total number of zones */
+    uint32_t nr_zones;
+
+    /* maximum sectors of a zone append write operation */
+    uint32_t max_append_sectors;
+
+    /* maximum number of open zones */
+    uint32_t max_open_zones;
+
+    /* maximum number of active zones */
+    uint32_t max_active_zones;
 } BlockLimits;
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -XXX,XX +XXX,XX @@
 #define QEMU_AIO_WRITE_ZEROES 0x0020
 #define QEMU_AIO_COPY_RANGE   0x0040
 #define QEMU_AIO_TRUNCATE     0x0080
+#define QEMU_AIO_ZONE_REPORT  0x0100
+#define QEMU_AIO_ZONE_MGMT    0x0200
 #define QEMU_AIO_TYPE_MASK \
         (QEMU_AIO_READ | \
          QEMU_AIO_WRITE | \
@@ -XXX,XX +XXX,XX @@
          QEMU_AIO_DISCARD | \
          QEMU_AIO_WRITE_ZEROES | \
          QEMU_AIO_COPY_RANGE | \
-         QEMU_AIO_TRUNCATE)
+         QEMU_AIO_TRUNCATE | \
+         QEMU_AIO_ZONE_REPORT | \
+         QEMU_AIO_ZONE_MGMT)
 
 /* AIO flags */
 #define QEMU_AIO_MISALIGNED 0x1000
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
                             BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
                           BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
+                                unsigned int *nr_zones,
+                                BlockZoneDescriptor *zones,
+                                BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                              int64_t offset, int64_t len,
+                              BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
                              BlockCompletionFunc *cb, void *opaque);
 void blk_aio_cancel_async(BlockAIOCB *acb);
@@ -XXX,XX +XXX,XX @@ int co_wrapper_mixed blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
 int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
                                       int64_t bytes, BdrvRequestFlags flags);
 
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    unsigned int *nr_zones,
+                                    BlockZoneDescriptor *zones);
+int co_wrapper_mixed blk_zone_report(BlockBackend *blk, int64_t offset,
+                                     unsigned int *nr_zones,
+                                     BlockZoneDescriptor *zones);
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                                  int64_t offset, int64_t len);
+int co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                                   int64_t offset, int64_t len);
+
 int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset,
                                   int64_t bytes);
 int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
     return ret;
 }
 
+static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
+                                   (unsigned int*)(uintptr_t)acb->bytes,
+                                   rwco->iobuf);
+    blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
+                                unsigned int *nr_zones,
+                                BlockZoneDescriptor *zones,
+                                BlockCompletionFunc *cb, void *opaque)
+{
+    BlkAioEmAIOCB *acb;
+    Coroutine *co;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    acb->rwco = (BlkRwCo) {
+        .blk = blk,
+        .offset = offset,
+        .iobuf = zones,
+        .ret = NOT_DONE,
+    };
+    acb->bytes = (int64_t)(uintptr_t)nr_zones,
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
+    aio_co_enter(blk_get_aio_context(blk), co);
+
+    acb->has_returned = true;
+    if (acb->rwco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
+static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *rwco = &acb->rwco;
+
+    rwco->ret = blk_co_zone_mgmt(rwco->blk,
+                                 (BlockZoneOp)(uintptr_t)rwco->iobuf,
+                                 rwco->offset, acb->bytes);
+    blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                              int64_t offset, int64_t len,
+                              BlockCompletionFunc *cb, void *opaque) {
+    BlkAioEmAIOCB *acb;
+    Coroutine *co;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    acb->rwco = (BlkRwCo) {
+        .blk = blk,
+        .offset = offset,
+        .iobuf = (void *)(uintptr_t)op,
+        .ret = NOT_DONE,
+    };
+    acb->bytes = len;
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
+    aio_co_enter(blk_get_aio_context(blk), co);
+
+    acb->has_returned = true;
+    if (acb->rwco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
+/*
+ * Send a zone_report command.
+ * offset is a byte offset from the start of the device. No alignment
+ * required for offset.
+ * nr_zones represents IN maximum and OUT actual.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    unsigned int *nr_zones,
+                                    BlockZoneDescriptor *zones)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk); /* increase before waiting */
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+    if (!blk_is_available(blk)) {
+        blk_dec_in_flight(blk);
+        return -ENOMEDIUM;
+    }
+    ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
+/*
+ * Send a zone_management command.
+ * op is the zone operation;
+ * offset is the byte offset from the start of the zoned device;
+ * len is the maximum number of bytes the command should operate on. It
+ * should be aligned with the device zone size.
+ */
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+                                  int64_t offset, int64_t len)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    ret = blk_check_byte_request(blk, offset, len);
+    if (ret < 0) {
+        blk_dec_in_flight(blk);
+        return ret;
+    }
+
+    ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@
 #include <sys/param.h>
 #include <sys/syscall.h>
 #include <sys/vfs.h>
+#if defined(CONFIG_BLKZONED)
+#include <linux/blkzoned.h>
+#endif
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <linux/fs.h>
@@ -XXX,XX +XXX,XX @@ typedef struct RawPosixAIOData {
             PreallocMode prealloc;
             Error **errp;
         } truncate;
+        struct {
+            unsigned int *nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report;
+        struct {
+            unsigned long op;
+        } zone_mgmt;
     };
 } RawPosixAIOData;
 
@@ -XXX,XX +XXX,XX @@ static int get_sysfs_str_val(struct stat *st, const char *attribute,
 }
 #endif
 
+#if defined(CONFIG_BLKZONED)
 static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
 {
     g_autofree char *val = NULL;
@@ -XXX,XX +XXX,XX @@ static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
     }
     return 0;
 }
+#endif /* defined(CONFIG_BLKZONED) */
 
 /*
  * Get a sysfs attribute value as a long integer.
@@ -XXX,XX +XXX,XX @@ static int hdev_get_max_segments(int fd, struct stat *st)
 #endif
 }
 
+#if defined(CONFIG_BLKZONED)
 static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
                                      Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
         return;
     }
     bs->bl.zoned = zoned;
+
+    ret = get_sysfs_long_val(st, "max_open_zones");
+    if (ret >= 0) {
+        bs->bl.max_open_zones = ret;
+    }
+
+    ret = get_sysfs_long_val(st, "max_active_zones");
+    if (ret >= 0) {
+        bs->bl.max_active_zones = ret;
+    }
+
+    /*
+     * The zoned device must at least have zone size and nr_zones fields.
+     */
+    ret = get_sysfs_long_val(st, "chunk_sectors");
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Unable to read chunk_sectors "
+                                     "sysfs attribute");
+        return;
+    } else if (!ret) {
+        error_setg(errp, "Read 0 from chunk_sectors sysfs attribute");
+        return;
+    }
+    bs->bl.zone_size = ret << BDRV_SECTOR_BITS;
+
+    ret = get_sysfs_long_val(st, "nr_zones");
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Unable to read nr_zones "
+                                     "sysfs attribute");
+        return;
+    } else if (!ret) {
+        error_setg(errp, "Read 0 from nr_zones sysfs attribute");
+        return;
+    }
+    bs->bl.nr_zones = ret;
+
+    ret = get_sysfs_long_val(st, "zone_append_max_bytes");
+    if (ret > 0) {
+        bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
+    }
 }
+#else /* !defined(CONFIG_BLKZONED) */
+static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
+                                     Error **errp)
+{
+    bs->bl.zoned = BLK_Z_NONE;
+}
+#endif /* !defined(CONFIG_BLKZONED) */
 
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
     BDRVRawState *s = bs->opaque;
     int ret;
 
-    /* If DASD, get blocksizes */
+    /* If DASD or zoned devices, get blocksizes */
     if (check_for_dasd(s->fd) < 0) {
-        return -ENOTSUP;
+        /* zoned devices are not DASD */
+        if (bs->bl.zoned == BLK_Z_NONE) {
+            return -ENOTSUP;
+        }
     }
     ret = probe_logical_blocksize(s->fd, &bsz->log);
     if (ret < 0) {
@@ -XXX,XX +XXX,XX @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
 }
 #endif
 
+/*
+ * parse_zone - Fill a zone descriptor
+ */
+#if defined(CONFIG_BLKZONED)
+static inline int parse_zone(struct BlockZoneDescriptor *zone,
+                             const struct blk_zone *blkz) {
+    zone->start = blkz->start << BDRV_SECTOR_BITS;
+    zone->length = blkz->len << BDRV_SECTOR_BITS;
+    zone->wp = blkz->wp << BDRV_SECTOR_BITS;
+
+#ifdef HAVE_BLK_ZONE_REP_CAPACITY
+    zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
+#else
+    zone->cap = blkz->len << BDRV_SECTOR_BITS;
+#endif
+
+    switch (blkz->type) {
+    case BLK_ZONE_TYPE_SEQWRITE_REQ:
+        zone->type = BLK_ZT_SWR;
+        break;
+    case BLK_ZONE_TYPE_SEQWRITE_PREF:
+        zone->type = BLK_ZT_SWP;
+        break;
+    case BLK_ZONE_TYPE_CONVENTIONAL:
+        zone->type = BLK_ZT_CONV;
+        break;
+    default:
+        error_report("Unsupported zone type: 0x%x", blkz->type);
+        return -ENOTSUP;
+    }
+
+    switch (blkz->cond) {
+    case BLK_ZONE_COND_NOT_WP:
+        zone->state = BLK_ZS_NOT_WP;
+        break;
+    case BLK_ZONE_COND_EMPTY:
+        zone->state = BLK_ZS_EMPTY;
+        break;
+    case BLK_ZONE_COND_IMP_OPEN:
+        zone->state = BLK_ZS_IOPEN;
+        break;
+    case BLK_ZONE_COND_EXP_OPEN:
+        zone->state = BLK_ZS_EOPEN;
+        break;
+    case BLK_ZONE_COND_CLOSED:
+        zone->state = BLK_ZS_CLOSED;
+        break;
+    case BLK_ZONE_COND_READONLY:
+        zone->state = BLK_ZS_RDONLY;
+        break;
+    case BLK_ZONE_COND_FULL:
+        zone->state = BLK_ZS_FULL;
+        break;
+    case BLK_ZONE_COND_OFFLINE:
+        zone->state = BLK_ZS_OFFLINE;
+        break;
+    default:
+        error_report("Unsupported zone state: 0x%x", blkz->cond);
+        return -ENOTSUP;
+    }
+    return 0;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int handle_aiocb_zone_report(void *opaque)
+{
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    unsigned int *nr_zones = aiocb->zone_report.nr_zones;
+    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
+    /* zoned block devices use 512-byte sectors */
+    uint64_t sector = aiocb->aio_offset / 512;
+
+    struct blk_zone *blkz;
+    size_t rep_size;
+    unsigned int nrz;
+    int ret;
+    unsigned int n = 0, i = 0;
+
+    nrz = *nr_zones;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+    rep = g_malloc(rep_size);
+
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+                         fd, sector, errno);
+            return -errno;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            ret = parse_zone(&zones[n], &blkz[i]);
+            if (ret != 0) {
+                return ret;
+            }
+
+            /* The next report should start after the last zone reported */
+            sector = blkz[i].start + blkz[i].len;
+        }
+    }
+
+    *nr_zones = n;
+    return 0;
+}
+#endif
+
+#if defined(CONFIG_BLKZONED)
+static int handle_aiocb_zone_mgmt(void *opaque)
+{
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    uint64_t sector = aiocb->aio_offset / 512;
+    int64_t nr_sectors = aiocb->aio_nbytes / 512;
+    struct blk_zone_range range;
+    int ret;
+
+    /* Execute the operation */
+    range.sector = sector;
+    range.nr_sectors = nr_sectors;
+    do {
+        ret = ioctl(fd, aiocb->zone_mgmt.op, &range);
+    } while (ret != 0 && errno == EINTR);
+
+    return ret;
+}
+#endif
+
 static int handle_aiocb_copy_range(void *opaque)
 {
     RawPosixAIOData *aiocb = opaque;
@@ -XXX,XX +XXX,XX @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
     }
 }
 
+/*
+ * zone report - Get a zone block device's information in the form
+ * of an array of zone descriptors.
+ * zones is an array of zone descriptors to hold zone information on reply;
+ * offset can be any byte within the entire size of the device;
+ * nr_zones is the maximum number of zones the command should operate on.
+ */
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+                                           unsigned int *nr_zones,
+                                           BlockZoneDescriptor *zones) {
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb = (RawPosixAIOData) {
+        .bs = bs,
+        .aio_fildes = s->fd,
+        .aio_type = QEMU_AIO_ZONE_REPORT,
+        .aio_offset = offset,
+        .zone_report = {
+            .nr_zones = nr_zones,
+            .zones = zones,
+        },
+    };
+
+    return raw_thread_pool_submit(handle_aiocb_zone_report, &acb);
+}
+#endif
+
+/*
+ * zone management operations - Execute an operation on a zone
+ */
+#if defined(CONFIG_BLKZONED)
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+                                         int64_t offset, int64_t len) {
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+    int64_t zone_size, zone_size_mask;
+    const char *op_name;
+    unsigned long zo;
+    int ret;
+    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
+
+    zone_size = bs->bl.zone_size;
+    zone_size_mask = zone_size - 1;
+    if (offset & zone_size_mask) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId64 "", offset / 512, zone_size / 512);
+        return -EINVAL;
+    }
+
+    if (((offset + len) < capacity && len & zone_size_mask) ||
+        offset + len > capacity) {
+        error_report("number of sectors %" PRId64 " is not aligned to zone size"
+                     " %" PRId64 "", len / 512, zone_size / 512);
+        return -EINVAL;
+    }
+
+    switch (op) {
+    case BLK_ZO_OPEN:
+        op_name = "BLKOPENZONE";
+        zo = BLKOPENZONE;
+        break;
+    case BLK_ZO_CLOSE:
+        op_name = "BLKCLOSEZONE";
+        zo = BLKCLOSEZONE;
+        break;
+    case BLK_ZO_FINISH:
+        op_name = "BLKFINISHZONE";
+        zo = BLKFINISHZONE;
+        break;
+    case BLK_ZO_RESET:
+        op_name = "BLKRESETZONE";
+        zo = BLKRESETZONE;
+        break;
+    default:
+        error_report("Unsupported zone op: 0x%x", op);
+        return -ENOTSUP;
+    }
+
+    acb = (RawPosixAIOData) {
+        .bs = bs,
+        .aio_fildes = s->fd,
+        .aio_type = QEMU_AIO_ZONE_MGMT,
+        .aio_offset = offset,
+        .aio_nbytes = len,
+        .zone_mgmt = {
+            .op = zo,
+        },
+    };
+
+    ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
+    if (ret != 0) {
+        error_report("ioctl %s failed %d", op_name, ret);
+    }
+
+    return ret;
+}
+#endif
+
 static coroutine_fn int
 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
                 bool blkdev)
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = {
 #ifdef __linux__
     .bdrv_co_ioctl = hdev_co_ioctl,
 #endif
+
+    /* zoned device */
+#if defined(CONFIG_BLKZONED)
+    /* zone management operations */
+    .bdrv_co_zone_report = raw_co_zone_report,
+    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+#endif
 };
 
 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ out:
     return co.ret;
 }
 
+int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
+                                     unsigned int *nr_zones,
+                                     BlockZoneDescriptor *zones)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_zone_report || bs->bl.zoned == BLK_Z_NONE) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+    co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
+int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+                                   int64_t offset, int64_t len)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    bdrv_inc_in_flight(bs);
+    if (!drv || !drv->bdrv_co_zone_mgmt || bs->bl.zoned == BLK_Z_NONE) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+    co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
+out:
+    bdrv_dec_in_flight(bs);
+    return co.ret;
+}
+
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
262
{
771
{
263
- aio_set_fd_handler(bdrv_get_aio_context(bs),
772
IO_CODE();
264
- nbd_get_client_session(bs)->sioc->fd,
773
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
265
- false, NULL, NULL, NULL, NULL);
266
+ NBDClientSession *client = nbd_get_client_session(bs);
267
+ qio_channel_detach_aio_context(QIO_CHANNEL(client->sioc));
268
}
269
270
void nbd_client_attach_aio_context(BlockDriverState *bs,
271
AioContext *new_context)
272
{
273
- aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sioc->fd,
274
- false, nbd_reply_ready, NULL, NULL, bs);
275
+ NBDClientSession *client = nbd_get_client_session(bs);
276
+ qio_channel_attach_aio_context(QIO_CHANNEL(client->sioc), new_context);
277
+ aio_co_schedule(new_context, client->read_reply_co);
278
}
279
280
void nbd_client_close(BlockDriverState *bs)
281
@@ -XXX,XX +XXX,XX @@ int nbd_client_init(BlockDriverState *bs,
282
/* Now that we're connected, set the socket to be non-blocking and
283
* kick the reply mechanism. */
284
qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL);
285
-
286
+ client->read_reply_co = qemu_coroutine_create(nbd_read_reply_entry, client);
287
nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs));
288
289
logout("Established connection with NBD server\n");
290
diff --git a/nbd/client.c b/nbd/client.c
291
index XXXXXXX..XXXXXXX 100644
774
index XXXXXXX..XXXXXXX 100644
292
--- a/nbd/client.c
775
--- a/qemu-io-cmds.c
293
+++ b/nbd/client.c
776
+++ b/qemu-io-cmds.c
294
@@ -XXX,XX +XXX,XX @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply)
777
@@ -XXX,XX +XXX,XX @@ static const cmdinfo_t flush_cmd = {
295
ssize_t ret;
778
.oneline = "flush all in-core file state to disk",
296
779
};
297
ret = read_sync(ioc, buf, sizeof(buf));
780
298
- if (ret < 0) {
781
+static inline int64_t tosector(int64_t bytes)
299
+ if (ret <= 0) {
782
+{
300
return ret;
783
+ return bytes >> BDRV_SECTOR_BITS;
301
}
784
+}
302
785
+
303
diff --git a/nbd/common.c b/nbd/common.c
786
+static int zone_report_f(BlockBackend *blk, int argc, char **argv)
304
index XXXXXXX..XXXXXXX 100644
787
+{
305
--- a/nbd/common.c
788
+ int ret;
306
+++ b/nbd/common.c
789
+ int64_t offset;
307
@@ -XXX,XX +XXX,XX @@ ssize_t nbd_wr_syncv(QIOChannel *ioc,
790
+ unsigned int nr_zones;
308
}
791
+
309
if (len == QIO_CHANNEL_ERR_BLOCK) {
792
+ ++optind;
310
if (qemu_in_coroutine()) {
793
+ offset = cvtnum(argv[optind]);
311
- /* XXX figure out if we can create a variant on
794
+ ++optind;
312
- * qio_channel_yield() that works with AIO contexts
795
+ nr_zones = cvtnum(argv[optind]);
313
- * and consider using that in this branch */
796
+
314
- qemu_coroutine_yield();
797
+ g_autofree BlockZoneDescriptor *zones = NULL;
315
- } else if (done) {
798
+ zones = g_new(BlockZoneDescriptor, nr_zones);
316
- /* XXX this is needed by nbd_reply_ready. */
799
+ ret = blk_zone_report(blk, offset, &nr_zones, zones);
317
- qio_channel_wait(ioc,
800
+ if (ret < 0) {
318
- do_read ? G_IO_IN : G_IO_OUT);
801
+ printf("zone report failed: %s\n", strerror(-ret));
319
+ qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT);
802
+ } else {
320
} else {
803
+ for (int i = 0; i < nr_zones; ++i) {
321
return -EAGAIN;
804
+ printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", "
322
}
805
+ "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", "
323
diff --git a/nbd/server.c b/nbd/server.c
806
+ "zcond:%u, [type: %u]\n",
324
index XXXXXXX..XXXXXXX 100644
807
+ tosector(zones[i].start), tosector(zones[i].length),
325
--- a/nbd/server.c
808
+ tosector(zones[i].cap), tosector(zones[i].wp),
326
+++ b/nbd/server.c
809
+ zones[i].state, zones[i].type);
327
@@ -XXX,XX +XXX,XX @@ struct NBDClient {
328
CoMutex send_lock;
329
Coroutine *send_coroutine;
330
331
- bool can_read;
332
-
333
QTAILQ_ENTRY(NBDClient) next;
334
int nb_requests;
335
bool closing;
336
@@ -XXX,XX +XXX,XX @@ struct NBDClient {
337
338
/* That's all folks */
339
340
-static void nbd_set_handlers(NBDClient *client);
341
-static void nbd_unset_handlers(NBDClient *client);
342
-static void nbd_update_can_read(NBDClient *client);
343
+static void nbd_client_receive_next_request(NBDClient *client);
344
345
static gboolean nbd_negotiate_continue(QIOChannel *ioc,
346
GIOCondition condition,
347
@@ -XXX,XX +XXX,XX @@ void nbd_client_put(NBDClient *client)
348
*/
349
assert(client->closing);
350
351
- nbd_unset_handlers(client);
352
+ qio_channel_detach_aio_context(client->ioc);
353
object_unref(OBJECT(client->sioc));
354
object_unref(OBJECT(client->ioc));
355
if (client->tlscreds) {
356
@@ -XXX,XX +XXX,XX @@ static NBDRequestData *nbd_request_get(NBDClient *client)
357
358
assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
359
client->nb_requests++;
360
- nbd_update_can_read(client);
361
362
req = g_new0(NBDRequestData, 1);
363
nbd_client_get(client);
364
@@ -XXX,XX +XXX,XX @@ static void nbd_request_put(NBDRequestData *req)
365
g_free(req);
366
367
client->nb_requests--;
368
- nbd_update_can_read(client);
369
+ nbd_client_receive_next_request(client);
370
+
371
nbd_client_put(client);
372
}
373
374
@@ -XXX,XX +XXX,XX @@ static void blk_aio_attached(AioContext *ctx, void *opaque)
375
exp->ctx = ctx;
376
377
QTAILQ_FOREACH(client, &exp->clients, next) {
378
- nbd_set_handlers(client);
379
+ qio_channel_attach_aio_context(client->ioc, ctx);
380
+ if (client->recv_coroutine) {
381
+ aio_co_schedule(ctx, client->recv_coroutine);
382
+ }
810
+ }
383
+ if (client->send_coroutine) {
811
+ }
384
+ aio_co_schedule(ctx, client->send_coroutine);
812
+ return ret;
385
+ }
813
+}
386
}
814
+
387
}
815
+static const cmdinfo_t zone_report_cmd = {
388
816
+ .name = "zone_report",
389
@@ -XXX,XX +XXX,XX @@ static void blk_aio_detach(void *opaque)
817
+ .altname = "zrp",
390
TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);
818
+ .cfunc = zone_report_f,
391
819
+ .argmin = 2,
392
QTAILQ_FOREACH(client, &exp->clients, next) {
820
+ .argmax = 2,
393
- nbd_unset_handlers(client);
821
+ .args = "offset number",
394
+ qio_channel_detach_aio_context(client->ioc);
822
+ .oneline = "report zone information",
395
}
823
+};
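
For illustration, once these commands are registered the new operations can
be exercised from qemu-io (hypothetical session; /dev/nullb0 is a zoned
null_blk device, set up exactly as in the iotest added later in this series):

    $ qemu-io --image-opts -n driver=host_device,filename=/dev/nullb0 \
          -c "zrp 0 1" -c "zo 0 268435456" -c "zrp 0 1"
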
396
824
+
397
exp->ctx = NULL;
825
+static int zone_open_f(BlockBackend *blk, int argc, char **argv)
398
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply,
826
+{
399
g_assert(qemu_in_coroutine());
827
+ int ret;
400
qemu_co_mutex_lock(&client->send_lock);
828
+ int64_t offset, len;
401
client->send_coroutine = qemu_coroutine_self();
829
+ ++optind;
402
- nbd_set_handlers(client);
830
+ offset = cvtnum(argv[optind]);
403
831
+ ++optind;
404
if (!len) {
832
+ len = cvtnum(argv[optind]);
405
rc = nbd_send_reply(client->ioc, reply);
833
+ ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len);
406
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply,
834
+ if (ret < 0) {
407
}
835
+ printf("zone open failed: %s\n", strerror(-ret));
408
836
+ }
409
client->send_coroutine = NULL;
837
+ return ret;
410
- nbd_set_handlers(client);
838
+}
411
qemu_co_mutex_unlock(&client->send_lock);
839
+
412
return rc;
840
+static const cmdinfo_t zone_open_cmd = {
413
}
841
+ .name = "zone_open",
414
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req,
842
+ .altname = "zo",
415
ssize_t rc;
843
+ .cfunc = zone_open_f,
416
844
+ .argmin = 2,
417
g_assert(qemu_in_coroutine());
845
+ .argmax = 2,
418
- client->recv_coroutine = qemu_coroutine_self();
846
+ .args = "offset len",
419
- nbd_update_can_read(client);
847
+ .oneline = "explicit open a range of zones in zone block device",
420
-
848
+};
421
+ assert(client->recv_coroutine == qemu_coroutine_self());
849
+
422
rc = nbd_receive_request(client->ioc, request);
850
+static int zone_close_f(BlockBackend *blk, int argc, char **argv)
423
if (rc < 0) {
851
+{
424
if (rc != -EAGAIN) {
852
+ int ret;
425
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req,
853
+ int64_t offset, len;
426
854
+ ++optind;
427
out:
855
+ offset = cvtnum(argv[optind]);
428
client->recv_coroutine = NULL;
856
+ ++optind;
429
- nbd_update_can_read(client);
857
+ len = cvtnum(argv[optind]);
430
+ nbd_client_receive_next_request(client);
858
+ ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len);
431
859
+ if (ret < 0) {
432
return rc;
860
+ printf("zone close failed: %s\n", strerror(-ret));
433
}
861
+ }
434
862
+ return ret;
435
-static void nbd_trip(void *opaque)
863
+}
436
+/* Owns a reference to the NBDClient passed as opaque. */
864
+
437
+static coroutine_fn void nbd_trip(void *opaque)
865
+static const cmdinfo_t zone_close_cmd = {
438
{
866
+ .name = "zone_close",
439
NBDClient *client = opaque;
867
+ .altname = "zc",
440
NBDExport *exp = client->exp;
868
+ .cfunc = zone_close_f,
441
NBDRequestData *req;
869
+ .argmin = 2,
442
- NBDRequest request;
870
+ .argmax = 2,
443
+ NBDRequest request = { 0 }; /* GCC thinks it can be used uninitialized */
871
+ .args = "offset len",
444
NBDReply reply;
872
+ .oneline = "close a range of zones in zone block device",
445
ssize_t ret;
873
+};
446
int flags;
874
+
447
875
+static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
448
TRACE("Reading request.");
876
+{
449
if (client->closing) {
877
+ int ret;
450
+ nbd_client_put(client);
878
+ int64_t offset, len;
451
return;
879
+ ++optind;
452
}
880
+ offset = cvtnum(argv[optind]);
453
881
+ ++optind;
454
@@ -XXX,XX +XXX,XX @@ static void nbd_trip(void *opaque)
882
+ len = cvtnum(argv[optind]);
455
883
+ ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len);
456
done:
884
+ if (ret < 0) {
457
nbd_request_put(req);
885
+ printf("zone finish failed: %s\n", strerror(-ret));
458
+ nbd_client_put(client);
886
+ }
459
return;
887
+ return ret;
460
888
+}
461
out:
889
+
462
nbd_request_put(req);
890
+static const cmdinfo_t zone_finish_cmd = {
463
client_close(client);
891
+ .name = "zone_finish",
464
+ nbd_client_put(client);
892
+ .altname = "zf",
465
}
893
+ .cfunc = zone_finish_f,
466
894
+ .argmin = 2,
467
-static void nbd_read(void *opaque)
895
+ .argmax = 2,
468
+static void nbd_client_receive_next_request(NBDClient *client)
896
+ .args = "offset len",
469
{
897
+ .oneline = "finish a range of zones in zone block device",
470
- NBDClient *client = opaque;
898
+};
471
-
899
+
472
- if (client->recv_coroutine) {
900
+static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
473
- qemu_coroutine_enter(client->recv_coroutine);
901
+{
474
- } else {
902
+ int ret;
475
- qemu_coroutine_enter(qemu_coroutine_create(nbd_trip, client));
903
+ int64_t offset, len;
476
- }
904
+ ++optind;
477
-}
905
+ offset = cvtnum(argv[optind]);
478
-
906
+ ++optind;
479
-static void nbd_restart_write(void *opaque)
907
+ len = cvtnum(argv[optind]);
480
-{
908
+ ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len);
481
- NBDClient *client = opaque;
909
+ if (ret < 0) {
482
-
910
+ printf("zone reset failed: %s\n", strerror(-ret));
483
- qemu_coroutine_enter(client->send_coroutine);
911
+ }
484
-}
912
+ return ret;
485
-
913
+}
486
-static void nbd_set_handlers(NBDClient *client)
914
+
487
-{
915
+static const cmdinfo_t zone_reset_cmd = {
488
- if (client->exp && client->exp->ctx) {
916
+ .name = "zone_reset",
489
- aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true,
917
+ .altname = "zrs",
490
- client->can_read ? nbd_read : NULL,
918
+ .cfunc = zone_reset_f,
491
- client->send_coroutine ? nbd_restart_write : NULL,
919
+ .argmin = 2,
492
- NULL, client);
920
+ .argmax = 2,
493
- }
921
+ .args = "offset len",
494
-}
922
+ .oneline = "reset a zone write pointer in zone block device",
495
-
923
+};
496
-static void nbd_unset_handlers(NBDClient *client)
924
+
497
-{
925
static int truncate_f(BlockBackend *blk, int argc, char **argv);
498
- if (client->exp && client->exp->ctx) {
926
static const cmdinfo_t truncate_cmd = {
499
- aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true, NULL,
927
.name = "truncate",
500
- NULL, NULL, NULL);
928
@@ -XXX,XX +XXX,XX @@ static void __attribute((constructor)) init_qemuio_commands(void)
501
- }
929
qemuio_add_command(&aio_write_cmd);
502
-}
930
qemuio_add_command(&aio_flush_cmd);
503
-
931
qemuio_add_command(&flush_cmd);
504
-static void nbd_update_can_read(NBDClient *client)
932
+ qemuio_add_command(&zone_report_cmd);
505
-{
933
+ qemuio_add_command(&zone_open_cmd);
506
- bool can_read = client->recv_coroutine ||
934
+ qemuio_add_command(&zone_close_cmd);
507
- client->nb_requests < MAX_NBD_REQUESTS;
935
+ qemuio_add_command(&zone_finish_cmd);
508
-
936
+ qemuio_add_command(&zone_reset_cmd);
509
- if (can_read != client->can_read) {
937
qemuio_add_command(&truncate_cmd);
510
- client->can_read = can_read;
938
qemuio_add_command(&length_cmd);
511
- nbd_set_handlers(client);
939
qemuio_add_command(&info_cmd);
512
-
513
- /* There is no need to invoke aio_notify(), since aio_set_fd_handler()
514
- * in nbd_set_handlers() will have taken care of that */
515
+ if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) {
516
+ nbd_client_get(client);
517
+ client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
518
+ aio_co_schedule(client->exp->ctx, client->recv_coroutine);
519
}
520
}
521
522
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void nbd_co_client_start(void *opaque)
523
goto out;
524
}
525
qemu_co_mutex_init(&client->send_lock);
526
- nbd_set_handlers(client);
527
528
if (exp) {
529
QTAILQ_INSERT_TAIL(&exp->clients, client, next);
530
}
531
+
532
+ nbd_client_receive_next_request(client);
533
+
534
out:
535
g_free(data);
536
}
537
@@ -XXX,XX +XXX,XX @@ void nbd_client_new(NBDExport *exp,
538
object_ref(OBJECT(client->sioc));
539
client->ioc = QIO_CHANNEL(sioc);
540
object_ref(OBJECT(client->ioc));
541
- client->can_read = true;
542
client->close = close_fn;
543
544
data->client = client;
545
--
2.9.3

--
2.40.1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
This uses the lock-free mutex described in the paper '"Blocking without
3
raw-format driver usually sits on top of file-posix driver. It needs to
4
Locking", or LFTHREADS: A lock-free thread library' by Gidenstam and
4
pass through requests of zone commands.
5
Papatriantafilou. The same technique is used in OSv, and in fact
6
the code is essentially a conversion to C of OSv's code.
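
As a sketch (not part of the patch), the point of the conversion is that a
CoMutex can now be shared by coroutines running in different AioContexts;
the fast path is a single atomic fetch-and-increment:

    static CoMutex comutex;
    static int shared;

    static void coroutine_fn worker(void *opaque)
    {
        qemu_co_mutex_lock(&comutex);   /* uncontended: one atomic inc */
        shared++;                       /* critical section */
        qemu_co_mutex_unlock(&comutex); /* may hand off to a waiter */
    }

This mirrors what the new tests/test-aio-multithread.c testcase does.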
7
5
8
[Added missing coroutine_fn in tests/test-aio-multithread.c.
6
Signed-off-by: Sam Li <faithilikerun@gmail.com>
7
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
8
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
9
Reviewed-by: Hannes Reinecke <hare@suse.de>
10
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
11
Acked-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Message-id: 20230508045533.175575-5-faithilikerun@gmail.com
14
Message-id: 20230324090605.28361-5-faithilikerun@gmail.com
15
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
16
<philmd@linaro.org>.
9
--Stefan]
17
--Stefan]
10
11
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
12
Reviewed-by: Fam Zheng <famz@redhat.com>
13
Message-id: 20170213181244.16297-2-pbonzini@redhat.com
14
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
18
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
15
---
19
---
16
include/qemu/coroutine.h | 17 ++++-
20
block/raw-format.c | 17 +++++++++++++++++
17
tests/test-aio-multithread.c | 86 ++++++++++++++++++++++++
21
1 file changed, 17 insertions(+)
18
util/qemu-coroutine-lock.c | 155 ++++++++++++++++++++++++++++++++++++++++---
19
util/trace-events | 1 +
20
4 files changed, 246 insertions(+), 13 deletions(-)
21
22
22
diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
23
diff --git a/block/raw-format.c b/block/raw-format.c
23
index XXXXXXX..XXXXXXX 100644
24
index XXXXXXX..XXXXXXX 100644
24
--- a/include/qemu/coroutine.h
25
--- a/block/raw-format.c
25
+++ b/include/qemu/coroutine.h
26
+++ b/block/raw-format.c
26
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue);
27
@@ -XXX,XX +XXX,XX @@ raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
27
/**
28
return bdrv_co_pdiscard(bs->file, offset, bytes);
28
* Provides a mutex that can be used to synchronise coroutines
29
*/
30
+struct CoWaitRecord;
31
typedef struct CoMutex {
32
- bool locked;
33
+ /* Count of pending lockers; 0 for a free mutex, 1 for an
34
+ * uncontended mutex.
35
+ */
36
+ unsigned locked;
37
+
38
+ /* A queue of waiters. Elements are added atomically in front of
39
+ * from_push. to_pop is only populated, and popped from, by whoever
40
+ * is in charge of the next wakeup. This can be an unlocker or,
41
+ * through the handoff protocol, a locker that is about to go to sleep.
42
+ */
43
+ QSLIST_HEAD(, CoWaitRecord) from_push, to_pop;
44
+
45
+ unsigned handoff, sequence;
46
+
47
Coroutine *holder;
48
- CoQueue queue;
49
} CoMutex;
50
51
/**
52
diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
53
index XXXXXXX..XXXXXXX 100644
54
--- a/tests/test-aio-multithread.c
55
+++ b/tests/test-aio-multithread.c
56
@@ -XXX,XX +XXX,XX @@ static void test_multi_co_schedule_10(void)
57
test_multi_co_schedule(10);
58
}
29
}
59
30
60
+/* CoMutex thread-safety. */
31
+static int coroutine_fn GRAPH_RDLOCK
61
+
32
+raw_co_zone_report(BlockDriverState *bs, int64_t offset,
62
+static uint32_t atomic_counter;
33
+ unsigned int *nr_zones,
63
+static uint32_t running;
34
+ BlockZoneDescriptor *zones)
64
+static uint32_t counter;
65
+static CoMutex comutex;
66
+
67
+static void coroutine_fn test_multi_co_mutex_entry(void *opaque)
68
+{
35
+{
69
+ while (!atomic_mb_read(&now_stopping)) {
36
+ return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
70
+ qemu_co_mutex_lock(&comutex);
71
+ counter++;
72
+ qemu_co_mutex_unlock(&comutex);
73
+
74
+ /* Increase atomic_counter *after* releasing the mutex. Otherwise
75
+ * there is a chance (it happens about 1 in 3 runs) that the iothread
76
+ * exits before the coroutine is woken up, causing a spurious
77
+ * assertion failure.
78
+ */
79
+ atomic_inc(&atomic_counter);
80
+ }
81
+ atomic_dec(&running);
82
+}
37
+}
83
+
38
+
84
+static void test_multi_co_mutex(int threads, int seconds)
39
+static int coroutine_fn GRAPH_RDLOCK
40
+raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
41
+ int64_t offset, int64_t len)
85
+{
42
+{
86
+ int i;
43
+ return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
87
+
88
+ qemu_co_mutex_init(&comutex);
89
+ counter = 0;
90
+ atomic_counter = 0;
91
+ now_stopping = false;
92
+
93
+ create_aio_contexts();
94
+ assert(threads <= NUM_CONTEXTS);
95
+ running = threads;
96
+ for (i = 0; i < threads; i++) {
97
+ Coroutine *co1 = qemu_coroutine_create(test_multi_co_mutex_entry, NULL);
98
+ aio_co_schedule(ctx[i], co1);
99
+ }
100
+
101
+ g_usleep(seconds * 1000000);
102
+
103
+ atomic_mb_set(&now_stopping, true);
104
+ while (running > 0) {
105
+ g_usleep(100000);
106
+ }
107
+
108
+ join_aio_contexts();
109
+ g_test_message("%d iterations/second\n", counter / seconds);
110
+ g_assert_cmpint(counter, ==, atomic_counter);
111
+}
44
+}
112
+
45
+
113
+/* Testing with NUM_CONTEXTS threads focuses on the queue. The mutex however
46
static int64_t coroutine_fn GRAPH_RDLOCK
114
+ * is too contended (and the threads spend too much time in aio_poll)
47
raw_co_getlength(BlockDriverState *bs)
115
+ * to actually stress the handoff protocol.
116
+ */
117
+static void test_multi_co_mutex_1(void)
118
+{
119
+ test_multi_co_mutex(NUM_CONTEXTS, 1);
120
+}
121
+
122
+static void test_multi_co_mutex_10(void)
123
+{
124
+ test_multi_co_mutex(NUM_CONTEXTS, 10);
125
+}
126
+
127
+/* Testing with fewer threads stresses the handoff protocol too. Still, the
128
+ * case where the locker _can_ pick up a handoff is very rare, happening
129
+ * about 10 times in 1 million, so increase the runtime a bit compared to
130
+ * other "quick" testcases that only run for 1 second.
131
+ */
132
+static void test_multi_co_mutex_2_3(void)
133
+{
134
+ test_multi_co_mutex(2, 3);
135
+}
136
+
137
+static void test_multi_co_mutex_2_30(void)
138
+{
139
+ test_multi_co_mutex(2, 30);
140
+}
141
+
142
/* End of tests. */
143
144
int main(int argc, char **argv)
145
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
146
g_test_add_func("/aio/multi/lifecycle", test_lifecycle);
147
if (g_test_quick()) {
148
g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
149
+ g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1);
150
+ g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3);
151
} else {
152
g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
153
+ g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10);
154
+ g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30);
155
}
156
return g_test_run();
157
}
158
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
159
index XXXXXXX..XXXXXXX 100644
160
--- a/util/qemu-coroutine-lock.c
161
+++ b/util/qemu-coroutine-lock.c
162
@@ -XXX,XX +XXX,XX @@
163
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
164
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
165
* THE SOFTWARE.
166
+ *
167
+ * The lock-free mutex implementation is based on OSv
168
+ * (core/lfmutex.cc, include/lockfree/mutex.hh).
169
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
170
*/
171
172
#include "qemu/osdep.h"
173
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue)
174
return QSIMPLEQ_FIRST(&queue->entries) == NULL;
175
}
176
177
+/* The wait records are handled with a multiple-producer, single-consumer
178
+ * lock-free queue. There cannot be two concurrent pop_waiter() calls
179
+ * because pop_waiter() can only be called while mutex->handoff is zero.
180
+ * This can happen in three cases:
181
+ * - in qemu_co_mutex_unlock, before the hand-off protocol has started.
182
+ * In this case, qemu_co_mutex_lock will see mutex->handoff == 0 and
183
+ * not take part in the handoff.
184
+ * - in qemu_co_mutex_lock, if it steals the hand-off responsibility from
185
+ * qemu_co_mutex_unlock. In this case, qemu_co_mutex_unlock will fail
186
+ * the cmpxchg (it will see either 0 or the next sequence value) and
187
+ * exit. The next hand-off cannot begin until qemu_co_mutex_lock has
188
+ * woken up someone.
189
+ * - in qemu_co_mutex_unlock, if it takes the hand-off token itself.
190
+ * In this case another iteration starts with mutex->handoff == 0;
191
+ * a concurrent qemu_co_mutex_lock will fail the cmpxchg, and
192
+ * qemu_co_mutex_unlock will go back to case (1).
193
+ *
194
+ * The following functions manage this queue.
195
+ */
196
+typedef struct CoWaitRecord {
197
+ Coroutine *co;
198
+ QSLIST_ENTRY(CoWaitRecord) next;
199
+} CoWaitRecord;
200
+
201
+static void push_waiter(CoMutex *mutex, CoWaitRecord *w)
202
+{
203
+ w->co = qemu_coroutine_self();
204
+ QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next);
205
+}
206
+
207
+static void move_waiters(CoMutex *mutex)
208
+{
209
+ QSLIST_HEAD(, CoWaitRecord) reversed;
210
+ QSLIST_MOVE_ATOMIC(&reversed, &mutex->from_push);
211
+ while (!QSLIST_EMPTY(&reversed)) {
212
+ CoWaitRecord *w = QSLIST_FIRST(&reversed);
213
+ QSLIST_REMOVE_HEAD(&reversed, next);
214
+ QSLIST_INSERT_HEAD(&mutex->to_pop, w, next);
215
+ }
216
+}
217
+
218
+static CoWaitRecord *pop_waiter(CoMutex *mutex)
219
+{
220
+ CoWaitRecord *w;
221
+
222
+ if (QSLIST_EMPTY(&mutex->to_pop)) {
223
+ move_waiters(mutex);
224
+ if (QSLIST_EMPTY(&mutex->to_pop)) {
225
+ return NULL;
226
+ }
227
+ }
228
+ w = QSLIST_FIRST(&mutex->to_pop);
229
+ QSLIST_REMOVE_HEAD(&mutex->to_pop, next);
230
+ return w;
231
+}
232
+
233
+static bool has_waiters(CoMutex *mutex)
234
+{
235
+ return QSLIST_EMPTY(&mutex->to_pop) || QSLIST_EMPTY(&mutex->from_push);
236
+}
237
+
238
void qemu_co_mutex_init(CoMutex *mutex)
239
{
48
{
240
memset(mutex, 0, sizeof(*mutex));
49
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_raw = {
241
- qemu_co_queue_init(&mutex->queue);
50
.bdrv_co_pwritev = &raw_co_pwritev,
242
}
51
.bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
243
52
.bdrv_co_pdiscard = &raw_co_pdiscard,
244
-void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
53
+ .bdrv_co_zone_report = &raw_co_zone_report,
245
+static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
54
+ .bdrv_co_zone_mgmt = &raw_co_zone_mgmt,
246
{
55
.bdrv_co_block_status = &raw_co_block_status,
247
Coroutine *self = qemu_coroutine_self();
56
.bdrv_co_copy_range_from = &raw_co_copy_range_from,
248
+ CoWaitRecord w;
57
.bdrv_co_copy_range_to = &raw_co_copy_range_to,
249
+ unsigned old_handoff;
250
251
trace_qemu_co_mutex_lock_entry(mutex, self);
252
+ w.co = self;
253
+ push_waiter(mutex, &w);
254
255
- while (mutex->locked) {
256
- qemu_co_queue_wait(&mutex->queue);
257
+ /* This is the "Responsibility Hand-Off" protocol; a lock() picks from
258
+ * a concurrent unlock() the responsibility of waking somebody up.
259
+ */
260
+ old_handoff = atomic_mb_read(&mutex->handoff);
261
+ if (old_handoff &&
262
+ has_waiters(mutex) &&
263
+ atomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) {
264
+ /* There can be no concurrent pops, because there can be only
265
+ * one active handoff at a time.
266
+ */
267
+ CoWaitRecord *to_wake = pop_waiter(mutex);
268
+ Coroutine *co = to_wake->co;
269
+ if (co == self) {
270
+ /* We got the lock ourselves! */
271
+ assert(to_wake == &w);
272
+ return;
273
+ }
274
+
275
+ aio_co_wake(co);
276
}
277
278
- mutex->locked = true;
279
- mutex->holder = self;
280
- self->locks_held++;
281
-
282
+ qemu_coroutine_yield();
283
trace_qemu_co_mutex_lock_return(mutex, self);
284
}
285
286
+void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
287
+{
288
+ Coroutine *self = qemu_coroutine_self();
289
+
290
+ if (atomic_fetch_inc(&mutex->locked) == 0) {
291
+ /* Uncontended. */
292
+ trace_qemu_co_mutex_lock_uncontended(mutex, self);
293
+ } else {
294
+ qemu_co_mutex_lock_slowpath(mutex);
295
+ }
296
+ mutex->holder = self;
297
+ self->locks_held++;
298
+}
299
+
300
void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
301
{
302
Coroutine *self = qemu_coroutine_self();
303
304
trace_qemu_co_mutex_unlock_entry(mutex, self);
305
306
- assert(mutex->locked == true);
307
+ assert(mutex->locked);
308
assert(mutex->holder == self);
309
assert(qemu_in_coroutine());
310
311
- mutex->locked = false;
312
mutex->holder = NULL;
313
self->locks_held--;
314
- qemu_co_queue_next(&mutex->queue);
315
+ if (atomic_fetch_dec(&mutex->locked) == 1) {
316
+ /* No waiting qemu_co_mutex_lock(). Pfew, that was easy! */
317
+ return;
318
+ }
319
+
320
+ for (;;) {
321
+ CoWaitRecord *to_wake = pop_waiter(mutex);
322
+ unsigned our_handoff;
323
+
324
+ if (to_wake) {
325
+ Coroutine *co = to_wake->co;
326
+ aio_co_wake(co);
327
+ break;
328
+ }
329
+
330
+ /* Some concurrent lock() is in progress (we know this because
331
+ * mutex->locked was >1) but it hasn't yet put itself on the wait
332
+ * queue. Pick a sequence number for the handoff protocol (not 0).
333
+ */
334
+ if (++mutex->sequence == 0) {
335
+ mutex->sequence = 1;
336
+ }
337
+
338
+ our_handoff = mutex->sequence;
339
+ atomic_mb_set(&mutex->handoff, our_handoff);
340
+ if (!has_waiters(mutex)) {
341
+ /* The concurrent lock has not added itself yet, so it
342
+ * will be able to pick our handoff.
343
+ */
344
+ break;
345
+ }
346
+
347
+ /* Try to do the handoff protocol ourselves; if somebody else has
348
+ * already taken it, however, we're done and they're responsible.
349
+ */
350
+ if (atomic_cmpxchg(&mutex->handoff, our_handoff, 0) != our_handoff) {
351
+ break;
352
+ }
353
+ }
354
355
trace_qemu_co_mutex_unlock_return(mutex, self);
356
}
357
diff --git a/util/trace-events b/util/trace-events
358
index XXXXXXX..XXXXXXX 100644
359
--- a/util/trace-events
360
+++ b/util/trace-events
361
@@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p"
362
363
# util/qemu-coroutine-lock.c
364
qemu_co_queue_run_restart(void *co) "co %p"
365
+qemu_co_mutex_lock_uncontended(void *mutex, void *self) "mutex %p self %p"
366
qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
367
qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
368
qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
369
--
2.9.3

--
2.40.1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
Pull the increment/decrement pair out of aio_bh_poll and into the
3
Putting zoned/non-zoned BlockDrivers on top of each other is not
4
callers.
4
allowed.
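
In other words, the calling convention changes as follows (sketch; the hunks
below apply it to aio_dispatch and aio_poll):

    qemu_lockcnt_inc(&ctx->list_lock);
    aio_bh_poll(ctx);                 /* no longer inc/decs by itself */
    aio_dispatch_handlers(ctx);
    qemu_lockcnt_dec(&ctx->list_lock);
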
5
5
6
Signed-off-by: Sam Li <faithilikerun@gmail.com>
6
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
7
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
7
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
8
Reviewed-by: Hannes Reinecke <hare@suse.de>
8
Reviewed-by: Fam Zheng <famz@redhat.com>
9
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
9
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
10
Acked-by: Kevin Wolf <kwolf@redhat.com>
10
Message-id: 20170213135235.12274-18-pbonzini@redhat.com
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Message-id: 20230508045533.175575-6-faithilikerun@gmail.com
13
Message-id: 20230324090605.28361-6-faithilikerun@gmail.com
14
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
15
<philmd@linaro.org> and clarify that the check is about zoned
16
BlockDrivers.
17
--Stefan]
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
18
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
---
19
---
13
util/aio-posix.c | 8 +++-----
20
include/block/block_int-common.h | 5 +++++
14
util/aio-win32.c | 8 ++++----
21
block.c | 19 +++++++++++++++++++
15
util/async.c | 12 ++++++------
22
block/file-posix.c | 12 ++++++++++++
16
3 files changed, 13 insertions(+), 15 deletions(-)
23
block/raw-format.c | 1 +
24
4 files changed, 37 insertions(+)
17
25
18
diff --git a/util/aio-posix.c b/util/aio-posix.c
26
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
19
index XXXXXXX..XXXXXXX 100644
27
index XXXXXXX..XXXXXXX 100644
20
--- a/util/aio-posix.c
28
--- a/include/block/block_int-common.h
21
+++ b/util/aio-posix.c
29
+++ b/include/block/block_int-common.h
22
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
30
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
23
31
*/
24
void aio_dispatch(AioContext *ctx)
32
bool is_format;
25
{
33
26
+ qemu_lockcnt_inc(&ctx->list_lock);
34
+ /*
27
aio_bh_poll(ctx);
35
+ * Set to true if the BlockDriver supports zoned children.
28
-
36
+ */
29
- qemu_lockcnt_inc(&ctx->list_lock);
37
+ bool supports_zoned_children;
30
aio_dispatch_handlers(ctx);
38
+
31
qemu_lockcnt_dec(&ctx->list_lock);
39
/*
32
40
* Drivers not implementing bdrv_parse_filename nor bdrv_open should have
33
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
41
* this field set to true, except ones that are defined only by their
42
diff --git a/block.c b/block.c
43
index XXXXXXX..XXXXXXX 100644
44
--- a/block.c
45
+++ b/block.c
46
@@ -XXX,XX +XXX,XX @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
47
return;
34
}
48
}
35
49
36
npfd = 0;
50
+ /*
37
- qemu_lockcnt_dec(&ctx->list_lock);
51
+ * Non-zoned block drivers do not follow zoned storage constraints
38
52
+ * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
39
progress |= aio_bh_poll(ctx);
53
+ * drivers in a graph.
40
54
+ */
41
if (ret > 0) {
55
+ if (!parent_bs->drv->supports_zoned_children &&
42
- qemu_lockcnt_inc(&ctx->list_lock);
56
+ child_bs->bl.zoned == BLK_Z_HM) {
43
progress |= aio_dispatch_handlers(ctx);
57
+ /*
44
- qemu_lockcnt_dec(&ctx->list_lock);
58
+ * The host-aware model allows zoned storage constraints and random
45
}
59
+ * write. Allow mixing host-aware and non-zoned drivers. Using
46
60
+ * host-aware device as a regular device.
47
+ qemu_lockcnt_dec(&ctx->list_lock);
61
+ */
62
+ error_setg(errp, "Cannot add a %s child to a %s parent",
63
+ child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
64
+ parent_bs->drv->supports_zoned_children ?
65
+ "support zoned children" : "not support zoned children");
66
+ return;
67
+ }
48
+
68
+
49
progress |= timerlistgroup_run_timers(&ctx->tlg);
69
if (!QLIST_EMPTY(&child_bs->parents)) {
50
70
error_setg(errp, "The node %s already has a parent",
51
return progress;
71
child_bs->node_name);
52
diff --git a/util/aio-win32.c b/util/aio-win32.c
72
diff --git a/block/file-posix.c b/block/file-posix.c
53
index XXXXXXX..XXXXXXX 100644
73
index XXXXXXX..XXXXXXX 100644
54
--- a/util/aio-win32.c
74
--- a/block/file-posix.c
55
+++ b/util/aio-win32.c
75
+++ b/block/file-posix.c
56
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
76
@@ -XXX,XX +XXX,XX @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
57
bool progress = false;
77
goto fail;
58
AioHandler *tmp;
59
60
- qemu_lockcnt_inc(&ctx->list_lock);
61
-
62
/*
63
* We have to walk very carefully in case aio_set_fd_handler is
64
* called while we're walking.
65
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
66
}
78
}
67
}
79
}
68
80
+#ifdef CONFIG_BLKZONED
69
- qemu_lockcnt_dec(&ctx->list_lock);
81
+ /*
70
return progress;
82
+ * The kernel page cache does not reliably work for writes to SWR zones
71
}
83
+ * of zoned block device because it can not guarantee the order of writes.
72
84
+ */
73
void aio_dispatch(AioContext *ctx)
85
+ if ((bs->bl.zoned != BLK_Z_NONE) &&
74
{
86
+ (!(s->open_flags & O_DIRECT))) {
75
+ qemu_lockcnt_inc(&ctx->list_lock);
87
+ error_setg(errp, "The driver supports zoned devices, and it requires "
76
aio_bh_poll(ctx);
88
+ "cache.direct=on, which was not specified.");
77
aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
89
+ return -EINVAL; /* No host kernel page cache */
78
+ qemu_lockcnt_dec(&ctx->list_lock);
90
+ }
79
timerlistgroup_run_timers(&ctx->tlg);
91
+#endif
80
}
92
81
93
if (S_ISBLK(st.st_mode)) {
82
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
94
#ifdef __linux__
83
}
95
diff --git a/block/raw-format.c b/block/raw-format.c
84
}
85
86
- qemu_lockcnt_dec(&ctx->list_lock);
87
first = true;
88
89
/* ctx->notifier is always registered. */
90
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
91
progress |= aio_dispatch_handlers(ctx, event);
92
} while (count > 0);
93
94
+ qemu_lockcnt_dec(&ctx->list_lock);
95
+
96
progress |= timerlistgroup_run_timers(&ctx->tlg);
97
return progress;
98
}
99
diff --git a/util/async.c b/util/async.c
100
index XXXXXXX..XXXXXXX 100644
96
index XXXXXXX..XXXXXXX 100644
101
--- a/util/async.c
97
--- a/block/raw-format.c
102
+++ b/util/async.c
98
+++ b/block/raw-format.c
103
@@ -XXX,XX +XXX,XX @@ void aio_bh_call(QEMUBH *bh)
99
@@ -XXX,XX +XXX,XX @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild *c,
104
bh->cb(bh->opaque);
100
BlockDriver bdrv_raw = {
105
}
101
.format_name = "raw",
106
102
.instance_size = sizeof(BDRVRawState),
107
-/* Multiple occurrences of aio_bh_poll cannot be called concurrently */
103
+ .supports_zoned_children = true,
108
+/* Multiple occurrences of aio_bh_poll cannot be called concurrently.
104
.bdrv_probe = &raw_probe,
109
+ * The count in ctx->list_lock is incremented before the call, and is
105
.bdrv_reopen_prepare = &raw_reopen_prepare,
110
+ * not affected by the call.
106
.bdrv_reopen_commit = &raw_reopen_commit,
111
+ */
112
int aio_bh_poll(AioContext *ctx)
113
{
114
QEMUBH *bh, **bhp, *next;
115
int ret;
116
bool deleted = false;
117
118
- qemu_lockcnt_inc(&ctx->list_lock);
119
-
120
ret = 0;
121
for (bh = atomic_rcu_read(&ctx->first_bh); bh; bh = next) {
122
next = atomic_rcu_read(&bh->next);
123
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
124
125
/* remove deleted bhs */
126
if (!deleted) {
127
- qemu_lockcnt_dec(&ctx->list_lock);
128
return ret;
129
}
130
131
- if (qemu_lockcnt_dec_and_lock(&ctx->list_lock)) {
132
+ if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
133
bhp = &ctx->first_bh;
134
while (*bhp) {
135
bh = *bhp;
136
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
137
bhp = &bh->next;
138
}
139
}
140
- qemu_lockcnt_unlock(&ctx->list_lock);
141
+ qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
142
}
143
return ret;
144
}
145
--
2.9.3

--
2.40.1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
aio_co_wake provides the infrastructure to start a coroutine on a "home"
3
The new block layer APIs of zoned block devices can be tested by:
4
AioContext. It will be used by CoMutex and CoQueue, so that coroutines
4
$ tests/qemu-iotests/check zoned
5
don't jump from one context to another when they go to sleep on a
5
Run each zone operation on a newly created null_blk device
6
mutex or waitqueue. However, it can also be used as a more efficient
6
and see whether it outputs the same zone information.
7
alternative to one-shot bottom halves, and saves the effort of tracking
8
which AioContext a coroutine is running on.
9
7
10
aio_co_schedule is the part of aio_co_wake that starts a coroutine
8
Signed-off-by: Sam Li <faithilikerun@gmail.com>
11
on a remote AioContext, but it is also useful to implement e.g.
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
12
bdrv_set_aio_context callbacks.
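
For example (hypothetical completion callback, illustrating the intended
use; MyRequest and its fields are made up for this sketch):

    typedef struct {
        Coroutine *co;   /* coroutine waiting for this request */
        int ret;
    } MyRequest;

    static void request_done(void *opaque, int ret)
    {
        MyRequest *req = opaque;
        req->ret = ret;
        aio_co_wake(req->co);  /* resume on the coroutine's home context */
    }
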
10
Acked-by: Kevin Wolf <kwolf@redhat.com>
13
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
14
The implementation of aio_co_schedule is based on a lock-free
12
Message-id: 20230508045533.175575-7-faithilikerun@gmail.com
15
multiple-producer, single-consumer queue. The multiple producers use
13
Message-id: 20230324090605.28361-7-faithilikerun@gmail.com
16
cmpxchg to add to a LIFO stack. The consumer (a per-AioContext bottom
14
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
17
half) grabs all items added so far, inverts the list to make it FIFO,
15
<philmd@linaro.org>.
18
and goes through it one item at a time until it's empty. The data
16
--Stefan]
19
structure was inspired by OSv, which uses it in the very code we'll
20
"port" to QEMU for the thread-safe CoMutex.
21
22
Most of the new code is really tests.
23
24
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
25
Reviewed-by: Fam Zheng <famz@redhat.com>
26
Message-id: 20170213135235.12274-3-pbonzini@redhat.com
27
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
17
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
28
---
18
---
29
tests/Makefile.include | 8 +-
19
tests/qemu-iotests/tests/zoned | 89 ++++++++++++++++++++++++++++++
30
include/block/aio.h | 32 +++++++
20
tests/qemu-iotests/tests/zoned.out | 53 ++++++++++++++++++
31
include/qemu/coroutine_int.h | 11 ++-
21
2 files changed, 142 insertions(+)
32
tests/iothread.h | 25 +++++
22
create mode 100755 tests/qemu-iotests/tests/zoned
33
tests/iothread.c | 91 ++++++++++++++++++
23
create mode 100644 tests/qemu-iotests/tests/zoned.out
34
tests/test-aio-multithread.c | 213 +++++++++++++++++++++++++++++++++++++++++++
35
util/async.c | 65 +++++++++++++
36
util/qemu-coroutine.c | 8 ++
37
util/trace-events | 4 +
38
9 files changed, 453 insertions(+), 4 deletions(-)
39
create mode 100644 tests/iothread.h
40
create mode 100644 tests/iothread.c
41
create mode 100644 tests/test-aio-multithread.c
42
24
43
diff --git a/tests/Makefile.include b/tests/Makefile.include
25
diff --git a/tests/qemu-iotests/tests/zoned b/tests/qemu-iotests/tests/zoned
44
index XXXXXXX..XXXXXXX 100644
26
new file mode 100755
45
--- a/tests/Makefile.include
27
index XXXXXXX..XXXXXXX
46
+++ b/tests/Makefile.include
28
--- /dev/null
47
@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-aio$(EXESUF)
29
+++ b/tests/qemu-iotests/tests/zoned
48
gcov-files-test-aio-y = util/async.c util/qemu-timer.o
30
@@ -XXX,XX +XXX,XX @@
49
gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c
31
+#!/usr/bin/env bash
50
gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c
32
+#
51
+check-unit-y += tests/test-aio-multithread$(EXESUF)
33
+# Test zone management operations.
52
+gcov-files-test-aio-multithread-y = $(gcov-files-test-aio-y)
34
+#
53
+gcov-files-test-aio-multithread-y += util/qemu-coroutine.c tests/iothread.c
54
check-unit-y += tests/test-throttle$(EXESUF)
55
-gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
56
-gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
57
check-unit-y += tests/test-thread-pool$(EXESUF)
58
gcov-files-test-thread-pool-y = thread-pool.c
59
gcov-files-test-hbitmap-y = util/hbitmap.c
60
@@ -XXX,XX +XXX,XX @@ test-qapi-obj-y = tests/test-qapi-visit.o tests/test-qapi-types.o \
61
    $(test-qom-obj-y)
62
test-crypto-obj-y = $(crypto-obj-y) $(test-qom-obj-y)
63
test-io-obj-y = $(io-obj-y) $(test-crypto-obj-y)
64
-test-block-obj-y = $(block-obj-y) $(test-io-obj-y)
65
+test-block-obj-y = $(block-obj-y) $(test-io-obj-y) tests/iothread.o
66
67
tests/check-qint$(EXESUF): tests/check-qint.o $(test-util-obj-y)
68
tests/check-qstring$(EXESUF): tests/check-qstring.o $(test-util-obj-y)
69
@@ -XXX,XX +XXX,XX @@ tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
70
tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y)
71
tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
72
tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
73
+tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
74
tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
75
tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
76
tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
77
diff --git a/include/block/aio.h b/include/block/aio.h
78
index XXXXXXX..XXXXXXX 100644
79
--- a/include/block/aio.h
80
+++ b/include/block/aio.h
81
@@ -XXX,XX +XXX,XX @@ typedef void QEMUBHFunc(void *opaque);
82
typedef bool AioPollFn(void *opaque);
83
typedef void IOHandler(void *opaque);
84
85
+struct Coroutine;
86
struct ThreadPool;
87
struct LinuxAioState;
88
89
@@ -XXX,XX +XXX,XX @@ struct AioContext {
90
bool notified;
91
EventNotifier notifier;
92
93
+ QSLIST_HEAD(, Coroutine) scheduled_coroutines;
94
+ QEMUBH *co_schedule_bh;
95
+
35
+
96
/* Thread pool for performing work and receiving completion callbacks.
36
+seq="$(basename $0)"
97
* Has its own locking.
37
+echo "QA output created by $seq"
98
*/
38
+status=1 # failure is the default!
99
@@ -XXX,XX +XXX,XX @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
100
}
101
102
/**
103
+ * aio_co_schedule:
104
+ * @ctx: the aio context
105
+ * @co: the coroutine
106
+ *
107
+ * Start a coroutine on a remote AioContext.
108
+ *
109
+ * The coroutine must not be entered by anyone else while aio_co_schedule()
110
+ * is active. In addition the coroutine must have yielded unless ctx
111
+ * is the context in which the coroutine is running (i.e. the value of
112
+ * qemu_get_current_aio_context() from the coroutine itself).
113
+ */
114
+void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
115
+
39
+
116
+/**
40
+_cleanup()
117
+ * aio_co_wake:
41
+{
118
+ * @co: the coroutine
42
+ _cleanup_test_img
119
+ *
43
+ sudo -n rmmod null_blk
120
+ * Restart a coroutine on the AioContext where it was running last, thus
44
+}
121
+ * preventing coroutines from jumping from one context to another when they
45
+trap "_cleanup; exit \$status" 0 1 2 3 15
122
+ * go to sleep.
123
+ *
124
+ * aio_co_wake may be executed either in coroutine or non-coroutine
125
+ * context. The coroutine must not be entered by anyone else while
126
+ * aio_co_wake() is active.
127
+ */
128
+void aio_co_wake(struct Coroutine *co);
129
+
46
+
130
+/**
47
+# get standard environment, filters and checks
131
* Return the AioContext whose event loop runs in the current thread.
48
+. ../common.rc
132
*
49
+. ../common.filter
133
* If called from an IOThread this will be the IOThread's AioContext. If
50
+. ../common.qemu
134
diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
135
index XXXXXXX..XXXXXXX 100644
136
--- a/include/qemu/coroutine_int.h
137
+++ b/include/qemu/coroutine_int.h
138
@@ -XXX,XX +XXX,XX @@ struct Coroutine {
139
CoroutineEntry *entry;
140
void *entry_arg;
141
Coroutine *caller;
142
+
51
+
143
+ /* Only used when the coroutine has terminated. */
52
+# This test only runs on Linux hosts with raw image files.
144
QSLIST_ENTRY(Coroutine) pool_next;
53
+_supported_fmt raw
54
+_supported_proto file
55
+_supported_os Linux
145
+
56
+
146
size_t locks_held;
57
+sudo -n true || \
147
58
+ _notrun 'Password-less sudo required'
148
- /* Coroutines that should be woken up when we yield or terminate */
149
+ /* Coroutines that should be woken up when we yield or terminate.
150
+ * Only used when the coroutine is running.
151
+ */
152
QSIMPLEQ_HEAD(, Coroutine) co_queue_wakeup;
153
+
59
+
154
+ /* Only used when the coroutine has yielded. */
60
+IMG="--image-opts -n driver=host_device,filename=/dev/nullb0"
155
+ AioContext *ctx;
61
+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
156
QSIMPLEQ_ENTRY(Coroutine) co_queue_next;
62
+
157
+ QSLIST_ENTRY(Coroutine) co_scheduled_next;
63
+echo "Testing a null_blk device:"
158
};
64
+echo "case 1: if the operations work"
159
65
+sudo -n modprobe null_blk nr_devices=1 zoned=1
160
Coroutine *qemu_coroutine_new(void);
66
+sudo -n chmod 0666 /dev/nullb0
161
diff --git a/tests/iothread.h b/tests/iothread.h
67
+
68
+echo "(1) report the first zone:"
69
+$QEMU_IO $IMG -c "zrp 0 1"
70
+echo
71
+echo "report the first 10 zones"
72
+$QEMU_IO $IMG -c "zrp 0 10"
73
+echo
74
+echo "report the last zone:"
75
+$QEMU_IO $IMG -c "zrp 0x3e70000000 2" # 0x3e70000000 / 512 = 0x1f380000
76
+echo
77
+echo
78
+echo "(2) opening the first zone"
79
+$QEMU_IO $IMG -c "zo 0 268435456" # 268435456 / 512 = 524288
80
+echo "report after:"
81
+$QEMU_IO $IMG -c "zrp 0 1"
82
+echo
83
+echo "opening the second zone"
84
+$QEMU_IO $IMG -c "zo 268435456 268435456" #
85
+echo "report after:"
86
+$QEMU_IO $IMG -c "zrp 268435456 1"
87
+echo
88
+echo "opening the last zone"
89
+$QEMU_IO $IMG -c "zo 0x3e70000000 268435456"
90
+echo "report after:"
91
+$QEMU_IO $IMG -c "zrp 0x3e70000000 2"
92
+echo
93
+echo
94
+echo "(3) closing the first zone"
95
+$QEMU_IO $IMG -c "zc 0 268435456"
96
+echo "report after:"
97
+$QEMU_IO $IMG -c "zrp 0 1"
98
+echo
99
+echo "closing the last zone"
100
+$QEMU_IO $IMG -c "zc 0x3e70000000 268435456"
101
+echo "report after:"
102
+$QEMU_IO $IMG -c "zrp 0x3e70000000 2"
103
+echo
104
+echo
105
+echo "(4) finishing the second zone"
106
+$QEMU_IO $IMG -c "zf 268435456 268435456"
107
+echo "After finishing a zone:"
108
+$QEMU_IO $IMG -c "zrp 268435456 1"
109
+echo
110
+echo
111
+echo "(5) resetting the second zone"
112
+$QEMU_IO $IMG -c "zrs 268435456 268435456"
113
+echo "After resetting a zone:"
114
+$QEMU_IO $IMG -c "zrp 268435456 1"
115
+
116
+# success, all done
117
+echo "*** done"
118
+rm -f $seq.full
119
+status=0
120
diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-iotests/tests/zoned.out
162
new file mode 100644
121
new file mode 100644
163
index XXXXXXX..XXXXXXX
122
index XXXXXXX..XXXXXXX
164
--- /dev/null
123
--- /dev/null
165
+++ b/tests/iothread.h
124
+++ b/tests/qemu-iotests/tests/zoned.out
166
@@ -XXX,XX +XXX,XX @@
125
@@ -XXX,XX +XXX,XX @@
167
+/*
126
+QA output created by zoned
168
+ * Event loop thread implementation for unit tests
127
+Testing a null_blk device:
169
+ *
128
+case 1: if the operations work
170
+ * Copyright Red Hat Inc., 2013, 2016
129
+(1) report the first zone:
171
+ *
130
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
172
+ * Authors:
173
+ * Stefan Hajnoczi <stefanha@redhat.com>
174
+ * Paolo Bonzini <pbonzini@redhat.com>
175
+ *
176
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
177
+ * See the COPYING file in the top-level directory.
178
+ */
179
+#ifndef TEST_IOTHREAD_H
180
+#define TEST_IOTHREAD_H
181
+
131
+
182
+#include "block/aio.h"
132
+report the first 10 zones
183
+#include "qemu/thread.h"
133
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
134
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
135
+start: 0x100000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:1, [type: 2]
136
+start: 0x180000, len 0x80000, cap 0x80000, wptr 0x180000, zcond:1, [type: 2]
137
+start: 0x200000, len 0x80000, cap 0x80000, wptr 0x200000, zcond:1, [type: 2]
138
+start: 0x280000, len 0x80000, cap 0x80000, wptr 0x280000, zcond:1, [type: 2]
139
+start: 0x300000, len 0x80000, cap 0x80000, wptr 0x300000, zcond:1, [type: 2]
140
+start: 0x380000, len 0x80000, cap 0x80000, wptr 0x380000, zcond:1, [type: 2]
141
+start: 0x400000, len 0x80000, cap 0x80000, wptr 0x400000, zcond:1, [type: 2]
142
+start: 0x480000, len 0x80000, cap 0x80000, wptr 0x480000, zcond:1, [type: 2]
184
+
143
+
185
+typedef struct IOThread IOThread;
144
+report the last zone:
186
+
145
+start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:1, [type: 2]
187
+IOThread *iothread_new(void);
188
+void iothread_join(IOThread *iothread);
189
+AioContext *iothread_get_aio_context(IOThread *iothread);
190
+
191
+#endif
192
diff --git a/tests/iothread.c b/tests/iothread.c
193
new file mode 100644
194
index XXXXXXX..XXXXXXX
195
--- /dev/null
196
+++ b/tests/iothread.c
197
@@ -XXX,XX +XXX,XX @@
198
+/*
199
+ * Event loop thread implementation for unit tests
200
+ *
201
+ * Copyright Red Hat Inc., 2013, 2016
+ *
+ * Authors:
+ *  Stefan Hajnoczi <stefanha@redhat.com>
+ *  Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "block/aio.h"
+#include "qemu/main-loop.h"
+#include "qemu/rcu.h"
+#include "iothread.h"
+
+struct IOThread {
+    AioContext *ctx;
+
+    QemuThread thread;
+    QemuMutex init_done_lock;
+    QemuCond init_done_cond;    /* is thread initialization done? */
+    bool stopping;
+};
+
+static __thread IOThread *my_iothread;
+
+AioContext *qemu_get_current_aio_context(void)
+{
+    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
+}
+
+static void *iothread_run(void *opaque)
+{
+    IOThread *iothread = opaque;
+
+    rcu_register_thread();
+
+    my_iothread = iothread;
+    qemu_mutex_lock(&iothread->init_done_lock);
+    iothread->ctx = aio_context_new(&error_abort);
+    qemu_cond_signal(&iothread->init_done_cond);
+    qemu_mutex_unlock(&iothread->init_done_lock);
+
+    while (!atomic_read(&iothread->stopping)) {
+        aio_poll(iothread->ctx, true);
+    }
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+void iothread_join(IOThread *iothread)
+{
+    iothread->stopping = true;
+    aio_notify(iothread->ctx);
+    qemu_thread_join(&iothread->thread);
+    qemu_cond_destroy(&iothread->init_done_cond);
+    qemu_mutex_destroy(&iothread->init_done_lock);
+    aio_context_unref(iothread->ctx);
+    g_free(iothread);
+}
+
+IOThread *iothread_new(void)
+{
+    IOThread *iothread = g_new0(IOThread, 1);
+
+    qemu_mutex_init(&iothread->init_done_lock);
+    qemu_cond_init(&iothread->init_done_cond);
+    qemu_thread_create(&iothread->thread, NULL, iothread_run,
+                       iothread, QEMU_THREAD_JOINABLE);
+
+    /* Wait for initialization to complete */
+    qemu_mutex_lock(&iothread->init_done_lock);
+    while (iothread->ctx == NULL) {
+        qemu_cond_wait(&iothread->init_done_cond,
+                       &iothread->init_done_lock);
+    }
+    qemu_mutex_unlock(&iothread->init_done_lock);
+    return iothread;
+}
+
+AioContext *iothread_get_aio_context(IOThread *iothread)
+{
+    return iothread->ctx;
+}
diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * AioContext multithreading tests
+ *
+ * Copyright Red Hat, Inc. 2016
+ *
+ * Authors:
+ *  Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include <glib.h>
+#include "block/aio.h"
+#include "qapi/error.h"
+#include "qemu/coroutine.h"
+#include "qemu/thread.h"
+#include "qemu/error-report.h"
+#include "iothread.h"
+
+/* AioContext management */
+
+#define NUM_CONTEXTS 5
+
+static IOThread *threads[NUM_CONTEXTS];
+static AioContext *ctx[NUM_CONTEXTS];
+static __thread int id = -1;
+
+static QemuEvent done_event;
+
+/* Run a function synchronously on a remote iothread. */
+
+typedef struct CtxRunData {
+    QEMUBHFunc *cb;
+    void *arg;
+} CtxRunData;
+
+static void ctx_run_bh_cb(void *opaque)
+{
+    CtxRunData *data = opaque;
+
+    data->cb(data->arg);
+    qemu_event_set(&done_event);
+}
+
+static void ctx_run(int i, QEMUBHFunc *cb, void *opaque)
+{
+    CtxRunData data = {
+        .cb = cb,
+        .arg = opaque
+    };
+
+    qemu_event_reset(&done_event);
+    aio_bh_schedule_oneshot(ctx[i], ctx_run_bh_cb, &data);
+    qemu_event_wait(&done_event);
+}
+
+/* Starting the iothreads. */
+
+static void set_id_cb(void *opaque)
+{
+    int *i = opaque;
+
+    id = *i;
+}
+
+static void create_aio_contexts(void)
+{
+    int i;
+
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        threads[i] = iothread_new();
+        ctx[i] = iothread_get_aio_context(threads[i]);
+    }
+
+    qemu_event_init(&done_event, false);
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        ctx_run(i, set_id_cb, &i);
+    }
+}
+
+/* Stopping the iothreads. */
+
+static void join_aio_contexts(void)
+{
+    int i;
+
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        aio_context_ref(ctx[i]);
+    }
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        iothread_join(threads[i]);
+    }
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        aio_context_unref(ctx[i]);
+    }
+    qemu_event_destroy(&done_event);
+}
+
+/* Basic test for the stuff above. */
+
+static void test_lifecycle(void)
+{
+    create_aio_contexts();
+    join_aio_contexts();
+}
+
+/* aio_co_schedule test. */
+
+static Coroutine *to_schedule[NUM_CONTEXTS];
+
+static bool now_stopping;
+
+static int count_retry;
+static int count_here;
+static int count_other;
+
+static bool schedule_next(int n)
+{
+    Coroutine *co;
+
+    co = atomic_xchg(&to_schedule[n], NULL);
+    if (!co) {
+        atomic_inc(&count_retry);
+        return false;
+    }
+
+    if (n == id) {
+        atomic_inc(&count_here);
+    } else {
+        atomic_inc(&count_other);
+    }
+
+    aio_co_schedule(ctx[n], co);
+    return true;
+}
+
+static void finish_cb(void *opaque)
+{
+    schedule_next(id);
+}
+
+static coroutine_fn void test_multi_co_schedule_entry(void *opaque)
+{
+    g_assert(to_schedule[id] == NULL);
+    atomic_mb_set(&to_schedule[id], qemu_coroutine_self());
+
+    while (!atomic_mb_read(&now_stopping)) {
+        int n;
+
+        n = g_test_rand_int_range(0, NUM_CONTEXTS);
+        schedule_next(n);
+        qemu_coroutine_yield();
+
+        g_assert(to_schedule[id] == NULL);
+        atomic_mb_set(&to_schedule[id], qemu_coroutine_self());
+    }
+}
+
+static void test_multi_co_schedule(int seconds)
+{
+    int i;
+
+    count_here = count_other = count_retry = 0;
+    now_stopping = false;
+
+    create_aio_contexts();
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        Coroutine *co1 = qemu_coroutine_create(test_multi_co_schedule_entry, NULL);
+        aio_co_schedule(ctx[i], co1);
+    }
+
+    g_usleep(seconds * 1000000);
+
+    atomic_mb_set(&now_stopping, true);
+    for (i = 0; i < NUM_CONTEXTS; i++) {
+        ctx_run(i, finish_cb, NULL);
+        to_schedule[i] = NULL;
+    }
+
+    join_aio_contexts();
+    g_test_message("scheduled %d, queued %d, retry %d, total %d\n",
+                   count_other, count_here, count_retry,
+                   count_here + count_other + count_retry);
+}
+
+static void test_multi_co_schedule_1(void)
+{
+    test_multi_co_schedule(1);
+}
+
+static void test_multi_co_schedule_10(void)
+{
+    test_multi_co_schedule(10);
+}
+
+/* End of tests. */
+
+int main(int argc, char **argv)
+{
+    init_clocks();
+
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/aio/multi/lifecycle", test_lifecycle);
+    if (g_test_quick()) {
+        g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
+    } else {
+        g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
+    }
+    return g_test_run();
+}
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/main-loop.h"
 #include "qemu/atomic.h"
 #include "block/raw-aio.h"
+#include "qemu/coroutine_int.h"
+#include "trace.h"

 /***********************************************************/
 /* bottom halves (can be seen as timers which expire ASAP) */
@@ -XXX,XX +XXX,XX @@ aio_ctx_finalize(GSource *source)
 }
 #endif

+    assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
+    qemu_bh_delete(ctx->co_schedule_bh);
+
     qemu_lockcnt_lock(&ctx->list_lock);
     assert(!qemu_lockcnt_count(&ctx->list_lock));
     while (ctx->first_bh) {
@@ -XXX,XX +XXX,XX @@ static bool event_notifier_poll(void *opaque)
     return atomic_read(&ctx->notified);
 }

+static void co_schedule_bh_cb(void *opaque)
+{
+    AioContext *ctx = opaque;
+    QSLIST_HEAD(, Coroutine) straight, reversed;
+
+    QSLIST_MOVE_ATOMIC(&reversed, &ctx->scheduled_coroutines);
+    QSLIST_INIT(&straight);
+
+    while (!QSLIST_EMPTY(&reversed)) {
+        Coroutine *co = QSLIST_FIRST(&reversed);
+        QSLIST_REMOVE_HEAD(&reversed, co_scheduled_next);
+        QSLIST_INSERT_HEAD(&straight, co, co_scheduled_next);
+    }
+
+    while (!QSLIST_EMPTY(&straight)) {
+        Coroutine *co = QSLIST_FIRST(&straight);
+        QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
+        trace_aio_co_schedule_bh_cb(ctx, co);
+        qemu_coroutine_enter(co);
+    }
+}
+
 AioContext *aio_context_new(Error **errp)
 {
     int ret;
@@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp)
     }
     g_source_set_can_recurse(&ctx->source, true);
     qemu_lockcnt_init(&ctx->list_lock);
+
+    ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
+    QSLIST_INIT(&ctx->scheduled_coroutines);
+
     aio_set_event_notifier(ctx, &ctx->notifier,
                            false,
                            (EventNotifierHandler *)
@@ -XXX,XX +XXX,XX @@ fail:
     return NULL;
 }

+void aio_co_schedule(AioContext *ctx, Coroutine *co)
+{
+    trace_aio_co_schedule(ctx, co);
+    QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
+                              co, co_scheduled_next);
+    qemu_bh_schedule(ctx->co_schedule_bh);
+}
+
+void aio_co_wake(struct Coroutine *co)
+{
+    AioContext *ctx;
+
+    /* Read coroutine before co->ctx.  Matches smp_wmb in
+     * qemu_coroutine_enter.
+     */
+    smp_read_barrier_depends();
+    ctx = atomic_read(&co->ctx);
+
+    if (ctx != qemu_get_current_aio_context()) {
+        aio_co_schedule(ctx, co);
+        return;
+    }
+
+    if (qemu_in_coroutine()) {
+        Coroutine *self = qemu_coroutine_self();
+        assert(self != co);
+        QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next);
+    } else {
+        aio_context_acquire(ctx);
+        qemu_coroutine_enter(co);
+        aio_context_release(ctx);
+    }
+}
+
 void aio_context_ref(AioContext *ctx)
 {
     g_source_ref(&ctx->source);
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/atomic.h"
 #include "qemu/coroutine.h"
 #include "qemu/coroutine_int.h"
+#include "block/aio.h"

 enum {
     POOL_BATCH_SIZE = 64,
@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co)
     }

     co->caller = self;
+    co->ctx = qemu_get_current_aio_context();
+
+    /* Store co->ctx before anything that stores co.  Matches
+     * barrier in aio_co_wake.
+     */
+    smp_wmb();
+
     ret = qemu_coroutine_switch(self, co, COROUTINE_ENTER);

     qemu_co_queue_run_restart(co);
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
 poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64

+# util/async.c
+aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
+aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p"
+
 # util/thread-pool.c
 thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
 thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
--
2.9.3

The corresponding hunk in the v2 zoned-storage series carries the expected
output of the new zone-operation iotest:

+(2) opening the first zone
+report after:
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:3, [type: 2]
+
+opening the second zone
+report after:
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:3, [type: 2]
+
+opening the last zone
+report after:
+start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:3, [type: 2]
+
+(3) closing the first zone
+report after:
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
+
+closing the last zone
+report after:
+start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:1, [type: 2]
+
+(4) finishing the second zone
+After finishing a zone:
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:14, [type: 2]
+
+(5) resetting the second zone
+After resetting a zone:
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
+
+*** done
--
2.40.1
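
As a usage note on the APIs added in util/async.c above, here is a minimal
sketch of waking work in another AioContext. The names "my_iothread_ctx" and
"my_worker" are hypothetical; only qemu_coroutine_create(), aio_co_schedule()
and qemu_coroutine_yield() come from the tree.

    static void coroutine_fn my_worker(void *opaque)
    {
        /* ... do some work, then wait for the next wakeup ... */
        qemu_coroutine_yield();
        /* Execution resumes in whichever AioContext scheduled us next. */
    }

    static void start_worker(AioContext *my_iothread_ctx)
    {
        Coroutine *co = qemu_coroutine_create(my_worker, NULL);

        /* Safe from any thread: the coroutine is pushed onto the context's
         * scheduled_coroutines list and entered later by co_schedule_bh_cb()
         * in that context's thread. A coroutine must not be scheduled twice
         * before it has run.
         */
        aio_co_schedule(my_iothread_ctx, co);
    }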
From: Paolo Bonzini <pbonzini@redhat.com>

qcow2_create2 calls this.  Do not run a nested event loop, as that
breaks when aio_co_wake tries to queue the coroutine on the co_queue_wakeup
list of the currently running one.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-4-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/block-backend.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
 {
     QEMUIOVector qiov;
     struct iovec iov;
-    Coroutine *co;
     BlkRwCo rwco;

     iov = (struct iovec) {
@@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
         .ret = NOT_DONE,
     };

-    co = qemu_coroutine_create(co_entry, &rwco);
-    qemu_coroutine_enter(co);
-    BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        co_entry(&rwco);
+    } else {
+        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
+        qemu_coroutine_enter(co);
+        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
+    }

     return rwco.ret;
 }
--
2.9.3

From: Sam Li <faithilikerun@gmail.com>

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20230508045533.175575-8-faithilikerun@gmail.com
Message-id: 20230324090605.28361-8-faithilikerun@gmail.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/file-posix.c | 3 +++
 block/trace-events | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
         },
     };

+    trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS);
     return raw_thread_pool_submit(handle_aiocb_zone_report, &acb);
 }
 #endif
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
         },
     };

+    trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS,
+                        len >> BDRV_SECTOR_BITS);
     ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
     if (ret != 0) {
         error_report("ioctl %s failed %d", op_name, ret);
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ file_FindEjectableOpticalMedia(const char *media) "Matching using %s"
 file_setup_cdrom(const char *partition) "Using %s as optical disc"
 file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d"
 file_flush_fdatasync_failed(int err) "errno %d"
+zbd_zone_report(void *bs, unsigned int nr_zones, int64_t sector) "bs %p report %d zones starting at sector offset 0x%" PRIx64 ""
+zbd_zone_mgmt(void *bs, const char *op_name, int64_t sector, int64_t len) "bs %p %s starts at sector offset 0x%" PRIx64 " over a range of 0x%" PRIx64 " sectors"

 # ssh.c
 sftp_error(const char *op, const char *ssh_err, int ssh_err_code, int sftp_err_code) "%s failed: %s (libssh error code: %d, sftp error code: %d)"
--
2.40.1
49
diff view generated by jsdifflib
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
AioContext is fairly self contained, the only dependency is QEMUTimer but
3
Add the documentation about the zoned device support to virtio-blk
4
that in turn doesn't need anything else. So move them out of block-obj-y
4
emulation.
5
to avoid introducing a dependency from io/ to block-obj-y.
6
5
7
main-loop and its dependency iohandler also need to be moved, because
6
Signed-off-by: Sam Li <faithilikerun@gmail.com>
8
later in this series io/ will call iohandler_get_aio_context.
7
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
8
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
10
[Changed copyright "the QEMU team" to "other QEMU contributors" as
9
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
11
suggested by Daniel Berrange and agreed by Paolo.
10
Acked-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Message-id: 20230508045533.175575-9-faithilikerun@gmail.com
13
Message-id: 20230324090605.28361-9-faithilikerun@gmail.com
14
[Add index-api.rst to fix "zoned-storage.rst:document isn't included in
15
any toctree" error and fix pre-formatted code syntax.
12
--Stefan]
16
--Stefan]
13
14
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
15
Reviewed-by: Fam Zheng <famz@redhat.com>
16
Message-id: 20170213135235.12274-2-pbonzini@redhat.com
17
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
17
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
18
---
18
---
19
Makefile.objs | 4 ---
19
docs/devel/index-api.rst | 1 +
20
stubs/Makefile.objs | 1 +
20
docs/devel/zoned-storage.rst | 43 ++++++++++++++++++++++++++
21
tests/Makefile.include | 11 ++++----
21
docs/system/qemu-block-drivers.rst.inc | 6 ++++
22
util/Makefile.objs | 6 +++-
22
3 files changed, 50 insertions(+)
23
block/io.c | 29 -------------------
23
create mode 100644 docs/devel/zoned-storage.rst
24
stubs/linux-aio.c | 32 +++++++++++++++++++++
25
stubs/set-fd-handler.c | 11 --------
26
aio-posix.c => util/aio-posix.c | 2 +-
27
aio-win32.c => util/aio-win32.c | 0
28
util/aiocb.c | 55 +++++++++++++++++++++++++++++++++++++
29
async.c => util/async.c | 3 +-
30
iohandler.c => util/iohandler.c | 0
31
main-loop.c => util/main-loop.c | 0
32
qemu-timer.c => util/qemu-timer.c | 0
33
thread-pool.c => util/thread-pool.c | 2 +-
34
trace-events | 11 --------
35
util/trace-events | 11 ++++++++
36
17 files changed, 114 insertions(+), 64 deletions(-)
37
create mode 100644 stubs/linux-aio.c
38
rename aio-posix.c => util/aio-posix.c (99%)
39
rename aio-win32.c => util/aio-win32.c (100%)
40
create mode 100644 util/aiocb.c
41
rename async.c => util/async.c (99%)
42
rename iohandler.c => util/iohandler.c (100%)
43
rename main-loop.c => util/main-loop.c (100%)
44
rename qemu-timer.c => util/qemu-timer.c (100%)
45
rename thread-pool.c => util/thread-pool.c (99%)
46
24
47
diff --git a/Makefile.objs b/Makefile.objs
25
diff --git a/docs/devel/index-api.rst b/docs/devel/index-api.rst
48
index XXXXXXX..XXXXXXX 100644
26
index XXXXXXX..XXXXXXX 100644
49
--- a/Makefile.objs
27
--- a/docs/devel/index-api.rst
50
+++ b/Makefile.objs
28
+++ b/docs/devel/index-api.rst
51
@@ -XXX,XX +XXX,XX @@ chardev-obj-y = chardev/
29
@@ -XXX,XX +XXX,XX @@ generated from in-code annotations to function prototypes.
52
#######################################################################
30
memory
53
# block-obj-y is code used by both qemu system emulation and qemu-img
31
modules
54
32
ui
55
-block-obj-y = async.o thread-pool.o
33
+ zoned-storage
56
block-obj-y += nbd/
34
diff --git a/docs/devel/zoned-storage.rst b/docs/devel/zoned-storage.rst
57
block-obj-y += block.o blockjob.o
58
-block-obj-y += main-loop.o iohandler.o qemu-timer.o
59
-block-obj-$(CONFIG_POSIX) += aio-posix.o
60
-block-obj-$(CONFIG_WIN32) += aio-win32.o
61
block-obj-y += block/
62
block-obj-y += qemu-io-cmds.o
63
block-obj-$(CONFIG_REPLICATION) += replication.o
64
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
65
index XXXXXXX..XXXXXXX 100644
66
--- a/stubs/Makefile.objs
67
+++ b/stubs/Makefile.objs
68
@@ -XXX,XX +XXX,XX @@ stub-obj-y += get-vm-name.o
69
stub-obj-y += iothread.o
70
stub-obj-y += iothread-lock.o
71
stub-obj-y += is-daemonized.o
72
+stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
73
stub-obj-y += machine-init-done.o
74
stub-obj-y += migr-blocker.o
75
stub-obj-y += monitor.o
76
diff --git a/tests/Makefile.include b/tests/Makefile.include
77
index XXXXXXX..XXXXXXX 100644
78
--- a/tests/Makefile.include
79
+++ b/tests/Makefile.include
80
@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-visitor-serialization$(EXESUF)
81
check-unit-y += tests/test-iov$(EXESUF)
82
gcov-files-test-iov-y = util/iov.c
83
check-unit-y += tests/test-aio$(EXESUF)
84
+gcov-files-test-aio-y = util/async.c util/qemu-timer.o
85
+gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c
86
+gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c
87
check-unit-y += tests/test-throttle$(EXESUF)
88
gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
89
gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
90
@@ -XXX,XX +XXX,XX @@ tests/check-qjson$(EXESUF): tests/check-qjson.o $(test-util-obj-y)
91
tests/check-qom-interface$(EXESUF): tests/check-qom-interface.o $(test-qom-obj-y)
92
tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
93
94
-tests/test-char$(EXESUF): tests/test-char.o qemu-timer.o \
95
-    $(test-util-obj-y) $(qtest-obj-y) $(test-block-obj-y) $(chardev-obj-y)
96
+tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y)
97
tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
98
tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
99
tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
100
@@ -XXX,XX +XXX,XX @@ tests/test-vmstate$(EXESUF): tests/test-vmstate.o \
101
    migration/vmstate.o migration/qemu-file.o \
102
migration/qemu-file-channel.o migration/qjson.o \
103
    $(test-io-obj-y)
104
-tests/test-timed-average$(EXESUF): tests/test-timed-average.o qemu-timer.o \
105
-    $(test-util-obj-y)
106
+tests/test-timed-average$(EXESUF): tests/test-timed-average.o $(test-util-obj-y)
107
tests/test-base64$(EXESUF): tests/test-base64.o \
108
    libqemuutil.a libqemustub.a
109
tests/ptimer-test$(EXESUF): tests/ptimer-test.o tests/ptimer-test-stubs.o hw/core/ptimer.o libqemustub.a
110
@@ -XXX,XX +XXX,XX @@ tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-usb-obj-y)
111
tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y)
112
tests/pc-cpu-test$(EXESUF): tests/pc-cpu-test.o
113
tests/postcopy-test$(EXESUF): tests/postcopy-test.o
114
-tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-timer.o \
115
+tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o $(test-util-obj-y) \
116
    $(qtest-obj-y) $(test-io-obj-y) $(libqos-virtio-obj-y) $(libqos-pc-obj-y) \
117
    $(chardev-obj-y)
118
tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o
119
diff --git a/util/Makefile.objs b/util/Makefile.objs
120
index XXXXXXX..XXXXXXX 100644
121
--- a/util/Makefile.objs
122
+++ b/util/Makefile.objs
123
@@ -XXX,XX +XXX,XX @@
124
util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
125
util-obj-y += bufferiszero.o
126
util-obj-y += lockcnt.o
127
+util-obj-y += aiocb.o async.o thread-pool.o qemu-timer.o
128
+util-obj-y += main-loop.o iohandler.o
129
+util-obj-$(CONFIG_POSIX) += aio-posix.o
130
util-obj-$(CONFIG_POSIX) += compatfd.o
131
util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
132
util-obj-$(CONFIG_POSIX) += mmap-alloc.o
133
util-obj-$(CONFIG_POSIX) += oslib-posix.o
134
util-obj-$(CONFIG_POSIX) += qemu-openpty.o
135
util-obj-$(CONFIG_POSIX) += qemu-thread-posix.o
136
-util-obj-$(CONFIG_WIN32) += event_notifier-win32.o
137
util-obj-$(CONFIG_POSIX) += memfd.o
138
+util-obj-$(CONFIG_WIN32) += aio-win32.o
139
+util-obj-$(CONFIG_WIN32) += event_notifier-win32.o
140
util-obj-$(CONFIG_WIN32) += oslib-win32.o
141
util-obj-$(CONFIG_WIN32) += qemu-thread-win32.o
142
util-obj-y += envlist.o path.o module.o
143
diff --git a/block/io.c b/block/io.c
144
index XXXXXXX..XXXXXXX 100644
145
--- a/block/io.c
146
+++ b/block/io.c
147
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
148
return &acb->common;
149
}
150
151
-void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
152
- BlockCompletionFunc *cb, void *opaque)
153
-{
154
- BlockAIOCB *acb;
155
-
156
- acb = g_malloc(aiocb_info->aiocb_size);
157
- acb->aiocb_info = aiocb_info;
158
- acb->bs = bs;
159
- acb->cb = cb;
160
- acb->opaque = opaque;
161
- acb->refcnt = 1;
162
- return acb;
163
-}
164
-
165
-void qemu_aio_ref(void *p)
166
-{
167
- BlockAIOCB *acb = p;
168
- acb->refcnt++;
169
-}
170
-
171
-void qemu_aio_unref(void *p)
172
-{
173
- BlockAIOCB *acb = p;
174
- assert(acb->refcnt > 0);
175
- if (--acb->refcnt == 0) {
176
- g_free(acb);
177
- }
178
-}
179
-
180
/**************************************************************/
181
/* Coroutine block device emulation */
182
183
diff --git a/stubs/linux-aio.c b/stubs/linux-aio.c
184
new file mode 100644
35
new file mode 100644
185
index XXXXXXX..XXXXXXX
36
index XXXXXXX..XXXXXXX
186
--- /dev/null
37
--- /dev/null
187
+++ b/stubs/linux-aio.c
38
+++ b/docs/devel/zoned-storage.rst
188
@@ -XXX,XX +XXX,XX @@
39
@@ -XXX,XX +XXX,XX @@
189
+/*
40
+=============
190
+ * Linux native AIO support.
41
+zoned-storage
191
+ *
42
+=============
192
+ * Copyright (C) 2009 IBM, Corp.
193
+ * Copyright (C) 2009 Red Hat, Inc.
194
+ *
195
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
196
+ * See the COPYING file in the top-level directory.
197
+ */
198
+#include "qemu/osdep.h"
199
+#include "block/aio.h"
200
+#include "block/raw-aio.h"
201
+
43
+
202
+void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
44
+Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones
203
+{
45
+that are larger than the LBA size. They can only allow sequential writes, which
204
+ abort();
46
+can reduce write amplification in SSDs, and potentially lead to higher
205
+}
47
+throughput and increased capacity. More details about ZBDs can be found at:
206
+
48
+
207
+void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
49
+https://zonedstorage.io/docs/introduction/zoned-storage
208
+{
209
+ abort();
210
+}
211
+
50
+
212
+LinuxAioState *laio_init(void)
51
+1. Block layer APIs for zoned storage
213
+{
52
+-------------------------------------
214
+ abort();
53
+QEMU block layer supports three zoned storage models:
215
+}
54
+- BLK_Z_HM: The host-managed zoned model only allows sequential writes access
55
+to zones. It supports ZBD-specific I/O commands that can be used by a host to
56
+manage the zones of a device.
57
+- BLK_Z_HA: The host-aware zoned model allows random write operations in
58
+zones, making it backward compatible with regular block devices.
59
+- BLK_Z_NONE: The non-zoned model has no zones support. It includes both
60
+regular and drive-managed ZBD devices. ZBD-specific I/O commands are not
61
+supported.
216
+
62
+
217
+void laio_cleanup(LinuxAioState *s)
63
+The block device information resides inside BlockDriverState. QEMU uses
218
+{
64
+BlockLimits struct(BlockDriverState::bl) that is continuously accessed by the
219
+ abort();
65
+block layer while processing I/O requests. A BlockBackend has a root pointer to
220
+}
66
+a BlockDriverState graph(for example, raw format on top of file-posix). The
221
diff --git a/stubs/set-fd-handler.c b/stubs/set-fd-handler.c
67
+zoned storage information can be propagated from the leaf BlockDriverState all
68
+the way up to the BlockBackend. If the zoned storage model in file-posix is
69
+set to BLK_Z_HM, then block drivers will declare support for zoned host device.
70
+
71
+The block layer APIs support commands needed for zoned storage devices,
72
+including report zones, four zone operations, and zone append.
73
+
74
+2. Emulating zoned storage controllers
75
+--------------------------------------
76
+When the BlockBackend's BlockLimits model reports a zoned storage device, users
77
+like the virtio-blk emulation or the qemu-io-cmds.c utility can use block layer
78
+APIs for zoned storage emulation or testing.
79
+
80
+For example, to test zone_report on a null_blk device using qemu-io is::
81
+
82
+ $ path/to/qemu-io --image-opts -n driver=host_device,filename=/dev/nullb0 -c "zrp offset nr_zones"
83
diff --git a/docs/system/qemu-block-drivers.rst.inc b/docs/system/qemu-block-drivers.rst.inc
222
index XXXXXXX..XXXXXXX 100644
84
index XXXXXXX..XXXXXXX 100644
223
--- a/stubs/set-fd-handler.c
85
--- a/docs/system/qemu-block-drivers.rst.inc
224
+++ b/stubs/set-fd-handler.c
86
+++ b/docs/system/qemu-block-drivers.rst.inc
225
@@ -XXX,XX +XXX,XX @@ void qemu_set_fd_handler(int fd,
87
@@ -XXX,XX +XXX,XX @@ Hard disks
226
{
88
you may corrupt your host data (use the ``-snapshot`` command
227
abort();
89
line option or modify the device permissions accordingly).
228
}
90
229
-
91
+Zoned block devices
230
-void aio_set_fd_handler(AioContext *ctx,
92
+ Zoned block devices can be passed through to the guest if the emulated storage
231
- int fd,
93
+ controller supports zoned storage. Use ``--blockdev host_device,
232
- bool is_external,
94
+ node-name=drive0,filename=/dev/nullb0,cache.direct=on`` to pass through
233
- IOHandler *io_read,
95
+ ``/dev/nullb0`` as ``drive0``.
234
- IOHandler *io_write,
235
- AioPollFn *io_poll,
236
- void *opaque)
237
-{
238
- abort();
239
-}
240
diff --git a/aio-posix.c b/util/aio-posix.c
241
similarity index 99%
242
rename from aio-posix.c
243
rename to util/aio-posix.c
244
index XXXXXXX..XXXXXXX 100644
245
--- a/aio-posix.c
246
+++ b/util/aio-posix.c
247
@@ -XXX,XX +XXX,XX @@
248
#include "qemu/rcu_queue.h"
249
#include "qemu/sockets.h"
250
#include "qemu/cutils.h"
251
-#include "trace-root.h"
252
+#include "trace.h"
253
#ifdef CONFIG_EPOLL_CREATE1
254
#include <sys/epoll.h>
255
#endif
256
diff --git a/aio-win32.c b/util/aio-win32.c
257
similarity index 100%
258
rename from aio-win32.c
259
rename to util/aio-win32.c
260
diff --git a/util/aiocb.c b/util/aiocb.c
261
new file mode 100644
262
index XXXXXXX..XXXXXXX
263
--- /dev/null
264
+++ b/util/aiocb.c
265
@@ -XXX,XX +XXX,XX @@
266
+/*
267
+ * BlockAIOCB allocation
268
+ *
269
+ * Copyright (c) 2003-2017 Fabrice Bellard and other QEMU contributors
270
+ *
271
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
272
+ * of this software and associated documentation files (the "Software"), to deal
273
+ * in the Software without restriction, including without limitation the rights
274
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
275
+ * copies of the Software, and to permit persons to whom the Software is
276
+ * furnished to do so, subject to the following conditions:
277
+ *
278
+ * The above copyright notice and this permission notice shall be included in
279
+ * all copies or substantial portions of the Software.
280
+ *
281
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
282
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
283
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
284
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
285
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
286
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
287
+ * THE SOFTWARE.
288
+ */
289
+
96
+
290
+#include "qemu/osdep.h"
97
Windows
291
+#include "block/aio.h"
98
^^^^^^^
292
+
99
293
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
294
+ BlockCompletionFunc *cb, void *opaque)
295
+{
296
+ BlockAIOCB *acb;
297
+
298
+ acb = g_malloc(aiocb_info->aiocb_size);
299
+ acb->aiocb_info = aiocb_info;
300
+ acb->bs = bs;
301
+ acb->cb = cb;
302
+ acb->opaque = opaque;
303
+ acb->refcnt = 1;
304
+ return acb;
305
+}
306
+
307
+void qemu_aio_ref(void *p)
308
+{
309
+ BlockAIOCB *acb = p;
310
+ acb->refcnt++;
311
+}
312
+
313
+void qemu_aio_unref(void *p)
314
+{
315
+ BlockAIOCB *acb = p;
316
+ assert(acb->refcnt > 0);
317
+ if (--acb->refcnt == 0) {
318
+ g_free(acb);
319
+ }
320
+}
321
diff --git a/async.c b/util/async.c
322
similarity index 99%
323
rename from async.c
324
rename to util/async.c
325
index XXXXXXX..XXXXXXX 100644
326
--- a/async.c
327
+++ b/util/async.c
328
@@ -XXX,XX +XXX,XX @@
329
/*
330
- * QEMU System Emulator
331
+ * Data plane event loop
332
*
333
* Copyright (c) 2003-2008 Fabrice Bellard
334
+ * Copyright (c) 2009-2017 QEMU contributors
335
*
336
* Permission is hereby granted, free of charge, to any person obtaining a copy
337
* of this software and associated documentation files (the "Software"), to deal
338
diff --git a/iohandler.c b/util/iohandler.c
339
similarity index 100%
340
rename from iohandler.c
341
rename to util/iohandler.c
342
diff --git a/main-loop.c b/util/main-loop.c
343
similarity index 100%
344
rename from main-loop.c
345
rename to util/main-loop.c
346
diff --git a/qemu-timer.c b/util/qemu-timer.c
347
similarity index 100%
348
rename from qemu-timer.c
349
rename to util/qemu-timer.c
350
diff --git a/thread-pool.c b/util/thread-pool.c
351
similarity index 99%
352
rename from thread-pool.c
353
rename to util/thread-pool.c
354
index XXXXXXX..XXXXXXX 100644
355
--- a/thread-pool.c
356
+++ b/util/thread-pool.c
357
@@ -XXX,XX +XXX,XX @@
358
#include "qemu/queue.h"
359
#include "qemu/thread.h"
360
#include "qemu/coroutine.h"
361
-#include "trace-root.h"
362
+#include "trace.h"
363
#include "block/thread-pool.h"
364
#include "qemu/main-loop.h"
365
366
diff --git a/trace-events b/trace-events
367
index XXXXXXX..XXXXXXX 100644
368
--- a/trace-events
369
+++ b/trace-events
370
@@ -XXX,XX +XXX,XX @@
371
#
372
# The <format-string> should be a sprintf()-compatible format string.
373
374
-# aio-posix.c
375
-run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64
376
-run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
377
-poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
378
-poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
379
-
380
-# thread-pool.c
381
-thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
382
-thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
383
-thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
384
-
385
# ioport.c
386
cpu_in(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
387
cpu_out(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
388
diff --git a/util/trace-events b/util/trace-events
389
index XXXXXXX..XXXXXXX 100644
390
--- a/util/trace-events
391
+++ b/util/trace-events
392
@@ -XXX,XX +XXX,XX @@
393
# See docs/tracing.txt for syntax documentation.
394
395
+# util/aio-posix.c
396
+run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64
397
+run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
398
+poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
399
+poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
400
+
401
+# util/thread-pool.c
402
+thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
403
+thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
404
+thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
405
+
406
# util/buffer.c
407
buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd"
408
buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
409
--
100
--
410
2.9.3
101
2.40.1
411
412
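
As a side note on section 2 of docs/devel/zoned-storage.rst above: an
emulated controller can gate its zoned commands on the BlockLimits model.
A minimal sketch, assuming the BlockZoneModel field added to BlockLimits by
this series is reachable as bs->bl.zoned (the helper name is hypothetical):

    static bool my_device_supports_zones(BlockBackend *blk)
    {
        BlockDriverState *bs = blk_bs(blk);

        /* BLK_Z_HM/BLK_Z_HA devices accept zone commands; BLK_Z_NONE
         * behaves like a regular block device.
         */
        return bs && bs->bl.zoned != BLK_Z_NONE;
    }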
Deleted patch
From: Paolo Bonzini <pbonzini@redhat.com>

Once the thread pool starts using aio_co_wake, it will also need
qemu_get_current_aio_context().  Make test-thread-pool create
an AioContext with qemu_init_main_loop, so that stubs/iothread.c
and tests/iothread.c can provide the rest.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-5-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/test-thread-pool.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-thread-pool.c
+++ b/tests/test-thread-pool.c
@@ -XXX,XX +XXX,XX @@
 #include "qapi/error.h"
 #include "qemu/timer.h"
 #include "qemu/error-report.h"
+#include "qemu/main-loop.h"

 static AioContext *ctx;
 static ThreadPool *pool;
@@ -XXX,XX +XXX,XX @@ static void test_cancel_async(void)
 int main(int argc, char **argv)
 {
     int ret;
-    Error *local_error = NULL;

-    init_clocks();
-
-    ctx = aio_context_new(&local_error);
-    if (!ctx) {
-        error_reportf_err(local_error, "Failed to create AIO Context: ");
-        exit(1);
-    }
+    qemu_init_main_loop(&error_abort);
+    ctx = qemu_get_current_aio_context();
     pool = aio_get_thread_pool(ctx);

     g_test_init(&argc, &argv, NULL);
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)

     ret = g_test_run();

-    aio_context_unref(ctx);
     return ret;
 }
--
2.9.3
Deleted patch
From: Paolo Bonzini <pbonzini@redhat.com>

This is in preparation for making qio_channel_yield work on
AioContexts other than the main one.

Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-6-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/io/channel.h | 25 +++++++++++++++++++++++++
 io/channel-command.c | 13 +++++++++++++
 io/channel-file.c | 11 +++++++++++
 io/channel-socket.c | 16 +++++++++++-----
 io/channel-tls.c | 12 ++++++++++++
 io/channel-watch.c | 6 ++++++
 io/channel.c | 11 +++++++++++
 7 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index XXXXXXX..XXXXXXX 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -XXX,XX +XXX,XX @@

 #include "qemu-common.h"
 #include "qom/object.h"
+#include "block/aio.h"

 #define TYPE_QIO_CHANNEL "qio-channel"
 #define QIO_CHANNEL(obj) \
@@ -XXX,XX +XXX,XX @@ struct QIOChannelClass {
                      off_t offset,
                      int whence,
                      Error **errp);
+    void (*io_set_aio_fd_handler)(QIOChannel *ioc,
+                                  AioContext *ctx,
+                                  IOHandler *io_read,
+                                  IOHandler *io_write,
+                                  void *opaque);
 };

 /* General I/O handling functions */
@@ -XXX,XX +XXX,XX @@ void qio_channel_yield(QIOChannel *ioc,
 void qio_channel_wait(QIOChannel *ioc,
                       GIOCondition condition);

+/**
+ * qio_channel_set_aio_fd_handler:
+ * @ioc: the channel object
+ * @ctx: the AioContext to set the handlers on
+ * @io_read: the read handler
+ * @io_write: the write handler
+ * @opaque: the opaque value passed to the handler
+ *
+ * This is used internally by qio_channel_yield().  It can
+ * be used by channel implementations to forward the handlers
+ * to another channel (e.g. from #QIOChannelTLS to the
+ * underlying socket).
+ */
+void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
+                                    AioContext *ctx,
+                                    IOHandler *io_read,
+                                    IOHandler *io_write,
+                                    void *opaque);
+
 #endif /* QIO_CHANNEL_H */
diff --git a/io/channel-command.c b/io/channel-command.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-command.c
+++ b/io/channel-command.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_command_close(QIOChannel *ioc,
 }


+static void qio_channel_command_set_aio_fd_handler(QIOChannel *ioc,
+                                                   AioContext *ctx,
+                                                   IOHandler *io_read,
+                                                   IOHandler *io_write,
+                                                   void *opaque)
+{
+    QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
+    aio_set_fd_handler(ctx, cioc->readfd, false, io_read, NULL, NULL, opaque);
+    aio_set_fd_handler(ctx, cioc->writefd, false, NULL, io_write, NULL, opaque);
+}
+
+
 static GSource *qio_channel_command_create_watch(QIOChannel *ioc,
                                                  GIOCondition condition)
 {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_command_class_init(ObjectClass *klass,
     ioc_klass->io_set_blocking = qio_channel_command_set_blocking;
     ioc_klass->io_close = qio_channel_command_close;
     ioc_klass->io_create_watch = qio_channel_command_create_watch;
+    ioc_klass->io_set_aio_fd_handler = qio_channel_command_set_aio_fd_handler;
 }

 static const TypeInfo qio_channel_command_info = {
diff --git a/io/channel-file.c b/io/channel-file.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-file.c
+++ b/io/channel-file.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_file_close(QIOChannel *ioc,
 }


+static void qio_channel_file_set_aio_fd_handler(QIOChannel *ioc,
+                                                AioContext *ctx,
+                                                IOHandler *io_read,
+                                                IOHandler *io_write,
+                                                void *opaque)
+{
+    QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
+    aio_set_fd_handler(ctx, fioc->fd, false, io_read, io_write, NULL, opaque);
+}
+
 static GSource *qio_channel_file_create_watch(QIOChannel *ioc,
                                               GIOCondition condition)
 {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_file_class_init(ObjectClass *klass,
     ioc_klass->io_seek = qio_channel_file_seek;
     ioc_klass->io_close = qio_channel_file_close;
     ioc_klass->io_create_watch = qio_channel_file_create_watch;
+    ioc_klass->io_set_aio_fd_handler = qio_channel_file_set_aio_fd_handler;
 }

 static const TypeInfo qio_channel_file_info = {
diff --git a/io/channel-socket.c b/io/channel-socket.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -XXX,XX +XXX,XX @@ qio_channel_socket_set_blocking(QIOChannel *ioc,
         qemu_set_block(sioc->fd);
     } else {
         qemu_set_nonblock(sioc->fd);
-#ifdef WIN32
-        WSAEventSelect(sioc->fd, ioc->event,
-                       FD_READ | FD_ACCEPT | FD_CLOSE |
-                       FD_CONNECT | FD_WRITE | FD_OOB);
-#endif
     }
     return 0;
 }
@@ -XXX,XX +XXX,XX @@ qio_channel_socket_shutdown(QIOChannel *ioc,
     return 0;
 }

+static void qio_channel_socket_set_aio_fd_handler(QIOChannel *ioc,
+                                                  AioContext *ctx,
+                                                  IOHandler *io_read,
+                                                  IOHandler *io_write,
+                                                  void *opaque)
+{
+    QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
+    aio_set_fd_handler(ctx, sioc->fd, false, io_read, io_write, NULL, opaque);
+}
+
 static GSource *qio_channel_socket_create_watch(QIOChannel *ioc,
                                                 GIOCondition condition)
 {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_socket_class_init(ObjectClass *klass,
     ioc_klass->io_set_cork = qio_channel_socket_set_cork;
     ioc_klass->io_set_delay = qio_channel_socket_set_delay;
     ioc_klass->io_create_watch = qio_channel_socket_create_watch;
+    ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler;
 }

 static const TypeInfo qio_channel_socket_info = {
diff --git a/io/channel-tls.c b/io/channel-tls.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-tls.c
+++ b/io/channel-tls.c
@@ -XXX,XX +XXX,XX @@ static int qio_channel_tls_close(QIOChannel *ioc,
     return qio_channel_close(tioc->master, errp);
 }

+static void qio_channel_tls_set_aio_fd_handler(QIOChannel *ioc,
+                                               AioContext *ctx,
+                                               IOHandler *io_read,
+                                               IOHandler *io_write,
+                                               void *opaque)
+{
+    QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);
+
+    qio_channel_set_aio_fd_handler(tioc->master, ctx, io_read, io_write, opaque);
+}
+
 static GSource *qio_channel_tls_create_watch(QIOChannel *ioc,
                                              GIOCondition condition)
 {
@@ -XXX,XX +XXX,XX @@ static void qio_channel_tls_class_init(ObjectClass *klass,
     ioc_klass->io_close = qio_channel_tls_close;
     ioc_klass->io_shutdown = qio_channel_tls_shutdown;
     ioc_klass->io_create_watch = qio_channel_tls_create_watch;
+    ioc_klass->io_set_aio_fd_handler = qio_channel_tls_set_aio_fd_handler;
 }

 static const TypeInfo qio_channel_tls_info = {
diff --git a/io/channel-watch.c b/io/channel-watch.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel-watch.c
+++ b/io/channel-watch.c
@@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_socket_watch(QIOChannel *ioc,
     GSource *source;
     QIOChannelSocketSource *ssource;

+#ifdef WIN32
+    WSAEventSelect(socket, ioc->event,
+                   FD_READ | FD_ACCEPT | FD_CLOSE |
+                   FD_CONNECT | FD_WRITE | FD_OOB);
+#endif
+
     source = g_source_new(&qio_channel_socket_source_funcs,
                           sizeof(QIOChannelSocketSource));
     ssource = (QIOChannelSocketSource *)source;
diff --git a/io/channel.c b/io/channel.c
index XXXXXXX..XXXXXXX 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_watch(QIOChannel *ioc,
 }


+void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
+                                    AioContext *ctx,
+                                    IOHandler *io_read,
+                                    IOHandler *io_write,
+                                    void *opaque)
+{
+    QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
+
+    klass->io_set_aio_fd_handler(ioc, ctx, io_read, io_write, opaque);
+}
+
 guint qio_channel_add_watch(QIOChannel *ioc,
                             GIOCondition condition,
                             QIOChannelFunc func,
--
2.9.3
From: Paolo Bonzini <pbonzini@redhat.com>

Add two implementations of the same benchmark as the previous patch,
but using pthreads.  One uses a normal QemuMutex, the other is Linux
only and implements a fair mutex based on MCS locks and futexes.
This shows that the slower performance of the 5-thread case is due to
the fairness of CoMutex, rather than to coroutines.  If fairness does
not matter, as is the case with two threads, CoMutex can actually be
faster than pthreads.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-4-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/test-aio-multithread.c | 164 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)

diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-aio-multithread.c
+++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@ static void test_multi_co_mutex_2_30(void)
     test_multi_co_mutex(2, 30);
 }

+/* Same test with fair mutexes, for performance comparison. */
+
+#ifdef CONFIG_LINUX
+#include "qemu/futex.h"
+
+/* The nodes for the mutex reside in this structure (on which we try to avoid
+ * false sharing).  The head of the mutex is in the "mutex_head" variable.
+ */
+static struct {
+    int next, locked;
+    int padding[14];
+} nodes[NUM_CONTEXTS] __attribute__((__aligned__(64)));
+
+static int mutex_head = -1;
+
+static void mcs_mutex_lock(void)
+{
+    int prev;
+
+    nodes[id].next = -1;
+    nodes[id].locked = 1;
+    prev = atomic_xchg(&mutex_head, id);
+    if (prev != -1) {
+        atomic_set(&nodes[prev].next, id);
+        qemu_futex_wait(&nodes[id].locked, 1);
+    }
+}
+
+static void mcs_mutex_unlock(void)
+{
+    int next;
+    if (nodes[id].next == -1) {
+        if (atomic_read(&mutex_head) == id &&
+            atomic_cmpxchg(&mutex_head, id, -1) == id) {
+            /* Last item in the list, exit.  */
+            return;
+        }
+        while (atomic_read(&nodes[id].next) == -1) {
+            /* mcs_mutex_lock did the xchg, but has not updated
+             * nodes[prev].next yet.
+             */
+        }
+    }
+
+    /* Wake up the next in line.  */
+    next = nodes[id].next;
+    nodes[next].locked = 0;
+    qemu_futex_wake(&nodes[next].locked, 1);
+}
+
+static void test_multi_fair_mutex_entry(void *opaque)
+{
+    while (!atomic_mb_read(&now_stopping)) {
+        mcs_mutex_lock();
+        counter++;
+        mcs_mutex_unlock();
+        atomic_inc(&atomic_counter);
+    }
+    atomic_dec(&running);
+}
+
+static void test_multi_fair_mutex(int threads, int seconds)
+{
+    int i;
+
+    assert(mutex_head == -1);
+    counter = 0;
+    atomic_counter = 0;
+    now_stopping = false;
+
+    create_aio_contexts();
+    assert(threads <= NUM_CONTEXTS);
+    running = threads;
+    for (i = 0; i < threads; i++) {
+        Coroutine *co1 = qemu_coroutine_create(test_multi_fair_mutex_entry, NULL);
+        aio_co_schedule(ctx[i], co1);
+    }
+
+    g_usleep(seconds * 1000000);
+
+    atomic_mb_set(&now_stopping, true);
+    while (running > 0) {
+        g_usleep(100000);
+    }
+
+    join_aio_contexts();
+    g_test_message("%d iterations/second\n", counter / seconds);
+    g_assert_cmpint(counter, ==, atomic_counter);
+}
+
+static void test_multi_fair_mutex_1(void)
+{
+    test_multi_fair_mutex(NUM_CONTEXTS, 1);
+}
+
+static void test_multi_fair_mutex_10(void)
+{
+    test_multi_fair_mutex(NUM_CONTEXTS, 10);
+}
+#endif
+
+/* Same test with pthread mutexes, for performance comparison and
+ * portability. */
+
+static QemuMutex mutex;
+
+static void test_multi_mutex_entry(void *opaque)
+{
+    while (!atomic_mb_read(&now_stopping)) {
+        qemu_mutex_lock(&mutex);
+        counter++;
+        qemu_mutex_unlock(&mutex);
+        atomic_inc(&atomic_counter);
+    }
+    atomic_dec(&running);
+}
+
+static void test_multi_mutex(int threads, int seconds)
+{
+    int i;
+
+    qemu_mutex_init(&mutex);
+    counter = 0;
+    atomic_counter = 0;
+    now_stopping = false;
+
+    create_aio_contexts();
+    assert(threads <= NUM_CONTEXTS);
+    running = threads;
+    for (i = 0; i < threads; i++) {
+        Coroutine *co1 = qemu_coroutine_create(test_multi_mutex_entry, NULL);
+        aio_co_schedule(ctx[i], co1);
+    }
+
+    g_usleep(seconds * 1000000);
+
+    atomic_mb_set(&now_stopping, true);
+    while (running > 0) {
+        g_usleep(100000);
+    }
+
+    join_aio_contexts();
+    g_test_message("%d iterations/second\n", counter / seconds);
+    g_assert_cmpint(counter, ==, atomic_counter);
+}
+
+static void test_multi_mutex_1(void)
+{
+    test_multi_mutex(NUM_CONTEXTS, 1);
+}
+
+static void test_multi_mutex_10(void)
+{
+    test_multi_mutex(NUM_CONTEXTS, 10);
+}
+
 /* End of tests. */

 int main(int argc, char **argv)
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
         g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
         g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1);
         g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3);
+#ifdef CONFIG_LINUX
+        g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_1);
+#endif
+        g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_1);
     } else {
         g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
         g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10);
         g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30);
+#ifdef CONFIG_LINUX
+        g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_10);
+#endif
+        g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_10);
     }

     return g_test_run();
 }
--
2.9.3

From: Sam Li <faithilikerun@gmail.com>

Since Linux doesn't have a user API to issue zone append operations to
zoned devices from user space, the file-posix driver is modified to add
zone append emulation using regular writes. To do this, the file-posix
driver tracks the write pointer (wp) location of every zone of the device
in an array of uint64_t. The most significant bit of each wp entry
indicates whether the zone is a conventional zone.

A zone's wp can change due to the following operations:
- zone reset: the wp moves to the start offset of that zone
- zone finish: the wp moves to the end location of that zone
- write to a zone
- zone append

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Message-id: 20230508051510.177850-2-faithilikerun@gmail.com
[Fix errno propagation from handle_aiocb_zone_mgmt()
--Stefan]
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/block/block-common.h | 14 +++
 include/block/block_int-common.h | 5 +
 block/file-posix.c | 178 ++++++++++++++++++++++++++++++-
 3 files changed, 193 insertions(+), 4 deletions(-)

diff --git a/include/block/block-common.h b/include/block/block-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -XXX,XX +XXX,XX @@ typedef struct BlockZoneDescriptor {
     BlockZoneState state;
 } BlockZoneDescriptor;

+/*
+ * Track write pointers of a zone in bytes.
+ */
+typedef struct BlockZoneWps {
+    CoMutex colock;
+    uint64_t wp[];
+} BlockZoneWps;
+
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
@@ -XXX,XX +XXX,XX @@ typedef enum {
 #define BDRV_SECTOR_BITS   9
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)

+/*
+ * Get the first most significant bit of wp. If it is zero, then
+ * the zone type is SWR.
+ */
+#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
+
 #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                            INT_MAX >> BDRV_SECTOR_BITS)
 #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -XXX,XX +XXX,XX @@ typedef struct BlockLimits {

     /* maximum number of active zones */
     uint32_t max_active_zones;
+
+    uint32_t write_granularity;
 } BlockLimits;

 typedef struct BdrvOpBlocker BdrvOpBlocker;
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
     CoMutex bsc_modify_lock;
     /* Always non-NULL, but must only be dereferenced under an RCU read guard */
     BdrvBlockStatusCache *block_status_cache;
+
+    /* array of write pointers' location of each zone in the zoned device. */
+    BlockZoneWps *wps;
 };

 struct BlockBackendRootState {
diff --git a/block/file-posix.c b/block/file-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@ static int hdev_get_max_segments(int fd, struct stat *st)
 }

 #if defined(CONFIG_BLKZONED)
+/*
+ * If the reset_all flag is true, then the wps of zones whose state is
+ * not readonly or offline should all be reset to the start sector.
+ * Else, take the real wp of the device.
+ */
+static int get_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
+                        unsigned int nrz, bool reset_all)
+{
+    struct blk_zone *blkz;
+    size_t rep_size;
+    uint64_t sector = offset >> BDRV_SECTOR_BITS;
+    BlockZoneWps *wps = bs->wps;
+    unsigned int j = offset / bs->bl.zone_size;
+    unsigned int n = 0, i = 0;
+    int ret;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+
+    rep = g_malloc(rep_size);
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+                         fd, offset, errno);
+            return -errno;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; ++i, ++n, ++j) {
+            /*
+             * The wp tracking cares only about sequential writes required and
+             * sequential write preferred zones so that the wp can advance to
+             * the right location.
+             * Use the most significant bit of the wp location to indicate the
+             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
+             */
+            if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+                wps->wp[j] |= 1ULL << 63;
+            } else {
+                switch(blkz[i].cond) {
+                case BLK_ZONE_COND_FULL:
+                case BLK_ZONE_COND_READONLY:
+                    /* Zone not writable */
+                    wps->wp[j] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
+                    break;
+                case BLK_ZONE_COND_OFFLINE:
+                    /* Zone not writable nor readable */
+                    wps->wp[j] = (blkz[i].start) << BDRV_SECTOR_BITS;
+                    break;
+                default:
+                    if (reset_all) {
+                        wps->wp[j] = blkz[i].start << BDRV_SECTOR_BITS;
+                    } else {
+                        wps->wp[j] = blkz[i].wp << BDRV_SECTOR_BITS;
+                    }
+                    break;
+                }
+            }
+        }
+        sector = blkz[i - 1].start + blkz[i - 1].len;
+    }
+
+    return 0;
+}
+
+static void update_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
+                            unsigned int nrz)
+{
+    if (get_zones_wp(bs, fd, offset, nrz, 0) < 0) {
+        error_report("update zone wp failed");
+    }
+}
+
 static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
                                      Error **errp)
 {
+    BDRVRawState *s = bs->opaque;
     BlockZoneModel zoned;
     int ret;

@@ -XXX,XX +XXX,XX @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
     if (ret > 0) {
         bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
     }
+
+    ret = get_sysfs_long_val(st, "physical_block_size");
+    if (ret >= 0) {
+        bs->bl.write_granularity = ret;
+    }
+
+    /* The refresh_limits() function can be called multiple times. */
+    g_free(bs->wps);
+    bs->wps = g_malloc(sizeof(BlockZoneWps) +
+                       sizeof(int64_t) * bs->bl.nr_zones);
+    ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "report wps failed");
+        bs->wps = NULL;
+        return;
+    }
+    qemu_co_mutex_init(&bs->wps->colock);
 }
 #else /* !defined(CONFIG_BLKZONED) */
 static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
@@ -XXX,XX +XXX,XX @@ static int handle_aiocb_zone_mgmt(void *opaque)
         ret = ioctl(fd, aiocb->zone_mgmt.op, &range);
     } while (ret != 0 && errno == EINTR);

-    return ret;
+    return ret < 0 ? -errno : ret;
 }
 #endif

@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
 {
     BDRVRawState *s = bs->opaque;
     RawPosixAIOData acb;
+    int ret;

     if (fd_open(bs) < 0)
         return -EIO;
+#if defined(CONFIG_BLKZONED)
+    if (type & QEMU_AIO_WRITE && bs->wps) {
+        qemu_co_mutex_lock(&bs->wps->colock);
+    }
+#endif

     /*
      * When using O_DIRECT, the request must be aligned to be able to use
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
 #ifdef CONFIG_LINUX_IO_URING
     } else if (s->use_linux_io_uring) {
         assert(qiov->size == bytes);
-        return luring_co_submit(bs, s->fd, offset, qiov, type);
+        ret = luring_co_submit(bs, s->fd, offset, qiov, type);
+        goto out;
 #endif
 #ifdef CONFIG_LINUX_AIO
     } else if (s->use_linux_aio) {
         assert(qiov->size == bytes);
-        return laio_co_submit(s->fd, offset, qiov, type, s->aio_max_batch);
+        ret = laio_co_submit(s->fd, offset, qiov, type,
+                             s->aio_max_batch);
+        goto out;
 #endif
     }

@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
     };

     assert(qiov->size == bytes);
-    return raw_thread_pool_submit(handle_aiocb_rw, &acb);
+    ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
+    goto out; /* Avoid the compiler err of unused label */
+
+out:
+#if defined(CONFIG_BLKZONED)
+{
+    BlockZoneWps *wps = bs->wps;
+    if (ret == 0) {
+        if (type & QEMU_AIO_WRITE && wps && bs->bl.zone_size) {
+            uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
+            if (!BDRV_ZT_IS_CONV(*wp)) {
+                /* Advance the wp if needed */
+                if (offset + bytes > *wp) {
+                    *wp = offset + bytes;
+                }
+            }
+        }
+    } else {
+        if (type & QEMU_AIO_WRITE) {
+            update_zones_wp(bs, s->fd, 0, 1);
+        }
+    }
+
+    if (type & QEMU_AIO_WRITE && wps) {
+        qemu_co_mutex_unlock(&wps->colock);
+    }
+}
+#endif
+    return ret;
 }

 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
@@ -XXX,XX +XXX,XX @@ static void raw_close(BlockDriverState *bs)
     BDRVRawState *s = bs->opaque;

     if (s->fd >= 0) {
+#if defined(CONFIG_BLKZONED)
+        g_free(bs->wps);
+#endif
         qemu_close(s->fd);
         s->fd = -1;
     }
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
     const char *op_name;
     unsigned long zo;
     int ret;
+    BlockZoneWps *wps = bs->wps;
     int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;

     zone_size = bs->bl.zone_size;
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
         return -EINVAL;
     }

+    uint32_t i = offset / bs->bl.zone_size;
+    uint32_t nrz = len / bs->bl.zone_size;
+    uint64_t *wp = &wps->wp[i];
+    if (BDRV_ZT_IS_CONV(*wp) && len != capacity) {
+        error_report("zone mgmt operations are not allowed for conventional zones");
+        return -EIO;
+    }
+
     switch (op) {
     case BLK_ZO_OPEN:
         op_name = "BLKOPENZONE";
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
                         len >> BDRV_SECTOR_BITS);
     ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
     if (ret != 0) {
+        update_zones_wp(bs, s->fd, offset, i);
322
error_report("ioctl %s failed %d", op_name, ret);
323
+ return ret;
324
+ }
325
+
326
+ if (zo == BLKRESETZONE && len == capacity) {
327
+ ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 1);
328
+ if (ret < 0) {
329
+ error_report("reporting single wp failed");
330
+ return ret;
331
+ }
332
+ } else if (zo == BLKRESETZONE) {
333
+ for (unsigned int j = 0; j < nrz; ++j) {
334
+ wp[j] = offset + j * zone_size;
335
+ }
336
+ } else if (zo == BLKFINISHZONE) {
337
+ for (unsigned int j = 0; j < nrz; ++j) {
338
+ /* The zoned device allows the last zone to be smaller than the
339
+ * zone size. */
340
+ wp[j] = MIN(offset + (j + 1) * zone_size, offset + len);
341
+ }
342
}
343
344
return ret;
205
--
345
--
206
2.9.3
346
2.40.1
207
208
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
A zone append command is a write operation that specifies the first
4
logical block of a zone as the write position. When writing to a zoned
5
block device using zone append, the byte offset of the call may point at
6
any position within the zone to which the data is being appended. Upon
7
completion the device will respond with the position where the data has
8
been written in the zone.
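
A minimal sketch of these semantics (a toy model with a hypothetical zone struct, not the QEMU implementation):

#include <stdint.h>
#include <string.h>

struct zone {
    uint64_t start;            /* first byte of the zone */
    uint64_t wp;               /* current write pointer */
    uint8_t  data[1 << 20];    /* zone payload */
};

/* Append len bytes: the device, not the caller, picks the write position. */
static uint64_t zone_append(struct zone *z, const void *buf, uint64_t len)
{
    uint64_t pos = z->wp;                        /* data lands at the wp */
    memcpy(&z->data[pos - z->start], buf, len);
    z->wp += len;                                /* wp advances past it */
    return pos;                                  /* reported on completion */
}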
9
10
Signed-off-by: Sam Li <faithilikerun@gmail.com>
11
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
3
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
4
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
13
Message-id: 20230508051510.177850-3-faithilikerun@gmail.com
5
Reviewed-by: Fam Zheng <famz@redhat.com>
6
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
7
Message-id: 20170213135235.12274-15-pbonzini@redhat.com
8
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
14
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
9
---
15
---
10
block/archipelago.c | 3 +++
16
include/block/block-io.h | 4 ++
11
block/blkreplay.c | 2 +-
17
include/block/block_int-common.h | 3 ++
12
block/block-backend.c | 6 ++++++
18
include/block/raw-aio.h | 4 +-
13
block/curl.c | 26 ++++++++++++++++++--------
19
include/sysemu/block-backend-io.h | 9 +++++
14
block/gluster.c | 9 +--------
20
block/block-backend.c | 61 +++++++++++++++++++++++++++++++
15
block/io.c | 6 +++++-
21
block/file-posix.c | 58 +++++++++++++++++++++++++----
16
block/iscsi.c | 6 +++++-
22
block/io.c | 27 ++++++++++++++
17
block/linux-aio.c | 15 +++++++++------
23
block/io_uring.c | 4 ++
18
block/nfs.c | 3 ++-
24
block/linux-aio.c | 3 ++
19
block/null.c | 4 ++++
25
block/raw-format.c | 8 ++++
20
block/qed.c | 3 +++
26
10 files changed, 173 insertions(+), 8 deletions(-)
21
block/rbd.c | 4 ++++
22
dma-helpers.c | 2 ++
23
hw/block/virtio-blk.c | 2 ++
24
hw/scsi/scsi-bus.c | 2 ++
25
util/async.c | 4 ++--
26
util/thread-pool.c | 2 ++
27
17 files changed, 71 insertions(+), 28 deletions(-)
28
27
29
diff --git a/block/archipelago.c b/block/archipelago.c
28
diff --git a/include/block/block-io.h b/include/block/block-io.h
30
index XXXXXXX..XXXXXXX 100644
29
index XXXXXXX..XXXXXXX 100644
31
--- a/block/archipelago.c
30
--- a/include/block/block-io.h
32
+++ b/block/archipelago.c
31
+++ b/include/block/block-io.h
33
@@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque)
32
@@ -XXX,XX +XXX,XX @@ int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_report(BlockDriverState *bs,
34
{
33
int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_mgmt(BlockDriverState *bs,
35
AIORequestData *reqdata = (AIORequestData *) opaque;
34
BlockZoneOp op,
36
ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
35
int64_t offset, int64_t len);
37
+ AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs);
36
+int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_append(BlockDriverState *bs,
38
37
+ int64_t *offset,
39
+ aio_context_acquire(ctx);
38
+ QEMUIOVector *qiov,
40
aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
39
+ BdrvRequestFlags flags);
41
+ aio_context_release(ctx);
40
42
aio_cb->status = 0;
41
bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
43
42
int bdrv_block_status(BlockDriverState *bs, int64_t offset,
44
qemu_aio_unref(aio_cb);
43
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
45
diff --git a/block/blkreplay.c b/block/blkreplay.c
44
index XXXXXXX..XXXXXXX 100644
46
index XXXXXXX..XXXXXXX 100755
45
--- a/include/block/block_int-common.h
47
--- a/block/blkreplay.c
46
+++ b/include/block/block_int-common.h
48
+++ b/block/blkreplay.c
47
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
49
@@ -XXX,XX +XXX,XX @@ static int64_t blkreplay_getlength(BlockDriverState *bs)
48
BlockZoneDescriptor *zones);
50
static void blkreplay_bh_cb(void *opaque)
49
int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
51
{
50
int64_t offset, int64_t len);
52
Request *req = opaque;
51
+ int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
53
- qemu_coroutine_enter(req->co);
52
+ int64_t *offset, QEMUIOVector *qiov,
54
+ aio_co_wake(req->co);
53
+ BdrvRequestFlags flags);
55
qemu_bh_delete(req->bh);
54
56
g_free(req);
55
/* removable device specific */
57
}
56
bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_is_inserted)(
57
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
58
index XXXXXXX..XXXXXXX 100644
59
--- a/include/block/raw-aio.h
60
+++ b/include/block/raw-aio.h
61
@@ -XXX,XX +XXX,XX @@
62
#define QEMU_AIO_TRUNCATE 0x0080
63
#define QEMU_AIO_ZONE_REPORT 0x0100
64
#define QEMU_AIO_ZONE_MGMT 0x0200
65
+#define QEMU_AIO_ZONE_APPEND 0x0400
66
#define QEMU_AIO_TYPE_MASK \
67
(QEMU_AIO_READ | \
68
QEMU_AIO_WRITE | \
69
@@ -XXX,XX +XXX,XX @@
70
QEMU_AIO_COPY_RANGE | \
71
QEMU_AIO_TRUNCATE | \
72
QEMU_AIO_ZONE_REPORT | \
73
- QEMU_AIO_ZONE_MGMT)
74
+ QEMU_AIO_ZONE_MGMT | \
75
+ QEMU_AIO_ZONE_APPEND)
76
77
/* AIO flags */
78
#define QEMU_AIO_MISALIGNED 0x1000
79
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
80
index XXXXXXX..XXXXXXX 100644
81
--- a/include/sysemu/block-backend-io.h
82
+++ b/include/sysemu/block-backend-io.h
83
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
84
BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
85
int64_t offset, int64_t len,
86
BlockCompletionFunc *cb, void *opaque);
87
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
88
+ QEMUIOVector *qiov, BdrvRequestFlags flags,
89
+ BlockCompletionFunc *cb, void *opaque);
90
BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
91
BlockCompletionFunc *cb, void *opaque);
92
void blk_aio_cancel_async(BlockAIOCB *acb);
93
@@ -XXX,XX +XXX,XX @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
94
int64_t offset, int64_t len);
95
int co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
96
int64_t offset, int64_t len);
97
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
98
+ QEMUIOVector *qiov,
99
+ BdrvRequestFlags flags);
100
+int co_wrapper_mixed blk_zone_append(BlockBackend *blk, int64_t *offset,
101
+ QEMUIOVector *qiov,
102
+ BdrvRequestFlags flags);
103
104
int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset,
105
int64_t bytes);
58
diff --git a/block/block-backend.c b/block/block-backend.c
106
diff --git a/block/block-backend.c b/block/block-backend.c
59
index XXXXXXX..XXXXXXX 100644
107
index XXXXXXX..XXXXXXX 100644
60
--- a/block/block-backend.c
108
--- a/block/block-backend.c
61
+++ b/block/block-backend.c
109
+++ b/block/block-backend.c
62
@@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
110
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
63
static void error_callback_bh(void *opaque)
111
return &acb->common;
112
}
113
114
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
115
+{
116
+ BlkAioEmAIOCB *acb = opaque;
117
+ BlkRwCo *rwco = &acb->rwco;
118
+
119
+ rwco->ret = blk_co_zone_append(rwco->blk, (int64_t *)(uintptr_t)acb->bytes,
120
+ rwco->iobuf, rwco->flags);
121
+ blk_aio_complete(acb);
122
+}
123
+
124
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
125
+ QEMUIOVector *qiov, BdrvRequestFlags flags,
126
+ BlockCompletionFunc *cb, void *opaque) {
127
+ BlkAioEmAIOCB *acb;
128
+ Coroutine *co;
129
+ IO_CODE();
130
+
131
+ blk_inc_in_flight(blk);
132
+ acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
133
+ acb->rwco = (BlkRwCo) {
134
+ .blk = blk,
135
+ .ret = NOT_DONE,
136
+ .flags = flags,
137
+ .iobuf = qiov,
138
+ };
139
+ acb->bytes = (int64_t)(uintptr_t)offset;
140
+ acb->has_returned = false;
141
+
142
+ co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
143
+ aio_co_enter(blk_get_aio_context(blk), co);
144
+ acb->has_returned = true;
145
+ if (acb->rwco.ret != NOT_DONE) {
146
+ replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
147
+ blk_aio_complete_bh, acb);
148
+ }
149
+
150
+ return &acb->common;
151
+}
152
+
153
/*
154
* Send a zone_report command.
155
* offset is a byte offset from the start of the device. No alignment
156
@@ -XXX,XX +XXX,XX @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
157
return ret;
158
}
159
160
+/*
161
+ * Send a zone_append command.
162
+ */
163
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
164
+ QEMUIOVector *qiov, BdrvRequestFlags flags)
165
+{
166
+ int ret;
167
+ IO_CODE();
168
+
169
+ blk_inc_in_flight(blk);
170
+ blk_wait_while_drained(blk);
171
+ GRAPH_RDLOCK_GUARD();
172
+ if (!blk_is_available(blk)) {
173
+ blk_dec_in_flight(blk);
174
+ return -ENOMEDIUM;
175
+ }
176
+
177
+ ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
178
+ blk_dec_in_flight(blk);
179
+ return ret;
180
+}
181
+
182
void blk_drain(BlockBackend *blk)
64
{
183
{
65
struct BlockBackendAIOCB *acb = opaque;
184
BlockDriverState *bs = blk_bs(blk);
66
+ AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
185
diff --git a/block/file-posix.c b/block/file-posix.c
67
186
index XXXXXXX..XXXXXXX 100644
68
bdrv_dec_in_flight(acb->common.bs);
187
--- a/block/file-posix.c
69
+ aio_context_acquire(ctx);
188
+++ b/block/file-posix.c
70
acb->common.cb(acb->common.opaque, acb->ret);
189
@@ -XXX,XX +XXX,XX @@ typedef struct BDRVRawState {
71
+ aio_context_release(ctx);
190
bool has_write_zeroes:1;
72
qemu_aio_unref(acb);
191
bool use_linux_aio:1;
73
}
192
bool use_linux_io_uring:1;
74
193
+ int64_t *offset; /* offset of zone append operation */
75
@@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
194
int page_cache_inconsistent; /* errno from fdatasync failure */
76
static void blk_aio_complete_bh(void *opaque)
195
bool has_fallocate;
196
bool needs_alignment;
197
@@ -XXX,XX +XXX,XX @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
198
ssize_t len;
199
200
len = RETRY_ON_EINTR(
201
- (aiocb->aio_type & QEMU_AIO_WRITE) ?
202
+ (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
203
qemu_pwritev(aiocb->aio_fildes,
204
aiocb->io.iov,
205
aiocb->io.niov,
206
@@ -XXX,XX +XXX,XX @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
207
ssize_t len;
208
209
while (offset < aiocb->aio_nbytes) {
210
- if (aiocb->aio_type & QEMU_AIO_WRITE) {
211
+ if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
212
len = pwrite(aiocb->aio_fildes,
213
(const char *)buf + offset,
214
aiocb->aio_nbytes - offset,
215
@@ -XXX,XX +XXX,XX @@ static int handle_aiocb_rw(void *opaque)
216
}
217
218
nbytes = handle_aiocb_rw_linear(aiocb, buf);
219
- if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
220
+ if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
221
char *p = buf;
222
size_t count = aiocb->aio_nbytes, copy;
223
int i;
224
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
225
if (fd_open(bs) < 0)
226
return -EIO;
227
#if defined(CONFIG_BLKZONED)
228
- if (type & QEMU_AIO_WRITE && bs->wps) {
229
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && bs->wps) {
230
qemu_co_mutex_lock(&bs->wps->colock);
231
+ if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) {
232
+ int index = offset / bs->bl.zone_size;
233
+ offset = bs->wps->wp[index];
234
+ }
235
}
236
#endif
237
238
@@ -XXX,XX +XXX,XX @@ out:
77
{
239
{
78
BlkAioEmAIOCB *acb = opaque;
240
BlockZoneWps *wps = bs->wps;
79
+ AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
241
if (ret == 0) {
80
242
- if (type & QEMU_AIO_WRITE && wps && bs->bl.zone_size) {
81
assert(acb->has_returned);
243
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
82
+ aio_context_acquire(ctx);
244
+ && wps && bs->bl.zone_size) {
83
blk_aio_complete(acb);
245
uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
84
+ aio_context_release(ctx);
246
if (!BDRV_ZT_IS_CONV(*wp)) {
85
}
247
+ if (type & QEMU_AIO_ZONE_APPEND) {
86
248
+ *s->offset = *wp;
87
static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
249
+ }
88
diff --git a/block/curl.c b/block/curl.c
250
/* Advance the wp if needed */
89
index XXXXXXX..XXXXXXX 100644
251
if (offset + bytes > *wp) {
90
--- a/block/curl.c
252
*wp = offset + bytes;
91
+++ b/block/curl.c
253
@@ -XXX,XX +XXX,XX @@ out:
92
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
254
}
93
{
255
}
94
CURLState *state;
256
} else {
95
int running;
257
- if (type & QEMU_AIO_WRITE) {
96
+ int ret = -EINPROGRESS;
258
+ if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
97
259
update_zones_wp(bs, s->fd, 0, 1);
98
CURLAIOCB *acb = p;
260
}
99
- BDRVCURLState *s = acb->common.bs->opaque;
100
+ BlockDriverState *bs = acb->common.bs;
101
+ BDRVCURLState *s = bs->opaque;
102
+ AioContext *ctx = bdrv_get_aio_context(bs);
103
104
size_t start = acb->sector_num * BDRV_SECTOR_SIZE;
105
size_t end;
106
107
+ aio_context_acquire(ctx);
108
+
109
// In case we have the requested data already (e.g. read-ahead),
110
// we can just call the callback and be done.
111
switch (curl_find_buf(s, start, acb->nb_sectors * BDRV_SECTOR_SIZE, acb)) {
112
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
113
qemu_aio_unref(acb);
114
// fall through
115
case FIND_RET_WAIT:
116
- return;
117
+ goto out;
118
default:
119
break;
120
}
261
}
121
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
262
122
// No cache found, so let's start a new request
263
- if (type & QEMU_AIO_WRITE && wps) {
123
state = curl_init_state(acb->common.bs, s);
264
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && wps) {
124
if (!state) {
265
qemu_co_mutex_unlock(&wps->colock);
125
- acb->common.cb(acb->common.opaque, -EIO);
126
- qemu_aio_unref(acb);
127
- return;
128
+ ret = -EIO;
129
+ goto out;
130
}
266
}
131
267
}
132
acb->start = 0;
268
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
133
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
269
}
134
state->orig_buf = g_try_malloc(state->buf_len);
270
#endif
135
if (state->buf_len && state->orig_buf == NULL) {
271
136
curl_clean_state(state);
272
+#if defined(CONFIG_BLKZONED)
137
- acb->common.cb(acb->common.opaque, -ENOMEM);
273
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
138
- qemu_aio_unref(acb);
274
+ int64_t *offset,
139
- return;
275
+ QEMUIOVector *qiov,
140
+ ret = -ENOMEM;
276
+ BdrvRequestFlags flags) {
141
+ goto out;
277
+ assert(flags == 0);
142
}
278
+ int64_t zone_size_mask = bs->bl.zone_size - 1;
143
state->acb[0] = acb;
279
+ int64_t iov_len = 0;
144
280
+ int64_t len = 0;
145
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
281
+ BDRVRawState *s = bs->opaque;
146
282
+ s->offset = offset;
147
/* Tell curl it needs to kick things off */
283
+
148
curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
284
+ if (*offset & zone_size_mask) {
149
+
285
+ error_report("sector offset %" PRId64 " is not aligned to zone size "
150
+out:
286
+ "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
151
+ if (ret != -EINPROGRESS) {
287
+ return -EINVAL;
152
+ acb->common.cb(acb->common.opaque, ret);
288
+ }
153
+ qemu_aio_unref(acb);
289
+
154
+ }
290
+ int64_t wg = bs->bl.write_granularity;
155
+ aio_context_release(ctx);
291
+ int64_t wg_mask = wg - 1;
156
}
292
+ for (int i = 0; i < qiov->niov; i++) {
157
293
+ iov_len = qiov->iov[i].iov_len;
158
static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
294
+ if (iov_len & wg_mask) {
159
diff --git a/block/gluster.c b/block/gluster.c
295
+ error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
160
index XXXXXXX..XXXXXXX 100644
296
+ "block size %" PRId64 "", i, iov_len, wg);
161
--- a/block/gluster.c
297
+ return -EINVAL;
162
+++ b/block/gluster.c
298
+ }
163
@@ -XXX,XX +XXX,XX @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
299
+ len += iov_len;
164
return qemu_gluster_glfs_init(gconf, errp);
300
+ }
165
}
301
+
166
302
+ return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND);
167
-static void qemu_gluster_complete_aio(void *opaque)
303
+}
168
-{
304
+#endif
169
- GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
305
+
170
-
306
static coroutine_fn int
171
- qemu_coroutine_enter(acb->coroutine);
307
raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
172
-}
308
bool blkdev)
173
-
309
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = {
174
/*
310
/* zone management operations */
175
* AIO callback routine called from GlusterFS thread.
311
.bdrv_co_zone_report = raw_co_zone_report,
176
*/
312
.bdrv_co_zone_mgmt = raw_co_zone_mgmt,
177
@@ -XXX,XX +XXX,XX @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
313
+ .bdrv_co_zone_append = raw_co_zone_append,
178
acb->ret = -EIO; /* Partial read/write - fail it */
314
#endif
179
}
315
};
180
316
181
- aio_bh_schedule_oneshot(acb->aio_context, qemu_gluster_complete_aio, acb);
182
+ aio_co_schedule(acb->aio_context, acb->coroutine);
183
}
184
185
static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
186
diff --git a/block/io.c b/block/io.c
317
diff --git a/block/io.c b/block/io.c
187
index XXXXXXX..XXXXXXX 100644
318
index XXXXXXX..XXXXXXX 100644
188
--- a/block/io.c
319
--- a/block/io.c
189
+++ b/block/io.c
320
+++ b/block/io.c
190
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
321
@@ -XXX,XX +XXX,XX @@ out:
191
bdrv_dec_in_flight(bs);
322
return co.ret;
192
bdrv_drained_begin(bs);
323
}
193
data->done = true;
324
194
- qemu_coroutine_enter(co);
325
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
195
+ aio_co_wake(co);
326
+ QEMUIOVector *qiov,
196
}
327
+ BdrvRequestFlags flags)
197
328
+{
198
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
329
+ int ret;
199
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
330
+ BlockDriver *drv = bs->drv;
200
static void bdrv_co_em_bh(void *opaque)
331
+ CoroutineIOCompletion co = {
332
+ .coroutine = qemu_coroutine_self(),
333
+ };
334
+ IO_CODE();
335
+
336
+ ret = bdrv_check_qiov_request(*offset, qiov->size, qiov, 0, NULL);
337
+ if (ret < 0) {
338
+ return ret;
339
+ }
340
+
341
+ bdrv_inc_in_flight(bs);
342
+ if (!drv || !drv->bdrv_co_zone_append || bs->bl.zoned == BLK_Z_NONE) {
343
+ co.ret = -ENOTSUP;
344
+ goto out;
345
+ }
346
+ co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
347
+out:
348
+ bdrv_dec_in_flight(bs);
349
+ return co.ret;
350
+}
351
+
352
void *qemu_blockalign(BlockDriverState *bs, size_t size)
201
{
353
{
202
BlockAIOCBCoroutine *acb = opaque;
354
IO_CODE();
203
+ BlockDriverState *bs = acb->common.bs;
355
diff --git a/block/io_uring.c b/block/io_uring.c
204
+ AioContext *ctx = bdrv_get_aio_context(bs);
356
index XXXXXXX..XXXXXXX 100644
205
357
--- a/block/io_uring.c
206
assert(!acb->need_bh);
358
+++ b/block/io_uring.c
207
+ aio_context_acquire(ctx);
359
@@ -XXX,XX +XXX,XX @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
208
bdrv_co_complete(acb);
360
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
209
+ aio_context_release(ctx);
361
luringcb->qiov->niov, offset);
210
}
362
break;
211
363
+ case QEMU_AIO_ZONE_APPEND:
212
static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
364
+ io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
213
diff --git a/block/iscsi.c b/block/iscsi.c
365
+ luringcb->qiov->niov, offset);
214
index XXXXXXX..XXXXXXX 100644
366
+ break;
215
--- a/block/iscsi.c
367
case QEMU_AIO_READ:
216
+++ b/block/iscsi.c
368
io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
217
@@ -XXX,XX +XXX,XX @@ static void
369
luringcb->qiov->niov, offset);
218
iscsi_bh_cb(void *p)
219
{
220
IscsiAIOCB *acb = p;
221
+ AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
222
223
qemu_bh_delete(acb->bh);
224
225
g_free(acb->buf);
226
acb->buf = NULL;
227
228
+ aio_context_acquire(ctx);
229
acb->common.cb(acb->common.opaque, acb->status);
230
+ aio_context_release(ctx);
231
232
if (acb->task != NULL) {
233
scsi_free_scsi_task(acb->task);
234
@@ -XXX,XX +XXX,XX @@ iscsi_schedule_bh(IscsiAIOCB *acb)
235
static void iscsi_co_generic_bh_cb(void *opaque)
236
{
237
struct IscsiTask *iTask = opaque;
238
+
239
iTask->complete = 1;
240
- qemu_coroutine_enter(iTask->co);
241
+ aio_co_wake(iTask->co);
242
}
243
244
static void iscsi_retry_timer_expired(void *opaque)
245
diff --git a/block/linux-aio.c b/block/linux-aio.c
370
diff --git a/block/linux-aio.c b/block/linux-aio.c
246
index XXXXXXX..XXXXXXX 100644
371
index XXXXXXX..XXXXXXX 100644
247
--- a/block/linux-aio.c
372
--- a/block/linux-aio.c
248
+++ b/block/linux-aio.c
373
+++ b/block/linux-aio.c
249
@@ -XXX,XX +XXX,XX @@ struct LinuxAioState {
374
@@ -XXX,XX +XXX,XX @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
250
io_context_t ctx;
375
case QEMU_AIO_WRITE:
251
EventNotifier e;
376
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
252
377
break;
253
- /* io queue for submit at batch */
378
+ case QEMU_AIO_ZONE_APPEND:
254
+ /* io queue for submit at batch. Protected by AioContext lock. */
379
+ io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
255
LaioQueue io_q;
380
+ break;
256
381
case QEMU_AIO_READ:
257
- /* I/O completion processing */
382
io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
258
+ /* I/O completion processing. Only runs in I/O thread. */
383
break;
259
QEMUBH *completion_bh;
384
diff --git a/block/raw-format.c b/block/raw-format.c
260
int event_idx;
385
index XXXXXXX..XXXXXXX 100644
261
int event_max;
386
--- a/block/raw-format.c
262
@@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev)
387
+++ b/block/raw-format.c
263
*/
388
@@ -XXX,XX +XXX,XX @@ raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
264
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
389
return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
390
}
391
392
+static int coroutine_fn GRAPH_RDLOCK
393
+raw_co_zone_append(BlockDriverState *bs, int64_t *offset, QEMUIOVector *qiov,
394
+ BdrvRequestFlags flags)
395
+{
396
+ return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
397
+}
398
+
399
static int64_t coroutine_fn GRAPH_RDLOCK
400
raw_co_getlength(BlockDriverState *bs)
265
{
401
{
266
+ LinuxAioState *s = laiocb->ctx;
402
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_raw = {
267
int ret;
403
.bdrv_co_pdiscard = &raw_co_pdiscard,
268
404
.bdrv_co_zone_report = &raw_co_zone_report,
269
ret = laiocb->ret;
405
.bdrv_co_zone_mgmt = &raw_co_zone_mgmt,
270
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
406
+ .bdrv_co_zone_append = &raw_co_zone_append,
271
}
407
.bdrv_co_block_status = &raw_co_block_status,
272
408
.bdrv_co_copy_range_from = &raw_co_copy_range_from,
273
laiocb->ret = ret;
409
.bdrv_co_copy_range_to = &raw_co_copy_range_to,
274
+ aio_context_acquire(s->aio_context);
275
if (laiocb->co) {
276
/* If the coroutine is already entered it must be in ioq_submit() and
277
* will notice laio->ret has been filled in when it eventually runs
278
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
279
laiocb->common.cb(laiocb->common.opaque, ret);
280
qemu_aio_unref(laiocb);
281
}
282
+ aio_context_release(s->aio_context);
283
}
284
285
/**
286
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completions(LinuxAioState *s)
287
static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
288
{
289
qemu_laio_process_completions(s);
290
+
291
+ aio_context_acquire(s->aio_context);
292
if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
293
ioq_submit(s);
294
}
295
+ aio_context_release(s->aio_context);
296
}
297
298
static void qemu_laio_completion_bh(void *opaque)
299
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e)
300
LinuxAioState *s = container_of(e, LinuxAioState, e);
301
302
if (event_notifier_test_and_clear(&s->e)) {
303
- aio_context_acquire(s->aio_context);
304
qemu_laio_process_completions_and_submit(s);
305
- aio_context_release(s->aio_context);
306
}
307
}
308
309
@@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque)
310
return false;
311
}
312
313
- aio_context_acquire(s->aio_context);
314
qemu_laio_process_completions_and_submit(s);
315
- aio_context_release(s->aio_context);
316
return true;
317
}
318
319
@@ -XXX,XX +XXX,XX @@ void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
320
{
321
aio_set_event_notifier(old_context, &s->e, false, NULL, NULL);
322
qemu_bh_delete(s->completion_bh);
323
+ s->aio_context = NULL;
324
}
325
326
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
327
diff --git a/block/nfs.c b/block/nfs.c
328
index XXXXXXX..XXXXXXX 100644
329
--- a/block/nfs.c
330
+++ b/block/nfs.c
331
@@ -XXX,XX +XXX,XX @@ static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
332
static void nfs_co_generic_bh_cb(void *opaque)
333
{
334
NFSRPC *task = opaque;
335
+
336
task->complete = 1;
337
- qemu_coroutine_enter(task->co);
338
+ aio_co_wake(task->co);
339
}
340
341
static void
342
diff --git a/block/null.c b/block/null.c
343
index XXXXXXX..XXXXXXX 100644
344
--- a/block/null.c
345
+++ b/block/null.c
346
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = {
347
static void null_bh_cb(void *opaque)
348
{
349
NullAIOCB *acb = opaque;
350
+ AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
351
+
352
+ aio_context_acquire(ctx);
353
acb->common.cb(acb->common.opaque, 0);
354
+ aio_context_release(ctx);
355
qemu_aio_unref(acb);
356
}
357
358
diff --git a/block/qed.c b/block/qed.c
359
index XXXXXXX..XXXXXXX 100644
360
--- a/block/qed.c
361
+++ b/block/qed.c
362
@@ -XXX,XX +XXX,XX @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
363
static void qed_aio_complete_bh(void *opaque)
364
{
365
QEDAIOCB *acb = opaque;
366
+ BDRVQEDState *s = acb_to_s(acb);
367
BlockCompletionFunc *cb = acb->common.cb;
368
void *user_opaque = acb->common.opaque;
369
int ret = acb->bh_ret;
370
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete_bh(void *opaque)
371
qemu_aio_unref(acb);
372
373
/* Invoke callback */
374
+ qed_acquire(s);
375
cb(user_opaque, ret);
376
+ qed_release(s);
377
}
378
379
static void qed_aio_complete(QEDAIOCB *acb, int ret)
380
diff --git a/block/rbd.c b/block/rbd.c
381
index XXXXXXX..XXXXXXX 100644
382
--- a/block/rbd.c
383
+++ b/block/rbd.c
384
@@ -XXX,XX +XXX,XX @@ shutdown:
385
static void qemu_rbd_complete_aio(RADOSCB *rcb)
386
{
387
RBDAIOCB *acb = rcb->acb;
388
+ AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
389
int64_t r;
390
391
r = rcb->ret;
392
@@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
393
qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
394
}
395
qemu_vfree(acb->bounce);
396
+
397
+ aio_context_acquire(ctx);
398
acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
399
+ aio_context_release(ctx);
400
401
qemu_aio_unref(acb);
402
}
403
diff --git a/dma-helpers.c b/dma-helpers.c
404
index XXXXXXX..XXXXXXX 100644
405
--- a/dma-helpers.c
406
+++ b/dma-helpers.c
407
@@ -XXX,XX +XXX,XX @@ static void dma_blk_cb(void *opaque, int ret)
408
QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align));
409
}
410
411
+ aio_context_acquire(dbs->ctx);
412
dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
413
dma_blk_cb, dbs, dbs->io_func_opaque);
414
+ aio_context_release(dbs->ctx);
415
assert(dbs->acb);
416
}
417
418
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
419
index XXXXXXX..XXXXXXX 100644
420
--- a/hw/block/virtio-blk.c
421
+++ b/hw/block/virtio-blk.c
422
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque)
423
424
s->rq = NULL;
425
426
+ aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
427
while (req) {
428
VirtIOBlockReq *next = req->next;
429
if (virtio_blk_handle_request(req, &mrb)) {
430
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque)
431
if (mrb.num_reqs) {
432
virtio_blk_submit_multireq(s->blk, &mrb);
433
}
434
+ aio_context_release(blk_get_aio_context(s->conf.conf.blk));
435
}
436
437
static void virtio_blk_dma_restart_cb(void *opaque, int running,
438
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
439
index XXXXXXX..XXXXXXX 100644
440
--- a/hw/scsi/scsi-bus.c
441
+++ b/hw/scsi/scsi-bus.c
442
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque)
443
qemu_bh_delete(s->bh);
444
s->bh = NULL;
445
446
+ aio_context_acquire(blk_get_aio_context(s->conf.blk));
447
QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
448
scsi_req_ref(req);
449
if (req->retry) {
450
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque)
451
}
452
scsi_req_unref(req);
453
}
454
+ aio_context_release(blk_get_aio_context(s->conf.blk));
455
}
456
457
void scsi_req_retry(SCSIRequest *req)
458
diff --git a/util/async.c b/util/async.c
459
index XXXXXXX..XXXXXXX 100644
460
--- a/util/async.c
461
+++ b/util/async.c
462
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
463
ret = 1;
464
}
465
bh->idle = 0;
466
- aio_context_acquire(ctx);
467
aio_bh_call(bh);
468
- aio_context_release(ctx);
469
}
470
if (bh->deleted) {
471
deleted = true;
472
@@ -XXX,XX +XXX,XX @@ static void co_schedule_bh_cb(void *opaque)
473
Coroutine *co = QSLIST_FIRST(&straight);
474
QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
475
trace_aio_co_schedule_bh_cb(ctx, co);
476
+ aio_context_acquire(ctx);
477
qemu_coroutine_enter(co);
478
+ aio_context_release(ctx);
479
}
480
}
481
482
diff --git a/util/thread-pool.c b/util/thread-pool.c
483
index XXXXXXX..XXXXXXX 100644
484
--- a/util/thread-pool.c
485
+++ b/util/thread-pool.c
486
@@ -XXX,XX +XXX,XX @@ static void thread_pool_completion_bh(void *opaque)
487
ThreadPool *pool = opaque;
488
ThreadPoolElement *elem, *next;
489
490
+ aio_context_acquire(pool->ctx);
491
restart:
492
QLIST_FOREACH_SAFE(elem, &pool->head, all, next) {
493
if (elem->state != THREAD_DONE) {
494
@@ -XXX,XX +XXX,XX @@ restart:
495
qemu_aio_unref(elem);
496
}
497
}
498
+ aio_context_release(pool->ctx);
499
}
500
501
static void thread_pool_cancel(BlockAIOCB *acb)
502
--
410
--
503
2.9.3
411
2.40.1
504
505
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
Support separate coroutines for reading and writing, and place the
3
The patch tests zone append writes by reporting the zone wp after
4
read/write handlers on the AioContext that the QIOChannel is registered
4
the completion of the call. "zap -p" option can print the sector
5
with.
5
offset value after completion, which should be the start sector
6
where the append write begins.
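
Two illustrative sketches for the patches described above; the function names come from the hunks in this series, everything else is assumed.

First, the QIOChannel move protocol documented in the include/io/channel.h hunk below, condensed into its call sequence (assumes the coroutine co has yielded in qio_channel_yield(), and that old_ctx/new_ctx are the source and destination AioContexts):

aio_context_acquire(old_ctx);
qio_channel_detach_aio_context(ioc);      /* stop I/O handlers on old_ctx */
aio_context_release(old_ctx);

aio_context_acquire(new_ctx);
qio_channel_attach_aio_context(ioc, new_ctx);
aio_co_schedule(new_ctx, co);             /* resume the coroutine there */
aio_context_release(new_ctx);

Second, what "zap -p" reports, seen from a caller of the coroutine API (a sketch assuming coroutine context and that blk and qiov are already set up):

int64_t offset = 0;   /* may point anywhere inside the target zone */
int ret = blk_co_zone_append(blk, &offset, &qiov, 0);
if (ret == 0) {
    /* offset was rewritten to the position the device chose;
     * "zap -p" prints it as a 512-byte sector number */
    printf("append sector: 0x%" PRIx64 "\n", offset >> BDRV_SECTOR_BITS);
}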
6
7
7
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
8
Signed-off-by: Sam Li <faithilikerun@gmail.com>
8
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
10
Message-id: 20230508051510.177850-4-faithilikerun@gmail.com
10
Reviewed-by: Fam Zheng <famz@redhat.com>
11
Message-id: 20170213135235.12274-7-pbonzini@redhat.com
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
---
12
---
14
include/io/channel.h | 47 ++++++++++++++++++++++++++--
13
qemu-io-cmds.c | 75 ++++++++++++++++++++++++++++++
15
io/channel.c | 86 +++++++++++++++++++++++++++++++++++++++-------------
14
tests/qemu-iotests/tests/zoned | 16 +++++++
16
2 files changed, 109 insertions(+), 24 deletions(-)
15
tests/qemu-iotests/tests/zoned.out | 16 +++++++
16
3 files changed, 107 insertions(+)
17
17
18
diff --git a/include/io/channel.h b/include/io/channel.h
18
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
19
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
20
--- a/include/io/channel.h
20
--- a/qemu-io-cmds.c
21
+++ b/include/io/channel.h
21
+++ b/qemu-io-cmds.c
22
@@ -XXX,XX +XXX,XX @@
22
@@ -XXX,XX +XXX,XX @@ static const cmdinfo_t zone_reset_cmd = {
23
23
.oneline = "reset a zone write pointer in zone block device",
24
#include "qemu-common.h"
24
};
25
#include "qom/object.h"
25
26
+#include "qemu/coroutine.h"
26
+static int do_aio_zone_append(BlockBackend *blk, QEMUIOVector *qiov,
27
#include "block/aio.h"
27
+ int64_t *offset, int flags, int *total)
28
28
+{
29
#define TYPE_QIO_CHANNEL "qio-channel"
29
+ int async_ret = NOT_DONE;
30
@@ -XXX,XX +XXX,XX @@ struct QIOChannel {
31
Object parent;
32
unsigned int features; /* bitmask of QIOChannelFeatures */
33
char *name;
34
+ AioContext *ctx;
35
+ Coroutine *read_coroutine;
36
+ Coroutine *write_coroutine;
37
#ifdef _WIN32
38
HANDLE event; /* For use with GSource on Win32 */
39
#endif
40
@@ -XXX,XX +XXX,XX @@ guint qio_channel_add_watch(QIOChannel *ioc,
41
42
43
/**
44
+ * qio_channel_attach_aio_context:
45
+ * @ioc: the channel object
46
+ * @ctx: the #AioContext to set the handlers on
47
+ *
48
+ * Request that qio_channel_yield() sets I/O handlers on
49
+ * the given #AioContext. If @ctx is %NULL, qio_channel_yield()
50
+ * uses QEMU's main thread event loop.
51
+ *
52
+ * You can move a #QIOChannel from one #AioContext to another even if
53
+ * I/O handlers are set for a coroutine. However, #QIOChannel provides
54
+ * no synchronization between the calls to qio_channel_yield() and
55
+ * qio_channel_attach_aio_context().
56
+ *
57
+ * Therefore you should first call qio_channel_detach_aio_context()
58
+ * to ensure that the coroutine is not entered concurrently. Then,
59
+ * while the coroutine has yielded, call qio_channel_attach_aio_context(),
60
+ * and then aio_co_schedule() to place the coroutine on the new
61
+ * #AioContext. The calls to qio_channel_detach_aio_context()
62
+ * and qio_channel_attach_aio_context() should be protected with
63
+ * aio_context_acquire() and aio_context_release().
64
+ */
65
+void qio_channel_attach_aio_context(QIOChannel *ioc,
66
+ AioContext *ctx);
67
+
30
+
68
+/**
31
+ blk_aio_zone_append(blk, offset, qiov, flags, aio_rw_done, &async_ret);
69
+ * qio_channel_detach_aio_context:
32
+ while (async_ret == NOT_DONE) {
70
+ * @ioc: the channel object
33
+ main_loop_wait(false);
71
+ *
72
+ * Disable any I/O handlers set by qio_channel_yield(). With the
73
+ * help of aio_co_schedule(), this allows moving a coroutine that was
74
+ * paused by qio_channel_yield() to another context.
75
+ */
76
+void qio_channel_detach_aio_context(QIOChannel *ioc);
77
+
78
+/**
79
* qio_channel_yield:
80
* @ioc: the channel object
81
* @condition: the I/O condition to wait for
82
*
83
- * Yields execution from the current coroutine until
84
- * the condition indicated by @condition becomes
85
- * available.
86
+ * Yields execution from the current coroutine until the condition
87
+ * indicated by @condition becomes available. @condition must
88
+ * be either %G_IO_IN or %G_IO_OUT; it cannot contain both. In
89
+ * addition, no two coroutine can be waiting on the same condition
90
+ * and channel at the same time.
91
*
92
* This must only be called from coroutine context
93
*/
94
diff --git a/io/channel.c b/io/channel.c
95
index XXXXXXX..XXXXXXX 100644
96
--- a/io/channel.c
97
+++ b/io/channel.c
98
@@ -XXX,XX +XXX,XX @@
99
#include "qemu/osdep.h"
100
#include "io/channel.h"
101
#include "qapi/error.h"
102
-#include "qemu/coroutine.h"
103
+#include "qemu/main-loop.h"
104
105
bool qio_channel_has_feature(QIOChannel *ioc,
106
QIOChannelFeature feature)
107
@@ -XXX,XX +XXX,XX @@ off_t qio_channel_io_seek(QIOChannel *ioc,
108
}
109
110
111
-typedef struct QIOChannelYieldData QIOChannelYieldData;
112
-struct QIOChannelYieldData {
113
- QIOChannel *ioc;
114
- Coroutine *co;
115
-};
116
+static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc);
117
118
+static void qio_channel_restart_read(void *opaque)
119
+{
120
+ QIOChannel *ioc = opaque;
121
+ Coroutine *co = ioc->read_coroutine;
122
+
123
+ ioc->read_coroutine = NULL;
124
+ qio_channel_set_aio_fd_handlers(ioc);
125
+ aio_co_wake(co);
126
+}
127
128
-static gboolean qio_channel_yield_enter(QIOChannel *ioc,
129
- GIOCondition condition,
130
- gpointer opaque)
131
+static void qio_channel_restart_write(void *opaque)
132
{
133
- QIOChannelYieldData *data = opaque;
134
- qemu_coroutine_enter(data->co);
135
- return FALSE;
136
+ QIOChannel *ioc = opaque;
137
+ Coroutine *co = ioc->write_coroutine;
138
+
139
+ ioc->write_coroutine = NULL;
140
+ qio_channel_set_aio_fd_handlers(ioc);
141
+ aio_co_wake(co);
142
}
143
144
+static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc)
145
+{
146
+ IOHandler *rd_handler = NULL, *wr_handler = NULL;
147
+ AioContext *ctx;
148
+
149
+ if (ioc->read_coroutine) {
150
+ rd_handler = qio_channel_restart_read;
151
+ }
152
+ if (ioc->write_coroutine) {
153
+ wr_handler = qio_channel_restart_write;
154
+ }
34
+ }
155
+
35
+
156
+ ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
36
+ *total = qiov->size;
157
+ qio_channel_set_aio_fd_handler(ioc, ctx, rd_handler, wr_handler, ioc);
37
+ return async_ret < 0 ? async_ret : 1;
158
+}
38
+}
159
+
39
+
160
+void qio_channel_attach_aio_context(QIOChannel *ioc,
40
+static int zone_append_f(BlockBackend *blk, int argc, char **argv)
161
+ AioContext *ctx)
162
+{
41
+{
163
+ AioContext *old_ctx;
42
+ int ret;
164
+ if (ioc->ctx == ctx) {
43
+ bool pflag = false;
165
+ return;
44
+ int flags = 0;
45
+ int total = 0;
46
+ int64_t offset;
47
+ char *buf;
48
+ int c, nr_iov;
49
+ int pattern = 0xcd;
50
+ QEMUIOVector qiov;
51
+
52
+ if (optind > argc - 3) {
53
+ return -EINVAL;
166
+ }
54
+ }
167
+
55
+
168
+ old_ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
56
+ if ((c = getopt(argc, argv, "p")) != -1) {
169
+ qio_channel_set_aio_fd_handler(ioc, old_ctx, NULL, NULL, NULL);
57
+ pflag = true;
170
+ ioc->ctx = ctx;
58
+ }
171
+ qio_channel_set_aio_fd_handlers(ioc);
59
+
60
+ offset = cvtnum(argv[optind]);
61
+ if (offset < 0) {
62
+ print_cvtnum_err(offset, argv[optind]);
63
+ return offset;
64
+ }
65
+ optind++;
66
+ nr_iov = argc - optind;
67
+ buf = create_iovec(blk, &qiov, &argv[optind], nr_iov, pattern,
68
+ flags & BDRV_REQ_REGISTERED_BUF);
69
+ if (buf == NULL) {
70
+ return -EINVAL;
71
+ }
72
+ ret = do_aio_zone_append(blk, &qiov, &offset, flags, &total);
73
+ if (ret < 0) {
74
+ printf("zone append failed: %s\n", strerror(-ret));
75
+ goto out;
76
+ }
77
+
78
+ if (pflag) {
79
+ printf("After zap done, the append sector is 0x%" PRIx64 "\n",
80
+ tosector(offset));
81
+ }
82
+
83
+out:
84
+ qemu_io_free(blk, buf, qiov.size,
85
+ flags & BDRV_REQ_REGISTERED_BUF);
86
+ qemu_iovec_destroy(&qiov);
87
+ return ret;
172
+}
88
+}
173
+
89
+
174
+void qio_channel_detach_aio_context(QIOChannel *ioc)
90
+static const cmdinfo_t zone_append_cmd = {
175
+{
91
+ .name = "zone_append",
176
+ ioc->read_coroutine = NULL;
92
+ .altname = "zap",
177
+ ioc->write_coroutine = NULL;
93
+ .cfunc = zone_append_f,
178
+ qio_channel_set_aio_fd_handlers(ioc);
94
+ .argmin = 3,
179
+ ioc->ctx = NULL;
95
+ .argmax = 4,
180
+}
96
+ .args = "offset len [len..]",
181
97
+ .oneline = "append write a number of bytes at a specified offset",
182
void coroutine_fn qio_channel_yield(QIOChannel *ioc,
98
+};
183
GIOCondition condition)
99
+
184
{
100
static int truncate_f(BlockBackend *blk, int argc, char **argv);
185
- QIOChannelYieldData data;
101
static const cmdinfo_t truncate_cmd = {
186
-
102
.name = "truncate",
187
assert(qemu_in_coroutine());
103
@@ -XXX,XX +XXX,XX @@ static void __attribute((constructor)) init_qemuio_commands(void)
188
- data.ioc = ioc;
104
qemuio_add_command(&zone_close_cmd);
189
- data.co = qemu_coroutine_self();
105
qemuio_add_command(&zone_finish_cmd);
190
- qio_channel_add_watch(ioc,
106
qemuio_add_command(&zone_reset_cmd);
191
- condition,
107
+ qemuio_add_command(&zone_append_cmd);
192
- qio_channel_yield_enter,
108
qemuio_add_command(&truncate_cmd);
193
- &data,
109
qemuio_add_command(&length_cmd);
194
- NULL);
110
qemuio_add_command(&info_cmd);
195
+ if (condition == G_IO_IN) {
111
diff --git a/tests/qemu-iotests/tests/zoned b/tests/qemu-iotests/tests/zoned
196
+ assert(!ioc->read_coroutine);
112
index XXXXXXX..XXXXXXX 100755
197
+ ioc->read_coroutine = qemu_coroutine_self();
113
--- a/tests/qemu-iotests/tests/zoned
198
+ } else if (condition == G_IO_OUT) {
114
+++ b/tests/qemu-iotests/tests/zoned
199
+ assert(!ioc->write_coroutine);
115
@@ -XXX,XX +XXX,XX @@ echo "(5) resetting the second zone"
200
+ ioc->write_coroutine = qemu_coroutine_self();
116
$QEMU_IO $IMG -c "zrs 268435456 268435456"
201
+ } else {
117
echo "After resetting a zone:"
202
+ abort();
118
$QEMU_IO $IMG -c "zrp 268435456 1"
203
+ }
119
+echo
204
+ qio_channel_set_aio_fd_handlers(ioc);
120
+echo
205
qemu_coroutine_yield();
121
+echo "(6) append write" # the physical block size of the device is 4096
206
}
122
+$QEMU_IO $IMG -c "zrp 0 1"
207
123
+$QEMU_IO $IMG -c "zap -p 0 0x1000 0x2000"
124
+echo "After appending the first zone firstly:"
125
+$QEMU_IO $IMG -c "zrp 0 1"
126
+$QEMU_IO $IMG -c "zap -p 0 0x1000 0x2000"
127
+echo "After appending the first zone secondly:"
128
+$QEMU_IO $IMG -c "zrp 0 1"
129
+$QEMU_IO $IMG -c "zap -p 268435456 0x1000 0x2000"
130
+echo "After appending the second zone firstly:"
131
+$QEMU_IO $IMG -c "zrp 268435456 1"
132
+$QEMU_IO $IMG -c "zap -p 268435456 0x1000 0x2000"
133
+echo "After appending the second zone secondly:"
134
+$QEMU_IO $IMG -c "zrp 268435456 1"
135
136
# success, all done
137
echo "*** done"
138
diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-iotests/tests/zoned.out
139
index XXXXXXX..XXXXXXX 100644
140
--- a/tests/qemu-iotests/tests/zoned.out
141
+++ b/tests/qemu-iotests/tests/zoned.out
142
@@ -XXX,XX +XXX,XX @@ start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:14, [type: 2]
143
(5) resetting the second zone
144
After resetting a zone:
145
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
146
+
147
+
148
+(6) append write
149
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
150
+After zap done, the append sector is 0x0
151
+After appending the first zone firstly:
152
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x18, zcond:2, [type: 2]
153
+After zap done, the append sector is 0x18
154
+After appending the first zone secondly:
155
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x30, zcond:2, [type: 2]
156
+After zap done, the append sector is 0x80000
157
+After appending the second zone firstly:
158
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80018, zcond:2, [type: 2]
159
+After zap done, the append sector is 0x80018
160
+After appending the second zone secondly:
161
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80030, zcond:2, [type: 2]
162
*** done
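
For reference, each zap request in this test carries two buffers of 0x1000 and 0x2000 bytes, i.e. 0x3000 bytes = 0x3000 / 512 = 0x18 sectors. That is why the write pointer advances 0x0 -> 0x18 -> 0x30 in the first zone and 0x80000 -> 0x80018 -> 0x80030 in the second, and why each reported append sector equals the zone's write pointer just before that append.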
208
--
163
--
209
2.9.3
164
2.40.1
210
211
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
As a small step towards the introduction of multiqueue, we want
4
coroutines to remain on the same AioContext that started them,
5
unless they are moved explicitly with e.g. aio_co_schedule. This patch
6
prevents coroutines from switching AioContext when they use a CoMutex.
7
For now it does not make much of a difference, because the CoMutex
8
is not thread-safe and the AioContext itself is used to protect the
9
CoMutex from concurrent access. However, this is going to change.
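
A minimal usage sketch of the behaviour this gives (assumes coroutine context and that lock was initialized with qemu_co_mutex_init(); not part of the patch):

static CoMutex lock;

static void coroutine_fn critical_section(void *opaque)
{
    /* If contended, this coroutine is queued and later woken with
     * aio_co_wake(), so it resumes on its own AioContext. */
    qemu_co_mutex_lock(&lock);
    /* ... critical section ... */
    qemu_co_mutex_unlock(&lock);   /* wakes the next waiter on *its* context */
}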
10
11
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
13
Reviewed-by: Fam Zheng <famz@redhat.com>
14
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
15
Message-id: 20170213135235.12274-9-pbonzini@redhat.com
16
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
17
---
18
util/qemu-coroutine-lock.c | 5 ++---
19
util/trace-events | 1 -
20
2 files changed, 2 insertions(+), 4 deletions(-)
21
22
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
23
index XXXXXXX..XXXXXXX 100644
24
--- a/util/qemu-coroutine-lock.c
25
+++ b/util/qemu-coroutine-lock.c
26
@@ -XXX,XX +XXX,XX @@
27
#include "qemu/coroutine.h"
28
#include "qemu/coroutine_int.h"
29
#include "qemu/queue.h"
30
+#include "block/aio.h"
31
#include "trace.h"
32
33
void qemu_co_queue_init(CoQueue *queue)
34
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_run_restart(Coroutine *co)
35
36
static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
37
{
38
- Coroutine *self = qemu_coroutine_self();
39
Coroutine *next;
40
41
if (QSIMPLEQ_EMPTY(&queue->entries)) {
42
@@ -XXX,XX +XXX,XX @@ static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
43
44
while ((next = QSIMPLEQ_FIRST(&queue->entries)) != NULL) {
45
QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
46
- QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, next, co_queue_next);
47
- trace_qemu_co_queue_next(next);
48
+ aio_co_wake(next);
49
if (single) {
50
break;
51
}
52
diff --git a/util/trace-events b/util/trace-events
53
index XXXXXXX..XXXXXXX 100644
54
--- a/util/trace-events
55
+++ b/util/trace-events
56
@@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p"
57
58
# util/qemu-coroutine-lock.c
59
qemu_co_queue_run_restart(void *co) "co %p"
60
-qemu_co_queue_next(void *nxt) "next %p"
61
qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
62
qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
63
qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
64
--
65
2.9.3
66
67
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
Keep the coroutine on the same AioContext. Without this change,
4
there would be a race between yielding the coroutine and reentering it.
5
While the race cannot happen now, because the code only runs from a single
6
AioContext, this will change with multiqueue support in the block layer.
7
8
While making this change, replace the custom bottom half with aio_co_schedule.
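
In code, the replacement pattern is just two lines (condensed from the hunk below):

aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
qemu_coroutine_yield();   /* resumes here when the scheduled entry runs */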
9
10
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
11
Reviewed-by: Fam Zheng <famz@redhat.com>
12
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
14
Message-id: 20170213135235.12274-10-pbonzini@redhat.com
15
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
16
---
17
block/blkdebug.c | 9 +--------
18
1 file changed, 1 insertion(+), 8 deletions(-)
19
20
diff --git a/block/blkdebug.c b/block/blkdebug.c
21
index XXXXXXX..XXXXXXX 100644
22
--- a/block/blkdebug.c
23
+++ b/block/blkdebug.c
24
@@ -XXX,XX +XXX,XX @@ out:
25
return ret;
26
}
27
28
-static void error_callback_bh(void *opaque)
29
-{
30
- Coroutine *co = opaque;
31
- qemu_coroutine_enter(co);
32
-}
33
-
34
static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
35
{
36
BDRVBlkdebugState *s = bs->opaque;
37
@@ -XXX,XX +XXX,XX @@ static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
38
}
39
40
if (!immediately) {
41
- aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), error_callback_bh,
42
- qemu_coroutine_self());
43
+ aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
44
qemu_coroutine_yield();
45
}
46
47
--
48
2.9.3
49
50
Deleted patch
1
From: Paolo Bonzini <pbonzini@redhat.com>
2
1
3
qed_aio_start_io and qed_aio_next_io will not have to acquire/release
4
the AioContext, while qed_aio_next_io_cb will. Split the functionality
5
and gain a little type-safety in the process.
6
7
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Reviewed-by: Fam Zheng <famz@redhat.com>
10
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
11
Message-id: 20170213135235.12274-11-pbonzini@redhat.com
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
---
14
block/qed.c | 39 +++++++++++++++++++++++++--------------
15
1 file changed, 25 insertions(+), 14 deletions(-)
16
17
diff --git a/block/qed.c b/block/qed.c
18
index XXXXXXX..XXXXXXX 100644
19
--- a/block/qed.c
20
+++ b/block/qed.c
21
@@ -XXX,XX +XXX,XX @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
22
return l2_table;
23
}
24
25
-static void qed_aio_next_io(void *opaque, int ret);
26
+static void qed_aio_next_io(QEDAIOCB *acb, int ret);
27
+
28
+static void qed_aio_start_io(QEDAIOCB *acb)
29
+{
30
+ qed_aio_next_io(acb, 0);
31
+}
32
+
33
+static void qed_aio_next_io_cb(void *opaque, int ret)
34
+{
35
+ QEDAIOCB *acb = opaque;
36
+
37
+ qed_aio_next_io(acb, ret);
38
+}
39
40
static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
41
{
42
@@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
43
44
acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
45
if (acb) {
46
- qed_aio_next_io(acb, 0);
47
+ qed_aio_start_io(acb);
48
}
49
}
50
51
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
52
QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
53
acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
54
if (acb) {
55
- qed_aio_next_io(acb, 0);
56
+ qed_aio_start_io(acb);
57
} else if (s->header.features & QED_F_NEED_CHECK) {
58
qed_start_need_check_timer(s);
59
}
60
@@ -XXX,XX +XXX,XX @@ static void qed_commit_l2_update(void *opaque, int ret)
61
acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
62
assert(acb->request.l2_table != NULL);
63
64
- qed_aio_next_io(opaque, ret);
65
+ qed_aio_next_io(acb, ret);
66
}
67
68
/**
69
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
70
if (need_alloc) {
71
/* Write out the whole new L2 table */
72
qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
73
- qed_aio_write_l1_update, acb);
74
+ qed_aio_write_l1_update, acb);
75
} else {
76
/* Write out only the updated part of the L2 table */
77
qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
78
- qed_aio_next_io, acb);
79
+ qed_aio_next_io_cb, acb);
80
}
81
return;
82
83
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
84
}
85
86
if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
87
- next_fn = qed_aio_next_io;
88
+ next_fn = qed_aio_next_io_cb;
89
} else {
90
if (s->bs->backing) {
91
next_fn = qed_aio_write_flush_before_l2_update;
92
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
93
if (acb->flags & QED_AIOCB_ZERO) {
94
/* Skip ahead if the clusters are already zero */
95
if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
96
- qed_aio_next_io(acb, 0);
97
+ qed_aio_start_io(acb);
98
return;
99
}
100
101
@@ -XXX,XX +XXX,XX @@ static void qed_aio_read_data(void *opaque, int ret,
102
/* Handle zero cluster and backing file reads */
103
if (ret == QED_CLUSTER_ZERO) {
104
qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
105
- qed_aio_next_io(acb, 0);
106
+ qed_aio_start_io(acb);
107
return;
108
} else if (ret != QED_CLUSTER_FOUND) {
109
qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
110
- &acb->backing_qiov, qed_aio_next_io, acb);
111
+ &acb->backing_qiov, qed_aio_next_io_cb, acb);
112
return;
113
}
114
115
BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
116
bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
117
&acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
118
- qed_aio_next_io, acb);
119
+ qed_aio_next_io_cb, acb);
120
return;
121
122
err:
123
@@ -XXX,XX +XXX,XX @@ err:
124
/**
125
* Begin next I/O or complete the request
126
*/
127
-static void qed_aio_next_io(void *opaque, int ret)
128
+static void qed_aio_next_io(QEDAIOCB *acb, int ret)
129
{
130
- QEDAIOCB *acb = opaque;
131
BDRVQEDState *s = acb_to_s(acb);
132
QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
133
qed_aio_write_data : qed_aio_read_data;
134
@@ -XXX,XX +XXX,XX @@ static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
135
qemu_iovec_init(&acb->cur_qiov, qiov->niov);
136
137
/* Start request */
138
- qed_aio_next_io(acb, 0);
139
+ qed_aio_start_io(acb);
140
return &acb->common;
141
}
142
143
--
144
2.9.3
145
146
From: Paolo Bonzini <pbonzini@redhat.com>

The AioContext data structures are now protected by list_lock and/or
they are walked with FOREACH_RCU primitives. There is no need anymore
to acquire the AioContext for the entire duration of aio_dispatch.
Instead, just acquire it before and after invoking the callbacks.
The next step is then to push it further down.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-12-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 util/aio-posix.c | 25 +++++++++++--------------
 util/aio-win32.c | 15 +++++++--------
 util/async.c     |  2 ++
 3 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
             (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
             aio_node_check(ctx, node->is_external) &&
             node->io_read) {
+            aio_context_acquire(ctx);
             node->io_read(node->opaque);
+            aio_context_release(ctx);

             /* aio_notify() does not count as progress */
             if (node->opaque != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
             (revents & (G_IO_OUT | G_IO_ERR)) &&
             aio_node_check(ctx, node->is_external) &&
             node->io_write) {
+            aio_context_acquire(ctx);
             node->io_write(node->opaque);
+            aio_context_release(ctx);
             progress = true;
         }

@@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
     }

     /* Run our timers */
+    aio_context_acquire(ctx);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
+    aio_context_release(ctx);

     return progress;
 }
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     int64_t timeout;
     int64_t start = 0;

-    aio_context_acquire(ctx);
-    progress = false;
-
     /* aio_notify can avoid the expensive event_notifier_set if
      * everything (file descriptors, bottom halves, timers) will
      * be re-evaluated before the next blocking poll(). This is
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     }

-    if (try_poll_mode(ctx, blocking)) {
-        progress = true;
-    } else {
+    aio_context_acquire(ctx);
+    progress = try_poll_mode(ctx, blocking);
+    aio_context_release(ctx);
+
+    if (!progress) {
         assert(npfd == 0);

         /* fill pollfds */
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         timeout = blocking ? aio_compute_timeout(ctx) : 0;

         /* wait until next event */
-        if (timeout) {
-            aio_context_release(ctx);
-        }
         if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
             AioHandler epoll_handler;

@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         } else {
             ret = qemu_poll_ns(pollfds, npfd, timeout);
         }
-        if (timeout) {
-            aio_context_acquire(ctx);
-        }
     }

     if (blocking) {
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         progress = true;
     }

-    aio_context_release(ctx);
-
     return progress;
 }

diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             (revents || event_notifier_get_handle(node->e) == event) &&
             node->io_notify) {
             node->pfd.revents = 0;
+            aio_context_acquire(ctx);
             node->io_notify(node->e);
+            aio_context_release(ctx);

             /* aio_notify() does not count as progress */
             if (node->e != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             (node->io_read || node->io_write)) {
             node->pfd.revents = 0;
             if ((revents & G_IO_IN) && node->io_read) {
+                aio_context_acquire(ctx);
                 node->io_read(node->opaque);
+                aio_context_release(ctx);
                 progress = true;
             }
             if ((revents & G_IO_OUT) && node->io_write) {
+                aio_context_acquire(ctx);
                 node->io_write(node->opaque);
+                aio_context_release(ctx);
                 progress = true;
             }

@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     int count;
     int timeout;

-    aio_context_acquire(ctx);
     progress = false;

     /* aio_notify can avoid the expensive event_notifier_set if
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)

         timeout = blocking && !have_select_revents
             ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
-        if (timeout) {
-            aio_context_release(ctx);
-        }
         ret = WaitForMultipleObjects(count, events, FALSE, timeout);
         if (blocking) {
             assert(first);
             atomic_sub(&ctx->notify_me, 2);
         }
-        if (timeout) {
-            aio_context_acquire(ctx);
-        }

         if (first) {
             aio_notify_accept(ctx);
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         progress |= aio_dispatch_handlers(ctx, event);
     } while (count > 0);

+    aio_context_acquire(ctx);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
-
     aio_context_release(ctx);
     return progress;
 }

diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
                 ret = 1;
             }
             bh->idle = 0;
+            aio_context_acquire(ctx);
             aio_bh_call(bh);
+            aio_context_release(ctx);
         }
         if (bh->deleted) {
             deleted = true;
--
2.9.3

From: Sam Li <faithilikerun@gmail.com>

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20230508051510.177850-5-faithilikerun@gmail.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/file-posix.c | 3 +++
 block/trace-events | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -XXX,XX +XXX,XX @@ out:
     if (!BDRV_ZT_IS_CONV(*wp)) {
         if (type & QEMU_AIO_ZONE_APPEND) {
             *s->offset = *wp;
+            trace_zbd_zone_append_complete(bs, *s->offset
+                >> BDRV_SECTOR_BITS);
         }
         /* Advance the wp if needed */
         if (offset + bytes > *wp) {
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
         len += iov_len;
     }

+    trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
     return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND);
 }
 #endif
diff --git a/block/trace-events b/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -XXX,XX +XXX,XX @@ file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d"
 file_flush_fdatasync_failed(int err) "errno %d"
 zbd_zone_report(void *bs, unsigned int nr_zones, int64_t sector) "bs %p report %d zones starting at sector offset 0x%" PRIx64 ""
 zbd_zone_mgmt(void *bs, const char *op_name, int64_t sector, int64_t len) "bs %p %s starts at sector offset 0x%" PRIx64 " over a range of 0x%" PRIx64 " sectors"
+zbd_zone_append(void *bs, int64_t sector) "bs %p append at sector offset 0x%" PRIx64 ""
+zbd_zone_append_complete(void *bs, int64_t sector) "bs %p returns append sector 0x%" PRIx64 ""

 # ssh.c
 sftp_error(const char *op, const char *ssh_err, int ssh_err_code, int sftp_err_code) "%s failed: %s (libssh error code: %d, sftp error code: %d)"
--
2.40.1
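The net effect of the aio-posix, aio-win32 and async hunks above is a single locking rule: the AioContext lock is no longer held across a whole dispatch pass, only around each individual callback. An illustrative sketch of the bracket pattern (not verbatim QEMU code, but using the same API and field names as the hunks):

    /* Sketch: each handler invocation is bracketed individually, so the
     * lock is free while poll() sleeps and between callbacks. */
    static void dispatch_read_handler(AioContext *ctx, AioHandler *node)
    {
        if (node->io_read) {
            aio_context_acquire(ctx);      /* protect callback-visible state */
            node->io_read(node->opaque);
            aio_context_release(ctx);      /* dropped again before the next node */
        }
    }

Keeping the critical section to exactly one callback is what lets the later patches in the series remove the acquire/release pairs from the dispatch loop entirely and push them into the callbacks themselves.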
From: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-13-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed.h                 |  3 +++
 block/curl.c                |  2 ++
 block/io.c                  |  5 +++++
 block/iscsi.c               |  8 ++++++--
 block/null.c                |  4 ++++
 block/qed.c                 | 12 ++++++++++++
 block/throttle-groups.c     |  2 ++
 util/aio-posix.c            |  2 --
 util/aio-win32.c            |  2 --
 util/qemu-coroutine-sleep.c |  2 +-
 10 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/block/qed.h b/block/qed.h
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -XXX,XX +XXX,XX @@ enum {
 */
 typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len);

+void qed_acquire(BDRVQEDState *s);
+void qed_release(BDRVQEDState *s);
+
 /**
  * Generic callback for chaining async callbacks
  */
diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_multi_timeout_do(void *arg)
         return;
     }

+    aio_context_acquire(s->aio_context);
     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);

     curl_multi_check_completion(s);
+    aio_context_release(s->aio_context);
 #else
     abort();
 #endif
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_aio_cancel(BlockAIOCB *acb)
     if (acb->aiocb_info->get_aio_context) {
         aio_poll(acb->aiocb_info->get_aio_context(acb), true);
     } else if (acb->bs) {
+        /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
+         * assert that we're not using an I/O thread.  Thread-safe
+         * code should use bdrv_aio_cancel_async exclusively.
+         */
+        assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
         aio_poll(bdrv_get_aio_context(acb->bs), true);
     } else {
         abort();
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void iscsi_retry_timer_expired(void *opaque)
     struct IscsiTask *iTask = opaque;
     iTask->complete = 1;
     if (iTask->co) {
-        qemu_coroutine_enter(iTask->co);
+        aio_co_wake(iTask->co);
     }
 }

@@ -XXX,XX +XXX,XX @@ static void iscsi_nop_timed_event(void *opaque)
 {
     IscsiLun *iscsilun = opaque;

+    aio_context_acquire(iscsilun->aio_context);
     if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) {
         error_report("iSCSI: NOP timeout. Reconnecting...");
         iscsilun->request_timed_out = true;
     } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) {
         error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages.");
-        return;
+        goto out;
     }

     timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL);
     iscsi_set_events(iscsilun);
+
+out:
+    aio_context_release(iscsilun->aio_context);
 }

 static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp)
diff --git a/block/null.c b/block/null.c
index XXXXXXX..XXXXXXX 100644
--- a/block/null.c
+++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static void null_bh_cb(void *opaque)
 static void null_timer_cb(void *opaque)
 {
     NullAIOCB *acb = opaque;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, 0);
+    aio_context_release(ctx);
     timer_deinit(&acb->timer);
     qemu_aio_unref(acb);
 }
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_need_check_timer_cb(void *opaque)

     trace_qed_need_check_timer_cb(s);

+    qed_acquire(s);
     qed_plug_allocating_write_reqs(s);

     /* Ensure writes are on disk before clearing flag */
     bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
+    qed_release(s);
+}
+
+void qed_acquire(BDRVQEDState *s)
+{
+    aio_context_acquire(bdrv_get_aio_context(s->bs));
+}
+
+void qed_release(BDRVQEDState *s)
+{
+    aio_context_release(bdrv_get_aio_context(s->bs));
 }

 static void qed_start_need_check_timer(BDRVQEDState *s)
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index XXXXXXX..XXXXXXX 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -XXX,XX +XXX,XX @@ static void timer_cb(BlockBackend *blk, bool is_write)
     qemu_mutex_unlock(&tg->lock);

     /* Run the request that was waiting for this timer */
+    aio_context_acquire(blk_get_aio_context(blk));
     empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]);
+    aio_context_release(blk_get_aio_context(blk));

     /* If the request queue was empty then we have to take care of
      * scheduling the next one */
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
     }

     /* Run our timers */
-    aio_context_acquire(ctx);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
-    aio_context_release(ctx);

     return progress;
 }
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         progress |= aio_dispatch_handlers(ctx, event);
     } while (count > 0);

-    aio_context_acquire(ctx);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
-    aio_context_release(ctx);
     return progress;
 }

diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-sleep.c
+++ b/util/qemu-coroutine-sleep.c
@@ -XXX,XX +XXX,XX @@ static void co_sleep_cb(void *opaque)
 {
     CoSleepCB *sleep_cb = opaque;

-    qemu_coroutine_enter(sleep_cb->co);
+    aio_co_wake(sleep_cb->co);
 }

 void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
--
2.9.3
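A recurring conversion in this patch is replacing qemu_coroutine_enter() with aio_co_wake() inside timer callbacks. An illustrative-only sketch of the pattern (the callback name is hypothetical; aio_co_wake() is the real API used in the hunks above):

    /* Hypothetical timer callback following the pattern above: wake a
     * sleeping coroutine with aio_co_wake() instead of entering it
     * directly, so it resumes on the AioContext it was running on,
     * scheduling a bottom half if the caller is in a different context. */
    static void my_timer_cb(void *opaque)
    {
        Coroutine *co = opaque;

        aio_co_wake(co);    /* safe regardless of which thread fires the timer */
    }

This matters because, once timers can run without the AioContext lock held for the whole dispatch, directly entering a coroutine from the "wrong" thread would no longer be safe.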
From: Paolo Bonzini <pbonzini@redhat.com>

This covers both file descriptor callbacks and polling callbacks,
since they execute related code.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-14-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/curl.c          | 16 +++++++++++++---
 block/iscsi.c         |  4 ++++
 block/linux-aio.c     |  4 ++++
 block/nfs.c           |  6 ++++++
 block/sheepdog.c      | 29 +++++++++++++++--------------
 block/ssh.c           | 29 +++++++++--------------------
 block/win32-aio.c     | 10 ++++++----
 hw/block/virtio-blk.c |  5 ++++-
 hw/scsi/virtio-scsi.c |  7 +++++++
 util/aio-posix.c      |  7 -------
 util/aio-win32.c      |  6 ------
 11 files changed, 68 insertions(+), 55 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
         }
     }

-static void curl_multi_do(void *arg)
+static void curl_multi_do_locked(CURLState *s)
 {
-    CURLState *s = (CURLState *)arg;
     CURLSocket *socket, *next_socket;
     int running;
     int r;
@@ -XXX,XX +XXX,XX @@ static void curl_multi_do(void *arg)
     }
 }

+static void curl_multi_do(void *arg)
+{
+    CURLState *s = (CURLState *)arg;
+
+    aio_context_acquire(s->s->aio_context);
+    curl_multi_do_locked(s);
+    aio_context_release(s->s->aio_context);
+}
+
 static void curl_multi_read(void *arg)
 {
     CURLState *s = (CURLState *)arg;

-    curl_multi_do(arg);
+    aio_context_acquire(s->s->aio_context);
+    curl_multi_do_locked(s);
     curl_multi_check_completion(s->s);
+    aio_context_release(s->s->aio_context);
 }

 static void curl_multi_timeout_do(void *arg)
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ iscsi_process_read(void *arg)
     IscsiLun *iscsilun = arg;
     struct iscsi_context *iscsi = iscsilun->iscsi;

+    aio_context_acquire(iscsilun->aio_context);
     iscsi_service(iscsi, POLLIN);
     iscsi_set_events(iscsilun);
+    aio_context_release(iscsilun->aio_context);
 }

 static void
@@ -XXX,XX +XXX,XX @@ iscsi_process_write(void *arg)
     IscsiLun *iscsilun = arg;
     struct iscsi_context *iscsi = iscsilun->iscsi;

+    aio_context_acquire(iscsilun->aio_context);
     iscsi_service(iscsi, POLLOUT);
     iscsi_set_events(iscsilun);
+    aio_context_release(iscsilun->aio_context);
 }

 static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun)
diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e)
     LinuxAioState *s = container_of(e, LinuxAioState, e);

     if (event_notifier_test_and_clear(&s->e)) {
+        aio_context_acquire(s->aio_context);
         qemu_laio_process_completions_and_submit(s);
+        aio_context_release(s->aio_context);
     }
 }

@@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque)
         return false;
     }

+    aio_context_acquire(s->aio_context);
     qemu_laio_process_completions_and_submit(s);
+    aio_context_release(s->aio_context);
     return true;
 }

diff --git a/block/nfs.c b/block/nfs.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -XXX,XX +XXX,XX @@ static void nfs_set_events(NFSClient *client)
 static void nfs_process_read(void *arg)
 {
     NFSClient *client = arg;
+
+    aio_context_acquire(client->aio_context);
     nfs_service(client->context, POLLIN);
     nfs_set_events(client);
+    aio_context_release(client->aio_context);
 }

 static void nfs_process_write(void *arg)
 {
     NFSClient *client = arg;
+
+    aio_context_acquire(client->aio_context);
     nfs_service(client->context, POLLOUT);
     nfs_set_events(client);
+    aio_context_release(client->aio_context);
 }

 static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
     return ret;
 }

-static void restart_co_req(void *opaque)
-{
-    Coroutine *co = opaque;
-
-    qemu_coroutine_enter(co);
-}
-
 typedef struct SheepdogReqCo {
     int sockfd;
     BlockDriverState *bs;
@@ -XXX,XX +XXX,XX @@ typedef struct SheepdogReqCo {
     unsigned int *rlen;
     int ret;
     bool finished;
+    Coroutine *co;
 } SheepdogReqCo;

+static void restart_co_req(void *opaque)
+{
+    SheepdogReqCo *srco = opaque;
+
+    aio_co_wake(srco->co);
+}
+
 static coroutine_fn void do_co_req(void *opaque)
 {
     int ret;
-    Coroutine *co;
     SheepdogReqCo *srco = opaque;
     int sockfd = srco->sockfd;
     SheepdogReq *hdr = srco->hdr;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque)
     unsigned int *wlen = srco->wlen;
     unsigned int *rlen = srco->rlen;

-    co = qemu_coroutine_self();
+    srco->co = qemu_coroutine_self();
     aio_set_fd_handler(srco->aio_context, sockfd, false,
-                       NULL, restart_co_req, NULL, co);
+                       NULL, restart_co_req, NULL, srco);

     ret = send_co_req(sockfd, hdr, data, wlen);
     if (ret < 0) {
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque)
     }

     aio_set_fd_handler(srco->aio_context, sockfd, false,
-                       restart_co_req, NULL, NULL, co);
+                       restart_co_req, NULL, NULL, srco);

     ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
     if (ret != sizeof(*hdr)) {
@@ -XXX,XX +XXX,XX @@ out:
     aio_set_fd_handler(srco->aio_context, sockfd, false,
                        NULL, NULL, NULL, NULL);

+    srco->co = NULL;
     srco->ret = ret;
     srco->finished = true;
     if (srco->bs) {
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
          * We've finished all requests which belong to the AIOCB, so
          * we can switch back to sd_co_readv/writev now.
          */
-        qemu_coroutine_enter(acb->coroutine);
+        aio_co_wake(acb->coroutine);
     }

     return;
@@ -XXX,XX +XXX,XX @@ static void co_read_response(void *opaque)
         s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
     }

-    qemu_coroutine_enter(s->co_recv);
+    aio_co_wake(s->co_recv);
 }

 static void co_write_request(void *opaque)
 {
     BDRVSheepdogState *s = opaque;

-    qemu_coroutine_enter(s->co_send);
+    aio_co_wake(s->co_send);
 }

 /*
diff --git a/block/ssh.c b/block/ssh.c
index XXXXXXX..XXXXXXX 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -XXX,XX +XXX,XX @@ static void restart_coroutine(void *opaque)

     DPRINTF("co=%p", co);

-    qemu_coroutine_enter(co);
+    aio_co_wake(co);
 }

-static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
+/* A non-blocking call returned EAGAIN, so yield, ensuring the
+ * handlers are set up so that we'll be rescheduled when there is an
+ * interesting event on the socket.
+ */
+static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
 {
     int r;
     IOHandler *rd_handler = NULL, *wr_handler = NULL;
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)

     aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
                        false, rd_handler, wr_handler, NULL, co);
-}
-
-static coroutine_fn void clear_fd_handler(BDRVSSHState *s,
-                                          BlockDriverState *bs)
-{
-    DPRINTF("s->sock=%d", s->sock);
-    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
-                       false, NULL, NULL, NULL, NULL);
-}
-
-/* A non-blocking call returned EAGAIN, so yield, ensuring the
- * handlers are set up so that we'll be rescheduled when there is an
- * interesting event on the socket.
- */
-static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
-{
-    set_fd_handler(s, bs);
     qemu_coroutine_yield();
-    clear_fd_handler(s, bs);
+    DPRINTF("s->sock=%d - back", s->sock);
+    aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, false,
+                       NULL, NULL, NULL, NULL);
 }

 /* SFTP has a function `libssh2_sftp_seek64' which seeks to a position
diff --git a/block/win32-aio.c b/block/win32-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -XXX,XX +XXX,XX @@ struct QEMUWin32AIOState {
     HANDLE hIOCP;
     EventNotifier e;
     int count;
-    bool is_aio_context_attached;
+    AioContext *aio_ctx;
 };

 typedef struct QEMUWin32AIOCB {
@@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
     }


+    aio_context_acquire(s->aio_ctx);
     waiocb->common.cb(waiocb->common.opaque, ret);
+    aio_context_release(s->aio_ctx);
     qemu_aio_unref(waiocb);
 }

@@ -XXX,XX +XXX,XX @@ void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
                                   AioContext *old_context)
 {
     aio_set_event_notifier(old_context, &aio->e, false, NULL, NULL);
-    aio->is_aio_context_attached = false;
+    aio->aio_ctx = NULL;
 }

 void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
                                   AioContext *new_context)
 {
-    aio->is_aio_context_attached = true;
+    aio->aio_ctx = new_context;
     aio_set_event_notifier(new_context, &aio->e, false,
                            win32_aio_completion_cb, NULL);
 }
@@ -XXX,XX +XXX,XX @@ out_free_state:

 void win32_aio_cleanup(QEMUWin32AIOState *aio)
 {
-    assert(!aio->is_aio_context_attached);
+    assert(!aio->aio_ctx);
     CloseHandle(aio->hIOCP);
     event_notifier_cleanup(&aio->e);
     g_free(aio);
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
 {
     VirtIOBlockIoctlReq *ioctl_req = opaque;
     VirtIOBlockReq *req = ioctl_req->req;
-    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
     struct virtio_scsi_inhdr *scsi;
     struct sg_io_hdr *hdr;

@@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
     MultiReqBuffer mrb = {};
     bool progress = false;

+    aio_context_acquire(blk_get_aio_context(s->blk));
     blk_io_plug(s->blk);

     do {
@@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
     }

     blk_io_unplug(s->blk);
+    aio_context_release(blk_get_aio_context(s->blk));
     return progress;
 }

diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
     VirtIOSCSIReq *req;
     bool progress = false;

+    virtio_scsi_acquire(s);
     while ((req = virtio_scsi_pop_req(s, vq))) {
         progress = true;
         virtio_scsi_handle_ctrl_req(s, req);
     }
+    virtio_scsi_release(s);
     return progress;
 }

@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)

     QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);

+    virtio_scsi_acquire(s);
     do {
         virtio_queue_set_notification(vq, 0);

@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
     QTAILQ_FOREACH_SAFE(req, &reqs, next, next) {
         virtio_scsi_handle_cmd_req_submit(s, req);
     }
+    virtio_scsi_release(s);
     return progress;
 }

@@ -XXX,XX +XXX,XX @@ out:

 bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
 {
+    virtio_scsi_acquire(s);
     if (s->events_dropped) {
         virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
+        virtio_scsi_release(s);
         return true;
     }
+    virtio_scsi_release(s);
     return false;
 }

diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
             (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
             aio_node_check(ctx, node->is_external) &&
             node->io_read) {
-            aio_context_acquire(ctx);
             node->io_read(node->opaque);
-            aio_context_release(ctx);

             /* aio_notify() does not count as progress */
             if (node->opaque != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
             (revents & (G_IO_OUT | G_IO_ERR)) &&
             aio_node_check(ctx, node->is_external) &&
             node->io_write) {
-            aio_context_acquire(ctx);
             node->io_write(node->opaque);
-            aio_context_release(ctx);
             progress = true;
         }

@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
         start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     }

-    aio_context_acquire(ctx);
     progress = try_poll_mode(ctx, blocking);
-    aio_context_release(ctx);
-
     if (!progress) {
         assert(npfd == 0);

diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             (revents || event_notifier_get_handle(node->e) == event) &&
             node->io_notify) {
             node->pfd.revents = 0;
-            aio_context_acquire(ctx);
             node->io_notify(node->e);
-            aio_context_release(ctx);

             /* aio_notify() does not count as progress */
             if (node->e != &ctx->notifier) {
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             (node->io_read || node->io_write)) {
             node->pfd.revents = 0;
             if ((revents & G_IO_IN) && node->io_read) {
-                aio_context_acquire(ctx);
                 node->io_read(node->opaque);
-                aio_context_release(ctx);
                 progress = true;
             }
             if ((revents & G_IO_OUT) && node->io_write) {
-                aio_context_acquire(ctx);
                 node->io_write(node->opaque);
-                aio_context_release(ctx);
                 progress = true;
             }

--
2.9.3

From: Sam Li <faithilikerun@gmail.com>

This patch extends virtio-blk emulation to handle zoned device commands
by calling the new block layer APIs to perform zoned device I/O on
behalf of the guest. It supports Report Zone, four zone operations (open,
close, finish, reset), and Append Zone.

The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
support zoned block devices. It will not be set for regular block
devices (conventional zones).

The guest OS can use blktests and fio to test those commands on zoned
devices. Furthermore, using zonefs to test zone append writes is also
supported.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Message-id: 20230508051916.178322-2-faithilikerun@gmail.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/block/virtio-blk-common.c |   2 +
 hw/block/virtio-blk.c        | 389 +++++++++++++++++++++++++++++++++++
 hw/virtio/virtio-qmp.c       |   2 +
 3 files changed, 393 insertions(+)

diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk-common.c
+++ b/hw/block/virtio-blk-common.c
@@ -XXX,XX +XXX,XX @@ static const VirtIOFeature feature_sizes[] = {
      .end = endof(struct virtio_blk_config, discard_sector_alignment)},
     {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
      .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
+    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
+     .end = endof(struct virtio_blk_config, zoned)},
     {}
 };

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/module.h"
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
+#include "block/block_int.h"
 #include "trace.h"
 #include "hw/block/block.h"
 #include "hw/qdev-properties.h"
@@ -XXX,XX +XXX,XX @@ err:
     return err_status;
 }

+typedef struct ZoneCmdData {
+    VirtIOBlockReq *req;
+    struct iovec *in_iov;
+    unsigned in_num;
+    union {
+        struct {
+            unsigned int nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report_data;
+        struct {
+            int64_t offset;
+        } zone_append_data;
+    };
+} ZoneCmdData;
+
+/*
+ * check zoned_request: error checking before issuing requests. If all checks
+ * passed, return true.
+ * append: true if only zone append requests issued.
+ */
+static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
+                                bool append, uint8_t *status) {
+    BlockDriverState *bs = blk_bs(s->blk);
+    int index;
+
+    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
+        *status = VIRTIO_BLK_S_UNSUPP;
+        return false;
+    }
+
+    if (offset < 0 || len < 0 || len > (bs->total_sectors << BDRV_SECTOR_BITS)
+        || offset > (bs->total_sectors << BDRV_SECTOR_BITS) - len) {
+        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        return false;
+    }
+
+    if (append) {
+        if (bs->bl.write_granularity) {
+            if ((offset % bs->bl.write_granularity) != 0) {
+                *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
+                return false;
+            }
+        }
+
+        index = offset / bs->bl.zone_size;
+        if (BDRV_ZT_IS_CONV(bs->wps->wp[index])) {
+            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            return false;
+        }
+
+        if (len / 512 > bs->bl.max_append_sectors) {
+            if (bs->bl.max_append_sectors == 0) {
+                *status = VIRTIO_BLK_S_UNSUPP;
+            } else {
+                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            }
+            return false;
+        }
+    }
+    return true;
+}
+
+static void virtio_blk_zone_report_complete(void *opaque, int ret)
+{
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    struct iovec *in_iov = data->in_iov;
+    unsigned in_num = data->in_num;
+    int64_t zrp_size, n, j = 0;
+    int64_t nz = data->zone_report_data.nr_zones;
+    int8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
+        .nr_zones = cpu_to_le64(nz),
+    };
+    zrp_size = sizeof(struct virtio_blk_zone_report)
+               + sizeof(struct virtio_blk_zone_descriptor) * nz;
+    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
+    if (n != sizeof(zrp_hdr)) {
+        virtio_error(vdev, "Driver provided input buffer that is too small!");
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    for (size_t i = sizeof(zrp_hdr); i < zrp_size;
+         i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
+        struct virtio_blk_zone_descriptor desc =
+            (struct virtio_blk_zone_descriptor) {
+                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
+                                       >> BDRV_SECTOR_BITS),
+                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
+                                     >> BDRV_SECTOR_BITS),
+                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
+                                    >> BDRV_SECTOR_BITS),
+            };
+
+        switch (data->zone_report_data.zones[j].type) {
+        case BLK_ZT_CONV:
+            desc.z_type = VIRTIO_BLK_ZT_CONV;
+            break;
+        case BLK_ZT_SWR:
+            desc.z_type = VIRTIO_BLK_ZT_SWR;
+            break;
+        case BLK_ZT_SWP:
+            desc.z_type = VIRTIO_BLK_ZT_SWP;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        switch (data->zone_report_data.zones[j].state) {
+        case BLK_ZS_RDONLY:
+            desc.z_state = VIRTIO_BLK_ZS_RDONLY;
+            break;
+        case BLK_ZS_OFFLINE:
+            desc.z_state = VIRTIO_BLK_ZS_OFFLINE;
+            break;
+        case BLK_ZS_EMPTY:
+            desc.z_state = VIRTIO_BLK_ZS_EMPTY;
+            break;
+        case BLK_ZS_CLOSED:
+            desc.z_state = VIRTIO_BLK_ZS_CLOSED;
+            break;
+        case BLK_ZS_FULL:
+            desc.z_state = VIRTIO_BLK_ZS_FULL;
+            break;
+        case BLK_ZS_EOPEN:
+            desc.z_state = VIRTIO_BLK_ZS_EOPEN;
+            break;
+        case BLK_ZS_IOPEN:
+            desc.z_state = VIRTIO_BLK_ZS_IOPEN;
+            break;
+        case BLK_ZS_NOT_WP:
+            desc.z_state = VIRTIO_BLK_ZS_NOT_WP;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        /* TODO: it takes O(n^2) time complexity. Optimizations required. */
+        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
+        if (n != sizeof(desc)) {
+            virtio_error(vdev, "Driver provided input buffer "
+                               "for descriptors that is too small!");
+            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        }
+    }
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data->zone_report_data.zones);
+    g_free(data);
+}
+
+static void virtio_blk_handle_zone_report(VirtIOBlockReq *req,
+                                          struct iovec *in_iov,
+                                          unsigned in_num)
+{
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    unsigned int nr_zones;
+    ZoneCmdData *data;
+    int64_t zone_size, offset;
+    uint8_t err_status;
+
+    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
+                      sizeof(struct virtio_blk_zone_report) +
+                      sizeof(struct virtio_blk_zone_descriptor)) {
+        virtio_error(vdev, "in buffer too small for zone report");
+        return;
+    }
+
+    /* start byte offset of the zone report */
+    offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
+    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
+        goto out;
+    }
+    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
+                sizeof(struct virtio_blk_zone_report)) /
+               sizeof(struct virtio_blk_zone_descriptor);
+
+    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
+    data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    data->in_iov = in_iov;
+    data->in_num = in_num;
+    data->zone_report_data.nr_zones = nr_zones;
+    data->zone_report_data.zones = g_malloc(zone_size),
+
+    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
+                        data->zone_report_data.zones,
+                        virtio_blk_zone_report_complete, data);
+    return;
+out:
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+}
+
+static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
+{
+    VirtIOBlockReq *req = opaque;
+    VirtIOBlock *s = req->dev;
+    int8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+    }
+
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+}
+
+static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
+{
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    BlockDriverState *bs = blk_bs(s->blk);
+    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
+    uint64_t len;
+    uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
+    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
+        /* Entire drive capacity */
+        offset = 0;
+        len = capacity;
+    } else {
+        if (bs->bl.zone_size > capacity - offset) {
+            /* The zoned device allows the last smaller zone. */
+            len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
+        } else {
+            len = bs->bl.zone_size;
+        }
+    }
+
+    if (!check_zoned_request(s, offset, len, false, &err_status)) {
+        goto out;
+    }
+
+    blk_aio_zone_mgmt(s->blk, op, offset, len,
+                      virtio_blk_zone_mgmt_complete, req);
+
+    return 0;
+out:
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    return err_status;
+}
+
+static void virtio_blk_zone_append_complete(void *opaque, int ret)
+{
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    int64_t append_sector, n;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    virtio_stq_p(vdev, &append_sector,
+                 data->zone_append_data.offset >> BDRV_SECTOR_BITS);
+    n = iov_from_buf(data->in_iov, data->in_num, 0, &append_sector,
+                     sizeof(append_sector));
+    if (n != sizeof(append_sector)) {
+        virtio_error(vdev, "Driver provided input buffer less than size of "
+                           "append_sector");
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
+                                         struct iovec *out_iov,
+                                         struct iovec *in_iov,
+                                         uint64_t out_num,
+                                         unsigned in_num) {
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
+    int64_t len = iov_size(out_iov, out_num);
+
+    if (!check_zoned_request(s, offset, len, true, &err_status)) {
+        goto out;
+    }
+
+    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    data->in_iov = in_iov;
+    data->in_num = in_num;
+    data->zone_append_data.offset = offset;
+    qemu_iovec_init_external(&req->qiov, out_iov, out_num);
+    blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
+                        virtio_blk_zone_append_complete, data);
+    return 0;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
 static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
     uint32_t type;
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     case VIRTIO_BLK_T_FLUSH:
         virtio_blk_handle_flush(req, mrb);
         break;
+    case VIRTIO_BLK_T_ZONE_REPORT:
+        virtio_blk_handle_zone_report(req, in_iov, in_num);
+        break;
+    case VIRTIO_BLK_T_ZONE_OPEN:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
+        break;
+    case VIRTIO_BLK_T_ZONE_CLOSE:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
+        break;
+    case VIRTIO_BLK_T_ZONE_FINISH:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
+        break;
+    case VIRTIO_BLK_T_ZONE_RESET:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
+        break;
+    case VIRTIO_BLK_T_ZONE_RESET_ALL:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
+        break;
     case VIRTIO_BLK_T_SCSI_CMD:
         virtio_blk_handle_scsi(req);
         break;
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         virtio_blk_free_request(req);
         break;
     }
+    case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
+        /*
+         * Passing out_iov/out_num and in_iov/in_num is not safe
+         * to access req->elem.out_sg directly because it may be
+         * modified by virtio_blk_handle_request().
+         */
+        virtio_blk_handle_zone_append(req, out_iov, in_iov, out_num, in_num);
+        break;
     /*
      * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
      * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
     BlockConf *conf = &s->conf.conf;
+    BlockDriverState *bs = blk_bs(s->blk);
     struct virtio_blk_config blkcfg;
     uint64_t capacity;
     int64_t length;
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
         blkcfg.write_zeroes_may_unmap = 1;
         virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
     }
+    if (bs->bl.zoned != BLK_Z_NONE) {
+        switch (bs->bl.zoned) {
+        case BLK_Z_HM:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
+            break;
+        case BLK_Z_HA:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
+                     bs->bl.zone_size / 512);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
+                     bs->bl.max_active_zones);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
+                     bs->bl.max_open_zones);
+        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
+                     bs->bl.max_append_sectors);
+    } else {
+        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
+    }
     memcpy(config, &blkcfg, s->config_size);
 }

@@ -XXX,XX +XXX,XX @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
         return;
     }

+    BlockDriverState *bs = blk_bs(conf->conf.blk);
+    if (bs->bl.zoned != BLK_Z_NONE) {
+        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
+        if (bs->bl.zoned == BLK_Z_HM) {
+            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
+        }
+    }
+
     if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
         (!conf->max_discard_sectors ||
          conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
diff --git a/hw/virtio/virtio-qmp.c b/hw/virtio/virtio-qmp.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/virtio/virtio-qmp.c
+++ b/hw/virtio/virtio-qmp.c
@@ -XXX,XX +XXX,XX @@ static const qmp_virtio_feature_map_t virtio_blk_feature_map[] = {
                   "VIRTIO_BLK_F_DISCARD: Discard command supported"),
     FEATURE_ENTRY(VIRTIO_BLK_F_WRITE_ZEROES, \
                   "VIRTIO_BLK_F_WRITE_ZEROES: Write zeroes command supported"),
+    FEATURE_ENTRY(VIRTIO_BLK_F_ZONED, \
+                  "VIRTIO_BLK_F_ZONED: Zoned block devices"),
 #ifndef VIRTIO_BLK_NO_LEGACY
     FEATURE_ENTRY(VIRTIO_BLK_F_BARRIER, \
                   "VIRTIO_BLK_F_BARRIER: Request barriers supported"),
--
2.40.1
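The sizing logic in virtio_blk_handle_zone_report() above is worth spelling out: the number of zone descriptors the device may return is derived entirely from the driver-provided writable buffer. A simplified, illustrative sketch of that computation (the helper name zone_report_capacity is hypothetical; the struct names are the ones used in the patch and assume QEMU's virtio-blk headers):

    /* How many virtio_blk_zone_descriptor entries fit in a request whose
     * device-writable area is in_len bytes?  Mirrors the guard and the
     * nr_zones computation in virtio_blk_handle_zone_report(). */
    static unsigned int zone_report_capacity(size_t in_len)
    {
        size_t fixed = sizeof(struct virtio_blk_inhdr) +
                       sizeof(struct virtio_blk_zone_report);

        if (in_len < fixed + sizeof(struct virtio_blk_zone_descriptor)) {
            return 0;   /* too small for the header plus even one descriptor */
        }
        return (in_len - fixed) / sizeof(struct virtio_blk_zone_descriptor);
    }

If the result is zero the request is rejected up front; otherwise that count is passed to blk_aio_zone_report(), which may shrink it further based on how many zones actually exist past the requested offset.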
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
Running a very small critical section on pthread_mutex_t and CoMutex
3
Taking account of the new zone append write operation for zoned devices,
4
shows that pthread_mutex_t is much faster because it doesn't actually
4
BLOCK_ACCT_ZONE_APPEND enum is introduced as other I/O request type (read,
5
go to sleep. What happens is that the critical section is shorter
5
write, flush).
6
than the latency of entering the kernel and thus FUTEX_WAIT always
7
fails. With CoMutex there is no such latency but you still want to
8
avoid wait and wakeup. So introduce it artificially.
9
6
10
This only works with one waiters; because CoMutex is fair, it will
7
Signed-off-by: Sam Li <faithilikerun@gmail.com>
11
always have more waits and wakeups than a pthread_mutex_t.
8
Message-id: 20230508051916.178322-3-faithilikerun@gmail.com
12
13
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
14
Reviewed-by: Fam Zheng <famz@redhat.com>
15
Message-id: 20170213181244.16297-3-pbonzini@redhat.com
16
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
17
---
10
---
18
include/qemu/coroutine.h | 5 +++++
11
qapi/block-core.json | 68 ++++++++++++++++++++++++++++++++------
19
util/qemu-coroutine-lock.c | 51 ++++++++++++++++++++++++++++++++++++++++------
12
qapi/block.json | 4 +++
20
util/qemu-coroutine.c | 2 +-
13
include/block/accounting.h | 1 +
21
3 files changed, 51 insertions(+), 7 deletions(-)
14
block/qapi-sysemu.c | 11 ++++++
15
block/qapi.c | 18 ++++++++++
16
hw/block/virtio-blk.c | 4 +++
17
tests/qemu-iotests/227.out | 18 ++++++++++
18
7 files changed, 113 insertions(+), 11 deletions(-)
22
19
23
diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
20
diff --git a/qapi/block-core.json b/qapi/block-core.json
24
index XXXXXXX..XXXXXXX 100644
21
index XXXXXXX..XXXXXXX 100644
25
--- a/include/qemu/coroutine.h
22
--- a/qapi/block-core.json
26
+++ b/include/qemu/coroutine.h
23
+++ b/qapi/block-core.json
27
@@ -XXX,XX +XXX,XX @@ typedef struct CoMutex {
24
@@ -XXX,XX +XXX,XX @@
28
*/
25
# @min_wr_latency_ns: Minimum latency of write operations in the
29
unsigned locked;
26
# defined interval, in nanoseconds.
30
27
#
31
+ /* Context that is holding the lock. Useful to avoid spinning
28
+# @min_zone_append_latency_ns: Minimum latency of zone append operations
32
+ * when two coroutines on the same AioContext try to get the lock. :)
29
+# in the defined interval, in nanoseconds
33
+ */
30
+# (since 8.1)
34
+ AioContext *ctx;
31
+#
35
+
32
# @min_flush_latency_ns: Minimum latency of flush operations in the
36
/* A queue of waiters. Elements are added atomically in front of
33
# defined interval, in nanoseconds.
37
* from_push. to_pop is only populated, and popped from, by whoever
34
#
38
* is in charge of the next wakeup. This can be an unlocker or,
35
@@ -XXX,XX +XXX,XX @@
39
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
36
# @max_wr_latency_ns: Maximum latency of write operations in the
40
index XXXXXXX..XXXXXXX 100644
37
# defined interval, in nanoseconds.
41
--- a/util/qemu-coroutine-lock.c
38
#
42
+++ b/util/qemu-coroutine-lock.c
39
+# @max_zone_append_latency_ns: Maximum latency of zone append operations
43
@@ -XXX,XX +XXX,XX @@
40
+# in the defined interval, in nanoseconds
44
#include "qemu-common.h"
41
+# (since 8.1)
45
#include "qemu/coroutine.h"
42
+#
46
#include "qemu/coroutine_int.h"
43
# @max_flush_latency_ns: Maximum latency of flush operations in the
47
+#include "qemu/processor.h"
44
# defined interval, in nanoseconds.
48
#include "qemu/queue.h"
45
#
49
#include "block/aio.h"
46
@@ -XXX,XX +XXX,XX @@
50
#include "trace.h"
47
# @avg_wr_latency_ns: Average latency of write operations in the
51
@@ -XXX,XX +XXX,XX @@ void qemu_co_mutex_init(CoMutex *mutex)
48
# defined interval, in nanoseconds.
52
memset(mutex, 0, sizeof(*mutex));
49
#
53
}
50
+# @avg_zone_append_latency_ns: Average latency of zone append operations
54
51
+# in the defined interval, in nanoseconds
55
-static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
52
+# (since 8.1)
56
+static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co)
53
+#
57
+{
54
# @avg_flush_latency_ns: Average latency of flush operations in the
58
+ /* Read co before co->ctx; pairs with smp_wmb() in
55
# defined interval, in nanoseconds.
59
+ * qemu_coroutine_enter().
56
#
60
+ */
57
@@ -XXX,XX +XXX,XX @@
61
+ smp_read_barrier_depends();
58
# @avg_wr_queue_depth: Average number of pending write operations in
62
+ mutex->ctx = co->ctx;
59
# the defined interval.
63
+ aio_co_wake(co);
60
#
64
+}
61
+# @avg_zone_append_queue_depth: Average number of pending zone append
65
+
62
+# operations in the defined interval
66
+static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
63
+# (since 8.1).
67
+ CoMutex *mutex)
64
+#
65
# Since: 2.5
66
##
67
{ 'struct': 'BlockDeviceTimedStats',
68
'data': { 'interval_length': 'int', 'min_rd_latency_ns': 'int',
69
'max_rd_latency_ns': 'int', 'avg_rd_latency_ns': 'int',
70
'min_wr_latency_ns': 'int', 'max_wr_latency_ns': 'int',
71
- 'avg_wr_latency_ns': 'int', 'min_flush_latency_ns': 'int',
72
- 'max_flush_latency_ns': 'int', 'avg_flush_latency_ns': 'int',
73
- 'avg_rd_queue_depth': 'number', 'avg_wr_queue_depth': 'number' } }
74
+ 'avg_wr_latency_ns': 'int', 'min_zone_append_latency_ns': 'int',
75
+ 'max_zone_append_latency_ns': 'int',
76
+ 'avg_zone_append_latency_ns': 'int',
77
+ 'min_flush_latency_ns': 'int', 'max_flush_latency_ns': 'int',
78
+ 'avg_flush_latency_ns': 'int', 'avg_rd_queue_depth': 'number',
79
+ 'avg_wr_queue_depth': 'number',
80
+ 'avg_zone_append_queue_depth': 'number' } }
81
82
##
83
# @BlockDeviceStats:
84
@@ -XXX,XX +XXX,XX @@
85
#
86
# @wr_bytes: The number of bytes written by the device.
87
#
88
+# @zone_append_bytes: The number of bytes appended by the zoned devices
89
+# (since 8.1)
90
+#
91
# @unmap_bytes: The number of bytes unmapped by the device (Since 4.2)
92
#
93
# @rd_operations: The number of read operations performed by the
94
@@ -XXX,XX +XXX,XX @@
95
# @wr_operations: The number of write operations performed by the
96
# device.
97
#
98
+# @zone_append_operations: The number of zone append operations performed
99
+# by the zoned devices (since 8.1)
100
+#
101
# @flush_operations: The number of cache flush operations performed by
102
# the device (since 0.15)
103
#
104
@@ -XXX,XX +XXX,XX @@
105
# @wr_total_time_ns: Total time spent on writes in nanoseconds (since
106
# 0.15).
107
#
108
+# @zone_append_total_time_ns: Total time spent on zone append writes
109
+# in nanoseconds (since 8.1)
110
+#
111
# @flush_total_time_ns: Total time spent on cache flushes in
112
# nanoseconds (since 0.15).
113
#
114
@@ -XXX,XX +XXX,XX @@
115
# @wr_merged: Number of write requests that have been merged into
116
# another request (Since 2.3).
117
#
118
+# @zone_append_merged: Number of zone append requests that have been merged
119
+# into another request (since 8.1)
120
+#
121
# @unmap_merged: Number of unmap requests that have been merged into
122
# another request (Since 4.2)
123
#
124
@@ -XXX,XX +XXX,XX @@
125
# @failed_wr_operations: The number of failed write operations
126
# performed by the device (Since 2.5)
127
#
128
+# @failed_zone_append_operations: The number of failed zone append write
129
+# operations performed by the zoned devices
130
+# (since 8.1)
131
+#
132
# @failed_flush_operations: The number of failed flush operations
133
# performed by the device (Since 2.5)
134
#
135
@@ -XXX,XX +XXX,XX @@
136
# @invalid_wr_operations: The number of invalid write operations
137
# performed by the device (Since 2.5)
138
#
139
+# @invalid_zone_append_operations: The number of invalid zone append operations
+# performed by the zoned device (since 8.1)
+#
# @invalid_flush_operations: The number of invalid flush operations
# performed by the device (Since 2.5)
#
@@ -XXX,XX +XXX,XX @@
#
# @wr_latency_histogram: @BlockLatencyHistogramInfo. (Since 4.0)
#
+# @zone_append_latency_histogram: @BlockLatencyHistogramInfo. (since 8.1)
+#
# @flush_latency_histogram: @BlockLatencyHistogramInfo. (Since 4.0)
#
# Since: 0.14
##
{ 'struct': 'BlockDeviceStats',
- 'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'unmap_bytes' : 'int',
- 'rd_operations': 'int', 'wr_operations': 'int',
+ 'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'zone_append_bytes': 'int',
+ 'unmap_bytes' : 'int', 'rd_operations': 'int',
+ 'wr_operations': 'int', 'zone_append_operations': 'int',
'flush_operations': 'int', 'unmap_operations': 'int',
'rd_total_time_ns': 'int', 'wr_total_time_ns': 'int',
- 'flush_total_time_ns': 'int', 'unmap_total_time_ns': 'int',
- 'wr_highest_offset': 'int',
- 'rd_merged': 'int', 'wr_merged': 'int', 'unmap_merged': 'int',
- '*idle_time_ns': 'int',
+ 'zone_append_total_time_ns': 'int', 'flush_total_time_ns': 'int',
+ 'unmap_total_time_ns': 'int', 'wr_highest_offset': 'int',
+ 'rd_merged': 'int', 'wr_merged': 'int', 'zone_append_merged': 'int',
+ 'unmap_merged': 'int', '*idle_time_ns': 'int',
'failed_rd_operations': 'int', 'failed_wr_operations': 'int',
- 'failed_flush_operations': 'int', 'failed_unmap_operations': 'int',
- 'invalid_rd_operations': 'int', 'invalid_wr_operations': 'int',
+ 'failed_zone_append_operations': 'int',
+ 'failed_flush_operations': 'int',
+ 'failed_unmap_operations': 'int', 'invalid_rd_operations': 'int',
+ 'invalid_wr_operations': 'int',
+ 'invalid_zone_append_operations': 'int',
'invalid_flush_operations': 'int', 'invalid_unmap_operations': 'int',
'account_invalid': 'bool', 'account_failed': 'bool',
'timed_stats': ['BlockDeviceTimedStats'],
'*rd_latency_histogram': 'BlockLatencyHistogramInfo',
'*wr_latency_histogram': 'BlockLatencyHistogramInfo',
+ '*zone_append_latency_histogram': 'BlockLatencyHistogramInfo',
'*flush_latency_histogram': 'BlockLatencyHistogramInfo' } }

##
diff --git a/qapi/block.json b/qapi/block.json
index XXXXXXX..XXXXXXX 100644
--- a/qapi/block.json
+++ b/qapi/block.json
@@ -XXX,XX +XXX,XX @@
# @boundaries-write: list of interval boundary values for write
# latency histogram.
#
+# @boundaries-zap: list of interval boundary values for zone append write
+# latency histogram.
+#
# @boundaries-flush: list of interval boundary values for flush
# latency histogram.
#
@@ -XXX,XX +XXX,XX @@
'*boundaries': ['uint64'],
'*boundaries-read': ['uint64'],
'*boundaries-write': ['uint64'],
+ '*boundaries-zap': ['uint64'],
'*boundaries-flush': ['uint64'] },
'allow-preconfig': true }
diff --git a/include/block/accounting.h b/include/block/accounting.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/accounting.h
+++ b/include/block/accounting.h
@@ -XXX,XX +XXX,XX @@ enum BlockAcctType {
BLOCK_ACCT_READ,
BLOCK_ACCT_WRITE,
BLOCK_ACCT_FLUSH,
+ BLOCK_ACCT_ZONE_APPEND,
BLOCK_ACCT_UNMAP,
BLOCK_MAX_IOTYPE,
};
diff --git a/block/qapi-sysemu.c b/block/qapi-sysemu.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qapi-sysemu.c
+++ b/block/qapi-sysemu.c
@@ -XXX,XX +XXX,XX @@ void qmp_block_latency_histogram_set(
bool has_boundaries, uint64List *boundaries,
bool has_boundaries_read, uint64List *boundaries_read,
bool has_boundaries_write, uint64List *boundaries_write,
+ bool has_boundaries_append, uint64List *boundaries_append,
bool has_boundaries_flush, uint64List *boundaries_flush,
Error **errp)
{
@@ -XXX,XX +XXX,XX @@ void qmp_block_latency_histogram_set(
}
}

+ if (has_boundaries || has_boundaries_append) {
+ ret = block_latency_histogram_set(
+ stats, BLOCK_ACCT_ZONE_APPEND,
+ has_boundaries_append ? boundaries_append : boundaries);
+ if (ret) {
+ error_setg(errp, "Device '%s' set append write boundaries fail", id);
+ return;
+ }
+ }
+
if (has_boundaries || has_boundaries_flush) {
ret = block_latency_histogram_set(
stats, BLOCK_ACCT_FLUSH,
diff --git a/block/qapi.c b/block/qapi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)

ds->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
ds->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
+ ds->zone_append_bytes = stats->nr_bytes[BLOCK_ACCT_ZONE_APPEND];
ds->unmap_bytes = stats->nr_bytes[BLOCK_ACCT_UNMAP];
ds->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
ds->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
+ ds->zone_append_operations = stats->nr_ops[BLOCK_ACCT_ZONE_APPEND];
ds->unmap_operations = stats->nr_ops[BLOCK_ACCT_UNMAP];

ds->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
ds->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
+ ds->failed_zone_append_operations =
+ stats->failed_ops[BLOCK_ACCT_ZONE_APPEND];
ds->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
ds->failed_unmap_operations = stats->failed_ops[BLOCK_ACCT_UNMAP];

ds->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
ds->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
+ ds->invalid_zone_append_operations =
+ stats->invalid_ops[BLOCK_ACCT_ZONE_APPEND];
ds->invalid_flush_operations =
stats->invalid_ops[BLOCK_ACCT_FLUSH];
ds->invalid_unmap_operations = stats->invalid_ops[BLOCK_ACCT_UNMAP];

ds->rd_merged = stats->merged[BLOCK_ACCT_READ];
ds->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
+ ds->zone_append_merged = stats->merged[BLOCK_ACCT_ZONE_APPEND];
ds->unmap_merged = stats->merged[BLOCK_ACCT_UNMAP];
ds->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
ds->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
+ ds->zone_append_total_time_ns =
+ stats->total_time_ns[BLOCK_ACCT_ZONE_APPEND];
ds->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
ds->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
ds->unmap_total_time_ns = stats->total_time_ns[BLOCK_ACCT_UNMAP];
@@ -XXX,XX +XXX,XX @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)

TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
+ TimedAverage *zap = &ts->latency[BLOCK_ACCT_ZONE_APPEND];
TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];

dev_stats->interval_length = ts->interval_length;
@@ -XXX,XX +XXX,XX @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
dev_stats->max_wr_latency_ns = timed_average_max(wr);
dev_stats->avg_wr_latency_ns = timed_average_avg(wr);

+ dev_stats->min_zone_append_latency_ns = timed_average_min(zap);
+ dev_stats->max_zone_append_latency_ns = timed_average_max(zap);
+ dev_stats->avg_zone_append_latency_ns = timed_average_avg(zap);
+
dev_stats->min_flush_latency_ns = timed_average_min(fl);
dev_stats->max_flush_latency_ns = timed_average_max(fl);
dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
@@ -XXX,XX +XXX,XX @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
block_acct_queue_depth(ts, BLOCK_ACCT_READ);
dev_stats->avg_wr_queue_depth =
block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
+ dev_stats->avg_zone_append_queue_depth =
+ block_acct_queue_depth(ts, BLOCK_ACCT_ZONE_APPEND);

QAPI_LIST_PREPEND(ds->timed_stats, dev_stats);
}
@@ -XXX,XX +XXX,XX @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_READ]);
ds->wr_latency_histogram
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_WRITE]);
+ ds->zone_append_latency_histogram
+ = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_ZONE_APPEND]);
ds->flush_latency_histogram
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_FLUSH]);
}
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
data->in_num = in_num;
data->zone_append_data.offset = offset;
qemu_iovec_init_external(&req->qiov, out_iov, out_num);
+
+ block_acct_start(blk_get_stats(s->blk), &req->acct, len,
+ BLOCK_ACCT_ZONE_APPEND);
+
blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
virtio_blk_zone_append_complete, data);
return 0;
diff --git a/tests/qemu-iotests/227.out b/tests/qemu-iotests/227.out
index XXXXXXX..XXXXXXX 100644
--- a/tests/qemu-iotests/227.out
+++ b/tests/qemu-iotests/227.out
@@ -XXX,XX +XXX,XX @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio
"stats": {
"unmap_operations": 0,
"unmap_merged": 0,
+ "failed_zone_append_operations": 0,
"flush_total_time_ns": 0,
"wr_highest_offset": 0,
"wr_total_time_ns": 0,
@@ -XXX,XX +XXX,XX @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio
"timed_stats": [
],
"failed_unmap_operations": 0,
+ "zone_append_merged": 0,
"failed_flush_operations": 0,
"account_invalid": true,
"rd_total_time_ns": 0,
@@ -XXX,XX +XXX,XX @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio
"unmap_total_time_ns": 0,
"invalid_flush_operations": 0,
"account_failed": true,
+ "zone_append_total_time_ns": 0,
+ "zone_append_operations": 0,
"rd_operations": 0,
+ "zone_append_bytes": 0,
+ "invalid_zone_append_operations": 0,
"invalid_wr_operations": 0,
"invalid_rd_operations": 0
},
@@ -XXX,XX +XXX,XX @@ Testing: -drive driver=null-co,if=none
"stats": {
"unmap_operations": 0,
"unmap_merged": 0,
+ "failed_zone_append_operations": 0,
"flush_total_time_ns": 0,
"wr_highest_offset": 0,
"wr_total_time_ns": 0,
@@ -XXX,XX +XXX,XX @@ Testing: -drive driver=null-co,if=none
"timed_stats": [
],
"failed_unmap_operations": 0,
+ "zone_append_merged": 0,
"failed_flush_operations": 0,
"account_invalid": true,
"rd_total_time_ns": 0,
@@ -XXX,XX +XXX,XX @@ Testing: -drive driver=null-co,if=none
"unmap_total_time_ns": 0,
"invalid_flush_operations": 0,
"account_failed": true,
+ "zone_append_total_time_ns": 0,
+ "zone_append_operations": 0,
"rd_operations": 0,
+ "zone_append_bytes": 0,
+ "invalid_zone_append_operations": 0,
"invalid_wr_operations": 0,
"invalid_rd_operations": 0
},
@@ -XXX,XX +XXX,XX @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b
"stats": {
"unmap_operations": 0,
"unmap_merged": 0,
+ "failed_zone_append_operations": 0,
"flush_total_time_ns": 0,
"wr_highest_offset": 0,
"wr_total_time_ns": 0,
@@ -XXX,XX +XXX,XX @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b
"timed_stats": [
],
"failed_unmap_operations": 0,
+ "zone_append_merged": 0,
"failed_flush_operations": 0,
"account_invalid": true,
"rd_total_time_ns": 0,
@@ -XXX,XX +XXX,XX @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b
"unmap_total_time_ns": 0,
"invalid_flush_operations": 0,
"account_failed": true,
+ "zone_append_total_time_ns": 0,
+ "zone_append_operations": 0,
"rd_operations": 0,
+ "zone_append_bytes": 0,
+ "invalid_zone_append_operations": 0,
"invalid_wr_operations": 0,
"invalid_rd_operations": 0
},
--
2.40.1
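The new counters ride along in the existing QMP interfaces rather than needing new commands. A rough sketch of a client session against a build with this series applied (the device id "drive0" and the boundary values are invented for the example, and the query-blockstats reply is abbreviated to the zone append fields)::

    -> { "execute": "block-latency-histogram-set",
         "arguments": { "id": "drive0",
                        "boundaries-zap": [ 1000000, 10000000, 100000000 ] } }
    <- { "return": {} }

    -> { "execute": "query-blockstats" }
    <- { "return": [ { "device": "drive0",
                       "stats": { "zone_append_operations": 0,
                                  "zone_append_bytes": 0,
                                  "zone_append_merged": 0,
                                  "zone_append_total_time_ns": 0,
                                  "failed_zone_append_operations": 0,
                                  "invalid_zone_append_operations": 0 } } ] }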
{
Coroutine *self = qemu_coroutine_self();
CoWaitRecord w;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
if (co == self) {
/* We got the lock ourselves! */
assert(to_wake == &w);
+ mutex->ctx = ctx;
return;
}

- aio_co_wake(co);
+ qemu_co_mutex_wake(mutex, co);
}

qemu_coroutine_yield();
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)

void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
{
+ AioContext *ctx = qemu_get_current_aio_context();
Coroutine *self = qemu_coroutine_self();
+ int waiters, i;

- if (atomic_fetch_inc(&mutex->locked) == 0) {
+ /* Running a very small critical section on pthread_mutex_t and CoMutex
+ * shows that pthread_mutex_t is much faster because it doesn't actually
+ * go to sleep. What happens is that the critical section is shorter
+ * than the latency of entering the kernel and thus FUTEX_WAIT always
+ * fails. With CoMutex there is no such latency but you still want to
+ * avoid wait and wakeup. So introduce it artificially.
+ */
+ i = 0;
+retry_fast_path:
+ waiters = atomic_cmpxchg(&mutex->locked, 0, 1);
+ if (waiters != 0) {
+ while (waiters == 1 && ++i < 1000) {
+ if (atomic_read(&mutex->ctx) == ctx) {
+ break;
+ }
+ if (atomic_read(&mutex->locked) == 0) {
+ goto retry_fast_path;
+ }
+ cpu_relax();
+ }
+ waiters = atomic_fetch_inc(&mutex->locked);
+ }
+
+ if (waiters == 0) {
/* Uncontended. */
trace_qemu_co_mutex_lock_uncontended(mutex, self);
+ mutex->ctx = ctx;
} else {
- qemu_co_mutex_lock_slowpath(mutex);
+ qemu_co_mutex_lock_slowpath(ctx, mutex);
}
mutex->holder = self;
self->locks_held++;
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
assert(mutex->holder == self);
assert(qemu_in_coroutine());

+ mutex->ctx = NULL;
mutex->holder = NULL;
self->locks_held--;
if (atomic_fetch_dec(&mutex->locked) == 1) {
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
unsigned our_handoff;

if (to_wake) {
- Coroutine *co = to_wake->co;
- aio_co_wake(co);
+ qemu_co_mutex_wake(mutex, to_wake->co);
break;
}

diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co)
co->ctx = qemu_get_current_aio_context();

/* Store co->ctx before anything that stores co. Matches
- * barrier in aio_co_wake.
+ * barrier in aio_co_wake and qemu_co_mutex_wake.
*/
smp_wmb();

--
2.9.3
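A minimal usage sketch (not taken from the patch itself) of what the thread-safe CoMutex buys: coroutines running in different AioContexts can now serialize on one lock, and the loser of a race yields as a coroutine instead of blocking its event loop thread. The names shared_lock, counter and worker_co are invented for illustration::

    /* Sketch only: assumes qemu_co_mutex_init(&shared_lock) ran at setup. */
    static CoMutex shared_lock;
    static int counter;

    static void coroutine_fn worker_co(void *opaque)
    {
        qemu_co_mutex_lock(&shared_lock);    /* fast path: one atomic cmpxchg */
        counter++;                           /* critical section */
        qemu_co_mutex_unlock(&shared_lock);  /* hands off to a waiter and wakes
                                              * it in its own AioContext via
                                              * qemu_co_mutex_wake() */
    }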
From: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-16-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/archipelago.c | 3 ---
block/block-backend.c | 7 -------
block/curl.c | 2 +-
block/io.c | 6 +-----
block/iscsi.c | 3 ---
block/linux-aio.c | 5 +----
block/mirror.c | 12 +++++++++---
block/null.c | 8 --------
block/qed-cluster.c | 2 ++
block/qed-table.c | 12 ++++++++++--
block/qed.c | 4 ++--
block/rbd.c | 4 ----
block/win32-aio.c | 3 ---
hw/block/virtio-blk.c | 12 +++++++++++-
hw/scsi/scsi-disk.c | 15 +++++++++++++++
hw/scsi/scsi-generic.c | 20 +++++++++++++++++---
util/thread-pool.c | 4 +++-
17 files changed, 72 insertions(+), 50 deletions(-)

diff --git a/block/archipelago.c b/block/archipelago.c
index XXXXXXX..XXXXXXX 100644
--- a/block/archipelago.c
+++ b/block/archipelago.c
@@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque)
{
AIORequestData *reqdata = (AIORequestData *) opaque;
ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
- AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs);

- aio_context_acquire(ctx);
aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
- aio_context_release(ctx);
aio_cb->status = 0;

qemu_aio_unref(aio_cb);
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
static void error_callback_bh(void *opaque)
{
struct BlockBackendAIOCB *acb = opaque;
- AioContext *ctx = bdrv_get_aio_context(acb->common.bs);

bdrv_dec_in_flight(acb->common.bs);
- aio_context_acquire(ctx);
acb->common.cb(acb->common.opaque, acb->ret);
- aio_context_release(ctx);
qemu_aio_unref(acb);
}

@@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
static void blk_aio_complete_bh(void *opaque)
{
BlkAioEmAIOCB *acb = opaque;
- AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-
assert(acb->has_returned);
- aio_context_acquire(ctx);
blk_aio_complete(acb);
- aio_context_release(ctx);
}

static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);

out:
+ aio_context_release(ctx);
if (ret != -EINPROGRESS) {
acb->common.cb(acb->common.opaque, ret);
qemu_aio_unref(acb);
}
- aio_context_release(ctx);
}

static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_io_em_complete(void *opaque, int ret)
CoroutineIOCompletion *co = opaque;

co->ret = ret;
- qemu_coroutine_enter(co->coroutine);
+ aio_co_wake(co->coroutine);
}

static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
static void bdrv_co_em_bh(void *opaque)
{
BlockAIOCBCoroutine *acb = opaque;
- BlockDriverState *bs = acb->common.bs;
- AioContext *ctx = bdrv_get_aio_context(bs);

assert(!acb->need_bh);
- aio_context_acquire(ctx);
bdrv_co_complete(acb);
- aio_context_release(ctx);
}

static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void
iscsi_bh_cb(void *p)
{
IscsiAIOCB *acb = p;
- AioContext *ctx = bdrv_get_aio_context(acb->common.bs);

qemu_bh_delete(acb->bh);

g_free(acb->buf);
acb->buf = NULL;

- aio_context_acquire(ctx);
acb->common.cb(acb->common.opaque, acb->status);
- aio_context_release(ctx);

if (acb->task != NULL) {
scsi_free_scsi_task(acb->task);
diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev)
*/
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
- LinuxAioState *s = laiocb->ctx;
int ret;

ret = laiocb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
}

laiocb->ret = ret;
- aio_context_acquire(s->aio_context);
if (laiocb->co) {
/* If the coroutine is already entered it must be in ioq_submit() and
* will notice laio->ret has been filled in when it eventually runs
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
* that!
*/
if (!qemu_coroutine_entered(laiocb->co)) {
- qemu_coroutine_enter(laiocb->co);
+ aio_co_wake(laiocb->co);
}
} else {
laiocb->common.cb(laiocb->common.opaque, ret);
qemu_aio_unref(laiocb);
}
- aio_context_release(s->aio_context);
}

/**
diff --git a/block/mirror.c b/block/mirror.c
index XXXXXXX..XXXXXXX 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
{
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s;
+
+ aio_context_acquire(blk_get_aio_context(s->common.blk));
if (ret < 0) {
BlockErrorAction action;

@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
}
}
mirror_iteration_done(op, ret);
+ aio_context_release(blk_get_aio_context(s->common.blk));
}

static void mirror_read_complete(void *opaque, int ret)
{
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s;
+
+ aio_context_acquire(blk_get_aio_context(s->common.blk));
if (ret < 0) {
BlockErrorAction action;

@@ -XXX,XX +XXX,XX @@ static void mirror_read_complete(void *opaque, int ret)
}

mirror_iteration_done(op, ret);
- return;
+ } else {
+ blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
+ 0, mirror_write_complete, op);
}
- blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
- 0, mirror_write_complete, op);
+ aio_context_release(blk_get_aio_context(s->common.blk));
}

static inline void mirror_clip_sectors(MirrorBlockJob *s,
diff --git a/block/null.c b/block/null.c
index XXXXXXX..XXXXXXX 100644
--- a/block/null.c
+++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = {
static void null_bh_cb(void *opaque)
{
NullAIOCB *acb = opaque;
- AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-
- aio_context_acquire(ctx);
acb->common.cb(acb->common.opaque, 0);
- aio_context_release(ctx);
qemu_aio_unref(acb);
}

static void null_timer_cb(void *opaque)
{
NullAIOCB *acb = opaque;
- AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
-
- aio_context_acquire(ctx);
acb->common.cb(acb->common.opaque, 0);
- aio_context_release(ctx);
timer_deinit(&acb->timer);
qemu_aio_unref(acb);
}
diff --git a/block/qed-cluster.c b/block/qed-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed-cluster.c
+++ b/block/qed-cluster.c
@@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret)
unsigned int index;
unsigned int n;

+ qed_acquire(s);
if (ret) {
goto out;
}
@@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret)

out:
find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
+ qed_release(s);
g_free(find_cluster_cb);
}

diff --git a/block/qed-table.c b/block/qed-table.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret)
{
QEDReadTableCB *read_table_cb = opaque;
QEDTable *table = read_table_cb->table;
+ BDRVQEDState *s = read_table_cb->s;
int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
int i;

@@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret)
}

/* Byteswap offsets */
+ qed_acquire(s);
for (i = 0; i < noffsets; i++) {
table->offsets[i] = le64_to_cpu(table->offsets[i]);
}
+ qed_release(s);

out:
/* Completion */
- trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret);
+ trace_qed_read_table_cb(s, read_table_cb->table, ret);
gencb_complete(&read_table_cb->gencb, ret);
}

@@ -XXX,XX +XXX,XX @@ typedef struct {
static void qed_write_table_cb(void *opaque, int ret)
{
QEDWriteTableCB *write_table_cb = opaque;
+ BDRVQEDState *s = write_table_cb->s;

- trace_qed_write_table_cb(write_table_cb->s,
+ trace_qed_write_table_cb(s,
write_table_cb->orig_table,
write_table_cb->flush,
ret);
@@ -XXX,XX +XXX,XX @@ static void qed_write_table_cb(void *opaque, int ret)
if (write_table_cb->flush) {
/* We still need to flush first */
write_table_cb->flush = false;
+ qed_acquire(s);
bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb,
write_table_cb);
+ qed_release(s);
return;
}

@@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret)
CachedL2Table *l2_table = request->l2_table;
uint64_t l2_offset = read_l2_table_cb->l2_offset;

+ qed_acquire(s);
if (ret) {
/* can't trust loaded L2 table anymore */
qed_unref_l2_cache_entry(l2_table);
@@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret)
request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
assert(request->l2_table != NULL);
}
+ qed_release(s);

gencb_complete(&read_l2_table_cb->gencb, ret);
}
diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l
}

if (cb->co) {
- qemu_coroutine_enter(cb->co);
+ aio_co_wake(cb->co);
}
}

@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
cb->done = true;
cb->ret = ret;
if (cb->co) {
- qemu_coroutine_enter(cb->co);
+ aio_co_wake(cb->co);
}
}

diff --git a/block/rbd.c b/block/rbd.c
index XXXXXXX..XXXXXXX 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -XXX,XX +XXX,XX @@ shutdown:
static void qemu_rbd_complete_aio(RADOSCB *rcb)
{
RBDAIOCB *acb = rcb->acb;
- AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
int64_t r;

r = rcb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
}
qemu_vfree(acb->bounce);
-
- aio_context_acquire(ctx);
acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
- aio_context_release(ctx);

qemu_aio_unref(acb);
}
diff --git a/block/win32-aio.c b/block/win32-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/win32-aio.c
+++ b/block/win32-aio.c
@@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
qemu_vfree(waiocb->buf);
}

-
- aio_context_acquire(s->aio_ctx);
waiocb->common.cb(waiocb->common.opaque, ret);
- aio_context_release(s->aio_ctx);
qemu_aio_unref(waiocb);
}

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
static void virtio_blk_rw_complete(void *opaque, int ret)
{
VirtIOBlockReq *next = opaque;
+ VirtIOBlock *s = next->dev;

+ aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
while (next) {
VirtIOBlockReq *req = next;
next = req->mr_next;
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_rw_complete(void *opaque, int ret)
block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
virtio_blk_free_request(req);
}
+ aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}

static void virtio_blk_flush_complete(void *opaque, int ret)
{
VirtIOBlockReq *req = opaque;
+ VirtIOBlock *s = req->dev;

+ aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
if (ret) {
if (virtio_blk_handle_rw_error(req, -ret, 0)) {
- return;
+ goto out;
}
}

virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
virtio_blk_free_request(req);
+
+out:
+ aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}

#ifdef __linux__
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);

out:
+ aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, status);
virtio_blk_free_request(req);
+ aio_context_release(blk_get_aio_context(s->conf.conf.blk));
g_free(ioctl_req);
}

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)

assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
if (scsi_disk_req_check_error(r, ret, true)) {
goto done;
}
@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)
scsi_req_complete(&r->req, GOOD);

done:
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
scsi_req_unref(&r->req);
}

@@ -XXX,XX +XXX,XX @@ static void scsi_dma_complete(void *opaque, int ret)
assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;

+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
if (ret < 0) {
block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
} else {
block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
}
scsi_dma_complete_noio(r, ret);
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
}

static void scsi_read_complete(void * opaque, int ret)
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)

assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
if (scsi_disk_req_check_error(r, ret, true)) {
goto done;
}
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)

done:
scsi_req_unref(&r->req);
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
}

/* Actually issue a read to the block device. */
@@ -XXX,XX +XXX,XX @@ static void scsi_do_read_cb(void *opaque, int ret)
assert (r->req.aiocb != NULL);
r->req.aiocb = NULL;

+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
if (ret < 0) {
block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
} else {
block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
}
scsi_do_read(opaque, ret);
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
}

/* Read more data from scsi device into buffer. */
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
assert (r->req.aiocb != NULL);
r->req.aiocb = NULL;

+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
if (ret < 0) {
block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
} else {
block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
}
scsi_write_complete_noio(r, ret);
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
}

static void scsi_write_data(SCSIRequest *req)
@@ -XXX,XX +XXX,XX @@ static void scsi_unmap_complete(void *opaque, int ret)
{
UnmapCBData *data = opaque;
SCSIDiskReq *r = data->r;
+ SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);

assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;

+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
scsi_unmap_complete_noio(data, ret);
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
}

static void scsi_disk_emulate_unmap(SCSIDiskReq *r, uint8_t *inbuf)
@@ -XXX,XX +XXX,XX @@ static void scsi_write_same_complete(void *opaque, int ret)

assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
if (scsi_disk_req_check_error(r, ret, true)) {
goto done;
}
@@ -XXX,XX +XXX,XX @@ done:
scsi_req_unref(&r->req);
qemu_vfree(data->iov.iov_base);
g_free(data);
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
}

static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -XXX,XX +XXX,XX @@ done:
static void scsi_command_complete(void *opaque, int ret)
{
SCSIGenericReq *r = (SCSIGenericReq *)opaque;
+ SCSIDevice *s = r->req.dev;

assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
+
+ aio_context_acquire(blk_get_aio_context(s->conf.blk));
scsi_command_complete_noio(r, ret);
+ aio_context_release(blk_get_aio_context(s->conf.blk));
}

static int execute_command(BlockBackend *blk,
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;

+ aio_context_acquire(blk_get_aio_context(s->conf.blk));
+
if (ret || r->req.io_canceled) {
scsi_command_complete_noio(r, ret);
- return;
+ goto done;
}

len = r->io_header.dxfer_len - r->io_header.resid;
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
r->len = -1;
if (len == 0) {
scsi_command_complete_noio(r, 0);
- return;
+ goto done;
}

/* Snoop READ CAPACITY output to set the blocksize. */
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
}
scsi_req_data(&r->req, len);
scsi_req_unref(&r->req);
+
+done:
+ aio_context_release(blk_get_aio_context(s->conf.blk));
}

/* Read more data from scsi device into buffer. */
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;

+ aio_context_acquire(blk_get_aio_context(s->conf.blk));
+
if (ret || r->req.io_canceled) {
scsi_command_complete_noio(r, ret);
- return;
+ goto done;
}

if (r->req.cmd.buf[0] == MODE_SELECT && r->req.cmd.buf[4] == 12 &&
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
}

scsi_command_complete_noio(r, ret);
+
+done:
+ aio_context_release(blk_get_aio_context(s->conf.blk));
}

/* Write data to a scsi device. Returns nonzero on failure.
diff --git a/util/thread-pool.c b/util/thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/util/thread-pool.c
+++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@ restart:
*/
qemu_bh_schedule(pool->completion_bh);

+ aio_context_release(pool->ctx);
elem->common.cb(elem->common.opaque, elem->ret);
+ aio_context_acquire(pool->ctx);
qemu_aio_unref(elem);
goto restart;
} else {
@@ -XXX,XX +XXX,XX @@ static void thread_pool_co_cb(void *opaque, int ret)
ThreadPoolCo *co = opaque;

co->ret = ret;
- qemu_coroutine_enter(co->co);
+ aio_co_wake(co->co);
}

int coroutine_fn thread_pool_submit_co(ThreadPool *pool, ThreadPoolFunc *func,
--
2.9.3
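A recurring conversion in the patch above is qemu_coroutine_enter() becoming aio_co_wake() in completion callbacks. A hedged sketch of the resulting pattern, with the MyRequest/my_complete names invented for illustration::

    /* The completion callback may fire in any thread's AioContext, so
     * instead of entering the waiting coroutine directly it calls
     * aio_co_wake(), which resumes the coroutine in the AioContext it
     * was last running in, taking that context's lock as needed.
     */
    typedef struct MyRequest {
        Coroutine *co;   /* coroutine waiting for this request */
        int ret;         /* filled in before the wakeup */
    } MyRequest;

    static void my_complete(void *opaque, int ret)
    {
        MyRequest *req = opaque;

        req->ret = ret;
        aio_co_wake(req->co);   /* safe from any thread, unlike
                                 * qemu_coroutine_enter() */
    }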
From: Sam Li <faithilikerun@gmail.com>

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20230508051916.178322-4-faithilikerun@gmail.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/block/virtio-blk.c | 12 ++++++++++++
hw/block/trace-events | 7 +++++++
2 files changed, 19 insertions(+)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_zone_report_complete(void *opaque, int ret)
int64_t nz = data->zone_report_data.nr_zones;
int8_t err_status = VIRTIO_BLK_S_OK;

+ trace_virtio_blk_zone_report_complete(vdev, req, nz, ret);
if (ret) {
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
goto out;
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_handle_zone_report(VirtIOBlockReq *req,
nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
sizeof(struct virtio_blk_zone_report)) /
sizeof(struct virtio_blk_zone_descriptor);
+ trace_virtio_blk_handle_zone_report(vdev, req,
+ offset >> BDRV_SECTOR_BITS, nr_zones);

zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
data = g_malloc(sizeof(ZoneCmdData));
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
{
VirtIOBlockReq *req = opaque;
VirtIOBlock *s = req->dev;
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
int8_t err_status = VIRTIO_BLK_S_OK;
+ trace_virtio_blk_zone_mgmt_complete(vdev, req,ret);

if (ret) {
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
/* Entire drive capacity */
offset = 0;
len = capacity;
+ trace_virtio_blk_handle_zone_reset_all(vdev, req, 0,
+ bs->total_sectors);
} else {
if (bs->bl.zone_size > capacity - offset) {
/* The zoned device allows the last smaller zone. */
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
} else {
len = bs->bl.zone_size;
}
+ trace_virtio_blk_handle_zone_mgmt(vdev, req, op,
+ offset >> BDRV_SECTOR_BITS,
+ len >> BDRV_SECTOR_BITS);
}

if (!check_zoned_request(s, offset, len, false, &err_status)) {
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_zone_append_complete(void *opaque, int ret)
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
goto out;
}
+ trace_virtio_blk_zone_append_complete(vdev, req, append_sector, ret);

out:
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
int64_t len = iov_size(out_iov, out_num);

+ trace_virtio_blk_handle_zone_append(vdev, req, offset >> BDRV_SECTOR_BITS);
if (!check_zoned_request(s, offset, len, true, &err_status)) {
goto out;
}
diff --git a/hw/block/trace-events b/hw/block/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -XXX,XX +XXX,XX @@ pflash_write_unknown(const char *name, uint8_t cmd) "%s: unknown command 0x%02x"
# virtio-blk.c
virtio_blk_req_complete(void *vdev, void *req, int status) "vdev %p req %p status %d"
virtio_blk_rw_complete(void *vdev, void *req, int ret) "vdev %p req %p ret %d"
+virtio_blk_zone_report_complete(void *vdev, void *req, unsigned int nr_zones, int ret) "vdev %p req %p nr_zones %u ret %d"
+virtio_blk_zone_mgmt_complete(void *vdev, void *req, int ret) "vdev %p req %p ret %d"
+virtio_blk_zone_append_complete(void *vdev, void *req, int64_t sector, int ret) "vdev %p req %p, append sector 0x%" PRIx64 " ret %d"
virtio_blk_handle_write(void *vdev, void *req, uint64_t sector, size_t nsectors) "vdev %p req %p sector %"PRIu64" nsectors %zu"
virtio_blk_handle_read(void *vdev, void *req, uint64_t sector, size_t nsectors) "vdev %p req %p sector %"PRIu64" nsectors %zu"
virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint64_t offset, size_t size, bool is_write) "vdev %p mrb %p start %d num_reqs %d offset %"PRIu64" size %zu is_write %d"
+virtio_blk_handle_zone_report(void *vdev, void *req, int64_t sector, unsigned int nr_zones) "vdev %p req %p sector 0x%" PRIx64 " nr_zones %u"
+virtio_blk_handle_zone_mgmt(void *vdev, void *req, uint8_t op, int64_t sector, int64_t len) "vdev %p req %p op 0x%x sector 0x%" PRIx64 " len 0x%" PRIx64 ""
+virtio_blk_handle_zone_reset_all(void *vdev, void *req, int64_t sector, int64_t len) "vdev %p req %p sector 0x%" PRIx64 " cap 0x%" PRIx64 ""
+virtio_blk_handle_zone_append(void *vdev, void *req, int64_t sector) "vdev %p req %p, append sector 0x%" PRIx64 ""

# hd-geometry.c
hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
--
2.40.1
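These trace points can be turned on from the command line with QEMU's -trace option, which accepts glob patterns. A rough sketch of such an invocation, assuming a build whose trace backend supports runtime enabling (for example the "log" backend) and reusing the zoned null_blk device setup from the documentation patch later in this series::

    $ qemu-system-x86_64 \
        -trace 'virtio_blk_handle_zone_*' \
        -trace 'virtio_blk_zone_*_complete' \
        -blockdev node-name=drive0,driver=host_device,filename=/dev/nullb0,cache.direct=on \
        -device virtio-blk-pci,drive=drive0 ...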
Deleted patch
From: Paolo Bonzini <pbonzini@redhat.com>

This patch prepares for the removal of unnecessary lockcnt inc/dec pairs.
Extract the dispatching loop for file descriptor handlers into a new
function aio_dispatch_handlers, and then inline aio_dispatch into
aio_poll.

aio_dispatch can now become void.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-17-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
include/block/aio.h | 6 +-----
util/aio-posix.c | 44 ++++++++++++++------------------------------
util/aio-win32.c | 13 ++++---------
util/async.c | 2 +-
4 files changed, 20 insertions(+), 45 deletions(-)

diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ bool aio_pending(AioContext *ctx);
/* Dispatch any pending callbacks from the GSource attached to the AioContext.
*
* This is used internally in the implementation of the GSource.
- *
- * @dispatch_fds: true to process fds, false to skip them
- * (can be used as an optimization by callers that know there
- * are no fds ready)
*/
-bool aio_dispatch(AioContext *ctx, bool dispatch_fds);
+void aio_dispatch(AioContext *ctx);

/* Progress in completing AIO work to occur. This can issue new pending
* aio as a result of executing I/O completion or bh callbacks.
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
AioHandler *node, *tmp;
bool progress = false;

- /*
- * We have to walk very carefully in case aio_set_fd_handler is
- * called while we're walking.
- */
- qemu_lockcnt_inc(&ctx->list_lock);
-
QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
int revents;

@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
}
}

- qemu_lockcnt_dec(&ctx->list_lock);
return progress;
}

-/*
- * Note that dispatch_fds == false has the side-effect of post-poning the
- * freeing of deleted handlers.
- */
-bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
+void aio_dispatch(AioContext *ctx)
{
- bool progress;
+ aio_bh_poll(ctx);

- /*
- * If there are callbacks left that have been queued, we need to call them.
- * Do not call select in this case, because it is possible that the caller
- * does not need a complete flush (as is the case for aio_poll loops).
- */
- progress = aio_bh_poll(ctx);
+ qemu_lockcnt_inc(&ctx->list_lock);
+ aio_dispatch_handlers(ctx);
+ qemu_lockcnt_dec(&ctx->list_lock);

- if (dispatch_fds) {
- progress |= aio_dispatch_handlers(ctx);
- }
-
- /* Run our timers */
- progress |= timerlistgroup_run_timers(&ctx->tlg);
-
- return progress;
+ timerlistgroup_run_timers(&ctx->tlg);
}

/* These thread-local variables are used only in a small part of aio_poll
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
npfd = 0;
qemu_lockcnt_dec(&ctx->list_lock);

- /* Run dispatch even if there were no readable fds to run timers */
- if (aio_dispatch(ctx, ret > 0)) {
- progress = true;
+ progress |= aio_bh_poll(ctx);
+
+ if (ret > 0) {
+ qemu_lockcnt_inc(&ctx->list_lock);
+ progress |= aio_dispatch_handlers(ctx);
+ qemu_lockcnt_dec(&ctx->list_lock);
}

+ progress |= timerlistgroup_run_timers(&ctx->tlg);
+
return progress;
}

diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
return progress;
}

-bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
+void aio_dispatch(AioContext *ctx)
{
- bool progress;
-
- progress = aio_bh_poll(ctx);
- if (dispatch_fds) {
- progress |= aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
- }
- progress |= timerlistgroup_run_timers(&ctx->tlg);
- return progress;
+ aio_bh_poll(ctx);
+ aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
+ timerlistgroup_run_timers(&ctx->tlg);
}

bool aio_poll(AioContext *ctx, bool blocking)
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ aio_ctx_dispatch(GSource *source,
AioContext *ctx = (AioContext *) source;

assert(callback == NULL);
- aio_dispatch(ctx, true);
+ aio_dispatch(ctx);
return true;
}

--
2.9.3
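For context, a hedged sketch of how aio_poll() is typically consumed after this change: callers spin until a completion callback flips a flag, while aio_poll() internally runs bottom halves, ready fd handlers (via aio_dispatch_handlers()) and expired timers. This mirrors the polling loops used in the block layer; the done flag and function names are invented for the example::

    static bool done;

    static void my_cb(void *opaque, int ret)
    {
        done = true;    /* runs from the dispatch phase of aio_poll() */
    }

    static void wait_until_done(AioContext *ctx)
    {
        while (!done) {
            aio_poll(ctx, true);   /* blocking poll, then dispatch */
        }
    }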
Deleted patch
From: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-19-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
include/block/block_int.h | 64 +++++++++++++++++++++++++-----------------
include/sysemu/block-backend.h | 14 ++++++---
2 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -XXX,XX +XXX,XX @@ struct BdrvChild {
* copied as well.
*/
struct BlockDriverState {
- int64_t total_sectors; /* if we are reading a disk image, give its
- size in sectors */
+ /* Protected by big QEMU lock or read-only after opening. No special
+ * locking needed during I/O...
+ */
int open_flags; /* flags used to open the file, re-used for re-open */
bool read_only; /* if true, the media is read only */
bool encrypted; /* if true, the media is encrypted */
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
bool sg; /* if true, the device is a /dev/sg* */
bool probed; /* if true, format was probed rather than specified */

- int copy_on_read; /* if nonzero, copy read backing sectors into image.
- note this is a reference count */
-
- CoQueue flush_queue; /* Serializing flush queue */
- bool active_flush_req; /* Flush request in flight? */
- unsigned int write_gen; /* Current data generation */
- unsigned int flushed_gen; /* Flushed write generation */
-
BlockDriver *drv; /* NULL means no media */
void *opaque;

@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
BdrvChild *backing;
BdrvChild *file;

- /* Callback before write request is processed */
- NotifierWithReturnList before_write_notifiers;
-
- /* number of in-flight requests; overall and serialising */
- unsigned int in_flight;
- unsigned int serialising_in_flight;
-
- bool wakeup;
-
- /* Offset after the highest byte written to */
- uint64_t wr_highest_offset;
-
/* I/O Limits */
BlockLimits bl;

@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
QTAILQ_ENTRY(BlockDriverState) bs_list;
/* element of the list of monitor-owned BDS */
QTAILQ_ENTRY(BlockDriverState) monitor_list;
- QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
int refcnt;

- QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
-
/* operation blockers */
QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];

@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
/* The error object in use for blocking operations on backing_hd */
Error *backing_blocker;

+ /* Protected by AioContext lock */
+
+ /* If true, copy read backing sectors into image. Can be >1 if more
+ * than one client has requested copy-on-read.
+ */
+ int copy_on_read;
+
+ /* If we are reading a disk image, give its size in sectors.
+ * Generally read-only; it is written to by load_vmstate and save_vmstate,
+ * but the block layer is quiescent during those.
+ */
+ int64_t total_sectors;
+
+ /* Callback before write request is processed */
+ NotifierWithReturnList before_write_notifiers;
+
+ /* number of in-flight requests; overall and serialising */
+ unsigned int in_flight;
+ unsigned int serialising_in_flight;
+
+ bool wakeup;
+
+ /* Offset after the highest byte written to */
+ uint64_t wr_highest_offset;
+
/* threshold limit for writes, in bytes. "High water mark". */
uint64_t write_threshold_offset;
NotifierWithReturn write_threshold_notifier;
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
/* counter for nested bdrv_io_plug */
unsigned io_plugged;

+ QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
+ CoQueue flush_queue; /* Serializing flush queue */
+ bool active_flush_req; /* Flush request in flight? */
+ unsigned int write_gen; /* Current data generation */
+ unsigned int flushed_gen; /* Flushed write generation */
+
+ QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
+
+ /* do we need to tell the quest if we have a volatile write cache? */
+ int enable_write_cache;
+
int quiesce_counter;
};

diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -XXX,XX +XXX,XX @@ typedef struct BlockDevOps {
* fields that must be public. This is in particular for QLIST_ENTRY() and
* friends so that BlockBackends can be kept in lists outside block-backend.c */
typedef struct BlockBackendPublic {
- /* I/O throttling.
- * throttle_state tells us if this BlockBackend has I/O limits configured.
- * io_limits_disabled tells us if they are currently being enforced */
+ /* I/O throttling has its own locking, but also some fields are
+ * protected by the AioContext lock.
+ */
+
+ /* Protected by AioContext lock. */
CoQueue throttled_reqs[2];
+
+ /* Nonzero if the I/O limits are currently being ignored; generally
+ * it is zero. */
unsigned int io_limits_disabled;

/* The following fields are protected by the ThrottleGroup lock.
- * See the ThrottleGroup documentation for details. */
+ * See the ThrottleGroup documentation for details.
+ * throttle_state tells us if I/O limits are configured. */
ThrottleState *throttle_state;
ThrottleTimers throttle_timers;
unsigned pending_reqs[2];
--
2.9.3
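A hedged sketch of the rule the new comments encode: fields listed in the "Protected by AioContext lock" section may only be touched while holding the BDS's AioContext lock, or from code already running in that context. The function name below is invented for illustration::

    static void example_bump_copy_on_read(BlockDriverState *bs)
    {
        AioContext *ctx = bdrv_get_aio_context(bs);

        aio_context_acquire(ctx);
        bs->copy_on_read++;        /* an AioContext-protected field */
        aio_context_release(ctx);
    }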
From: Paolo Bonzini <pbonzini@redhat.com>

This will avoid forward references in the next patch. It is also
more logical because CoQueue is no longer the basic primitive.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-5-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
include/qemu/coroutine.h | 89 ++++++++++++++++++++++++------------------------
1 file changed, 44 insertions(+), 45 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ bool qemu_in_coroutine(void);
*/
bool qemu_coroutine_entered(Coroutine *co);

-
-/**
- * CoQueues are a mechanism to queue coroutines in order to continue executing
- * them later. They provide the fundamental primitives on which coroutine locks
- * are built.
- */
-typedef struct CoQueue {
- QSIMPLEQ_HEAD(, Coroutine) entries;
-} CoQueue;
-
-/**
- * Initialise a CoQueue. This must be called before any other operation is used
- * on the CoQueue.
- */
-void qemu_co_queue_init(CoQueue *queue);
-
-/**
- * Adds the current coroutine to the CoQueue and transfers control to the
- * caller of the coroutine.
- */
-void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
-
-/**
- * Restarts the next coroutine in the CoQueue and removes it from the queue.
- *
- * Returns true if a coroutine was restarted, false if the queue is empty.
- */
-bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
-
-/**
- * Restarts all coroutines in the CoQueue and leaves the queue empty.
- */
-void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
-
-/**
- * Enter the next coroutine in the queue
- */
-bool qemu_co_enter_next(CoQueue *queue);
-
-/**
- * Checks if the CoQueue is empty.
- */
-bool qemu_co_queue_empty(CoQueue *queue);
-
-
/**
* Provides a mutex that can be used to synchronise coroutines
*/
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex);
*/
void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);

+
+/**
+ * CoQueues are a mechanism to queue coroutines in order to continue executing
+ * them later.
+ */
+typedef struct CoQueue {
+ QSIMPLEQ_HEAD(, Coroutine) entries;
+} CoQueue;
+
+/**
+ * Initialise a CoQueue. This must be called before any other operation is used
+ * on the CoQueue.
+ */
+void qemu_co_queue_init(CoQueue *queue);
+
+/**
+ * Adds the current coroutine to the CoQueue and transfers control to the
+ * caller of the coroutine.
+ */
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
+
+/**
+ * Restarts the next coroutine in the CoQueue and removes it from the queue.
+ *
+ * Returns true if a coroutine was restarted, false if the queue is empty.
+ */
+bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
+
+/**
+ * Restarts all coroutines in the CoQueue and leaves the queue empty.
+ */
+void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
+
+/**
+ * Enter the next coroutine in the queue
+ */
+bool qemu_co_enter_next(CoQueue *queue);
+
+/**
+ * Checks if the CoQueue is empty.
+ */
+bool qemu_co_queue_empty(CoQueue *queue);
+
+
typedef struct CoRwlock {
bool writer;
int reader;
--
2.9.3

From: Sam Li <faithilikerun@gmail.com>

Add documentation with an example of using the virtio-blk driver
to pass zoned block devices through to the guest.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
Message-id: 20230508051916.178322-5-faithilikerun@gmail.com
[Fix pre-formatted code syntax
--Stefan]
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
docs/devel/zoned-storage.rst | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)

diff --git a/docs/devel/zoned-storage.rst b/docs/devel/zoned-storage.rst
index XXXXXXX..XXXXXXX 100644
--- a/docs/devel/zoned-storage.rst
+++ b/docs/devel/zoned-storage.rst
@@ -XXX,XX +XXX,XX @@ APIs for zoned storage emulation or testing.
For example, to test zone_report on a null_blk device using qemu-io is::

$ path/to/qemu-io --image-opts -n driver=host_device,filename=/dev/nullb0 -c "zrp offset nr_zones"
+
+To expose the host's zoned block device through virtio-blk, the command line
+can be (includes the -device parameter)::
+
+ -blockdev node-name=drive0,driver=host_device,filename=/dev/nullb0,cache.direct=on \
+ -device virtio-blk-pci,drive=drive0
+
+Or only use the -drive parameter::
+
+ -drive driver=host_device,file=/dev/nullb0,if=virtio,cache.direct=on
+
+Additionally, QEMU has several ways of supporting zoned storage, including:
+(1) Using virtio-scsi: --device scsi-block allows for the passing through of
+SCSI ZBC devices, enabling the attachment of ZBC or ZAC HDDs to QEMU.
+(2) PCI device pass-through: While NVMe ZNS emulation is available for testing
+purposes, it cannot yet pass through a zoned device from the host. To pass on
+the NVMe ZNS device to the guest, use VFIO PCI to pass the entire NVMe PCI
+adapter through to the guest. Likewise, an HDD HBA can be passed through to
+QEMU, along with all HDDs attached to the HBA.
--
2.40.1
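The documentation examples assume a zoned /dev/nullb0 already exists on the host. One way to create such a test device is via Linux's null_blk driver; this is a hedged sketch, with parameter names as described in the kernel's null_blk documentation and arbitrary sizes (zone_size is in MiB)::

    # modprobe null_blk nr_devices=1 zoned=1 zone_size=64 memory_backed=1
    # cat /sys/block/nullb0/queue/zoned
    host-managed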