1
The following changes since commit 6c769690ac845fa62642a5f93b4e4bd906adab95:
1
The following changes since commit 8844bb8d896595ee1d25d21c770e6e6f29803097:
2
2
3
Merge remote-tracking branch 'remotes/vsementsov/tags/pull-simplebench-2021-05-04' into staging (2021-05-21 12:02:34 +0100)
3
Merge tag 'or1k-pull-request-20230513' of https://github.com/stffrdhrn/qemu into staging (2023-05-13 11:23:14 +0100)
4
4
5
are available in the Git repository at:
5
are available in the Git repository at:
6
6
7
https://gitlab.com/stefanha/qemu.git tags/block-pull-request
7
https://gitlab.com/stefanha/qemu.git tags/block-pull-request
8
8
9
for you to fetch changes up to 0a6f0c76a030710780ce10d6347a70f098024d21:
9
for you to fetch changes up to 01562fee5f3ad4506d57dbcf4b1903b565eceec7:
10
10
11
coroutine-sleep: introduce qemu_co_sleep (2021-05-21 18:22:33 +0100)
11
docs/zoned-storage:add zoned emulation use case (2023-05-15 08:19:04 -0400)
12
12
13
----------------------------------------------------------------
13
----------------------------------------------------------------
14
Pull request
14
Pull request
15
15
16
(Resent due to an email preparation mistake.)
16
This pull request contains Sam Li's zoned storage support in the QEMU block
17
layer and virtio-blk emulation.
18
19
v2:
20
- Sam fixed the CI failures. CI passes for me now. [Richard]
17
21
18
----------------------------------------------------------------
22
----------------------------------------------------------------
19
23
20
Paolo Bonzini (6):
24
Sam Li (16):
21
coroutine-sleep: use a stack-allocated timer
25
block/block-common: add zoned device structs
22
coroutine-sleep: disallow NULL QemuCoSleepState** argument
26
block/file-posix: introduce helper functions for sysfs attributes
23
coroutine-sleep: allow qemu_co_sleep_wake that wakes nothing
27
block/block-backend: add block layer APIs resembling Linux
24
coroutine-sleep: move timer out of QemuCoSleepState
28
ZonedBlockDevice ioctls
25
coroutine-sleep: replace QemuCoSleepState pointer with struct in the
29
block/raw-format: add zone operations to pass through requests
26
API
30
block: add zoned BlockDriver check to block layer
27
coroutine-sleep: introduce qemu_co_sleep
31
iotests: test new zone operations
32
block: add some trace events for new block layer APIs
33
docs/zoned-storage: add zoned device documentation
34
file-posix: add tracking of the zone write pointers
35
block: introduce zone append write for zoned devices
36
qemu-iotests: test zone append operation
37
block: add some trace events for zone append
38
virtio-blk: add zoned storage emulation for zoned devices
39
block: add accounting for zone append operation
40
virtio-blk: add some trace events for zoned emulation
41
docs/zoned-storage:add zoned emulation use case
28
42
29
Philippe Mathieu-Daudé (1):
43
docs/devel/index-api.rst | 1 +
30
bitops.h: Improve find_xxx_bit() documentation
44
docs/devel/zoned-storage.rst | 62 +++
31
45
qapi/block-core.json | 68 ++-
32
Zenghui Yu (1):
46
qapi/block.json | 4 +
33
multi-process: Initialize variables declared with g_auto*
47
meson.build | 5 +
34
48
include/block/accounting.h | 1 +
35
include/qemu/bitops.h | 15 ++++++--
49
include/block/block-common.h | 57 ++
36
include/qemu/coroutine.h | 27 ++++++++-----
50
include/block/block-io.h | 13 +
37
block/block-copy.c | 10 ++---
51
include/block/block_int-common.h | 37 ++
38
block/nbd.c | 14 +++----
52
include/block/raw-aio.h | 8 +-
39
hw/remote/memory.c | 5 +--
53
include/sysemu/block-backend-io.h | 27 +
40
hw/remote/proxy.c | 3 +-
54
block.c | 19 +
41
util/qemu-coroutine-sleep.c | 75 +++++++++++++++++++------------------
55
block/block-backend.c | 198 +++++++
42
7 files changed, 79 insertions(+), 70 deletions(-)
56
block/file-posix.c | 692 +++++++++++++++++++++++--
57
block/io.c | 68 +++
58
block/io_uring.c | 4 +
59
block/linux-aio.c | 3 +
60
block/qapi-sysemu.c | 11 +
61
block/qapi.c | 18 +
62
block/raw-format.c | 26 +
63
hw/block/virtio-blk-common.c | 2 +
64
hw/block/virtio-blk.c | 405 +++++++++++++++
65
hw/virtio/virtio-qmp.c | 2 +
66
qemu-io-cmds.c | 224 ++++++++
67
block/trace-events | 4 +
68
docs/system/qemu-block-drivers.rst.inc | 6 +
69
hw/block/trace-events | 7 +
70
tests/qemu-iotests/227.out | 18 +
71
tests/qemu-iotests/tests/zoned | 105 ++++
72
tests/qemu-iotests/tests/zoned.out | 69 +++
73
30 files changed, 2106 insertions(+), 58 deletions(-)
74
create mode 100644 docs/devel/zoned-storage.rst
75
create mode 100755 tests/qemu-iotests/tests/zoned
76
create mode 100644 tests/qemu-iotests/tests/zoned.out
43
77
44
--
78
--
45
2.31.1
79
2.40.1
46
diff view generated by jsdifflib
New patch
1
From: Sam Li <faithilikerun@gmail.com>
1
2
3
Signed-off-by: Sam Li <faithilikerun@gmail.com>
4
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
5
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
6
Reviewed-by: Hannes Reinecke <hare@suse.de>
7
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
8
Acked-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Message-id: 20230508045533.175575-2-faithilikerun@gmail.com
11
Message-id: 20230324090605.28361-2-faithilikerun@gmail.com
12
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
13
<philmd@linaro.org>.
14
--Stefan]
15
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
16
---
17
include/block/block-common.h | 43 ++++++++++++++++++++++++++++++++++++
18
1 file changed, 43 insertions(+)
19
20
diff --git a/include/block/block-common.h b/include/block/block-common.h
21
index XXXXXXX..XXXXXXX 100644
22
--- a/include/block/block-common.h
23
+++ b/include/block/block-common.h
24
@@ -XXX,XX +XXX,XX @@ typedef struct BlockDriver BlockDriver;
25
typedef struct BdrvChild BdrvChild;
26
typedef struct BdrvChildClass BdrvChildClass;
27
28
+typedef enum BlockZoneOp {
29
+ BLK_ZO_OPEN,
30
+ BLK_ZO_CLOSE,
31
+ BLK_ZO_FINISH,
32
+ BLK_ZO_RESET,
33
+} BlockZoneOp;
34
+
35
+typedef enum BlockZoneModel {
36
+ BLK_Z_NONE = 0x0, /* Regular block device */
37
+ BLK_Z_HM = 0x1, /* Host-managed zoned block device */
38
+ BLK_Z_HA = 0x2, /* Host-aware zoned block device */
39
+} BlockZoneModel;
40
+
41
+typedef enum BlockZoneState {
42
+ BLK_ZS_NOT_WP = 0x0,
43
+ BLK_ZS_EMPTY = 0x1,
44
+ BLK_ZS_IOPEN = 0x2,
45
+ BLK_ZS_EOPEN = 0x3,
46
+ BLK_ZS_CLOSED = 0x4,
47
+ BLK_ZS_RDONLY = 0xD,
48
+ BLK_ZS_FULL = 0xE,
49
+ BLK_ZS_OFFLINE = 0xF,
50
+} BlockZoneState;
51
+
52
+typedef enum BlockZoneType {
53
+ BLK_ZT_CONV = 0x1, /* Conventional random writes supported */
54
+ BLK_ZT_SWR = 0x2, /* Sequential writes required */
55
+ BLK_ZT_SWP = 0x3, /* Sequential writes preferred */
56
+} BlockZoneType;
57
+
58
+/*
59
+ * Zone descriptor data structure.
60
+ * Provides information on a zone with all position and size values in bytes.
61
+ */
62
+typedef struct BlockZoneDescriptor {
63
+ uint64_t start;
64
+ uint64_t length;
65
+ uint64_t cap;
66
+ uint64_t wp;
67
+ BlockZoneType type;
68
+ BlockZoneState state;
69
+} BlockZoneDescriptor;
70
+
71
typedef struct BlockDriverInfo {
72
/* in bytes, 0 if irrelevant */
73
int cluster_size;
74
--
75
2.40.1
76
77
diff view generated by jsdifflib
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
All callers of qemu_co_sleep_wake are checking whether they are passing
3
Use get_sysfs_str_val() to get the string value of device
4
a NULL argument inside the pointer-to-pointer: do the check in
4
zoned model. Then get_sysfs_zoned_model() can convert it to
5
qemu_co_sleep_wake itself.
5
BlockZoneModel type of QEMU.
6
6
7
As a side effect, qemu_co_sleep_wake can be called more than once and
7
Use get_sysfs_long_val() to get the long value of zoned device
8
it will only wake the coroutine once; after the first time, the argument
8
information.
9
will be set to NULL via *sleep_state->user_state_pointer. However, this
9
10
would not be safe unless co_sleep_cb keeps using the QemuCoSleepState*
10
Signed-off-by: Sam Li <faithilikerun@gmail.com>
11
directly, so make it go through the pointer-to-pointer instead.
11
Reviewed-by: Hannes Reinecke <hare@suse.de>
12
12
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
13
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
14
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
14
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
15
Message-id: 20210517100548.28806-4-pbonzini@redhat.com
15
Acked-by: Kevin Wolf <kwolf@redhat.com>
16
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
17
Message-id: 20230508045533.175575-3-faithilikerun@gmail.com
18
Message-id: 20230324090605.28361-3-faithilikerun@gmail.com
19
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
20
<philmd@linaro.org>.
21
--Stefan]
16
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
22
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
17
---
23
---
18
block/block-copy.c | 4 +---
24
include/block/block_int-common.h | 3 +
19
block/nbd.c | 8 ++------
25
block/file-posix.c | 135 ++++++++++++++++++++++---------
20
util/qemu-coroutine-sleep.c | 21 ++++++++++++---------
26
2 files changed, 100 insertions(+), 38 deletions(-)
21
3 files changed, 15 insertions(+), 18 deletions(-)
27
22
28
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
23
diff --git a/block/block-copy.c b/block/block-copy.c
24
index XXXXXXX..XXXXXXX 100644
29
index XXXXXXX..XXXXXXX 100644
25
--- a/block/block-copy.c
30
--- a/include/block/block_int-common.h
26
+++ b/block/block-copy.c
31
+++ b/include/block/block_int-common.h
27
@@ -XXX,XX +XXX,XX @@ out:
32
@@ -XXX,XX +XXX,XX @@ typedef struct BlockLimits {
28
33
* an explicit monitor command to load the disk inside the guest).
29
void block_copy_kick(BlockCopyCallState *call_state)
34
*/
35
bool has_variable_length;
36
+
37
+ /* device zone model */
38
+ BlockZoneModel zoned;
39
} BlockLimits;
40
41
typedef struct BdrvOpBlocker BdrvOpBlocker;
42
diff --git a/block/file-posix.c b/block/file-posix.c
43
index XXXXXXX..XXXXXXX 100644
44
--- a/block/file-posix.c
45
+++ b/block/file-posix.c
46
@@ -XXX,XX +XXX,XX @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
47
#endif
48
}
49
50
-static int hdev_get_max_segments(int fd, struct stat *st)
51
+/*
52
+ * Get a sysfs attribute value as character string.
53
+ */
54
+#ifdef CONFIG_LINUX
55
+static int get_sysfs_str_val(struct stat *st, const char *attribute,
56
+ char **val) {
57
+ g_autofree char *sysfspath = NULL;
58
+ int ret;
59
+ size_t len;
60
+
61
+ if (!S_ISBLK(st->st_mode)) {
62
+ return -ENOTSUP;
63
+ }
64
+
65
+ sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
66
+ major(st->st_rdev), minor(st->st_rdev),
67
+ attribute);
68
+ ret = g_file_get_contents(sysfspath, val, &len, NULL);
69
+ if (ret == -1) {
70
+ return -ENOENT;
71
+ }
72
+
73
+ /* The file is ended with '\n' */
74
+ char *p;
75
+ p = *val;
76
+ if (*(p + len - 1) == '\n') {
77
+ *(p + len - 1) = '\0';
78
+ }
79
+ return ret;
80
+}
81
+#endif
82
+
83
+static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
30
{
84
{
31
- if (call_state->sleep_state) {
85
+ g_autofree char *val = NULL;
32
- qemu_co_sleep_wake(call_state->sleep_state);
86
+ int ret;
33
- }
87
+
34
+ qemu_co_sleep_wake(call_state->sleep_state);
88
+ ret = get_sysfs_str_val(st, "zoned", &val);
89
+ if (ret < 0) {
90
+ return ret;
91
+ }
92
+
93
+ if (strcmp(val, "host-managed") == 0) {
94
+ *zoned = BLK_Z_HM;
95
+ } else if (strcmp(val, "host-aware") == 0) {
96
+ *zoned = BLK_Z_HA;
97
+ } else if (strcmp(val, "none") == 0) {
98
+ *zoned = BLK_Z_NONE;
99
+ } else {
100
+ return -ENOTSUP;
101
+ }
102
+ return 0;
103
+}
104
+
105
+/*
106
+ * Get a sysfs attribute value as a long integer.
107
+ */
108
#ifdef CONFIG_LINUX
109
- char buf[32];
110
+static long get_sysfs_long_val(struct stat *st, const char *attribute)
111
+{
112
+ g_autofree char *str = NULL;
113
const char *end;
114
- char *sysfspath = NULL;
115
+ long val;
116
+ int ret;
117
+
118
+ ret = get_sysfs_str_val(st, attribute, &str);
119
+ if (ret < 0) {
120
+ return ret;
121
+ }
122
+
123
+ /* The file is ended with '\n', pass 'end' to accept that. */
124
+ ret = qemu_strtol(str, &end, 10, &val);
125
+ if (ret == 0 && end && *end == '\0') {
126
+ ret = val;
127
+ }
128
+ return ret;
129
+}
130
+#endif
131
+
132
+static int hdev_get_max_segments(int fd, struct stat *st)
133
+{
134
+#ifdef CONFIG_LINUX
135
int ret;
136
- int sysfd = -1;
137
- long max_segments;
138
139
if (S_ISCHR(st->st_mode)) {
140
if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
141
@@ -XXX,XX +XXX,XX @@ static int hdev_get_max_segments(int fd, struct stat *st)
142
}
143
return -ENOTSUP;
144
}
145
-
146
- if (!S_ISBLK(st->st_mode)) {
147
- return -ENOTSUP;
148
- }
149
-
150
- sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
151
- major(st->st_rdev), minor(st->st_rdev));
152
- sysfd = open(sysfspath, O_RDONLY);
153
- if (sysfd == -1) {
154
- ret = -errno;
155
- goto out;
156
- }
157
- ret = RETRY_ON_EINTR(read(sysfd, buf, sizeof(buf) - 1));
158
- if (ret < 0) {
159
- ret = -errno;
160
- goto out;
161
- } else if (ret == 0) {
162
- ret = -EIO;
163
- goto out;
164
- }
165
- buf[ret] = 0;
166
- /* The file is ended with '\n', pass 'end' to accept that. */
167
- ret = qemu_strtol(buf, &end, 10, &max_segments);
168
- if (ret == 0 && end && *end == '\n') {
169
- ret = max_segments;
170
- }
171
-
172
-out:
173
- if (sysfd != -1) {
174
- close(sysfd);
175
- }
176
- g_free(sysfspath);
177
- return ret;
178
+ return get_sysfs_long_val(st, "max_segments");
179
#else
180
return -ENOTSUP;
181
#endif
35
}
182
}
36
183
37
/*
184
+static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
38
diff --git a/block/nbd.c b/block/nbd.c
185
+ Error **errp)
39
index XXXXXXX..XXXXXXX 100644
186
+{
40
--- a/block/nbd.c
187
+ BlockZoneModel zoned;
41
+++ b/block/nbd.c
188
+ int ret;
42
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn nbd_client_co_drain_begin(BlockDriverState *bs)
189
+
43
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
190
+ bs->bl.zoned = BLK_Z_NONE;
44
191
+
45
s->drained = true;
192
+ ret = get_sysfs_zoned_model(st, &zoned);
46
- if (s->connection_co_sleep_ns_state) {
193
+ if (ret < 0 || zoned == BLK_Z_NONE) {
47
- qemu_co_sleep_wake(s->connection_co_sleep_ns_state);
194
+ return;
48
- }
195
+ }
49
+ qemu_co_sleep_wake(s->connection_co_sleep_ns_state);
196
+ bs->bl.zoned = zoned;
50
197
+}
51
nbd_co_establish_connection_cancel(bs, false);
198
+
52
199
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
53
@@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs)
200
{
54
201
BDRVRawState *s = bs->opaque;
55
s->state = NBD_CLIENT_QUIT;
202
@@ -XXX,XX +XXX,XX @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
56
if (s->connection_co) {
203
bs->bl.max_hw_iov = ret;
57
- if (s->connection_co_sleep_ns_state) {
204
}
58
- qemu_co_sleep_wake(s->connection_co_sleep_ns_state);
59
- }
60
+ qemu_co_sleep_wake(s->connection_co_sleep_ns_state);
61
nbd_co_establish_connection_cancel(bs, true);
62
}
205
}
63
if (qemu_in_coroutine()) {
206
+
64
diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
207
+ raw_refresh_zoned_limits(bs, &st, errp);
65
index XXXXXXX..XXXXXXX 100644
66
--- a/util/qemu-coroutine-sleep.c
67
+++ b/util/qemu-coroutine-sleep.c
68
@@ -XXX,XX +XXX,XX @@ struct QemuCoSleepState {
69
70
void qemu_co_sleep_wake(QemuCoSleepState *sleep_state)
71
{
72
- /* Write of schedule protected by barrier write in aio_co_schedule */
73
- const char *scheduled = qatomic_cmpxchg(&sleep_state->co->scheduled,
74
- qemu_co_sleep_ns__scheduled, NULL);
75
+ if (sleep_state) {
76
+ /* Write of schedule protected by barrier write in aio_co_schedule */
77
+ const char *scheduled = qatomic_cmpxchg(&sleep_state->co->scheduled,
78
+ qemu_co_sleep_ns__scheduled, NULL);
79
80
- assert(scheduled == qemu_co_sleep_ns__scheduled);
81
- *sleep_state->user_state_pointer = NULL;
82
- timer_del(&sleep_state->ts);
83
- aio_co_wake(sleep_state->co);
84
+ assert(scheduled == qemu_co_sleep_ns__scheduled);
85
+ *sleep_state->user_state_pointer = NULL;
86
+ timer_del(&sleep_state->ts);
87
+ aio_co_wake(sleep_state->co);
88
+ }
89
}
208
}
90
209
91
static void co_sleep_cb(void *opaque)
210
static int check_for_dasd(int fd)
92
{
93
- qemu_co_sleep_wake(opaque);
94
+ QemuCoSleepState **sleep_state = opaque;
95
+ qemu_co_sleep_wake(*sleep_state);
96
}
97
98
void coroutine_fn qemu_co_sleep_ns_wakeable(QEMUClockType type, int64_t ns,
99
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_sleep_ns_wakeable(QEMUClockType type, int64_t ns,
100
abort();
101
}
102
103
- aio_timer_init(ctx, &state.ts, type, SCALE_NS, co_sleep_cb, &state);
104
+ aio_timer_init(ctx, &state.ts, type, SCALE_NS, co_sleep_cb, sleep_state);
105
*sleep_state = &state;
106
timer_mod(&state.ts, qemu_clock_get_ns(type) + ns);
107
qemu_coroutine_yield();
108
--
211
--
109
2.31.1
212
2.40.1
110
213
214
diff view generated by jsdifflib
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
Right now, users of qemu_co_sleep_ns_wakeable are simply passing
3
Add zoned device option to host_device BlockDriver. It will be presented only
4
a pointer to QemuCoSleepState by reference to the function. But
4
for zoned host block devices. By adding zone management operations to the
5
QemuCoSleepState really is just a Coroutine*; making the
5
host_block_device BlockDriver, users can use the new block layer APIs
6
content of the struct public is just as efficient and lets us
6
including Report Zone and four zone management operations
7
skip the user_state_pointer indirection.
7
(open, close, finish, reset, reset_all).
8
8
9
Since the usage is changed, take the occasion to rename the
9
Qemu-io uses the new APIs to perform zoned storage commands of the device:
10
struct to QemuCoSleep.
10
zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs),
11
zone_finish(zf).
11
12
12
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
13
For example, to test zone_report, use the following command:
13
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
14
$ ./build/qemu-io --image-opts -n driver=host_device,filename=/dev/nullb0
14
Message-id: 20210517100548.28806-6-pbonzini@redhat.com
15
-c "zrp offset nr_zones"
16
17
Signed-off-by: Sam Li <faithilikerun@gmail.com>
18
Reviewed-by: Hannes Reinecke <hare@suse.de>
19
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
20
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
21
Acked-by: Kevin Wolf <kwolf@redhat.com>
22
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
23
Message-id: 20230508045533.175575-4-faithilikerun@gmail.com
24
Message-id: 20230324090605.28361-4-faithilikerun@gmail.com
25
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
26
<philmd@linaro.org> and remove spurious ret = -errno in
27
raw_co_zone_mgmt().
28
--Stefan]
15
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
29
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
16
---
30
---
17
include/qemu/coroutine.h | 23 +++++++++++----------
31
meson.build | 5 +
18
block/block-copy.c | 8 ++++----
32
include/block/block-io.h | 9 +
19
block/nbd.c | 10 ++++-----
33
include/block/block_int-common.h | 21 ++
20
util/qemu-coroutine-sleep.c | 41 ++++++++++++++++---------------------
34
include/block/raw-aio.h | 6 +-
21
4 files changed, 39 insertions(+), 43 deletions(-)
35
include/sysemu/block-backend-io.h | 18 ++
36
block/block-backend.c | 137 +++++++++++++
37
block/file-posix.c | 313 +++++++++++++++++++++++++++++-
38
block/io.c | 41 ++++
39
qemu-io-cmds.c | 149 ++++++++++++++
40
9 files changed, 696 insertions(+), 3 deletions(-)
22
41
23
diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
42
diff --git a/meson.build b/meson.build
24
index XXXXXXX..XXXXXXX 100644
43
index XXXXXXX..XXXXXXX 100644
25
--- a/include/qemu/coroutine.h
44
--- a/meson.build
26
+++ b/include/qemu/coroutine.h
45
+++ b/meson.build
27
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_wrlock(CoRwlock *lock);
46
@@ -XXX,XX +XXX,XX @@ if rdma.found()
28
*/
47
endif
29
void qemu_co_rwlock_unlock(CoRwlock *lock);
48
30
49
# has_header_symbol
31
-typedef struct QemuCoSleepState QemuCoSleepState;
50
+config_host_data.set('CONFIG_BLKZONED',
32
+typedef struct QemuCoSleep {
51
+ cc.has_header_symbol('linux/blkzoned.h', 'BLKOPENZONE'))
33
+ Coroutine *to_wake;
52
config_host_data.set('CONFIG_EPOLL_CREATE1',
34
+} QemuCoSleep;
53
cc.has_header_symbol('sys/epoll.h', 'epoll_create1'))
35
54
config_host_data.set('CONFIG_FALLOCATE_PUNCH_HOLE',
36
/**
55
@@ -XXX,XX +XXX,XX @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID',
37
- * Yield the coroutine for a given duration. During this yield, @sleep_state
56
config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM',
38
- * is set to an opaque pointer, which may be used for
57
cc.has_member('struct stat', 'st_atim',
39
- * qemu_co_sleep_wake(). Be careful, the pointer is set back to zero when the
58
prefix: '#include <sys/stat.h>'))
40
- * timer fires. Don't save the obtained value to other variables and don't call
59
+config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY',
41
- * qemu_co_sleep_wake from another aio context.
60
+ cc.has_member('struct blk_zone', 'capacity',
42
+ * Yield the coroutine for a given duration. Initializes @w so that,
61
+ prefix: '#include <linux/blkzoned.h>'))
43
+ * during this yield, it can be passed to qemu_co_sleep_wake() to
62
44
+ * terminate the sleep.
63
# has_type
45
*/
64
config_host_data.set('CONFIG_IOVEC',
46
-void coroutine_fn qemu_co_sleep_ns_wakeable(QEMUClockType type, int64_t ns,
65
diff --git a/include/block/block-io.h b/include/block/block-io.h
47
- QemuCoSleepState **sleep_state);
66
index XXXXXXX..XXXXXXX 100644
48
+void coroutine_fn qemu_co_sleep_ns_wakeable(QemuCoSleep *w,
67
--- a/include/block/block-io.h
49
+ QEMUClockType type, int64_t ns);
68
+++ b/include/block/block-io.h
50
+
69
@@ -XXX,XX +XXX,XX @@ int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs);
51
static inline void coroutine_fn qemu_co_sleep_ns(QEMUClockType type, int64_t ns)
70
int coroutine_fn GRAPH_RDLOCK bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
71
int64_t bytes);
72
73
+/* Report zone information of zone block device. */
74
+int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_report(BlockDriverState *bs,
75
+ int64_t offset,
76
+ unsigned int *nr_zones,
77
+ BlockZoneDescriptor *zones);
78
+int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_mgmt(BlockDriverState *bs,
79
+ BlockZoneOp op,
80
+ int64_t offset, int64_t len);
81
+
82
bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
83
int bdrv_block_status(BlockDriverState *bs, int64_t offset,
84
int64_t bytes, int64_t *pnum, int64_t *map,
85
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
86
index XXXXXXX..XXXXXXX 100644
87
--- a/include/block/block_int-common.h
88
+++ b/include/block/block_int-common.h
89
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
90
int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_load_vmstate)(
91
BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
92
93
+ int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
94
+ int64_t offset, unsigned int *nr_zones,
95
+ BlockZoneDescriptor *zones);
96
+ int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
97
+ int64_t offset, int64_t len);
98
+
99
/* removable device specific */
100
bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_is_inserted)(
101
BlockDriverState *bs);
102
@@ -XXX,XX +XXX,XX @@ typedef struct BlockLimits {
103
104
/* device zone model */
105
BlockZoneModel zoned;
106
+
107
+ /* zone size expressed in bytes */
108
+ uint32_t zone_size;
109
+
110
+ /* total number of zones */
111
+ uint32_t nr_zones;
112
+
113
+ /* maximum sectors of a zone append write operation */
114
+ uint32_t max_append_sectors;
115
+
116
+ /* maximum number of open zones */
117
+ uint32_t max_open_zones;
118
+
119
+ /* maximum number of active zones */
120
+ uint32_t max_active_zones;
121
} BlockLimits;
122
123
typedef struct BdrvOpBlocker BdrvOpBlocker;
124
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
125
index XXXXXXX..XXXXXXX 100644
126
--- a/include/block/raw-aio.h
127
+++ b/include/block/raw-aio.h
128
@@ -XXX,XX +XXX,XX @@
129
#define QEMU_AIO_WRITE_ZEROES 0x0020
130
#define QEMU_AIO_COPY_RANGE 0x0040
131
#define QEMU_AIO_TRUNCATE 0x0080
132
+#define QEMU_AIO_ZONE_REPORT 0x0100
133
+#define QEMU_AIO_ZONE_MGMT 0x0200
134
#define QEMU_AIO_TYPE_MASK \
135
(QEMU_AIO_READ | \
136
QEMU_AIO_WRITE | \
137
@@ -XXX,XX +XXX,XX @@
138
QEMU_AIO_DISCARD | \
139
QEMU_AIO_WRITE_ZEROES | \
140
QEMU_AIO_COPY_RANGE | \
141
- QEMU_AIO_TRUNCATE)
142
+ QEMU_AIO_TRUNCATE | \
143
+ QEMU_AIO_ZONE_REPORT | \
144
+ QEMU_AIO_ZONE_MGMT)
145
146
/* AIO flags */
147
#define QEMU_AIO_MISALIGNED 0x1000
148
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
149
index XXXXXXX..XXXXXXX 100644
150
--- a/include/sysemu/block-backend-io.h
151
+++ b/include/sysemu/block-backend-io.h
152
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
153
BlockCompletionFunc *cb, void *opaque);
154
BlockAIOCB *blk_aio_flush(BlockBackend *blk,
155
BlockCompletionFunc *cb, void *opaque);
156
+BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
157
+ unsigned int *nr_zones,
158
+ BlockZoneDescriptor *zones,
159
+ BlockCompletionFunc *cb, void *opaque);
160
+BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
161
+ int64_t offset, int64_t len,
162
+ BlockCompletionFunc *cb, void *opaque);
163
BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
164
BlockCompletionFunc *cb, void *opaque);
165
void blk_aio_cancel_async(BlockAIOCB *acb);
166
@@ -XXX,XX +XXX,XX @@ int co_wrapper_mixed blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
167
int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
168
int64_t bytes, BdrvRequestFlags flags);
169
170
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
171
+ unsigned int *nr_zones,
172
+ BlockZoneDescriptor *zones);
173
+int co_wrapper_mixed blk_zone_report(BlockBackend *blk, int64_t offset,
174
+ unsigned int *nr_zones,
175
+ BlockZoneDescriptor *zones);
176
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
177
+ int64_t offset, int64_t len);
178
+int co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
179
+ int64_t offset, int64_t len);
180
+
181
int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset,
182
int64_t bytes);
183
int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
184
diff --git a/block/block-backend.c b/block/block-backend.c
185
index XXXXXXX..XXXXXXX 100644
186
--- a/block/block-backend.c
187
+++ b/block/block-backend.c
188
@@ -XXX,XX +XXX,XX @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
189
return ret;
190
}
191
192
+static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
193
+{
194
+ BlkAioEmAIOCB *acb = opaque;
195
+ BlkRwCo *rwco = &acb->rwco;
196
+
197
+ rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
198
+ (unsigned int*)(uintptr_t)acb->bytes,
199
+ rwco->iobuf);
200
+ blk_aio_complete(acb);
201
+}
202
+
203
+BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
204
+ unsigned int *nr_zones,
205
+ BlockZoneDescriptor *zones,
206
+ BlockCompletionFunc *cb, void *opaque)
207
+{
208
+ BlkAioEmAIOCB *acb;
209
+ Coroutine *co;
210
+ IO_CODE();
211
+
212
+ blk_inc_in_flight(blk);
213
+ acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
214
+ acb->rwco = (BlkRwCo) {
215
+ .blk = blk,
216
+ .offset = offset,
217
+ .iobuf = zones,
218
+ .ret = NOT_DONE,
219
+ };
220
+ acb->bytes = (int64_t)(uintptr_t)nr_zones,
221
+ acb->has_returned = false;
222
+
223
+ co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
224
+ aio_co_enter(blk_get_aio_context(blk), co);
225
+
226
+ acb->has_returned = true;
227
+ if (acb->rwco.ret != NOT_DONE) {
228
+ replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
229
+ blk_aio_complete_bh, acb);
230
+ }
231
+
232
+ return &acb->common;
233
+}
234
+
235
+static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
236
+{
237
+ BlkAioEmAIOCB *acb = opaque;
238
+ BlkRwCo *rwco = &acb->rwco;
239
+
240
+ rwco->ret = blk_co_zone_mgmt(rwco->blk,
241
+ (BlockZoneOp)(uintptr_t)rwco->iobuf,
242
+ rwco->offset, acb->bytes);
243
+ blk_aio_complete(acb);
244
+}
245
+
246
+BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
247
+ int64_t offset, int64_t len,
248
+ BlockCompletionFunc *cb, void *opaque) {
249
+ BlkAioEmAIOCB *acb;
250
+ Coroutine *co;
251
+ IO_CODE();
252
+
253
+ blk_inc_in_flight(blk);
254
+ acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
255
+ acb->rwco = (BlkRwCo) {
256
+ .blk = blk,
257
+ .offset = offset,
258
+ .iobuf = (void *)(uintptr_t)op,
259
+ .ret = NOT_DONE,
260
+ };
261
+ acb->bytes = len;
262
+ acb->has_returned = false;
263
+
264
+ co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
265
+ aio_co_enter(blk_get_aio_context(blk), co);
266
+
267
+ acb->has_returned = true;
268
+ if (acb->rwco.ret != NOT_DONE) {
269
+ replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
270
+ blk_aio_complete_bh, acb);
271
+ }
272
+
273
+ return &acb->common;
274
+}
275
+
276
+/*
277
+ * Send a zone_report command.
278
+ * offset is a byte offset from the start of the device. No alignment
279
+ * required for offset.
280
+ * nr_zones represents IN maximum and OUT actual.
281
+ */
282
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
283
+ unsigned int *nr_zones,
284
+ BlockZoneDescriptor *zones)
285
+{
286
+ int ret;
287
+ IO_CODE();
288
+
289
+ blk_inc_in_flight(blk); /* increase before waiting */
290
+ blk_wait_while_drained(blk);
291
+ GRAPH_RDLOCK_GUARD();
292
+ if (!blk_is_available(blk)) {
293
+ blk_dec_in_flight(blk);
294
+ return -ENOMEDIUM;
295
+ }
296
+ ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
297
+ blk_dec_in_flight(blk);
298
+ return ret;
299
+}
300
+
301
+/*
302
+ * Send a zone_management command.
303
+ * op is the zone operation;
304
+ * offset is the byte offset from the start of the zoned device;
305
+ * len is the maximum number of bytes the command should operate on. It
306
+ * should be aligned with the device zone size.
307
+ */
308
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
309
+ int64_t offset, int64_t len)
310
+{
311
+ int ret;
312
+ IO_CODE();
313
+
314
+ blk_inc_in_flight(blk);
315
+ blk_wait_while_drained(blk);
316
+ GRAPH_RDLOCK_GUARD();
317
+
318
+ ret = blk_check_byte_request(blk, offset, len);
319
+ if (ret < 0) {
320
+ blk_dec_in_flight(blk);
321
+ return ret;
322
+ }
323
+
324
+ ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
325
+ blk_dec_in_flight(blk);
326
+ return ret;
327
+}
328
+
329
void blk_drain(BlockBackend *blk)
52
{
330
{
53
- QemuCoSleepState *unused = NULL;
331
BlockDriverState *bs = blk_bs(blk);
54
- qemu_co_sleep_ns_wakeable(type, ns, &unused);
332
diff --git a/block/file-posix.c b/block/file-posix.c
55
+ QemuCoSleep w = { 0 };
333
index XXXXXXX..XXXXXXX 100644
56
+ qemu_co_sleep_ns_wakeable(&w, type, ns);
334
--- a/block/file-posix.c
335
+++ b/block/file-posix.c
336
@@ -XXX,XX +XXX,XX @@
337
#include <sys/param.h>
338
#include <sys/syscall.h>
339
#include <sys/vfs.h>
340
+#if defined(CONFIG_BLKZONED)
341
+#include <linux/blkzoned.h>
342
+#endif
343
#include <linux/cdrom.h>
344
#include <linux/fd.h>
345
#include <linux/fs.h>
346
@@ -XXX,XX +XXX,XX @@ typedef struct RawPosixAIOData {
347
PreallocMode prealloc;
348
Error **errp;
349
} truncate;
350
+ struct {
351
+ unsigned int *nr_zones;
352
+ BlockZoneDescriptor *zones;
353
+ } zone_report;
354
+ struct {
355
+ unsigned long op;
356
+ } zone_mgmt;
357
};
358
} RawPosixAIOData;
359
360
@@ -XXX,XX +XXX,XX @@ static int get_sysfs_str_val(struct stat *st, const char *attribute,
57
}
361
}
58
362
#endif
59
/**
363
60
@@ -XXX,XX +XXX,XX @@ static inline void coroutine_fn qemu_co_sleep_ns(QEMUClockType type, int64_t ns)
364
+#if defined(CONFIG_BLKZONED)
61
* qemu_co_sleep_ns() and should be checked to be non-NULL before calling
365
static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
62
* qemu_co_sleep_wake().
366
{
63
*/
367
g_autofree char *val = NULL;
64
-void qemu_co_sleep_wake(QemuCoSleepState *sleep_state);
368
@@ -XXX,XX +XXX,XX @@ static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
65
+void qemu_co_sleep_wake(QemuCoSleep *w);
369
}
66
370
return 0;
67
/**
371
}
68
* Yield until a file descriptor becomes readable
372
+#endif /* defined(CONFIG_BLKZONED) */
69
diff --git a/block/block-copy.c b/block/block-copy.c
373
70
index XXXXXXX..XXXXXXX 100644
374
/*
71
--- a/block/block-copy.c
375
* Get a sysfs attribute value as a long integer.
72
+++ b/block/block-copy.c
376
@@ -XXX,XX +XXX,XX @@ static int hdev_get_max_segments(int fd, struct stat *st)
73
@@ -XXX,XX +XXX,XX @@ typedef struct BlockCopyCallState {
377
#endif
74
/* State */
378
}
379
380
+#if defined(CONFIG_BLKZONED)
381
static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
382
Error **errp)
383
{
384
@@ -XXX,XX +XXX,XX @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
385
return;
386
}
387
bs->bl.zoned = zoned;
388
+
389
+ ret = get_sysfs_long_val(st, "max_open_zones");
390
+ if (ret >= 0) {
391
+ bs->bl.max_open_zones = ret;
392
+ }
393
+
394
+ ret = get_sysfs_long_val(st, "max_active_zones");
395
+ if (ret >= 0) {
396
+ bs->bl.max_active_zones = ret;
397
+ }
398
+
399
+ /*
400
+ * The zoned device must at least have zone size and nr_zones fields.
401
+ */
402
+ ret = get_sysfs_long_val(st, "chunk_sectors");
403
+ if (ret < 0) {
404
+ error_setg_errno(errp, -ret, "Unable to read chunk_sectors "
405
+ "sysfs attribute");
406
+ return;
407
+ } else if (!ret) {
408
+ error_setg(errp, "Read 0 from chunk_sectors sysfs attribute");
409
+ return;
410
+ }
411
+ bs->bl.zone_size = ret << BDRV_SECTOR_BITS;
412
+
413
+ ret = get_sysfs_long_val(st, "nr_zones");
414
+ if (ret < 0) {
415
+ error_setg_errno(errp, -ret, "Unable to read nr_zones "
416
+ "sysfs attribute");
417
+ return;
418
+ } else if (!ret) {
419
+ error_setg(errp, "Read 0 from nr_zones sysfs attribute");
420
+ return;
421
+ }
422
+ bs->bl.nr_zones = ret;
423
+
424
+ ret = get_sysfs_long_val(st, "zone_append_max_bytes");
425
+ if (ret > 0) {
426
+ bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
427
+ }
428
}
429
+#else /* !defined(CONFIG_BLKZONED) */
430
+static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
431
+ Error **errp)
432
+{
433
+ bs->bl.zoned = BLK_Z_NONE;
434
+}
435
+#endif /* !defined(CONFIG_BLKZONED) */
436
437
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
438
{
439
@@ -XXX,XX +XXX,XX @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
440
BDRVRawState *s = bs->opaque;
75
int ret;
441
int ret;
76
bool finished;
442
77
- QemuCoSleepState *sleep_state;
443
- /* If DASD, get blocksizes */
78
+ QemuCoSleep sleep;
444
+ /* If DASD or zoned devices, get blocksizes */
79
bool cancelled;
445
if (check_for_dasd(s->fd) < 0) {
80
446
- return -ENOTSUP;
81
/* OUT parameters */
447
+ /* zoned devices are not DASD */
82
@@ -XXX,XX +XXX,XX @@ block_copy_dirty_clusters(BlockCopyCallState *call_state)
448
+ if (bs->bl.zoned == BLK_Z_NONE) {
83
if (ns > 0) {
449
+ return -ENOTSUP;
84
block_copy_task_end(task, -EAGAIN);
450
+ }
85
g_free(task);
451
}
86
- qemu_co_sleep_ns_wakeable(QEMU_CLOCK_REALTIME, ns,
452
ret = probe_logical_blocksize(s->fd, &bsz->log);
87
- &call_state->sleep_state);
453
if (ret < 0) {
88
+ qemu_co_sleep_ns_wakeable(&call_state->sleep,
454
@@ -XXX,XX +XXX,XX @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
89
+ QEMU_CLOCK_REALTIME, ns);
455
}
90
continue;
456
#endif
91
}
457
92
}
458
+/*
93
@@ -XXX,XX +XXX,XX @@ out:
459
+ * parse_zone - Fill a zone descriptor
94
460
+ */
95
void block_copy_kick(BlockCopyCallState *call_state)
461
+#if defined(CONFIG_BLKZONED)
462
+static inline int parse_zone(struct BlockZoneDescriptor *zone,
463
+ const struct blk_zone *blkz) {
464
+ zone->start = blkz->start << BDRV_SECTOR_BITS;
465
+ zone->length = blkz->len << BDRV_SECTOR_BITS;
466
+ zone->wp = blkz->wp << BDRV_SECTOR_BITS;
467
+
468
+#ifdef HAVE_BLK_ZONE_REP_CAPACITY
469
+ zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
470
+#else
471
+ zone->cap = blkz->len << BDRV_SECTOR_BITS;
472
+#endif
473
+
474
+ switch (blkz->type) {
475
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
476
+ zone->type = BLK_ZT_SWR;
477
+ break;
478
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
479
+ zone->type = BLK_ZT_SWP;
480
+ break;
481
+ case BLK_ZONE_TYPE_CONVENTIONAL:
482
+ zone->type = BLK_ZT_CONV;
483
+ break;
484
+ default:
485
+ error_report("Unsupported zone type: 0x%x", blkz->type);
486
+ return -ENOTSUP;
487
+ }
488
+
489
+ switch (blkz->cond) {
490
+ case BLK_ZONE_COND_NOT_WP:
491
+ zone->state = BLK_ZS_NOT_WP;
492
+ break;
493
+ case BLK_ZONE_COND_EMPTY:
494
+ zone->state = BLK_ZS_EMPTY;
495
+ break;
496
+ case BLK_ZONE_COND_IMP_OPEN:
497
+ zone->state = BLK_ZS_IOPEN;
498
+ break;
499
+ case BLK_ZONE_COND_EXP_OPEN:
500
+ zone->state = BLK_ZS_EOPEN;
501
+ break;
502
+ case BLK_ZONE_COND_CLOSED:
503
+ zone->state = BLK_ZS_CLOSED;
504
+ break;
505
+ case BLK_ZONE_COND_READONLY:
506
+ zone->state = BLK_ZS_RDONLY;
507
+ break;
508
+ case BLK_ZONE_COND_FULL:
509
+ zone->state = BLK_ZS_FULL;
510
+ break;
511
+ case BLK_ZONE_COND_OFFLINE:
512
+ zone->state = BLK_ZS_OFFLINE;
513
+ break;
514
+ default:
515
+ error_report("Unsupported zone state: 0x%x", blkz->cond);
516
+ return -ENOTSUP;
517
+ }
518
+ return 0;
519
+}
520
+#endif
521
+
522
+#if defined(CONFIG_BLKZONED)
523
+static int handle_aiocb_zone_report(void *opaque)
524
+{
525
+ RawPosixAIOData *aiocb = opaque;
526
+ int fd = aiocb->aio_fildes;
527
+ unsigned int *nr_zones = aiocb->zone_report.nr_zones;
528
+ BlockZoneDescriptor *zones = aiocb->zone_report.zones;
529
+ /* zoned block devices use 512-byte sectors */
530
+ uint64_t sector = aiocb->aio_offset / 512;
531
+
532
+ struct blk_zone *blkz;
533
+ size_t rep_size;
534
+ unsigned int nrz;
535
+ int ret;
536
+ unsigned int n = 0, i = 0;
537
+
538
+ nrz = *nr_zones;
539
+ rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
540
+ g_autofree struct blk_zone_report *rep = NULL;
541
+ rep = g_malloc(rep_size);
542
+
543
+ blkz = (struct blk_zone *)(rep + 1);
544
+ while (n < nrz) {
545
+ memset(rep, 0, rep_size);
546
+ rep->sector = sector;
547
+ rep->nr_zones = nrz - n;
548
+
549
+ do {
550
+ ret = ioctl(fd, BLKREPORTZONE, rep);
551
+ } while (ret != 0 && errno == EINTR);
552
+ if (ret != 0) {
553
+ error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
554
+ fd, sector, errno);
555
+ return -errno;
556
+ }
557
+
558
+ if (!rep->nr_zones) {
559
+ break;
560
+ }
561
+
562
+ for (i = 0; i < rep->nr_zones; i++, n++) {
563
+ ret = parse_zone(&zones[n], &blkz[i]);
564
+ if (ret != 0) {
565
+ return ret;
566
+ }
567
+
568
+ /* The next report should start after the last zone reported */
569
+ sector = blkz[i].start + blkz[i].len;
570
+ }
571
+ }
572
+
573
+ *nr_zones = n;
574
+ return 0;
575
+}
576
+#endif
577
+
578
+#if defined(CONFIG_BLKZONED)
579
+static int handle_aiocb_zone_mgmt(void *opaque)
580
+{
581
+ RawPosixAIOData *aiocb = opaque;
582
+ int fd = aiocb->aio_fildes;
583
+ uint64_t sector = aiocb->aio_offset / 512;
584
+ int64_t nr_sectors = aiocb->aio_nbytes / 512;
585
+ struct blk_zone_range range;
586
+ int ret;
587
+
588
+ /* Execute the operation */
589
+ range.sector = sector;
590
+ range.nr_sectors = nr_sectors;
591
+ do {
592
+ ret = ioctl(fd, aiocb->zone_mgmt.op, &range);
593
+ } while (ret != 0 && errno == EINTR);
594
+
595
+ return ret;
596
+}
597
+#endif
598
+
599
static int handle_aiocb_copy_range(void *opaque)
96
{
600
{
97
- qemu_co_sleep_wake(call_state->sleep_state);
601
RawPosixAIOData *aiocb = opaque;
98
+ qemu_co_sleep_wake(&call_state->sleep);
602
@@ -XXX,XX +XXX,XX @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
99
}
100
101
/*
102
diff --git a/block/nbd.c b/block/nbd.c
103
index XXXXXXX..XXXXXXX 100644
104
--- a/block/nbd.c
105
+++ b/block/nbd.c
106
@@ -XXX,XX +XXX,XX @@ typedef struct BDRVNBDState {
107
CoQueue free_sema;
108
Coroutine *connection_co;
109
Coroutine *teardown_co;
110
- QemuCoSleepState *connection_co_sleep_ns_state;
111
+ QemuCoSleep reconnect_sleep;
112
bool drained;
113
bool wait_drained_end;
114
int in_flight;
115
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn nbd_client_co_drain_begin(BlockDriverState *bs)
116
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
117
118
s->drained = true;
119
- qemu_co_sleep_wake(s->connection_co_sleep_ns_state);
120
+ qemu_co_sleep_wake(&s->reconnect_sleep);
121
122
nbd_co_establish_connection_cancel(bs, false);
123
124
@@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs)
125
126
s->state = NBD_CLIENT_QUIT;
127
if (s->connection_co) {
128
- qemu_co_sleep_wake(s->connection_co_sleep_ns_state);
129
+ qemu_co_sleep_wake(&s->reconnect_sleep);
130
nbd_co_establish_connection_cancel(bs, true);
131
}
132
if (qemu_in_coroutine()) {
133
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void nbd_co_reconnect_loop(BDRVNBDState *s)
134
}
135
bdrv_inc_in_flight(s->bs);
136
} else {
137
- qemu_co_sleep_ns_wakeable(QEMU_CLOCK_REALTIME, timeout,
138
- &s->connection_co_sleep_ns_state);
139
+ qemu_co_sleep_ns_wakeable(&s->reconnect_sleep,
140
+ QEMU_CLOCK_REALTIME, timeout);
141
if (s->drained) {
142
continue;
143
}
144
diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
145
index XXXXXXX..XXXXXXX 100644
146
--- a/util/qemu-coroutine-sleep.c
147
+++ b/util/qemu-coroutine-sleep.c
148
@@ -XXX,XX +XXX,XX @@
149
150
static const char *qemu_co_sleep_ns__scheduled = "qemu_co_sleep_ns";
151
152
-struct QemuCoSleepState {
153
+void qemu_co_sleep_wake(QemuCoSleep *w)
154
+{
155
Coroutine *co;
156
- QemuCoSleepState **user_state_pointer;
157
-};
158
159
-void qemu_co_sleep_wake(QemuCoSleepState *sleep_state)
160
-{
161
- if (sleep_state) {
162
+ co = w->to_wake;
163
+ w->to_wake = NULL;
164
+ if (co) {
165
/* Write of schedule protected by barrier write in aio_co_schedule */
166
- const char *scheduled = qatomic_cmpxchg(&sleep_state->co->scheduled,
167
+ const char *scheduled = qatomic_cmpxchg(&co->scheduled,
168
qemu_co_sleep_ns__scheduled, NULL);
169
170
assert(scheduled == qemu_co_sleep_ns__scheduled);
171
- *sleep_state->user_state_pointer = NULL;
172
- aio_co_wake(sleep_state->co);
173
+ aio_co_wake(co);
174
}
603
}
175
}
604
}
176
605
177
static void co_sleep_cb(void *opaque)
606
+/*
607
+ * zone report - Get a zone block device's information in the form
608
+ * of an array of zone descriptors.
609
+ * zones is an array of zone descriptors to hold zone information on reply;
610
+ * offset can be any byte within the entire size of the device;
611
+ * nr_zones is the maximum number of zones the command should operate on.
612
+ */
613
+#if defined(CONFIG_BLKZONED)
614
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
615
+ unsigned int *nr_zones,
616
+ BlockZoneDescriptor *zones) {
617
+ BDRVRawState *s = bs->opaque;
618
+ RawPosixAIOData acb = (RawPosixAIOData) {
619
+ .bs = bs,
620
+ .aio_fildes = s->fd,
621
+ .aio_type = QEMU_AIO_ZONE_REPORT,
622
+ .aio_offset = offset,
623
+ .zone_report = {
624
+ .nr_zones = nr_zones,
625
+ .zones = zones,
626
+ },
627
+ };
628
+
629
+ return raw_thread_pool_submit(handle_aiocb_zone_report, &acb);
630
+}
631
+#endif
632
+
633
+/*
634
+ * zone management operations - Execute an operation on a zone
635
+ */
636
+#if defined(CONFIG_BLKZONED)
637
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
638
+ int64_t offset, int64_t len) {
639
+ BDRVRawState *s = bs->opaque;
640
+ RawPosixAIOData acb;
641
+ int64_t zone_size, zone_size_mask;
642
+ const char *op_name;
643
+ unsigned long zo;
644
+ int ret;
645
+ int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
646
+
647
+ zone_size = bs->bl.zone_size;
648
+ zone_size_mask = zone_size - 1;
649
+ if (offset & zone_size_mask) {
650
+ error_report("sector offset %" PRId64 " is not aligned to zone size "
651
+ "%" PRId64 "", offset / 512, zone_size / 512);
652
+ return -EINVAL;
653
+ }
654
+
655
+ if (((offset + len) < capacity && len & zone_size_mask) ||
656
+ offset + len > capacity) {
657
+ error_report("number of sectors %" PRId64 " is not aligned to zone size"
658
+ " %" PRId64 "", len / 512, zone_size / 512);
659
+ return -EINVAL;
660
+ }
661
+
662
+ switch (op) {
663
+ case BLK_ZO_OPEN:
664
+ op_name = "BLKOPENZONE";
665
+ zo = BLKOPENZONE;
666
+ break;
667
+ case BLK_ZO_CLOSE:
668
+ op_name = "BLKCLOSEZONE";
669
+ zo = BLKCLOSEZONE;
670
+ break;
671
+ case BLK_ZO_FINISH:
672
+ op_name = "BLKFINISHZONE";
673
+ zo = BLKFINISHZONE;
674
+ break;
675
+ case BLK_ZO_RESET:
676
+ op_name = "BLKRESETZONE";
677
+ zo = BLKRESETZONE;
678
+ break;
679
+ default:
680
+ error_report("Unsupported zone op: 0x%x", op);
681
+ return -ENOTSUP;
682
+ }
683
+
684
+ acb = (RawPosixAIOData) {
685
+ .bs = bs,
686
+ .aio_fildes = s->fd,
687
+ .aio_type = QEMU_AIO_ZONE_MGMT,
688
+ .aio_offset = offset,
689
+ .aio_nbytes = len,
690
+ .zone_mgmt = {
691
+ .op = zo,
692
+ },
693
+ };
694
+
695
+ ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
696
+ if (ret != 0) {
697
+ error_report("ioctl %s failed %d", op_name, ret);
698
+ }
699
+
700
+ return ret;
701
+}
702
+#endif
703
+
704
static coroutine_fn int
705
raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
706
bool blkdev)
707
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = {
708
#ifdef __linux__
709
.bdrv_co_ioctl = hdev_co_ioctl,
710
#endif
711
+
712
+ /* zoned device */
713
+#if defined(CONFIG_BLKZONED)
714
+ /* zone management operations */
715
+ .bdrv_co_zone_report = raw_co_zone_report,
716
+ .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
717
+#endif
718
};
719
720
#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
721
diff --git a/block/io.c b/block/io.c
722
index XXXXXXX..XXXXXXX 100644
723
--- a/block/io.c
724
+++ b/block/io.c
725
@@ -XXX,XX +XXX,XX @@ out:
726
return co.ret;
727
}
728
729
+int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
730
+ unsigned int *nr_zones,
731
+ BlockZoneDescriptor *zones)
732
+{
733
+ BlockDriver *drv = bs->drv;
734
+ CoroutineIOCompletion co = {
735
+ .coroutine = qemu_coroutine_self(),
736
+ };
737
+ IO_CODE();
738
+
739
+ bdrv_inc_in_flight(bs);
740
+ if (!drv || !drv->bdrv_co_zone_report || bs->bl.zoned == BLK_Z_NONE) {
741
+ co.ret = -ENOTSUP;
742
+ goto out;
743
+ }
744
+ co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
745
+out:
746
+ bdrv_dec_in_flight(bs);
747
+ return co.ret;
748
+}
749
+
750
+int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
751
+ int64_t offset, int64_t len)
752
+{
753
+ BlockDriver *drv = bs->drv;
754
+ CoroutineIOCompletion co = {
755
+ .coroutine = qemu_coroutine_self(),
756
+ };
757
+ IO_CODE();
758
+
759
+ bdrv_inc_in_flight(bs);
760
+ if (!drv || !drv->bdrv_co_zone_mgmt || bs->bl.zoned == BLK_Z_NONE) {
761
+ co.ret = -ENOTSUP;
762
+ goto out;
763
+ }
764
+ co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
765
+out:
766
+ bdrv_dec_in_flight(bs);
767
+ return co.ret;
768
+}
769
+
770
void *qemu_blockalign(BlockDriverState *bs, size_t size)
178
{
771
{
179
- QemuCoSleepState **sleep_state = opaque;
772
IO_CODE();
180
- qemu_co_sleep_wake(*sleep_state);
773
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
181
+ QemuCoSleep *w = opaque;
774
index XXXXXXX..XXXXXXX 100644
182
+ qemu_co_sleep_wake(w);
775
--- a/qemu-io-cmds.c
183
}
776
+++ b/qemu-io-cmds.c
184
777
@@ -XXX,XX +XXX,XX @@ static const cmdinfo_t flush_cmd = {
185
-void coroutine_fn qemu_co_sleep_ns_wakeable(QEMUClockType type, int64_t ns,
778
.oneline = "flush all in-core file state to disk",
186
- QemuCoSleepState **sleep_state)
779
};
187
+void coroutine_fn qemu_co_sleep_ns_wakeable(QemuCoSleep *w,
780
188
+ QEMUClockType type, int64_t ns)
781
+static inline int64_t tosector(int64_t bytes)
189
{
782
+{
190
+ Coroutine *co = qemu_coroutine_self();
783
+ return bytes >> BDRV_SECTOR_BITS;
191
AioContext *ctx = qemu_get_current_aio_context();
784
+}
192
QEMUTimer ts;
785
+
193
- QemuCoSleepState state = {
786
+static int zone_report_f(BlockBackend *blk, int argc, char **argv)
194
- .co = qemu_coroutine_self(),
787
+{
195
- .user_state_pointer = sleep_state,
788
+ int ret;
196
- };
789
+ int64_t offset;
197
790
+ unsigned int nr_zones;
198
- const char *scheduled = qatomic_cmpxchg(&state.co->scheduled, NULL,
791
+
199
- qemu_co_sleep_ns__scheduled);
792
+ ++optind;
200
+ const char *scheduled = qatomic_cmpxchg(&co->scheduled, NULL,
793
+ offset = cvtnum(argv[optind]);
201
+ qemu_co_sleep_ns__scheduled);
794
+ ++optind;
202
if (scheduled) {
795
+ nr_zones = cvtnum(argv[optind]);
203
fprintf(stderr,
796
+
204
"%s: Co-routine was already scheduled in '%s'\n",
797
+ g_autofree BlockZoneDescriptor *zones = NULL;
205
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_sleep_ns_wakeable(QEMUClockType type, int64_t ns,
798
+ zones = g_new(BlockZoneDescriptor, nr_zones);
206
abort();
799
+ ret = blk_zone_report(blk, offset, &nr_zones, zones);
207
}
800
+ if (ret < 0) {
208
801
+ printf("zone report failed: %s\n", strerror(-ret));
209
- aio_timer_init(ctx, &ts, type, SCALE_NS, co_sleep_cb, sleep_state);
802
+ } else {
210
- *sleep_state = &state;
803
+ for (int i = 0; i < nr_zones; ++i) {
211
+ w->to_wake = co;
804
+ printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", "
212
+ aio_timer_init(ctx, &ts, type, SCALE_NS, co_sleep_cb, w),
805
+ "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", "
213
timer_mod(&ts, qemu_clock_get_ns(type) + ns);
806
+ "zcond:%u, [type: %u]\n",
214
qemu_coroutine_yield();
807
+ tosector(zones[i].start), tosector(zones[i].length),
215
timer_del(&ts);
808
+ tosector(zones[i].cap), tosector(zones[i].wp),
216
809
+ zones[i].state, zones[i].type);
217
- /* qemu_co_sleep_wake clears *sleep_state before resuming this coroutine. */
810
+ }
218
- assert(*sleep_state == NULL);
811
+ }
219
+ /* w->to_wake is cleared before resuming this coroutine. */
812
+ return ret;
220
+ assert(w->to_wake == NULL);
813
+}
221
}
814
+
815
+static const cmdinfo_t zone_report_cmd = {
816
+ .name = "zone_report",
817
+ .altname = "zrp",
818
+ .cfunc = zone_report_f,
819
+ .argmin = 2,
820
+ .argmax = 2,
821
+ .args = "offset number",
822
+ .oneline = "report zone information",
823
+};
824
+
825
+static int zone_open_f(BlockBackend *blk, int argc, char **argv)
826
+{
827
+ int ret;
828
+ int64_t offset, len;
829
+ ++optind;
830
+ offset = cvtnum(argv[optind]);
831
+ ++optind;
832
+ len = cvtnum(argv[optind]);
833
+ ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len);
834
+ if (ret < 0) {
835
+ printf("zone open failed: %s\n", strerror(-ret));
836
+ }
837
+ return ret;
838
+}
839
+
840
+static const cmdinfo_t zone_open_cmd = {
841
+ .name = "zone_open",
842
+ .altname = "zo",
843
+ .cfunc = zone_open_f,
844
+ .argmin = 2,
845
+ .argmax = 2,
846
+ .args = "offset len",
847
+ .oneline = "explicit open a range of zones in zone block device",
848
+};
849
+
850
+static int zone_close_f(BlockBackend *blk, int argc, char **argv)
851
+{
852
+ int ret;
853
+ int64_t offset, len;
854
+ ++optind;
855
+ offset = cvtnum(argv[optind]);
856
+ ++optind;
857
+ len = cvtnum(argv[optind]);
858
+ ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len);
859
+ if (ret < 0) {
860
+ printf("zone close failed: %s\n", strerror(-ret));
861
+ }
862
+ return ret;
863
+}
864
+
865
+static const cmdinfo_t zone_close_cmd = {
866
+ .name = "zone_close",
867
+ .altname = "zc",
868
+ .cfunc = zone_close_f,
869
+ .argmin = 2,
870
+ .argmax = 2,
871
+ .args = "offset len",
872
+ .oneline = "close a range of zones in zone block device",
873
+};
874
+
875
+static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
876
+{
877
+ int ret;
878
+ int64_t offset, len;
879
+ ++optind;
880
+ offset = cvtnum(argv[optind]);
881
+ ++optind;
882
+ len = cvtnum(argv[optind]);
883
+ ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len);
884
+ if (ret < 0) {
885
+ printf("zone finish failed: %s\n", strerror(-ret));
886
+ }
887
+ return ret;
888
+}
889
+
890
+static const cmdinfo_t zone_finish_cmd = {
891
+ .name = "zone_finish",
892
+ .altname = "zf",
893
+ .cfunc = zone_finish_f,
894
+ .argmin = 2,
895
+ .argmax = 2,
896
+ .args = "offset len",
897
+ .oneline = "finish a range of zones in zone block device",
898
+};
899
+
900
+static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
901
+{
902
+ int ret;
903
+ int64_t offset, len;
904
+ ++optind;
905
+ offset = cvtnum(argv[optind]);
906
+ ++optind;
907
+ len = cvtnum(argv[optind]);
908
+ ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len);
909
+ if (ret < 0) {
910
+ printf("zone reset failed: %s\n", strerror(-ret));
911
+ }
912
+ return ret;
913
+}
914
+
915
+static const cmdinfo_t zone_reset_cmd = {
916
+ .name = "zone_reset",
917
+ .altname = "zrs",
918
+ .cfunc = zone_reset_f,
919
+ .argmin = 2,
920
+ .argmax = 2,
921
+ .args = "offset len",
922
+ .oneline = "reset a zone write pointer in zone block device",
923
+};
924
+
925
static int truncate_f(BlockBackend *blk, int argc, char **argv);
926
static const cmdinfo_t truncate_cmd = {
927
.name = "truncate",
928
@@ -XXX,XX +XXX,XX @@ static void __attribute((constructor)) init_qemuio_commands(void)
929
qemuio_add_command(&aio_write_cmd);
930
qemuio_add_command(&aio_flush_cmd);
931
qemuio_add_command(&flush_cmd);
932
+ qemuio_add_command(&zone_report_cmd);
933
+ qemuio_add_command(&zone_open_cmd);
934
+ qemuio_add_command(&zone_close_cmd);
935
+ qemuio_add_command(&zone_finish_cmd);
936
+ qemuio_add_command(&zone_reset_cmd);
937
qemuio_add_command(&truncate_cmd);
938
qemuio_add_command(&length_cmd);
939
qemuio_add_command(&info_cmd);
222
--
940
--
223
2.31.1
941
2.40.1
224
942
943
diff view generated by jsdifflib
New patch
1
From: Sam Li <faithilikerun@gmail.com>
1
2
3
raw-format driver usually sits on top of file-posix driver. It needs to
4
pass through requests of zone commands.
5
6
Signed-off-by: Sam Li <faithilikerun@gmail.com>
7
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
8
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
9
Reviewed-by: Hannes Reinecke <hare@suse.de>
10
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
11
Acked-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Message-id: 20230508045533.175575-5-faithilikerun@gmail.com
14
Message-id: 20230324090605.28361-5-faithilikerun@gmail.com
15
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
16
<philmd@linaro.org>.
17
--Stefan]
18
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
19
---
20
block/raw-format.c | 17 +++++++++++++++++
21
1 file changed, 17 insertions(+)
22
23
diff --git a/block/raw-format.c b/block/raw-format.c
24
index XXXXXXX..XXXXXXX 100644
25
--- a/block/raw-format.c
26
+++ b/block/raw-format.c
27
@@ -XXX,XX +XXX,XX @@ raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
28
return bdrv_co_pdiscard(bs->file, offset, bytes);
29
}
30
31
+static int coroutine_fn GRAPH_RDLOCK
32
+raw_co_zone_report(BlockDriverState *bs, int64_t offset,
33
+ unsigned int *nr_zones,
34
+ BlockZoneDescriptor *zones)
35
+{
36
+ return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
37
+}
38
+
39
+static int coroutine_fn GRAPH_RDLOCK
40
+raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
41
+ int64_t offset, int64_t len)
42
+{
43
+ return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
44
+}
45
+
46
static int64_t coroutine_fn GRAPH_RDLOCK
47
raw_co_getlength(BlockDriverState *bs)
48
{
49
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_raw = {
50
.bdrv_co_pwritev = &raw_co_pwritev,
51
.bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
52
.bdrv_co_pdiscard = &raw_co_pdiscard,
53
+ .bdrv_co_zone_report = &raw_co_zone_report,
54
+ .bdrv_co_zone_mgmt = &raw_co_zone_mgmt,
55
.bdrv_co_block_status = &raw_co_block_status,
56
.bdrv_co_copy_range_from = &raw_co_copy_range_from,
57
.bdrv_co_copy_range_to = &raw_co_copy_range_to,
58
--
59
2.40.1
60
61
diff view generated by jsdifflib
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
Allow using QemuCoSleep to sleep forever until woken by qemu_co_sleep_wake.
3
Putting zoned/non-zoned BlockDrivers on top of each other is not
4
This makes the logic of qemu_co_sleep_ns_wakeable easy to understand.
4
allowed.
5
5
6
In the future we will introduce an API that can work even if the
6
Signed-off-by: Sam Li <faithilikerun@gmail.com>
7
sleep and wake happen from different threads. For now, initializing
7
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
8
w->to_wake after timer_mod is fine because the timer can only fire in
8
Reviewed-by: Hannes Reinecke <hare@suse.de>
9
the same AioContext.
9
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
10
10
Acked-by: Kevin Wolf <kwolf@redhat.com>
11
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
12
Message-id: 20230508045533.175575-6-faithilikerun@gmail.com
13
Message-id: 20210517100548.28806-7-pbonzini@redhat.com
13
Message-id: 20230324090605.28361-6-faithilikerun@gmail.com
14
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
15
<philmd@linaro.org> and clarify that the check is about zoned
16
BlockDrivers.
17
--Stefan]
14
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
18
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
15
---
19
---
16
include/qemu/coroutine.h | 5 +++++
20
include/block/block_int-common.h | 5 +++++
17
util/qemu-coroutine-sleep.c | 26 +++++++++++++++++++-------
21
block.c | 19 +++++++++++++++++++
18
2 files changed, 24 insertions(+), 7 deletions(-)
22
block/file-posix.c | 12 ++++++++++++
23
block/raw-format.c | 1 +
24
4 files changed, 37 insertions(+)
19
25
20
diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
26
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
21
index XXXXXXX..XXXXXXX 100644
27
index XXXXXXX..XXXXXXX 100644
22
--- a/include/qemu/coroutine.h
28
--- a/include/block/block_int-common.h
23
+++ b/include/qemu/coroutine.h
29
+++ b/include/block/block_int-common.h
24
@@ -XXX,XX +XXX,XX @@ typedef struct QemuCoSleep {
30
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
25
void coroutine_fn qemu_co_sleep_ns_wakeable(QemuCoSleep *w,
31
*/
26
QEMUClockType type, int64_t ns);
32
bool is_format;
27
33
28
+/**
34
+ /*
29
+ * Yield the coroutine until the next call to qemu_co_sleep_wake.
35
+ * Set to true if the BlockDriver supports zoned children.
30
+ */
36
+ */
31
+void coroutine_fn qemu_co_sleep(QemuCoSleep *w);
37
+ bool supports_zoned_children;
32
+
38
+
33
static inline void coroutine_fn qemu_co_sleep_ns(QEMUClockType type, int64_t ns)
39
/*
34
{
40
* Drivers not implementing bdrv_parse_filename nor bdrv_open should have
35
QemuCoSleep w = { 0 };
41
* this field set to true, except ones that are defined only by their
36
diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
42
diff --git a/block.c b/block.c
37
index XXXXXXX..XXXXXXX 100644
43
index XXXXXXX..XXXXXXX 100644
38
--- a/util/qemu-coroutine-sleep.c
44
--- a/block.c
39
+++ b/util/qemu-coroutine-sleep.c
45
+++ b/block.c
40
@@ -XXX,XX +XXX,XX @@ static void co_sleep_cb(void *opaque)
46
@@ -XXX,XX +XXX,XX @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
41
qemu_co_sleep_wake(w);
47
return;
42
}
43
44
-void coroutine_fn qemu_co_sleep_ns_wakeable(QemuCoSleep *w,
45
- QEMUClockType type, int64_t ns)
46
+void coroutine_fn qemu_co_sleep(QemuCoSleep *w)
47
{
48
Coroutine *co = qemu_coroutine_self();
49
- AioContext *ctx = qemu_get_current_aio_context();
50
- QEMUTimer ts;
51
52
const char *scheduled = qatomic_cmpxchg(&co->scheduled, NULL,
53
qemu_co_sleep_ns__scheduled);
54
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_sleep_ns_wakeable(QemuCoSleep *w,
55
}
48
}
56
49
57
w->to_wake = co;
50
+ /*
58
- aio_timer_init(ctx, &ts, type, SCALE_NS, co_sleep_cb, w),
51
+ * Non-zoned block drivers do not follow zoned storage constraints
59
- timer_mod(&ts, qemu_clock_get_ns(type) + ns);
52
+ * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
60
qemu_coroutine_yield();
53
+ * drivers in a graph.
61
- timer_del(&ts);
54
+ */
62
55
+ if (!parent_bs->drv->supports_zoned_children &&
63
/* w->to_wake is cleared before resuming this coroutine. */
56
+ child_bs->bl.zoned == BLK_Z_HM) {
64
assert(w->to_wake == NULL);
57
+ /*
65
}
58
+ * The host-aware model allows zoned storage constraints and random
59
+ * write. Allow mixing host-aware and non-zoned drivers, using a
60
+ * host-aware device as a regular device.
61
+ */
62
+ error_setg(errp, "Cannot add a %s child to a %s parent",
63
+ child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
64
+ parent_bs->drv->supports_zoned_children ?
65
+ "support zoned children" : "not support zoned children");
66
+ return;
67
+ }
66
+
68
+
67
+void coroutine_fn qemu_co_sleep_ns_wakeable(QemuCoSleep *w,
69
if (!QLIST_EMPTY(&child_bs->parents)) {
68
+ QEMUClockType type, int64_t ns)
70
error_setg(errp, "The node %s already has a parent",
69
+{
71
child_bs->node_name);
70
+ AioContext *ctx = qemu_get_current_aio_context();
72
diff --git a/block/file-posix.c b/block/file-posix.c
71
+ QEMUTimer ts;
73
index XXXXXXX..XXXXXXX 100644
72
+
74
--- a/block/file-posix.c
73
+ aio_timer_init(ctx, &ts, type, SCALE_NS, co_sleep_cb, w);
75
+++ b/block/file-posix.c
74
+ timer_mod(&ts, qemu_clock_get_ns(type) + ns);
76
@@ -XXX,XX +XXX,XX @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
75
+
77
goto fail;
78
}
79
}
80
+#ifdef CONFIG_BLKZONED
76
+ /*
81
+ /*
77
+ * The timer will fire in the current AioContext, so the callback
82
+ * The kernel page cache does not reliably work for writes to SWR zones
78
+ * must happen after qemu_co_sleep yields and there is no race
83
+ * of a zoned block device because it cannot guarantee the order of writes.
79
+ * between timer_mod and qemu_co_sleep.
80
+ */
84
+ */
81
+ qemu_co_sleep(w);
85
+ if ((bs->bl.zoned != BLK_Z_NONE) &&
82
+ timer_del(&ts);
86
+ (!(s->open_flags & O_DIRECT))) {
83
+}
87
+ error_setg(errp, "The driver supports zoned devices, and it requires "
88
+ "cache.direct=on, which was not specified.");
89
+ return -EINVAL; /* No host kernel page cache */
90
+ }
91
+#endif
92
93
if (S_ISBLK(st.st_mode)) {
94
#ifdef __linux__
95
diff --git a/block/raw-format.c b/block/raw-format.c
96
index XXXXXXX..XXXXXXX 100644
97
--- a/block/raw-format.c
98
+++ b/block/raw-format.c
99
@@ -XXX,XX +XXX,XX @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild *c,
100
BlockDriver bdrv_raw = {
101
.format_name = "raw",
102
.instance_size = sizeof(BDRVRawState),
103
+ .supports_zoned_children = true,
104
.bdrv_probe = &raw_probe,
105
.bdrv_reopen_prepare = &raw_reopen_prepare,
106
.bdrv_reopen_commit = &raw_reopen_commit,
84
--
107
--
85
2.31.1
108
2.40.1
86
109
110
diff view generated by jsdifflib
New patch
1
From: Sam Li <faithilikerun@gmail.com>
1
2
3
The new block layer APIs of zoned block devices can be tested by:
4
$ tests/qemu-iotests/check zoned
5
Run each zone operation on a newly created null_blk device
6
and see whether it outputs the same zone information.
7
8
Signed-off-by: Sam Li <faithilikerun@gmail.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Acked-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Message-id: 20230508045533.175575-7-faithilikerun@gmail.com
13
Message-id: 20230324090605.28361-7-faithilikerun@gmail.com
14
[Adjust commit message prefix as suggested by Philippe Mathieu-Daudé
15
<philmd@linaro.org>.
16
--Stefan]
17
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
18
---
19
tests/qemu-iotests/tests/zoned | 89 ++++++++++++++++++++++++++++++
20
tests/qemu-iotests/tests/zoned.out | 53 ++++++++++++++++++
21
2 files changed, 142 insertions(+)
22
create mode 100755 tests/qemu-iotests/tests/zoned
23
create mode 100644 tests/qemu-iotests/tests/zoned.out
24
25
diff --git a/tests/qemu-iotests/tests/zoned b/tests/qemu-iotests/tests/zoned
26
new file mode 100755
27
index XXXXXXX..XXXXXXX
28
--- /dev/null
29
+++ b/tests/qemu-iotests/tests/zoned
30
@@ -XXX,XX +XXX,XX @@
31
+#!/usr/bin/env bash
32
+#
33
+# Test zone management operations.
34
+#
35
+
36
+seq="$(basename $0)"
37
+echo "QA output created by $seq"
38
+status=1 # failure is the default!
39
+
40
+_cleanup()
41
+{
42
+ _cleanup_test_img
43
+ sudo -n rmmod null_blk
44
+}
45
+trap "_cleanup; exit \$status" 0 1 2 3 15
46
+
47
+# get standard environment, filters and checks
48
+. ../common.rc
49
+. ../common.filter
50
+. ../common.qemu
51
+
52
+# This test only runs on Linux hosts with raw image files.
53
+_supported_fmt raw
54
+_supported_proto file
55
+_supported_os Linux
56
+
57
+sudo -n true || \
58
+ _notrun 'Password-less sudo required'
59
+
60
+IMG="--image-opts -n driver=host_device,filename=/dev/nullb0"
61
+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
62
+
63
+echo "Testing a null_blk device:"
64
+echo "case 1: if the operations work"
65
+sudo -n modprobe null_blk nr_devices=1 zoned=1
66
+sudo -n chmod 0666 /dev/nullb0
67
+
68
+echo "(1) report the first zone:"
69
+$QEMU_IO $IMG -c "zrp 0 1"
70
+echo
71
+echo "report the first 10 zones"
72
+$QEMU_IO $IMG -c "zrp 0 10"
73
+echo
74
+echo "report the last zone:"
75
+$QEMU_IO $IMG -c "zrp 0x3e70000000 2" # 0x3e70000000 / 512 = 0x1f380000
76
+echo
77
+echo
78
+echo "(2) opening the first zone"
79
+$QEMU_IO $IMG -c "zo 0 268435456" # 268435456 / 512 = 524288
80
+echo "report after:"
81
+$QEMU_IO $IMG -c "zrp 0 1"
82
+echo
83
+echo "opening the second zone"
84
+$QEMU_IO $IMG -c "zo 268435456 268435456" #
85
+echo "report after:"
86
+$QEMU_IO $IMG -c "zrp 268435456 1"
87
+echo
88
+echo "opening the last zone"
89
+$QEMU_IO $IMG -c "zo 0x3e70000000 268435456"
90
+echo "report after:"
91
+$QEMU_IO $IMG -c "zrp 0x3e70000000 2"
92
+echo
93
+echo
94
+echo "(3) closing the first zone"
95
+$QEMU_IO $IMG -c "zc 0 268435456"
96
+echo "report after:"
97
+$QEMU_IO $IMG -c "zrp 0 1"
98
+echo
99
+echo "closing the last zone"
100
+$QEMU_IO $IMG -c "zc 0x3e70000000 268435456"
101
+echo "report after:"
102
+$QEMU_IO $IMG -c "zrp 0x3e70000000 2"
103
+echo
104
+echo
105
+echo "(4) finishing the second zone"
106
+$QEMU_IO $IMG -c "zf 268435456 268435456"
107
+echo "After finishing a zone:"
108
+$QEMU_IO $IMG -c "zrp 268435456 1"
109
+echo
110
+echo
111
+echo "(5) resetting the second zone"
112
+$QEMU_IO $IMG -c "zrs 268435456 268435456"
113
+echo "After resetting a zone:"
114
+$QEMU_IO $IMG -c "zrp 268435456 1"
115
+
116
+# success, all done
117
+echo "*** done"
118
+rm -f $seq.full
119
+status=0
120
diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-iotests/tests/zoned.out
121
new file mode 100644
122
index XXXXXXX..XXXXXXX
123
--- /dev/null
124
+++ b/tests/qemu-iotests/tests/zoned.out
125
@@ -XXX,XX +XXX,XX @@
126
+QA output created by zoned
127
+Testing a null_blk device:
128
+case 1: if the operations work
129
+(1) report the first zone:
130
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
131
+
132
+report the first 10 zones
133
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
134
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
135
+start: 0x100000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:1, [type: 2]
136
+start: 0x180000, len 0x80000, cap 0x80000, wptr 0x180000, zcond:1, [type: 2]
137
+start: 0x200000, len 0x80000, cap 0x80000, wptr 0x200000, zcond:1, [type: 2]
138
+start: 0x280000, len 0x80000, cap 0x80000, wptr 0x280000, zcond:1, [type: 2]
139
+start: 0x300000, len 0x80000, cap 0x80000, wptr 0x300000, zcond:1, [type: 2]
140
+start: 0x380000, len 0x80000, cap 0x80000, wptr 0x380000, zcond:1, [type: 2]
141
+start: 0x400000, len 0x80000, cap 0x80000, wptr 0x400000, zcond:1, [type: 2]
142
+start: 0x480000, len 0x80000, cap 0x80000, wptr 0x480000, zcond:1, [type: 2]
143
+
144
+report the last zone:
145
+start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:1, [type: 2]
146
+
147
+
148
+(2) opening the first zone
149
+report after:
150
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:3, [type: 2]
151
+
152
+opening the second zone
153
+report after:
154
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:3, [type: 2]
155
+
156
+opening the last zone
157
+report after:
158
+start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:3, [type: 2]
159
+
160
+
161
+(3) closing the first zone
162
+report after:
163
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
164
+
165
+closing the last zone
166
+report after:
167
+start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:1, [type: 2]
168
+
169
+
170
+(4) finishing the second zone
171
+After finishing a zone:
172
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:14, [type: 2]
173
+
174
+
175
+(5) resetting the second zone
176
+After resetting a zone:
177
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
178
+*** done
179
--
180
2.40.1
181
182
diff view generated by jsdifflib
New patch
1
From: Sam Li <faithilikerun@gmail.com>
1
2
3
Signed-off-by: Sam Li <faithilikerun@gmail.com>
4
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
5
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
6
Acked-by: Kevin Wolf <kwolf@redhat.com>
7
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
8
Message-id: 20230508045533.175575-8-faithilikerun@gmail.com
9
Message-id: 20230324090605.28361-8-faithilikerun@gmail.com
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
---
12
block/file-posix.c | 3 +++
13
block/trace-events | 2 ++
14
2 files changed, 5 insertions(+)
15
16
diff --git a/block/file-posix.c b/block/file-posix.c
17
index XXXXXXX..XXXXXXX 100644
18
--- a/block/file-posix.c
19
+++ b/block/file-posix.c
20
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
21
},
22
};
23
24
+ trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS);
25
return raw_thread_pool_submit(handle_aiocb_zone_report, &acb);
26
}
27
#endif
28
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
29
},
30
};
31
32
+ trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS,
33
+ len >> BDRV_SECTOR_BITS);
34
ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
35
if (ret != 0) {
36
error_report("ioctl %s failed %d", op_name, ret);
37
diff --git a/block/trace-events b/block/trace-events
38
index XXXXXXX..XXXXXXX 100644
39
--- a/block/trace-events
40
+++ b/block/trace-events
41
@@ -XXX,XX +XXX,XX @@ file_FindEjectableOpticalMedia(const char *media) "Matching using %s"
42
file_setup_cdrom(const char *partition) "Using %s as optical disc"
43
file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d"
44
file_flush_fdatasync_failed(int err) "errno %d"
45
+zbd_zone_report(void *bs, unsigned int nr_zones, int64_t sector) "bs %p report %d zones starting at sector offset 0x%" PRIx64 ""
46
+zbd_zone_mgmt(void *bs, const char *op_name, int64_t sector, int64_t len) "bs %p %s starts at sector offset 0x%" PRIx64 " over a range of 0x%" PRIx64 " sectors"
47
48
# ssh.c
49
sftp_error(const char *op, const char *ssh_err, int ssh_err_code, int sftp_err_code) "%s failed: %s (libssh error code: %d, sftp error code: %d)"
50
--
51
2.40.1
diff view generated by jsdifflib
New patch
1
From: Sam Li <faithilikerun@gmail.com>
1
2
3
Add the documentation about the zoned device support to virtio-blk
4
emulation.
5
6
Signed-off-by: Sam Li <faithilikerun@gmail.com>
7
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
8
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
9
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
10
Acked-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Message-id: 20230508045533.175575-9-faithilikerun@gmail.com
13
Message-id: 20230324090605.28361-9-faithilikerun@gmail.com
14
[Add index-api.rst to fix "zoned-storage.rst:document isn't included in
15
any toctree" error and fix pre-formatted code syntax.
16
--Stefan]
17
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
18
---
19
docs/devel/index-api.rst | 1 +
20
docs/devel/zoned-storage.rst | 43 ++++++++++++++++++++++++++
21
docs/system/qemu-block-drivers.rst.inc | 6 ++++
22
3 files changed, 50 insertions(+)
23
create mode 100644 docs/devel/zoned-storage.rst
24
25
diff --git a/docs/devel/index-api.rst b/docs/devel/index-api.rst
26
index XXXXXXX..XXXXXXX 100644
27
--- a/docs/devel/index-api.rst
28
+++ b/docs/devel/index-api.rst
29
@@ -XXX,XX +XXX,XX @@ generated from in-code annotations to function prototypes.
30
memory
31
modules
32
ui
33
+ zoned-storage
34
diff --git a/docs/devel/zoned-storage.rst b/docs/devel/zoned-storage.rst
35
new file mode 100644
36
index XXXXXXX..XXXXXXX
37
--- /dev/null
38
+++ b/docs/devel/zoned-storage.rst
39
@@ -XXX,XX +XXX,XX @@
40
+=============
41
+zoned-storage
42
+=============
43
+
44
+Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones
45
+that are larger than the LBA size. They only allow sequential writes, which
46
+can reduce write amplification in SSDs, and potentially lead to higher
47
+throughput and increased capacity. More details about ZBDs can be found at:
48
+
49
+https://zonedstorage.io/docs/introduction/zoned-storage
50
+
51
+1. Block layer APIs for zoned storage
52
+-------------------------------------
53
+QEMU block layer supports three zoned storage models:
54
+- BLK_Z_HM: The host-managed zoned model only allows sequential write access
55
+to zones. It supports ZBD-specific I/O commands that can be used by a host to
56
+manage the zones of a device.
57
+- BLK_Z_HA: The host-aware zoned model allows random write operations in
58
+zones, making it backward compatible with regular block devices.
59
+- BLK_Z_NONE: The non-zoned model has no zone support. It includes both
60
+regular and drive-managed ZBD devices. ZBD-specific I/O commands are not
61
+supported.
62
+
63
+The block device information resides inside BlockDriverState. QEMU uses
64
+BlockLimits struct (BlockDriverState::bl) that is continuously accessed by the
65
+block layer while processing I/O requests. A BlockBackend has a root pointer to
66
+a BlockDriverState graph (for example, raw format on top of file-posix). The
67
+zoned storage information can be propagated from the leaf BlockDriverState all
68
+the way up to the BlockBackend. If the zoned storage model in file-posix is
69
+set to BLK_Z_HM, then block drivers will declare support for zoned host devices.
70
+
71
+The block layer APIs support commands needed for zoned storage devices,
72
+including report zones, four zone operations, and zone append.
73
+
74
+2. Emulating zoned storage controllers
75
+--------------------------------------
76
+When the BlockBackend's BlockLimits model reports a zoned storage device, users
77
+like the virtio-blk emulation or the qemu-io-cmds.c utility can use block layer
78
+APIs for zoned storage emulation or testing.
79
+
80
+For example, zone_report can be tested on a null_blk device using qemu-io::
81
+
82
+ $ path/to/qemu-io --image-opts -n driver=host_device,filename=/dev/nullb0 -c "zrp offset nr_zones"
83
diff --git a/docs/system/qemu-block-drivers.rst.inc b/docs/system/qemu-block-drivers.rst.inc
84
index XXXXXXX..XXXXXXX 100644
85
--- a/docs/system/qemu-block-drivers.rst.inc
86
+++ b/docs/system/qemu-block-drivers.rst.inc
87
@@ -XXX,XX +XXX,XX @@ Hard disks
88
you may corrupt your host data (use the ``-snapshot`` command
89
line option or modify the device permissions accordingly).
90
91
+Zoned block devices
92
+ Zoned block devices can be passed through to the guest if the emulated storage
93
+ controller supports zoned storage. Use ``--blockdev host_device,
94
+ node-name=drive0,filename=/dev/nullb0,cache.direct=on`` to pass through
95
+ ``/dev/nullb0`` as ``drive0``.
96
+
97
Windows
98
^^^^^^^
99
100
--
101
2.40.1
diff view generated by jsdifflib
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
The lifetime of the timer is well-known (it cannot outlive
3
Since Linux doesn't have a user API to issue zone append operations to
4
qemu_co_sleep_ns_wakeable, because it's deleted by the time the
4
zoned devices from user space, the file-posix driver is modified to add
5
coroutine resumes), so it is not necessary to place it on the heap.
5
zone append emulation using regular writes. To do this, the file-posix
6
driver tracks the wp location of all zones of the device. It uses an
7
array of uint64_t. The most significant bit of each wp location indicates
8
if the zone is a conventional zone.
6
9
7
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
10
The zones' wp can change due to the following operations:
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
11
- zone reset: change the wp to the start offset of that zone
9
Message-id: 20210517100548.28806-2-pbonzini@redhat.com
12
- zone finish: change to the end location of that zone
13
- write to a zone
14
- zone append
15
16
Signed-off-by: Sam Li <faithilikerun@gmail.com>
17
Message-id: 20230508051510.177850-2-faithilikerun@gmail.com
18
[Fix errno propagation from handle_aiocb_zone_mgmt()
19
--Stefan]
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
20
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
---
21
---
12
util/qemu-coroutine-sleep.c | 9 ++++-----
22
include/block/block-common.h | 14 +++
13
1 file changed, 4 insertions(+), 5 deletions(-)
23
include/block/block_int-common.h | 5 +
24
block/file-posix.c | 178 ++++++++++++++++++++++++++++++-
25
3 files changed, 193 insertions(+), 4 deletions(-)
14
26
15
diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
27
diff --git a/include/block/block-common.h b/include/block/block-common.h
16
index XXXXXXX..XXXXXXX 100644
28
index XXXXXXX..XXXXXXX 100644
17
--- a/util/qemu-coroutine-sleep.c
29
--- a/include/block/block-common.h
18
+++ b/util/qemu-coroutine-sleep.c
30
+++ b/include/block/block-common.h
19
@@ -XXX,XX +XXX,XX @@ static const char *qemu_co_sleep_ns__scheduled = "qemu_co_sleep_ns";
31
@@ -XXX,XX +XXX,XX @@ typedef struct BlockZoneDescriptor {
20
32
BlockZoneState state;
21
struct QemuCoSleepState {
33
} BlockZoneDescriptor;
22
Coroutine *co;
34
23
- QEMUTimer *ts;
35
+/*
24
+ QEMUTimer ts;
36
+ * Track write pointers of a zone in bytes.
25
QemuCoSleepState **user_state_pointer;
37
+ */
38
+typedef struct BlockZoneWps {
39
+ CoMutex colock;
40
+ uint64_t wp[];
41
+} BlockZoneWps;
42
+
43
typedef struct BlockDriverInfo {
44
/* in bytes, 0 if irrelevant */
45
int cluster_size;
46
@@ -XXX,XX +XXX,XX @@ typedef enum {
47
#define BDRV_SECTOR_BITS 9
48
#define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS)
49
50
+/*
51
+ * Get the first most significant bit of wp. If it is zero, then
52
+ * the zone type is SWR.
53
+ */
54
+#define BDRV_ZT_IS_CONV(wp) (wp & (1ULL << 63))
55
+
56
#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
57
INT_MAX >> BDRV_SECTOR_BITS)
58
#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
59
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
60
index XXXXXXX..XXXXXXX 100644
61
--- a/include/block/block_int-common.h
62
+++ b/include/block/block_int-common.h
63
@@ -XXX,XX +XXX,XX @@ typedef struct BlockLimits {
64
65
/* maximum number of active zones */
66
uint32_t max_active_zones;
67
+
68
+ uint32_t write_granularity;
69
} BlockLimits;
70
71
typedef struct BdrvOpBlocker BdrvOpBlocker;
72
@@ -XXX,XX +XXX,XX @@ struct BlockDriverState {
73
CoMutex bsc_modify_lock;
74
/* Always non-NULL, but must only be dereferenced under an RCU read guard */
75
BdrvBlockStatusCache *block_status_cache;
76
+
77
+ /* array of write pointers' location of each zone in the zoned device. */
78
+ BlockZoneWps *wps;
26
};
79
};
27
80
28
@@ -XXX,XX +XXX,XX @@ void qemu_co_sleep_wake(QemuCoSleepState *sleep_state)
81
struct BlockBackendRootState {
29
if (sleep_state->user_state_pointer) {
82
diff --git a/block/file-posix.c b/block/file-posix.c
30
*sleep_state->user_state_pointer = NULL;
83
index XXXXXXX..XXXXXXX 100644
31
}
84
--- a/block/file-posix.c
32
- timer_del(sleep_state->ts);
85
+++ b/block/file-posix.c
33
+ timer_del(&sleep_state->ts);
86
@@ -XXX,XX +XXX,XX @@ static int hdev_get_max_segments(int fd, struct stat *st)
34
aio_co_wake(sleep_state->co);
35
}
87
}
36
88
37
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_sleep_ns_wakeable(QEMUClockType type, int64_t ns,
89
#if defined(CONFIG_BLKZONED)
38
AioContext *ctx = qemu_get_current_aio_context();
90
+/*
39
QemuCoSleepState state = {
91
+ * If the reset_all flag is true, then the wps of zones whose state is
40
.co = qemu_coroutine_self(),
92
+ * not readonly or offline should be all reset to the start sector.
41
- .ts = aio_timer_new(ctx, type, SCALE_NS, co_sleep_cb, &state),
93
+ * Else, take the real wp of the device.
42
.user_state_pointer = sleep_state,
94
+ */
95
+static int get_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
96
+ unsigned int nrz, bool reset_all)
97
+{
98
+ struct blk_zone *blkz;
99
+ size_t rep_size;
100
+ uint64_t sector = offset >> BDRV_SECTOR_BITS;
101
+ BlockZoneWps *wps = bs->wps;
102
+ unsigned int j = offset / bs->bl.zone_size;
103
+ unsigned int n = 0, i = 0;
104
+ int ret;
105
+ rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
106
+ g_autofree struct blk_zone_report *rep = NULL;
107
+
108
+ rep = g_malloc(rep_size);
109
+ blkz = (struct blk_zone *)(rep + 1);
110
+ while (n < nrz) {
111
+ memset(rep, 0, rep_size);
112
+ rep->sector = sector;
113
+ rep->nr_zones = nrz - n;
114
+
115
+ do {
116
+ ret = ioctl(fd, BLKREPORTZONE, rep);
117
+ } while (ret != 0 && errno == EINTR);
118
+ if (ret != 0) {
119
+ error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
120
+ fd, offset, errno);
121
+ return -errno;
122
+ }
123
+
124
+ if (!rep->nr_zones) {
125
+ break;
126
+ }
127
+
128
+ for (i = 0; i < rep->nr_zones; ++i, ++n, ++j) {
129
+ /*
130
+ * The wp tracking cares only about sequential writes required and
131
+ * sequential write preferred zones so that the wp can advance to
132
+ * the right location.
133
+ * Use the most significant bit of the wp location to indicate the
134
+ * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
135
+ */
136
+ if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
137
+ wps->wp[j] |= 1ULL << 63;
138
+ } else {
139
+ switch(blkz[i].cond) {
140
+ case BLK_ZONE_COND_FULL:
141
+ case BLK_ZONE_COND_READONLY:
142
+ /* Zone not writable */
143
+ wps->wp[j] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
144
+ break;
145
+ case BLK_ZONE_COND_OFFLINE:
146
+ /* Zone not writable nor readable */
147
+ wps->wp[j] = (blkz[i].start) << BDRV_SECTOR_BITS;
148
+ break;
149
+ default:
150
+ if (reset_all) {
151
+ wps->wp[j] = blkz[i].start << BDRV_SECTOR_BITS;
152
+ } else {
153
+ wps->wp[j] = blkz[i].wp << BDRV_SECTOR_BITS;
154
+ }
155
+ break;
156
+ }
157
+ }
158
+ }
159
+ sector = blkz[i - 1].start + blkz[i - 1].len;
160
+ }
161
+
162
+ return 0;
163
+}
164
+
165
+static void update_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
166
+ unsigned int nrz)
167
+{
168
+ if (get_zones_wp(bs, fd, offset, nrz, 0) < 0) {
169
+ error_report("update zone wp failed");
170
+ }
171
+}
172
+
173
static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
174
Error **errp)
175
{
176
+ BDRVRawState *s = bs->opaque;
177
BlockZoneModel zoned;
178
int ret;
179
180
@@ -XXX,XX +XXX,XX @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
181
if (ret > 0) {
182
bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
183
}
184
+
185
+ ret = get_sysfs_long_val(st, "physical_block_size");
186
+ if (ret >= 0) {
187
+ bs->bl.write_granularity = ret;
188
+ }
189
+
190
+ /* The refresh_limits() function can be called multiple times. */
191
+ g_free(bs->wps);
192
+ bs->wps = g_malloc(sizeof(BlockZoneWps) +
193
+ sizeof(int64_t) * bs->bl.nr_zones);
194
+ ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0);
195
+ if (ret < 0) {
196
+ error_setg_errno(errp, -ret, "report wps failed");
197
+ bs->wps = NULL;
198
+ return;
199
+ }
200
+ qemu_co_mutex_init(&bs->wps->colock);
201
}
202
#else /* !defined(CONFIG_BLKZONED) */
203
static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
204
@@ -XXX,XX +XXX,XX @@ static int handle_aiocb_zone_mgmt(void *opaque)
205
ret = ioctl(fd, aiocb->zone_mgmt.op, &range);
206
} while (ret != 0 && errno == EINTR);
207
208
- return ret;
209
+ return ret < 0 ? -errno : ret;
210
}
211
#endif
212
213
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
214
{
215
BDRVRawState *s = bs->opaque;
216
RawPosixAIOData acb;
217
+ int ret;
218
219
if (fd_open(bs) < 0)
220
return -EIO;
221
+#if defined(CONFIG_BLKZONED)
222
+ if (type & QEMU_AIO_WRITE && bs->wps) {
223
+ qemu_co_mutex_lock(&bs->wps->colock);
224
+ }
225
+#endif
226
227
/*
228
* When using O_DIRECT, the request must be aligned to be able to use
229
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
230
#ifdef CONFIG_LINUX_IO_URING
231
} else if (s->use_linux_io_uring) {
232
assert(qiov->size == bytes);
233
- return luring_co_submit(bs, s->fd, offset, qiov, type);
234
+ ret = luring_co_submit(bs, s->fd, offset, qiov, type);
235
+ goto out;
236
#endif
237
#ifdef CONFIG_LINUX_AIO
238
} else if (s->use_linux_aio) {
239
assert(qiov->size == bytes);
240
- return laio_co_submit(s->fd, offset, qiov, type, s->aio_max_batch);
241
+ ret = laio_co_submit(s->fd, offset, qiov, type,
242
+ s->aio_max_batch);
243
+ goto out;
244
#endif
245
}
246
247
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
43
};
248
};
44
249
45
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_sleep_ns_wakeable(QEMUClockType type, int64_t ns,
250
assert(qiov->size == bytes);
46
abort();
251
- return raw_thread_pool_submit(handle_aiocb_rw, &acb);
47
}
252
+ ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
48
253
+ goto out; /* Avoid the compiler err of unused label */
49
+ aio_timer_init(ctx, &state.ts, type, SCALE_NS, co_sleep_cb, &state);
254
+
50
if (sleep_state) {
255
+out:
51
*sleep_state = &state;
256
+#if defined(CONFIG_BLKZONED)
52
}
257
+{
53
- timer_mod(state.ts, qemu_clock_get_ns(type) + ns);
258
+ BlockZoneWps *wps = bs->wps;
54
+ timer_mod(&state.ts, qemu_clock_get_ns(type) + ns);
259
+ if (ret == 0) {
55
qemu_coroutine_yield();
260
+ if (type & QEMU_AIO_WRITE && wps && bs->bl.zone_size) {
56
if (sleep_state) {
261
+ uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
57
/*
262
+ if (!BDRV_ZT_IS_CONV(*wp)) {
58
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_sleep_ns_wakeable(QEMUClockType type, int64_t ns,
263
+ /* Advance the wp if needed */
59
*/
264
+ if (offset + bytes > *wp) {
60
assert(*sleep_state == NULL);
265
+ *wp = offset + bytes;
61
}
266
+ }
62
- timer_free(state.ts);
267
+ }
268
+ }
269
+ } else {
270
+ if (type & QEMU_AIO_WRITE) {
271
+ update_zones_wp(bs, s->fd, 0, 1);
272
+ }
273
+ }
274
+
275
+ if (type & QEMU_AIO_WRITE && wps) {
276
+ qemu_co_mutex_unlock(&wps->colock);
277
+ }
278
+}
279
+#endif
280
+ return ret;
63
}
281
}
282
283
static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
284
@@ -XXX,XX +XXX,XX @@ static void raw_close(BlockDriverState *bs)
285
BDRVRawState *s = bs->opaque;
286
287
if (s->fd >= 0) {
288
+#if defined(CONFIG_BLKZONED)
289
+ g_free(bs->wps);
290
+#endif
291
qemu_close(s->fd);
292
s->fd = -1;
293
}
294
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
295
const char *op_name;
296
unsigned long zo;
297
int ret;
298
+ BlockZoneWps *wps = bs->wps;
299
int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
300
301
zone_size = bs->bl.zone_size;
302
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
303
return -EINVAL;
304
}
305
306
+ uint32_t i = offset / bs->bl.zone_size;
307
+ uint32_t nrz = len / bs->bl.zone_size;
308
+ uint64_t *wp = &wps->wp[i];
309
+ if (BDRV_ZT_IS_CONV(*wp) && len != capacity) {
310
+ error_report("zone mgmt operations are not allowed for conventional zones");
311
+ return -EIO;
312
+ }
313
+
314
switch (op) {
315
case BLK_ZO_OPEN:
316
op_name = "BLKOPENZONE";
317
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
318
len >> BDRV_SECTOR_BITS);
319
ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
320
if (ret != 0) {
321
+ update_zones_wp(bs, s->fd, offset, i);
322
error_report("ioctl %s failed %d", op_name, ret);
323
+ return ret;
324
+ }
325
+
326
+ if (zo == BLKRESETZONE && len == capacity) {
327
+ ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 1);
328
+ if (ret < 0) {
329
+ error_report("reporting single wp failed");
330
+ return ret;
331
+ }
332
+ } else if (zo == BLKRESETZONE) {
333
+ for (unsigned int j = 0; j < nrz; ++j) {
334
+ wp[j] = offset + j * zone_size;
335
+ }
336
+ } else if (zo == BLKFINISHZONE) {
337
+ for (unsigned int j = 0; j < nrz; ++j) {
338
+ /* The zoned device allows the last zone smaller than the
339
+ * zone size. */
340
+ wp[j] = MIN(offset + (j + 1) * zone_size, offset + len);
341
+ }
342
}
343
344
return ret;
64
--
345
--
65
2.31.1
346
2.40.1
66
diff view generated by jsdifflib
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
Simplify the code by removing conditionals. qemu_co_sleep_ns
3
A zone append command is a write operation that specifies the first
4
can simply point the argument to an on-stack temporary.
4
logical block of a zone as the write position. When writing to a zoned
5
block device using zone append, the byte offset of the call may point at
6
any position within the zone to which the data is being appended. Upon
7
completion the device will respond with the position where the data has
8
been written in the zone.
5
9
6
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
10
Signed-off-by: Sam Li <faithilikerun@gmail.com>
7
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
11
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
8
Message-id: 20210517100548.28806-3-pbonzini@redhat.com
12
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Message-id: 20230508051510.177850-3-faithilikerun@gmail.com
9
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
14
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
10
---
15
---
11
include/qemu/coroutine.h | 5 +++--
16
include/block/block-io.h | 4 ++
12
util/qemu-coroutine-sleep.c | 18 +++++-------------
17
include/block/block_int-common.h | 3 ++
13
2 files changed, 8 insertions(+), 15 deletions(-)
18
include/block/raw-aio.h | 4 +-
19
include/sysemu/block-backend-io.h | 9 +++++
20
block/block-backend.c | 61 +++++++++++++++++++++++++++++++
21
block/file-posix.c | 58 +++++++++++++++++++++++++----
22
block/io.c | 27 ++++++++++++++
23
block/io_uring.c | 4 ++
24
block/linux-aio.c | 3 ++
25
block/raw-format.c | 8 ++++
26
10 files changed, 173 insertions(+), 8 deletions(-)
14
27
15
diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
28
diff --git a/include/block/block-io.h b/include/block/block-io.h
16
index XXXXXXX..XXXXXXX 100644
29
index XXXXXXX..XXXXXXX 100644
17
--- a/include/qemu/coroutine.h
30
--- a/include/block/block-io.h
18
+++ b/include/qemu/coroutine.h
31
+++ b/include/block/block-io.h
19
@@ -XXX,XX +XXX,XX @@ typedef struct QemuCoSleepState QemuCoSleepState;
32
@@ -XXX,XX +XXX,XX @@ int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_report(BlockDriverState *bs,
20
33
int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_mgmt(BlockDriverState *bs,
21
/**
34
BlockZoneOp op,
22
* Yield the coroutine for a given duration. During this yield, @sleep_state
35
int64_t offset, int64_t len);
23
- * (if not NULL) is set to an opaque pointer, which may be used for
36
+int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_append(BlockDriverState *bs,
24
+ * is set to an opaque pointer, which may be used for
37
+ int64_t *offset,
25
* qemu_co_sleep_wake(). Be careful, the pointer is set back to zero when the
38
+ QEMUIOVector *qiov,
26
* timer fires. Don't save the obtained value to other variables and don't call
39
+ BdrvRequestFlags flags);
27
* qemu_co_sleep_wake from another aio context.
40
28
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_sleep_ns_wakeable(QEMUClockType type, int64_t ns,
41
bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
29
QemuCoSleepState **sleep_state);
42
int bdrv_block_status(BlockDriverState *bs, int64_t offset,
30
static inline void coroutine_fn qemu_co_sleep_ns(QEMUClockType type, int64_t ns)
43
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
44
index XXXXXXX..XXXXXXX 100644
45
--- a/include/block/block_int-common.h
46
+++ b/include/block/block_int-common.h
47
@@ -XXX,XX +XXX,XX @@ struct BlockDriver {
48
BlockZoneDescriptor *zones);
49
int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
50
int64_t offset, int64_t len);
51
+ int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
52
+ int64_t *offset, QEMUIOVector *qiov,
53
+ BdrvRequestFlags flags);
54
55
/* removable device specific */
56
bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_is_inserted)(
57
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
58
index XXXXXXX..XXXXXXX 100644
59
--- a/include/block/raw-aio.h
60
+++ b/include/block/raw-aio.h
61
@@ -XXX,XX +XXX,XX @@
62
#define QEMU_AIO_TRUNCATE 0x0080
63
#define QEMU_AIO_ZONE_REPORT 0x0100
64
#define QEMU_AIO_ZONE_MGMT 0x0200
65
+#define QEMU_AIO_ZONE_APPEND 0x0400
66
#define QEMU_AIO_TYPE_MASK \
67
(QEMU_AIO_READ | \
68
QEMU_AIO_WRITE | \
69
@@ -XXX,XX +XXX,XX @@
70
QEMU_AIO_COPY_RANGE | \
71
QEMU_AIO_TRUNCATE | \
72
QEMU_AIO_ZONE_REPORT | \
73
- QEMU_AIO_ZONE_MGMT)
74
+ QEMU_AIO_ZONE_MGMT | \
75
+ QEMU_AIO_ZONE_APPEND)
76
77
/* AIO flags */
78
#define QEMU_AIO_MISALIGNED 0x1000
79
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
80
index XXXXXXX..XXXXXXX 100644
81
--- a/include/sysemu/block-backend-io.h
82
+++ b/include/sysemu/block-backend-io.h
83
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
84
BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
85
int64_t offset, int64_t len,
86
BlockCompletionFunc *cb, void *opaque);
87
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
88
+ QEMUIOVector *qiov, BdrvRequestFlags flags,
89
+ BlockCompletionFunc *cb, void *opaque);
90
BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
91
BlockCompletionFunc *cb, void *opaque);
92
void blk_aio_cancel_async(BlockAIOCB *acb);
93
@@ -XXX,XX +XXX,XX @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
94
int64_t offset, int64_t len);
95
int co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
96
int64_t offset, int64_t len);
97
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
98
+ QEMUIOVector *qiov,
99
+ BdrvRequestFlags flags);
100
+int co_wrapper_mixed blk_zone_append(BlockBackend *blk, int64_t *offset,
101
+ QEMUIOVector *qiov,
102
+ BdrvRequestFlags flags);
103
104
int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset,
105
int64_t bytes);
106
diff --git a/block/block-backend.c b/block/block-backend.c
107
index XXXXXXX..XXXXXXX 100644
108
--- a/block/block-backend.c
109
+++ b/block/block-backend.c
110
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
111
return &acb->common;
112
}
113
114
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
115
+{
116
+ BlkAioEmAIOCB *acb = opaque;
117
+ BlkRwCo *rwco = &acb->rwco;
118
+
119
+ rwco->ret = blk_co_zone_append(rwco->blk, (int64_t *)(uintptr_t)acb->bytes,
120
+ rwco->iobuf, rwco->flags);
121
+ blk_aio_complete(acb);
122
+}
123
+
124
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
125
+ QEMUIOVector *qiov, BdrvRequestFlags flags,
126
+ BlockCompletionFunc *cb, void *opaque) {
127
+ BlkAioEmAIOCB *acb;
128
+ Coroutine *co;
129
+ IO_CODE();
130
+
131
+ blk_inc_in_flight(blk);
132
+ acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
133
+ acb->rwco = (BlkRwCo) {
134
+ .blk = blk,
135
+ .ret = NOT_DONE,
136
+ .flags = flags,
137
+ .iobuf = qiov,
138
+ };
139
+ acb->bytes = (int64_t)(uintptr_t)offset;
140
+ acb->has_returned = false;
141
+
142
+ co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
143
+ aio_co_enter(blk_get_aio_context(blk), co);
144
+ acb->has_returned = true;
145
+ if (acb->rwco.ret != NOT_DONE) {
146
+ replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
147
+ blk_aio_complete_bh, acb);
148
+ }
149
+
150
+ return &acb->common;
151
+}
152
+
153
/*
154
* Send a zone_report command.
155
* offset is a byte offset from the start of the device. No alignment
156
@@ -XXX,XX +XXX,XX @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
157
return ret;
158
}
159
160
+/*
161
+ * Send a zone_append command.
162
+ */
163
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
164
+ QEMUIOVector *qiov, BdrvRequestFlags flags)
165
+{
166
+ int ret;
167
+ IO_CODE();
168
+
169
+ blk_inc_in_flight(blk);
170
+ blk_wait_while_drained(blk);
171
+ GRAPH_RDLOCK_GUARD();
172
+ if (!blk_is_available(blk)) {
173
+ blk_dec_in_flight(blk);
174
+ return -ENOMEDIUM;
175
+ }
176
+
177
+ ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
178
+ blk_dec_in_flight(blk);
179
+ return ret;
180
+}
181
+
182
void blk_drain(BlockBackend *blk)
31
{
183
{
32
- qemu_co_sleep_ns_wakeable(type, ns, NULL);
184
BlockDriverState *bs = blk_bs(blk);
33
+ QemuCoSleepState *unused = NULL;
185
diff --git a/block/file-posix.c b/block/file-posix.c
34
+ qemu_co_sleep_ns_wakeable(type, ns, &unused);
186
index XXXXXXX..XXXXXXX 100644
35
}
187
--- a/block/file-posix.c
36
188
+++ b/block/file-posix.c
37
/**
189
@@ -XXX,XX +XXX,XX @@ typedef struct BDRVRawState {
38
diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
190
bool has_write_zeroes:1;
39
index XXXXXXX..XXXXXXX 100644
191
bool use_linux_aio:1;
40
--- a/util/qemu-coroutine-sleep.c
192
bool use_linux_io_uring:1;
41
+++ b/util/qemu-coroutine-sleep.c
193
+ int64_t *offset; /* offset of zone append operation */
42
@@ -XXX,XX +XXX,XX @@ void qemu_co_sleep_wake(QemuCoSleepState *sleep_state)
194
int page_cache_inconsistent; /* errno from fdatasync failure */
43
qemu_co_sleep_ns__scheduled, NULL);
195
bool has_fallocate;
44
196
bool needs_alignment;
45
assert(scheduled == qemu_co_sleep_ns__scheduled);
197
@@ -XXX,XX +XXX,XX @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
46
- if (sleep_state->user_state_pointer) {
198
ssize_t len;
47
- *sleep_state->user_state_pointer = NULL;
199
48
- }
200
len = RETRY_ON_EINTR(
49
+ *sleep_state->user_state_pointer = NULL;
201
- (aiocb->aio_type & QEMU_AIO_WRITE) ?
50
timer_del(&sleep_state->ts);
202
+ (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
51
aio_co_wake(sleep_state->co);
203
qemu_pwritev(aiocb->aio_fildes,
52
}
204
aiocb->io.iov,
53
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_sleep_ns_wakeable(QEMUClockType type, int64_t ns,
205
aiocb->io.niov,
206
@@ -XXX,XX +XXX,XX @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
207
ssize_t len;
208
209
while (offset < aiocb->aio_nbytes) {
210
- if (aiocb->aio_type & QEMU_AIO_WRITE) {
211
+ if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
212
len = pwrite(aiocb->aio_fildes,
213
(const char *)buf + offset,
214
aiocb->aio_nbytes - offset,
215
@@ -XXX,XX +XXX,XX @@ static int handle_aiocb_rw(void *opaque)
54
}
216
}
55
217
56
aio_timer_init(ctx, &state.ts, type, SCALE_NS, co_sleep_cb, &state);
218
nbytes = handle_aiocb_rw_linear(aiocb, buf);
57
- if (sleep_state) {
219
- if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
58
- *sleep_state = &state;
220
+ if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
59
- }
221
char *p = buf;
60
+ *sleep_state = &state;
222
size_t count = aiocb->aio_nbytes, copy;
61
timer_mod(&state.ts, qemu_clock_get_ns(type) + ns);
223
int i;
62
qemu_coroutine_yield();
224
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
63
- if (sleep_state) {
225
if (fd_open(bs) < 0)
64
- /*
226
return -EIO;
65
- * Note that *sleep_state is cleared during qemu_co_sleep_wake
227
#if defined(CONFIG_BLKZONED)
66
- * before resuming this coroutine.
228
- if (type & QEMU_AIO_WRITE && bs->wps) {
67
- */
229
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && bs->wps) {
68
- assert(*sleep_state == NULL);
230
qemu_co_mutex_lock(&bs->wps->colock);
69
- }
231
+ if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) {
70
+
232
+ int index = offset / bs->bl.zone_size;
71
+ /* qemu_co_sleep_wake clears *sleep_state before resuming this coroutine. */
233
+ offset = bs->wps->wp[index];
72
+ assert(*sleep_state == NULL);
234
+ }
73
}
235
}
236
#endif
237
238
@@ -XXX,XX +XXX,XX @@ out:
239
{
240
BlockZoneWps *wps = bs->wps;
241
if (ret == 0) {
242
- if (type & QEMU_AIO_WRITE && wps && bs->bl.zone_size) {
243
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
244
+ && wps && bs->bl.zone_size) {
245
uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
246
if (!BDRV_ZT_IS_CONV(*wp)) {
247
+ if (type & QEMU_AIO_ZONE_APPEND) {
248
+ *s->offset = *wp;
249
+ }
250
/* Advance the wp if needed */
251
if (offset + bytes > *wp) {
252
*wp = offset + bytes;
253
@@ -XXX,XX +XXX,XX @@ out:
254
}
255
}
256
} else {
257
- if (type & QEMU_AIO_WRITE) {
258
+ if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
259
update_zones_wp(bs, s->fd, 0, 1);
260
}
261
}
262
263
- if (type & QEMU_AIO_WRITE && wps) {
264
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && wps) {
265
qemu_co_mutex_unlock(&wps->colock);
266
}
267
}
268
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
269
}
270
#endif
271
272
+#if defined(CONFIG_BLKZONED)
273
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
274
+ int64_t *offset,
275
+ QEMUIOVector *qiov,
276
+ BdrvRequestFlags flags) {
277
+ assert(flags == 0);
278
+ int64_t zone_size_mask = bs->bl.zone_size - 1;
279
+ int64_t iov_len = 0;
280
+ int64_t len = 0;
281
+ BDRVRawState *s = bs->opaque;
282
+ s->offset = offset;
283
+
284
+ if (*offset & zone_size_mask) {
285
+ error_report("sector offset %" PRId64 " is not aligned to zone size "
286
+ "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
287
+ return -EINVAL;
288
+ }
289
+
290
+ int64_t wg = bs->bl.write_granularity;
291
+ int64_t wg_mask = wg - 1;
292
+ for (int i = 0; i < qiov->niov; i++) {
293
+ iov_len = qiov->iov[i].iov_len;
294
+ if (iov_len & wg_mask) {
295
+ error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
296
+ "block size %" PRId64 "", i, iov_len, wg);
297
+ return -EINVAL;
298
+ }
299
+ len += iov_len;
300
+ }
301
+
302
+ return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND);
303
+}
304
+#endif
305
+
306
static coroutine_fn int
307
raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
308
bool blkdev)
309
@@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_host_device = {
310
/* zone management operations */
311
.bdrv_co_zone_report = raw_co_zone_report,
312
.bdrv_co_zone_mgmt = raw_co_zone_mgmt,
313
+ .bdrv_co_zone_append = raw_co_zone_append,
314
#endif
315
};
316
317
diff --git a/block/io.c b/block/io.c
318
index XXXXXXX..XXXXXXX 100644
319
--- a/block/io.c
320
+++ b/block/io.c
321
@@ -XXX,XX +XXX,XX @@ out:
322
return co.ret;
323
}
324
325
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
326
+ QEMUIOVector *qiov,
327
+ BdrvRequestFlags flags)
328
+{
329
+ int ret;
330
+ BlockDriver *drv = bs->drv;
331
+ CoroutineIOCompletion co = {
332
+ .coroutine = qemu_coroutine_self(),
333
+ };
334
+ IO_CODE();
335
+
336
+ ret = bdrv_check_qiov_request(*offset, qiov->size, qiov, 0, NULL);
337
+ if (ret < 0) {
338
+ return ret;
339
+ }
340
+
341
+ bdrv_inc_in_flight(bs);
342
+ if (!drv || !drv->bdrv_co_zone_append || bs->bl.zoned == BLK_Z_NONE) {
343
+ co.ret = -ENOTSUP;
344
+ goto out;
345
+ }
346
+ co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
347
+out:
348
+ bdrv_dec_in_flight(bs);
349
+ return co.ret;
350
+}
351
+
352
void *qemu_blockalign(BlockDriverState *bs, size_t size)
353
{
354
IO_CODE();
355
diff --git a/block/io_uring.c b/block/io_uring.c
356
index XXXXXXX..XXXXXXX 100644
357
--- a/block/io_uring.c
358
+++ b/block/io_uring.c
359
@@ -XXX,XX +XXX,XX @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
360
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
361
luringcb->qiov->niov, offset);
362
break;
363
+ case QEMU_AIO_ZONE_APPEND:
364
+ io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
365
+ luringcb->qiov->niov, offset);
366
+ break;
367
case QEMU_AIO_READ:
368
io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
369
luringcb->qiov->niov, offset);
370
diff --git a/block/linux-aio.c b/block/linux-aio.c
371
index XXXXXXX..XXXXXXX 100644
372
--- a/block/linux-aio.c
373
+++ b/block/linux-aio.c
374
@@ -XXX,XX +XXX,XX @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
375
case QEMU_AIO_WRITE:
376
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
377
break;
378
+ case QEMU_AIO_ZONE_APPEND:
379
+ io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
380
+ break;
381
case QEMU_AIO_READ:
382
io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
383
break;
384
diff --git a/block/raw-format.c b/block/raw-format.c
385
index XXXXXXX..XXXXXXX 100644
386
--- a/block/raw-format.c
387
+++ b/block/raw-format.c
388
@@ -XXX,XX +XXX,XX @@ raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
389
return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
390
}
391
392
+static int coroutine_fn GRAPH_RDLOCK
393
+raw_co_zone_append(BlockDriverState *bs,int64_t *offset, QEMUIOVector *qiov,
394
+ BdrvRequestFlags flags)
395
+{
396
+ return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
397
+}
398
+
399
static int64_t coroutine_fn GRAPH_RDLOCK
400
raw_co_getlength(BlockDriverState *bs)
401
{
402
@@ -XXX,XX +XXX,XX @@ BlockDriver bdrv_raw = {
403
.bdrv_co_pdiscard = &raw_co_pdiscard,
404
.bdrv_co_zone_report = &raw_co_zone_report,
405
.bdrv_co_zone_mgmt = &raw_co_zone_mgmt,
406
+ .bdrv_co_zone_append = &raw_co_zone_append,
407
.bdrv_co_block_status = &raw_co_block_status,
408
.bdrv_co_copy_range_from = &raw_co_copy_range_from,
409
.bdrv_co_copy_range_to = &raw_co_copy_range_to,
74
--
410
--
75
2.31.1
411
2.40.1
76
diff view generated by jsdifflib
New patch
1
From: Sam Li <faithilikerun@gmail.com>
1
2
3
The patch tests zone append writes by reporting the zone wp after
4
the completion of the call. The "zap -p" option can print the sector
5
offset value after completion, which should be the start sector
6
where the append write begins.
7
8
Signed-off-by: Sam Li <faithilikerun@gmail.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Message-id: 20230508051510.177850-4-faithilikerun@gmail.com
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
---
13
qemu-io-cmds.c | 75 ++++++++++++++++++++++++++++++
14
tests/qemu-iotests/tests/zoned | 16 +++++++
15
tests/qemu-iotests/tests/zoned.out | 16 +++++++
16
3 files changed, 107 insertions(+)
17
18
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
19
index XXXXXXX..XXXXXXX 100644
20
--- a/qemu-io-cmds.c
21
+++ b/qemu-io-cmds.c
22
@@ -XXX,XX +XXX,XX @@ static const cmdinfo_t zone_reset_cmd = {
23
.oneline = "reset a zone write pointer in zone block device",
24
};
25
26
+static int do_aio_zone_append(BlockBackend *blk, QEMUIOVector *qiov,
27
+ int64_t *offset, int flags, int *total)
28
+{
29
+ int async_ret = NOT_DONE;
30
+
31
+ blk_aio_zone_append(blk, offset, qiov, flags, aio_rw_done, &async_ret);
32
+ while (async_ret == NOT_DONE) {
33
+ main_loop_wait(false);
34
+ }
35
+
36
+ *total = qiov->size;
37
+ return async_ret < 0 ? async_ret : 1;
38
+}
39
+
40
+static int zone_append_f(BlockBackend *blk, int argc, char **argv)
41
+{
42
+ int ret;
43
+ bool pflag = false;
44
+ int flags = 0;
45
+ int total = 0;
46
+ int64_t offset;
47
+ char *buf;
48
+ int c, nr_iov;
49
+ int pattern = 0xcd;
50
+ QEMUIOVector qiov;
51
+
52
+ if (optind > argc - 3) {
53
+ return -EINVAL;
54
+ }
55
+
56
+ if ((c = getopt(argc, argv, "p")) != -1) {
57
+ pflag = true;
58
+ }
59
+
60
+ offset = cvtnum(argv[optind]);
61
+ if (offset < 0) {
62
+ print_cvtnum_err(offset, argv[optind]);
63
+ return offset;
64
+ }
65
+ optind++;
66
+ nr_iov = argc - optind;
67
+ buf = create_iovec(blk, &qiov, &argv[optind], nr_iov, pattern,
68
+ flags & BDRV_REQ_REGISTERED_BUF);
69
+ if (buf == NULL) {
70
+ return -EINVAL;
71
+ }
72
+ ret = do_aio_zone_append(blk, &qiov, &offset, flags, &total);
73
+ if (ret < 0) {
74
+ printf("zone append failed: %s\n", strerror(-ret));
75
+ goto out;
76
+ }
77
+
78
+ if (pflag) {
79
+ printf("After zap done, the append sector is 0x%" PRIx64 "\n",
80
+ tosector(offset));
81
+ }
82
+
83
+out:
84
+ qemu_io_free(blk, buf, qiov.size,
85
+ flags & BDRV_REQ_REGISTERED_BUF);
86
+ qemu_iovec_destroy(&qiov);
87
+ return ret;
88
+}
89
+
90
+static const cmdinfo_t zone_append_cmd = {
91
+ .name = "zone_append",
92
+ .altname = "zap",
93
+ .cfunc = zone_append_f,
94
+ .argmin = 3,
95
+ .argmax = 4,
96
+ .args = "offset len [len..]",
97
+ .oneline = "append write a number of bytes at a specified offset",
98
+};
99
+
100
static int truncate_f(BlockBackend *blk, int argc, char **argv);
101
static const cmdinfo_t truncate_cmd = {
102
.name = "truncate",
103
@@ -XXX,XX +XXX,XX @@ static void __attribute((constructor)) init_qemuio_commands(void)
104
qemuio_add_command(&zone_close_cmd);
105
qemuio_add_command(&zone_finish_cmd);
106
qemuio_add_command(&zone_reset_cmd);
107
+ qemuio_add_command(&zone_append_cmd);
108
qemuio_add_command(&truncate_cmd);
109
qemuio_add_command(&length_cmd);
110
qemuio_add_command(&info_cmd);
111
diff --git a/tests/qemu-iotests/tests/zoned b/tests/qemu-iotests/tests/zoned
112
index XXXXXXX..XXXXXXX 100755
113
--- a/tests/qemu-iotests/tests/zoned
114
+++ b/tests/qemu-iotests/tests/zoned
115
@@ -XXX,XX +XXX,XX @@ echo "(5) resetting the second zone"
116
$QEMU_IO $IMG -c "zrs 268435456 268435456"
117
echo "After resetting a zone:"
118
$QEMU_IO $IMG -c "zrp 268435456 1"
119
+echo
120
+echo
121
+echo "(6) append write" # the physical block size of the device is 4096
122
+$QEMU_IO $IMG -c "zrp 0 1"
123
+$QEMU_IO $IMG -c "zap -p 0 0x1000 0x2000"
124
+echo "After appending the first zone firstly:"
125
+$QEMU_IO $IMG -c "zrp 0 1"
126
+$QEMU_IO $IMG -c "zap -p 0 0x1000 0x2000"
127
+echo "After appending the first zone secondly:"
128
+$QEMU_IO $IMG -c "zrp 0 1"
129
+$QEMU_IO $IMG -c "zap -p 268435456 0x1000 0x2000"
130
+echo "After appending the second zone firstly:"
131
+$QEMU_IO $IMG -c "zrp 268435456 1"
132
+$QEMU_IO $IMG -c "zap -p 268435456 0x1000 0x2000"
133
+echo "After appending the second zone secondly:"
134
+$QEMU_IO $IMG -c "zrp 268435456 1"
135
136
# success, all done
137
echo "*** done"
138
diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-iotests/tests/zoned.out
139
index XXXXXXX..XXXXXXX 100644
140
--- a/tests/qemu-iotests/tests/zoned.out
141
+++ b/tests/qemu-iotests/tests/zoned.out
142
@@ -XXX,XX +XXX,XX @@ start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:14, [type: 2]
143
(5) resetting the second zone
144
After resetting a zone:
145
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
146
+
147
+
148
+(6) append write
149
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
150
+After zap done, the append sector is 0x0
151
+After appending the first zone firstly:
152
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x18, zcond:2, [type: 2]
153
+After zap done, the append sector is 0x18
154
+After appending the first zone secondly:
155
+start: 0x0, len 0x80000, cap 0x80000, wptr 0x30, zcond:2, [type: 2]
156
+After zap done, the append sector is 0x80000
157
+After appending the second zone firstly:
158
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80018, zcond:2, [type: 2]
159
+After zap done, the append sector is 0x80018
160
+After appending the second zone secondly:
161
+start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80030, zcond:2, [type: 2]
162
*** done
163
--
164
2.40.1
diff view generated by jsdifflib
New patch
1
From: Sam Li <faithilikerun@gmail.com>
1
2
3
Signed-off-by: Sam Li <faithilikerun@gmail.com>
4
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
5
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
6
Message-id: 20230508051510.177850-5-faithilikerun@gmail.com
7
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
8
---
9
block/file-posix.c | 3 +++
10
block/trace-events | 2 ++
11
2 files changed, 5 insertions(+)
12
13
diff --git a/block/file-posix.c b/block/file-posix.c
14
index XXXXXXX..XXXXXXX 100644
15
--- a/block/file-posix.c
16
+++ b/block/file-posix.c
17
@@ -XXX,XX +XXX,XX @@ out:
18
if (!BDRV_ZT_IS_CONV(*wp)) {
19
if (type & QEMU_AIO_ZONE_APPEND) {
20
*s->offset = *wp;
21
+ trace_zbd_zone_append_complete(bs, *s->offset
22
+ >> BDRV_SECTOR_BITS);
23
}
24
/* Advance the wp if needed */
25
if (offset + bytes > *wp) {
26
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
27
len += iov_len;
28
}
29
30
+ trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
31
return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND);
32
}
33
#endif
34
diff --git a/block/trace-events b/block/trace-events
35
index XXXXXXX..XXXXXXX 100644
36
--- a/block/trace-events
37
+++ b/block/trace-events
38
@@ -XXX,XX +XXX,XX @@ file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d"
39
file_flush_fdatasync_failed(int err) "errno %d"
40
zbd_zone_report(void *bs, unsigned int nr_zones, int64_t sector) "bs %p report %d zones starting at sector offset 0x%" PRIx64 ""
41
zbd_zone_mgmt(void *bs, const char *op_name, int64_t sector, int64_t len) "bs %p %s starts at sector offset 0x%" PRIx64 " over a range of 0x%" PRIx64 " sectors"
42
+zbd_zone_append(void *bs, int64_t sector) "bs %p append at sector offset 0x%" PRIx64 ""
43
+zbd_zone_append_complete(void *bs, int64_t sector) "bs %p returns append sector 0x%" PRIx64 ""
44
45
# ssh.c
46
sftp_error(const char *op, const char *ssh_err, int ssh_err_code, int sftp_err_code) "%s failed: %s (libssh error code: %d, sftp error code: %d)"
47
--
48
2.40.1
diff view generated by jsdifflib
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
This simplification is enabled by the previous patch. Now aio_co_wake
3
This patch extends virtio-blk emulation to handle zoned device commands
4
will only be called once, therefore we do not care about a spurious
4
by calling the new block layer APIs to perform zoned device I/O on
5
firing of the timer after a qemu_co_sleep_wake.
5
behalf of the guest. It supports Report Zone, four zone operations (open,
6
close, finish, reset), and Append Zone.
6
7
7
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
8
The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
support zoned block devices. For regular block devices (conventional zones),
9
Message-id: 20210517100548.28806-5-pbonzini@redhat.com
10
the feature bit will not be set.
11
12
The guest OS can use blktests and fio to test those commands on zoned devices.
13
Furthermore, using zonefs to test zone append write is also supported.
14
15
Signed-off-by: Sam Li <faithilikerun@gmail.com>
16
Message-id: 20230508051916.178322-2-faithilikerun@gmail.com
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
17
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
---
18
---
12
util/qemu-coroutine-sleep.c | 8 ++++----
19
hw/block/virtio-blk-common.c | 2 +
13
1 file changed, 4 insertions(+), 4 deletions(-)
20
hw/block/virtio-blk.c | 389 +++++++++++++++++++++++++++++++++++
21
hw/virtio/virtio-qmp.c | 2 +
22
3 files changed, 393 insertions(+)
14
23
15
diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
24
diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
16
index XXXXXXX..XXXXXXX 100644
25
index XXXXXXX..XXXXXXX 100644
17
--- a/util/qemu-coroutine-sleep.c
26
--- a/hw/block/virtio-blk-common.c
18
+++ b/util/qemu-coroutine-sleep.c
27
+++ b/hw/block/virtio-blk-common.c
19
@@ -XXX,XX +XXX,XX @@ static const char *qemu_co_sleep_ns__scheduled = "qemu_co_sleep_ns";
28
@@ -XXX,XX +XXX,XX @@ static const VirtIOFeature feature_sizes[] = {
20
29
.end = endof(struct virtio_blk_config, discard_sector_alignment)},
21
struct QemuCoSleepState {
30
{.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
22
Coroutine *co;
31
.end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
23
- QEMUTimer ts;
32
+ {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
24
QemuCoSleepState **user_state_pointer;
33
+ .end = endof(struct virtio_blk_config, zoned)},
34
{}
25
};
35
};
26
36
27
@@ -XXX,XX +XXX,XX @@ void qemu_co_sleep_wake(QemuCoSleepState *sleep_state)
37
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
28
38
index XXXXXXX..XXXXXXX 100644
29
assert(scheduled == qemu_co_sleep_ns__scheduled);
39
--- a/hw/block/virtio-blk.c
30
*sleep_state->user_state_pointer = NULL;
40
+++ b/hw/block/virtio-blk.c
31
- timer_del(&sleep_state->ts);
41
@@ -XXX,XX +XXX,XX @@
32
aio_co_wake(sleep_state->co);
42
#include "qemu/module.h"
43
#include "qemu/error-report.h"
44
#include "qemu/main-loop.h"
45
+#include "block/block_int.h"
46
#include "trace.h"
47
#include "hw/block/block.h"
48
#include "hw/qdev-properties.h"
49
@@ -XXX,XX +XXX,XX @@ err:
50
return err_status;
51
}
52
53
+typedef struct ZoneCmdData {
54
+ VirtIOBlockReq *req;
55
+ struct iovec *in_iov;
56
+ unsigned in_num;
57
+ union {
58
+ struct {
59
+ unsigned int nr_zones;
60
+ BlockZoneDescriptor *zones;
61
+ } zone_report_data;
62
+ struct {
63
+ int64_t offset;
64
+ } zone_append_data;
65
+ };
66
+} ZoneCmdData;
67
+
68
+/*
69
+ * check zoned_request: error checking before issuing requests. If all checks
70
+ * passed, return true.
71
+ * append: true if only zone append requests issued.
72
+ */
73
+static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
74
+ bool append, uint8_t *status) {
75
+ BlockDriverState *bs = blk_bs(s->blk);
76
+ int index;
77
+
78
+ if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
79
+ *status = VIRTIO_BLK_S_UNSUPP;
80
+ return false;
81
+ }
82
+
83
+ if (offset < 0 || len < 0 || len > (bs->total_sectors << BDRV_SECTOR_BITS)
84
+ || offset > (bs->total_sectors << BDRV_SECTOR_BITS) - len) {
85
+ *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
86
+ return false;
87
+ }
88
+
89
+ if (append) {
90
+ if (bs->bl.write_granularity) {
91
+ if ((offset % bs->bl.write_granularity) != 0) {
92
+ *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
93
+ return false;
94
+ }
95
+ }
96
+
97
+ index = offset / bs->bl.zone_size;
98
+ if (BDRV_ZT_IS_CONV(bs->wps->wp[index])) {
99
+ *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
100
+ return false;
101
+ }
102
+
103
+ if (len / 512 > bs->bl.max_append_sectors) {
104
+ if (bs->bl.max_append_sectors == 0) {
105
+ *status = VIRTIO_BLK_S_UNSUPP;
106
+ } else {
107
+ *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
108
+ }
109
+ return false;
110
+ }
111
+ }
112
+ return true;
113
+}
114
+
115
+static void virtio_blk_zone_report_complete(void *opaque, int ret)
116
+{
117
+ ZoneCmdData *data = opaque;
118
+ VirtIOBlockReq *req = data->req;
119
+ VirtIOBlock *s = req->dev;
120
+ VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
121
+ struct iovec *in_iov = data->in_iov;
122
+ unsigned in_num = data->in_num;
123
+ int64_t zrp_size, n, j = 0;
124
+ int64_t nz = data->zone_report_data.nr_zones;
125
+ int8_t err_status = VIRTIO_BLK_S_OK;
126
+
127
+ if (ret) {
128
+ err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
129
+ goto out;
130
+ }
131
+
132
+ struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
133
+ .nr_zones = cpu_to_le64(nz),
134
+ };
135
+ zrp_size = sizeof(struct virtio_blk_zone_report)
136
+ + sizeof(struct virtio_blk_zone_descriptor) * nz;
137
+ n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
138
+ if (n != sizeof(zrp_hdr)) {
139
+ virtio_error(vdev, "Driver provided input buffer that is too small!");
140
+ err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
141
+ goto out;
142
+ }
143
+
144
+ for (size_t i = sizeof(zrp_hdr); i < zrp_size;
145
+ i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
146
+ struct virtio_blk_zone_descriptor desc =
147
+ (struct virtio_blk_zone_descriptor) {
148
+ .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
149
+ >> BDRV_SECTOR_BITS),
150
+ .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
151
+ >> BDRV_SECTOR_BITS),
152
+ .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
153
+ >> BDRV_SECTOR_BITS),
154
+ };
155
+
156
+ switch (data->zone_report_data.zones[j].type) {
157
+ case BLK_ZT_CONV:
158
+ desc.z_type = VIRTIO_BLK_ZT_CONV;
159
+ break;
160
+ case BLK_ZT_SWR:
161
+ desc.z_type = VIRTIO_BLK_ZT_SWR;
162
+ break;
163
+ case BLK_ZT_SWP:
164
+ desc.z_type = VIRTIO_BLK_ZT_SWP;
165
+ break;
166
+ default:
167
+ g_assert_not_reached();
168
+ }
169
+
170
+ switch (data->zone_report_data.zones[j].state) {
171
+ case BLK_ZS_RDONLY:
172
+ desc.z_state = VIRTIO_BLK_ZS_RDONLY;
173
+ break;
174
+ case BLK_ZS_OFFLINE:
175
+ desc.z_state = VIRTIO_BLK_ZS_OFFLINE;
176
+ break;
177
+ case BLK_ZS_EMPTY:
178
+ desc.z_state = VIRTIO_BLK_ZS_EMPTY;
179
+ break;
180
+ case BLK_ZS_CLOSED:
181
+ desc.z_state = VIRTIO_BLK_ZS_CLOSED;
182
+ break;
183
+ case BLK_ZS_FULL:
184
+ desc.z_state = VIRTIO_BLK_ZS_FULL;
185
+ break;
186
+ case BLK_ZS_EOPEN:
187
+ desc.z_state = VIRTIO_BLK_ZS_EOPEN;
188
+ break;
189
+ case BLK_ZS_IOPEN:
190
+ desc.z_state = VIRTIO_BLK_ZS_IOPEN;
191
+ break;
192
+ case BLK_ZS_NOT_WP:
193
+ desc.z_state = VIRTIO_BLK_ZS_NOT_WP;
194
+ break;
195
+ default:
196
+ g_assert_not_reached();
197
+ }
198
+
199
+ /* TODO: it takes O(n^2) time complexity. Optimizations required. */
200
+ n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
201
+ if (n != sizeof(desc)) {
202
+ virtio_error(vdev, "Driver provided input buffer "
203
+ "for descriptors that is too small!");
204
+ err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
205
+ }
206
+ }
207
+
208
+out:
209
+ aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
210
+ virtio_blk_req_complete(req, err_status);
211
+ virtio_blk_free_request(req);
212
+ aio_context_release(blk_get_aio_context(s->conf.conf.blk));
213
+ g_free(data->zone_report_data.zones);
214
+ g_free(data);
215
+}
216
+
217
+static void virtio_blk_handle_zone_report(VirtIOBlockReq *req,
218
+ struct iovec *in_iov,
219
+ unsigned in_num)
220
+{
221
+ VirtIOBlock *s = req->dev;
222
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
223
+ unsigned int nr_zones;
224
+ ZoneCmdData *data;
225
+ int64_t zone_size, offset;
226
+ uint8_t err_status;
227
+
228
+ if (req->in_len < sizeof(struct virtio_blk_inhdr) +
229
+ sizeof(struct virtio_blk_zone_report) +
230
+ sizeof(struct virtio_blk_zone_descriptor)) {
231
+ virtio_error(vdev, "in buffer too small for zone report");
232
+ return;
233
+ }
234
+
235
+ /* start byte offset of the zone report */
236
+ offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
237
+ if (!check_zoned_request(s, offset, 0, false, &err_status)) {
238
+ goto out;
239
+ }
240
+ nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
241
+ sizeof(struct virtio_blk_zone_report)) /
242
+ sizeof(struct virtio_blk_zone_descriptor);
243
+
244
+ zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
245
+ data = g_malloc(sizeof(ZoneCmdData));
246
+ data->req = req;
247
+ data->in_iov = in_iov;
248
+ data->in_num = in_num;
249
+ data->zone_report_data.nr_zones = nr_zones;
250
+ data->zone_report_data.zones = g_malloc(zone_size),
251
+
252
+ blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
253
+ data->zone_report_data.zones,
254
+ virtio_blk_zone_report_complete, data);
255
+ return;
256
+out:
257
+ virtio_blk_req_complete(req, err_status);
258
+ virtio_blk_free_request(req);
259
+}
260
+
261
+static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
262
+{
263
+ VirtIOBlockReq *req = opaque;
264
+ VirtIOBlock *s = req->dev;
265
+ int8_t err_status = VIRTIO_BLK_S_OK;
266
+
267
+ if (ret) {
268
+ err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
269
+ }
270
+
271
+ aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
272
+ virtio_blk_req_complete(req, err_status);
273
+ virtio_blk_free_request(req);
274
+ aio_context_release(blk_get_aio_context(s->conf.conf.blk));
275
+}
276
+
277
+static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
278
+{
279
+ VirtIOBlock *s = req->dev;
280
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
281
+ BlockDriverState *bs = blk_bs(s->blk);
282
+ int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
283
+ uint64_t len;
284
+ uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
285
+ uint8_t err_status = VIRTIO_BLK_S_OK;
286
+
287
+ uint32_t type = virtio_ldl_p(vdev, &req->out.type);
288
+ if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
289
+ /* Entire drive capacity */
290
+ offset = 0;
291
+ len = capacity;
292
+ } else {
293
+ if (bs->bl.zone_size > capacity - offset) {
294
+ /* The zoned device allows the last smaller zone. */
295
+ len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
296
+ } else {
297
+ len = bs->bl.zone_size;
298
+ }
299
+ }
300
+
301
+ if (!check_zoned_request(s, offset, len, false, &err_status)) {
302
+ goto out;
303
+ }
304
+
305
+ blk_aio_zone_mgmt(s->blk, op, offset, len,
306
+ virtio_blk_zone_mgmt_complete, req);
307
+
308
+ return 0;
309
+out:
310
+ virtio_blk_req_complete(req, err_status);
311
+ virtio_blk_free_request(req);
312
+ return err_status;
313
+}
314
+
315
+static void virtio_blk_zone_append_complete(void *opaque, int ret)
316
+{
317
+ ZoneCmdData *data = opaque;
318
+ VirtIOBlockReq *req = data->req;
319
+ VirtIOBlock *s = req->dev;
320
+ VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
321
+ int64_t append_sector, n;
322
+ uint8_t err_status = VIRTIO_BLK_S_OK;
323
+
324
+ if (ret) {
325
+ err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
326
+ goto out;
327
+ }
328
+
329
+ virtio_stq_p(vdev, &append_sector,
330
+ data->zone_append_data.offset >> BDRV_SECTOR_BITS);
331
+ n = iov_from_buf(data->in_iov, data->in_num, 0, &append_sector,
332
+ sizeof(append_sector));
333
+ if (n != sizeof(append_sector)) {
334
+ virtio_error(vdev, "Driver provided input buffer less than size of "
335
+ "append_sector");
336
+ err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
337
+ goto out;
338
+ }
339
+
340
+out:
341
+ aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
342
+ virtio_blk_req_complete(req, err_status);
343
+ virtio_blk_free_request(req);
344
+ aio_context_release(blk_get_aio_context(s->conf.conf.blk));
345
+ g_free(data);
346
+}
347
+
348
+static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
349
+ struct iovec *out_iov,
350
+ struct iovec *in_iov,
351
+ uint64_t out_num,
352
+ unsigned in_num) {
353
+ VirtIOBlock *s = req->dev;
354
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
355
+ uint8_t err_status = VIRTIO_BLK_S_OK;
356
+
357
+ int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
358
+ int64_t len = iov_size(out_iov, out_num);
359
+
360
+ if (!check_zoned_request(s, offset, len, true, &err_status)) {
361
+ goto out;
362
+ }
363
+
364
+ ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
365
+ data->req = req;
366
+ data->in_iov = in_iov;
367
+ data->in_num = in_num;
368
+ data->zone_append_data.offset = offset;
369
+ qemu_iovec_init_external(&req->qiov, out_iov, out_num);
370
+ blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
371
+ virtio_blk_zone_append_complete, data);
372
+ return 0;
373
+
374
+out:
375
+ aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
376
+ virtio_blk_req_complete(req, err_status);
377
+ virtio_blk_free_request(req);
378
+ aio_context_release(blk_get_aio_context(s->conf.conf.blk));
379
+ return err_status;
380
+}
381
+
382
static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
383
{
384
uint32_t type;
385
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
386
case VIRTIO_BLK_T_FLUSH:
387
virtio_blk_handle_flush(req, mrb);
388
break;
389
+ case VIRTIO_BLK_T_ZONE_REPORT:
390
+ virtio_blk_handle_zone_report(req, in_iov, in_num);
391
+ break;
392
+ case VIRTIO_BLK_T_ZONE_OPEN:
393
+ virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
394
+ break;
395
+ case VIRTIO_BLK_T_ZONE_CLOSE:
396
+ virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
397
+ break;
398
+ case VIRTIO_BLK_T_ZONE_FINISH:
399
+ virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
400
+ break;
401
+ case VIRTIO_BLK_T_ZONE_RESET:
402
+ virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
403
+ break;
404
+ case VIRTIO_BLK_T_ZONE_RESET_ALL:
405
+ virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
406
+ break;
407
case VIRTIO_BLK_T_SCSI_CMD:
408
virtio_blk_handle_scsi(req);
409
break;
410
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
411
virtio_blk_free_request(req);
412
break;
33
}
413
}
414
+ case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
415
+ /*
416
+ * Pass out_iov/out_num and in_iov/in_num explicitly: it is not safe
417
+ * to access req->elem.out_sg directly because it may be
418
+ * modified by virtio_blk_handle_request().
419
+ */
420
+ virtio_blk_handle_zone_append(req, out_iov, in_iov, out_num, in_num);
421
+ break;
422
/*
423
* VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
424
* VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
425
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
426
{
427
VirtIOBlock *s = VIRTIO_BLK(vdev);
428
BlockConf *conf = &s->conf.conf;
429
+ BlockDriverState *bs = blk_bs(s->blk);
430
struct virtio_blk_config blkcfg;
431
uint64_t capacity;
432
int64_t length;
433
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
434
blkcfg.write_zeroes_may_unmap = 1;
435
virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
436
}
437
+ if (bs->bl.zoned != BLK_Z_NONE) {
438
+ switch (bs->bl.zoned) {
439
+ case BLK_Z_HM:
440
+ blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
441
+ break;
442
+ case BLK_Z_HA:
443
+ blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
444
+ break;
445
+ default:
446
+ g_assert_not_reached();
447
+ }
448
+
449
+ virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
450
+ bs->bl.zone_size / 512);
451
+ virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
452
+ bs->bl.max_active_zones);
453
+ virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
454
+ bs->bl.max_open_zones);
455
+ virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
456
+ virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
457
+ bs->bl.max_append_sectors);
458
+ } else {
459
+ blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
460
+ }
461
memcpy(config, &blkcfg, s->config_size);
34
}
462
}
35
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_sleep_ns_wakeable(QEMUClockType type, int64_t ns,
463
36
QemuCoSleepState **sleep_state)
464
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
37
{
465
return;
38
AioContext *ctx = qemu_get_current_aio_context();
39
+ QEMUTimer ts;
40
QemuCoSleepState state = {
41
.co = qemu_coroutine_self(),
42
.user_state_pointer = sleep_state,
43
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_sleep_ns_wakeable(QEMUClockType type, int64_t ns,
44
abort();
45
}
466
}
46
467
47
- aio_timer_init(ctx, &state.ts, type, SCALE_NS, co_sleep_cb, sleep_state);
468
+ BlockDriverState *bs = blk_bs(conf->conf.blk);
48
+ aio_timer_init(ctx, &ts, type, SCALE_NS, co_sleep_cb, sleep_state);
469
+ if (bs->bl.zoned != BLK_Z_NONE) {
49
*sleep_state = &state;
470
+ virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
50
- timer_mod(&state.ts, qemu_clock_get_ns(type) + ns);
471
+ if (bs->bl.zoned == BLK_Z_HM) {
51
+ timer_mod(&ts, qemu_clock_get_ns(type) + ns);
472
+ virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
52
qemu_coroutine_yield();
473
+ }
53
+ timer_del(&ts);
474
+ }
54
475
+
55
/* qemu_co_sleep_wake clears *sleep_state before resuming this coroutine. */
476
if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
56
assert(*sleep_state == NULL);
477
(!conf->max_discard_sectors ||
478
conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
479
diff --git a/hw/virtio/virtio-qmp.c b/hw/virtio/virtio-qmp.c
480
index XXXXXXX..XXXXXXX 100644
481
--- a/hw/virtio/virtio-qmp.c
482
+++ b/hw/virtio/virtio-qmp.c
483
@@ -XXX,XX +XXX,XX @@ static const qmp_virtio_feature_map_t virtio_blk_feature_map[] = {
484
"VIRTIO_BLK_F_DISCARD: Discard command supported"),
485
FEATURE_ENTRY(VIRTIO_BLK_F_WRITE_ZEROES, \
486
"VIRTIO_BLK_F_WRITE_ZEROES: Write zeroes command supported"),
487
+ FEATURE_ENTRY(VIRTIO_BLK_F_ZONED, \
488
+ "VIRTIO_BLK_F_ZONED: Zoned block devices"),
489
#ifndef VIRTIO_BLK_NO_LEGACY
490
FEATURE_ENTRY(VIRTIO_BLK_F_BARRIER, \
491
"VIRTIO_BLK_F_BARRIER: Request barriers supported"),
57
--
492
--
58
2.31.1
493
2.40.1
59
diff view generated by jsdifflib
New patch
1
From: Sam Li <faithilikerun@gmail.com>
1
2
3
To account for the new zone append write operation on zoned devices, the
4
BLOCK_ACCT_ZONE_APPEND enum value is introduced alongside the other I/O
5
request types (read, write, flush).
6
7
Signed-off-by: Sam Li <faithilikerun@gmail.com>
8
Message-id: 20230508051916.178322-3-faithilikerun@gmail.com
9
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
10
---
11
qapi/block-core.json | 68 ++++++++++++++++++++++++++++++++------
12
qapi/block.json | 4 +++
13
include/block/accounting.h | 1 +
14
block/qapi-sysemu.c | 11 ++++++
15
block/qapi.c | 18 ++++++++++
16
hw/block/virtio-blk.c | 4 +++
17
tests/qemu-iotests/227.out | 18 ++++++++++
18
7 files changed, 113 insertions(+), 11 deletions(-)
19
20
diff --git a/qapi/block-core.json b/qapi/block-core.json
21
index XXXXXXX..XXXXXXX 100644
22
--- a/qapi/block-core.json
23
+++ b/qapi/block-core.json
24
@@ -XXX,XX +XXX,XX @@
25
# @min_wr_latency_ns: Minimum latency of write operations in the
26
# defined interval, in nanoseconds.
27
#
28
+# @min_zone_append_latency_ns: Minimum latency of zone append operations
29
+# in the defined interval, in nanoseconds
30
+# (since 8.1)
31
+#
32
# @min_flush_latency_ns: Minimum latency of flush operations in the
33
# defined interval, in nanoseconds.
34
#
35
@@ -XXX,XX +XXX,XX @@
36
# @max_wr_latency_ns: Maximum latency of write operations in the
37
# defined interval, in nanoseconds.
38
#
39
+# @max_zone_append_latency_ns: Maximum latency of zone append operations
40
+# in the defined interval, in nanoseconds
41
+# (since 8.1)
42
+#
43
# @max_flush_latency_ns: Maximum latency of flush operations in the
44
# defined interval, in nanoseconds.
45
#
46
@@ -XXX,XX +XXX,XX @@
47
# @avg_wr_latency_ns: Average latency of write operations in the
48
# defined interval, in nanoseconds.
49
#
50
+# @avg_zone_append_latency_ns: Average latency of zone append operations
51
+# in the defined interval, in nanoseconds
52
+# (since 8.1)
53
+#
54
# @avg_flush_latency_ns: Average latency of flush operations in the
55
# defined interval, in nanoseconds.
56
#
57
@@ -XXX,XX +XXX,XX @@
58
# @avg_wr_queue_depth: Average number of pending write operations in
59
# the defined interval.
60
#
61
+# @avg_zone_append_queue_depth: Average number of pending zone append
62
+# operations in the defined interval
63
+# (since 8.1).
64
+#
65
# Since: 2.5
66
##
67
{ 'struct': 'BlockDeviceTimedStats',
68
'data': { 'interval_length': 'int', 'min_rd_latency_ns': 'int',
69
'max_rd_latency_ns': 'int', 'avg_rd_latency_ns': 'int',
70
'min_wr_latency_ns': 'int', 'max_wr_latency_ns': 'int',
71
- 'avg_wr_latency_ns': 'int', 'min_flush_latency_ns': 'int',
72
- 'max_flush_latency_ns': 'int', 'avg_flush_latency_ns': 'int',
73
- 'avg_rd_queue_depth': 'number', 'avg_wr_queue_depth': 'number' } }
74
+ 'avg_wr_latency_ns': 'int', 'min_zone_append_latency_ns': 'int',
75
+ 'max_zone_append_latency_ns': 'int',
76
+ 'avg_zone_append_latency_ns': 'int',
77
+ 'min_flush_latency_ns': 'int', 'max_flush_latency_ns': 'int',
78
+ 'avg_flush_latency_ns': 'int', 'avg_rd_queue_depth': 'number',
79
+ 'avg_wr_queue_depth': 'number',
80
+ 'avg_zone_append_queue_depth': 'number' } }
81
82
##
83
# @BlockDeviceStats:
84
@@ -XXX,XX +XXX,XX @@
85
#
86
# @wr_bytes: The number of bytes written by the device.
87
#
88
+# @zone_append_bytes: The number of bytes appended by the zoned devices
89
+# (since 8.1)
90
+#
91
# @unmap_bytes: The number of bytes unmapped by the device (Since 4.2)
92
#
93
# @rd_operations: The number of read operations performed by the
94
@@ -XXX,XX +XXX,XX @@
95
# @wr_operations: The number of write operations performed by the
96
# device.
97
#
98
+# @zone_append_operations: The number of zone append operations performed
99
+# by the zoned devices (since 8.1)
100
+#
101
# @flush_operations: The number of cache flush operations performed by
102
# the device (since 0.15)
103
#
104
@@ -XXX,XX +XXX,XX @@
105
# @wr_total_time_ns: Total time spent on writes in nanoseconds (since
106
# 0.15).
107
#
108
+# @zone_append_total_time_ns: Total time spent on zone append writes
109
+# in nanoseconds (since 8.1)
110
+#
111
# @flush_total_time_ns: Total time spent on cache flushes in
112
# nanoseconds (since 0.15).
113
#
114
@@ -XXX,XX +XXX,XX @@
115
# @wr_merged: Number of write requests that have been merged into
116
# another request (Since 2.3).
117
#
118
+# @zone_append_merged: Number of zone append requests that have been merged
119
+# into another request (since 8.1)
120
+#
121
# @unmap_merged: Number of unmap requests that have been merged into
122
# another request (Since 4.2)
123
#
124
@@ -XXX,XX +XXX,XX @@
125
# @failed_wr_operations: The number of failed write operations
126
# performed by the device (Since 2.5)
127
#
128
+# @failed_zone_append_operations: The number of failed zone append write
129
+# operations performed by the zoned devices
130
+# (since 8.1)
131
+#
132
# @failed_flush_operations: The number of failed flush operations
133
# performed by the device (Since 2.5)
134
#
135
@@ -XXX,XX +XXX,XX @@
136
# @invalid_wr_operations: The number of invalid write operations
137
# performed by the device (Since 2.5)
138
#
139
+# @invalid_zone_append_operations: The number of invalid zone append operations
140
+# performed by the zoned device (since 8.1)
141
+#
142
# @invalid_flush_operations: The number of invalid flush operations
143
# performed by the device (Since 2.5)
144
#
145
@@ -XXX,XX +XXX,XX @@
146
#
147
# @wr_latency_histogram: @BlockLatencyHistogramInfo. (Since 4.0)
148
#
149
+# @zone_append_latency_histogram: @BlockLatencyHistogramInfo. (since 8.1)
150
+#
151
# @flush_latency_histogram: @BlockLatencyHistogramInfo. (Since 4.0)
152
#
153
# Since: 0.14
154
##
155
{ 'struct': 'BlockDeviceStats',
156
- 'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'unmap_bytes' : 'int',
157
- 'rd_operations': 'int', 'wr_operations': 'int',
158
+ 'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'zone_append_bytes': 'int',
159
+ 'unmap_bytes' : 'int', 'rd_operations': 'int',
160
+ 'wr_operations': 'int', 'zone_append_operations': 'int',
161
'flush_operations': 'int', 'unmap_operations': 'int',
162
'rd_total_time_ns': 'int', 'wr_total_time_ns': 'int',
163
- 'flush_total_time_ns': 'int', 'unmap_total_time_ns': 'int',
164
- 'wr_highest_offset': 'int',
165
- 'rd_merged': 'int', 'wr_merged': 'int', 'unmap_merged': 'int',
166
- '*idle_time_ns': 'int',
167
+ 'zone_append_total_time_ns': 'int', 'flush_total_time_ns': 'int',
168
+ 'unmap_total_time_ns': 'int', 'wr_highest_offset': 'int',
169
+ 'rd_merged': 'int', 'wr_merged': 'int', 'zone_append_merged': 'int',
170
+ 'unmap_merged': 'int', '*idle_time_ns': 'int',
171
'failed_rd_operations': 'int', 'failed_wr_operations': 'int',
172
- 'failed_flush_operations': 'int', 'failed_unmap_operations': 'int',
173
- 'invalid_rd_operations': 'int', 'invalid_wr_operations': 'int',
174
+ 'failed_zone_append_operations': 'int',
175
+ 'failed_flush_operations': 'int',
176
+ 'failed_unmap_operations': 'int', 'invalid_rd_operations': 'int',
177
+ 'invalid_wr_operations': 'int',
178
+ 'invalid_zone_append_operations': 'int',
179
'invalid_flush_operations': 'int', 'invalid_unmap_operations': 'int',
180
'account_invalid': 'bool', 'account_failed': 'bool',
181
'timed_stats': ['BlockDeviceTimedStats'],
182
'*rd_latency_histogram': 'BlockLatencyHistogramInfo',
183
'*wr_latency_histogram': 'BlockLatencyHistogramInfo',
184
+ '*zone_append_latency_histogram': 'BlockLatencyHistogramInfo',
185
'*flush_latency_histogram': 'BlockLatencyHistogramInfo' } }
186
187
##
188
diff --git a/qapi/block.json b/qapi/block.json
189
index XXXXXXX..XXXXXXX 100644
190
--- a/qapi/block.json
191
+++ b/qapi/block.json
192
@@ -XXX,XX +XXX,XX @@
193
# @boundaries-write: list of interval boundary values for write
194
# latency histogram.
195
#
196
+# @boundaries-zap: list of interval boundary values for zone append write
197
+# latency histogram.
198
+#
199
# @boundaries-flush: list of interval boundary values for flush
200
# latency histogram.
201
#
202
@@ -XXX,XX +XXX,XX @@
203
'*boundaries': ['uint64'],
204
'*boundaries-read': ['uint64'],
205
'*boundaries-write': ['uint64'],
206
+ '*boundaries-zap': ['uint64'],
207
'*boundaries-flush': ['uint64'] },
208
'allow-preconfig': true }
209
diff --git a/include/block/accounting.h b/include/block/accounting.h
210
index XXXXXXX..XXXXXXX 100644
211
--- a/include/block/accounting.h
212
+++ b/include/block/accounting.h
213
@@ -XXX,XX +XXX,XX @@ enum BlockAcctType {
214
BLOCK_ACCT_READ,
215
BLOCK_ACCT_WRITE,
216
BLOCK_ACCT_FLUSH,
217
+ BLOCK_ACCT_ZONE_APPEND,
218
BLOCK_ACCT_UNMAP,
219
BLOCK_MAX_IOTYPE,
220
};
221
diff --git a/block/qapi-sysemu.c b/block/qapi-sysemu.c
222
index XXXXXXX..XXXXXXX 100644
223
--- a/block/qapi-sysemu.c
224
+++ b/block/qapi-sysemu.c
225
@@ -XXX,XX +XXX,XX @@ void qmp_block_latency_histogram_set(
226
bool has_boundaries, uint64List *boundaries,
227
bool has_boundaries_read, uint64List *boundaries_read,
228
bool has_boundaries_write, uint64List *boundaries_write,
229
+ bool has_boundaries_append, uint64List *boundaries_append,
230
bool has_boundaries_flush, uint64List *boundaries_flush,
231
Error **errp)
232
{
233
@@ -XXX,XX +XXX,XX @@ void qmp_block_latency_histogram_set(
234
}
235
}
236
237
+ if (has_boundaries || has_boundaries_append) {
238
+ ret = block_latency_histogram_set(
239
+ stats, BLOCK_ACCT_ZONE_APPEND,
240
+ has_boundaries_append ? boundaries_append : boundaries);
241
+ if (ret) {
242
+ error_setg(errp, "Device '%s' set append write boundaries fail", id);
243
+ return;
244
+ }
245
+ }
246
+
247
if (has_boundaries || has_boundaries_flush) {
248
ret = block_latency_histogram_set(
249
stats, BLOCK_ACCT_FLUSH,
250
diff --git a/block/qapi.c b/block/qapi.c
251
index XXXXXXX..XXXXXXX 100644
252
--- a/block/qapi.c
253
+++ b/block/qapi.c
254
@@ -XXX,XX +XXX,XX @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
255
256
ds->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
257
ds->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
258
+ ds->zone_append_bytes = stats->nr_bytes[BLOCK_ACCT_ZONE_APPEND];
259
ds->unmap_bytes = stats->nr_bytes[BLOCK_ACCT_UNMAP];
260
ds->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
261
ds->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
262
+ ds->zone_append_operations = stats->nr_ops[BLOCK_ACCT_ZONE_APPEND];
263
ds->unmap_operations = stats->nr_ops[BLOCK_ACCT_UNMAP];
264
265
ds->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
266
ds->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
267
+ ds->failed_zone_append_operations =
268
+ stats->failed_ops[BLOCK_ACCT_ZONE_APPEND];
269
ds->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
270
ds->failed_unmap_operations = stats->failed_ops[BLOCK_ACCT_UNMAP];
271
272
ds->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
273
ds->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
274
+ ds->invalid_zone_append_operations =
275
+ stats->invalid_ops[BLOCK_ACCT_ZONE_APPEND];
276
ds->invalid_flush_operations =
277
stats->invalid_ops[BLOCK_ACCT_FLUSH];
278
ds->invalid_unmap_operations = stats->invalid_ops[BLOCK_ACCT_UNMAP];
279
280
ds->rd_merged = stats->merged[BLOCK_ACCT_READ];
281
ds->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
282
+ ds->zone_append_merged = stats->merged[BLOCK_ACCT_ZONE_APPEND];
283
ds->unmap_merged = stats->merged[BLOCK_ACCT_UNMAP];
284
ds->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
285
ds->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
286
+ ds->zone_append_total_time_ns =
287
+ stats->total_time_ns[BLOCK_ACCT_ZONE_APPEND];
288
ds->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
289
ds->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
290
ds->unmap_total_time_ns = stats->total_time_ns[BLOCK_ACCT_UNMAP];
291
@@ -XXX,XX +XXX,XX @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
292
293
TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
294
TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
295
+ TimedAverage *zap = &ts->latency[BLOCK_ACCT_ZONE_APPEND];
296
TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];
297
298
dev_stats->interval_length = ts->interval_length;
299
@@ -XXX,XX +XXX,XX @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
300
dev_stats->max_wr_latency_ns = timed_average_max(wr);
301
dev_stats->avg_wr_latency_ns = timed_average_avg(wr);
302
303
+ dev_stats->min_zone_append_latency_ns = timed_average_min(zap);
304
+ dev_stats->max_zone_append_latency_ns = timed_average_max(zap);
305
+ dev_stats->avg_zone_append_latency_ns = timed_average_avg(zap);
306
+
307
dev_stats->min_flush_latency_ns = timed_average_min(fl);
308
dev_stats->max_flush_latency_ns = timed_average_max(fl);
309
dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
310
@@ -XXX,XX +XXX,XX @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
311
block_acct_queue_depth(ts, BLOCK_ACCT_READ);
312
dev_stats->avg_wr_queue_depth =
313
block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
314
+ dev_stats->avg_zone_append_queue_depth =
315
+ block_acct_queue_depth(ts, BLOCK_ACCT_ZONE_APPEND);
316
317
QAPI_LIST_PREPEND(ds->timed_stats, dev_stats);
318
}
319
@@ -XXX,XX +XXX,XX @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
320
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_READ]);
321
ds->wr_latency_histogram
322
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_WRITE]);
323
+ ds->zone_append_latency_histogram
324
+ = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_ZONE_APPEND]);
325
ds->flush_latency_histogram
326
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_FLUSH]);
327
}
328
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
329
index XXXXXXX..XXXXXXX 100644
330
--- a/hw/block/virtio-blk.c
331
+++ b/hw/block/virtio-blk.c
332
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
333
data->in_num = in_num;
334
data->zone_append_data.offset = offset;
335
qemu_iovec_init_external(&req->qiov, out_iov, out_num);
336
+
337
+ block_acct_start(blk_get_stats(s->blk), &req->acct, len,
338
+ BLOCK_ACCT_ZONE_APPEND);
339
+
340
blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
341
virtio_blk_zone_append_complete, data);
342
return 0;
343
diff --git a/tests/qemu-iotests/227.out b/tests/qemu-iotests/227.out
344
index XXXXXXX..XXXXXXX 100644
345
--- a/tests/qemu-iotests/227.out
346
+++ b/tests/qemu-iotests/227.out
347
@@ -XXX,XX +XXX,XX @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio
348
"stats": {
349
"unmap_operations": 0,
350
"unmap_merged": 0,
351
+ "failed_zone_append_operations": 0,
352
"flush_total_time_ns": 0,
353
"wr_highest_offset": 0,
354
"wr_total_time_ns": 0,
355
@@ -XXX,XX +XXX,XX @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio
356
"timed_stats": [
357
],
358
"failed_unmap_operations": 0,
359
+ "zone_append_merged": 0,
360
"failed_flush_operations": 0,
361
"account_invalid": true,
362
"rd_total_time_ns": 0,
363
@@ -XXX,XX +XXX,XX @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio
364
"unmap_total_time_ns": 0,
365
"invalid_flush_operations": 0,
366
"account_failed": true,
367
+ "zone_append_total_time_ns": 0,
368
+ "zone_append_operations": 0,
369
"rd_operations": 0,
370
+ "zone_append_bytes": 0,
371
+ "invalid_zone_append_operations": 0,
372
"invalid_wr_operations": 0,
373
"invalid_rd_operations": 0
374
},
375
@@ -XXX,XX +XXX,XX @@ Testing: -drive driver=null-co,if=none
376
"stats": {
377
"unmap_operations": 0,
378
"unmap_merged": 0,
379
+ "failed_zone_append_operations": 0,
380
"flush_total_time_ns": 0,
381
"wr_highest_offset": 0,
382
"wr_total_time_ns": 0,
383
@@ -XXX,XX +XXX,XX @@ Testing: -drive driver=null-co,if=none
384
"timed_stats": [
385
],
386
"failed_unmap_operations": 0,
387
+ "zone_append_merged": 0,
388
"failed_flush_operations": 0,
389
"account_invalid": true,
390
"rd_total_time_ns": 0,
391
@@ -XXX,XX +XXX,XX @@ Testing: -drive driver=null-co,if=none
392
"unmap_total_time_ns": 0,
393
"invalid_flush_operations": 0,
394
"account_failed": true,
395
+ "zone_append_total_time_ns": 0,
396
+ "zone_append_operations": 0,
397
"rd_operations": 0,
398
+ "zone_append_bytes": 0,
399
+ "invalid_zone_append_operations": 0,
400
"invalid_wr_operations": 0,
401
"invalid_rd_operations": 0
402
},
403
@@ -XXX,XX +XXX,XX @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b
404
"stats": {
405
"unmap_operations": 0,
406
"unmap_merged": 0,
407
+ "failed_zone_append_operations": 0,
408
"flush_total_time_ns": 0,
409
"wr_highest_offset": 0,
410
"wr_total_time_ns": 0,
411
@@ -XXX,XX +XXX,XX @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b
412
"timed_stats": [
413
],
414
"failed_unmap_operations": 0,
415
+ "zone_append_merged": 0,
416
"failed_flush_operations": 0,
417
"account_invalid": true,
418
"rd_total_time_ns": 0,
419
@@ -XXX,XX +XXX,XX @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b
420
"unmap_total_time_ns": 0,
421
"invalid_flush_operations": 0,
422
"account_failed": true,
423
+ "zone_append_total_time_ns": 0,
424
+ "zone_append_operations": 0,
425
"rd_operations": 0,
426
+ "zone_append_bytes": 0,
427
+ "invalid_zone_append_operations": 0,
428
"invalid_wr_operations": 0,
429
"invalid_rd_operations": 0
430
},
431
--
432
2.40.1
diff view generated by jsdifflib
1
From: Philippe Mathieu-Daudé <philmd@redhat.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
Document that the following functions return the bitmap size
3
Signed-off-by: Sam Li <faithilikerun@gmail.com>
4
if no matching bit is found:
5
6
- find_first_bit
7
- find_next_bit
8
- find_last_bit
9
- find_first_zero_bit
10
- find_next_zero_bit
11
12
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
13
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
14
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
4
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
15
Message-id: 20210510200758.2623154-2-philmd@redhat.com
5
Message-id: 20230508051916.178322-4-faithilikerun@gmail.com
16
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
6
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
17
---
7
---
18
include/qemu/bitops.h | 15 ++++++++++++---
8
hw/block/virtio-blk.c | 12 ++++++++++++
19
1 file changed, 12 insertions(+), 3 deletions(-)
9
hw/block/trace-events | 7 +++++++
10
2 files changed, 19 insertions(+)
20
11
21
diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h
12
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
22
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
23
--- a/include/qemu/bitops.h
14
--- a/hw/block/virtio-blk.c
24
+++ b/include/qemu/bitops.h
15
+++ b/hw/block/virtio-blk.c
25
@@ -XXX,XX +XXX,XX @@ static inline int test_bit(long nr, const unsigned long *addr)
16
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_zone_report_complete(void *opaque, int ret)
26
* @addr: The address to start the search at
17
int64_t nz = data->zone_report_data.nr_zones;
27
* @size: The maximum size to search
18
int8_t err_status = VIRTIO_BLK_S_OK;
28
*
19
29
- * Returns the bit number of the first set bit, or size.
20
+ trace_virtio_blk_zone_report_complete(vdev, req, nz, ret);
30
+ * Returns the bit number of the last set bit,
21
if (ret) {
31
+ * or @size if there is no set bit in the bitmap.
22
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
32
*/
23
goto out;
33
unsigned long find_last_bit(const unsigned long *addr,
24
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_handle_zone_report(VirtIOBlockReq *req,
34
unsigned long size);
25
nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
35
@@ -XXX,XX +XXX,XX @@ unsigned long find_last_bit(const unsigned long *addr,
26
sizeof(struct virtio_blk_zone_report)) /
36
* @addr: The address to base the search on
27
sizeof(struct virtio_blk_zone_descriptor);
37
* @offset: The bitnumber to start searching at
28
+ trace_virtio_blk_handle_zone_report(vdev, req,
38
* @size: The bitmap size in bits
29
+ offset >> BDRV_SECTOR_BITS, nr_zones);
39
+ *
30
40
+ * Returns the bit number of the next set bit,
31
zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
41
+ * or @size if there are no further set bits in the bitmap.
32
data = g_malloc(sizeof(ZoneCmdData));
42
*/
33
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
43
unsigned long find_next_bit(const unsigned long *addr,
34
{
44
unsigned long size,
35
VirtIOBlockReq *req = opaque;
45
@@ -XXX,XX +XXX,XX @@ unsigned long find_next_bit(const unsigned long *addr,
36
VirtIOBlock *s = req->dev;
46
* @addr: The address to base the search on
37
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
47
* @offset: The bitnumber to start searching at
38
int8_t err_status = VIRTIO_BLK_S_OK;
48
* @size: The bitmap size in bits
39
+ trace_virtio_blk_zone_mgmt_complete(vdev, req,ret);
49
+ *
40
50
+ * Returns the bit number of the next cleared bit,
41
if (ret) {
51
+ * or @size if there are no further clear bits in the bitmap.
42
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
52
*/
43
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
53
44
/* Entire drive capacity */
54
unsigned long find_next_zero_bit(const unsigned long *addr,
45
offset = 0;
55
@@ -XXX,XX +XXX,XX @@ unsigned long find_next_zero_bit(const unsigned long *addr,
46
len = capacity;
56
* @addr: The address to start the search at
47
+ trace_virtio_blk_handle_zone_reset_all(vdev, req, 0,
57
* @size: The maximum size to search
48
+ bs->total_sectors);
58
*
49
} else {
59
- * Returns the bit number of the first set bit.
50
if (bs->bl.zone_size > capacity - offset) {
60
+ * Returns the bit number of the first set bit,
51
/* The zoned device allows the last smaller zone. */
61
+ * or @size if there is no set bit in the bitmap.
52
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
62
*/
53
} else {
63
static inline unsigned long find_first_bit(const unsigned long *addr,
54
len = bs->bl.zone_size;
64
unsigned long size)
55
}
65
@@ -XXX,XX +XXX,XX @@ static inline unsigned long find_first_bit(const unsigned long *addr,
56
+ trace_virtio_blk_handle_zone_mgmt(vdev, req, op,
66
* @addr: The address to start the search at
57
+ offset >> BDRV_SECTOR_BITS,
67
* @size: The maximum size to search
58
+ len >> BDRV_SECTOR_BITS);
68
*
59
}
69
- * Returns the bit number of the first cleared bit.
60
70
+ * Returns the bit number of the first cleared bit,
61
if (!check_zoned_request(s, offset, len, false, &err_status)) {
71
+ * or @size if there is no clear bit in the bitmap.
62
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_zone_append_complete(void *opaque, int ret)
72
*/
63
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
73
static inline unsigned long find_first_zero_bit(const unsigned long *addr,
64
goto out;
74
unsigned long size)
65
}
66
+ trace_virtio_blk_zone_append_complete(vdev, req, append_sector, ret);
67
68
out:
69
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
70
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
71
int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
72
int64_t len = iov_size(out_iov, out_num);
73
74
+ trace_virtio_blk_handle_zone_append(vdev, req, offset >> BDRV_SECTOR_BITS);
75
if (!check_zoned_request(s, offset, len, true, &err_status)) {
76
goto out;
77
}
78
diff --git a/hw/block/trace-events b/hw/block/trace-events
79
index XXXXXXX..XXXXXXX 100644
80
--- a/hw/block/trace-events
81
+++ b/hw/block/trace-events
82
@@ -XXX,XX +XXX,XX @@ pflash_write_unknown(const char *name, uint8_t cmd) "%s: unknown command 0x%02x"
83
# virtio-blk.c
84
virtio_blk_req_complete(void *vdev, void *req, int status) "vdev %p req %p status %d"
85
virtio_blk_rw_complete(void *vdev, void *req, int ret) "vdev %p req %p ret %d"
86
+virtio_blk_zone_report_complete(void *vdev, void *req, unsigned int nr_zones, int ret) "vdev %p req %p nr_zones %u ret %d"
87
+virtio_blk_zone_mgmt_complete(void *vdev, void *req, int ret) "vdev %p req %p ret %d"
88
+virtio_blk_zone_append_complete(void *vdev, void *req, int64_t sector, int ret) "vdev %p req %p, append sector 0x%" PRIx64 " ret %d"
89
virtio_blk_handle_write(void *vdev, void *req, uint64_t sector, size_t nsectors) "vdev %p req %p sector %"PRIu64" nsectors %zu"
90
virtio_blk_handle_read(void *vdev, void *req, uint64_t sector, size_t nsectors) "vdev %p req %p sector %"PRIu64" nsectors %zu"
91
virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint64_t offset, size_t size, bool is_write) "vdev %p mrb %p start %d num_reqs %d offset %"PRIu64" size %zu is_write %d"
92
+virtio_blk_handle_zone_report(void *vdev, void *req, int64_t sector, unsigned int nr_zones) "vdev %p req %p sector 0x%" PRIx64 " nr_zones %u"
93
+virtio_blk_handle_zone_mgmt(void *vdev, void *req, uint8_t op, int64_t sector, int64_t len) "vdev %p req %p op 0x%x sector 0x%" PRIx64 " len 0x%" PRIx64 ""
94
+virtio_blk_handle_zone_reset_all(void *vdev, void *req, int64_t sector, int64_t len) "vdev %p req %p sector 0x%" PRIx64 " cap 0x%" PRIx64 ""
95
+virtio_blk_handle_zone_append(void *vdev, void *req, int64_t sector) "vdev %p req %p, append sector 0x%" PRIx64 ""
96
97
# hd-geometry.c
98
hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
75
--
99
--
76
2.31.1
100
2.40.1
77
diff view generated by jsdifflib
1
From: Zenghui Yu <yuzenghui@huawei.com>
1
From: Sam Li <faithilikerun@gmail.com>
2
2
3
Quote docs/devel/style.rst (section "Automatic memory deallocation"):
3
Add documentation with an example of using the virtio-blk driver
4
to pass zoned block devices through to the guest.
4
5
5
* Variables declared with g_auto* MUST always be initialized,
6
Signed-off-by: Sam Li <faithilikerun@gmail.com>
6
otherwise the cleanup function will use uninitialized stack memory
7
Message-id: 20230508051916.178322-5-faithilikerun@gmail.com
7
8
[Fix pre-formatted code syntax
8
Initialize @name properly to get rid of the compilation error (using
9
--Stefan]
9
gcc-7.3.0 on CentOS):
10
11
../hw/remote/proxy.c: In function 'pci_proxy_dev_realize':
12
/usr/include/glib-2.0/glib/glib-autocleanups.h:28:3: error: 'name' may be used uninitialized in this function [-Werror=maybe-uninitialized]
13
g_free (*pp);
14
^~~~~~~~~~~~
15
../hw/remote/proxy.c:350:30: note: 'name' was declared here
16
g_autofree char *name;
17
^~~~
18
19
Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
20
Reviewed-by: Jagannathan Raman <jag.raman@oracle.com>
21
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
22
Reviewed-by: Miroslav Rezanina <mrezanin@redhat.com>
23
Message-id: 20210312112143.1369-1-yuzenghui@huawei.com
24
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
25
---
11
---
26
hw/remote/memory.c | 5 ++---
12
docs/devel/zoned-storage.rst | 19 +++++++++++++++++++
27
hw/remote/proxy.c | 3 +--
13
1 file changed, 19 insertions(+)
28
2 files changed, 3 insertions(+), 5 deletions(-)
29
14
30
diff --git a/hw/remote/memory.c b/hw/remote/memory.c
15
diff --git a/docs/devel/zoned-storage.rst b/docs/devel/zoned-storage.rst
31
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
32
--- a/hw/remote/memory.c
17
--- a/docs/devel/zoned-storage.rst
33
+++ b/hw/remote/memory.c
18
+++ b/docs/devel/zoned-storage.rst
34
@@ -XXX,XX +XXX,XX @@ void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp)
19
@@ -XXX,XX +XXX,XX @@ APIs for zoned storage emulation or testing.
35
20
For example, to test zone_report on a null_blk device using qemu-io is::
36
remote_sysmem_reset();
21
37
22
$ path/to/qemu-io --image-opts -n driver=host_device,filename=/dev/nullb0 -c "zrp offset nr_zones"
38
- for (region = 0; region < msg->num_fds; region++) {
23
+
39
- g_autofree char *name;
24
+To expose the host's zoned block device through virtio-blk, the command line
40
+ for (region = 0; region < msg->num_fds; region++, suffix++) {
25
+can be (includes the -device parameter)::
41
+ g_autofree char *name = g_strdup_printf("remote-mem-%u", suffix);
26
+
42
subregion = g_new(MemoryRegion, 1);
27
+ -blockdev node-name=drive0,driver=host_device,filename=/dev/nullb0,cache.direct=on \
43
- name = g_strdup_printf("remote-mem-%u", suffix++);
28
+ -device virtio-blk-pci,drive=drive0
44
memory_region_init_ram_from_fd(subregion, NULL,
29
+
45
name, sysmem_info->sizes[region],
30
+Or only use the -drive parameter::
46
true, msg->fds[region],
31
+
47
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
32
+ -drive driver=host_device,file=/dev/nullb0,if=virtio,cache.direct=on
48
index XXXXXXX..XXXXXXX 100644
33
+
49
--- a/hw/remote/proxy.c
34
+Additionally, QEMU has several ways of supporting zoned storage, including:
50
+++ b/hw/remote/proxy.c
35
+(1) Using virtio-scsi: --device scsi-block allows for the passing through of
51
@@ -XXX,XX +XXX,XX @@ static void probe_pci_info(PCIDevice *dev, Error **errp)
36
+SCSI ZBC devices, enabling the attachment of ZBC or ZAC HDDs to QEMU.
52
PCI_BASE_ADDRESS_SPACE_IO : PCI_BASE_ADDRESS_SPACE_MEMORY;
37
+(2) PCI device pass-through: While NVMe ZNS emulation is available for testing
53
38
+purposes, it cannot yet pass through a zoned device from the host. To pass on
54
if (size) {
39
+the NVMe ZNS device to the guest, use VFIO PCI to pass the entire NVMe PCI adapter
55
- g_autofree char *name;
40
+through to the guest. Likewise, an HDD HBA can be passed on to QEMU along with all HDDs
56
+ g_autofree char *name = g_strdup_printf("bar-region-%d", i);
41
+attached to the HBA.
57
pdev->region[i].dev = pdev;
58
pdev->region[i].present = true;
59
if (type == PCI_BASE_ADDRESS_SPACE_MEMORY) {
60
pdev->region[i].memory = true;
61
}
62
- name = g_strdup_printf("bar-region-%d", i);
63
memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev),
64
&proxy_mr_ops, &pdev->region[i],
65
name, size);
66
--
42
--
67
2.31.1
43
2.40.1
68
diff view generated by jsdifflib