1
The following changes since commit 36f87b4513373b3cd79c87c9197d17face95d4ac:
1
The following changes since commit 3521ade3510eb5cefb2e27a101667f25dad89935:
2
2
3
Merge remote-tracking branch 'remotes/dgibson/tags/ppc-for-2.10-20170630' into staging (2017-06-30 11:58:49 +0100)
3
Merge remote-tracking branch 'remotes/thuth-gitlab/tags/pull-request-2021-07-29' into staging (2021-07-29 13:17:20 +0100)
4
4
5
are available in the git repository at:
5
are available in the Git repository at:
6
6
7
git://github.com/famz/qemu.git tags/block-pull-request
7
https://gitlab.com/stefanha/qemu.git tags/block-pull-request
8
8
9
for you to fetch changes up to c61e684e44272f2acb2bef34cf2aa234582a73a9:
9
for you to fetch changes up to cc8eecd7f105a1dff5876adeb238a14696061a4a:
10
10
11
block: Exploit BDRV_BLOCK_EOF for larger zero blocks (2017-06-30 21:48:06 +0800)
11
MAINTAINERS: Added myself as a reviewer for the NVMe Block Driver (2021-07-29 17:17:34 +0100)
12
13
----------------------------------------------------------------
14
Pull request
15
16
The main fix here is for io_uring. Spurious -EAGAIN errors can happen and the
17
request needs to be resubmitted.
18
19
The MAINTAINERS changes carry no risk and we might as well include them in QEMU
20
6.1.
12
21
13
----------------------------------------------------------------
22
----------------------------------------------------------------
14
23
15
Hi Peter,
24
Fabian Ebner (1):
25
block/io_uring: resubmit when result is -EAGAIN
16
26
17
Here are Eric Blake's enhancements to the block layer API. Thanks!
27
Philippe Mathieu-Daudé (1):
28
MAINTAINERS: Added myself as a reviewer for the NVMe Block Driver
18
29
19
----------------------------------------------------------------
30
Stefano Garzarella (1):
31
MAINTAINERS: add Stefano Garzarella as io_uring reviewer
20
32
21
Eric Blake (2):
33
MAINTAINERS | 2 ++
22
block: Add BDRV_BLOCK_EOF to bdrv_get_block_status()
34
block/io_uring.c | 16 +++++++++++++++-
23
block: Exploit BDRV_BLOCK_EOF for larger zero blocks
35
2 files changed, 17 insertions(+), 1 deletion(-)
24
25
block/io.c | 42 +++++++++++++++++++++++++++++++++---------
26
include/block/block.h | 2 ++
27
tests/qemu-iotests/154 | 4 ----
28
tests/qemu-iotests/154.out | 12 ++++++------
29
4 files changed, 41 insertions(+), 19 deletions(-)
30
36
31
--
37
--
32
2.9.4
38
2.31.1
33
39
34
diff view generated by jsdifflib
New patch
1
From: Stefano Garzarella <sgarzare@redhat.com>
1
2
3
I've been working with io_uring for a while so I'd like to help
4
with reviews.
5
6
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
7
Message-Id: <20210728131515.131045-1-sgarzare@redhat.com>
8
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
9
---
10
MAINTAINERS | 1 +
11
1 file changed, 1 insertion(+)
12
13
diff --git a/MAINTAINERS b/MAINTAINERS
14
index XXXXXXX..XXXXXXX 100644
15
--- a/MAINTAINERS
16
+++ b/MAINTAINERS
17
@@ -XXX,XX +XXX,XX @@ Linux io_uring
18
M: Aarushi Mehta <mehta.aaru20@gmail.com>
19
M: Julia Suvorova <jusual@redhat.com>
20
M: Stefan Hajnoczi <stefanha@redhat.com>
21
+R: Stefano Garzarella <sgarzare@redhat.com>
22
L: qemu-block@nongnu.org
23
S: Maintained
24
F: block/io_uring.c
25
--
26
2.31.1
27
diff view generated by jsdifflib
1
From: Eric Blake <eblake@redhat.com>
1
From: Fabian Ebner <f.ebner@proxmox.com>
2
2
3
When we have a BDS with unallocated clusters, but asking the status
3
Linux SCSI can throw spurious -EAGAIN in some corner cases in its
4
of its underlying bs->file or backing layer encounters an end-of-file
4
completion path, which will end up being the result in the completed
5
condition, we know that the rest of the unallocated area will read as
5
io_uring request.
6
zeroes. However, pre-patch, this required two separate calls to
7
bdrv_get_block_status(), as the first call stops at the point where
8
the underlying file ends. Thanks to BDRV_BLOCK_EOF, we can now widen
9
the results of the primary status if the secondary status already
10
includes BDRV_BLOCK_ZERO.
11
6
12
In turn, this fixes a TODO mentioned in iotest 154, where we can now
7
Resubmitting such requests should allow block jobs to complete, even
13
see that all sectors in a partial cluster at the end of a file read
8
if such spurious errors are encountered.
14
as zero when coupling the shorter backing file's status along with our
15
knowledge that the remaining sectors came from an unallocated cluster.
16
9
17
Also, note that the loop in bdrv_co_get_block_status_above() had an
10
Co-authored-by: Stefan Hajnoczi <stefanha@gmail.com>
18
inefficient exit: in cases where the active layer sets BDRV_BLOCK_ZERO
11
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
19
but does NOT set BDRV_BLOCK_ALLOCATED (namely, where we know we read
12
Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
20
zeroes merely because our unallocated clusters lie beyond the backing
13
Message-id: 20210729091029.65369-1-f.ebner@proxmox.com
21
file's shorter length), we still ended up probing the backing layer
14
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
22
even though we already had a good answer.
15
---
16
block/io_uring.c | 16 +++++++++++++++-
17
1 file changed, 15 insertions(+), 1 deletion(-)
23
18
24
Signed-off-by: Eric Blake <eblake@redhat.com>
19
diff --git a/block/io_uring.c b/block/io_uring.c
25
Message-Id: <20170505021500.19315-3-eblake@redhat.com>
20
index XXXXXXX..XXXXXXX 100644
26
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
21
--- a/block/io_uring.c
27
Signed-off-by: Fam Zheng <famz@redhat.com>
22
+++ b/block/io_uring.c
28
---
23
@@ -XXX,XX +XXX,XX @@ static void luring_process_completions(LuringState *s)
29
block/io.c | 27 ++++++++++++++++++++++-----
24
total_bytes = ret + luringcb->total_read;
30
tests/qemu-iotests/154 | 4 ----
25
31
tests/qemu-iotests/154.out | 12 ++++++------
26
if (ret < 0) {
32
3 files changed, 28 insertions(+), 15 deletions(-)
27
- if (ret == -EINTR) {
28
+ /*
29
+ * Only writev/readv/fsync requests on regular files or host block
30
+ * devices are submitted. Therefore -EAGAIN is not expected but it's
31
+ * known to happen sometimes with Linux SCSI. Submit again and hope
32
+ * the request completes successfully.
33
+ *
34
+ * For more information, see:
35
+ * https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u
36
+ *
37
+ * If the code is changed to submit other types of requests in the
38
+ * future, then this workaround may need to be extended to deal with
39
+ * genuine -EAGAIN results that should not be resubmitted
40
+ * immediately.
41
+ */
42
+ if (ret == -EINTR || ret == -EAGAIN) {
43
luring_resubmit(s, luringcb);
44
continue;
45
}
46
--
47
2.31.1
33
48
34
diff --git a/block/io.c b/block/io.c
35
index XXXXXXX..XXXXXXX 100644
36
--- a/block/io.c
37
+++ b/block/io.c
38
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
39
/* Ignore errors. This is just providing extra information, it
40
* is useful but not necessary.
41
*/
42
- if (!file_pnum) {
43
- /* !file_pnum indicates an offset at or beyond the EOF; it is
44
- * perfectly valid for the format block driver to point to such
45
- * offsets, so catch it and mark everything as zero */
46
+ if (ret2 & BDRV_BLOCK_EOF &&
47
+ (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
48
+ /*
49
+ * It is valid for the format block driver to read
50
+ * beyond the end of the underlying file's current
51
+ * size; such areas read as zero.
52
+ */
53
ret |= BDRV_BLOCK_ZERO;
54
} else {
55
/* Limit request to the range reported by the protocol driver */
56
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
57
{
58
BlockDriverState *p;
59
int64_t ret = 0;
60
+ bool first = true;
61
62
assert(bs != base);
63
for (p = bs; p != base; p = backing_bs(p)) {
64
ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
65
- if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
66
+ if (ret < 0) {
67
+ break;
68
+ }
69
+ if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
70
+ /*
71
+ * Reading beyond the end of the file continues to read
72
+ * zeroes, but we can only widen the result to the
73
+ * unallocated length we learned from an earlier
74
+ * iteration.
75
+ */
76
+ *pnum = nb_sectors;
77
+ }
78
+ if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
79
break;
80
}
81
/* [sector_num, pnum] unallocated on this layer, which could be only
82
* the first part of [sector_num, nb_sectors]. */
83
nb_sectors = MIN(nb_sectors, *pnum);
84
+ first = false;
85
}
86
return ret;
87
}
88
diff --git a/tests/qemu-iotests/154 b/tests/qemu-iotests/154
89
index XXXXXXX..XXXXXXX 100755
90
--- a/tests/qemu-iotests/154
91
+++ b/tests/qemu-iotests/154
92
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
93
$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
94
95
# Repeat with backing file holding unallocated cluster.
96
-# TODO: Note that this forces an allocation, because we aren't yet able to
97
-# quickly detect that reads beyond EOF of the backing file are always zero
98
CLUSTER_SIZE=2048 TEST_IMG="$TEST_IMG.base" _make_test_img $((size + 1024))
99
100
# Write at the front: sector-wise, the request is:
101
@@ -XXX,XX +XXX,XX @@ $QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
102
$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
103
104
# Repeat with backing file holding zero'd cluster
105
-# TODO: Note that this forces an allocation, because we aren't yet able to
106
-# quickly detect that reads beyond EOF of the backing file are always zero
107
$QEMU_IO -c "write -z $size 512" "$TEST_IMG.base" | _filter_qemu_io
108
109
# Write at the front: sector-wise, the request is:
110
diff --git a/tests/qemu-iotests/154.out b/tests/qemu-iotests/154.out
111
index XXXXXXX..XXXXXXX 100644
112
--- a/tests/qemu-iotests/154.out
113
+++ b/tests/qemu-iotests/154.out
114
@@ -XXX,XX +XXX,XX @@ wrote 512/512 bytes at offset 134217728
115
512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
116
2048/2048 bytes allocated at offset 128 MiB
117
[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
118
-{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
119
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": true, "data": false}]
120
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
121
wrote 512/512 bytes at offset 134219264
122
512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
123
2048/2048 bytes allocated at offset 128 MiB
124
[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
125
-{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
126
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": true, "data": false}]
127
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
128
wrote 1024/1024 bytes at offset 134218240
129
1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
130
2048/2048 bytes allocated at offset 128 MiB
131
[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
132
-{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
133
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": true, "data": false}]
134
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
135
wrote 2048/2048 bytes at offset 134217728
136
2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
137
@@ -XXX,XX +XXX,XX @@ wrote 512/512 bytes at offset 134217728
138
512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
139
2048/2048 bytes allocated at offset 128 MiB
140
[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
141
-{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
142
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": true, "data": false}]
143
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
144
wrote 512/512 bytes at offset 134219264
145
512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
146
2048/2048 bytes allocated at offset 128 MiB
147
[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
148
-{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
149
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": true, "data": false}]
150
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
151
wrote 1024/1024 bytes at offset 134218240
152
1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
153
2048/2048 bytes allocated at offset 128 MiB
154
[{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
155
-{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
156
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": true, "data": false}]
157
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base
158
wrote 2048/2048 bytes at offset 134217728
159
2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
160
--
161
2.9.4
162
163
diff view generated by jsdifflib
1
From: Eric Blake <eblake@redhat.com>
1
From: Philippe Mathieu-Daudé <philmd@redhat.com>
2
2
3
Just as the block layer already sets BDRV_BLOCK_ALLOCATED as a
3
I'm interested in following the activity around the NVMe bdrv.
4
shortcut for subsequent operations, there are also some optimizations
5
that are made easier if we can quickly tell that *pnum will advance
6
us to the end of a file, via a new BDRV_BLOCK_EOF which gets set
7
by the block layer.
8
4
9
This just plumbs up the new bit; subsequent patches will make use
5
Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
10
of it.
6
Message-id: 20210728183340.2018313-1-philmd@redhat.com
7
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
8
---
9
MAINTAINERS | 1 +
10
1 file changed, 1 insertion(+)
11
11
12
Signed-off-by: Eric Blake <eblake@redhat.com>
12
diff --git a/MAINTAINERS b/MAINTAINERS
13
Message-Id: <20170505021500.19315-2-eblake@redhat.com>
13
index XXXXXXX..XXXXXXX 100644
14
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
14
--- a/MAINTAINERS
15
Signed-off-by: Fam Zheng <famz@redhat.com>
15
+++ b/MAINTAINERS
16
---
16
@@ -XXX,XX +XXX,XX @@ F: block/null.c
17
block/io.c | 15 +++++++++++----
17
NVMe Block Driver
18
include/block/block.h | 2 ++
18
M: Stefan Hajnoczi <stefanha@redhat.com>
19
2 files changed, 13 insertions(+), 4 deletions(-)
19
R: Fam Zheng <fam@euphon.net>
20
+R: Philippe Mathieu-Daudé <philmd@redhat.com>
21
L: qemu-block@nongnu.org
22
S: Supported
23
F: block/nvme*
24
--
25
2.31.1
20
26
21
diff --git a/block/io.c b/block/io.c
22
index XXXXXXX..XXXXXXX 100644
23
--- a/block/io.c
24
+++ b/block/io.c
25
@@ -XXX,XX +XXX,XX @@ typedef struct BdrvCoGetBlockStatusData {
26
* Drivers not implementing the functionality are assumed to not support
27
* backing files, hence all their sectors are reported as allocated.
28
*
29
- * If 'sector_num' is beyond the end of the disk image the return value is 0
30
- * and 'pnum' is set to 0.
31
+ * If 'sector_num' is beyond the end of the disk image the return value is
32
+ * BDRV_BLOCK_EOF and 'pnum' is set to 0.
33
*
34
* 'pnum' is set to the number of sectors (including and immediately following
35
* the specified sector) that are known to be in the same
36
* allocated/unallocated state.
37
*
38
* 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
39
- * beyond the end of the disk image it will be clamped.
40
+ * beyond the end of the disk image it will be clamped; if 'pnum' is set to
41
+ * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
42
*
43
* If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file'
44
* points to the BDS which the sector range is allocated in.
45
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
46
47
if (sector_num >= total_sectors) {
48
*pnum = 0;
49
- return 0;
50
+ return BDRV_BLOCK_EOF;
51
}
52
53
n = total_sectors - sector_num;
54
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
55
if (!bs->drv->bdrv_co_get_block_status) {
56
*pnum = nb_sectors;
57
ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
58
+ if (sector_num + nb_sectors == total_sectors) {
59
+ ret |= BDRV_BLOCK_EOF;
60
+ }
61
if (bs->drv->protocol_name) {
62
ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
63
}
64
@@ -XXX,XX +XXX,XX @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
65
66
out:
67
bdrv_dec_in_flight(bs);
68
+ if (ret >= 0 && sector_num + *pnum == total_sectors) {
69
+ ret |= BDRV_BLOCK_EOF;
70
+ }
71
return ret;
72
}
73
74
diff --git a/include/block/block.h b/include/block/block.h
75
index XXXXXXX..XXXXXXX 100644
76
--- a/include/block/block.h
77
+++ b/include/block/block.h
78
@@ -XXX,XX +XXX,XX @@ typedef struct HDGeometry {
79
* BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
80
* BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
81
* layer (short for DATA || ZERO), set by block layer
82
+ * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this layer
83
*
84
* Internal flag:
85
* BDRV_BLOCK_RAW: used internally to indicate that the request was
86
@@ -XXX,XX +XXX,XX @@ typedef struct HDGeometry {
87
#define BDRV_BLOCK_OFFSET_VALID 0x04
88
#define BDRV_BLOCK_RAW 0x08
89
#define BDRV_BLOCK_ALLOCATED 0x10
90
+#define BDRV_BLOCK_EOF 0x20
91
#define BDRV_BLOCK_OFFSET_MASK BDRV_SECTOR_MASK
92
93
typedef QSIMPLEQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
94
--
95
2.9.4
96
97
diff view generated by jsdifflib