The following changes since commit 3a821c52e1a30ecd9a436f2c67cc66b5628c829f:

  Merge tag 'nvme-next-pull-request' of git://git.infradead.org/qemu-nvme into staging (2022-06-23 14:52:30 -0700)

are available in the Git repository at:

  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 779d82e1d305f2a9cbd7f48cf6555ad58145e04a:

  vduse-blk: Add name option (2022-06-24 17:07:06 +0200)

----------------------------------------------------------------
Block layer patches

- Add vduse-blk export
- Dirty bitmaps: Fix and improve bitmap merge
- gluster: correctly set max_pdiscard
- rbd: report a better error when namespace does not exist
- aio_wait_kick: add missing memory barrier
- Code cleanups

----------------------------------------------------------------
Emanuele Giuseppe Esposito (1):
      aio_wait_kick: add missing memory barrier

Eric Blake (1):
      nbd: Drop dead code spotted by Coverity

Fabian Ebner (1):
      block/gluster: correctly set max_pdiscard

Stefan Hajnoczi (3):
      block: drop unused bdrv_co_drain() API
      block: get rid of blk->guest_block_size
      qsd: document vduse-blk exports

Stefano Garzarella (1):
      block/rbd: report a better error when namespace does not exist

Vladimir Sementsov-Ogievskiy (3):
      block: block_dirty_bitmap_merge(): fix error path
      block: improve block_dirty_bitmap_merge(): don't allocate extra bitmap
      block: simplify handling of try to merge different sized bitmaps

Xie Yongji (10):
      block: Support passing NULL ops to blk_set_dev_ops()
      block/export: Fix incorrect length passed to vu_queue_push()
      block/export: Abstract out the logic of virtio-blk I/O process
      linux-headers: Add vduse.h
      libvduse: Add VDUSE (vDPA Device in Userspace) library
      vduse-blk: Implement vduse-blk export
      vduse-blk: Add vduse-blk resize support
      libvduse: Add support for reconnecting
      vduse-blk: Add serial option
      vduse-blk: Add name option

 qapi/block-export.json                      |   29 +-
 docs/tools/qemu-storage-daemon.rst          |   22 +
 meson_options.txt                           |    4 +
 block/export/vduse-blk.h                    |   20 +
 block/export/virtio-blk-handler.h           |   37 +
 include/block/aio-wait.h                    |    2 +
 include/block/block-io.h                    |    1 -
 include/block/block_int-io.h                |    2 +-
 include/qemu/hbitmap.h                      |   15 +-
 include/sysemu/block-backend-io.h           |    1 -
 linux-headers/linux/vduse.h                 |  306 ++++++
 subprojects/libvduse/include/atomic.h       |    1 +
 subprojects/libvduse/include/compiler.h     |    1 +
 subprojects/libvduse/libvduse.h             |  247 +++
 block/backup.c                              |    6 +-
 block/block-backend.c                       |   12 +-
 block/dirty-bitmap.c                        |   26 +-
 block/export/export.c                       |    6 +
 block/export/vduse-blk.c                    |  374 ++++++++
 block/export/vhost-user-blk-server.c        |  263 +----
 block/export/virtio-blk-handler.c           |  240 +++
 block/gluster.c                             |    2 +-
 block/io.c                                  |   15 -
 block/monitor/bitmap-qmp-cmds.c             |   40 +-
 block/nbd.c                                 |    8 +-
 block/rbd.c                                 |   24 +
 hw/block/virtio-blk.c                       |    1 -
 hw/block/xen-block.c                        |    1 -
 hw/ide/core.c                               |    1 -
 hw/scsi/scsi-disk.c                         |    1 -
 hw/scsi/scsi-generic.c                      |    1 -
 storage-daemon/qemu-storage-daemon.c        |   10 +
 subprojects/libvduse/libvduse.c             | 1375 +++++++++++++++++++++++++++
 util/aio-wait.c                             |   16 +-
 util/hbitmap.c                              |   25 +-
 MAINTAINERS                                 |    9 +
 block/export/meson.build                    |    7 +-
 meson.build                                 |   34 +
 scripts/meson-buildoptions.sh               |    7 +
 scripts/update-linux-headers.sh             |    2 +-
 subprojects/libvduse/linux-headers/linux    |    1 +
 subprojects/libvduse/meson.build            |   10 +
 subprojects/libvduse/standard-headers/linux |    1 +
 43 files changed, 2852 insertions(+), 354 deletions(-)
 create mode 100644 block/export/vduse-blk.h
 create mode 100644 block/export/virtio-blk-handler.h
 create mode 100644 linux-headers/linux/vduse.h
 create mode 120000 subprojects/libvduse/include/atomic.h
 create mode 120000 subprojects/libvduse/include/compiler.h
 create mode 100644 subprojects/libvduse/libvduse.h
 create mode 100644 block/export/vduse-blk.c
 create mode 100644 block/export/virtio-blk-handler.c
 create mode 100644 subprojects/libvduse/libvduse.c
 create mode 120000 subprojects/libvduse/linux-headers/linux
 create mode 100644 subprojects/libvduse/meson.build
 create mode 120000 subprojects/libvduse/standard-headers/linux

From: Stefan Hajnoczi <stefanha@redhat.com>

bdrv_co_drain() has not been used since commit 9a0cec664eef ("mirror:
use bdrv_drained_begin/bdrv_drained_end") in 2016. Remove it so there
are fewer drain scenarios to worry about.

Use bdrv_drained_begin()/bdrv_drained_end() instead. They are "mixed"
functions that can be called from coroutine context. Unlike
bdrv_co_drain(), these functions provide control of the length of the
drained section, which is usually the right thing.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20220521122714.3837731-1-stefanha@redhat.com>
Reviewed-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Reviewed-by: Alberto Faria <afaria@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block-io.h |  1 -
 block/io.c               | 15 ---------------
 2 files changed, 16 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -XXX,XX +XXX,XX @@ void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter);
                                    cond); })
 
 void bdrv_drain(BlockDriverState *bs);
-void coroutine_fn bdrv_co_drain(BlockDriverState *bs);
 
 int generated_co_wrapper
 bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
     BDRV_POLL_WHILE(child->bs, qatomic_read(&drained_end_counter) > 0);
 }
 
-/*
- * Wait for pending requests to complete on a single BlockDriverState subtree,
- * and suspend block driver's internal I/O until next request arrives.
- *
- * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
- * AioContext.
- */
-void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
-{
-    IO_OR_GS_CODE();
-    assert(qemu_in_coroutine());
-    bdrv_drained_begin(bs);
-    bdrv_drained_end(bs);
-}
-
 void bdrv_drain(BlockDriverState *bs)
 {
     IO_OR_GS_CODE();
--
2.35.3

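A caller that used the removed helper can simply open-code the drained
section instead; a minimal sketch of the replacement pattern (the
surrounding function is hypothetical, not part of this series):

    /* Hypothetical coroutine_fn caller, shown only to illustrate what
     * replaces a former bdrv_co_drain(bs) call. */
    static void coroutine_fn example_quiesced_work(BlockDriverState *bs)
    {
        bdrv_drained_begin(bs);   /* "mixed" function, coroutine-safe */
        /* ... work that must not run concurrently with in-flight I/O ... */
        bdrv_drained_end(bs);     /* resume request processing */
    }
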
From: Stefan Hajnoczi <stefanha@redhat.com>

Commit 1b7fd729559c ("block: rename buffer_alignment to
guest_block_size") noted:

  At this point, the field is set by the device emulation, but completely
  ignored by the block layer.

The last time the value of buffer_alignment/guest_block_size was
actually used was before commit 339064d50639 ("block: Don't use guest
sector size for qemu_blockalign()").

This value has not been used since 2013. Get rid of it.

Cc: Xie Yongji <xieyongji@bytedance.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20220518130945.2657905-1-stefanha@redhat.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Alberto Faria <afaria@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/sysemu/block-backend-io.h    |  1 -
 block/block-backend.c                | 10 ----------
 block/export/vhost-user-blk-server.c |  1 -
 hw/block/virtio-blk.c                |  1 -
 hw/block/xen-block.c                 |  1 -
 hw/ide/core.c                        |  1 -
 hw/scsi/scsi-disk.c                  |  1 -
 hw/scsi/scsi-generic.c               |  1 -
 8 files changed, 17 deletions(-)

diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index XXXXXXX..XXXXXXX 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -XXX,XX +XXX,XX @@ void blk_error_action(BlockBackend *blk, BlockErrorAction action,
 void blk_iostatus_set_err(BlockBackend *blk, int error);
 int blk_get_max_iov(BlockBackend *blk);
 int blk_get_max_hw_iov(BlockBackend *blk);
-void blk_set_guest_block_size(BlockBackend *blk, int align);
 
 void blk_io_plug(BlockBackend *blk);
 void blk_io_unplug(BlockBackend *blk);
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ struct BlockBackend {
     const BlockDevOps *dev_ops;
     void *dev_opaque;
 
-    /* the block size for which the guest device expects atomicity */
-    int guest_block_size;
-
     /* If the BDS tree is removed, some of its options are stored here (which
      * can be used to restore those options in the new BDS on insert) */
     BlockBackendRootState root_state;
@@ -XXX,XX +XXX,XX @@ void blk_detach_dev(BlockBackend *blk, DeviceState *dev)
     blk->dev = NULL;
     blk->dev_ops = NULL;
     blk->dev_opaque = NULL;
-    blk->guest_block_size = 512;
     blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
     blk_unref(blk);
 }
@@ -XXX,XX +XXX,XX @@ int blk_get_max_iov(BlockBackend *blk)
     return blk->root->bs->bl.max_iov;
 }
 
-void blk_set_guest_block_size(BlockBackend *blk, int align)
-{
-    IO_CODE();
-    blk->guest_block_size = align;
-}
-
 void *blk_try_blockalign(BlockBackend *blk, size_t size)
 {
     IO_CODE();
diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
index XXXXXXX..XXXXXXX 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
         return -EINVAL;
     }
     vexp->blk_size = logical_block_size;
-    blk_set_guest_block_size(exp->blk, logical_block_size);
 
     if (vu_opts->has_num_queues) {
         num_queues = vu_opts->num_queues;
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
 
     s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
     blk_set_dev_ops(s->blk, &virtio_block_ops, s);
-    blk_set_guest_block_size(s->blk, s->conf.conf.logical_block_size);
 
     blk_iostatus_enable(s->blk);
 
diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/xen-block.c
+++ b/hw/block/xen-block.c
@@ -XXX,XX +XXX,XX @@ static void xen_block_realize(XenDevice *xendev, Error **errp)
     }
 
     blk_set_dev_ops(blk, &xen_block_dev_ops, blockdev);
-    blk_set_guest_block_size(blk, conf->logical_block_size);
 
     if (conf->discard_granularity == -1) {
         conf->discard_granularity = conf->physical_block_size;
diff --git a/hw/ide/core.c b/hw/ide/core.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -XXX,XX +XXX,XX @@ int ide_init_drive(IDEState *s, BlockBackend *blk, IDEDriveKind kind,
     s->smart_selftest_count = 0;
     if (kind == IDE_CD) {
         blk_set_dev_ops(blk, &ide_cd_block_ops, s);
-        blk_set_guest_block_size(blk, 2048);
     } else {
         if (!blk_is_inserted(s->blk)) {
             error_setg(errp, "Device needs media, but drive is empty");
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -XXX,XX +XXX,XX @@ static void scsi_realize(SCSIDevice *dev, Error **errp)
     } else {
         blk_set_dev_ops(s->qdev.conf.blk, &scsi_disk_block_ops, s);
     }
-    blk_set_guest_block_size(s->qdev.conf.blk, s->qdev.blocksize);
 
     blk_iostatus_enable(s->qdev.conf.blk);
 
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
         s->blocksize = ldl_be_p(&r->buf[8]);
         s->max_lba = ldq_be_p(&r->buf[0]);
     }
-    blk_set_guest_block_size(s->conf.blk, s->blocksize);
 
     /*
      * Patch MODE SENSE device specific parameters if the BDS is opened
--
2.35.3

From: Vladimir Sementsov-Ogievskiy <vsementsov@openvz.org>

At the end we ignore failure of bdrv_merge_dirty_bitmap() and report
success. And still set errp. That's wrong.

Signed-off-by: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
Reviewed-by: Nikita Lapshin <nikita.lapshin@virtuozzo.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-Id: <20220517111206.23585-2-v.sementsov-og@mail.ru>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/monitor/bitmap-qmp-cmds.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/block/monitor/bitmap-qmp-cmds.c b/block/monitor/bitmap-qmp-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/block/monitor/bitmap-qmp-cmds.c
+++ b/block/monitor/bitmap-qmp-cmds.c
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
     }
 
     /* Merge into dst; dst is unchanged on failure. */
-    bdrv_merge_dirty_bitmap(dst, anon, backup, errp);
+    if (!bdrv_merge_dirty_bitmap(dst, anon, backup, errp)) {
+        dst = NULL;
+        goto out;
+    }
 
 out:
     bdrv_release_dirty_bitmap(anon);
--
2.35.3

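The fix restores the usual Error API contract: a function must not set
*errp and still report success. A minimal sketch of the convention the
QMP command now follows (hypothetical caller, for illustration only):

    /* On failure the callee has filled *errp, so return a "no result"
     * value; never hand back a valid object together with an error. */
    if (!bdrv_merge_dirty_bitmap(dst, anon, backup, errp)) {
        return NULL;
    }
    return dst;
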
From: Vladimir Sementsov-Ogievskiy <vsementsov@openvz.org>

We don't need extra bitmap. All we need is to backup the original
bitmap when we do first merge. So, drop extra temporary bitmap and work
directly with target and backup.

Still to keep old semantics, that on failure target is unchanged and
user don't need to restore, we need a local_backup variable and do
restore ourselves on failure path.

Signed-off-by: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
Message-Id: <20220517111206.23585-3-v.sementsov-og@mail.ru>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/monitor/bitmap-qmp-cmds.c | 41 +++++++++++++++++----------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/block/monitor/bitmap-qmp-cmds.c b/block/monitor/bitmap-qmp-cmds.c
index XXXXXXX..XXXXXXX 100644
--- a/block/monitor/bitmap-qmp-cmds.c
+++ b/block/monitor/bitmap-qmp-cmds.c
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
                                           HBitmap **backup, Error **errp)
 {
     BlockDriverState *bs;
-    BdrvDirtyBitmap *dst, *src, *anon;
+    BdrvDirtyBitmap *dst, *src;
     BlockDirtyBitmapOrStrList *lst;
+    HBitmap *local_backup = NULL;
 
     GLOBAL_STATE_CODE();
 
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
         return NULL;
     }
 
-    anon = bdrv_create_dirty_bitmap(bs, bdrv_dirty_bitmap_granularity(dst),
-                                    NULL, errp);
-    if (!anon) {
-        return NULL;
-    }
-
     for (lst = bms; lst; lst = lst->next) {
         switch (lst->value->type) {
             const char *name, *node;
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
             src = bdrv_find_dirty_bitmap(bs, name);
             if (!src) {
                 error_setg(errp, "Dirty bitmap '%s' not found", name);
-                dst = NULL;
-                goto out;
+                goto fail;
             }
             break;
         case QTYPE_QDICT:
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
             name = lst->value->u.external.name;
             src = block_dirty_bitmap_lookup(node, name, NULL, errp);
             if (!src) {
-                dst = NULL;
-                goto out;
+                goto fail;
             }
             break;
         default:
             abort();
         }
 
-        if (!bdrv_merge_dirty_bitmap(anon, src, NULL, errp)) {
-            dst = NULL;
-            goto out;
+        /* We do backup only for first merge operation */
+        if (!bdrv_merge_dirty_bitmap(dst, src,
+                                     local_backup ? NULL : &local_backup,
+                                     errp))
+        {
+            goto fail;
         }
     }
 
-    /* Merge into dst; dst is unchanged on failure. */
-    if (!bdrv_merge_dirty_bitmap(dst, anon, backup, errp)) {
-        dst = NULL;
-        goto out;
+    if (backup) {
+        *backup = local_backup;
+    } else {
+        hbitmap_free(local_backup);
     }
 
- out:
-    bdrv_release_dirty_bitmap(anon);
     return dst;
+
+fail:
+    if (local_backup) {
+        bdrv_restore_dirty_bitmap(dst, local_backup);
+    }
+
+    return NULL;
 }
 
 void qmp_block_dirty_bitmap_merge(const char *node, const char *target,
--
2.35.3

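With this change, @backup (when requested) receives a copy of the target
taken before the first merge, and on failure the target is rolled back
internally before returning NULL. A rough sketch of how a caller can
rely on that contract (hypothetical code, not taken from the series):

    HBitmap *backup = NULL;
    Error *local_err = NULL;
    BdrvDirtyBitmap *dst;

    dst = block_dirty_bitmap_merge(node, target, bms, &backup, &local_err);
    if (!dst) {
        /* target was already restored by the failure path; just report */
        error_report_err(local_err);
    } else if (backup) {
        /* keep @backup around if a later undo is needed, else free it */
        hbitmap_free(backup);
    }
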
From: Vladimir Sementsov-Ogievskiy <vsementsov@openvz.org>

We have too much logic to simply check that bitmaps are of the same
size. Let's just define that hbitmap_merge() and
bdrv_dirty_bitmap_merge_internal() require their argument bitmaps be of
same size, this simplifies things.

Let's look through the callers:

For backup_init_bcs_bitmap() we already assert that merge can't fail.

In bdrv_reclaim_dirty_bitmap_locked() we gracefully handle the error
that can't happen: successor always has same size as its parent, drop
this logic.

In bdrv_merge_dirty_bitmap() we already has assertion and separate
check. Make the check explicit and improve error message.

Signed-off-by: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
Reviewed-by: Nikita Lapshin <nikita.lapshin@virtuozzo.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-Id: <20220517111206.23585-4-v.sementsov-og@mail.ru>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/block/block_int-io.h |  2 +-
 include/qemu/hbitmap.h       | 15 ++-------------
 block/backup.c               |  6 ++----
 block/dirty-bitmap.c         | 26 +++++++++++---------------
 util/hbitmap.c               | 25 +++++++------------------
 5 files changed, 23 insertions(+), 51 deletions(-)

diff --git a/include/block/block_int-io.h b/include/block/block_int-io.h
33
index XXXXXXX..XXXXXXX 100644
34
--- a/include/block/block_int-io.h
35
+++ b/include/block/block_int-io.h
36
@@ -XXX,XX +XXX,XX @@ bool blk_dev_is_tray_open(BlockBackend *blk);
37
void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
38
39
void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
40
-bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
41
+void bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
42
const BdrvDirtyBitmap *src,
43
HBitmap **backup, bool lock);
44
45
diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
46
index XXXXXXX..XXXXXXX 100644
47
--- a/include/qemu/hbitmap.h
48
+++ b/include/qemu/hbitmap.h
49
@@ -XXX,XX +XXX,XX @@ void hbitmap_truncate(HBitmap *hb, uint64_t size);
50
*
51
* Store result of merging @a and @b into @result.
52
* @result is allowed to be equal to @a or @b.
53
- *
54
- * Return true if the merge was successful,
55
- * false if it was not attempted.
56
- */
57
-bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result);
58
-
59
-/**
60
- * hbitmap_can_merge:
61
- *
62
- * hbitmap_can_merge(a, b) && hbitmap_can_merge(a, result) is sufficient and
63
- * necessary for hbitmap_merge will not fail.
64
- *
65
+ * All bitmaps must have same size.
66
*/
67
-bool hbitmap_can_merge(const HBitmap *a, const HBitmap *b);
68
+void hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result);
69
70
/**
71
* hbitmap_empty:
72
diff --git a/block/backup.c b/block/backup.c
73
index XXXXXXX..XXXXXXX 100644
74
--- a/block/backup.c
75
+++ b/block/backup.c
76
@@ -XXX,XX +XXX,XX @@ out:
77
78
static void backup_init_bcs_bitmap(BackupBlockJob *job)
79
{
80
- bool ret;
81
uint64_t estimate;
82
BdrvDirtyBitmap *bcs_bitmap = block_copy_dirty_bitmap(job->bcs);
83
84
if (job->sync_mode == MIRROR_SYNC_MODE_BITMAP) {
85
bdrv_clear_dirty_bitmap(bcs_bitmap, NULL);
86
- ret = bdrv_dirty_bitmap_merge_internal(bcs_bitmap, job->sync_bitmap,
87
- NULL, true);
88
- assert(ret);
89
+ bdrv_dirty_bitmap_merge_internal(bcs_bitmap, job->sync_bitmap, NULL,
90
+ true);
91
} else if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
92
/*
93
* We can't hog the coroutine to initialize this thoroughly.
94
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
95
index XXXXXXX..XXXXXXX 100644
96
--- a/block/dirty-bitmap.c
97
+++ b/block/dirty-bitmap.c
98
@@ -XXX,XX +XXX,XX @@ BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap_locked(BdrvDirtyBitmap *parent,
99
return NULL;
100
}
101
102
- if (!hbitmap_merge(parent->bitmap, successor->bitmap, parent->bitmap)) {
103
- error_setg(errp, "Merging of parent and successor bitmap failed");
104
- return NULL;
105
- }
106
+ hbitmap_merge(parent->bitmap, successor->bitmap, parent->bitmap);
107
108
parent->disabled = successor->disabled;
109
parent->busy = false;
110
@@ -XXX,XX +XXX,XX @@ bool bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src,
111
goto out;
112
}
113
114
- if (!hbitmap_can_merge(dest->bitmap, src->bitmap)) {
115
- error_setg(errp, "Bitmaps are incompatible and can't be merged");
116
+ if (bdrv_dirty_bitmap_size(src) != bdrv_dirty_bitmap_size(dest)) {
117
+ error_setg(errp, "Bitmaps are of different sizes (destination size is %"
118
+ PRId64 ", source size is %" PRId64 ") and can't be merged",
119
+ bdrv_dirty_bitmap_size(dest), bdrv_dirty_bitmap_size(src));
120
goto out;
121
}
122
123
- ret = bdrv_dirty_bitmap_merge_internal(dest, src, backup, false);
124
- assert(ret);
125
+ bdrv_dirty_bitmap_merge_internal(dest, src, backup, false);
126
+ ret = true;
127
128
out:
129
bdrv_dirty_bitmaps_unlock(dest->bs);
130
@@ -XXX,XX +XXX,XX @@ out:
131
/**
132
* bdrv_dirty_bitmap_merge_internal: merge src into dest.
133
* Does NOT check bitmap permissions; not suitable for use as public API.
134
+ * @dest, @src and @backup (if not NULL) must have same size.
135
*
136
* @backup: If provided, make a copy of dest here prior to merge.
137
* @lock: If true, lock and unlock bitmaps on the way in/out.
138
- * returns true if the merge succeeded; false if unattempted.
139
*/
140
-bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
141
+void bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
142
const BdrvDirtyBitmap *src,
143
HBitmap **backup,
144
bool lock)
145
{
146
- bool ret;
147
IO_CODE();
148
149
assert(!bdrv_dirty_bitmap_readonly(dest));
150
@@ -XXX,XX +XXX,XX @@ bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
151
if (backup) {
152
*backup = dest->bitmap;
153
dest->bitmap = hbitmap_alloc(dest->size, hbitmap_granularity(*backup));
154
- ret = hbitmap_merge(*backup, src->bitmap, dest->bitmap);
155
+ hbitmap_merge(*backup, src->bitmap, dest->bitmap);
156
} else {
157
- ret = hbitmap_merge(dest->bitmap, src->bitmap, dest->bitmap);
158
+ hbitmap_merge(dest->bitmap, src->bitmap, dest->bitmap);
159
}
160
161
if (lock) {
162
@@ -XXX,XX +XXX,XX @@ bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
163
bdrv_dirty_bitmaps_unlock(src->bs);
164
}
165
}
166
-
167
- return ret;
168
}
169
diff --git a/util/hbitmap.c b/util/hbitmap.c
170
index XXXXXXX..XXXXXXX 100644
171
--- a/util/hbitmap.c
172
+++ b/util/hbitmap.c
173
@@ -XXX,XX +XXX,XX @@ void hbitmap_truncate(HBitmap *hb, uint64_t size)
174
}
175
}
176
177
-bool hbitmap_can_merge(const HBitmap *a, const HBitmap *b)
178
-{
179
- return (a->orig_size == b->orig_size);
180
-}
181
-
182
/**
183
* hbitmap_sparse_merge: performs dst = dst | src
184
* works with differing granularities.
185
@@ -XXX,XX +XXX,XX @@ static void hbitmap_sparse_merge(HBitmap *dst, const HBitmap *src)
186
* Given HBitmaps A and B, let R := A (BITOR) B.
187
* Bitmaps A and B will not be modified,
188
* except when bitmap R is an alias of A or B.
189
- *
190
- * @return true if the merge was successful,
191
- * false if it was not attempted.
192
+ * Bitmaps must have same size.
193
*/
194
-bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result)
195
+void hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result)
196
{
197
int i;
198
uint64_t j;
199
200
- if (!hbitmap_can_merge(a, b) || !hbitmap_can_merge(a, result)) {
201
- return false;
202
- }
203
- assert(hbitmap_can_merge(b, result));
204
+ assert(a->orig_size == result->orig_size);
205
+ assert(b->orig_size == result->orig_size);
206
207
if ((!hbitmap_count(a) && result == b) ||
208
(!hbitmap_count(b) && result == a)) {
209
- return true;
210
+ return;
211
}
212
213
if (!hbitmap_count(a) && !hbitmap_count(b)) {
214
hbitmap_reset_all(result);
215
- return true;
216
+ return;
217
}
218
219
if (a->granularity != b->granularity) {
220
@@ -XXX,XX +XXX,XX @@ bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result)
221
if (b != result) {
222
hbitmap_sparse_merge(result, b);
223
}
224
- return true;
225
+ return;
226
}
227
228
/* This merge is O(size), as BITS_PER_LONG and HBITMAP_LEVELS are constant.
229
@@ -XXX,XX +XXX,XX @@ bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result)
230
231
/* Recompute the dirty count */
232
result->count = hb_count_between(result, 0, result->size - 1);
233
-
234
- return true;
235
}
236
237
char *hbitmap_sha256(const HBitmap *bitmap, Error **errp)
238
--
239
2.35.3
diff view generated by jsdifflib
From: Xie Yongji <xieyongji@bytedance.com>

This supports passing NULL ops to blk_set_dev_ops()
so that we can remove stale ops in some cases.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20220523084611.91-2-xieyongji@bytedance.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/block-backend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
     blk->dev_opaque = opaque;
 
     /* Are we currently quiesced? Should we enforce this right now? */
-    if (blk->quiesce_counter && ops->drained_begin) {
+    if (blk->quiesce_counter && ops && ops->drained_begin) {
        ops->drained_begin(opaque);
     }
 }
--
2.35.3

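With this change a user of a BlockBackend can detach its callbacks again
instead of leaving stale ones behind; a small sketch of the now-permitted
call pattern (my_dev_ops and my_dev_state are hypothetical names):

    /* Install device callbacks while the device owns the BlockBackend. */
    blk_set_dev_ops(blk, &my_dev_ops, my_dev_state);

    /* ... later, on teardown, clear the stale ops again, which passing
     * NULL is now allowed to do. */
    blk_set_dev_ops(blk, NULL, NULL);
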
From: Xie Yongji <xieyongji@bytedance.com>

Now the req->size is set to the correct value only
when handling VIRTIO_BLK_T_GET_ID request. This patch
fixes it.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Message-Id: <20220523084611.91-3-xieyongji@bytedance.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/export/vhost-user-blk-server.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
index XXXXXXX..XXXXXXX 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -XXX,XX +XXX,XX @@ static void vu_blk_req_complete(VuBlkReq *req)
 {
     VuDev *vu_dev = &req->server->vu_dev;
 
-    /* IO size with 1 extra status byte */
-    vu_queue_push(vu_dev, req->vq, &req->elem, req->size + 1);
+    vu_queue_push(vu_dev, req->vq, &req->elem, req->size);
     vu_queue_notify(vu_dev, req->vq);
 
     free(req);
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
         goto err;
     }
 
+    req->size = iov_size(in_iov, in_num);
     /* We always touch the last byte, so just see how big in_iov is. */
     req->in = (void *)in_iov[in_num - 1].iov_base
               + in_iov[in_num - 1].iov_len
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
                           VIRTIO_BLK_ID_BYTES);
         snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
         req->in->status = VIRTIO_BLK_S_OK;
-        req->size = elem->in_sg[0].iov_len;
         break;
     }
     case VIRTIO_BLK_T_DISCARD:
--
2.35.3

From: Xie Yongji <xieyongji@bytedance.com>

Abstract the common logic of virtio-blk I/O process to a function
named virtio_blk_process_req(). It's needed for the following commit.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Message-Id: <20220523084611.91-4-xieyongji@bytedance.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/export/virtio-blk-handler.h    |  37 ++++
 block/export/vhost-user-blk-server.c | 259 +++------------------------
 block/export/virtio-blk-handler.c    | 240 +++++++++++++++++++++++++
 MAINTAINERS                          |   2 +
 block/export/meson.build             |   2 +-
 5 files changed, 301 insertions(+), 239 deletions(-)
 create mode 100644 block/export/virtio-blk-handler.h
 create mode 100644 block/export/virtio-blk-handler.c
diff --git a/tests/test-image-locking.c b/tests/test-image-locking.c
20
diff --git a/block/export/virtio-blk-handler.h b/block/export/virtio-blk-handler.h
12
new file mode 100644
21
new file mode 100644
13
index XXXXXXX..XXXXXXX
22
index XXXXXXX..XXXXXXX
14
--- /dev/null
23
--- /dev/null
15
+++ b/tests/test-image-locking.c
24
+++ b/block/export/virtio-blk-handler.h
16
@@ -XXX,XX +XXX,XX @@
25
@@ -XXX,XX +XXX,XX @@
17
+/*
26
+/*
18
+ * Image locking tests
27
+ * Handler for virtio-blk I/O
19
+ *
28
+ *
20
+ * Copyright (c) 2018 Red Hat Inc.
29
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
21
+ *
30
+ *
22
+ * Author: Fam Zheng <famz@redhat.com>
31
+ * Author:
32
+ * Xie Yongji <xieyongji@bytedance.com>
23
+ *
33
+ *
24
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
34
+ * This work is licensed under the terms of the GNU GPL, version 2 or
25
+ * of this software and associated documentation files (the "Software"), to deal
35
+ * later. See the COPYING file in the top-level directory.
26
+ * in the Software without restriction, including without limitation the rights
36
+ */
27
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
37
+
28
+ * copies of the Software, and to permit persons to whom the Software is
38
+#ifndef VIRTIO_BLK_HANDLER_H
29
+ * furnished to do so, subject to the following conditions:
39
+#define VIRTIO_BLK_HANDLER_H
40
+
41
+#include "sysemu/block-backend.h"
42
+
43
+#define VIRTIO_BLK_SECTOR_BITS 9
44
+#define VIRTIO_BLK_SECTOR_SIZE (1ULL << VIRTIO_BLK_SECTOR_BITS)
45
+
46
+#define VIRTIO_BLK_MAX_DISCARD_SECTORS 32768
47
+#define VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS 32768
48
+
49
+typedef struct {
50
+ BlockBackend *blk;
51
+ const char *serial;
52
+ uint32_t logical_block_size;
53
+ bool writable;
54
+} VirtioBlkHandler;
55
+
56
+int coroutine_fn virtio_blk_process_req(VirtioBlkHandler *handler,
57
+ struct iovec *in_iov,
58
+ struct iovec *out_iov,
59
+ unsigned int in_num,
60
+ unsigned int out_num);
61
+
62
+#endif /* VIRTIO_BLK_HANDLER_H */
63
diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
64
index XXXXXXX..XXXXXXX 100644
65
--- a/block/export/vhost-user-blk-server.c
66
+++ b/block/export/vhost-user-blk-server.c
67
@@ -XXX,XX +XXX,XX @@
68
#include "vhost-user-blk-server.h"
69
#include "qapi/error.h"
70
#include "qom/object_interfaces.h"
71
-#include "sysemu/block-backend.h"
72
#include "util/block-helpers.h"
73
-
74
-/*
75
- * Sector units are 512 bytes regardless of the
76
- * virtio_blk_config->blk_size value.
77
- */
78
-#define VIRTIO_BLK_SECTOR_BITS 9
79
-#define VIRTIO_BLK_SECTOR_SIZE (1ull << VIRTIO_BLK_SECTOR_BITS)
80
+#include "virtio-blk-handler.h"
81
82
enum {
83
VHOST_USER_BLK_NUM_QUEUES_DEFAULT = 1,
84
- VHOST_USER_BLK_MAX_DISCARD_SECTORS = 32768,
85
- VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS = 32768,
86
-};
87
-struct virtio_blk_inhdr {
88
- unsigned char status;
89
};
90
91
typedef struct VuBlkReq {
92
VuVirtqElement elem;
93
- int64_t sector_num;
94
- size_t size;
95
- struct virtio_blk_inhdr *in;
96
- struct virtio_blk_outhdr out;
97
VuServer *server;
98
struct VuVirtq *vq;
99
} VuBlkReq;
100
@@ -XXX,XX +XXX,XX @@ typedef struct VuBlkReq {
101
typedef struct {
102
BlockExport export;
103
VuServer vu_server;
104
- uint32_t blk_size;
105
+ VirtioBlkHandler handler;
106
QIOChannelSocket *sioc;
107
struct virtio_blk_config blkcfg;
108
- bool writable;
109
} VuBlkExport;
110
111
-static void vu_blk_req_complete(VuBlkReq *req)
112
+static void vu_blk_req_complete(VuBlkReq *req, size_t in_len)
113
{
114
VuDev *vu_dev = &req->server->vu_dev;
115
116
- vu_queue_push(vu_dev, req->vq, &req->elem, req->size);
117
+ vu_queue_push(vu_dev, req->vq, &req->elem, in_len);
118
vu_queue_notify(vu_dev, req->vq);
119
120
free(req);
121
}
122
123
-static bool vu_blk_sect_range_ok(VuBlkExport *vexp, uint64_t sector,
124
- size_t size)
125
-{
126
- uint64_t nb_sectors;
127
- uint64_t total_sectors;
128
-
129
- if (size % VIRTIO_BLK_SECTOR_SIZE) {
130
- return false;
131
- }
132
-
133
- nb_sectors = size >> VIRTIO_BLK_SECTOR_BITS;
134
-
135
- QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != VIRTIO_BLK_SECTOR_SIZE);
136
- if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
137
- return false;
138
- }
139
- if ((sector << VIRTIO_BLK_SECTOR_BITS) % vexp->blk_size) {
140
- return false;
141
- }
142
- blk_get_geometry(vexp->export.blk, &total_sectors);
143
- if (sector > total_sectors || nb_sectors > total_sectors - sector) {
144
- return false;
145
- }
146
- return true;
147
-}
148
-
149
-static int coroutine_fn
150
-vu_blk_discard_write_zeroes(VuBlkExport *vexp, struct iovec *iov,
151
- uint32_t iovcnt, uint32_t type)
152
-{
153
- BlockBackend *blk = vexp->export.blk;
154
- struct virtio_blk_discard_write_zeroes desc;
155
- ssize_t size;
156
- uint64_t sector;
157
- uint32_t num_sectors;
158
- uint32_t max_sectors;
159
- uint32_t flags;
160
- int bytes;
161
-
162
- /* Only one desc is currently supported */
163
- if (unlikely(iov_size(iov, iovcnt) > sizeof(desc))) {
164
- return VIRTIO_BLK_S_UNSUPP;
165
- }
166
-
167
- size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
168
- if (unlikely(size != sizeof(desc))) {
169
- error_report("Invalid size %zd, expected %zu", size, sizeof(desc));
170
- return VIRTIO_BLK_S_IOERR;
171
- }
172
-
173
- sector = le64_to_cpu(desc.sector);
174
- num_sectors = le32_to_cpu(desc.num_sectors);
175
- flags = le32_to_cpu(desc.flags);
176
- max_sectors = (type == VIRTIO_BLK_T_WRITE_ZEROES) ?
177
- VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS :
178
- VHOST_USER_BLK_MAX_DISCARD_SECTORS;
179
-
180
- /* This check ensures that 'bytes' fits in an int */
181
- if (unlikely(num_sectors > max_sectors)) {
182
- return VIRTIO_BLK_S_IOERR;
183
- }
184
-
185
- bytes = num_sectors << VIRTIO_BLK_SECTOR_BITS;
186
-
187
- if (unlikely(!vu_blk_sect_range_ok(vexp, sector, bytes))) {
188
- return VIRTIO_BLK_S_IOERR;
189
- }
190
-
191
- /*
192
- * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard
193
- * and write zeroes commands if any unknown flag is set.
194
- */
195
- if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
196
- return VIRTIO_BLK_S_UNSUPP;
197
- }
198
-
199
- if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
200
- int blk_flags = 0;
201
-
202
- if (flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
203
- blk_flags |= BDRV_REQ_MAY_UNMAP;
204
- }
205
-
206
- if (blk_co_pwrite_zeroes(blk, sector << VIRTIO_BLK_SECTOR_BITS,
207
- bytes, blk_flags) == 0) {
208
- return VIRTIO_BLK_S_OK;
209
- }
210
- } else if (type == VIRTIO_BLK_T_DISCARD) {
211
- /*
212
- * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for
213
- * discard commands if the unmap flag is set.
214
- */
215
- if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
216
- return VIRTIO_BLK_S_UNSUPP;
217
- }
218
-
219
- if (blk_co_pdiscard(blk, sector << VIRTIO_BLK_SECTOR_BITS,
220
- bytes) == 0) {
221
- return VIRTIO_BLK_S_OK;
222
- }
223
- }
224
-
225
- return VIRTIO_BLK_S_IOERR;
226
-}
227
-
228
/* Called with server refcount increased, must decrease before returning */
229
static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
230
{
231
VuBlkReq *req = opaque;
232
VuServer *server = req->server;
233
VuVirtqElement *elem = &req->elem;
234
- uint32_t type;
235
-
236
VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
237
- BlockBackend *blk = vexp->export.blk;
238
-
239
+ VirtioBlkHandler *handler = &vexp->handler;
240
struct iovec *in_iov = elem->in_sg;
241
struct iovec *out_iov = elem->out_sg;
242
unsigned in_num = elem->in_num;
243
unsigned out_num = elem->out_num;
244
-
245
- /* refer to hw/block/virtio_blk.c */
246
- if (elem->out_num < 1 || elem->in_num < 1) {
247
- error_report("virtio-blk request missing headers");
248
- goto err;
249
- }
250
-
251
- if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
252
- sizeof(req->out)) != sizeof(req->out))) {
253
- error_report("virtio-blk request outhdr too short");
254
- goto err;
255
- }
256
-
257
- iov_discard_front(&out_iov, &out_num, sizeof(req->out));
258
-
259
- if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
260
- error_report("virtio-blk request inhdr too short");
261
- goto err;
262
- }
263
-
264
- req->size = iov_size(in_iov, in_num);
265
- /* We always touch the last byte, so just see how big in_iov is. */
266
- req->in = (void *)in_iov[in_num - 1].iov_base
267
- + in_iov[in_num - 1].iov_len
268
- - sizeof(struct virtio_blk_inhdr);
269
- iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
270
-
271
- type = le32_to_cpu(req->out.type);
272
- switch (type & ~VIRTIO_BLK_T_BARRIER) {
273
- case VIRTIO_BLK_T_IN:
274
- case VIRTIO_BLK_T_OUT: {
275
- QEMUIOVector qiov;
276
- int64_t offset;
277
- ssize_t ret = 0;
278
- bool is_write = type & VIRTIO_BLK_T_OUT;
279
- req->sector_num = le64_to_cpu(req->out.sector);
280
-
281
- if (is_write && !vexp->writable) {
282
- req->in->status = VIRTIO_BLK_S_IOERR;
283
- break;
284
- }
285
-
286
- if (is_write) {
287
- qemu_iovec_init_external(&qiov, out_iov, out_num);
288
- } else {
289
- qemu_iovec_init_external(&qiov, in_iov, in_num);
290
- }
291
-
292
- if (unlikely(!vu_blk_sect_range_ok(vexp,
293
- req->sector_num,
294
- qiov.size))) {
295
- req->in->status = VIRTIO_BLK_S_IOERR;
296
- break;
297
- }
298
-
299
- offset = req->sector_num << VIRTIO_BLK_SECTOR_BITS;
300
-
301
- if (is_write) {
302
- ret = blk_co_pwritev(blk, offset, qiov.size, &qiov, 0);
303
- } else {
304
- ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0);
305
- }
306
- if (ret >= 0) {
307
- req->in->status = VIRTIO_BLK_S_OK;
308
- } else {
309
- req->in->status = VIRTIO_BLK_S_IOERR;
310
- }
311
- break;
312
- }
313
- case VIRTIO_BLK_T_FLUSH:
314
- if (blk_co_flush(blk) == 0) {
315
- req->in->status = VIRTIO_BLK_S_OK;
316
- } else {
317
- req->in->status = VIRTIO_BLK_S_IOERR;
318
- }
319
- break;
320
- case VIRTIO_BLK_T_GET_ID: {
321
- size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
322
- VIRTIO_BLK_ID_BYTES);
323
- snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
324
- req->in->status = VIRTIO_BLK_S_OK;
325
- break;
326
+ int in_len;
327
+
328
+ in_len = virtio_blk_process_req(handler, in_iov, out_iov,
329
+ in_num, out_num);
330
+ if (in_len < 0) {
331
+ free(req);
332
+ vhost_user_server_unref(server);
333
+ return;
334
}
335
- case VIRTIO_BLK_T_DISCARD:
336
- case VIRTIO_BLK_T_WRITE_ZEROES: {
337
- if (!vexp->writable) {
338
- req->in->status = VIRTIO_BLK_S_IOERR;
339
- break;
340
- }
341
-
342
- req->in->status = vu_blk_discard_write_zeroes(vexp, out_iov, out_num,
343
- type);
344
- break;
345
- }
346
- default:
347
- req->in->status = VIRTIO_BLK_S_UNSUPP;
348
- break;
349
- }
350
-
351
- vu_blk_req_complete(req);
352
- vhost_user_server_unref(server);
353
- return;
354
355
-err:
356
- free(req);
357
+ vu_blk_req_complete(req, in_len);
358
vhost_user_server_unref(server);
359
}
360
361
@@ -XXX,XX +XXX,XX @@ static uint64_t vu_blk_get_features(VuDev *dev)
362
1ull << VIRTIO_RING_F_EVENT_IDX |
363
1ull << VHOST_USER_F_PROTOCOL_FEATURES;
364
365
- if (!vexp->writable) {
366
+ if (!vexp->handler.writable) {
367
features |= 1ull << VIRTIO_BLK_F_RO;
368
}
369
370
@@ -XXX,XX +XXX,XX @@ vu_blk_initialize_config(BlockDriverState *bs,
371
config->opt_io_size = cpu_to_le32(1);
372
config->num_queues = cpu_to_le16(num_queues);
373
config->max_discard_sectors =
374
- cpu_to_le32(VHOST_USER_BLK_MAX_DISCARD_SECTORS);
375
+ cpu_to_le32(VIRTIO_BLK_MAX_DISCARD_SECTORS);
376
config->max_discard_seg = cpu_to_le32(1);
377
config->discard_sector_alignment =
378
cpu_to_le32(blk_size >> VIRTIO_BLK_SECTOR_BITS);
379
config->max_write_zeroes_sectors
380
- = cpu_to_le32(VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS);
381
+ = cpu_to_le32(VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS);
382
config->max_write_zeroes_seg = cpu_to_le32(1);
383
}
384
385
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
386
uint64_t logical_block_size;
387
uint16_t num_queues = VHOST_USER_BLK_NUM_QUEUES_DEFAULT;
388
389
- vexp->writable = opts->writable;
390
vexp->blkcfg.wce = 0;
391
392
if (vu_opts->has_logical_block_size) {
393
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
394
error_propagate(errp, local_err);
395
return -EINVAL;
396
}
397
- vexp->blk_size = logical_block_size;
398
399
if (vu_opts->has_num_queues) {
400
num_queues = vu_opts->num_queues;
401
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
402
error_setg(errp, "num-queues must be greater than 0");
403
return -EINVAL;
404
}
405
+ vexp->handler.blk = exp->blk;
406
+ vexp->handler.serial = "vhost_user_blk";
407
+ vexp->handler.logical_block_size = logical_block_size;
408
+ vexp->handler.writable = opts->writable;
409
410
vu_blk_initialize_config(blk_bs(exp->blk), &vexp->blkcfg,
411
logical_block_size, num_queues);
412
diff --git a/block/export/virtio-blk-handler.c b/block/export/virtio-blk-handler.c
413
new file mode 100644
414
index XXXXXXX..XXXXXXX
415
--- /dev/null
416
+++ b/block/export/virtio-blk-handler.c
417
@@ -XXX,XX +XXX,XX @@
418
+/*
419
+ * Handler for virtio-blk I/O
30
+ *
420
+ *
31
+ * The above copyright notice and this permission notice shall be included in
421
+ * Copyright (c) 2020 Red Hat, Inc.
32
+ * all copies or substantial portions of the Software.
422
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
33
+ *
423
+ *
34
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
424
+ * Author:
35
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
425
+ * Coiby Xu <coiby.xu@gmail.com>
36
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
426
+ * Xie Yongji <xieyongji@bytedance.com>
37
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
427
+ *
38
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
428
+ * This work is licensed under the terms of the GNU GPL, version 2 or
39
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
429
+ * later. See the COPYING file in the top-level directory.
40
+ * THE SOFTWARE.
41
+ */
430
+ */
42
+
431
+
43
+#include "qemu/osdep.h"
432
+#include "qemu/osdep.h"
44
+#include "block/block.h"
433
+#include "qemu/error-report.h"
45
+#include "sysemu/block-backend.h"
434
+#include "virtio-blk-handler.h"
46
+#include "qapi/error.h"
435
+
47
+#include "qapi/qmp/qdict.h"
436
+#include "standard-headers/linux/virtio_blk.h"
48
+
437
+
49
+static BlockBackend *open_image(const char *path,
438
+struct virtio_blk_inhdr {
50
+ uint64_t perm, uint64_t shared_perm,
439
+ unsigned char status;
51
+ Error **errp)
440
+};
441
+
442
+static bool virtio_blk_sect_range_ok(BlockBackend *blk, uint32_t block_size,
443
+ uint64_t sector, size_t size)
52
+{
444
+{
53
+ Error *local_err = NULL;
445
+ uint64_t nb_sectors;
54
+ BlockBackend *blk;
446
+ uint64_t total_sectors;
55
+ QDict *options = qdict_new();
447
+
56
+
448
+ if (size % VIRTIO_BLK_SECTOR_SIZE) {
57
+ qdict_put_str(options, "driver", "raw");
449
+ return false;
58
+ blk = blk_new_open(path, NULL, options, BDRV_O_RDWR, &local_err);
450
+ }
59
+ if (blk) {
451
+
60
+ g_assert_null(local_err);
452
+ nb_sectors = size >> VIRTIO_BLK_SECTOR_BITS;
61
+ if (blk_set_perm(blk, perm, shared_perm, errp)) {
453
+
62
+ blk_unref(blk);
454
+ QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != VIRTIO_BLK_SECTOR_SIZE);
63
+ blk = NULL;
455
+ if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
64
+ }
456
+ return false;
65
+ } else {
457
+ }
66
+ error_propagate(errp, local_err);
458
+ if ((sector << VIRTIO_BLK_SECTOR_BITS) % block_size) {
67
+ }
459
+ return false;
68
+ return blk;
460
+ }
461
+ blk_get_geometry(blk, &total_sectors);
462
+ if (sector > total_sectors || nb_sectors > total_sectors - sector) {
463
+ return false;
464
+ }
465
+ return true;
69
+}
466
+}
70
+
467
+
71
+static void check_locked_bytes(int fd, uint64_t perm_locks,
468
+static int coroutine_fn
72
+ uint64_t shared_perm_locks)
469
+virtio_blk_discard_write_zeroes(VirtioBlkHandler *handler, struct iovec *iov,
470
+ uint32_t iovcnt, uint32_t type)
73
+{
471
+{
74
+ int i;
472
+ BlockBackend *blk = handler->blk;
75
+
473
+ struct virtio_blk_discard_write_zeroes desc;
76
+ if (!perm_locks && !shared_perm_locks) {
474
+ ssize_t size;
77
+ g_assert(!qemu_lock_fd_test(fd, 0, 0, true));
475
+ uint64_t sector;
78
+ return;
476
+ uint32_t num_sectors;
79
+ }
477
+ uint32_t max_sectors;
80
+ for (i = 0; (1ULL << i) <= BLK_PERM_ALL; i++) {
478
+ uint32_t flags;
81
+ uint64_t bit = (1ULL << i);
479
+ int bytes;
82
+ bool perm_expected = !!(bit & perm_locks);
480
+
83
+ bool shared_perm_expected = !!(bit & shared_perm_locks);
481
+ /* Only one desc is currently supported */
84
+ g_assert_cmpint(perm_expected, ==,
482
+ if (unlikely(iov_size(iov, iovcnt) > sizeof(desc))) {
85
+ !!qemu_lock_fd_test(fd, 100 + i, 1, true));
483
+ return VIRTIO_BLK_S_UNSUPP;
86
+ g_assert_cmpint(shared_perm_expected, ==,
484
+ }
87
+ !!qemu_lock_fd_test(fd, 200 + i, 1, true));
485
+
88
+ }
486
+ size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
487
+ if (unlikely(size != sizeof(desc))) {
488
+ error_report("Invalid size %zd, expected %zu", size, sizeof(desc));
489
+ return VIRTIO_BLK_S_IOERR;
490
+ }
491
+
492
+ sector = le64_to_cpu(desc.sector);
493
+ num_sectors = le32_to_cpu(desc.num_sectors);
494
+ flags = le32_to_cpu(desc.flags);
495
+ max_sectors = (type == VIRTIO_BLK_T_WRITE_ZEROES) ?
496
+ VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS :
497
+ VIRTIO_BLK_MAX_DISCARD_SECTORS;
498
+
499
+ /* This check ensures that 'bytes' fits in an int */
500
+ if (unlikely(num_sectors > max_sectors)) {
501
+ return VIRTIO_BLK_S_IOERR;
502
+ }
503
+
504
+ bytes = num_sectors << VIRTIO_BLK_SECTOR_BITS;
505
+
506
+ if (unlikely(!virtio_blk_sect_range_ok(blk, handler->logical_block_size,
507
+ sector, bytes))) {
508
+ return VIRTIO_BLK_S_IOERR;
509
+ }
510
+
511
+ /*
512
+ * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard
513
+ * and write zeroes commands if any unknown flag is set.
514
+ */
515
+ if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
516
+ return VIRTIO_BLK_S_UNSUPP;
517
+ }
518
+
519
+ if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
520
+ int blk_flags = 0;
521
+
522
+ if (flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
523
+ blk_flags |= BDRV_REQ_MAY_UNMAP;
524
+ }
525
+
526
+ if (blk_co_pwrite_zeroes(blk, sector << VIRTIO_BLK_SECTOR_BITS,
527
+ bytes, blk_flags) == 0) {
528
+ return VIRTIO_BLK_S_OK;
529
+ }
530
+ } else if (type == VIRTIO_BLK_T_DISCARD) {
531
+ /*
532
+ * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for
533
+ * discard commands if the unmap flag is set.
534
+ */
535
+ if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
536
+ return VIRTIO_BLK_S_UNSUPP;
537
+ }
538
+
539
+ if (blk_co_pdiscard(blk, sector << VIRTIO_BLK_SECTOR_BITS,
540
+ bytes) == 0) {
541
+ return VIRTIO_BLK_S_OK;
542
+ }
543
+ }
544
+
545
+ return VIRTIO_BLK_S_IOERR;
89
+}
546
+}
90
+
547
+
91
+static void test_image_locking_basic(void)
548
+int coroutine_fn virtio_blk_process_req(VirtioBlkHandler *handler,
549
+ struct iovec *in_iov,
550
+ struct iovec *out_iov,
551
+ unsigned int in_num,
552
+ unsigned int out_num)
92
+{
553
+{
93
+ BlockBackend *blk1, *blk2, *blk3;
554
+ BlockBackend *blk = handler->blk;
94
+ char img_path[] = "/tmp/qtest.XXXXXX";
555
+ struct virtio_blk_inhdr *in;
95
+ uint64_t perm, shared_perm;
556
+ struct virtio_blk_outhdr out;
96
+
557
+ uint32_t type;
97
+ int fd = mkstemp(img_path);
558
+ int in_len;
98
+ assert(fd >= 0);
559
+
99
+
560
+ if (out_num < 1 || in_num < 1) {
100
+ perm = BLK_PERM_WRITE | BLK_PERM_CONSISTENT_READ;
561
+ error_report("virtio-blk request missing headers");
101
+ shared_perm = BLK_PERM_ALL;
562
+ return -EINVAL;
102
+ blk1 = open_image(img_path, perm, shared_perm, &error_abort);
563
+ }
103
+ g_assert(blk1);
564
+
104
+
565
+ if (unlikely(iov_to_buf(out_iov, out_num, 0, &out,
105
+ check_locked_bytes(fd, perm, ~shared_perm);
566
+ sizeof(out)) != sizeof(out))) {
106
+
567
+ error_report("virtio-blk request outhdr too short");
107
+ /* compatible perm between blk1 and blk2 */
568
+ return -EINVAL;
108
+ blk2 = open_image(img_path, perm | BLK_PERM_RESIZE, shared_perm, NULL);
569
+ }
109
+ g_assert(blk2);
570
+
110
+ check_locked_bytes(fd, perm | BLK_PERM_RESIZE, ~shared_perm);
571
+ iov_discard_front(&out_iov, &out_num, sizeof(out));
111
+
572
+
112
+ /* incompatible perm with already open blk1 and blk2 */
573
+ if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
113
+ blk3 = open_image(img_path, perm, BLK_PERM_WRITE_UNCHANGED, NULL);
574
+ error_report("virtio-blk request inhdr too short");
114
+ g_assert_null(blk3);
575
+ return -EINVAL;
115
+
576
+ }
116
+ blk_unref(blk2);
577
+
117
+
578
+ /* We always touch the last byte, so just see how big in_iov is. */
118
+ /* Check that extra bytes in blk2 are correctly unlocked */
579
+ in_len = iov_size(in_iov, in_num);
119
+ check_locked_bytes(fd, perm, ~shared_perm);
580
+ in = (void *)in_iov[in_num - 1].iov_base
120
+
581
+ + in_iov[in_num - 1].iov_len
121
+ blk_unref(blk1);
582
+ - sizeof(struct virtio_blk_inhdr);
122
+
583
+ iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
123
+ /* Image is unused, no lock there */
584
+
124
+ check_locked_bytes(fd, 0, 0);
585
+ type = le32_to_cpu(out.type);
125
+ blk3 = open_image(img_path, perm, BLK_PERM_WRITE_UNCHANGED, &error_abort);
586
+ switch (type & ~VIRTIO_BLK_T_BARRIER) {
126
+ g_assert(blk3);
587
+ case VIRTIO_BLK_T_IN:
127
+ blk_unref(blk3);
588
+ case VIRTIO_BLK_T_OUT: {
128
+ close(fd);
589
+ QEMUIOVector qiov;
129
+ unlink(img_path);
590
+ int64_t offset;
591
+ ssize_t ret = 0;
592
+ bool is_write = type & VIRTIO_BLK_T_OUT;
593
+ int64_t sector_num = le64_to_cpu(out.sector);
594
+
595
+ if (is_write && !handler->writable) {
596
+ in->status = VIRTIO_BLK_S_IOERR;
597
+ break;
598
+ }
599
+
600
+ if (is_write) {
601
+ qemu_iovec_init_external(&qiov, out_iov, out_num);
602
+ } else {
603
+ qemu_iovec_init_external(&qiov, in_iov, in_num);
604
+ }
605
+
606
+ if (unlikely(!virtio_blk_sect_range_ok(blk,
607
+ handler->logical_block_size,
608
+ sector_num, qiov.size))) {
609
+ in->status = VIRTIO_BLK_S_IOERR;
610
+ break;
611
+ }
612
+
613
+ offset = sector_num << VIRTIO_BLK_SECTOR_BITS;
614
+
615
+ if (is_write) {
616
+ ret = blk_co_pwritev(blk, offset, qiov.size, &qiov, 0);
617
+ } else {
618
+ ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0);
619
+ }
620
+ if (ret >= 0) {
621
+ in->status = VIRTIO_BLK_S_OK;
622
+ } else {
623
+ in->status = VIRTIO_BLK_S_IOERR;
624
+ }
625
+ break;
626
+ }
627
+ case VIRTIO_BLK_T_FLUSH:
628
+ if (blk_co_flush(blk) == 0) {
629
+ in->status = VIRTIO_BLK_S_OK;
630
+ } else {
631
+ in->status = VIRTIO_BLK_S_IOERR;
632
+ }
633
+ break;
634
+ case VIRTIO_BLK_T_GET_ID: {
635
+ size_t size = MIN(strlen(handler->serial) + 1,
636
+ MIN(iov_size(in_iov, in_num),
637
+ VIRTIO_BLK_ID_BYTES));
638
+ iov_from_buf(in_iov, in_num, 0, handler->serial, size);
639
+ in->status = VIRTIO_BLK_S_OK;
640
+ break;
641
+ }
642
+ case VIRTIO_BLK_T_DISCARD:
643
+ case VIRTIO_BLK_T_WRITE_ZEROES:
644
+ if (!handler->writable) {
645
+ in->status = VIRTIO_BLK_S_IOERR;
646
+ break;
647
+ }
648
+ in->status = virtio_blk_discard_write_zeroes(handler, out_iov,
649
+ out_num, type);
650
+ break;
651
+ default:
652
+ in->status = VIRTIO_BLK_S_UNSUPP;
653
+ break;
654
+ }
655
+
656
+ return in_len;
130
+}
657
+}
131
+
658
diff --git a/MAINTAINERS b/MAINTAINERS
132
+static void test_set_perm_abort(void)
133
+{
134
+ BlockBackend *blk1, *blk2;
135
+ char img_path[] = "/tmp/qtest.XXXXXX";
136
+ uint64_t perm, shared_perm;
137
+ int r;
138
+ int fd = mkstemp(img_path);
139
+ assert(fd >= 0);
140
+
141
+ perm = BLK_PERM_WRITE | BLK_PERM_CONSISTENT_READ;
142
+ shared_perm = BLK_PERM_ALL;
143
+ blk1 = open_image(img_path, perm, shared_perm, &error_abort);
144
+ g_assert(blk1);
145
+
146
+ blk2 = open_image(img_path, perm, shared_perm, &error_abort);
147
+ g_assert(blk2);
148
+
149
+ check_locked_bytes(fd, perm, ~shared_perm);
150
+
151
+ /* A failed blk_set_perm mustn't change perm status (locked bytes) */
152
+ r = blk_set_perm(blk2, perm | BLK_PERM_RESIZE, BLK_PERM_WRITE_UNCHANGED,
153
+ NULL);
154
+ g_assert_cmpint(r, !=, 0);
155
+ check_locked_bytes(fd, perm, ~shared_perm);
156
+ blk_unref(blk1);
157
+ blk_unref(blk2);
158
+}
159
+
160
+int main(int argc, char **argv)
161
+{
162
+ bdrv_init();
163
+ qemu_init_main_loop(&error_abort);
164
+
165
+ g_test_init(&argc, &argv, NULL);
166
+
167
+ if (qemu_has_ofd_lock()) {
168
+ g_test_add_func("/image-locking/basic", test_image_locking_basic);
169
+ g_test_add_func("/image-locking/set-perm-abort", test_set_perm_abort);
170
+ }
171
+
172
+ return g_test_run();
173
+}
174
diff --git a/tests/Makefile.include b/tests/Makefile.include
175
index XXXXXXX..XXXXXXX 100644
659
index XXXXXXX..XXXXXXX 100644
176
--- a/tests/Makefile.include
660
--- a/MAINTAINERS
177
+++ b/tests/Makefile.include
661
+++ b/MAINTAINERS
178
@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-bdrv-drain$(EXESUF)
662
@@ -XXX,XX +XXX,XX @@ M: Coiby Xu <Coiby.Xu@gmail.com>
179
check-unit-y += tests/test-blockjob$(EXESUF)
663
S: Maintained
180
check-unit-y += tests/test-blockjob-txn$(EXESUF)
664
F: block/export/vhost-user-blk-server.c
181
check-unit-y += tests/test-block-backend$(EXESUF)
665
F: block/export/vhost-user-blk-server.h
182
+check-unit-y += tests/test-image-locking$(EXESUF)
666
+F: block/export/virtio-blk-handler.c
183
check-unit-y += tests/test-x86-cpuid$(EXESUF)
667
+F: block/export/virtio-blk-handler.h
184
# all code tested by test-x86-cpuid is inside topology.h
668
F: include/qemu/vhost-user-server.h
185
ifeq ($(CONFIG_SOFTMMU),y)
669
F: tests/qtest/libqos/vhost-user-blk.c
186
@@ -XXX,XX +XXX,XX @@ tests/test-bdrv-drain$(EXESUF): tests/test-bdrv-drain.o $(test-block-obj-y) $(te
670
F: tests/qtest/libqos/vhost-user-blk.h
187
tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
671
diff --git a/block/export/meson.build b/block/export/meson.build
188
tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
672
index XXXXXXX..XXXXXXX 100644
189
tests/test-block-backend$(EXESUF): tests/test-block-backend.o $(test-block-obj-y) $(test-util-obj-y)
673
--- a/block/export/meson.build
190
+tests/test-image-locking$(EXESUF): tests/test-image-locking.o $(test-block-obj-y) $(test-util-obj-y)
674
+++ b/block/export/meson.build
191
tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
675
@@ -XXX,XX +XXX,XX @@
192
tests/test-iov$(EXESUF): tests/test-iov.o $(test-util-obj-y)
676
blockdev_ss.add(files('export.c'))
193
tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o $(test-util-obj-y) $(test-crypto-obj-y)
677
678
if have_vhost_user_blk_server
679
- blockdev_ss.add(files('vhost-user-blk-server.c'))
680
+ blockdev_ss.add(files('vhost-user-blk-server.c', 'virtio-blk-handler.c'))
681
endif
682
683
blockdev_ss.add(when: fuse, if_true: files('fuse.c'))
194
--
684
--
195
2.19.1
685
2.35.3
196
197
diff view generated by jsdifflib
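To make the request framing used by virtio_blk_process_req() above concrete, here is a minimal caller sketch. It is not code from this series: the helper name, the fixed 512-byte read at sector 0 and the coroutine context are assumptions for illustration only. It places the virtio_blk_outhdr at the front of out_iov and the one-byte virtio_blk_inhdr (defined in virtio-blk-handler.c above) at the end of in_iov, which is the layout the handler expects.

/* Hypothetical sketch: issue a single 512-byte read through the handler. */
static int coroutine_fn example_read_sector0(VirtioBlkHandler *handler,
                                             void *buf)
{
    struct virtio_blk_outhdr out = {
        .type = cpu_to_le32(VIRTIO_BLK_T_IN),
        .sector = cpu_to_le64(0),
    };
    struct virtio_blk_inhdr in = { .status = VIRTIO_BLK_S_IOERR };
    struct iovec out_iov[] = {
        { .iov_base = &out, .iov_len = sizeof(out) },   /* request header */
    };
    struct iovec in_iov[] = {
        { .iov_base = buf, .iov_len = 512 },            /* data read into buf */
        { .iov_base = &in, .iov_len = sizeof(in) },     /* status byte */
    };

    /* Returns the in_iov length on success; the result ends up in in.status. */
    return virtio_blk_process_req(handler, in_iov, out_iov, 2, 1);
}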
1
From: Li Qiang <liq3ea@gmail.com>
1
From: Xie Yongji <xieyongji@bytedance.com>
2
2
3
This avoids a memory leak when hot-unplugging an nvme device.
3
This adds the vduse header to linux-headers so that the
4
relevant VDUSE API can be used in subsequent patches.
4
5
5
Signed-off-by: Li Qiang <liq3ea@gmail.com>
6
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
6
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
7
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
8
Message-Id: <20220523084611.91-5-xieyongji@bytedance.com>
7
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
8
---
10
---
9
hw/block/nvme.c | 3 +++
11
linux-headers/linux/vduse.h | 306 ++++++++++++++++++++++++++++++++
10
1 file changed, 3 insertions(+)
12
scripts/update-linux-headers.sh | 2 +-
13
2 files changed, 307 insertions(+), 1 deletion(-)
14
create mode 100644 linux-headers/linux/vduse.h
11
15
12
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
16
diff --git a/linux-headers/linux/vduse.h b/linux-headers/linux/vduse.h
13
index XXXXXXX..XXXXXXX 100644
17
new file mode 100644
14
--- a/hw/block/nvme.c
18
index XXXXXXX..XXXXXXX
15
+++ b/hw/block/nvme.c
19
--- /dev/null
16
@@ -XXX,XX +XXX,XX @@ static void nvme_exit(PCIDevice *pci_dev)
20
+++ b/linux-headers/linux/vduse.h
17
g_free(n->cq);
21
@@ -XXX,XX +XXX,XX @@
18
g_free(n->sq);
22
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
19
23
+#ifndef _VDUSE_H_
20
+ if (n->cmb_size_mb) {
24
+#define _VDUSE_H_
21
+ g_free(n->cmbuf);
25
+
22
+ }
26
+#include <linux/types.h>
23
msix_uninit_exclusive_bar(pci_dev);
27
+
24
}
28
+#define VDUSE_BASE    0x81
29
+
30
+/* The ioctls for control device (/dev/vduse/control) */
31
+
32
+#define VDUSE_API_VERSION    0
33
+
34
+/*
35
+ * Get the version of VDUSE API that kernel supported (VDUSE_API_VERSION).
36
+ * This is used for future extension.
37
+ */
38
+#define VDUSE_GET_API_VERSION    _IOR(VDUSE_BASE, 0x00, __u64)
39
+
40
+/* Set the version of VDUSE API that userspace supported. */
41
+#define VDUSE_SET_API_VERSION    _IOW(VDUSE_BASE, 0x01, __u64)
42
+
43
+/**
44
+ * struct vduse_dev_config - basic configuration of a VDUSE device
45
+ * @name: VDUSE device name, needs to be NUL terminated
46
+ * @vendor_id: virtio vendor id
47
+ * @device_id: virtio device id
48
+ * @features: virtio features
49
+ * @vq_num: the number of virtqueues
50
+ * @vq_align: the allocation alignment of virtqueue's metadata
51
+ * @reserved: for future use, needs to be initialized to zero
52
+ * @config_size: the size of the configuration space
53
+ * @config: the buffer of the configuration space
54
+ *
55
+ * Structure used by VDUSE_CREATE_DEV ioctl to create VDUSE device.
56
+ */
57
+struct vduse_dev_config {
58
+#define VDUSE_NAME_MAX    256
59
+    char name[VDUSE_NAME_MAX];
60
+    __u32 vendor_id;
61
+    __u32 device_id;
62
+    __u64 features;
63
+    __u32 vq_num;
64
+    __u32 vq_align;
65
+    __u32 reserved[13];
66
+    __u32 config_size;
67
+    __u8 config[];
68
+};
69
+
70
+/* Create a VDUSE device which is represented by a char device (/dev/vduse/$NAME) */
71
+#define VDUSE_CREATE_DEV    _IOW(VDUSE_BASE, 0x02, struct vduse_dev_config)
72
+
73
+/*
74
+ * Destroy a VDUSE device. Make sure there are no more references
75
+ * to the char device (/dev/vduse/$NAME).
76
+ */
77
+#define VDUSE_DESTROY_DEV    _IOW(VDUSE_BASE, 0x03, char[VDUSE_NAME_MAX])
78
+
79
+/* The ioctls for VDUSE device (/dev/vduse/$NAME) */
80
+
81
+/**
82
+ * struct vduse_iotlb_entry - entry of IOTLB to describe one IOVA region [start, last]
83
+ * @offset: the mmap offset on returned file descriptor
84
+ * @start: start of the IOVA region
85
+ * @last: last of the IOVA region
86
+ * @perm: access permission of the IOVA region
87
+ *
88
+ * Structure used by VDUSE_IOTLB_GET_FD ioctl to find an overlapped IOVA region.
89
+ */
90
+struct vduse_iotlb_entry {
91
+    __u64 offset;
92
+    __u64 start;
93
+    __u64 last;
94
+#define VDUSE_ACCESS_RO 0x1
95
+#define VDUSE_ACCESS_WO 0x2
96
+#define VDUSE_ACCESS_RW 0x3
97
+    __u8 perm;
98
+};
99
+
100
+/*
101
+ * Find the first IOVA region that overlaps with the range [start, last]
102
+ * and return the corresponding file descriptor. Return -EINVAL means the
103
+ * IOVA region doesn't exist. Caller should set start and last fields.
104
+ */
105
+#define VDUSE_IOTLB_GET_FD    _IOWR(VDUSE_BASE, 0x10, struct vduse_iotlb_entry)
106
+
107
+/*
108
+ * Get the negotiated virtio features. It's a subset of the features in
109
+ * struct vduse_dev_config which can be accepted by virtio driver. It's
110
+ * only valid after FEATURES_OK status bit is set.
111
+ */
112
+#define VDUSE_DEV_GET_FEATURES    _IOR(VDUSE_BASE, 0x11, __u64)
113
+
114
+/**
115
+ * struct vduse_config_data - data used to update configuration space
116
+ * @offset: the offset from the beginning of configuration space
117
+ * @length: the length to write to configuration space
118
+ * @buffer: the buffer used to write from
119
+ *
120
+ * Structure used by VDUSE_DEV_SET_CONFIG ioctl to update device
121
+ * configuration space.
122
+ */
123
+struct vduse_config_data {
124
+    __u32 offset;
125
+    __u32 length;
126
+    __u8 buffer[];
127
+};
128
+
129
+/* Set device configuration space */
130
+#define VDUSE_DEV_SET_CONFIG    _IOW(VDUSE_BASE, 0x12, struct vduse_config_data)
131
+
132
+/*
133
+ * Inject a config interrupt. It's usually used to notify virtio driver
134
+ * that device configuration space has changed.
135
+ */
136
+#define VDUSE_DEV_INJECT_CONFIG_IRQ    _IO(VDUSE_BASE, 0x13)
137
+
138
+/**
139
+ * struct vduse_vq_config - basic configuration of a virtqueue
140
+ * @index: virtqueue index
141
+ * @max_size: the max size of virtqueue
142
+ * @reserved: for future use, needs to be initialized to zero
143
+ *
144
+ * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue.
145
+ */
146
+struct vduse_vq_config {
147
+    __u32 index;
148
+    __u16 max_size;
149
+    __u16 reserved[13];
150
+};
151
+
152
+/*
153
+ * Setup the specified virtqueue. Make sure all virtqueues have been
154
+ * configured before the device is attached to vDPA bus.
155
+ */
156
+#define VDUSE_VQ_SETUP        _IOW(VDUSE_BASE, 0x14, struct vduse_vq_config)
157
+
158
+/**
159
+ * struct vduse_vq_state_split - split virtqueue state
160
+ * @avail_index: available index
161
+ */
162
+struct vduse_vq_state_split {
163
+    __u16 avail_index;
164
+};
165
+
166
+/**
167
+ * struct vduse_vq_state_packed - packed virtqueue state
168
+ * @last_avail_counter: last driver ring wrap counter observed by device
169
+ * @last_avail_idx: device available index
170
+ * @last_used_counter: device ring wrap counter
171
+ * @last_used_idx: used index
172
+ */
173
+struct vduse_vq_state_packed {
174
+    __u16 last_avail_counter;
175
+    __u16 last_avail_idx;
176
+    __u16 last_used_counter;
177
+    __u16 last_used_idx;
178
+};
179
+
180
+/**
181
+ * struct vduse_vq_info - information of a virtqueue
182
+ * @index: virtqueue index
183
+ * @num: the size of virtqueue
184
+ * @desc_addr: address of desc area
185
+ * @driver_addr: address of driver area
186
+ * @device_addr: address of device area
187
+ * @split: split virtqueue state
188
+ * @packed: packed virtqueue state
189
+ * @ready: ready status of virtqueue
190
+ *
191
+ * Structure used by VDUSE_VQ_GET_INFO ioctl to get virtqueue's information.
192
+ */
193
+struct vduse_vq_info {
194
+    __u32 index;
195
+    __u32 num;
196
+    __u64 desc_addr;
197
+    __u64 driver_addr;
198
+    __u64 device_addr;
199
+    union {
200
+        struct vduse_vq_state_split split;
201
+        struct vduse_vq_state_packed packed;
202
+    };
203
+    __u8 ready;
204
+};
205
+
206
+/* Get the specified virtqueue's information. Caller should set index field. */
207
+#define VDUSE_VQ_GET_INFO    _IOWR(VDUSE_BASE, 0x15, struct vduse_vq_info)
208
+
209
+/**
210
+ * struct vduse_vq_eventfd - eventfd configuration for a virtqueue
211
+ * @index: virtqueue index
212
+ * @fd: eventfd, -1 means de-assigning the eventfd
213
+ *
214
+ * Structure used by VDUSE_VQ_SETUP_KICKFD ioctl to setup kick eventfd.
215
+ */
216
+struct vduse_vq_eventfd {
217
+    __u32 index;
218
+#define VDUSE_EVENTFD_DEASSIGN -1
219
+    int fd;
220
+};
221
+
222
+/*
223
+ * Setup kick eventfd for specified virtqueue. The kick eventfd is used
224
+ * by VDUSE kernel module to notify userspace to consume the avail vring.
225
+ */
226
+#define VDUSE_VQ_SETUP_KICKFD    _IOW(VDUSE_BASE, 0x16, struct vduse_vq_eventfd)
227
+
228
+/*
229
+ * Inject an interrupt for specific virtqueue. It's used to notify virtio driver
230
+ * to consume the used vring.
231
+ */
232
+#define VDUSE_VQ_INJECT_IRQ    _IOW(VDUSE_BASE, 0x17, __u32)
233
+
234
+/* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */
235
+
236
+/**
237
+ * enum vduse_req_type - request type
238
+ * @VDUSE_GET_VQ_STATE: get the state for specified virtqueue from userspace
239
+ * @VDUSE_SET_STATUS: set the device status
240
+ * @VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for
241
+ * specified IOVA range via VDUSE_IOTLB_GET_FD ioctl
242
+ */
243
+enum vduse_req_type {
244
+    VDUSE_GET_VQ_STATE,
245
+    VDUSE_SET_STATUS,
246
+    VDUSE_UPDATE_IOTLB,
247
+};
248
+
249
+/**
250
+ * struct vduse_vq_state - virtqueue state
251
+ * @index: virtqueue index
252
+ * @split: split virtqueue state
253
+ * @packed: packed virtqueue state
254
+ */
255
+struct vduse_vq_state {
256
+    __u32 index;
257
+    union {
258
+        struct vduse_vq_state_split split;
259
+        struct vduse_vq_state_packed packed;
260
+    };
261
+};
262
+
263
+/**
264
+ * struct vduse_dev_status - device status
265
+ * @status: device status
266
+ */
267
+struct vduse_dev_status {
268
+    __u8 status;
269
+};
270
+
271
+/**
272
+ * struct vduse_iova_range - IOVA range [start, last]
273
+ * @start: start of the IOVA range
274
+ * @last: last of the IOVA range
275
+ */
276
+struct vduse_iova_range {
277
+    __u64 start;
278
+    __u64 last;
279
+};
280
+
281
+/**
282
+ * struct vduse_dev_request - control request
283
+ * @type: request type
284
+ * @request_id: request id
285
+ * @reserved: for future use
286
+ * @vq_state: virtqueue state, only index field is available
287
+ * @s: device status
288
+ * @iova: IOVA range for updating
289
+ * @padding: padding
290
+ *
291
+ * Structure used by read(2) on /dev/vduse/$NAME.
292
+ */
293
+struct vduse_dev_request {
294
+    __u32 type;
295
+    __u32 request_id;
296
+    __u32 reserved[4];
297
+    union {
298
+        struct vduse_vq_state vq_state;
299
+        struct vduse_dev_status s;
300
+        struct vduse_iova_range iova;
301
+        __u32 padding[32];
302
+    };
303
+};
304
+
305
+/**
306
+ * struct vduse_dev_response - response to control request
307
+ * @request_id: corresponding request id
308
+ * @result: the result of request
309
+ * @reserved: for future use, needs to be initialized to zero
310
+ * @vq_state: virtqueue state
311
+ * @padding: padding
312
+ *
313
+ * Structure used by write(2) on /dev/vduse/$NAME.
314
+ */
315
+struct vduse_dev_response {
316
+    __u32 request_id;
317
+#define VDUSE_REQ_RESULT_OK    0x00
318
+#define VDUSE_REQ_RESULT_FAILED    0x01
319
+    __u32 result;
320
+    __u32 reserved[4];
321
+    union {
322
+        struct vduse_vq_state vq_state;
323
+        __u32 padding[32];
324
+    };
325
+};
326
+
327
+#endif /* _VDUSE_H_ */
328
diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
329
index XXXXXXX..XXXXXXX 100755
330
--- a/scripts/update-linux-headers.sh
331
+++ b/scripts/update-linux-headers.sh
332
@@ -XXX,XX +XXX,XX @@ done
333
rm -rf "$output/linux-headers/linux"
334
mkdir -p "$output/linux-headers/linux"
335
for header in kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h \
336
- psci.h psp-sev.h userfaultfd.h mman.h; do
337
+ psci.h psp-sev.h userfaultfd.h mman.h vduse.h; do
338
cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
339
done
25
340
26
--
341
--
27
2.19.1
342
2.35.3
28
29
diff view generated by jsdifflib
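As background for the ioctl interface added above, the following hypothetical sketch shows roughly how a userspace backend would create a VDUSE device through /dev/vduse/control. The virtio-blk device id (2), the VIRTIO_F_VERSION_1 feature bit, the single queue, the alignment and the empty config space are illustrative assumptions, not values taken from this series, and the sketch assumes the UAPI header is installed as <linux/vduse.h>.

/* Hypothetical sketch: negotiate the API version and create /dev/vduse/$NAME. */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vduse.h>

static int example_create_device(const char *name)
{
    uint64_t api_version = VDUSE_API_VERSION;
    struct vduse_dev_config config = {
        .device_id = 2,            /* virtio-blk (assumption) */
        .vendor_id = 0,
        .features = 1ULL << 32,    /* VIRTIO_F_VERSION_1 (assumption) */
        .vq_num = 1,
        .vq_align = 4096,
        .config_size = 0,          /* a real device supplies its config space */
    };
    int ctrl_fd = open("/dev/vduse/control", O_RDWR);

    if (ctrl_fd < 0) {
        return -1;
    }
    strncpy(config.name, name, VDUSE_NAME_MAX - 1);

    if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &api_version) ||
        ioctl(ctrl_fd, VDUSE_CREATE_DEV, &config)) {
        close(ctrl_fd);
        return -1;
    }

    /* /dev/vduse/$NAME now exists; keep ctrl_fd around to destroy it later. */
    return ctrl_fd;
}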
1
From: Liam Merwick <Liam.Merwick@oracle.com>
1
From: Xie Yongji <xieyongji@bytedance.com>
2
2
3
In the assert checking the array dereference of JobVerbTable[verb]
3
VDUSE [1] is a linux framework that makes it possible to implement
4
in job_apply_verb() the check of the index, verb, allows an overrun
4
software-emulated vDPA devices in userspace. This adds a library
5
because an index equal to the array size is permitted.
5
as a subproject to help implementing VDUSE backends in QEMU.
6
6
7
Similarly, in the assert check of JobSTT[s0][s1] with index s1
7
[1] https://www.kernel.org/doc/html/latest/userspace-api/vduse.html
8
in job_state_transition(), an off-by-one overrun is not flagged
9
either.
10
8
11
This is not a run-time issue as there are no callers actually
9
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
12
passing in the max value.
10
Message-Id: <20220523084611.91-6-xieyongji@bytedance.com>
11
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
14
meson_options.txt | 2 +
15
subprojects/libvduse/include/atomic.h | 1 +
16
subprojects/libvduse/include/compiler.h | 1 +
17
subprojects/libvduse/libvduse.h | 235 ++++
18
subprojects/libvduse/libvduse.c | 1150 +++++++++++++++++++
19
MAINTAINERS | 5 +
20
meson.build | 15 +
21
scripts/meson-buildoptions.sh | 3 +
22
subprojects/libvduse/linux-headers/linux | 1 +
23
subprojects/libvduse/meson.build | 10 +
24
subprojects/libvduse/standard-headers/linux | 1 +
25
11 files changed, 1424 insertions(+)
26
create mode 120000 subprojects/libvduse/include/atomic.h
27
create mode 120000 subprojects/libvduse/include/compiler.h
28
create mode 100644 subprojects/libvduse/libvduse.h
29
create mode 100644 subprojects/libvduse/libvduse.c
30
create mode 120000 subprojects/libvduse/linux-headers/linux
31
create mode 100644 subprojects/libvduse/meson.build
32
create mode 120000 subprojects/libvduse/standard-headers/linux
13
33
14
Signed-off-by: Liam Merwick <Liam.Merwick@oracle.com>
34
diff --git a/meson_options.txt b/meson_options.txt
15
Reviewed-by: Darren Kenny <Darren.Kenny@oracle.com>
16
Reviewed-by: Mark Kanda <Mark.Kanda@oracle.com>
17
Reviewed-by: Eric Blake <eblake@redhat.com>
18
Reviewed-by: John Snow <jsnow@redhat.com>
19
Message-id: 1541453919-25973-2-git-send-email-Liam.Merwick@oracle.com
20
Signed-off-by: Max Reitz <mreitz@redhat.com>
21
---
22
job.c | 4 ++--
23
1 file changed, 2 insertions(+), 2 deletions(-)
24
25
diff --git a/job.c b/job.c
26
index XXXXXXX..XXXXXXX 100644
35
index XXXXXXX..XXXXXXX 100644
27
--- a/job.c
36
--- a/meson_options.txt
28
+++ b/job.c
37
+++ b/meson_options.txt
29
@@ -XXX,XX +XXX,XX @@ bool job_is_internal(Job *job)
38
@@ -XXX,XX +XXX,XX @@ option('virtfs', type: 'feature', value: 'auto',
30
static void job_state_transition(Job *job, JobStatus s1)
39
description: 'virtio-9p support')
31
{
40
option('virtiofsd', type: 'feature', value: 'auto',
32
JobStatus s0 = job->status;
41
description: 'build virtiofs daemon (virtiofsd)')
33
- assert(s1 >= 0 && s1 <= JOB_STATUS__MAX);
42
+option('libvduse', type: 'feature', value: 'auto',
34
+ assert(s1 >= 0 && s1 < JOB_STATUS__MAX);
43
+ description: 'build VDUSE Library')
35
trace_job_state_transition(job, job->ret,
44
36
JobSTT[s0][s1] ? "allowed" : "disallowed",
45
option('capstone', type: 'feature', value: 'auto',
37
JobStatus_str(s0), JobStatus_str(s1));
46
description: 'Whether and how to find the capstone library')
38
@@ -XXX,XX +XXX,XX @@ static void job_state_transition(Job *job, JobStatus s1)
47
diff --git a/subprojects/libvduse/include/atomic.h b/subprojects/libvduse/include/atomic.h
39
int job_apply_verb(Job *job, JobVerb verb, Error **errp)
48
new file mode 120000
40
{
49
index XXXXXXX..XXXXXXX
41
JobStatus s0 = job->status;
50
--- /dev/null
42
- assert(verb >= 0 && verb <= JOB_VERB__MAX);
51
+++ b/subprojects/libvduse/include/atomic.h
43
+ assert(verb >= 0 && verb < JOB_VERB__MAX);
52
@@ -0,0 +1 @@
44
trace_job_apply_verb(job, JobStatus_str(s0), JobVerb_str(verb),
53
+../../../include/qemu/atomic.h
45
JobVerbTable[verb][s0] ? "allowed" : "prohibited");
54
\ No newline at end of file
46
if (JobVerbTable[verb][s0]) {
55
diff --git a/subprojects/libvduse/include/compiler.h b/subprojects/libvduse/include/compiler.h
56
new file mode 120000
57
index XXXXXXX..XXXXXXX
58
--- /dev/null
59
+++ b/subprojects/libvduse/include/compiler.h
60
@@ -0,0 +1 @@
61
+../../../include/qemu/compiler.h
62
\ No newline at end of file
63
diff --git a/subprojects/libvduse/libvduse.h b/subprojects/libvduse/libvduse.h
64
new file mode 100644
65
index XXXXXXX..XXXXXXX
66
--- /dev/null
67
+++ b/subprojects/libvduse/libvduse.h
68
@@ -XXX,XX +XXX,XX @@
69
+/*
70
+ * VDUSE (vDPA Device in Userspace) library
71
+ *
72
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
73
+ *
74
+ * Author:
75
+ * Xie Yongji <xieyongji@bytedance.com>
76
+ *
77
+ * This work is licensed under the terms of the GNU GPL, version 2 or
78
+ * later. See the COPYING file in the top-level directory.
79
+ */
80
+
81
+#ifndef LIBVDUSE_H
82
+#define LIBVDUSE_H
83
+
84
+#include <stdint.h>
85
+#include <sys/uio.h>
86
+
87
+#define VIRTQUEUE_MAX_SIZE 1024
88
+
89
+/* VDUSE device structure */
90
+typedef struct VduseDev VduseDev;
91
+
92
+/* Virtqueue structure */
93
+typedef struct VduseVirtq VduseVirtq;
94
+
95
+/* Some operation of VDUSE backend */
96
+typedef struct VduseOps {
97
+ /* Called when virtqueue can be processed */
98
+ void (*enable_queue)(VduseDev *dev, VduseVirtq *vq);
99
+ /* Called when virtqueue processing should be stopped */
100
+ void (*disable_queue)(VduseDev *dev, VduseVirtq *vq);
101
+} VduseOps;
102
+
103
+/* Describing elements of the I/O buffer */
104
+typedef struct VduseVirtqElement {
105
+ /* Descriptor table index */
106
+ unsigned int index;
107
+ /* Number of physically-contiguous device-readable descriptors */
108
+ unsigned int out_num;
109
+ /* Number of physically-contiguous device-writable descriptors */
110
+ unsigned int in_num;
111
+ /* Array to store physically-contiguous device-writable descriptors */
112
+ struct iovec *in_sg;
113
+ /* Array to store physically-contiguous device-readable descriptors */
114
+ struct iovec *out_sg;
115
+} VduseVirtqElement;
116
+
117
+
118
+/**
119
+ * vduse_get_virtio_features:
120
+ *
121
+ * Get supported virtio features
122
+ *
123
+ * Returns: supported feature bits
124
+ */
125
+uint64_t vduse_get_virtio_features(void);
126
+
127
+/**
128
+ * vduse_queue_get_dev:
129
+ * @vq: specified virtqueue
130
+ *
131
+ * Get corresponding VDUSE device from the virtqueue.
132
+ *
133
+ * Returns: a pointer to VDUSE device on success, NULL on failure.
134
+ */
135
+VduseDev *vduse_queue_get_dev(VduseVirtq *vq);
136
+
137
+/**
138
+ * vduse_queue_get_fd:
139
+ * @vq: specified virtqueue
140
+ *
141
+ * Get the kick fd for the virtqueue.
142
+ *
143
+ * Returns: file descriptor on success, -1 on failure.
144
+ */
145
+int vduse_queue_get_fd(VduseVirtq *vq);
146
+
147
+/**
148
+ * vduse_queue_pop:
149
+ * @vq: specified virtqueue
150
+ * @sz: the size of struct to return (must be >= VduseVirtqElement)
151
+ *
152
+ * Pop an element from virtqueue available ring.
153
+ *
154
+ * Returns: a pointer to a structure containing VduseVirtqElement on success,
155
+ * NULL on failure.
156
+ */
157
+void *vduse_queue_pop(VduseVirtq *vq, size_t sz);
158
+
159
+/**
160
+ * vduse_queue_push:
161
+ * @vq: specified virtqueue
162
+ * @elem: pointer to VduseVirtqElement returned by vduse_queue_pop()
163
+ * @len: length in bytes to write
164
+ *
165
+ * Push an element to virtqueue used ring.
166
+ */
167
+void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
168
+ unsigned int len);
169
+/**
170
+ * vduse_queue_notify:
171
+ * @vq: specified virtqueue
172
+ *
173
+ * Request to notify the queue.
174
+ */
175
+void vduse_queue_notify(VduseVirtq *vq);
176
+
177
+/**
178
+ * vduse_dev_get_priv:
179
+ * @dev: VDUSE device
180
+ *
181
+ * Get the private pointer passed to vduse_dev_create().
182
+ *
183
+ * Returns: private pointer on success, NULL on failure.
184
+ */
185
+void *vduse_dev_get_priv(VduseDev *dev);
186
+
187
+/**
188
+ * vduse_dev_get_queue:
189
+ * @dev: VDUSE device
190
+ * @index: virtqueue index
191
+ *
192
+ * Get the specified virtqueue.
193
+ *
194
+ * Returns: a pointer to the virtqueue on success, NULL on failure.
195
+ */
196
+VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index);
197
+
198
+/**
199
+ * vduse_dev_get_fd:
200
+ * @dev: VDUSE device
201
+ *
202
+ * Get the control message fd for the VDUSE device.
203
+ *
204
+ * Returns: file descriptor on success, -1 on failure.
205
+ */
206
+int vduse_dev_get_fd(VduseDev *dev);
207
+
208
+/**
209
+ * vduse_dev_handler:
210
+ * @dev: VDUSE device
211
+ *
212
+ * Used to process the control message.
213
+ *
214
+ * Returns: file descriptor on success, -errno on failure.
215
+ */
216
+int vduse_dev_handler(VduseDev *dev);
217
+
218
+/**
219
+ * vduse_dev_update_config:
220
+ * @dev: VDUSE device
221
+ * @size: the size to write to configuration space
222
+ * @offset: the offset from the beginning of configuration space
223
+ * @buffer: the buffer used to write from
224
+ *
225
+ * Update device configuration space and inject a config interrupt.
226
+ *
227
+ * Returns: 0 on success, -errno on failure.
228
+ */
229
+int vduse_dev_update_config(VduseDev *dev, uint32_t size,
230
+ uint32_t offset, char *buffer);
231
+
232
+/**
233
+ * vduse_dev_setup_queue:
234
+ * @dev: VDUSE device
235
+ * @index: virtqueue index
236
+ * @max_size: the max size of virtqueue
237
+ *
238
+ * Setup the specified virtqueue.
239
+ *
240
+ * Returns: 0 on success, -errno on failure.
241
+ */
242
+int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size);
243
+
244
+/**
245
+ * vduse_dev_create_by_fd:
246
+ * @fd: passed file descriptor
247
+ * @num_queues: the number of virtqueues
248
+ * @ops: the operation of VDUSE backend
249
+ * @priv: private pointer
250
+ *
251
+ * Create VDUSE device from a passed file descriptor.
252
+ *
253
+ * Returns: pointer to VDUSE device on success, NULL on failure.
254
+ */
255
+VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
256
+ const VduseOps *ops, void *priv);
257
+
258
+/**
259
+ * vduse_dev_create_by_name:
260
+ * @name: VDUSE device name
261
+ * @num_queues: the number of virtqueues
262
+ * @ops: the operation of VDUSE backend
263
+ * @priv: private pointer
264
+ *
265
+ * Create VDUSE device on /dev/vduse/$NAME.
266
+ *
267
+ * Returns: pointer to VDUSE device on success, NULL on failure.
268
+ */
269
+VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
270
+ const VduseOps *ops, void *priv);
271
+
272
+/**
273
+ * vduse_dev_create:
274
+ * @name: VDUSE device name
275
+ * @device_id: virtio device id
276
+ * @vendor_id: virtio vendor id
277
+ * @features: virtio features
278
+ * @num_queues: the number of virtqueues
279
+ * @config_size: the size of the configuration space
280
+ * @config: the buffer of the configuration space
281
+ * @ops: the operation of VDUSE backend
282
+ * @priv: private pointer
283
+ *
284
+ * Create VDUSE device.
285
+ *
286
+ * Returns: pointer to VDUSE device on success, NULL on failure.
287
+ */
288
+VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
289
+ uint32_t vendor_id, uint64_t features,
290
+ uint16_t num_queues, uint32_t config_size,
291
+ char *config, const VduseOps *ops, void *priv);
292
+
293
+/**
294
+ * vduse_dev_destroy:
295
+ * @dev: VDUSE device
296
+ *
297
+ * Destroy the VDUSE device.
298
+ *
299
+ * Returns: 0 on success, -errno on failure.
300
+ */
301
+int vduse_dev_destroy(VduseDev *dev);
302
+
303
+#endif
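Before diving into the implementation, a hypothetical sketch of how a standalone backend could sit on top of the API declared above may help. The device id, the feature set, the single queue, the empty config space and the trivial "complete everything immediately" processing are assumptions made purely for illustration; the vduse-blk export added later in this series is the real consumer.

#include <poll.h>
#include <stdlib.h>
#include "libvduse.h"

/* Hypothetical queue callbacks: a real backend would add/remove the kick fd
 * (vduse_queue_get_fd()) in its event loop instead of draining inline. */
static void example_enable_queue(VduseDev *dev, VduseVirtq *vq)
{
    VduseVirtqElement *elem;

    while ((elem = vduse_queue_pop(vq, sizeof(*elem))) != NULL) {
        /* elem->out_sg/elem->in_sg describe the request; complete it as-is. */
        vduse_queue_push(vq, elem, 0);
        vduse_queue_notify(vq);
        free(elem);
    }
}

static void example_disable_queue(VduseDev *dev, VduseVirtq *vq)
{
    /* Stop watching vduse_queue_get_fd(vq) here. */
}

static const VduseOps example_ops = {
    .enable_queue = example_enable_queue,
    .disable_queue = example_disable_queue,
};

int main(void)
{
    /* Device id 2 (virtio-blk), one queue and no config space are assumptions. */
    VduseDev *dev = vduse_dev_create("example", 2, 0,
                                     vduse_get_virtio_features(), 1,
                                     0, NULL, &example_ops, NULL);
    struct pollfd pfd;

    if (!dev) {
        return 1;
    }
    vduse_dev_setup_queue(dev, 0, 256);

    /* Control messages (status changes, IOTLB updates) arrive on this fd. */
    pfd.fd = vduse_dev_get_fd(dev);
    pfd.events = POLLIN;
    while (poll(&pfd, 1, -1) >= 0) {
        if (pfd.revents & POLLIN) {
            vduse_dev_handler(dev);
        }
    }

    return vduse_dev_destroy(dev);
}

The split between a per-device control-message fd and per-queue kick eventfds mirrors the vhost-user design that libvduse borrows from, as the implementation below notes.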
304
diff --git a/subprojects/libvduse/libvduse.c b/subprojects/libvduse/libvduse.c
305
new file mode 100644
306
index XXXXXXX..XXXXXXX
307
--- /dev/null
308
+++ b/subprojects/libvduse/libvduse.c
309
@@ -XXX,XX +XXX,XX @@
310
+/*
311
+ * VDUSE (vDPA Device in Userspace) library
312
+ *
313
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
314
+ * Portions of code and concepts borrowed from libvhost-user.c, so:
315
+ * Copyright IBM, Corp. 2007
316
+ * Copyright (c) 2016 Red Hat, Inc.
317
+ *
318
+ * Author:
319
+ * Xie Yongji <xieyongji@bytedance.com>
320
+ * Anthony Liguori <aliguori@us.ibm.com>
321
+ * Marc-André Lureau <mlureau@redhat.com>
322
+ * Victor Kaplansky <victork@redhat.com>
323
+ *
324
+ * This work is licensed under the terms of the GNU GPL, version 2 or
325
+ * later. See the COPYING file in the top-level directory.
326
+ */
327
+
328
+#include <stdlib.h>
329
+#include <stdio.h>
330
+#include <stdbool.h>
331
+#include <stddef.h>
332
+#include <errno.h>
333
+#include <string.h>
334
+#include <assert.h>
335
+#include <endian.h>
336
+#include <unistd.h>
337
+#include <limits.h>
338
+#include <fcntl.h>
339
+#include <inttypes.h>
340
+
341
+#include <sys/ioctl.h>
342
+#include <sys/eventfd.h>
343
+#include <sys/mman.h>
344
+
345
+#include "include/atomic.h"
346
+#include "linux-headers/linux/virtio_ring.h"
347
+#include "linux-headers/linux/virtio_config.h"
348
+#include "linux-headers/linux/vduse.h"
349
+#include "libvduse.h"
350
+
351
+#define VDUSE_VQ_ALIGN 4096
352
+#define MAX_IOVA_REGIONS 256
353
+
354
+/* Round number down to multiple */
355
+#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
356
+
357
+/* Round number up to multiple */
358
+#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
359
+
360
+#ifndef unlikely
361
+#define unlikely(x) __builtin_expect(!!(x), 0)
362
+#endif
363
+
364
+typedef struct VduseRing {
365
+ unsigned int num;
366
+ uint64_t desc_addr;
367
+ uint64_t avail_addr;
368
+ uint64_t used_addr;
369
+ struct vring_desc *desc;
370
+ struct vring_avail *avail;
371
+ struct vring_used *used;
372
+} VduseRing;
373
+
374
+struct VduseVirtq {
375
+ VduseRing vring;
376
+ uint16_t last_avail_idx;
377
+ uint16_t shadow_avail_idx;
378
+ uint16_t used_idx;
379
+ uint16_t signalled_used;
380
+ bool signalled_used_valid;
381
+ int index;
382
+ int inuse;
383
+ bool ready;
384
+ int fd;
385
+ VduseDev *dev;
386
+};
387
+
388
+typedef struct VduseIovaRegion {
389
+ uint64_t iova;
390
+ uint64_t size;
391
+ uint64_t mmap_offset;
392
+ uint64_t mmap_addr;
393
+} VduseIovaRegion;
394
+
395
+struct VduseDev {
396
+ VduseVirtq *vqs;
397
+ VduseIovaRegion regions[MAX_IOVA_REGIONS];
398
+ int num_regions;
399
+ char *name;
400
+ uint32_t device_id;
401
+ uint32_t vendor_id;
402
+ uint16_t num_queues;
403
+ uint16_t queue_size;
404
+ uint64_t features;
405
+ const VduseOps *ops;
406
+ int fd;
407
+ int ctrl_fd;
408
+ void *priv;
409
+};
410
+
411
+static inline bool has_feature(uint64_t features, unsigned int fbit)
412
+{
413
+ assert(fbit < 64);
414
+ return !!(features & (1ULL << fbit));
415
+}
416
+
417
+static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit)
418
+{
419
+ return has_feature(dev->features, fbit);
420
+}
421
+
422
+uint64_t vduse_get_virtio_features(void)
423
+{
424
+ return (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
425
+ (1ULL << VIRTIO_F_VERSION_1) |
426
+ (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
427
+ (1ULL << VIRTIO_RING_F_EVENT_IDX) |
428
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC);
429
+}
430
+
431
+VduseDev *vduse_queue_get_dev(VduseVirtq *vq)
432
+{
433
+ return vq->dev;
434
+}
435
+
436
+int vduse_queue_get_fd(VduseVirtq *vq)
437
+{
438
+ return vq->fd;
439
+}
440
+
441
+void *vduse_dev_get_priv(VduseDev *dev)
442
+{
443
+ return dev->priv;
444
+}
445
+
446
+VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index)
447
+{
448
+ return &dev->vqs[index];
449
+}
450
+
451
+int vduse_dev_get_fd(VduseDev *dev)
452
+{
453
+ return dev->fd;
454
+}
455
+
456
+static int vduse_inject_irq(VduseDev *dev, int index)
457
+{
458
+ return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
459
+}
460
+
461
+static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
462
+ uint64_t last)
463
+{
464
+ int i;
465
+
466
+ if (last == start) {
467
+ return;
468
+ }
469
+
470
+ for (i = 0; i < MAX_IOVA_REGIONS; i++) {
471
+ if (!dev->regions[i].mmap_addr) {
472
+ continue;
473
+ }
474
+
475
+ if (start <= dev->regions[i].iova &&
476
+ last >= (dev->regions[i].iova + dev->regions[i].size - 1)) {
477
+ munmap((void *)(uintptr_t)dev->regions[i].mmap_addr,
478
+ dev->regions[i].mmap_offset + dev->regions[i].size);
479
+ dev->regions[i].mmap_addr = 0;
480
+ dev->num_regions--;
481
+ }
482
+ }
483
+}
484
+
485
+static int vduse_iova_add_region(VduseDev *dev, int fd,
486
+ uint64_t offset, uint64_t start,
487
+ uint64_t last, int prot)
488
+{
489
+ int i;
490
+ uint64_t size = last - start + 1;
491
+ void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);
492
+
493
+ if (mmap_addr == MAP_FAILED) {
494
+ close(fd);
495
+ return -EINVAL;
496
+ }
497
+
498
+ for (i = 0; i < MAX_IOVA_REGIONS; i++) {
499
+ if (!dev->regions[i].mmap_addr) {
500
+ dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
501
+ dev->regions[i].mmap_offset = offset;
502
+ dev->regions[i].iova = start;
503
+ dev->regions[i].size = size;
504
+ dev->num_regions++;
505
+ break;
506
+ }
507
+ }
508
+ assert(i < MAX_IOVA_REGIONS);
509
+ close(fd);
510
+
511
+ return 0;
512
+}
513
+
514
+static int perm_to_prot(uint8_t perm)
515
+{
516
+ int prot = 0;
517
+
518
+ switch (perm) {
519
+ case VDUSE_ACCESS_WO:
520
+ prot |= PROT_WRITE;
521
+ break;
522
+ case VDUSE_ACCESS_RO:
523
+ prot |= PROT_READ;
524
+ break;
525
+ case VDUSE_ACCESS_RW:
526
+ prot |= PROT_READ | PROT_WRITE;
527
+ break;
528
+ default:
529
+ break;
530
+ }
531
+
532
+ return prot;
533
+}
534
+
535
+static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova)
536
+{
537
+ int i, ret;
538
+ struct vduse_iotlb_entry entry;
539
+
540
+ for (i = 0; i < MAX_IOVA_REGIONS; i++) {
541
+ VduseIovaRegion *r = &dev->regions[i];
542
+
543
+ if (!r->mmap_addr) {
544
+ continue;
545
+ }
546
+
547
+ if ((iova >= r->iova) && (iova < (r->iova + r->size))) {
548
+ if ((iova + *plen) > (r->iova + r->size)) {
549
+ *plen = r->iova + r->size - iova;
550
+ }
551
+ return (void *)(uintptr_t)(iova - r->iova +
552
+ r->mmap_addr + r->mmap_offset);
553
+ }
554
+ }
555
+
556
+ entry.start = iova;
557
+ entry.last = iova + 1;
558
+ ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry);
559
+ if (ret < 0) {
560
+ return NULL;
561
+ }
562
+
563
+ if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start,
564
+ entry.last, perm_to_prot(entry.perm))) {
565
+ return iova_to_va(dev, plen, iova);
566
+ }
567
+
568
+ return NULL;
569
+}
570
+
571
+static inline uint16_t vring_avail_flags(VduseVirtq *vq)
572
+{
573
+ return le16toh(vq->vring.avail->flags);
574
+}
575
+
576
+static inline uint16_t vring_avail_idx(VduseVirtq *vq)
577
+{
578
+ vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);
579
+
580
+ return vq->shadow_avail_idx;
581
+}
582
+
583
+static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i)
584
+{
585
+ return le16toh(vq->vring.avail->ring[i]);
586
+}
587
+
588
+static inline uint16_t vring_get_used_event(VduseVirtq *vq)
589
+{
590
+ return vring_avail_ring(vq, vq->vring.num);
591
+}
592
+
593
+static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx,
594
+ unsigned int *head)
595
+{
596
+ /*
597
+ * Grab the next descriptor number they're advertising, and increment
598
+ * the index we've seen.
599
+ */
600
+ *head = vring_avail_ring(vq, idx % vq->vring.num);
601
+
602
+ /* If their number is silly, that's a fatal mistake. */
603
+ if (*head >= vq->vring.num) {
604
+ fprintf(stderr, "Guest says index %u is available\n", *head);
605
+ return false;
606
+ }
607
+
608
+ return true;
609
+}
610
+
611
+static int
612
+vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc,
613
+ uint64_t addr, size_t len)
614
+{
615
+ struct vring_desc *ori_desc;
616
+ uint64_t read_len;
617
+
618
+ if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
619
+ return -1;
620
+ }
621
+
622
+ if (len == 0) {
623
+ return -1;
624
+ }
625
+
626
+ while (len) {
627
+ read_len = len;
628
+ ori_desc = iova_to_va(dev, &read_len, addr);
629
+ if (!ori_desc) {
630
+ return -1;
631
+ }
632
+
633
+ memcpy(desc, ori_desc, read_len);
634
+ len -= read_len;
635
+ addr += read_len;
636
+ desc += read_len;
637
+ }
638
+
639
+ return 0;
640
+}
641
+
642
+enum {
643
+ VIRTQUEUE_READ_DESC_ERROR = -1,
644
+ VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */
645
+ VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */
646
+};
647
+
648
+static int vduse_queue_read_next_desc(struct vring_desc *desc, int i,
649
+ unsigned int max, unsigned int *next)
650
+{
651
+ /* If this descriptor says it doesn't chain, we're done. */
652
+ if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
653
+ return VIRTQUEUE_READ_DESC_DONE;
654
+ }
655
+
656
+ /* Check they're not leading us off end of descriptors. */
657
+ *next = desc[i].next;
658
+ /* Make sure compiler knows to grab that: we don't want it changing! */
659
+ smp_wmb();
660
+
661
+ if (*next >= max) {
662
+ fprintf(stderr, "Desc next is %u\n", *next);
663
+ return VIRTQUEUE_READ_DESC_ERROR;
664
+ }
665
+
666
+ return VIRTQUEUE_READ_DESC_MORE;
667
+}
668
+
669
+/*
670
+ * Fetch avail_idx from VQ memory only when we really need to know if
671
+ * guest has added some buffers.
672
+ */
673
+static bool vduse_queue_empty(VduseVirtq *vq)
674
+{
675
+ if (unlikely(!vq->vring.avail)) {
676
+ return true;
677
+ }
678
+
679
+ if (vq->shadow_avail_idx != vq->last_avail_idx) {
680
+ return false;
681
+ }
682
+
683
+ return vring_avail_idx(vq) == vq->last_avail_idx;
684
+}
685
+
686
+static bool vduse_queue_should_notify(VduseVirtq *vq)
687
+{
688
+ VduseDev *dev = vq->dev;
689
+ uint16_t old, new;
690
+ bool v;
691
+
692
+ /* We need to expose used array entries before checking used event. */
693
+ smp_mb();
694
+
695
+ /* Always notify when queue is empty (when feature acknowledge) */
696
+ if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
697
+ !vq->inuse && vduse_queue_empty(vq)) {
698
+ return true;
699
+ }
700
+
701
+ if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
702
+ return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
703
+ }
704
+
705
+ v = vq->signalled_used_valid;
706
+ vq->signalled_used_valid = true;
707
+ old = vq->signalled_used;
708
+ new = vq->signalled_used = vq->used_idx;
709
+ return !v || vring_need_event(vring_get_used_event(vq), new, old);
710
+}
711
+
712
+void vduse_queue_notify(VduseVirtq *vq)
713
+{
714
+ VduseDev *dev = vq->dev;
715
+
716
+ if (unlikely(!vq->vring.avail)) {
717
+ return;
718
+ }
719
+
720
+ if (!vduse_queue_should_notify(vq)) {
721
+ return;
722
+ }
723
+
724
+ if (vduse_inject_irq(dev, vq->index) < 0) {
725
+ fprintf(stderr, "Error inject irq for vq %d: %s\n",
726
+ vq->index, strerror(errno));
727
+ }
728
+}
729
+
730
+static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val)
731
+{
732
+ *((uint16_t *)&vq->vring.used->ring[vq->vring.num]) = htole16(val);
733
+}
734
+
735
+static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg,
736
+ struct iovec *iov, unsigned int max_num_sg,
737
+ bool is_write, uint64_t pa, size_t sz)
738
+{
739
+ unsigned num_sg = *p_num_sg;
740
+ VduseDev *dev = vq->dev;
741
+
742
+ assert(num_sg <= max_num_sg);
743
+
744
+ if (!sz) {
745
+ fprintf(stderr, "virtio: zero sized buffers are not allowed\n");
746
+ return false;
747
+ }
748
+
749
+ while (sz) {
750
+ uint64_t len = sz;
751
+
752
+ if (num_sg == max_num_sg) {
753
+ fprintf(stderr,
754
+ "virtio: too many descriptors in indirect table\n");
755
+ return false;
756
+ }
757
+
758
+ iov[num_sg].iov_base = iova_to_va(dev, &len, pa);
759
+ if (iov[num_sg].iov_base == NULL) {
760
+ fprintf(stderr, "virtio: invalid address for buffers\n");
761
+ return false;
762
+ }
763
+ iov[num_sg++].iov_len = len;
764
+ sz -= len;
765
+ pa += len;
766
+ }
767
+
768
+ *p_num_sg = num_sg;
769
+ return true;
770
+}
771
+
772
+static void *vduse_queue_alloc_element(size_t sz, unsigned out_num,
773
+ unsigned in_num)
774
+{
775
+ VduseVirtqElement *elem;
776
+ size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
777
+ size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
778
+ size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
779
+
780
+ assert(sz >= sizeof(VduseVirtqElement));
781
+ elem = malloc(out_sg_end);
782
+ if (!elem) {
783
+ return NULL;
784
+ }
785
+ elem->out_num = out_num;
786
+ elem->in_num = in_num;
787
+ elem->in_sg = (void *)elem + in_sg_ofs;
788
+ elem->out_sg = (void *)elem + out_sg_ofs;
789
+ return elem;
790
+}
791
+
792
+static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz)
793
+{
794
+ struct vring_desc *desc = vq->vring.desc;
795
+ VduseDev *dev = vq->dev;
796
+ uint64_t desc_addr, read_len;
797
+ unsigned int desc_len;
798
+ unsigned int max = vq->vring.num;
799
+ unsigned int i = idx;
800
+ VduseVirtqElement *elem;
801
+ struct iovec iov[VIRTQUEUE_MAX_SIZE];
802
+ struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
803
+ unsigned int out_num = 0, in_num = 0;
804
+ int rc;
805
+
806
+ if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
807
+ if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
808
+ fprintf(stderr, "Invalid size for indirect buffer table\n");
809
+ return NULL;
810
+ }
811
+
812
+ /* loop over the indirect descriptor table */
813
+ desc_addr = le64toh(desc[i].addr);
814
+ desc_len = le32toh(desc[i].len);
815
+ max = desc_len / sizeof(struct vring_desc);
816
+ read_len = desc_len;
817
+ desc = iova_to_va(dev, &read_len, desc_addr);
818
+ if (unlikely(desc && read_len != desc_len)) {
819
+ /* Failed to use zero copy */
820
+ desc = NULL;
821
+ if (!vduse_queue_read_indirect_desc(dev, desc_buf,
822
+ desc_addr,
823
+ desc_len)) {
824
+ desc = desc_buf;
825
+ }
826
+ }
827
+ if (!desc) {
828
+ fprintf(stderr, "Invalid indirect buffer table\n");
829
+ return NULL;
830
+ }
831
+ i = 0;
832
+ }
833
+
834
+ /* Collect all the descriptors */
835
+ do {
836
+ if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
837
+ if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num,
838
+ VIRTQUEUE_MAX_SIZE - out_num,
839
+ true, le64toh(desc[i].addr),
840
+ le32toh(desc[i].len))) {
841
+ return NULL;
842
+ }
843
+ } else {
844
+ if (in_num) {
845
+ fprintf(stderr, "Incorrect order for descriptors\n");
846
+ return NULL;
847
+ }
848
+ if (!vduse_queue_map_single_desc(vq, &out_num, iov,
849
+ VIRTQUEUE_MAX_SIZE, false,
850
+ le64toh(desc[i].addr),
851
+ le32toh(desc[i].len))) {
852
+ return NULL;
853
+ }
854
+ }
855
+
856
+ /* If we've got too many, that implies a descriptor loop. */
857
+ if ((in_num + out_num) > max) {
858
+ fprintf(stderr, "Looped descriptor\n");
859
+ return NULL;
860
+ }
861
+ rc = vduse_queue_read_next_desc(desc, i, max, &i);
862
+ } while (rc == VIRTQUEUE_READ_DESC_MORE);
863
+
864
+ if (rc == VIRTQUEUE_READ_DESC_ERROR) {
865
+ fprintf(stderr, "read descriptor error\n");
866
+ return NULL;
867
+ }
868
+
869
+ /* Now copy what we have collected and mapped */
870
+ elem = vduse_queue_alloc_element(sz, out_num, in_num);
871
+ if (!elem) {
872
+ fprintf(stderr, "read descriptor error\n");
873
+ return NULL;
874
+ }
875
+ elem->index = idx;
876
+ for (i = 0; i < out_num; i++) {
877
+ elem->out_sg[i] = iov[i];
878
+ }
879
+ for (i = 0; i < in_num; i++) {
880
+ elem->in_sg[i] = iov[out_num + i];
881
+ }
882
+
883
+ return elem;
884
+}
885
+
886
+void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
887
+{
888
+ unsigned int head;
889
+ VduseVirtqElement *elem;
890
+ VduseDev *dev = vq->dev;
891
+
892
+ if (unlikely(!vq->vring.avail)) {
893
+ return NULL;
894
+ }
895
+
896
+ if (vduse_queue_empty(vq)) {
897
+ return NULL;
898
+ }
899
+ /* Needed after virtio_queue_empty() */
900
+ smp_rmb();
901
+
902
+ if (vq->inuse >= vq->vring.num) {
903
+ fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse);
904
+ return NULL;
905
+ }
906
+
907
+ if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) {
908
+ return NULL;
909
+ }
910
+
911
+ if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
912
+ vring_set_avail_event(vq, vq->last_avail_idx);
913
+ }
914
+
915
+ elem = vduse_queue_map_desc(vq, head, sz);
916
+
917
+ if (!elem) {
918
+ return NULL;
919
+ }
920
+
921
+ vq->inuse++;
922
+
923
+ return elem;
924
+}
925
+
926
+static inline void vring_used_write(VduseVirtq *vq,
927
+ struct vring_used_elem *uelem, int i)
928
+{
929
+ struct vring_used *used = vq->vring.used;
930
+
931
+ used->ring[i] = *uelem;
932
+}
933
+
934
+static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem,
935
+ unsigned int len, unsigned int idx)
936
+{
937
+ struct vring_used_elem uelem;
938
+
939
+ if (unlikely(!vq->vring.used)) {
940
+ return;
941
+ }
942
+
943
+ idx = (idx + vq->used_idx) % vq->vring.num;
944
+
945
+ uelem.id = htole32(elem->index);
946
+ uelem.len = htole32(len);
947
+ vring_used_write(vq, &uelem, idx);
948
+}
949
+
950
+static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val)
951
+{
952
+ vq->vring.used->idx = htole16(val);
953
+ vq->used_idx = val;
954
+}
955
+
956
+static void vduse_queue_flush(VduseVirtq *vq, unsigned int count)
957
+{
958
+ uint16_t old, new;
959
+
960
+ if (unlikely(!vq->vring.used)) {
961
+ return;
962
+ }
963
+
964
+ /* Make sure buffer is written before we update index. */
965
+ smp_wmb();
966
+
967
+ old = vq->used_idx;
968
+ new = old + count;
969
+ vring_used_idx_set(vq, new);
970
+ vq->inuse -= count;
971
+ if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
972
+ vq->signalled_used_valid = false;
973
+ }
974
+}
975
+
976
+void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
977
+ unsigned int len)
978
+{
979
+ vduse_queue_fill(vq, elem, len, 0);
980
+ vduse_queue_flush(vq, 1);
981
+}
982
+
983
+static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
984
+ uint64_t avail_addr, uint64_t used_addr)
985
+{
986
+ struct VduseDev *dev = vq->dev;
987
+ uint64_t len;
988
+
989
+ len = sizeof(struct vring_desc);
990
+ vq->vring.desc = iova_to_va(dev, &len, desc_addr);
991
+ if (len != sizeof(struct vring_desc)) {
992
+ return -EINVAL;
993
+ }
994
+
995
+ len = sizeof(struct vring_avail);
996
+ vq->vring.avail = iova_to_va(dev, &len, avail_addr);
997
+ if (len != sizeof(struct vring_avail)) {
998
+ return -EINVAL;
999
+ }
1000
+
1001
+ len = sizeof(struct vring_used);
1002
+ vq->vring.used = iova_to_va(dev, &len, used_addr);
1003
+ if (len != sizeof(struct vring_used)) {
1004
+ return -EINVAL;
1005
+ }
1006
+
1007
+ if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) {
1008
+ fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index);
1009
+ return -EINVAL;
1010
+ }
1011
+
1012
+ return 0;
1013
+}
1014
+
1015
+static void vduse_queue_enable(VduseVirtq *vq)
1016
+{
1017
+ struct VduseDev *dev = vq->dev;
1018
+ struct vduse_vq_info vq_info;
1019
+ struct vduse_vq_eventfd vq_eventfd;
1020
+ int fd;
1021
+
1022
+ vq_info.index = vq->index;
1023
+ if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) {
1024
+ fprintf(stderr, "Failed to get vq[%d] info: %s\n",
1025
+ vq->index, strerror(errno));
1026
+ return;
1027
+ }
1028
+
1029
+ if (!vq_info.ready) {
1030
+ return;
1031
+ }
1032
+
1033
+ vq->vring.num = vq_info.num;
1034
+ vq->vring.desc_addr = vq_info.desc_addr;
1035
+ vq->vring.avail_addr = vq_info.driver_addr;
1036
+ vq->vring.used_addr = vq_info.device_addr;
1037
+
1038
+ if (vduse_queue_update_vring(vq, vq_info.desc_addr,
1039
+ vq_info.driver_addr, vq_info.device_addr)) {
1040
+ fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index);
1041
+ return;
1042
+ }
1043
+
1044
+ fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1045
+ if (fd < 0) {
1046
+ fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index);
1047
+ return;
1048
+ }
1049
+
1050
+ vq_eventfd.index = vq->index;
1051
+ vq_eventfd.fd = fd;
1052
+ if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) {
1053
+ fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index);
1054
+ close(fd);
1055
+ return;
1056
+ }
1057
+
1058
+ vq->fd = fd;
1059
+ vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index;
1060
+ vq->inuse = 0;
1061
+ vq->used_idx = 0;
1062
+ vq->signalled_used_valid = false;
1063
+ vq->ready = true;
1064
+
1065
+ dev->ops->enable_queue(dev, vq);
1066
+}
1067
+
1068
+static void vduse_queue_disable(VduseVirtq *vq)
1069
+{
1070
+ struct VduseDev *dev = vq->dev;
1071
+ struct vduse_vq_eventfd eventfd;
1072
+
1073
+ if (!vq->ready) {
1074
+ return;
1075
+ }
1076
+
1077
+ dev->ops->disable_queue(dev, vq);
1078
+
1079
+ eventfd.index = vq->index;
1080
+ eventfd.fd = VDUSE_EVENTFD_DEASSIGN;
1081
+ ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd);
1082
+ close(vq->fd);
1083
+
1084
+ assert(vq->inuse == 0);
1085
+
1086
+ vq->vring.num = 0;
1087
+ vq->vring.desc_addr = 0;
1088
+ vq->vring.avail_addr = 0;
1089
+ vq->vring.used_addr = 0;
1090
+ vq->vring.desc = 0;
1091
+ vq->vring.avail = 0;
1092
+ vq->vring.used = 0;
1093
+ vq->ready = false;
1094
+ vq->fd = -1;
1095
+}
1096
+
1097
+static void vduse_dev_start_dataplane(VduseDev *dev)
1098
+{
1099
+ int i;
1100
+
1101
+ if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
1102
+ fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
1103
+ return;
1104
+ }
1105
+ assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1));
1106
+
1107
+ for (i = 0; i < dev->num_queues; i++) {
1108
+ vduse_queue_enable(&dev->vqs[i]);
1109
+ }
1110
+}
1111
+
1112
+static void vduse_dev_stop_dataplane(VduseDev *dev)
1113
+{
1114
+ int i;
1115
+
1116
+ for (i = 0; i < dev->num_queues; i++) {
1117
+ vduse_queue_disable(&dev->vqs[i]);
1118
+ }
1119
+ dev->features = 0;
1120
+ vduse_iova_remove_region(dev, 0, ULONG_MAX);
1121
+}
1122
+
1123
+int vduse_dev_handler(VduseDev *dev)
1124
+{
1125
+ struct vduse_dev_request req;
1126
+ struct vduse_dev_response resp = { 0 };
1127
+ VduseVirtq *vq;
1128
+ int i, ret;
1129
+
1130
+ ret = read(dev->fd, &req, sizeof(req));
1131
+ if (ret != sizeof(req)) {
1132
+ fprintf(stderr, "Read request error [%d]: %s\n",
1133
+ ret, strerror(errno));
1134
+ return -errno;
1135
+ }
1136
+ resp.request_id = req.request_id;
1137
+
1138
+ switch (req.type) {
1139
+ case VDUSE_GET_VQ_STATE:
1140
+ vq = &dev->vqs[req.vq_state.index];
1141
+ resp.vq_state.split.avail_index = vq->last_avail_idx;
1142
+ resp.result = VDUSE_REQ_RESULT_OK;
1143
+ break;
1144
+ case VDUSE_SET_STATUS:
1145
+ if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) {
1146
+ vduse_dev_start_dataplane(dev);
1147
+ } else if (req.s.status == 0) {
1148
+ vduse_dev_stop_dataplane(dev);
1149
+ }
1150
+ resp.result = VDUSE_REQ_RESULT_OK;
1151
+ break;
1152
+ case VDUSE_UPDATE_IOTLB:
1153
+ /* The iova will be updated by iova_to_va() later, so just remove it */
1154
+ vduse_iova_remove_region(dev, req.iova.start, req.iova.last);
1155
+ for (i = 0; i < dev->num_queues; i++) {
1156
+ VduseVirtq *vq = &dev->vqs[i];
1157
+ if (vq->ready) {
1158
+ if (vduse_queue_update_vring(vq, vq->vring.desc_addr,
1159
+ vq->vring.avail_addr,
1160
+ vq->vring.used_addr)) {
1161
+ fprintf(stderr, "Failed to update vring for vq[%d]\n",
1162
+ vq->index);
1163
+ }
1164
+ }
1165
+ }
1166
+ resp.result = VDUSE_REQ_RESULT_OK;
1167
+ break;
1168
+ default:
1169
+ resp.result = VDUSE_REQ_RESULT_FAILED;
1170
+ break;
1171
+ }
1172
+
1173
+ ret = write(dev->fd, &resp, sizeof(resp));
1174
+ if (ret != sizeof(resp)) {
1175
+ fprintf(stderr, "Write request %d error [%d]: %s\n",
1176
+ req.type, ret, strerror(errno));
1177
+ return -errno;
1178
+ }
1179
+ return 0;
1180
+}
1181
+
1182
+int vduse_dev_update_config(VduseDev *dev, uint32_t size,
1183
+ uint32_t offset, char *buffer)
1184
+{
1185
+ int ret;
1186
+ struct vduse_config_data *data;
1187
+
1188
+ data = malloc(offsetof(struct vduse_config_data, buffer) + size);
1189
+ if (!data) {
1190
+ return -ENOMEM;
1191
+ }
1192
+
1193
+ data->offset = offset;
1194
+ data->length = size;
1195
+ memcpy(data->buffer, buffer, size);
1196
+
1197
+ ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data);
1198
+ free(data);
1199
+
1200
+ if (ret) {
1201
+ return -errno;
1202
+ }
1203
+
1204
+ if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) {
1205
+ return -errno;
1206
+ }
1207
+
1208
+ return 0;
1209
+}
1210
+
1211
+int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
1212
+{
1213
+ VduseVirtq *vq = &dev->vqs[index];
1214
+ struct vduse_vq_config vq_config = { 0 };
1215
+
1216
+ if (max_size > VIRTQUEUE_MAX_SIZE) {
1217
+ return -EINVAL;
1218
+ }
1219
+
1220
+ vq_config.index = vq->index;
1221
+ vq_config.max_size = max_size;
1222
+
1223
+ if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) {
1224
+ return -errno;
1225
+ }
1226
+
1227
+ return 0;
1228
+}
1229
+
1230
+static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues)
1231
+{
1232
+ VduseVirtq *vqs;
1233
+ int i;
1234
+
1235
+ vqs = calloc(sizeof(VduseVirtq), num_queues);
1236
+ if (!vqs) {
1237
+ return -ENOMEM;
1238
+ }
1239
+
1240
+ for (i = 0; i < num_queues; i++) {
1241
+ vqs[i].index = i;
1242
+ vqs[i].dev = dev;
1243
+ vqs[i].fd = -1;
1244
+ }
1245
+ dev->vqs = vqs;
1246
+
1247
+ return 0;
1248
+}
1249
+
1250
+static int vduse_dev_init(VduseDev *dev, const char *name,
1251
+ uint16_t num_queues, const VduseOps *ops,
1252
+ void *priv)
1253
+{
1254
+ char *dev_path, *dev_name;
1255
+ int ret, fd;
1256
+
1257
+ dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1);
1258
+ if (!dev_path) {
1259
+ return -ENOMEM;
1260
+ }
1261
+ sprintf(dev_path, "/dev/vduse/%s", name);
1262
+
1263
+ fd = open(dev_path, O_RDWR);
1264
+ free(dev_path);
1265
+ if (fd < 0) {
1266
+ fprintf(stderr, "Failed to open vduse dev %s: %s\n",
1267
+ name, strerror(errno));
1268
+ return -errno;
1269
+ }
1270
+
1271
+ dev_name = strdup(name);
1272
+ if (!dev_name) {
1273
+ close(fd);
1274
+ return -ENOMEM;
1275
+ }
1276
+
1277
+ ret = vduse_dev_init_vqs(dev, num_queues);
1278
+ if (ret) {
1279
+ free(dev_name);
1280
+ close(fd);
1281
+ return ret;
1282
+ }
1283
+
1284
+ dev->name = dev_name;
1285
+ dev->num_queues = num_queues;
1286
+ dev->fd = fd;
1287
+ dev->ops = ops;
1288
+ dev->priv = priv;
1289
+
1290
+ return 0;
1291
+}
1292
+
1293
+static inline bool vduse_name_is_valid(const char *name)
1294
+{
1295
+ return strlen(name) >= VDUSE_NAME_MAX || strstr(name, "..");
1296
+}
1297
+
1298
+VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
1299
+ const VduseOps *ops, void *priv)
1300
+{
1301
+ VduseDev *dev;
1302
+ int ret;
1303
+
1304
+ if (!ops || !ops->enable_queue || !ops->disable_queue) {
1305
+ fprintf(stderr, "Invalid parameter for vduse\n");
1306
+ return NULL;
1307
+ }
1308
+
1309
+ dev = calloc(sizeof(VduseDev), 1);
1310
+ if (!dev) {
1311
+ fprintf(stderr, "Failed to allocate vduse device\n");
1312
+ return NULL;
1313
+ }
1314
+
1315
+ ret = vduse_dev_init_vqs(dev, num_queues);
1316
+ if (ret) {
1317
+ fprintf(stderr, "Failed to init vqs\n");
1318
+ free(dev);
1319
+ return NULL;
1320
+ }
1321
+
1322
+ dev->num_queues = num_queues;
1323
+ dev->fd = fd;
1324
+ dev->ops = ops;
1325
+ dev->priv = priv;
1326
+
1327
+ return dev;
1328
+}
1329
+
1330
+VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
1331
+ const VduseOps *ops, void *priv)
1332
+{
1333
+ VduseDev *dev;
1334
+ int ret;
1335
+
1336
+ if (!name || vduse_name_is_valid(name) || !ops ||
1337
+ !ops->enable_queue || !ops->disable_queue) {
1338
+ fprintf(stderr, "Invalid parameter for vduse\n");
1339
+ return NULL;
1340
+ }
1341
+
1342
+ dev = calloc(sizeof(VduseDev), 1);
1343
+ if (!dev) {
1344
+ fprintf(stderr, "Failed to allocate vduse device\n");
1345
+ return NULL;
1346
+ }
1347
+
1348
+ ret = vduse_dev_init(dev, name, num_queues, ops, priv);
1349
+ if (ret < 0) {
1350
+ fprintf(stderr, "Failed to init vduse device %s: %s\n",
1351
+ name, strerror(ret));
1352
+ free(dev);
1353
+ return NULL;
1354
+ }
1355
+
1356
+ return dev;
1357
+}
1358
+
1359
+VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
1360
+ uint32_t vendor_id, uint64_t features,
1361
+ uint16_t num_queues, uint32_t config_size,
1362
+ char *config, const VduseOps *ops, void *priv)
1363
+{
1364
+ VduseDev *dev;
1365
+ int ret, ctrl_fd;
1366
+ uint64_t version;
1367
+ struct vduse_dev_config *dev_config;
1368
+ size_t size = offsetof(struct vduse_dev_config, config);
1369
+
1370
+ if (!name || vduse_name_is_valid(name) ||
1371
+ !has_feature(features, VIRTIO_F_VERSION_1) || !config ||
1372
+ !config_size || !ops || !ops->enable_queue || !ops->disable_queue) {
1373
+ fprintf(stderr, "Invalid parameter for vduse\n");
1374
+ return NULL;
1375
+ }
1376
+
1377
+ dev = calloc(sizeof(VduseDev), 1);
1378
+ if (!dev) {
1379
+ fprintf(stderr, "Failed to allocate vduse device\n");
1380
+ return NULL;
1381
+ }
1382
+
1383
+ ctrl_fd = open("/dev/vduse/control", O_RDWR);
1384
+ if (ctrl_fd < 0) {
1385
+ fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
1386
+ strerror(errno));
1387
+ goto err_ctrl;
1388
+ }
1389
+
1390
+ version = VDUSE_API_VERSION;
1391
+ if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) {
1392
+ fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n",
1393
+ version, strerror(errno));
1394
+ goto err_dev;
1395
+ }
1396
+
1397
+ dev_config = calloc(size + config_size, 1);
1398
+ if (!dev_config) {
1399
+ fprintf(stderr, "Failed to allocate config space\n");
1400
+ goto err_dev;
1401
+ }
1402
+
1403
+ strcpy(dev_config->name, name);
1404
+ dev_config->device_id = device_id;
1405
+ dev_config->vendor_id = vendor_id;
1406
+ dev_config->features = features;
1407
+ dev_config->vq_num = num_queues;
1408
+ dev_config->vq_align = VDUSE_VQ_ALIGN;
1409
+ dev_config->config_size = config_size;
1410
+ memcpy(dev_config->config, config, config_size);
1411
+
1412
+ ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
1413
+ free(dev_config);
1414
+ if (ret < 0) {
1415
+ fprintf(stderr, "Failed to create vduse device %s: %s\n",
1416
+ name, strerror(errno));
1417
+ goto err_dev;
1418
+ }
1419
+ dev->ctrl_fd = ctrl_fd;
1420
+
1421
+ ret = vduse_dev_init(dev, name, num_queues, ops, priv);
1422
+ if (ret < 0) {
1423
+ fprintf(stderr, "Failed to init vduse device %s: %s\n",
1424
+ name, strerror(ret));
1425
+ goto err;
1426
+ }
1427
+
1428
+ return dev;
1429
+err:
1430
+ ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name);
1431
+err_dev:
1432
+ close(ctrl_fd);
1433
+err_ctrl:
1434
+ free(dev);
1435
+
1436
+ return NULL;
1437
+}
1438
+
1439
+int vduse_dev_destroy(VduseDev *dev)
1440
+{
1441
+ int ret = 0;
1442
+
1443
+ free(dev->vqs);
1444
+ if (dev->fd >= 0) {
1445
+ close(dev->fd);
1446
+ dev->fd = -1;
1447
+ }
1448
+ if (dev->ctrl_fd >= 0) {
1449
+ if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) {
1450
+ ret = -errno;
1451
+ }
1452
+ close(dev->ctrl_fd);
1453
+ dev->ctrl_fd = -1;
1454
+ }
1455
+ free(dev->name);
1456
+ free(dev);
1457
+
1458
+ return ret;
1459
+}
1460
diff --git a/MAINTAINERS b/MAINTAINERS
1461
index XXXXXXX..XXXXXXX 100644
1462
--- a/MAINTAINERS
1463
+++ b/MAINTAINERS
1464
@@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org
1465
S: Supported
1466
F: block/export/fuse.c
1467
1468
+VDUSE library
1469
+M: Xie Yongji <xieyongji@bytedance.com>
1470
+S: Maintained
1471
+F: subprojects/libvduse/
1472
+
1473
Replication
1474
M: Wen Congyang <wencongyang2@huawei.com>
1475
M: Xie Changlong <xiechanglong.d@gmail.com>
1476
diff --git a/meson.build b/meson.build
1477
index XXXXXXX..XXXXXXX 100644
1478
--- a/meson.build
1479
+++ b/meson.build
1480
@@ -XXX,XX +XXX,XX @@ if get_option('fuse_lseek').allowed()
1481
endif
1482
endif
1483
1484
+have_libvduse = (targetos == 'linux')
1485
+if get_option('libvduse').enabled()
1486
+ if targetos != 'linux'
1487
+ error('libvduse requires linux')
1488
+ endif
1489
+elif get_option('libvduse').disabled()
1490
+ have_libvduse = false
1491
+endif
1492
+
1493
# libbpf
1494
libbpf = dependency('libbpf', required: get_option('bpf'), method: 'pkg-config')
1495
if libbpf.found() and not cc.links('''
1496
@@ -XXX,XX +XXX,XX @@ if targetos == 'linux' and have_vhost_user
1497
vhost_user = libvhost_user.get_variable('vhost_user_dep')
1498
endif
1499
1500
+libvduse = not_found
1501
+if have_libvduse
1502
+ libvduse_proj = subproject('libvduse')
1503
+ libvduse = libvduse_proj.get_variable('libvduse_dep')
1504
+endif
1505
+
1506
# NOTE: the trace/ subdirectory needs the qapi_trace_events variable
1507
# that is filled in by qapi/.
1508
subdir('qapi')
1509
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
1510
index XXXXXXX..XXXXXXX 100644
1511
--- a/scripts/meson-buildoptions.sh
1512
+++ b/scripts/meson-buildoptions.sh
1513
@@ -XXX,XX +XXX,XX @@ meson_options_help() {
1514
printf "%s\n" ' libssh ssh block device support'
1515
printf "%s\n" ' libudev Use libudev to enumerate host devices'
1516
printf "%s\n" ' libusb libusb support for USB passthrough'
1517
+ printf "%s\n" ' libvduse build VDUSE Library'
1518
printf "%s\n" ' linux-aio Linux AIO support'
1519
printf "%s\n" ' linux-io-uring Linux io_uring support'
1520
printf "%s\n" ' live-block-migration'
1521
@@ -XXX,XX +XXX,XX @@ _meson_option_parse() {
1522
--disable-libudev) printf "%s" -Dlibudev=disabled ;;
1523
--enable-libusb) printf "%s" -Dlibusb=enabled ;;
1524
--disable-libusb) printf "%s" -Dlibusb=disabled ;;
1525
+ --enable-libvduse) printf "%s" -Dlibvduse=enabled ;;
1526
+ --disable-libvduse) printf "%s" -Dlibvduse=disabled ;;
1527
--enable-linux-aio) printf "%s" -Dlinux_aio=enabled ;;
1528
--disable-linux-aio) printf "%s" -Dlinux_aio=disabled ;;
1529
--enable-linux-io-uring) printf "%s" -Dlinux_io_uring=enabled ;;
1530
diff --git a/subprojects/libvduse/linux-headers/linux b/subprojects/libvduse/linux-headers/linux
1531
new file mode 120000
1532
index XXXXXXX..XXXXXXX
1533
--- /dev/null
1534
+++ b/subprojects/libvduse/linux-headers/linux
1535
@@ -0,0 +1 @@
1536
+../../../linux-headers/linux/
1537
\ No newline at end of file
1538
diff --git a/subprojects/libvduse/meson.build b/subprojects/libvduse/meson.build
1539
new file mode 100644
1540
index XXXXXXX..XXXXXXX
1541
--- /dev/null
1542
+++ b/subprojects/libvduse/meson.build
1543
@@ -XXX,XX +XXX,XX @@
1544
+project('libvduse', 'c',
1545
+ license: 'GPL-2.0-or-later',
1546
+ default_options: ['c_std=gnu99'])
1547
+
1548
+libvduse = static_library('vduse',
1549
+ files('libvduse.c'),
1550
+ c_args: '-D_GNU_SOURCE')
1551
+
1552
+libvduse_dep = declare_dependency(link_with: libvduse,
1553
+ include_directories: include_directories('.'))
1554
diff --git a/subprojects/libvduse/standard-headers/linux b/subprojects/libvduse/standard-headers/linux
1555
new file mode 120000
1556
index XXXXXXX..XXXXXXX
1557
--- /dev/null
1558
+++ b/subprojects/libvduse/standard-headers/linux
1559
@@ -0,0 +1 @@
1560
+../../../include/standard-headers/linux/
1561
\ No newline at end of file
47
--
1562
--
48
2.19.1
1563
2.35.3
49
1564
50
1565
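Before the library gains its first user in the next patch, it may help to see the intended calling pattern in one place. The sketch below is purely illustrative and is not part of this series: it only uses functions introduced by this patch (or exercised by the vduse-blk export later in the series), the device name, device ID and config blob are placeholders, and vduse_get_virtio_features() is assumed to include VIRTIO_F_VERSION_1, which vduse_dev_create() requires:

    /* hypothetical libvduse consumer, for illustration only */
    #include <poll.h>
    #include <stdint.h>
    #include "libvduse.h"

    static void my_enable_queue(VduseDev *dev, VduseVirtq *vq)
    {
        /* A real backend would add vduse_queue_get_fd(vq) to its event loop
         * here and pop/push requests from the kick handler. */
        (void)dev;
        (void)vq;
    }

    static void my_disable_queue(VduseDev *dev, VduseVirtq *vq)
    {
        /* ...and remove the kick fd from the event loop here. */
        (void)dev;
        (void)vq;
    }

    static const VduseOps my_ops = {
        .enable_queue  = my_enable_queue,
        .disable_queue = my_disable_queue,
    };

    int main(void)
    {
        char config[8] = { 0 };   /* placeholder device config space */
        uint64_t features = vduse_get_virtio_features();
        VduseDev *dev;

        dev = vduse_dev_create("my-vduse-dev", 2 /* placeholder device ID */,
                               0 /* vendor ID */, features,
                               1 /* num_queues */, sizeof(config), config,
                               &my_ops, NULL);
        if (!dev) {
            return 1;
        }
        vduse_dev_setup_queue(dev, 0, 256);

        /* Serve VDUSE messages: status changes, IOTLB updates, vq state. */
        for (;;) {
            struct pollfd pfd = {
                .fd = vduse_dev_get_fd(dev),
                .events = POLLIN,
            };
            if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
                vduse_dev_handler(dev);
            }
        }

        /* not reached in this sketch */
        vduse_dev_destroy(dev);
        return 0;
    }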
New patch
1
1
From: Xie Yongji <xieyongji@bytedance.com>
2
3
This implements a VDUSE block backend based on
4
the libvduse library. We can use it to export the BDSs
5
for both VM and container (host) usage.
6
7
The new command-line syntax is:
8
9
$ qemu-storage-daemon \
10
--blockdev file,node-name=drive0,filename=test.img \
11
--export vduse-blk,node-name=drive0,id=vduse-export0,writable=on
12
13
After the qemu-storage-daemon has started, we need to use
14
the "vdpa" command to attach the device to vDPA bus:
15
16
$ vdpa dev add name vduse-export0 mgmtdev vduse
17
18
The device must also be removed via the "vdpa" command
19
before we stop the qemu-storage-daemon.
20
21
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
22
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
23
Message-Id: <20220523084611.91-7-xieyongji@bytedance.com>
24
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
25
---
26
qapi/block-export.json | 28 ++-
27
meson_options.txt | 2 +
28
block/export/vduse-blk.h | 20 +++
29
block/export/export.c | 6 +
30
block/export/vduse-blk.c | 329 ++++++++++++++++++++++++++++++++++
31
MAINTAINERS | 4 +-
32
block/export/meson.build | 5 +
33
meson.build | 13 ++
34
scripts/meson-buildoptions.sh | 4 +
35
9 files changed, 407 insertions(+), 4 deletions(-)
36
create mode 100644 block/export/vduse-blk.h
37
create mode 100644 block/export/vduse-blk.c
38
39
diff --git a/qapi/block-export.json b/qapi/block-export.json
40
index XXXXXXX..XXXXXXX 100644
41
--- a/qapi/block-export.json
42
+++ b/qapi/block-export.json
43
@@ -XXX,XX +XXX,XX @@
44
'*allow-other': 'FuseExportAllowOther' },
45
'if': 'CONFIG_FUSE' }
46
47
+##
48
+# @BlockExportOptionsVduseBlk:
49
+#
50
+# A vduse-blk block export.
51
+#
52
+# @num-queues: the number of virtqueues. Defaults to 1.
53
+# @queue-size: the size of each virtqueue. Defaults to 256.
54
+# @logical-block-size: Logical block size in bytes. Range [512, PAGE_SIZE]
55
+#                      and must be a power of 2. Defaults to 512 bytes.
56
+#
57
+# Since: 7.1
58
+##
59
+{ 'struct': 'BlockExportOptionsVduseBlk',
60
+ 'data': { '*num-queues': 'uint16',
61
+ '*queue-size': 'uint16',
62
+ '*logical-block-size': 'size'} }
63
+
64
##
65
# @NbdServerAddOptions:
66
#
67
@@ -XXX,XX +XXX,XX @@
68
# @nbd: NBD export
69
# @vhost-user-blk: vhost-user-blk export (since 5.2)
70
# @fuse: FUSE export (since: 6.0)
71
+# @vduse-blk: vduse-blk export (since 7.1)
72
#
73
# Since: 4.2
74
##
75
@@ -XXX,XX +XXX,XX @@
76
'data': [ 'nbd',
77
{ 'name': 'vhost-user-blk',
78
'if': 'CONFIG_VHOST_USER_BLK_SERVER' },
79
- { 'name': 'fuse', 'if': 'CONFIG_FUSE' } ] }
80
+ { 'name': 'fuse', 'if': 'CONFIG_FUSE' },
81
+ { 'name': 'vduse-blk', 'if': 'CONFIG_VDUSE_BLK_EXPORT' } ] }
82
83
##
84
# @BlockExportOptions:
85
@@ -XXX,XX +XXX,XX @@
86
# Describes a block export, i.e. how single node should be exported on an
87
# external interface.
88
#
89
-# @id: A unique identifier for the block export (across all export types)
90
+# @id: A unique identifier for the block export (across the host for vduse-blk
91
+# export type or across all export types for other types)
92
#
93
# @node-name: The node name of the block node to be exported (since: 5.2)
94
#
95
@@ -XXX,XX +XXX,XX @@
96
'vhost-user-blk': { 'type': 'BlockExportOptionsVhostUserBlk',
97
'if': 'CONFIG_VHOST_USER_BLK_SERVER' },
98
'fuse': { 'type': 'BlockExportOptionsFuse',
99
- 'if': 'CONFIG_FUSE' }
100
+ 'if': 'CONFIG_FUSE' },
101
+ 'vduse-blk': { 'type': 'BlockExportOptionsVduseBlk',
102
+ 'if': 'CONFIG_VDUSE_BLK_EXPORT' }
103
} }
104
105
##
106
diff --git a/meson_options.txt b/meson_options.txt
107
index XXXXXXX..XXXXXXX 100644
108
--- a/meson_options.txt
109
+++ b/meson_options.txt
110
@@ -XXX,XX +XXX,XX @@ option('virtiofsd', type: 'feature', value: 'auto',
111
description: 'build virtiofs daemon (virtiofsd)')
112
option('libvduse', type: 'feature', value: 'auto',
113
description: 'build VDUSE Library')
114
+option('vduse_blk_export', type: 'feature', value: 'auto',
115
+ description: 'VDUSE block export support')
116
117
option('capstone', type: 'feature', value: 'auto',
118
description: 'Whether and how to find the capstone library')
119
diff --git a/block/export/vduse-blk.h b/block/export/vduse-blk.h
120
new file mode 100644
121
index XXXXXXX..XXXXXXX
122
--- /dev/null
123
+++ b/block/export/vduse-blk.h
124
@@ -XXX,XX +XXX,XX @@
125
+/*
126
+ * Export QEMU block device via VDUSE
127
+ *
128
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
129
+ *
130
+ * Author:
131
+ * Xie Yongji <xieyongji@bytedance.com>
132
+ *
133
+ * This work is licensed under the terms of the GNU GPL, version 2 or
134
+ * later. See the COPYING file in the top-level directory.
135
+ */
136
+
137
+#ifndef VDUSE_BLK_H
138
+#define VDUSE_BLK_H
139
+
140
+#include "block/export.h"
141
+
142
+extern const BlockExportDriver blk_exp_vduse_blk;
143
+
144
+#endif /* VDUSE_BLK_H */
145
diff --git a/block/export/export.c b/block/export/export.c
146
index XXXXXXX..XXXXXXX 100644
147
--- a/block/export/export.c
148
+++ b/block/export/export.c
149
@@ -XXX,XX +XXX,XX @@
150
#ifdef CONFIG_VHOST_USER_BLK_SERVER
151
#include "vhost-user-blk-server.h"
152
#endif
153
+#ifdef CONFIG_VDUSE_BLK_EXPORT
154
+#include "vduse-blk.h"
155
+#endif
156
157
static const BlockExportDriver *blk_exp_drivers[] = {
158
&blk_exp_nbd,
159
@@ -XXX,XX +XXX,XX @@ static const BlockExportDriver *blk_exp_drivers[] = {
160
#ifdef CONFIG_FUSE
161
&blk_exp_fuse,
162
#endif
163
+#ifdef CONFIG_VDUSE_BLK_EXPORT
164
+ &blk_exp_vduse_blk,
165
+#endif
166
};
167
168
/* Only accessed from the main thread */
169
diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
170
new file mode 100644
171
index XXXXXXX..XXXXXXX
172
--- /dev/null
173
+++ b/block/export/vduse-blk.c
174
@@ -XXX,XX +XXX,XX @@
175
+/*
176
+ * Export QEMU block device via VDUSE
177
+ *
178
+ * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
179
+ *
180
+ * Author:
181
+ * Xie Yongji <xieyongji@bytedance.com>
182
+ *
183
+ * This work is licensed under the terms of the GNU GPL, version 2 or
184
+ * later. See the COPYING file in the top-level directory.
185
+ */
186
+
187
+#include <sys/eventfd.h>
188
+
189
+#include "qemu/osdep.h"
190
+#include "qapi/error.h"
191
+#include "block/export.h"
192
+#include "qemu/error-report.h"
193
+#include "util/block-helpers.h"
194
+#include "subprojects/libvduse/libvduse.h"
195
+#include "virtio-blk-handler.h"
196
+
197
+#include "standard-headers/linux/virtio_blk.h"
198
+
199
+#define VDUSE_DEFAULT_NUM_QUEUE 1
200
+#define VDUSE_DEFAULT_QUEUE_SIZE 256
201
+
202
+typedef struct VduseBlkExport {
203
+ BlockExport export;
204
+ VirtioBlkHandler handler;
205
+ VduseDev *dev;
206
+ uint16_t num_queues;
207
+ unsigned int inflight;
208
+} VduseBlkExport;
209
+
210
+typedef struct VduseBlkReq {
211
+ VduseVirtqElement elem;
212
+ VduseVirtq *vq;
213
+} VduseBlkReq;
214
+
215
+static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
216
+{
217
+ vblk_exp->inflight++;
218
+}
219
+
220
+static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp)
221
+{
222
+ if (--vblk_exp->inflight == 0) {
223
+ aio_wait_kick();
224
+ }
225
+}
226
+
227
+static void vduse_blk_req_complete(VduseBlkReq *req, size_t in_len)
228
+{
229
+ vduse_queue_push(req->vq, &req->elem, in_len);
230
+ vduse_queue_notify(req->vq);
231
+
232
+ free(req);
233
+}
234
+
235
+static void coroutine_fn vduse_blk_virtio_process_req(void *opaque)
236
+{
237
+ VduseBlkReq *req = opaque;
238
+ VduseVirtq *vq = req->vq;
239
+ VduseDev *dev = vduse_queue_get_dev(vq);
240
+ VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
241
+ VirtioBlkHandler *handler = &vblk_exp->handler;
242
+ VduseVirtqElement *elem = &req->elem;
243
+ struct iovec *in_iov = elem->in_sg;
244
+ struct iovec *out_iov = elem->out_sg;
245
+ unsigned in_num = elem->in_num;
246
+ unsigned out_num = elem->out_num;
247
+ int in_len;
248
+
249
+ in_len = virtio_blk_process_req(handler, in_iov,
250
+ out_iov, in_num, out_num);
251
+ if (in_len < 0) {
252
+ free(req);
253
+ return;
254
+ }
255
+
256
+ vduse_blk_req_complete(req, in_len);
257
+ vduse_blk_inflight_dec(vblk_exp);
258
+}
259
+
260
+static void vduse_blk_vq_handler(VduseDev *dev, VduseVirtq *vq)
261
+{
262
+ VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
263
+
264
+ while (1) {
265
+ VduseBlkReq *req;
266
+
267
+ req = vduse_queue_pop(vq, sizeof(VduseBlkReq));
268
+ if (!req) {
269
+ break;
270
+ }
271
+ req->vq = vq;
272
+
273
+ Coroutine *co =
274
+ qemu_coroutine_create(vduse_blk_virtio_process_req, req);
275
+
276
+ vduse_blk_inflight_inc(vblk_exp);
277
+ qemu_coroutine_enter(co);
278
+ }
279
+}
280
+
281
+static void on_vduse_vq_kick(void *opaque)
282
+{
283
+ VduseVirtq *vq = opaque;
284
+ VduseDev *dev = vduse_queue_get_dev(vq);
285
+ int fd = vduse_queue_get_fd(vq);
286
+ eventfd_t kick_data;
287
+
288
+ if (eventfd_read(fd, &kick_data) == -1) {
289
+ error_report("failed to read data from eventfd");
290
+ return;
291
+ }
292
+
293
+ vduse_blk_vq_handler(dev, vq);
294
+}
295
+
296
+static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
297
+{
298
+ VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
299
+
300
+ aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
301
+ true, on_vduse_vq_kick, NULL, NULL, NULL, vq);
302
+}
303
+
304
+static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
305
+{
306
+ VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
307
+
308
+ aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
309
+ true, NULL, NULL, NULL, NULL, NULL);
310
+}
311
+
312
+static const VduseOps vduse_blk_ops = {
313
+ .enable_queue = vduse_blk_enable_queue,
314
+ .disable_queue = vduse_blk_disable_queue,
315
+};
316
+
317
+static void on_vduse_dev_kick(void *opaque)
318
+{
319
+ VduseDev *dev = opaque;
320
+
321
+ vduse_dev_handler(dev);
322
+}
323
+
324
+static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx)
325
+{
326
+ int i;
327
+
328
+ aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
329
+ true, on_vduse_dev_kick, NULL, NULL, NULL,
330
+ vblk_exp->dev);
331
+
332
+ for (i = 0; i < vblk_exp->num_queues; i++) {
333
+ VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
334
+ int fd = vduse_queue_get_fd(vq);
335
+
336
+ if (fd < 0) {
337
+ continue;
338
+ }
339
+ aio_set_fd_handler(vblk_exp->export.ctx, fd, true,
340
+ on_vduse_vq_kick, NULL, NULL, NULL, vq);
341
+ }
342
+}
343
+
344
+static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp)
345
+{
346
+ int i;
347
+
348
+ for (i = 0; i < vblk_exp->num_queues; i++) {
349
+ VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
350
+ int fd = vduse_queue_get_fd(vq);
351
+
352
+ if (fd < 0) {
353
+ continue;
354
+ }
355
+ aio_set_fd_handler(vblk_exp->export.ctx, fd,
356
+ true, NULL, NULL, NULL, NULL, NULL);
357
+ }
358
+ aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
359
+ true, NULL, NULL, NULL, NULL, NULL);
360
+
361
+ AIO_WAIT_WHILE(vblk_exp->export.ctx, vblk_exp->inflight > 0);
362
+}
363
+
364
+
365
+static void blk_aio_attached(AioContext *ctx, void *opaque)
366
+{
367
+ VduseBlkExport *vblk_exp = opaque;
368
+
369
+ vblk_exp->export.ctx = ctx;
370
+ vduse_blk_attach_ctx(vblk_exp, ctx);
371
+}
372
+
373
+static void blk_aio_detach(void *opaque)
374
+{
375
+ VduseBlkExport *vblk_exp = opaque;
376
+
377
+ vduse_blk_detach_ctx(vblk_exp);
378
+ vblk_exp->export.ctx = NULL;
379
+}
380
+
381
+static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
382
+ Error **errp)
383
+{
384
+ VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
385
+ BlockExportOptionsVduseBlk *vblk_opts = &opts->u.vduse_blk;
386
+ uint64_t logical_block_size = VIRTIO_BLK_SECTOR_SIZE;
387
+ uint16_t num_queues = VDUSE_DEFAULT_NUM_QUEUE;
388
+ uint16_t queue_size = VDUSE_DEFAULT_QUEUE_SIZE;
389
+ Error *local_err = NULL;
390
+ struct virtio_blk_config config = { 0 };
391
+ uint64_t features;
392
+ int i;
393
+
394
+ if (vblk_opts->has_num_queues) {
395
+ num_queues = vblk_opts->num_queues;
396
+ if (num_queues == 0) {
397
+ error_setg(errp, "num-queues must be greater than 0");
398
+ return -EINVAL;
399
+ }
400
+ }
401
+
402
+ if (vblk_opts->has_queue_size) {
403
+ queue_size = vblk_opts->queue_size;
404
+ if (queue_size <= 2 || !is_power_of_2(queue_size) ||
405
+ queue_size > VIRTQUEUE_MAX_SIZE) {
406
+ error_setg(errp, "queue-size is invalid");
407
+ return -EINVAL;
408
+ }
409
+ }
410
+
411
+ if (vblk_opts->has_logical_block_size) {
412
+ logical_block_size = vblk_opts->logical_block_size;
413
+ check_block_size(exp->id, "logical-block-size", logical_block_size,
414
+ &local_err);
415
+ if (local_err) {
416
+ error_propagate(errp, local_err);
417
+ return -EINVAL;
418
+ }
419
+ }
420
+ vblk_exp->num_queues = num_queues;
421
+ vblk_exp->handler.blk = exp->blk;
422
+ vblk_exp->handler.serial = exp->id;
423
+ vblk_exp->handler.logical_block_size = logical_block_size;
424
+ vblk_exp->handler.writable = opts->writable;
425
+
426
+ config.capacity =
427
+ cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
428
+ config.seg_max = cpu_to_le32(queue_size - 2);
429
+ config.min_io_size = cpu_to_le16(1);
430
+ config.opt_io_size = cpu_to_le32(1);
431
+ config.num_queues = cpu_to_le16(num_queues);
432
+ config.blk_size = cpu_to_le32(logical_block_size);
433
+ config.max_discard_sectors = cpu_to_le32(VIRTIO_BLK_MAX_DISCARD_SECTORS);
434
+ config.max_discard_seg = cpu_to_le32(1);
435
+ config.discard_sector_alignment =
436
+ cpu_to_le32(logical_block_size >> VIRTIO_BLK_SECTOR_BITS);
437
+ config.max_write_zeroes_sectors =
438
+ cpu_to_le32(VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS);
439
+ config.max_write_zeroes_seg = cpu_to_le32(1);
440
+
441
+ features = vduse_get_virtio_features() |
442
+ (1ULL << VIRTIO_BLK_F_SEG_MAX) |
443
+ (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
444
+ (1ULL << VIRTIO_BLK_F_BLK_SIZE) |
445
+ (1ULL << VIRTIO_BLK_F_FLUSH) |
446
+ (1ULL << VIRTIO_BLK_F_DISCARD) |
447
+ (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
448
+
449
+ if (num_queues > 1) {
450
+ features |= 1ULL << VIRTIO_BLK_F_MQ;
451
+ }
452
+ if (!opts->writable) {
453
+ features |= 1ULL << VIRTIO_BLK_F_RO;
454
+ }
455
+
456
+ vblk_exp->dev = vduse_dev_create(exp->id, VIRTIO_ID_BLOCK, 0,
457
+ features, num_queues,
458
+ sizeof(struct virtio_blk_config),
459
+ (char *)&config, &vduse_blk_ops,
460
+ vblk_exp);
461
+ if (!vblk_exp->dev) {
462
+ error_setg(errp, "failed to create vduse device");
463
+ return -ENOMEM;
464
+ }
465
+
466
+ for (i = 0; i < num_queues; i++) {
467
+ vduse_dev_setup_queue(vblk_exp->dev, i, queue_size);
468
+ }
469
+
470
+ aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), true,
471
+ on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev);
472
+
473
+ blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
474
+ vblk_exp);
475
+
476
+ return 0;
477
+}
478
+
479
+static void vduse_blk_exp_delete(BlockExport *exp)
480
+{
481
+ VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
482
+
483
+ blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
484
+ vblk_exp);
485
+ vduse_dev_destroy(vblk_exp->dev);
486
+}
487
+
488
+static void vduse_blk_exp_request_shutdown(BlockExport *exp)
489
+{
490
+ VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
491
+
492
+ aio_context_acquire(vblk_exp->export.ctx);
493
+ vduse_blk_detach_ctx(vblk_exp);
494
+    aio_context_release(vblk_exp->export.ctx);
495
+}
496
+
497
+const BlockExportDriver blk_exp_vduse_blk = {
498
+ .type = BLOCK_EXPORT_TYPE_VDUSE_BLK,
499
+ .instance_size = sizeof(VduseBlkExport),
500
+ .create = vduse_blk_exp_create,
501
+ .delete = vduse_blk_exp_delete,
502
+ .request_shutdown = vduse_blk_exp_request_shutdown,
503
+};
504
diff --git a/MAINTAINERS b/MAINTAINERS
505
index XXXXXXX..XXXXXXX 100644
506
--- a/MAINTAINERS
507
+++ b/MAINTAINERS
508
@@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org
509
S: Supported
510
F: block/export/fuse.c
511
512
-VDUSE library
513
+VDUSE library and block device exports
514
M: Xie Yongji <xieyongji@bytedance.com>
515
S: Maintained
516
F: subprojects/libvduse/
517
+F: block/export/vduse-blk.c
518
+F: block/export/vduse-blk.h
519
520
Replication
521
M: Wen Congyang <wencongyang2@huawei.com>
522
diff --git a/block/export/meson.build b/block/export/meson.build
523
index XXXXXXX..XXXXXXX 100644
524
--- a/block/export/meson.build
525
+++ b/block/export/meson.build
526
@@ -XXX,XX +XXX,XX @@ if have_vhost_user_blk_server
527
endif
528
529
blockdev_ss.add(when: fuse, if_true: files('fuse.c'))
530
+
531
+if have_vduse_blk_export
532
+ blockdev_ss.add(files('vduse-blk.c', 'virtio-blk-handler.c'))
533
+ blockdev_ss.add(libvduse)
534
+endif
535
diff --git a/meson.build b/meson.build
536
index XXXXXXX..XXXXXXX 100644
537
--- a/meson.build
538
+++ b/meson.build
539
@@ -XXX,XX +XXX,XX @@ elif get_option('libvduse').disabled()
540
have_libvduse = false
541
endif
542
543
+have_vduse_blk_export = (have_libvduse and targetos == 'linux')
544
+if get_option('vduse_blk_export').enabled()
545
+ if targetos != 'linux'
546
+ error('vduse_blk_export requires linux')
547
+ elif not have_libvduse
548
+ error('vduse_blk_export requires libvduse support')
549
+ endif
550
+elif get_option('vduse_blk_export').disabled()
551
+ have_vduse_blk_export = false
552
+endif
553
+
554
# libbpf
555
libbpf = dependency('libbpf', required: get_option('bpf'), method: 'pkg-config')
556
if libbpf.found() and not cc.links('''
557
@@ -XXX,XX +XXX,XX @@ config_host_data.set('CONFIG_VHOST_CRYPTO', have_vhost_user_crypto)
558
config_host_data.set('CONFIG_VHOST_VDPA', have_vhost_vdpa)
559
config_host_data.set('CONFIG_VMNET', vmnet.found())
560
config_host_data.set('CONFIG_VHOST_USER_BLK_SERVER', have_vhost_user_blk_server)
561
+config_host_data.set('CONFIG_VDUSE_BLK_EXPORT', have_vduse_blk_export)
562
config_host_data.set('CONFIG_PNG', png.found())
563
config_host_data.set('CONFIG_VNC', vnc.found())
564
config_host_data.set('CONFIG_VNC_JPEG', jpeg.found())
565
@@ -XXX,XX +XXX,XX @@ if have_block
566
summary_info += {'qed support': get_option('qed').allowed()}
567
summary_info += {'parallels support': get_option('parallels').allowed()}
568
summary_info += {'FUSE exports': fuse}
569
+ summary_info += {'VDUSE block exports': have_vduse_blk_export}
570
endif
571
summary(summary_info, bool_yn: true, section: 'Block layer support')
572
573
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
574
index XXXXXXX..XXXXXXX 100644
575
--- a/scripts/meson-buildoptions.sh
576
+++ b/scripts/meson-buildoptions.sh
577
@@ -XXX,XX +XXX,XX @@ meson_options_help() {
578
printf "%s\n" ' vhost-user vhost-user backend support'
579
printf "%s\n" ' vhost-user-blk-server'
580
printf "%s\n" ' build vhost-user-blk server'
581
+ printf "%s\n" ' vduse-blk-export'
582
+ printf "%s\n" ' VDUSE block export support'
583
printf "%s\n" ' vhost-vdpa vhost-vdpa kernel backend support'
584
printf "%s\n" ' virglrenderer virgl rendering support'
585
printf "%s\n" ' virtfs virtio-9p support'
586
@@ -XXX,XX +XXX,XX @@ _meson_option_parse() {
587
--disable-vhost-user) printf "%s" -Dvhost_user=disabled ;;
588
--enable-vhost-user-blk-server) printf "%s" -Dvhost_user_blk_server=enabled ;;
589
--disable-vhost-user-blk-server) printf "%s" -Dvhost_user_blk_server=disabled ;;
590
+ --enable-vduse-blk-export) printf "%s" -Dvduse_blk_export=enabled ;;
591
+ --disable-vduse-blk-export) printf "%s" -Dvduse_blk_export=disabled ;;
592
--enable-vhost-vdpa) printf "%s" -Dvhost_vdpa=enabled ;;
593
--disable-vhost-vdpa) printf "%s" -Dvhost_vdpa=disabled ;;
594
--enable-virglrenderer) printf "%s" -Dvirglrenderer=enabled ;;
595
--
596
2.35.3
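As an aside for readers following the data path in the export above: once the kick eventfd fires, everything reduces to a small pattern around the libvduse queue API -- clear the eventfd, pop elements, process them, push them back and notify. The real code runs each request in a coroutine and counts inflight requests so the export can be drained; the stripped-down, synchronous sketch below (process_request() is a placeholder, not a QEMU or libvduse function) shows only the queue calls themselves:

    #include <stdlib.h>
    #include <sys/eventfd.h>
    #include <sys/uio.h>
    #include "libvduse.h"

    /* Placeholder: a real backend parses the request from out_sg and writes
     * the response into in_sg, returning the number of bytes it used. */
    static int process_request(struct iovec *in_sg, unsigned int in_num,
                               struct iovec *out_sg, unsigned int out_num)
    {
        (void)in_sg; (void)in_num; (void)out_sg; (void)out_num;
        return 0;
    }

    static void handle_vq_kick(VduseVirtq *vq)
    {
        eventfd_t kick_data;

        /* Clear the kick before draining the ring. */
        if (eventfd_read(vduse_queue_get_fd(vq), &kick_data) == -1) {
            return;
        }

        for (;;) {
            VduseVirtqElement *elem = vduse_queue_pop(vq, sizeof(*elem));
            int in_len;

            if (!elem) {
                break;
            }
            in_len = process_request(elem->in_sg, elem->in_num,
                                     elem->out_sg, elem->out_num);
            vduse_queue_push(vq, elem, in_len < 0 ? 0 : in_len);
            vduse_queue_notify(vq);
            /* popped elements are freed by the caller, as the export does */
            free(elem);
        }
    }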
1
From: Liam Merwick <Liam.Merwick@oracle.com>
1
From: Xie Yongji <xieyongji@bytedance.com>
2
2
3
The calls to find_mapping_for_cluster() may return NULL but it
3
To support block resize, this uses vduse_dev_update_config()
4
isn't always checked for before dereferencing the value returned.
4
to update the capacity field in configuration space and inject
5
Additionally, add some asserts to cover cases where NULL can't
5
a config interrupt in the block resize callback.
6
be returned but which might not be obvious at first glance.
7
6
8
Signed-off-by: Liam Merwick <Liam.Merwick@oracle.com>
7
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
9
Message-id: 1541453919-25973-5-git-send-email-Liam.Merwick@oracle.com
8
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
[mreitz: Dropped superfluous check of "mapping" following an assertion
9
Message-Id: <20220523084611.91-8-xieyongji@bytedance.com>
11
that it is not NULL, and fixed some indentation]
10
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Max Reitz <mreitz@redhat.com>
13
---
11
---
14
block/vvfat.c | 46 ++++++++++++++++++++++++++++++----------------
12
block/export/vduse-blk.c | 20 ++++++++++++++++++++
15
1 file changed, 30 insertions(+), 16 deletions(-)
13
1 file changed, 20 insertions(+)
16
14
17
diff --git a/block/vvfat.c b/block/vvfat.c
15
diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
18
index XXXXXXX..XXXXXXX 100644
16
index XXXXXXX..XXXXXXX 100644
19
--- a/block/vvfat.c
17
--- a/block/export/vduse-blk.c
20
+++ b/block/vvfat.c
18
+++ b/block/export/vduse-blk.c
21
@@ -XXX,XX +XXX,XX @@ static inline void array_free(array_t* array)
19
@@ -XXX,XX +XXX,XX @@ static void blk_aio_detach(void *opaque)
22
/* does not automatically grow */
20
vblk_exp->export.ctx = NULL;
23
static inline void* array_get(array_t* array,unsigned int index) {
24
assert(index < array->next);
25
+ assert(array->pointer);
26
return array->pointer + index * array->item_size;
27
}
21
}
28
22
29
-static inline int array_ensure_allocated(array_t* array, int index)
23
+static void vduse_blk_resize(void *opaque)
30
+static inline void array_ensure_allocated(array_t *array, int index)
24
+{
25
+ BlockExport *exp = opaque;
26
+ VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
27
+ struct virtio_blk_config config;
28
+
29
+ config.capacity =
30
+ cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
31
+ vduse_dev_update_config(vblk_exp->dev, sizeof(config.capacity),
32
+ offsetof(struct virtio_blk_config, capacity),
33
+ (char *)&config.capacity);
34
+}
35
+
36
+static const BlockDevOps vduse_block_ops = {
37
+ .resize_cb = vduse_blk_resize,
38
+};
39
+
40
static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
41
Error **errp)
31
{
42
{
32
if((index + 1) * array->item_size > array->size) {
43
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
33
int new_size = (index + 32) * array->item_size;
44
blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
34
array->pointer = g_realloc(array->pointer, new_size);
45
vblk_exp);
35
- if (!array->pointer)
46
36
- return -1;
47
+ blk_set_dev_ops(exp->blk, &vduse_block_ops, exp);
37
+ assert(array->pointer);
48
+
38
memset(array->pointer + array->size, 0, new_size - array->size);
49
return 0;
39
array->size = new_size;
40
array->next = index + 1;
41
}
42
-
43
- return 0;
44
}
50
}
45
51
46
static inline void* array_get_next(array_t* array) {
52
@@ -XXX,XX +XXX,XX @@ static void vduse_blk_exp_delete(BlockExport *exp)
47
unsigned int next = array->next;
53
48
54
blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
49
- if (array_ensure_allocated(array, next) < 0)
55
vblk_exp);
50
- return NULL;
56
+ blk_set_dev_ops(exp->blk, NULL, NULL);
51
-
57
vduse_dev_destroy(vblk_exp->dev);
52
+ array_ensure_allocated(array, next);
53
array->next = next + 1;
54
return array_get(array, next);
55
}
58
}
56
@@ -XXX,XX +XXX,XX @@ static int commit_direntries(BDRVVVFATState* s,
57
direntry_t* direntry = array_get(&(s->directory), dir_index);
58
uint32_t first_cluster = dir_index == 0 ? 0 : begin_of_direntry(direntry);
59
mapping_t* mapping = find_mapping_for_cluster(s, first_cluster);
60
-
61
int factor = 0x10 * s->sectors_per_cluster;
62
int old_cluster_count, new_cluster_count;
63
- int current_dir_index = mapping->info.dir.first_dir_index;
64
- int first_dir_index = current_dir_index;
65
+ int current_dir_index;
66
+ int first_dir_index;
67
int ret, i;
68
uint32_t c;
69
70
-DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapping->path, parent_mapping_index));
71
-
72
assert(direntry);
73
assert(mapping);
74
assert(mapping->begin == first_cluster);
75
@@ -XXX,XX +XXX,XX @@ DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapp
76
assert(mapping->mode & MODE_DIRECTORY);
77
assert(dir_index == 0 || is_directory(direntry));
78
79
+ DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n",
80
+ mapping->path, parent_mapping_index));
81
+
82
+ current_dir_index = mapping->info.dir.first_dir_index;
83
+ first_dir_index = current_dir_index;
84
mapping->info.dir.parent_mapping_index = parent_mapping_index;
85
86
if (first_cluster == 0) {
87
@@ -XXX,XX +XXX,XX @@ DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapp
88
direntry = array_get(&(s->directory), first_dir_index + i);
89
if (is_directory(direntry) && !is_dot(direntry)) {
90
mapping = find_mapping_for_cluster(s, first_cluster);
91
+ if (mapping == NULL) {
92
+ return -1;
93
+ }
94
assert(mapping->mode & MODE_DIRECTORY);
95
ret = commit_direntries(s, first_dir_index + i,
96
array_index(&(s->mapping), mapping));
97
@@ -XXX,XX +XXX,XX @@ static int commit_one_file(BDRVVVFATState* s,
98
assert(offset < size);
99
assert((offset % s->cluster_size) == 0);
100
101
+ if (mapping == NULL) {
102
+ return -1;
103
+ }
104
+
105
for (i = s->cluster_size; i < offset; i += s->cluster_size)
106
c = modified_fat_get(s, c);
107
108
@@ -XXX,XX +XXX,XX @@ static int handle_renames_and_mkdirs(BDRVVVFATState* s)
109
if (commit->action == ACTION_RENAME) {
110
mapping_t* mapping = find_mapping_for_cluster(s,
111
commit->param.rename.cluster);
112
- char* old_path = mapping->path;
113
+ char *old_path;
114
115
+ if (mapping == NULL) {
116
+ return -1;
117
+ }
118
+ old_path = mapping->path;
119
assert(commit->path);
120
mapping->path = commit->path;
121
if (rename(old_path, mapping->path))
122
@@ -XXX,XX +XXX,XX @@ static int handle_renames_and_mkdirs(BDRVVVFATState* s)
123
direntry_t* d = direntry + i;
124
125
if (is_file(d) || (is_directory(d) && !is_dot(d))) {
126
+ int l;
127
+ char *new_path;
128
mapping_t* m = find_mapping_for_cluster(s,
129
begin_of_direntry(d));
130
- int l = strlen(m->path);
131
- char* new_path = g_malloc(l + diff + 1);
132
+ if (m == NULL) {
133
+ return -1;
134
+ }
135
+ l = strlen(m->path);
136
+ new_path = g_malloc(l + diff + 1);
137
138
assert(!strncmp(m->path, mapping->path, l2));
139
59
140
--
60
--
141
2.19.1
61
2.35.3
142
143
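For reference, the resize hook above is simply one application of the config-update helper from the libvduse patch: vduse_dev_update_config() writes the given bytes at their offset in the device config space and injects the config interrupt in a single call. A hedged sketch of how another virtio-blk style backend might publish a capacity change (the helper name is made up; outside the QEMU tree the header would be <linux/virtio_blk.h>) could look like this:

    #include <endian.h>
    #include <stddef.h>
    #include <stdint.h>
    #include "libvduse.h"
    #include "standard-headers/linux/virtio_blk.h"

    /* Hypothetical helper: publish a new capacity, in 512-byte sectors. */
    static int publish_capacity(VduseDev *dev, uint64_t num_sectors)
    {
        uint64_t capacity = htole64(num_sectors);

        /* Writes the bytes at the given offset and injects a config IRQ. */
        return vduse_dev_update_config(dev, sizeof(capacity),
                                       offsetof(struct virtio_blk_config,
                                                capacity),
                                       (char *)&capacity);
    }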
1
From: Fam Zheng <famz@redhat.com>
1
From: Xie Yongji <xieyongji@bytedance.com>
2
2
3
Use error_report for situations that affect user operation (i.e. we're
3
To support reconnecting after a restart or crash, the VDUSE backend
4
actually returning error), and warn_report/warn_report_err when some
4
might need to resubmit inflight I/Os. This stores the metadata
5
less critical error happened but the user operation can still carry on.
5
such as the indexes of inflight I/O descriptors in a shm file so
6
that the VDUSE backend can restore them when reconnecting.
6
7
7
For raw_normalize_devicepath, add Error parameter to propagate to
8
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
8
its callers.
9
Message-Id: <20220523084611.91-9-xieyongji@bytedance.com>
9
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Suggested-by: Markus Armbruster <armbru@redhat.com>
11
Signed-off-by: Fam Zheng <famz@redhat.com>
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
11
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
13
---
12
---
14
block/file-posix.c | 39 ++++++++++++++++-----------------------
13
subprojects/libvduse/libvduse.h | 12 ++
15
1 file changed, 16 insertions(+), 23 deletions(-)
14
block/export/vduse-blk.c | 19 ++-
15
subprojects/libvduse/libvduse.c | 235 +++++++++++++++++++++++++++++++-
16
3 files changed, 260 insertions(+), 6 deletions(-)
16
17
17
diff --git a/block/file-posix.c b/block/file-posix.c
18
diff --git a/subprojects/libvduse/libvduse.h b/subprojects/libvduse/libvduse.h
18
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
19
--- a/block/file-posix.c
20
--- a/subprojects/libvduse/libvduse.h
20
+++ b/block/file-posix.c
21
+++ b/subprojects/libvduse/libvduse.h
21
@@ -XXX,XX +XXX,XX @@ static int cdrom_reopen(BlockDriverState *bs);
22
@@ -XXX,XX +XXX,XX @@ int vduse_dev_update_config(VduseDev *dev, uint32_t size,
23
*/
24
int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size);
25
26
+/**
27
+ * vduse_set_reconnect_log_file:
28
+ * @dev: VDUSE device
29
+ * @filename: filename of the reconnect log
30
+ *
31
+ * Specify the file used to store the log for reconnecting. It should
32
+ * be called before vduse_dev_setup_queue().
33
+ *
34
+ * Returns: 0 on success, -errno on failure.
35
+ */
36
+int vduse_set_reconnect_log_file(VduseDev *dev, const char *filename);
37
+
38
/**
39
* vduse_dev_create_by_fd:
40
* @fd: passed file descriptor
41
diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
42
index XXXXXXX..XXXXXXX 100644
43
--- a/block/export/vduse-blk.c
44
+++ b/block/export/vduse-blk.c
45
@@ -XXX,XX +XXX,XX @@ typedef struct VduseBlkExport {
46
VirtioBlkHandler handler;
47
VduseDev *dev;
48
uint16_t num_queues;
49
+ char *recon_file;
50
unsigned int inflight;
51
} VduseBlkExport;
52
53
@@ -XXX,XX +XXX,XX @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
54
55
aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
56
true, on_vduse_vq_kick, NULL, NULL, NULL, vq);
57
+    /* Make sure we don't miss any kick after reconnecting */
58
+ eventfd_write(vduse_queue_get_fd(vq), 1);
59
}
60
61
static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
62
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
63
return -ENOMEM;
64
}
65
66
+ vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s",
67
+ g_get_tmp_dir(), exp->id);
68
+ if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) {
69
+ error_setg(errp, "failed to set reconnect log file");
70
+ vduse_dev_destroy(vblk_exp->dev);
71
+ g_free(vblk_exp->recon_file);
72
+ return -EINVAL;
73
+ }
74
+
75
for (i = 0; i < num_queues; i++) {
76
vduse_dev_setup_queue(vblk_exp->dev, i, queue_size);
77
}
78
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
79
static void vduse_blk_exp_delete(BlockExport *exp)
80
{
81
VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
82
+ int ret;
83
84
blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
85
vblk_exp);
86
blk_set_dev_ops(exp->blk, NULL, NULL);
87
- vduse_dev_destroy(vblk_exp->dev);
88
+ ret = vduse_dev_destroy(vblk_exp->dev);
89
+ if (ret != -EBUSY) {
90
+ unlink(vblk_exp->recon_file);
91
+ }
92
+ g_free(vblk_exp->recon_file);
93
}
94
95
static void vduse_blk_exp_request_shutdown(BlockExport *exp)
96
diff --git a/subprojects/libvduse/libvduse.c b/subprojects/libvduse/libvduse.c
97
index XXXXXXX..XXXXXXX 100644
98
--- a/subprojects/libvduse/libvduse.c
99
+++ b/subprojects/libvduse/libvduse.c
100
@@ -XXX,XX +XXX,XX @@
101
#define VDUSE_VQ_ALIGN 4096
102
#define MAX_IOVA_REGIONS 256
103
104
+#define LOG_ALIGNMENT 64
105
+
106
/* Round number down to multiple */
107
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
108
109
@@ -XXX,XX +XXX,XX @@
110
#define unlikely(x) __builtin_expect(!!(x), 0)
22
#endif
111
#endif
23
112
24
#if defined(__NetBSD__)
113
+typedef struct VduseDescStateSplit {
25
-static int raw_normalize_devicepath(const char **filename)
114
+ uint8_t inflight;
26
+static int raw_normalize_devicepath(const char **filename, Error **errp)
115
+ uint8_t padding[5];
27
{
116
+ uint16_t next;
28
static char namebuf[PATH_MAX];
117
+ uint64_t counter;
29
const char *dp, *fname;
118
+} VduseDescStateSplit;
30
@@ -XXX,XX +XXX,XX @@ static int raw_normalize_devicepath(const char **filename)
119
+
31
fname = *filename;
120
+typedef struct VduseVirtqLogInflight {
32
dp = strrchr(fname, '/');
121
+ uint64_t features;
33
if (lstat(fname, &sb) < 0) {
122
+ uint16_t version;
34
- fprintf(stderr, "%s: stat failed: %s\n",
123
+ uint16_t desc_num;
35
- fname, strerror(errno));
124
+ uint16_t last_batch_head;
36
+ error_setg_errno(errp, errno, "%s: stat failed", fname);
125
+ uint16_t used_idx;
126
+ VduseDescStateSplit desc[];
127
+} VduseVirtqLogInflight;
128
+
129
+typedef struct VduseVirtqLog {
130
+ VduseVirtqLogInflight inflight;
131
+} VduseVirtqLog;
132
+
133
+typedef struct VduseVirtqInflightDesc {
134
+ uint16_t index;
135
+ uint64_t counter;
136
+} VduseVirtqInflightDesc;
137
+
138
typedef struct VduseRing {
139
unsigned int num;
140
uint64_t desc_addr;
141
@@ -XXX,XX +XXX,XX @@ struct VduseVirtq {
142
bool ready;
143
int fd;
144
VduseDev *dev;
145
+ VduseVirtqInflightDesc *resubmit_list;
146
+ uint16_t resubmit_num;
147
+ uint64_t counter;
148
+ VduseVirtqLog *log;
149
};
150
151
typedef struct VduseIovaRegion {
152
@@ -XXX,XX +XXX,XX @@ struct VduseDev {
153
int fd;
154
int ctrl_fd;
155
void *priv;
156
+ void *log;
157
};
158
159
+static inline size_t vduse_vq_log_size(uint16_t queue_size)
160
+{
161
+ return ALIGN_UP(sizeof(VduseDescStateSplit) * queue_size +
162
+ sizeof(VduseVirtqLogInflight), LOG_ALIGNMENT);
163
+}
164
+
165
+static void *vduse_log_get(const char *filename, size_t size)
166
+{
167
+ void *ptr = MAP_FAILED;
168
+ int fd;
169
+
170
+ fd = open(filename, O_RDWR | O_CREAT, 0600);
171
+ if (fd == -1) {
172
+ return MAP_FAILED;
173
+ }
174
+
175
+ if (ftruncate(fd, size) == -1) {
176
+ goto out;
177
+ }
178
+
179
+ ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
180
+
181
+out:
182
+ close(fd);
183
+ return ptr;
184
+}
185
+
186
static inline bool has_feature(uint64_t features, unsigned int fbit)
187
{
188
assert(fbit < 64);
189
@@ -XXX,XX +XXX,XX @@ static int vduse_inject_irq(VduseDev *dev, int index)
190
return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
191
}
192
193
+static int inflight_desc_compare(const void *a, const void *b)
194
+{
195
+ VduseVirtqInflightDesc *desc0 = (VduseVirtqInflightDesc *)a,
196
+ *desc1 = (VduseVirtqInflightDesc *)b;
197
+
198
+ if (desc1->counter > desc0->counter &&
199
+ (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
200
+ return 1;
201
+ }
202
+
203
+ return -1;
204
+}
205
+
206
+static int vduse_queue_check_inflights(VduseVirtq *vq)
207
+{
208
+ int i = 0;
209
+ VduseDev *dev = vq->dev;
210
+
211
+ vq->used_idx = le16toh(vq->vring.used->idx);
212
+ vq->resubmit_num = 0;
213
+ vq->resubmit_list = NULL;
214
+ vq->counter = 0;
215
+
216
+ if (unlikely(vq->log->inflight.used_idx != vq->used_idx)) {
217
+ if (vq->log->inflight.last_batch_head > VIRTQUEUE_MAX_SIZE) {
218
+ return -1;
219
+ }
220
+
221
+ vq->log->inflight.desc[vq->log->inflight.last_batch_head].inflight = 0;
222
+
223
+ barrier();
224
+
225
+ vq->log->inflight.used_idx = vq->used_idx;
226
+ }
227
+
228
+ for (i = 0; i < vq->log->inflight.desc_num; i++) {
229
+ if (vq->log->inflight.desc[i].inflight == 1) {
230
+ vq->inuse++;
231
+ }
232
+ }
233
+
234
+ vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
235
+
236
+ if (vq->inuse) {
237
+ vq->resubmit_list = calloc(vq->inuse, sizeof(VduseVirtqInflightDesc));
238
+ if (!vq->resubmit_list) {
239
+ return -1;
240
+ }
241
+
242
+ for (i = 0; i < vq->log->inflight.desc_num; i++) {
243
+ if (vq->log->inflight.desc[i].inflight) {
244
+ vq->resubmit_list[vq->resubmit_num].index = i;
245
+ vq->resubmit_list[vq->resubmit_num].counter =
246
+ vq->log->inflight.desc[i].counter;
247
+ vq->resubmit_num++;
248
+ }
249
+ }
250
+
251
+ if (vq->resubmit_num > 1) {
252
+ qsort(vq->resubmit_list, vq->resubmit_num,
253
+ sizeof(VduseVirtqInflightDesc), inflight_desc_compare);
254
+ }
255
+ vq->counter = vq->resubmit_list[0].counter + 1;
256
+ }
257
+
258
+ vduse_inject_irq(dev, vq->index);
259
+
260
+ return 0;
261
+}
262
+
263
+static int vduse_queue_inflight_get(VduseVirtq *vq, int desc_idx)
264
+{
265
+ vq->log->inflight.desc[desc_idx].counter = vq->counter++;
266
+
267
+ barrier();
268
+
269
+ vq->log->inflight.desc[desc_idx].inflight = 1;
270
+
271
+ return 0;
272
+}
273
+
274
+static int vduse_queue_inflight_pre_put(VduseVirtq *vq, int desc_idx)
275
+{
276
+ vq->log->inflight.last_batch_head = desc_idx;
277
+
278
+ return 0;
279
+}
280
+
281
+static int vduse_queue_inflight_post_put(VduseVirtq *vq, int desc_idx)
282
+{
283
+ vq->log->inflight.desc[desc_idx].inflight = 0;
284
+
285
+ barrier();
286
+
287
+ vq->log->inflight.used_idx = vq->used_idx;
288
+
289
+ return 0;
290
+}
291
+
292
static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
293
uint64_t last)
294
{
295
@@ -XXX,XX +XXX,XX @@ void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
296
unsigned int head;
297
VduseVirtqElement *elem;
298
VduseDev *dev = vq->dev;
299
+ int i;
300
301
if (unlikely(!vq->vring.avail)) {
302
return NULL;
303
}
304
305
+ if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
306
+ i = (--vq->resubmit_num);
307
+ elem = vduse_queue_map_desc(vq, vq->resubmit_list[i].index, sz);
308
+
309
+ if (!vq->resubmit_num) {
310
+ free(vq->resubmit_list);
311
+ vq->resubmit_list = NULL;
312
+ }
313
+
314
+ return elem;
315
+ }
316
+
317
if (vduse_queue_empty(vq)) {
318
return NULL;
319
}
320
@@ -XXX,XX +XXX,XX @@ void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
321
322
vq->inuse++;
323
324
+ vduse_queue_inflight_get(vq, head);
325
+
326
return elem;
327
}
328
329
@@ -XXX,XX +XXX,XX @@ void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
330
unsigned int len)
331
{
332
vduse_queue_fill(vq, elem, len, 0);
333
+ vduse_queue_inflight_pre_put(vq, elem->index);
334
vduse_queue_flush(vq, 1);
335
+ vduse_queue_inflight_post_put(vq, elem->index);
336
}
337
338
static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
339
@@ -XXX,XX +XXX,XX @@ static void vduse_queue_enable(VduseVirtq *vq)
340
}
341
342
vq->fd = fd;
343
- vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index;
344
- vq->inuse = 0;
345
- vq->used_idx = 0;
346
vq->signalled_used_valid = false;
347
vq->ready = true;
348
349
+ if (vduse_queue_check_inflights(vq)) {
350
+ fprintf(stderr, "Failed to check inflights for vq[%d]\n", vq->index);
351
+ close(fd);
352
+ return;
353
+ }
354
+
355
dev->ops->enable_queue(dev, vq);
356
}
357
358
@@ -XXX,XX +XXX,XX @@ static void vduse_dev_start_dataplane(VduseDev *dev)
359
360
static void vduse_dev_stop_dataplane(VduseDev *dev)
361
{
362
+ size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
363
int i;
364
365
for (i = 0; i < dev->num_queues; i++) {
366
vduse_queue_disable(&dev->vqs[i]);
367
}
368
+ if (dev->log) {
369
+ memset(dev->log, 0, log_size);
370
+ }
371
dev->features = 0;
372
vduse_iova_remove_region(dev, 0, ULONG_MAX);
373
}
374
@@ -XXX,XX +XXX,XX @@ int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
37
return -errno;
375
return -errno;
38
}
376
}
39
377
40
@@ -XXX,XX +XXX,XX @@ static int raw_normalize_devicepath(const char **filename)
378
+ vduse_queue_enable(vq);
41
snprintf(namebuf, PATH_MAX, "%.*s/r%s",
379
+
42
(int)(dp - fname), fname, dp + 1);
380
+ return 0;
43
}
381
+}
44
- fprintf(stderr, "%s is a block device", fname);
382
+
45
*filename = namebuf;
383
+int vduse_set_reconnect_log_file(VduseDev *dev, const char *filename)
46
- fprintf(stderr, ", using %s\n", *filename);
384
+{
47
+ warn_report("%s is a block device, using %s", fname, *filename);
385
+
48
386
+ size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
387
+ void *log;
388
+ int i;
389
+
390
+ dev->log = log = vduse_log_get(filename, log_size);
391
+ if (log == MAP_FAILED) {
392
+ fprintf(stderr, "Failed to get vduse log\n");
393
+ return -EINVAL;
394
+ }
395
+
396
+ for (i = 0; i < dev->num_queues; i++) {
397
+ dev->vqs[i].log = log;
398
+ dev->vqs[i].log->inflight.desc_num = VIRTQUEUE_MAX_SIZE;
399
+ log = (void *)((char *)log + vduse_vq_log_size(VIRTQUEUE_MAX_SIZE));
400
+ }
401
+
49
return 0;
402
return 0;
50
}
403
}
51
#else
404
52
-static int raw_normalize_devicepath(const char **filename)
405
@@ -XXX,XX +XXX,XX @@ static int vduse_dev_init(VduseDev *dev, const char *name,
53
+static int raw_normalize_devicepath(const char **filename, Error **errp)
406
return -errno;
54
{
407
}
55
return 0;
408
56
}
409
+ if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
57
@@ -XXX,XX +XXX,XX @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
410
+ fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
58
411
+ close(fd);
59
filename = qemu_opt_get(opts, "filename");
412
+ return -errno;
60
413
+ }
61
- ret = raw_normalize_devicepath(&filename);
414
+
62
+ ret = raw_normalize_devicepath(&filename, errp);
415
dev_name = strdup(name);
63
if (ret != 0) {
416
if (!dev_name) {
64
- error_setg_errno(errp, -ret, "Could not normalize device path");
417
close(fd);
65
goto fail;
418
@@ -XXX,XX +XXX,XX @@ VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
66
}
419
return NULL;
67
420
}
68
@@ -XXX,XX +XXX,XX @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
421
69
case ON_OFF_AUTO_ON:
422
+ if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
70
s->use_lock = true;
423
+ fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
71
if (!qemu_has_ofd_lock()) {
424
+ free(dev);
72
- fprintf(stderr,
425
+ return NULL;
73
- "File lock requested but OFD locking syscall is "
426
+ }
74
- "unavailable, falling back to POSIX file locks.\n"
427
+
75
- "Due to the implementation, locks can be lost "
428
ret = vduse_dev_init_vqs(dev, num_queues);
76
- "unexpectedly.\n");
429
if (ret) {
77
+ warn_report("File lock requested but OFD locking syscall is "
430
fprintf(stderr, "Failed to init vqs\n");
78
+ "unavailable, falling back to POSIX file locks");
431
@@ -XXX,XX +XXX,XX @@ VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
79
+ error_printf("Due to the implementation, locks can be lost "
432
80
+ "unexpectedly.\n");
433
ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
81
}
434
free(dev_config);
82
break;
435
- if (ret < 0) {
83
case ON_OFF_AUTO_OFF:
436
+ if (ret && errno != EEXIST) {
84
@@ -XXX,XX +XXX,XX @@ static int raw_handle_perm_lock(BlockDriverState *bs,
437
fprintf(stderr, "Failed to create vduse device %s: %s\n",
85
/* Theoretically the above call only unlocks bytes and it cannot
438
name, strerror(errno));
86
* fail. Something weird happened, report it.
439
goto err_dev;
87
*/
440
@@ -XXX,XX +XXX,XX @@ err_ctrl:
88
- error_report_err(local_err);
441
89
+ warn_report_err(local_err);
442
int vduse_dev_destroy(VduseDev *dev)
90
}
443
{
91
break;
444
- int ret = 0;
92
case RAW_PL_COMMIT:
445
+ size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
93
@@ -XXX,XX +XXX,XX @@ static int raw_handle_perm_lock(BlockDriverState *bs,
446
+ int i, ret = 0;
94
/* Theoretically the above call only unlocks bytes and it cannot
447
95
* fail. Something weird happened, report it.
448
+ if (dev->log) {
96
*/
449
+ munmap(dev->log, log_size);
97
- error_report_err(local_err);
450
+ }
98
+ warn_report_err(local_err);
451
+ for (i = 0; i < dev->num_queues; i++) {
99
}
452
+ free(dev->vqs[i].resubmit_list);
100
break;
453
+ }
101
}
454
free(dev->vqs);
102
@@ -XXX,XX +XXX,XX @@ static int raw_reopen_prepare(BDRVReopenState *state,
455
if (dev->fd >= 0) {
103
/* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
456
close(dev->fd);
104
if (rs->fd == -1) {
105
const char *normalized_filename = state->bs->filename;
106
- ret = raw_normalize_devicepath(&normalized_filename);
107
- if (ret < 0) {
108
- error_setg_errno(errp, -ret, "Could not normalize device path");
109
- } else {
110
+ ret = raw_normalize_devicepath(&normalized_filename, errp);
111
+ if (ret >= 0) {
112
assert(!(rs->open_flags & O_CREAT));
113
rs->fd = qemu_open(normalized_filename, rs->open_flags);
114
if (rs->fd == -1) {
115
@@ -XXX,XX +XXX,XX @@ static int aio_worker(void *arg)
116
ret = handle_aiocb_truncate(aiocb);
117
break;
118
default:
119
- fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
120
+ error_report("invalid aio request (0x%x)", aiocb->aio_type);
121
ret = -EINVAL;
122
break;
123
}
124
@@ -XXX,XX +XXX,XX @@ out_unlock:
125
* not mean the whole creation operation has failed. So
126
* report it the user for their convenience, but do not report
127
* it to the caller. */
128
- error_report_err(local_err);
129
+ warn_report_err(local_err);
130
}
131
132
out_close:
133
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts
134
135
(void)has_prefix;
136
137
- ret = raw_normalize_devicepath(&filename);
138
+ ret = raw_normalize_devicepath(&filename, errp);
139
if (ret < 0) {
140
- error_setg_errno(errp, -ret, "Could not normalize device path");
141
return ret;
142
}
143
144
--
457
--
145
2.19.1
458
2.35.3
146
147
New patch
1
From: Stefan Hajnoczi <stefanha@redhat.com>
1
2
3
Document vduse-blk exports in qemu-storage-daemon --help and the
4
qemu-storage-daemon(1) man page.
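As a quick illustration (the image path, node name and export id below are
made up rather than taken from the patch), a vduse-blk export can be created
with the storage daemon and then attached to the vDPA bus with the vdpa(8)
command described in the new documentation:

$ qemu-storage-daemon \
    --blockdev driver=file,node-name=disk0,filename=disk.img \
    --export type=vduse-blk,id=vduse0,node-name=disk0,writable=on
# vdpa dev add name vduse0 mgmtdev vduse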
5
6
Based-on: <20220523084611.91-1-xieyongji@bytedance.com>
7
Cc: Xie Yongji <xieyongji@bytedance.com>
8
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Message-Id: <20220525121947.859820-1-stefanha@redhat.com>
10
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
11
---
12
docs/tools/qemu-storage-daemon.rst | 21 +++++++++++++++++++++
13
storage-daemon/qemu-storage-daemon.c | 9 +++++++++
14
2 files changed, 30 insertions(+)
15
16
diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
17
index XXXXXXX..XXXXXXX 100644
18
--- a/docs/tools/qemu-storage-daemon.rst
19
+++ b/docs/tools/qemu-storage-daemon.rst
20
@@ -XXX,XX +XXX,XX @@ Standard options:
21
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
22
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
23
--export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
24
+ --export [type=]vduse-blk,id=<id>,node-name=<node-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>]
25
26
is a block export definition. ``node-name`` is the block node that should be
27
exported. ``writable`` determines whether or not the export allows write
28
@@ -XXX,XX +XXX,XX @@ Standard options:
29
``allow-other`` to auto (the default) will try enabling this option, and on
30
error fall back to disabling it.
31
32
+ The ``vduse-blk`` export type uses the ``id`` as the VDUSE device name.
33
+ ``num-queues`` sets the number of virtqueues (the default is 1).
34
+ ``queue-size`` sets the virtqueue descriptor table size (the default is 256).
35
+
36
+ The instantiated VDUSE device must then be added to the vDPA bus using the
37
+ vdpa(8) command from the iproute2 project::
38
+
39
+ # vdpa dev add name <id> mgmtdev vduse
40
+
41
+ The device can be removed from the vDPA bus later as follows::
42
+
43
+ # vdpa dev del <id>
44
+
45
+ For more information about attaching vDPA devices to the host with
46
+ virtio_vdpa.ko or attaching them to guests with vhost_vdpa.ko, see
47
+ https://vdpa-dev.gitlab.io/.
48
+
49
+ For more information about VDUSE, see
50
+ https://docs.kernel.org/userspace-api/vduse.html.
51
+
52
.. option:: --monitor MONITORDEF
53
54
is a QMP monitor definition. See the :manpage:`qemu(1)` manual page for
55
diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
56
index XXXXXXX..XXXXXXX 100644
57
--- a/storage-daemon/qemu-storage-daemon.c
58
+++ b/storage-daemon/qemu-storage-daemon.c
59
@@ -XXX,XX +XXX,XX @@ static void help(void)
60
" vhost-user-blk device over file descriptor\n"
61
"\n"
62
#endif /* CONFIG_VHOST_USER_BLK_SERVER */
63
+#ifdef CONFIG_VDUSE_BLK_EXPORT
64
+" --export [type=]vduse-blk,id=<id>,node-name=<node-name>\n"
65
+" [,writable=on|off][,num-queues=<num-queues>]\n"
66
+" [,queue-size=<queue-size>]\n"
67
+" [,logical-block-size=<logical-block-size>]\n"
68
+" export the specified block node as a vduse-blk\n"
69
+" device using the id as the VDUSE device name\n"
70
+"\n"
71
+#endif /* CONFIG_VDUSE_BLK_EXPORT */
72
" --monitor [chardev=]name[,mode=control][,pretty[=on|off]]\n"
73
" configure a QMP monitor\n"
74
"\n"
75
--
76
2.35.3
1
From: Fam Zheng <famz@redhat.com>
1
From: Stefano Garzarella <sgarzare@redhat.com>
2
2
3
If we know we've already locked the bytes, don't do it again; similarly
3
If the namespace does not exist, rbd_create() fails with -ENOENT and
4
don't unlock a byte if we haven't locked it. This doesn't change the
4
QEMU reports a generic "error rbd create: No such file or directory":
5
behavior, but fixes a corner case explained below.
6
5
7
Libvirt had an error handling bug where an image could get its (ownership,
6
$ qemu-img create rbd:rbd/namespace/image 1M
8
file mode, SELinux) permissions changed (RHBZ 1584982) by mistake behind
7
Formatting 'rbd:rbd/namespace/image', fmt=raw size=1048576
9
QEMU's back. Specifically, an image in use by a Libvirt VM has:
8
qemu-img: rbd:rbd/namespace/image: error rbd create: No such file or directory
10
9
11
$ ls -lhZ b.img
10
Unfortunately rados_ioctx_set_namespace() does not fail if the namespace
12
-rw-r--r--. qemu qemu system_u:object_r:svirt_image_t:s0:c600,c690 b.img
11
does not exist, so let's use rbd_namespace_exists() in qemu_rbd_connect()
12
to check if the namespace exists, reporting a more understandable error:
13
13
14
Trying to attach it a second time won't work because of image locking.
14
$ qemu-img create rbd:rbd/namespace/image 1M
15
And after the error, it becomes:
15
Formatting 'rbd:rbd/namespace/image', fmt=raw size=1048576
16
qemu-img: rbd:rbd/namespace/image: namespace 'namespace' does not exist
16
17
17
$ ls -lhZ b.img
18
Reported-by: Tingting Mao <timao@redhat.com>
18
-rw-r--r--. root root system_u:object_r:virt_image_t:s0 b.img
19
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
19
20
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
20
Then, we won't be able to do OFD lock operations with the existing fd.
21
Message-Id: <20220517071012.6120-1-sgarzare@redhat.com>
21
In other words, code such as this call in blk_detach_dev():
22
23
blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
24
25
can abort() QEMU because of these environmental changes.
26
27
This patch is an easy fix to this, and the change is reasonable
28
regardless, so do it.
29
30
Signed-off-by: Fam Zheng <famz@redhat.com>
31
Reviewed-by: Max Reitz <mreitz@redhat.com>
32
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
22
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
33
---
23
---
34
block/file-posix.c | 54 +++++++++++++++++++++++++++++++++++++---------
24
block/rbd.c | 24 ++++++++++++++++++++++++
35
1 file changed, 44 insertions(+), 10 deletions(-)
25
meson.build | 6 ++++++
26
2 files changed, 30 insertions(+)
36
27
37
diff --git a/block/file-posix.c b/block/file-posix.c
28
diff --git a/block/rbd.c b/block/rbd.c
38
index XXXXXXX..XXXXXXX 100644
29
index XXXXXXX..XXXXXXX 100644
39
--- a/block/file-posix.c
30
--- a/block/rbd.c
40
+++ b/block/file-posix.c
31
+++ b/block/rbd.c
41
@@ -XXX,XX +XXX,XX @@ typedef struct BDRVRawState {
32
@@ -XXX,XX +XXX,XX @@ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
42
uint64_t perm;
33
error_setg_errno(errp, -r, "error opening pool %s", opts->pool);
43
uint64_t shared_perm;
34
goto failed_shutdown;
44
35
}
45
+ /* The perms bits whose corresponding bytes are already locked in
46
+ * s->lock_fd. */
47
+ uint64_t locked_perm;
48
+ uint64_t locked_shared_perm;
49
+
36
+
50
#ifdef CONFIG_XFS
37
+#ifdef HAVE_RBD_NAMESPACE_EXISTS
51
bool is_xfs:1;
38
+ if (opts->has_q_namespace && strlen(opts->q_namespace) > 0) {
52
#endif
39
+ bool exists;
53
@@ -XXX,XX +XXX,XX @@ typedef enum {
54
* file; if @unlock == true, also unlock the unneeded bytes.
55
* @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
56
*/
57
-static int raw_apply_lock_bytes(int fd,
58
+static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
59
uint64_t perm_lock_bits,
60
uint64_t shared_perm_lock_bits,
61
bool unlock, Error **errp)
62
{
63
int ret;
64
int i;
65
+ uint64_t locked_perm, locked_shared_perm;
66
+
40
+
67
+ if (s) {
41
+ r = rbd_namespace_exists(*io_ctx, opts->q_namespace, &exists);
68
+ locked_perm = s->locked_perm;
42
+ if (r < 0) {
69
+ locked_shared_perm = s->locked_shared_perm;
43
+ error_setg_errno(errp, -r, "error checking namespace");
70
+ } else {
44
+ goto failed_ioctx_destroy;
71
+ /*
45
+ }
72
+ * We don't have the previous bits, just lock/unlock for each of the
46
+
73
+ * requested bits.
47
+ if (!exists) {
74
+ */
48
+ error_setg(errp, "namespace '%s' does not exist",
75
+ if (unlock) {
49
+ opts->q_namespace);
76
+ locked_perm = BLK_PERM_ALL;
50
+ r = -ENOENT;
77
+ locked_shared_perm = BLK_PERM_ALL;
51
+ goto failed_ioctx_destroy;
78
+ } else {
79
+ locked_perm = 0;
80
+ locked_shared_perm = 0;
81
+ }
52
+ }
82
+ }
53
+ }
83
54
+#endif
84
PERM_FOREACH(i) {
55
+
85
int off = RAW_LOCK_PERM_BASE + i;
56
/*
86
- if (perm_lock_bits & (1ULL << i)) {
57
* Set the namespace after opening the io context on the pool,
87
+ uint64_t bit = (1ULL << i);
58
* if nspace == NULL or if nspace == "", it is just as we did nothing
88
+ if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
59
@@ -XXX,XX +XXX,XX @@ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
89
ret = qemu_lock_fd(fd, off, 1, false);
60
r = 0;
90
if (ret) {
61
goto out;
91
error_setg(errp, "Failed to lock byte %d", off);
62
92
return ret;
63
+#ifdef HAVE_RBD_NAMESPACE_EXISTS
93
+ } else if (s) {
64
+failed_ioctx_destroy:
94
+ s->locked_perm |= bit;
65
+ rados_ioctx_destroy(*io_ctx);
95
}
66
+#endif
96
- } else if (unlock) {
67
failed_shutdown:
97
+ } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
68
rados_shutdown(*cluster);
98
ret = qemu_unlock_fd(fd, off, 1);
69
out:
99
if (ret) {
70
diff --git a/meson.build b/meson.build
100
error_setg(errp, "Failed to unlock byte %d", off);
71
index XXXXXXX..XXXXXXX 100644
101
return ret;
72
--- a/meson.build
102
+ } else if (s) {
73
+++ b/meson.build
103
+ s->locked_perm &= ~bit;
74
@@ -XXX,XX +XXX,XX @@ config_host_data.set('HAVE_GETIFADDRS', cc.has_function('getifaddrs'))
104
}
75
config_host_data.set('HAVE_OPENPTY', cc.has_function('openpty', dependencies: util))
105
}
76
config_host_data.set('HAVE_STRCHRNUL', cc.has_function('strchrnul'))
106
}
77
config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: '#include <stdlib.h>'))
107
PERM_FOREACH(i) {
78
+if rbd.found()
108
int off = RAW_LOCK_SHARED_BASE + i;
79
+ config_host_data.set('HAVE_RBD_NAMESPACE_EXISTS',
109
- if (shared_perm_lock_bits & (1ULL << i)) {
80
+ cc.has_function('rbd_namespace_exists',
110
+ uint64_t bit = (1ULL << i);
81
+ dependencies: rbd,
111
+ if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
82
+ prefix: '#include <rbd/librbd.h>'))
112
ret = qemu_lock_fd(fd, off, 1, false);
83
+endif
113
if (ret) {
84
if rdma.found()
114
error_setg(errp, "Failed to lock byte %d", off);
85
config_host_data.set('HAVE_IBV_ADVISE_MR',
115
return ret;
86
cc.has_function('ibv_advise_mr',
116
+ } else if (s) {
117
+ s->locked_shared_perm |= bit;
118
}
119
- } else if (unlock) {
120
+ } else if (unlock && (locked_shared_perm & bit) &&
121
+ !(shared_perm_lock_bits & bit)) {
122
ret = qemu_unlock_fd(fd, off, 1);
123
if (ret) {
124
error_setg(errp, "Failed to unlock byte %d", off);
125
return ret;
126
+ } else if (s) {
127
+ s->locked_shared_perm &= ~bit;
128
}
129
}
130
}
131
@@ -XXX,XX +XXX,XX @@ static int raw_handle_perm_lock(BlockDriverState *bs,
132
133
switch (op) {
134
case RAW_PL_PREPARE:
135
- ret = raw_apply_lock_bytes(s->lock_fd, s->perm | new_perm,
136
+ ret = raw_apply_lock_bytes(s, s->lock_fd, s->perm | new_perm,
137
~s->shared_perm | ~new_shared,
138
false, errp);
139
if (!ret) {
140
@@ -XXX,XX +XXX,XX @@ static int raw_handle_perm_lock(BlockDriverState *bs,
141
op = RAW_PL_ABORT;
142
/* fall through to unlock bytes. */
143
case RAW_PL_ABORT:
144
- raw_apply_lock_bytes(s->lock_fd, s->perm, ~s->shared_perm,
145
+ raw_apply_lock_bytes(s, s->lock_fd, s->perm, ~s->shared_perm,
146
true, &local_err);
147
if (local_err) {
148
/* Theoretically the above call only unlocks bytes and it cannot
149
@@ -XXX,XX +XXX,XX @@ static int raw_handle_perm_lock(BlockDriverState *bs,
150
}
151
break;
152
case RAW_PL_COMMIT:
153
- raw_apply_lock_bytes(s->lock_fd, new_perm, ~new_shared,
154
+ raw_apply_lock_bytes(s, s->lock_fd, new_perm, ~new_shared,
155
true, &local_err);
156
if (local_err) {
157
/* Theoretically the above call only unlocks bytes and it cannot
158
@@ -XXX,XX +XXX,XX @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
159
shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
160
161
/* Step one: Take locks */
162
- result = raw_apply_lock_bytes(fd, perm, ~shared, false, errp);
163
+ result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
164
if (result < 0) {
165
goto out_close;
166
}
167
@@ -XXX,XX +XXX,XX @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
168
}
169
170
out_unlock:
171
- raw_apply_lock_bytes(fd, 0, 0, true, &local_err);
172
+ raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
173
if (local_err) {
174
/* The above call should not fail, and if it does, that does
175
* not mean the whole creation operation has failed. So
176
--
87
--
177
2.19.1
88
2.35.3
178
179
diff view generated by jsdifflib
1
From: Peter Maydell <peter.maydell@linaro.org>
1
From: Fabian Ebner <f.ebner@proxmox.com>
2
2
3
In the function external_snapshot_prepare() we have a
3
On 64-bit platforms, assigning SIZE_MAX to the int64_t max_pdiscard
4
BlockdevSnapshotSync struct, which has the usual combination
4
results in a negative value, and the following assertion would trigger
5
of has_snapshot_node_name and snapshot_node_name fields for an
5
down the line (it's not the same max_pdiscard, but computed from the
6
optional field. We set up a local variable
6
other one):
7
const char *snapshot_node_name =
7
qemu-system-x86_64: ../block/io.c:3166: bdrv_co_pdiscard: Assertion
8
s->has_snapshot_node_name ? s->snapshot_node_name : NULL;
8
`max_pdiscard >= bs->bl.request_alignment' failed.
9
9
10
and then mostly use "if (!snapshot_node_name)" for checking
10
On 32-bit platforms, it's fine to keep using SIZE_MAX.
11
whether we have a snapshot node name. The exception is that in
12
one place we check s->has_snapshot_node_name instead. This
13
confuses Coverity (CID 1396473), which thinks it might be
14
possible to get here with s->has_snapshot_node_name true but
15
snapshot_node_name NULL, and warns that the call to
16
qdict_put_str() will segfault in that case.
17
11
18
Make the code consistent and unconfuse Coverity by using
12
The assertion in qemu_gluster_co_pdiscard() is checking that the value
19
the same check for this conditional that we do in the rest
13
of 'bytes' can safely be passed to glfs_discard_async(), which takes a
20
of the surrounding code.
14
size_t for the argument in question, so it is kept as is. And since
15
max_pdiscard is still <= SIZE_MAX, relying on max_pdiscard is still
16
fine.
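A minimal standalone illustration of the overflow (not part of the patch;
QEMU's MIN() macro is written out as a plain conditional here):

    #include <assert.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* On an LP64 host SIZE_MAX is 2^64 - 1 and wraps to -1 here. */
        int64_t max_pdiscard = SIZE_MAX;
        printf("max_pdiscard = %" PRId64 "\n", max_pdiscard);

        /* What the patch does: clamp to INT64_MAX on 64-bit hosts. */
        max_pdiscard = SIZE_MAX < INT64_MAX ? (int64_t)SIZE_MAX : INT64_MAX;
        assert(max_pdiscard >= 512); /* mimics the alignment assertion */
        return 0;
    }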
21
17
22
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
18
Fixes: 0c8022876f ("block: use int64_t instead of int in driver discard handlers")
23
Reviewed-by: Alberto Garcia <berto@igalia.com>
19
Cc: qemu-stable@nongnu.org
20
Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
21
Message-Id: <20220520075922.43972-1-f.ebner@proxmox.com>
22
Reviewed-by: Eric Blake <eblake@redhat.com>
23
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
24
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
24
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
25
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
25
---
26
---
26
blockdev.c | 2 +-
27
block/gluster.c | 2 +-
27
1 file changed, 1 insertion(+), 1 deletion(-)
28
1 file changed, 1 insertion(+), 1 deletion(-)
28
29
29
diff --git a/blockdev.c b/blockdev.c
30
diff --git a/block/gluster.c b/block/gluster.c
30
index XXXXXXX..XXXXXXX 100644
31
index XXXXXXX..XXXXXXX 100644
31
--- a/blockdev.c
32
--- a/block/gluster.c
32
+++ b/blockdev.c
33
+++ b/block/gluster.c
33
@@ -XXX,XX +XXX,XX @@ static void external_snapshot_prepare(BlkActionState *common,
34
@@ -XXX,XX +XXX,XX @@ out:
34
}
35
static void qemu_gluster_refresh_limits(BlockDriverState *bs, Error **errp)
35
36
{
36
options = qdict_new();
37
bs->bl.max_transfer = GLUSTER_MAX_TRANSFER;
37
- if (s->has_snapshot_node_name) {
38
- bs->bl.max_pdiscard = SIZE_MAX;
38
+ if (snapshot_node_name) {
39
+ bs->bl.max_pdiscard = MIN(SIZE_MAX, INT64_MAX);
39
qdict_put_str(options, "node-name", snapshot_node_name);
40
}
40
}
41
41
qdict_put_str(options, "driver", format);
42
static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
42
--
43
--
43
2.19.1
44
2.35.3
44
45
New patch
1
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
1
2
3
It seems that aio_wait_kick() always required a memory barrier
4
or atomic operation in the caller, but nobody actually
5
took care of doing it.
6
7
Let's put the barrier in the function instead, and pair it
8
with another one in AIO_WAIT_WHILE. Read aio_wait_kick()
9
comment for further explanation.
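For illustration only (not from the patch), a sketch of the pairing this
establishes; the job_done flag is made up and stands in for whatever
condition the block layer actually waits on, and the usual QEMU headers
are assumed to be available:

    #include "qemu/osdep.h"
    #include "qemu/atomic.h"
    #include "block/aio-wait.h"

    static bool job_done;

    /* Runs in the main loop thread. */
    static void wait_for_job(AioContext *ctx)
    {
        /* Increments num_waiters, then smp_mb(), then polls the condition. */
        AIO_WAIT_WHILE(ctx, !qatomic_read(&job_done));
    }

    /* Runs in another thread once the work is finished. */
    static void finish_job(void)
    {
        qatomic_set(&job_done, true);
        /*
         * The smp_mb() now inside aio_wait_kick() pairs with the one in
         * AIO_WAIT_WHILE(): either the waiter sees job_done == true, or
         * the kick sees the incremented num_waiters and schedules the
         * dummy bottom half that wakes the waiter up.
         */
        aio_wait_kick();
    }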
10
11
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
12
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
13
Message-Id: <20220524173054.12651-1-eesposit@redhat.com>
14
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
15
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
16
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
17
---
18
include/block/aio-wait.h | 2 ++
19
util/aio-wait.c | 16 +++++++++++++++-
20
2 files changed, 17 insertions(+), 1 deletion(-)
21
22
diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
23
index XXXXXXX..XXXXXXX 100644
24
--- a/include/block/aio-wait.h
25
+++ b/include/block/aio-wait.h
26
@@ -XXX,XX +XXX,XX @@ extern AioWait global_aio_wait;
27
AioContext *ctx_ = (ctx); \
28
/* Increment wait_->num_waiters before evaluating cond. */ \
29
qatomic_inc(&wait_->num_waiters); \
30
+ /* Paired with smp_mb in aio_wait_kick(). */ \
31
+ smp_mb(); \
32
if (ctx_ && in_aio_context_home_thread(ctx_)) { \
33
while ((cond)) { \
34
aio_poll(ctx_, true); \
35
diff --git a/util/aio-wait.c b/util/aio-wait.c
36
index XXXXXXX..XXXXXXX 100644
37
--- a/util/aio-wait.c
38
+++ b/util/aio-wait.c
39
@@ -XXX,XX +XXX,XX @@ static void dummy_bh_cb(void *opaque)
40
41
void aio_wait_kick(void)
42
{
43
- /* The barrier (or an atomic op) is in the caller. */
44
+ /*
45
+ * Paired with smp_mb in AIO_WAIT_WHILE. Here we have:
46
+ * write(condition);
47
+ * aio_wait_kick() {
48
+ * smp_mb();
49
+ * read(num_waiters);
50
+ * }
51
+ *
52
+ * And in AIO_WAIT_WHILE:
53
+ * write(num_waiters);
54
+ * smp_mb();
55
+ * read(condition);
56
+ */
57
+ smp_mb();
58
+
59
if (qatomic_read(&global_aio_wait.num_waiters)) {
60
aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
61
}
62
--
63
2.35.3
1
From: Jeff Cody <jcody@redhat.com>
1
From: Eric Blake <eblake@redhat.com>
2
2
3
This adds configure options to control the following block drivers:
3
CID 1488362 points out that the second 'rc >= 0' check is now dead
4
code.
4
5
5
* Bochs
6
Reported-by: Peter Maydell <peter.maydell@linaro.org>
6
* Cloop
7
Fixes: 172f5f1a40 (nbd: remove peppering of nbd_client_connected)
7
* Dmg
8
Signed-off-by: Eric Blake <eblake@redhat.com>
8
* Qcow (V1)
9
Message-Id: <20220516210519.76135-1-eblake@redhat.com>
9
* Vdi
10
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
10
* Vvfat
11
Reviewed-by: Vladimir Sementsov-Ogievskiy <v.sementsov-og@mail.ru>
11
* qed
12
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
* parallels
13
---
13
* sheepdog
14
block/nbd.c | 8 ++------
15
1 file changed, 2 insertions(+), 6 deletions(-)
14
16
15
Each of these defaults to being enabled.
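For example, a stripped-down build could drop several of these formats at
configure time (any subset of the new switches can be combined in the same
way):

./configure --disable-bochs --disable-cloop --disable-dmg \
            --disable-qcow1 --disable-parallels --disable-sheepdog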
17
diff --git a/block/nbd.c b/block/nbd.c
16
17
Signed-off-by: Jeff Cody <jcody@redhat.com>
18
Signed-off-by: Markus Armbruster <armbru@redhat.com>
19
Message-id: 20181107063644.2254-1-armbru@redhat.com
20
Signed-off-by: Max Reitz <mreitz@redhat.com>
21
---
22
configure | 91 +++++++++++++++++++++++++++++++++++++++++++++
23
block/Makefile.objs | 22 ++++++++---
24
2 files changed, 107 insertions(+), 6 deletions(-)
25
26
diff --git a/configure b/configure
27
index XXXXXXX..XXXXXXX 100755
28
--- a/configure
29
+++ b/configure
30
@@ -XXX,XX +XXX,XX @@ tcmalloc="no"
31
jemalloc="no"
32
replication="yes"
33
vxhs=""
34
+bochs="yes"
35
+cloop="yes"
36
+dmg="yes"
37
+qcow1="yes"
38
+vdi="yes"
39
+vvfat="yes"
40
+qed="yes"
41
+parallels="yes"
42
+sheepdog="yes"
43
libxml2=""
44
docker="no"
45
debug_mutex="no"
46
@@ -XXX,XX +XXX,XX @@ for opt do
47
;;
48
--enable-vxhs) vxhs="yes"
49
;;
50
+ --disable-bochs) bochs="no"
51
+ ;;
52
+ --enable-bochs) bochs="yes"
53
+ ;;
54
+ --disable-cloop) cloop="no"
55
+ ;;
56
+ --enable-cloop) cloop="yes"
57
+ ;;
58
+ --disable-dmg) dmg="no"
59
+ ;;
60
+ --enable-dmg) dmg="yes"
61
+ ;;
62
+ --disable-qcow1) qcow1="no"
63
+ ;;
64
+ --enable-qcow1) qcow1="yes"
65
+ ;;
66
+ --disable-vdi) vdi="no"
67
+ ;;
68
+ --enable-vdi) vdi="yes"
69
+ ;;
70
+ --disable-vvfat) vvfat="no"
71
+ ;;
72
+ --enable-vvfat) vvfat="yes"
73
+ ;;
74
+ --disable-qed) qed="no"
75
+ ;;
76
+ --enable-qed) qed="yes"
77
+ ;;
78
+ --disable-parallels) parallels="no"
79
+ ;;
80
+ --enable-parallels) parallels="yes"
81
+ ;;
82
+ --disable-sheepdog) sheepdog="no"
83
+ ;;
84
+ --enable-sheepdog) sheepdog="yes"
85
+ ;;
86
--disable-vhost-user) vhost_user="no"
87
;;
88
--enable-vhost-user)
89
@@ -XXX,XX +XXX,XX @@ disabled with --disable-FEATURE, default is enabled if available:
90
qom-cast-debug cast debugging support
91
tools build qemu-io, qemu-nbd and qemu-image tools
92
vxhs Veritas HyperScale vDisk backend support
93
+ bochs bochs image format support
94
+ cloop cloop image format support
95
+ dmg dmg image format support
96
+ qcow1 qcow v1 image format support
97
+ vdi vdi image format support
98
+ vvfat vvfat image format support
99
+ qed qed image format support
100
+ parallels parallels image format support
101
+ sheepdog sheepdog block driver support
102
crypto-afalg Linux AF_ALG crypto backend driver
103
vhost-user vhost-user support
104
capstone capstone disassembler support
105
@@ -XXX,XX +XXX,XX @@ echo "jemalloc support $jemalloc"
106
echo "avx2 optimization $avx2_opt"
107
echo "replication support $replication"
108
echo "VxHS block device $vxhs"
109
+echo "bochs support $bochs"
110
+echo "cloop support $cloop"
111
+echo "dmg support $dmg"
112
+echo "qcow v1 support $qcow1"
113
+echo "vdi support $vdi"
114
+echo "vvfat support $vvfat"
115
+echo "qed support $qed"
116
+echo "parallels support $parallels"
117
+echo "sheepdog support $sheepdog"
118
echo "capstone $capstone"
119
echo "docker $docker"
120
echo "libpmem support $libpmem"
121
@@ -XXX,XX +XXX,XX @@ if test "$libpmem" = "yes" ; then
122
echo "CONFIG_LIBPMEM=y" >> $config_host_mak
123
fi
124
125
+if test "$bochs" = "yes" ; then
126
+ echo "CONFIG_BOCHS=y" >> $config_host_mak
127
+fi
128
+if test "$cloop" = "yes" ; then
129
+ echo "CONFIG_CLOOP=y" >> $config_host_mak
130
+fi
131
+if test "$dmg" = "yes" ; then
132
+ echo "CONFIG_DMG=y" >> $config_host_mak
133
+fi
134
+if test "$qcow1" = "yes" ; then
135
+ echo "CONFIG_QCOW1=y" >> $config_host_mak
136
+fi
137
+if test "$vdi" = "yes" ; then
138
+ echo "CONFIG_VDI=y" >> $config_host_mak
139
+fi
140
+if test "$vvfat" = "yes" ; then
141
+ echo "CONFIG_VVFAT=y" >> $config_host_mak
142
+fi
143
+if test "$qed" = "yes" ; then
144
+ echo "CONFIG_QED=y" >> $config_host_mak
145
+fi
146
+if test "$parallels" = "yes" ; then
147
+ echo "CONFIG_PARALLELS=y" >> $config_host_mak
148
+fi
149
+if test "$sheepdog" = "yes" ; then
150
+ echo "CONFIG_SHEEPDOG=y" >> $config_host_mak
151
+fi
152
+
153
if test "$tcg_interpreter" = "yes"; then
154
QEMU_INCLUDES="-iquote \$(SRC_PATH)/tcg/tci $QEMU_INCLUDES"
155
elif test "$ARCH" = "sparc64" ; then
156
diff --git a/block/Makefile.objs b/block/Makefile.objs
157
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
158
--- a/block/Makefile.objs
19
--- a/block/nbd.c
159
+++ b/block/Makefile.objs
20
+++ b/block/nbd.c
160
@@ -XXX,XX +XXX,XX @@
21
@@ -XXX,XX +XXX,XX @@ static int coroutine_fn nbd_co_send_request(BlockDriverState *bs,
161
-block-obj-y += raw-format.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o dmg.o
22
if (qiov) {
162
+block-obj-y += raw-format.o vmdk.o vpc.o
23
qio_channel_set_cork(s->ioc, true);
163
+block-obj-$(CONFIG_QCOW1) += qcow.o
24
rc = nbd_send_request(s->ioc, request);
164
+block-obj-$(CONFIG_VDI) += vdi.o
25
- if (rc >= 0) {
165
+block-obj-$(CONFIG_CLOOP) += cloop.o
26
- if (qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov,
166
+block-obj-$(CONFIG_BOCHS) += bochs.o
27
- NULL) < 0) {
167
+block-obj-$(CONFIG_VVFAT) += vvfat.o
28
- rc = -EIO;
168
+block-obj-$(CONFIG_DMG) += dmg.o
29
- }
169
+
30
- } else if (rc >= 0) {
170
block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o qcow2-bitmap.o
31
+ if (rc >= 0 && qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov,
171
-block-obj-y += qed.o qed-l2-cache.o qed-table.o qed-cluster.o
32
+ NULL) < 0) {
172
-block-obj-y += qed-check.o
33
rc = -EIO;
173
+block-obj-$(CONFIG_QED) += qed.o qed-l2-cache.o qed-table.o qed-cluster.o
34
}
174
+block-obj-$(CONFIG_QED) += qed-check.o
35
qio_channel_set_cork(s->ioc, false);
175
block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o
176
block-obj-y += quorum.o
177
-block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o
178
+block-obj-y += blkdebug.o blkverify.o blkreplay.o
179
+block-obj-$(CONFIG_PARALLELS) += parallels.o
180
block-obj-y += blklogwrites.o
181
block-obj-y += block-backend.o snapshot.o qapi.o
182
block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o
183
@@ -XXX,XX +XXX,XX @@ block-obj-y += null.o mirror.o commit.o io.o create.o
184
block-obj-y += throttle-groups.o
185
block-obj-$(CONFIG_LINUX) += nvme.o
186
187
-block-obj-y += nbd.o nbd-client.o sheepdog.o
188
+block-obj-y += nbd.o nbd-client.o
189
+block-obj-$(CONFIG_SHEEPDOG) += sheepdog.o
190
block-obj-$(CONFIG_LIBISCSI) += iscsi.o
191
block-obj-$(if $(CONFIG_LIBISCSI),y,n) += iscsi-opts.o
192
block-obj-$(CONFIG_LIBNFS) += nfs.o
193
@@ -XXX,XX +XXX,XX @@ gluster.o-libs := $(GLUSTERFS_LIBS)
194
vxhs.o-libs := $(VXHS_LIBS)
195
ssh.o-cflags := $(LIBSSH2_CFLAGS)
196
ssh.o-libs := $(LIBSSH2_LIBS)
197
-block-obj-$(if $(CONFIG_BZIP2),m,n) += dmg-bz2.o
198
+block-obj-dmg-bz2-$(CONFIG_BZIP2) += dmg-bz2.o
199
+block-obj-$(if $(CONFIG_DMG),m,n) += $(block-obj-dmg-bz2-y)
200
dmg-bz2.o-libs := $(BZIP2_LIBS)
201
qcow.o-libs := -lz
202
linux-aio.o-libs := -laio
203
--
36
--
204
2.19.1
37
2.35.3
205
206
1
From: Fam Zheng <famz@redhat.com>
1
From: Xie Yongji <xieyongji@bytedance.com>
2
2
3
The lock_fd field is not strictly necessary because transferring locked
3
Add a 'serial' option to allow the user to specify this value
4
bytes from the old fd to the new one shouldn't fail anyway. This spares the
4
explicitly. The default value is changed to an empty
5
user one fd per image.
5
string, as we did in "hw/block/virtio-blk.c".
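A hypothetical export definition using the new option (the id, node name
and serial string are made up):

--export type=vduse-blk,id=export0,node-name=disk0,serial=QEMU0001

When 'serial' is omitted, the guest now sees an empty serial number rather
than the export id.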
6
6
7
Signed-off-by: Fam Zheng <famz@redhat.com>
7
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
8
Reviewed-by: Max Reitz <mreitz@redhat.com>
8
Message-Id: <20220614051532.92-6-xieyongji@bytedance.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
9
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
10
---
10
---
11
block/file-posix.c | 37 +++++++++++++------------------------
11
qapi/block-export.json | 4 +++-
12
1 file changed, 13 insertions(+), 24 deletions(-)
12
docs/tools/qemu-storage-daemon.rst | 2 +-
13
block/export/virtio-blk-handler.h | 2 +-
14
block/export/vduse-blk.c | 20 ++++++++++++++------
15
block/export/vhost-user-blk-server.c | 4 +++-
16
storage-daemon/qemu-storage-daemon.c | 1 +
17
6 files changed, 23 insertions(+), 10 deletions(-)
13
18
14
diff --git a/block/file-posix.c b/block/file-posix.c
19
diff --git a/qapi/block-export.json b/qapi/block-export.json
15
index XXXXXXX..XXXXXXX 100644
20
index XXXXXXX..XXXXXXX 100644
16
--- a/block/file-posix.c
21
--- a/qapi/block-export.json
17
+++ b/block/file-posix.c
22
+++ b/qapi/block-export.json
18
@@ -XXX,XX +XXX,XX @@ do { \
23
@@ -XXX,XX +XXX,XX @@
19
24
# @queue-size: the size of virtqueue. Defaults to 256.
20
typedef struct BDRVRawState {
25
# @logical-block-size: Logical block size in bytes. Range [512, PAGE_SIZE]
21
int fd;
26
# and must be power of 2. Defaults to 512 bytes.
22
- int lock_fd;
27
+# @serial: the serial number of virtio block device. Defaults to empty string.
23
bool use_lock;
28
#
24
int type;
29
# Since: 7.1
25
int open_flags;
30
##
26
@@ -XXX,XX +XXX,XX @@ typedef struct BDRVRawState {
31
{ 'struct': 'BlockExportOptionsVduseBlk',
27
uint64_t shared_perm;
32
'data': { '*num-queues': 'uint16',
28
33
'*queue-size': 'uint16',
29
/* The perms bits whose corresponding bytes are already locked in
34
- '*logical-block-size': 'size'} }
30
- * s->lock_fd. */
35
+ '*logical-block-size': 'size',
31
+ * s->fd. */
36
+ '*serial': 'str' } }
32
uint64_t locked_perm;
37
33
uint64_t locked_shared_perm;
38
##
34
39
# @NbdServerAddOptions:
35
@@ -XXX,XX +XXX,XX @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
40
diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
41
index XXXXXXX..XXXXXXX 100644
42
--- a/docs/tools/qemu-storage-daemon.rst
43
+++ b/docs/tools/qemu-storage-daemon.rst
44
@@ -XXX,XX +XXX,XX @@ Standard options:
45
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
46
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
47
--export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
48
- --export [type=]vduse-blk,id=<id>,node-name=<node-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>]
49
+ --export [type=]vduse-blk,id=<id>,node-name=<node-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]
50
51
is a block export definition. ``node-name`` is the block node that should be
52
exported. ``writable`` determines whether or not the export allows write
53
diff --git a/block/export/virtio-blk-handler.h b/block/export/virtio-blk-handler.h
54
index XXXXXXX..XXXXXXX 100644
55
--- a/block/export/virtio-blk-handler.h
56
+++ b/block/export/virtio-blk-handler.h
57
@@ -XXX,XX +XXX,XX @@
58
59
typedef struct {
60
BlockBackend *blk;
61
- const char *serial;
62
+ char *serial;
63
uint32_t logical_block_size;
64
bool writable;
65
} VirtioBlkHandler;
66
diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
67
index XXXXXXX..XXXXXXX 100644
68
--- a/block/export/vduse-blk.c
69
+++ b/block/export/vduse-blk.c
70
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
71
Error *local_err = NULL;
72
struct virtio_blk_config config = { 0 };
73
uint64_t features;
74
- int i;
75
+ int i, ret;
76
77
if (vblk_opts->has_num_queues) {
78
num_queues = vblk_opts->num_queues;
79
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
36
}
80
}
37
s->fd = fd;
81
vblk_exp->num_queues = num_queues;
38
82
vblk_exp->handler.blk = exp->blk;
39
- s->lock_fd = -1;
83
- vblk_exp->handler.serial = exp->id;
40
- if (s->use_lock) {
84
+ vblk_exp->handler.serial = g_strdup(vblk_opts->has_serial ?
41
- fd = qemu_open(filename, s->open_flags);
85
+ vblk_opts->serial : "");
42
- if (fd < 0) {
86
vblk_exp->handler.logical_block_size = logical_block_size;
43
- ret = -errno;
87
vblk_exp->handler.writable = opts->writable;
44
- error_setg_errno(errp, errno, "Could not open '%s' for locking",
88
45
- filename);
89
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
46
- qemu_close(s->fd);
90
vblk_exp);
47
- goto fail;
91
if (!vblk_exp->dev) {
48
- }
92
error_setg(errp, "failed to create vduse device");
49
- s->lock_fd = fd;
93
- return -ENOMEM;
50
- }
94
+ ret = -ENOMEM;
51
s->perm = 0;
95
+ goto err_dev;
52
s->shared_perm = BLK_PERM_ALL;
53
54
@@ -XXX,XX +XXX,XX @@ static int raw_handle_perm_lock(BlockDriverState *bs,
55
return 0;
56
}
96
}
57
97
58
- assert(s->lock_fd > 0);
98
vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s",
59
-
99
g_get_tmp_dir(), exp->id);
60
switch (op) {
100
if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) {
61
case RAW_PL_PREPARE:
101
error_setg(errp, "failed to set reconnect log file");
62
- ret = raw_apply_lock_bytes(s, s->lock_fd, s->perm | new_perm,
102
- vduse_dev_destroy(vblk_exp->dev);
63
+ ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
103
- g_free(vblk_exp->recon_file);
64
~s->shared_perm | ~new_shared,
104
- return -EINVAL;
65
false, errp);
105
+ ret = -EINVAL;
66
if (!ret) {
106
+ goto err;
67
- ret = raw_check_lock_bytes(s->lock_fd, new_perm, new_shared, errp);
68
+ ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
69
if (!ret) {
70
return 0;
71
}
72
@@ -XXX,XX +XXX,XX @@ static int raw_handle_perm_lock(BlockDriverState *bs,
73
op = RAW_PL_ABORT;
74
/* fall through to unlock bytes. */
75
case RAW_PL_ABORT:
76
- raw_apply_lock_bytes(s, s->lock_fd, s->perm, ~s->shared_perm,
77
+ raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
78
true, &local_err);
79
if (local_err) {
80
/* Theoretically the above call only unlocks bytes and it cannot
81
@@ -XXX,XX +XXX,XX @@ static int raw_handle_perm_lock(BlockDriverState *bs,
82
}
83
break;
84
case RAW_PL_COMMIT:
85
- raw_apply_lock_bytes(s, s->lock_fd, new_perm, ~new_shared,
86
+ raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
87
true, &local_err);
88
if (local_err) {
89
/* Theoretically the above call only unlocks bytes and it cannot
90
@@ -XXX,XX +XXX,XX @@ static void raw_reopen_commit(BDRVReopenState *state)
91
{
92
BDRVRawReopenState *rs = state->opaque;
93
BDRVRawState *s = state->bs->opaque;
94
+ Error *local_err = NULL;
95
96
s->check_cache_dropped = rs->check_cache_dropped;
97
s->open_flags = rs->open_flags;
98
99
+ /* Copy locks to the new fd before closing the old one. */
100
+ raw_apply_lock_bytes(NULL, rs->fd, s->locked_perm,
101
+ ~s->locked_shared_perm, false, &local_err);
102
+ if (local_err) {
103
+ /* shouldn't fail in a sane host, but report it just in case. */
104
+ error_report_err(local_err);
105
+ }
106
qemu_close(s->fd);
107
s->fd = rs->fd;
108
109
@@ -XXX,XX +XXX,XX @@ static void raw_close(BlockDriverState *bs)
110
qemu_close(s->fd);
111
s->fd = -1;
112
}
107
}
113
- if (s->lock_fd >= 0) {
108
114
- qemu_close(s->lock_fd);
109
for (i = 0; i < num_queues; i++) {
115
- s->lock_fd = -1;
110
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
116
- }
111
blk_set_dev_ops(exp->blk, &vduse_block_ops, exp);
112
113
return 0;
114
+err:
115
+ vduse_dev_destroy(vblk_exp->dev);
116
+ g_free(vblk_exp->recon_file);
117
+err_dev:
118
+ g_free(vblk_exp->handler.serial);
119
+ return ret;
117
}
120
}
118
121
119
/**
122
static void vduse_blk_exp_delete(BlockExport *exp)
123
@@ -XXX,XX +XXX,XX @@ static void vduse_blk_exp_delete(BlockExport *exp)
124
unlink(vblk_exp->recon_file);
125
}
126
g_free(vblk_exp->recon_file);
127
+ g_free(vblk_exp->handler.serial);
128
}
129
130
static void vduse_blk_exp_request_shutdown(BlockExport *exp)
131
diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
132
index XXXXXXX..XXXXXXX 100644
133
--- a/block/export/vhost-user-blk-server.c
134
+++ b/block/export/vhost-user-blk-server.c
135
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
136
return -EINVAL;
137
}
138
vexp->handler.blk = exp->blk;
139
- vexp->handler.serial = "vhost_user_blk";
140
+ vexp->handler.serial = g_strdup("vhost_user_blk");
141
vexp->handler.logical_block_size = logical_block_size;
142
vexp->handler.writable = opts->writable;
143
144
@@ -XXX,XX +XXX,XX @@ static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
145
num_queues, &vu_blk_iface, errp)) {
146
blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
147
blk_aio_detach, vexp);
148
+ g_free(vexp->handler.serial);
149
return -EADDRNOTAVAIL;
150
}
151
152
@@ -XXX,XX +XXX,XX @@ static void vu_blk_exp_delete(BlockExport *exp)
153
154
blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
155
vexp);
156
+ g_free(vexp->handler.serial);
157
}
158
159
const BlockExportDriver blk_exp_vhost_user_blk = {
160
diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
161
index XXXXXXX..XXXXXXX 100644
162
--- a/storage-daemon/qemu-storage-daemon.c
163
+++ b/storage-daemon/qemu-storage-daemon.c
164
@@ -XXX,XX +XXX,XX @@ static void help(void)
165
" [,writable=on|off][,num-queues=<num-queues>]\n"
166
" [,queue-size=<queue-size>]\n"
167
" [,logical-block-size=<logical-block-size>]\n"
168
+" [,serial=<serial-number>]\n"
169
" export the specified block node as a vduse-blk\n"
170
" device using the id as the VDUSE device name\n"
171
"\n"
120
--
172
--
121
2.19.1
173
2.35.3
122
123
1
From: Liam Merwick <Liam.Merwick@oracle.com>
1
From: Xie Yongji <xieyongji@bytedance.com>
2
2
3
Although the function block_job_get() can return NULL, it would be a
3
Currently we use the 'id' option as the name of the VDUSE device.
4
serious bug if it did so: if the job started successfully, it yields
4
It's a bit confusing since we use one value for two different
5
before executing anything; otherwise, commit_active_start() would
5
purposes: the ID to identify the export within QEMU (must be
6
have returned an error. However, as a precaution, before dereferencing
6
distinct from any other exports in the same QEMU process, but
7
the 'job' pointer in img_commit(), assert that it is not NULL.
7
can overlap with names used by other processes), and the VDUSE
8
name to uniquely identify it on the host (must be distinct from
9
other VDUSE devices on the same host, but can overlap with other
10
export types like NBD in the same process). To make it clear,
11
this patch adds a separate 'name' option to specify the VDUSE
12
name for the vduse-blk export instead.
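With this change a vduse-blk export definition might look as follows (the
values are made up); the 'name' is what vdpa(8) then refers to on the host:

--export type=vduse-blk,id=export0,node-name=disk0,name=vduse-disk0
# vdpa dev add name vduse-disk0 mgmtdev vduse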
8
13
9
Signed-off-by: Liam Merwick <Liam.Merwick@oracle.com>
14
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
10
Reviewed-by: Max Reitz <mreitz@redhat.com>
15
Message-Id: <20220614051532.92-7-xieyongji@bytedance.com>
11
Message-id: 1541453919-25973-4-git-send-email-Liam.Merwick@oracle.com
16
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
12
Signed-off-by: Max Reitz <mreitz@redhat.com>
13
---
17
---
14
qemu-img.c | 1 +
18
qapi/block-export.json | 7 ++++---
15
1 file changed, 1 insertion(+)
19
docs/tools/qemu-storage-daemon.rst | 5 +++--
20
block/export/vduse-blk.c | 4 ++--
21
storage-daemon/qemu-storage-daemon.c | 8 ++++----
22
4 files changed, 13 insertions(+), 11 deletions(-)
16
23
17
diff --git a/qemu-img.c b/qemu-img.c
24
diff --git a/qapi/block-export.json b/qapi/block-export.json
18
index XXXXXXX..XXXXXXX 100644
25
index XXXXXXX..XXXXXXX 100644
19
--- a/qemu-img.c
26
--- a/qapi/block-export.json
20
+++ b/qemu-img.c
27
+++ b/qapi/block-export.json
21
@@ -XXX,XX +XXX,XX @@ static int img_commit(int argc, char **argv)
28
@@ -XXX,XX +XXX,XX @@
29
#
30
# A vduse-blk block export.
31
#
32
+# @name: the name of VDUSE device (must be unique across the host).
33
# @num-queues: the number of virtqueues. Defaults to 1.
34
# @queue-size: the size of virtqueue. Defaults to 256.
35
# @logical-block-size: Logical block size in bytes. Range [512, PAGE_SIZE]
36
@@ -XXX,XX +XXX,XX @@
37
# Since: 7.1
38
##
39
{ 'struct': 'BlockExportOptionsVduseBlk',
40
- 'data': { '*num-queues': 'uint16',
41
+ 'data': { 'name': 'str',
42
+ '*num-queues': 'uint16',
43
'*queue-size': 'uint16',
44
'*logical-block-size': 'size',
45
'*serial': 'str' } }
46
@@ -XXX,XX +XXX,XX @@
47
# Describes a block export, i.e. how single node should be exported on an
48
# external interface.
49
#
50
-# @id: A unique identifier for the block export (across the host for vduse-blk
51
-# export type or across all export types for other types)
52
+# @id: A unique identifier for the block export (across all export types)
53
#
54
# @node-name: The node name of the block node to be exported (since: 5.2)
55
#
56
diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
57
index XXXXXXX..XXXXXXX 100644
58
--- a/docs/tools/qemu-storage-daemon.rst
59
+++ b/docs/tools/qemu-storage-daemon.rst
60
@@ -XXX,XX +XXX,XX @@ Standard options:
61
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
62
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
63
--export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
64
- --export [type=]vduse-blk,id=<id>,node-name=<node-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]
65
+ --export [type=]vduse-blk,id=<id>,node-name=<node-name>,name=<vduse-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]
66
67
is a block export definition. ``node-name`` is the block node that should be
68
exported. ``writable`` determines whether or not the export allows write
69
@@ -XXX,XX +XXX,XX @@ Standard options:
70
``allow-other`` to auto (the default) will try enabling this option, and on
71
error fall back to disabling it.
72
73
- The ``vduse-blk`` export type uses the ``id`` as the VDUSE device name.
74
+ The ``vduse-blk`` export type takes a ``name`` (must be unique across the host)
75
+ to create the VDUSE device.
76
``num-queues`` sets the number of virtqueues (the default is 1).
77
``queue-size`` sets the virtqueue descriptor table size (the default is 256).
78
79
diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
80
index XXXXXXX..XXXXXXX 100644
81
--- a/block/export/vduse-blk.c
82
+++ b/block/export/vduse-blk.c
83
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
84
features |= 1ULL << VIRTIO_BLK_F_RO;
22
}
85
}
23
86
24
job = block_job_get("commit");
87
- vblk_exp->dev = vduse_dev_create(exp->id, VIRTIO_ID_BLOCK, 0,
25
+ assert(job);
88
+ vblk_exp->dev = vduse_dev_create(vblk_opts->name, VIRTIO_ID_BLOCK, 0,
26
run_block_job(job, &local_err);
89
features, num_queues,
27
if (local_err) {
90
sizeof(struct virtio_blk_config),
28
goto unref_backing;
91
(char *)&config, &vduse_blk_ops,
92
@@ -XXX,XX +XXX,XX @@ static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
93
}
94
95
vblk_exp->recon_file = g_strdup_printf("%s/vduse-blk-%s",
96
- g_get_tmp_dir(), exp->id);
97
+ g_get_tmp_dir(), vblk_opts->name);
98
if (vduse_set_reconnect_log_file(vblk_exp->dev, vblk_exp->recon_file)) {
99
error_setg(errp, "failed to set reconnect log file");
100
ret = -EINVAL;
101
diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
102
index XXXXXXX..XXXXXXX 100644
103
--- a/storage-daemon/qemu-storage-daemon.c
104
+++ b/storage-daemon/qemu-storage-daemon.c
105
@@ -XXX,XX +XXX,XX @@ static void help(void)
106
#endif /* CONFIG_VHOST_USER_BLK_SERVER */
107
#ifdef CONFIG_VDUSE_BLK_EXPORT
108
" --export [type=]vduse-blk,id=<id>,node-name=<node-name>\n"
109
-" [,writable=on|off][,num-queues=<num-queues>]\n"
110
-" [,queue-size=<queue-size>]\n"
111
+" ,name=<vduse-name>[,writable=on|off]\n"
112
+" [,num-queues=<num-queues>][,queue-size=<queue-size>]\n"
113
" [,logical-block-size=<logical-block-size>]\n"
114
" [,serial=<serial-number>]\n"
115
-" export the specified block node as a vduse-blk\n"
116
-" device using the id as the VDUSE device name\n"
117
+" export the specified block node as a\n"
118
+" vduse-blk device\n"
119
"\n"
120
#endif /* CONFIG_VDUSE_BLK_EXPORT */
121
" --monitor [chardev=]name[,mode=control][,pretty[=on|off]]\n"
29
--
122
--
30
2.19.1
123
2.35.3
31
32