1 | The following changes since commit 67f17e23baca5dd545fe98b01169cc351a70fe35: | 1 | The following changes since commit 4c55b1d0bad8a703f0499fe62e3761a0cd288da3: |
---|---|---|---|
2 | 2 | ||
3 | Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging (2020-03-06 17:15:36 +0000) | 3 | Merge remote-tracking branch 'remotes/armbru/tags/pull-error-2017-04-24' into staging (2017-04-24 14:49:48 +0100) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the git repository at: |
6 | 6 | ||
7 | https://github.com/stefanha/qemu.git tags/block-pull-request | 7 | git://github.com/codyprime/qemu-kvm-jtc.git tags/block-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to d37d0e365afb6825a90d8356fc6adcc1f58f40f3: | 9 | for you to fetch changes up to ecfa185400ade2abc9915efa924cbad1e15a21a4: |
10 | 10 | ||
11 | aio-posix: remove idle poll handlers to improve scalability (2020-03-09 16:45:16 +0000) | 11 | qemu-iotests: _cleanup_qemu must be called on exit (2017-04-24 15:09:33 -0400) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Pull request | 14 | Pull v2, with 32-bit errors fixed. I don't have OS X to test compile on, |
15 | 15 | but I think it is safe to assume the cause of the compile error was the same. | |
16 | ---------------------------------------------------------------- | 16 | ---------------------------------------------------------------- |
17 | 17 | ||
18 | Stefan Hajnoczi (9): | 18 | Ashish Mittal (2): |
19 | qemu/queue.h: clear linked list pointers on remove | 19 | block/vxhs.c: Add support for a new block device type called "vxhs" |
20 | aio-posix: remove confusing QLIST_SAFE_REMOVE() | 20 | block/vxhs.c: Add qemu-iotests for new block device type "vxhs" |
21 | aio-posix: completely stop polling when disabled | ||
22 | aio-posix: move RCU_READ_LOCK() into run_poll_handlers() | ||
23 | aio-posix: extract ppoll(2) and epoll(7) fd monitoring | ||
24 | aio-posix: simplify FDMonOps->update() prototype | ||
25 | aio-posix: add io_uring fd monitoring implementation | ||
26 | aio-posix: support userspace polling of fd monitoring | ||
27 | aio-posix: remove idle poll handlers to improve scalability | ||
28 | 21 | ||
29 | MAINTAINERS | 2 + | 22 | Jeff Cody (10): |
30 | configure | 5 + | 23 | qemu-iotests: exclude vxhs from image creation via protocol |
31 | include/block/aio.h | 71 ++++++- | 24 | block: add bdrv_set_read_only() helper function |
32 | include/qemu/queue.h | 19 +- | 25 | block: do not set BDS read_only if copy_on_read enabled |
33 | util/Makefile.objs | 3 + | 26 | block: honor BDRV_O_ALLOW_RDWR when clearing bs->read_only |
34 | util/aio-posix.c | 451 ++++++++++++++---------------------------- | 27 | block: code movement |
35 | util/aio-posix.h | 81 ++++++++ | 28 | block: introduce bdrv_can_set_read_only() |
36 | util/fdmon-epoll.c | 155 +++++++++++++++ | 29 | block: use bdrv_can_set_read_only() during reopen |
37 | util/fdmon-io_uring.c | 332 +++++++++++++++++++++++++++++++ | 30 | block/rbd - update variable names to more apt names |
38 | util/fdmon-poll.c | 107 ++++++++++ | 31 | block/rbd: Add support for reopen() |
39 | util/trace-events | 2 + | 32 | qemu-iotests: _cleanup_qemu must be called on exit |
40 | 11 files changed, 915 insertions(+), 313 deletions(-) | 33 | |
41 | create mode 100644 util/aio-posix.h | 34 | block.c | 56 +++- |
42 | create mode 100644 util/fdmon-epoll.c | 35 | block/Makefile.objs | 2 + |
43 | create mode 100644 util/fdmon-io_uring.c | 36 | block/bochs.c | 5 +- |
44 | create mode 100644 util/fdmon-poll.c | 37 | block/cloop.c | 5 +- |
38 | block/dmg.c | 6 +- | ||
39 | block/rbd.c | 65 +++-- | ||
40 | block/trace-events | 17 ++ | ||
41 | block/vvfat.c | 19 +- | ||
42 | block/vxhs.c | 575 +++++++++++++++++++++++++++++++++++++++ | ||
43 | configure | 39 +++ | ||
44 | include/block/block.h | 2 + | ||
45 | qapi/block-core.json | 23 +- | ||
46 | tests/qemu-iotests/017 | 1 + | ||
47 | tests/qemu-iotests/020 | 1 + | ||
48 | tests/qemu-iotests/028 | 1 + | ||
49 | tests/qemu-iotests/029 | 1 + | ||
50 | tests/qemu-iotests/073 | 1 + | ||
51 | tests/qemu-iotests/094 | 11 +- | ||
52 | tests/qemu-iotests/102 | 5 +- | ||
53 | tests/qemu-iotests/109 | 1 + | ||
54 | tests/qemu-iotests/114 | 1 + | ||
55 | tests/qemu-iotests/117 | 1 + | ||
56 | tests/qemu-iotests/130 | 2 + | ||
57 | tests/qemu-iotests/134 | 1 + | ||
58 | tests/qemu-iotests/140 | 1 + | ||
59 | tests/qemu-iotests/141 | 1 + | ||
60 | tests/qemu-iotests/143 | 1 + | ||
61 | tests/qemu-iotests/156 | 2 + | ||
62 | tests/qemu-iotests/158 | 1 + | ||
63 | tests/qemu-iotests/common | 6 + | ||
64 | tests/qemu-iotests/common.config | 13 + | ||
65 | tests/qemu-iotests/common.filter | 1 + | ||
66 | tests/qemu-iotests/common.rc | 19 ++ | ||
67 | 33 files changed, 844 insertions(+), 42 deletions(-) | ||
68 | create mode 100644 block/vxhs.c | ||
45 | 69 | ||
46 | -- | 70 | -- |
47 | 2.24.1 | 71 | 2.9.3 |
48 | 72 | ||
73 | diff view generated by jsdifflib |
1 | The ppoll(2) and epoll(7) file descriptor monitoring implementations are | 1 | From: Ashish Mittal <ashmit602@gmail.com> |
---|---|---|---|
2 | mixed with the core util/aio-posix.c code. Before adding another | ||
3 | implementation for Linux io_uring, extract out the existing | ||
4 | ones so there is a clear interface and the core code is simpler. | ||
5 | 2 | ||
6 | The new interface is AioContext->fdmon_ops, a pointer to a FDMonOps | 3 | Source code for the qnio library that this code loads can be downloaded from: |
7 | struct. See the patch for details. | 4 | https://github.com/VeritasHyperScale/libqnio.git |
8 | 5 | ||
9 | Semantic changes: | 6 | Sample command line using JSON syntax: |
10 | 1. ppoll(2) now reflects events from pollfds[] back into AioHandlers | 7 | ./x86_64-softmmu/qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 |
11 | while we're still on the clock for adaptive polling. This was | 8 | -k en-us -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 |
12 | already happening for epoll(7), so if it's really an issue then we'll | 9 | -msg timestamp=on |
13 | need to fix both in the future. | 10 | 'json:{"driver":"vxhs","vdisk-id":"c3e9095a-a5ee-4dce-afeb-2a59fb387410", |
14 | 2. epoll(7)'s fallback to ppoll(2) while external events are disabled | 11 | "server":{"host":"172.172.17.4","port":"9999"}}' |
15 | was broken when the number of fds exceeded the epoll(7) upgrade | ||
16 | threshold. I guess this code path simply wasn't tested and no one | ||
17 | noticed the bug. I didn't go out of my way to fix it but the correct | ||
18 | code is simpler than preserving the bug. | ||
19 | 12 | ||
20 | I also took some liberties in removing the unnecessary | 13 | Sample command line using URI syntax: |
21 | AioContext->epoll_available (just check AioContext->epollfd != -1 | 14 | qemu-img convert -f raw -O raw -n |
22 | instead) and AioContext->epoll_enabled (it's implicit if our | 15 | /var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad |
23 | AioContext->fdmon_ops callbacks are being invoked) fields. | 16 | vxhs://192.168.0.1:9999/c6718f6b-0401-441d-a8c3-1f0064d75ee0 |
24 | 17 | ||
25 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 18 | Sample command line using TLS credentials (run in secure mode): |
26 | Link: https://lore.kernel.org/r/20200305170806.1313245-4-stefanha@redhat.com | 19 | ./qemu-io --object |
27 | Message-Id: <20200305170806.1313245-4-stefanha@redhat.com> | 20 | tls-creds-x509,id=tls0,dir=/etc/pki/qemu/vxhs,endpoint=client -c 'read |
21 | -v 66000 2.5k' 'json:{"server.host": "127.0.0.1", "server.port": "9999", | ||
22 | "vdisk-id": "/test.raw", "driver": "vxhs", "tls-creds":"tls0"}' | ||
23 | |||
24 | [Jeff: Modified trace-events with the correct string formatting] | ||
25 | |||
26 | Signed-off-by: Ashish Mittal <Ashish.Mittal@veritas.com> | ||
27 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
28 | Reviewed-by: Jeff Cody <jcody@redhat.com> | ||
29 | Signed-off-by: Jeff Cody <jcody@redhat.com> | ||
30 | Message-id: 1491277689-24949-2-git-send-email-Ashish.Mittal@veritas.com | ||
28 | --- | 31 | --- |
29 | MAINTAINERS | 2 + | 32 | block/Makefile.objs | 2 + |
30 | include/block/aio.h | 36 +++++- | 33 | block/trace-events | 17 ++ |
31 | util/Makefile.objs | 2 + | 34 | block/vxhs.c | 575 +++++++++++++++++++++++++++++++++++++++++++++++++++ |
32 | util/aio-posix.c | 286 ++------------------------------------------ | 35 | configure | 39 ++++ |
33 | util/aio-posix.h | 61 ++++++++++ | 36 | qapi/block-core.json | 23 ++- |
34 | util/fdmon-epoll.c | 151 +++++++++++++++++++++++ | 37 | 5 files changed, 654 insertions(+), 2 deletions(-) |
35 | util/fdmon-poll.c | 104 ++++++++++++++++ | 38 | create mode 100644 block/vxhs.c |
36 | 7 files changed, 366 insertions(+), 276 deletions(-) | ||
37 | create mode 100644 util/aio-posix.h | ||
38 | create mode 100644 util/fdmon-epoll.c | ||
39 | create mode 100644 util/fdmon-poll.c | ||
40 | 39 | ||
41 | diff --git a/MAINTAINERS b/MAINTAINERS | 40 | diff --git a/block/Makefile.objs b/block/Makefile.objs |
42 | index XXXXXXX..XXXXXXX 100644 | 41 | index XXXXXXX..XXXXXXX 100644 |
43 | --- a/MAINTAINERS | 42 | --- a/block/Makefile.objs |
44 | +++ b/MAINTAINERS | 43 | +++ b/block/Makefile.objs |
45 | @@ -XXX,XX +XXX,XX @@ L: qemu-block@nongnu.org | 44 | @@ -XXX,XX +XXX,XX @@ block-obj-$(CONFIG_LIBNFS) += nfs.o |
46 | S: Supported | 45 | block-obj-$(CONFIG_CURL) += curl.o |
47 | F: util/async.c | 46 | block-obj-$(CONFIG_RBD) += rbd.o |
48 | F: util/aio-*.c | 47 | block-obj-$(CONFIG_GLUSTERFS) += gluster.o |
49 | +F: util/aio-*.h | 48 | +block-obj-$(CONFIG_VXHS) += vxhs.o |
50 | +F: util/fdmon-*.c | 49 | block-obj-$(CONFIG_LIBSSH2) += ssh.o |
51 | F: block/io.c | 50 | block-obj-y += accounting.o dirty-bitmap.o |
52 | F: migration/block* | 51 | block-obj-y += write-threshold.o |
53 | F: include/block/aio.h | 52 | @@ -XXX,XX +XXX,XX @@ rbd.o-cflags := $(RBD_CFLAGS) |
54 | diff --git a/include/block/aio.h b/include/block/aio.h | 53 | rbd.o-libs := $(RBD_LIBS) |
54 | gluster.o-cflags := $(GLUSTERFS_CFLAGS) | ||
55 | gluster.o-libs := $(GLUSTERFS_LIBS) | ||
56 | +vxhs.o-libs := $(VXHS_LIBS) | ||
57 | ssh.o-cflags := $(LIBSSH2_CFLAGS) | ||
58 | ssh.o-libs := $(LIBSSH2_LIBS) | ||
59 | block-obj-$(if $(CONFIG_BZIP2),m,n) += dmg-bz2.o | ||
60 | diff --git a/block/trace-events b/block/trace-events | ||
55 | index XXXXXXX..XXXXXXX 100644 | 61 | index XXXXXXX..XXXXXXX 100644 |
56 | --- a/include/block/aio.h | 62 | --- a/block/trace-events |
57 | +++ b/include/block/aio.h | 63 | +++ b/block/trace-events |
58 | @@ -XXX,XX +XXX,XX @@ struct ThreadPool; | 64 | @@ -XXX,XX +XXX,XX @@ qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s |
59 | struct LinuxAioState; | 65 | qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 |
60 | struct LuringState; | 66 | qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 |
61 | 67 | qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" | |
62 | +/* Callbacks for file descriptor monitoring implementations */ | 68 | + |
63 | +typedef struct { | 69 | +# block/vxhs.c |
64 | + /* | 70 | +vxhs_iio_callback(int error) "ctx is NULL: error %d" |
65 | + * update: | 71 | +vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no i/o %d, %d" |
66 | + * @ctx: the AioContext | 72 | +vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, errno %d" |
67 | + * @node: the handler | 73 | +vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d" |
68 | + * @is_new: true if the file descriptor is not yet being monitored | 74 | +vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, void *acb, int ret, int err) "IO ERROR (vDisk %s) FOR : Read/Write = %d size = %"PRIu64" offset = %"PRIu64" ACB = %p. Error = %d, errno = %d" |
69 | + * | 75 | +vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat ioctl failed, ret = %d, errno = %d" |
70 | + * Add/remove/modify a monitored file descriptor. There are three cases: | 76 | +vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s stat ioctl returned size %"PRIu64 |
71 | + * 1. node->pfd.events == 0 means remove the file descriptor. | 77 | +vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %"PRIu64 |
72 | + * 2. !is_new means modify an already monitored file descriptor. | 78 | +vxhs_parse_uri_filename(const char *filename) "URI passed via bdrv_parse_filename %s" |
73 | + * 3. is_new means add a new file descriptor. | 79 | +vxhs_open_vdiskid(const char *vdisk_id) "Opening vdisk-id %s" |
74 | + * | 80 | +vxhs_open_hostinfo(char *of_vsa_addr, int port) "Adding host %s:%d to BDRVVXHSState" |
75 | + * Called with ctx->list_lock acquired. | 81 | +vxhs_open_iio_open(const char *host) "Failed to connect to storage agent on host %s" |
76 | + */ | 82 | +vxhs_parse_uri_hostinfo(char *host, int port) "Host: IP %s, Port %d" |
77 | + void (*update)(AioContext *ctx, AioHandler *node, bool is_new); | 83 | +vxhs_close(char *vdisk_guid) "Closing vdisk %s" |
78 | + | 84 | +vxhs_get_creds(const char *cacert, const char *client_key, const char *client_cert) "cacert %s, client_key %s, client_cert %s" |
79 | + /* | 85 | diff --git a/block/vxhs.c b/block/vxhs.c |
80 | + * wait: | ||
81 | + * @ctx: the AioContext | ||
82 | + * @ready_list: list for handlers that become ready | ||
83 | + * @timeout: maximum duration to wait, in nanoseconds | ||
84 | + * | ||
85 | + * Wait for file descriptors to become ready and place them on ready_list. | ||
86 | + * | ||
87 | + * Called with ctx->list_lock incremented but not locked. | ||
88 | + * | ||
89 | + * Returns: number of ready file descriptors. | ||
90 | + */ | ||
91 | + int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout); | ||
92 | +} FDMonOps; | ||
93 | + | ||
94 | /* | ||
95 | * Each aio_bh_poll() call carves off a slice of the BH list, so that newly | ||
96 | * scheduled BHs are not processed until the next aio_bh_poll() call. All | ||
97 | @@ -XXX,XX +XXX,XX @@ struct AioContext { | ||
98 | |||
99 | /* epoll(7) state used when built with CONFIG_EPOLL */ | ||
100 | int epollfd; | ||
101 | - bool epoll_enabled; | ||
102 | - bool epoll_available; | ||
103 | + | ||
104 | + const FDMonOps *fdmon_ops; | ||
105 | }; | ||
106 | |||
107 | /** | ||
108 | diff --git a/util/Makefile.objs b/util/Makefile.objs | ||
109 | index XXXXXXX..XXXXXXX 100644 | ||
110 | --- a/util/Makefile.objs | ||
111 | +++ b/util/Makefile.objs | ||
112 | @@ -XXX,XX +XXX,XX @@ util-obj-y += aiocb.o async.o aio-wait.o thread-pool.o qemu-timer.o | ||
113 | util-obj-y += main-loop.o | ||
114 | util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o | ||
115 | util-obj-$(CONFIG_POSIX) += aio-posix.o | ||
116 | +util-obj-$(CONFIG_POSIX) += fdmon-poll.o | ||
117 | +util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o | ||
118 | util-obj-$(CONFIG_POSIX) += compatfd.o | ||
119 | util-obj-$(CONFIG_POSIX) += event_notifier-posix.o | ||
120 | util-obj-$(CONFIG_POSIX) += mmap-alloc.o | ||
121 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
122 | index XXXXXXX..XXXXXXX 100644 | ||
123 | --- a/util/aio-posix.c | ||
124 | +++ b/util/aio-posix.c | ||
125 | @@ -XXX,XX +XXX,XX @@ | ||
126 | #include "qemu/sockets.h" | ||
127 | #include "qemu/cutils.h" | ||
128 | #include "trace.h" | ||
129 | -#ifdef CONFIG_EPOLL_CREATE1 | ||
130 | -#include <sys/epoll.h> | ||
131 | -#endif | ||
132 | +#include "aio-posix.h" | ||
133 | |||
134 | -struct AioHandler | ||
135 | -{ | ||
136 | - GPollFD pfd; | ||
137 | - IOHandler *io_read; | ||
138 | - IOHandler *io_write; | ||
139 | - AioPollFn *io_poll; | ||
140 | - IOHandler *io_poll_begin; | ||
141 | - IOHandler *io_poll_end; | ||
142 | - void *opaque; | ||
143 | - bool is_external; | ||
144 | - QLIST_ENTRY(AioHandler) node; | ||
145 | - QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ | ||
146 | - QLIST_ENTRY(AioHandler) node_deleted; | ||
147 | -}; | ||
148 | - | ||
149 | -/* Add a handler to a ready list */ | ||
150 | -static void add_ready_handler(AioHandlerList *ready_list, | ||
151 | - AioHandler *node, | ||
152 | - int revents) | ||
153 | +void aio_add_ready_handler(AioHandlerList *ready_list, | ||
154 | + AioHandler *node, | ||
155 | + int revents) | ||
156 | { | ||
157 | QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */ | ||
158 | node->pfd.revents = revents; | ||
159 | QLIST_INSERT_HEAD(ready_list, node, node_ready); | ||
160 | } | ||
161 | |||
162 | -#ifdef CONFIG_EPOLL_CREATE1 | ||
163 | - | ||
164 | -/* The fd number threshold to switch to epoll */ | ||
165 | -#define EPOLL_ENABLE_THRESHOLD 64 | ||
166 | - | ||
167 | -static void aio_epoll_disable(AioContext *ctx) | ||
168 | -{ | ||
169 | - ctx->epoll_enabled = false; | ||
170 | - if (!ctx->epoll_available) { | ||
171 | - return; | ||
172 | - } | ||
173 | - ctx->epoll_available = false; | ||
174 | - close(ctx->epollfd); | ||
175 | -} | ||
176 | - | ||
177 | -static inline int epoll_events_from_pfd(int pfd_events) | ||
178 | -{ | ||
179 | - return (pfd_events & G_IO_IN ? EPOLLIN : 0) | | ||
180 | - (pfd_events & G_IO_OUT ? EPOLLOUT : 0) | | ||
181 | - (pfd_events & G_IO_HUP ? EPOLLHUP : 0) | | ||
182 | - (pfd_events & G_IO_ERR ? EPOLLERR : 0); | ||
183 | -} | ||
184 | - | ||
185 | -static bool aio_epoll_try_enable(AioContext *ctx) | ||
186 | -{ | ||
187 | - AioHandler *node; | ||
188 | - struct epoll_event event; | ||
189 | - | ||
190 | - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
191 | - int r; | ||
192 | - if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) { | ||
193 | - continue; | ||
194 | - } | ||
195 | - event.events = epoll_events_from_pfd(node->pfd.events); | ||
196 | - event.data.ptr = node; | ||
197 | - r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event); | ||
198 | - if (r) { | ||
199 | - return false; | ||
200 | - } | ||
201 | - } | ||
202 | - ctx->epoll_enabled = true; | ||
203 | - return true; | ||
204 | -} | ||
205 | - | ||
206 | -static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) | ||
207 | -{ | ||
208 | - struct epoll_event event; | ||
209 | - int r; | ||
210 | - int ctl; | ||
211 | - | ||
212 | - if (!ctx->epoll_enabled) { | ||
213 | - return; | ||
214 | - } | ||
215 | - if (!node->pfd.events) { | ||
216 | - ctl = EPOLL_CTL_DEL; | ||
217 | - } else { | ||
218 | - event.data.ptr = node; | ||
219 | - event.events = epoll_events_from_pfd(node->pfd.events); | ||
220 | - ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD; | ||
221 | - } | ||
222 | - | ||
223 | - r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event); | ||
224 | - if (r) { | ||
225 | - aio_epoll_disable(ctx); | ||
226 | - } | ||
227 | -} | ||
228 | - | ||
229 | -static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list, | ||
230 | - int64_t timeout) | ||
231 | -{ | ||
232 | - GPollFD pfd = { | ||
233 | - .fd = ctx->epollfd, | ||
234 | - .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR, | ||
235 | - }; | ||
236 | - AioHandler *node; | ||
237 | - int i, ret = 0; | ||
238 | - struct epoll_event events[128]; | ||
239 | - | ||
240 | - if (timeout > 0) { | ||
241 | - ret = qemu_poll_ns(&pfd, 1, timeout); | ||
242 | - if (ret > 0) { | ||
243 | - timeout = 0; | ||
244 | - } | ||
245 | - } | ||
246 | - if (timeout <= 0 || ret > 0) { | ||
247 | - ret = epoll_wait(ctx->epollfd, events, | ||
248 | - ARRAY_SIZE(events), | ||
249 | - timeout); | ||
250 | - if (ret <= 0) { | ||
251 | - goto out; | ||
252 | - } | ||
253 | - for (i = 0; i < ret; i++) { | ||
254 | - int ev = events[i].events; | ||
255 | - int revents = (ev & EPOLLIN ? G_IO_IN : 0) | | ||
256 | - (ev & EPOLLOUT ? G_IO_OUT : 0) | | ||
257 | - (ev & EPOLLHUP ? G_IO_HUP : 0) | | ||
258 | - (ev & EPOLLERR ? G_IO_ERR : 0); | ||
259 | - | ||
260 | - node = events[i].data.ptr; | ||
261 | - add_ready_handler(ready_list, node, revents); | ||
262 | - } | ||
263 | - } | ||
264 | -out: | ||
265 | - return ret; | ||
266 | -} | ||
267 | - | ||
268 | -static bool aio_epoll_enabled(AioContext *ctx) | ||
269 | -{ | ||
270 | - /* Fall back to ppoll when external clients are disabled. */ | ||
271 | - return !aio_external_disabled(ctx) && ctx->epoll_enabled; | ||
272 | -} | ||
273 | - | ||
274 | -static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds, | ||
275 | - unsigned npfd, int64_t timeout) | ||
276 | -{ | ||
277 | - if (!ctx->epoll_available) { | ||
278 | - return false; | ||
279 | - } | ||
280 | - if (aio_epoll_enabled(ctx)) { | ||
281 | - return true; | ||
282 | - } | ||
283 | - if (npfd >= EPOLL_ENABLE_THRESHOLD) { | ||
284 | - if (aio_epoll_try_enable(ctx)) { | ||
285 | - return true; | ||
286 | - } else { | ||
287 | - aio_epoll_disable(ctx); | ||
288 | - } | ||
289 | - } | ||
290 | - return false; | ||
291 | -} | ||
292 | - | ||
293 | -#else | ||
294 | - | ||
295 | -static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) | ||
296 | -{ | ||
297 | -} | ||
298 | - | ||
299 | -static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list, | ||
300 | - int64_t timeout) | ||
301 | -{ | ||
302 | - assert(false); | ||
303 | -} | ||
304 | - | ||
305 | -static bool aio_epoll_enabled(AioContext *ctx) | ||
306 | -{ | ||
307 | - return false; | ||
308 | -} | ||
309 | - | ||
310 | -static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds, | ||
311 | - unsigned npfd, int64_t timeout) | ||
312 | -{ | ||
313 | - return false; | ||
314 | -} | ||
315 | - | ||
316 | -#endif | ||
317 | - | ||
318 | static AioHandler *find_aio_handler(AioContext *ctx, int fd) | ||
319 | { | ||
320 | AioHandler *node; | ||
321 | @@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx, | ||
322 | atomic_read(&ctx->poll_disable_cnt) + poll_disable_change); | ||
323 | |||
324 | if (new_node) { | ||
325 | - aio_epoll_update(ctx, new_node, is_new); | ||
326 | + ctx->fdmon_ops->update(ctx, new_node, is_new); | ||
327 | } else if (node) { | ||
328 | /* Unregister deleted fd_handler */ | ||
329 | - aio_epoll_update(ctx, node, false); | ||
330 | + ctx->fdmon_ops->update(ctx, node, false); | ||
331 | } | ||
332 | qemu_lockcnt_unlock(&ctx->list_lock); | ||
333 | aio_notify(ctx); | ||
334 | @@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx) | ||
335 | timerlistgroup_run_timers(&ctx->tlg); | ||
336 | } | ||
337 | |||
338 | -/* These thread-local variables are used only in a small part of aio_poll | ||
339 | - * around the call to the poll() system call. In particular they are not | ||
340 | - * used while aio_poll is performing callbacks, which makes it much easier | ||
341 | - * to think about reentrancy! | ||
342 | - * | ||
343 | - * Stack-allocated arrays would be perfect but they have size limitations; | ||
344 | - * heap allocation is expensive enough that we want to reuse arrays across | ||
345 | - * calls to aio_poll(). And because poll() has to be called without holding | ||
346 | - * any lock, the arrays cannot be stored in AioContext. Thread-local data | ||
347 | - * has none of the disadvantages of these three options. | ||
348 | - */ | ||
349 | -static __thread GPollFD *pollfds; | ||
350 | -static __thread AioHandler **nodes; | ||
351 | -static __thread unsigned npfd, nalloc; | ||
352 | -static __thread Notifier pollfds_cleanup_notifier; | ||
353 | - | ||
354 | -static void pollfds_cleanup(Notifier *n, void *unused) | ||
355 | -{ | ||
356 | - g_assert(npfd == 0); | ||
357 | - g_free(pollfds); | ||
358 | - g_free(nodes); | ||
359 | - nalloc = 0; | ||
360 | -} | ||
361 | - | ||
362 | -static void add_pollfd(AioHandler *node) | ||
363 | -{ | ||
364 | - if (npfd == nalloc) { | ||
365 | - if (nalloc == 0) { | ||
366 | - pollfds_cleanup_notifier.notify = pollfds_cleanup; | ||
367 | - qemu_thread_atexit_add(&pollfds_cleanup_notifier); | ||
368 | - nalloc = 8; | ||
369 | - } else { | ||
370 | - g_assert(nalloc <= INT_MAX); | ||
371 | - nalloc *= 2; | ||
372 | - } | ||
373 | - pollfds = g_renew(GPollFD, pollfds, nalloc); | ||
374 | - nodes = g_renew(AioHandler *, nodes, nalloc); | ||
375 | - } | ||
376 | - nodes[npfd] = node; | ||
377 | - pollfds[npfd] = (GPollFD) { | ||
378 | - .fd = node->pfd.fd, | ||
379 | - .events = node->pfd.events, | ||
380 | - }; | ||
381 | - npfd++; | ||
382 | -} | ||
383 | - | ||
384 | static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) | ||
385 | { | ||
386 | bool progress = false; | ||
387 | @@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) | ||
388 | bool aio_poll(AioContext *ctx, bool blocking) | ||
389 | { | ||
390 | AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list); | ||
391 | - AioHandler *node; | ||
392 | - int i; | ||
393 | int ret = 0; | ||
394 | bool progress; | ||
395 | int64_t timeout; | ||
396 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
397 | * system call---a single round of run_poll_handlers_once suffices. | ||
398 | */ | ||
399 | if (timeout || atomic_read(&ctx->poll_disable_cnt)) { | ||
400 | - assert(npfd == 0); | ||
401 | - | ||
402 | - /* fill pollfds */ | ||
403 | - | ||
404 | - if (!aio_epoll_enabled(ctx)) { | ||
405 | - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
406 | - if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events | ||
407 | - && aio_node_check(ctx, node->is_external)) { | ||
408 | - add_pollfd(node); | ||
409 | - } | ||
410 | - } | ||
411 | - } | ||
412 | - | ||
413 | - /* wait until next event */ | ||
414 | - if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) { | ||
415 | - npfd = 0; /* pollfds[] is not being used */ | ||
416 | - ret = aio_epoll(ctx, &ready_list, timeout); | ||
417 | - } else { | ||
418 | - ret = qemu_poll_ns(pollfds, npfd, timeout); | ||
419 | - } | ||
420 | + ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout); | ||
421 | } | ||
422 | |||
423 | if (blocking) { | ||
424 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
425 | } | ||
426 | } | ||
427 | |||
428 | - /* if we have any readable fds, dispatch event */ | ||
429 | - if (ret > 0) { | ||
430 | - for (i = 0; i < npfd; i++) { | ||
431 | - int revents = pollfds[i].revents; | ||
432 | - | ||
433 | - if (revents) { | ||
434 | - add_ready_handler(&ready_list, nodes[i], revents); | ||
435 | - } | ||
436 | - } | ||
437 | - } | ||
438 | - | ||
439 | - npfd = 0; | ||
440 | - | ||
441 | progress |= aio_bh_poll(ctx); | ||
442 | |||
443 | if (ret > 0) { | ||
444 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
445 | |||
446 | void aio_context_setup(AioContext *ctx) | ||
447 | { | ||
448 | -#ifdef CONFIG_EPOLL_CREATE1 | ||
449 | - assert(!ctx->epollfd); | ||
450 | - ctx->epollfd = epoll_create1(EPOLL_CLOEXEC); | ||
451 | - if (ctx->epollfd == -1) { | ||
452 | - fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno)); | ||
453 | - ctx->epoll_available = false; | ||
454 | - } else { | ||
455 | - ctx->epoll_available = true; | ||
456 | - } | ||
457 | -#endif | ||
458 | + ctx->fdmon_ops = &fdmon_poll_ops; | ||
459 | + ctx->epollfd = -1; | ||
460 | + | ||
461 | + fdmon_epoll_setup(ctx); | ||
462 | } | ||
463 | |||
464 | void aio_context_destroy(AioContext *ctx) | ||
465 | { | ||
466 | -#ifdef CONFIG_EPOLL_CREATE1 | ||
467 | - aio_epoll_disable(ctx); | ||
468 | -#endif | ||
469 | + fdmon_epoll_disable(ctx); | ||
470 | } | ||
471 | |||
472 | void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, | ||
473 | diff --git a/util/aio-posix.h b/util/aio-posix.h | ||
474 | new file mode 100644 | 86 | new file mode 100644 |
475 | index XXXXXXX..XXXXXXX | 87 | index XXXXXXX..XXXXXXX |
476 | --- /dev/null | 88 | --- /dev/null |
477 | +++ b/util/aio-posix.h | 89 | +++ b/block/vxhs.c |
478 | @@ -XXX,XX +XXX,XX @@ | 90 | @@ -XXX,XX +XXX,XX @@ |
479 | +/* | 91 | +/* |
480 | + * AioContext POSIX event loop implementation internal APIs | 92 | + * QEMU Block driver for Veritas HyperScale (VxHS) |
481 | + * | 93 | + * |
482 | + * Copyright IBM, Corp. 2008 | 94 | + * Copyright (c) 2017 Veritas Technologies LLC. |
483 | + * Copyright Red Hat, Inc. 2020 | ||
484 | + * | 95 | + * |
485 | + * Authors: | 96 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. |
486 | + * Anthony Liguori <aliguori@us.ibm.com> | 97 | + * See the COPYING file in the top-level directory. |
487 | + * | 98 | + * |
488 | + * This work is licensed under the terms of the GNU GPL, version 2. See | ||
489 | + * the COPYING file in the top-level directory. | ||
490 | + * | ||
491 | + * Contributions after 2012-01-13 are licensed under the terms of the | ||
492 | + * GNU GPL, version 2 or (at your option) any later version. | ||
493 | + */ | 99 | + */ |
494 | + | 100 | + |
495 | +#ifndef AIO_POSIX_H | 101 | +#include "qemu/osdep.h" |
496 | +#define AIO_POSIX_H | 102 | +#include <qnio/qnio_api.h> |
497 | + | 103 | +#include <sys/param.h> |
498 | +#include "block/aio.h" | 104 | +#include "block/block_int.h" |
499 | + | 105 | +#include "qapi/qmp/qerror.h" |
500 | +struct AioHandler { | 106 | +#include "qapi/qmp/qdict.h" |
501 | + GPollFD pfd; | 107 | +#include "qapi/qmp/qstring.h" |
502 | + IOHandler *io_read; | 108 | +#include "trace.h" |
503 | + IOHandler *io_write; | 109 | +#include "qemu/uri.h" |
504 | + AioPollFn *io_poll; | 110 | +#include "qapi/error.h" |
505 | + IOHandler *io_poll_begin; | 111 | +#include "qemu/uuid.h" |
506 | + IOHandler *io_poll_end; | 112 | +#include "crypto/tlscredsx509.h" |
507 | + void *opaque; | 113 | + |
508 | + bool is_external; | 114 | +#define VXHS_OPT_FILENAME "filename" |
509 | + QLIST_ENTRY(AioHandler) node; | 115 | +#define VXHS_OPT_VDISK_ID "vdisk-id" |
510 | + QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ | 116 | +#define VXHS_OPT_SERVER "server" |
511 | + QLIST_ENTRY(AioHandler) node_deleted; | 117 | +#define VXHS_OPT_HOST "host" |
512 | +}; | 118 | +#define VXHS_OPT_PORT "port" |
513 | + | 119 | + |
514 | +/* Add a handler to a ready list */ | 120 | +/* Only accessed under QEMU global mutex */ |
515 | +void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node, | 121 | +static uint32_t vxhs_ref; |
516 | + int revents); | 122 | + |
517 | + | 123 | +typedef enum { |
518 | +extern const FDMonOps fdmon_poll_ops; | 124 | + VDISK_AIO_READ, |
519 | + | 125 | + VDISK_AIO_WRITE, |
520 | +#ifdef CONFIG_EPOLL_CREATE1 | 126 | +} VDISKAIOCmd; |
521 | +bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd); | 127 | + |
522 | +void fdmon_epoll_setup(AioContext *ctx); | ||
523 | +void fdmon_epoll_disable(AioContext *ctx); | ||
524 | +#else | ||
525 | +static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd) | ||
526 | +{ | ||
527 | + return false; | ||
528 | +} | ||
529 | + | ||
530 | +static inline void fdmon_epoll_setup(AioContext *ctx) | ||
531 | +{ | ||
532 | +} | ||
533 | + | ||
534 | +static inline void fdmon_epoll_disable(AioContext *ctx) | ||
535 | +{ | ||
536 | +} | ||
537 | +#endif /* !CONFIG_EPOLL_CREATE1 */ | ||
538 | + | ||
539 | +#endif /* AIO_POSIX_H */ | ||
540 | diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c | ||
541 | new file mode 100644 | ||
542 | index XXXXXXX..XXXXXXX | ||
543 | --- /dev/null | ||
544 | +++ b/util/fdmon-epoll.c | ||
545 | @@ -XXX,XX +XXX,XX @@ | ||
546 | +/* SPDX-License-Identifier: GPL-2.0-or-later */ | ||
547 | +/* | 128 | +/* |
548 | + * epoll(7) file descriptor monitoring | 129 | + * HyperScale AIO callbacks structure |
549 | + */ | 130 | + */ |
550 | + | 131 | +typedef struct VXHSAIOCB { |
551 | +#include "qemu/osdep.h" | 132 | + BlockAIOCB common; |
552 | +#include <sys/epoll.h> | 133 | + int err; |
553 | +#include "qemu/rcu_queue.h" | 134 | +} VXHSAIOCB; |
554 | +#include "aio-posix.h" | 135 | + |
555 | + | 136 | +typedef struct VXHSvDiskHostsInfo { |
556 | +/* The fd number threshold to switch to epoll */ | 137 | + void *dev_handle; /* Device handle */ |
557 | +#define EPOLL_ENABLE_THRESHOLD 64 | 138 | + char *host; /* Host name or IP */ |
558 | + | 139 | + int port; /* Host's port number */ |
559 | +void fdmon_epoll_disable(AioContext *ctx) | 140 | +} VXHSvDiskHostsInfo; |
560 | +{ | 141 | + |
561 | + if (ctx->epollfd >= 0) { | 142 | +/* |
562 | + close(ctx->epollfd); | 143 | + * Structure per vDisk maintained for state |
563 | + ctx->epollfd = -1; | 144 | + */ |
564 | + } | 145 | +typedef struct BDRVVXHSState { |
565 | + | 146 | + VXHSvDiskHostsInfo vdisk_hostinfo; /* Per host info */ |
566 | + /* Switch back */ | 147 | + char *vdisk_guid; |
567 | + ctx->fdmon_ops = &fdmon_poll_ops; | 148 | + char *tlscredsid; /* tlscredsid */ |
568 | +} | 149 | +} BDRVVXHSState; |
569 | + | 150 | + |
570 | +static inline int epoll_events_from_pfd(int pfd_events) | 151 | +static void vxhs_complete_aio_bh(void *opaque) |
571 | +{ | 152 | +{ |
572 | + return (pfd_events & G_IO_IN ? EPOLLIN : 0) | | 153 | + VXHSAIOCB *acb = opaque; |
573 | + (pfd_events & G_IO_OUT ? EPOLLOUT : 0) | | 154 | + BlockCompletionFunc *cb = acb->common.cb; |
574 | + (pfd_events & G_IO_HUP ? EPOLLHUP : 0) | | 155 | + void *cb_opaque = acb->common.opaque; |
575 | + (pfd_events & G_IO_ERR ? EPOLLERR : 0); | 156 | + int ret = 0; |
576 | +} | 157 | + |
577 | + | 158 | + if (acb->err != 0) { |
578 | +static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) | 159 | + trace_vxhs_complete_aio(acb, acb->err); |
579 | +{ | 160 | + ret = (-EIO); |
580 | + struct epoll_event event; | 161 | + } |
581 | + int r; | 162 | + |
582 | + int ctl; | 163 | + qemu_aio_unref(acb); |
583 | + | 164 | + cb(cb_opaque, ret); |
584 | + if (!node->pfd.events) { | 165 | +} |
585 | + ctl = EPOLL_CTL_DEL; | 166 | + |
586 | + } else { | 167 | +/* |
587 | + event.data.ptr = node; | 168 | + * Called from a libqnio thread |
588 | + event.events = epoll_events_from_pfd(node->pfd.events); | 169 | + */ |
589 | + ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD; | 170 | +static void vxhs_iio_callback(void *ctx, uint32_t opcode, uint32_t error) |
590 | + } | 171 | +{ |
591 | + | 172 | + VXHSAIOCB *acb = NULL; |
592 | + r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event); | 173 | + |
593 | + if (r) { | 174 | + switch (opcode) { |
594 | + fdmon_epoll_disable(ctx); | 175 | + case IRP_READ_REQUEST: |
595 | + } | 176 | + case IRP_WRITE_REQUEST: |
596 | +} | 177 | + |
597 | + | 178 | + /* |
598 | +static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list, | 179 | + * ctx is VXHSAIOCB* |
599 | + int64_t timeout) | 180 | + * ctx is NULL if error is QNIOERROR_CHANNEL_HUP |
600 | +{ | 181 | + */ |
601 | + GPollFD pfd = { | 182 | + if (ctx) { |
602 | + .fd = ctx->epollfd, | 183 | + acb = ctx; |
603 | + .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR, | 184 | + } else { |
604 | + }; | 185 | + trace_vxhs_iio_callback(error); |
605 | + AioHandler *node; | ||
606 | + int i, ret = 0; | ||
607 | + struct epoll_event events[128]; | ||
608 | + | ||
609 | + /* Fall back while external clients are disabled */ | ||
610 | + if (atomic_read(&ctx->external_disable_cnt)) { | ||
611 | + return fdmon_poll_ops.wait(ctx, ready_list, timeout); | ||
612 | + } | ||
613 | + | ||
614 | + if (timeout > 0) { | ||
615 | + ret = qemu_poll_ns(&pfd, 1, timeout); | ||
616 | + if (ret > 0) { | ||
617 | + timeout = 0; | ||
618 | + } | ||
619 | + } | ||
620 | + if (timeout <= 0 || ret > 0) { | ||
621 | + ret = epoll_wait(ctx->epollfd, events, | ||
622 | + ARRAY_SIZE(events), | ||
623 | + timeout); | ||
624 | + if (ret <= 0) { | ||
625 | + goto out; | 186 | + goto out; |
626 | + } | 187 | + } |
627 | + for (i = 0; i < ret; i++) { | 188 | + |
628 | + int ev = events[i].events; | 189 | + if (error) { |
629 | + int revents = (ev & EPOLLIN ? G_IO_IN : 0) | | 190 | + if (!acb->err) { |
630 | + (ev & EPOLLOUT ? G_IO_OUT : 0) | | 191 | + acb->err = error; |
631 | + (ev & EPOLLHUP ? G_IO_HUP : 0) | | 192 | + } |
632 | + (ev & EPOLLERR ? G_IO_ERR : 0); | 193 | + trace_vxhs_iio_callback(error); |
633 | + | ||
634 | + node = events[i].data.ptr; | ||
635 | + aio_add_ready_handler(ready_list, node, revents); | ||
636 | + } | 194 | + } |
195 | + | ||
196 | + aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs), | ||
197 | + vxhs_complete_aio_bh, acb); | ||
198 | + break; | ||
199 | + | ||
200 | + default: | ||
201 | + if (error == QNIOERROR_HUP) { | ||
202 | + /* | ||
203 | + * Channel failed, spontaneous notification, | ||
204 | + * not in response to I/O | ||
205 | + */ | ||
206 | + trace_vxhs_iio_callback_chnfail(error, errno); | ||
207 | + } else { | ||
208 | + trace_vxhs_iio_callback_unknwn(opcode, error); | ||
209 | + } | ||
210 | + break; | ||
637 | + } | 211 | + } |
638 | +out: | 212 | +out: |
213 | + return; | ||
214 | +} | ||
215 | + | ||
216 | +static QemuOptsList runtime_opts = { | ||
217 | + .name = "vxhs", | ||
218 | + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), | ||
219 | + .desc = { | ||
220 | + { | ||
221 | + .name = VXHS_OPT_FILENAME, | ||
222 | + .type = QEMU_OPT_STRING, | ||
223 | + .help = "URI to the Veritas HyperScale image", | ||
224 | + }, | ||
225 | + { | ||
226 | + .name = VXHS_OPT_VDISK_ID, | ||
227 | + .type = QEMU_OPT_STRING, | ||
228 | + .help = "UUID of the VxHS vdisk", | ||
229 | + }, | ||
230 | + { | ||
231 | + .name = "tls-creds", | ||
232 | + .type = QEMU_OPT_STRING, | ||
233 | + .help = "ID of the TLS/SSL credentials to use", | ||
234 | + }, | ||
235 | + { /* end of list */ } | ||
236 | + }, | ||
237 | +}; | ||
238 | + | ||
239 | +static QemuOptsList runtime_tcp_opts = { | ||
240 | + .name = "vxhs_tcp", | ||
241 | + .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head), | ||
242 | + .desc = { | ||
243 | + { | ||
244 | + .name = VXHS_OPT_HOST, | ||
245 | + .type = QEMU_OPT_STRING, | ||
246 | + .help = "host address (ipv4 addresses)", | ||
247 | + }, | ||
248 | + { | ||
249 | + .name = VXHS_OPT_PORT, | ||
250 | + .type = QEMU_OPT_NUMBER, | ||
251 | + .help = "port number on which VxHSD is listening (default 9999)", | ||
252 | + .def_value_str = "9999" | ||
253 | + }, | ||
254 | + { /* end of list */ } | ||
255 | + }, | ||
256 | +}; | ||
257 | + | ||
258 | +/* | ||
259 | + * Parse incoming URI and populate *options with the host | ||
260 | + * and device information | ||
261 | + */ | ||
262 | +static int vxhs_parse_uri(const char *filename, QDict *options) | ||
263 | +{ | ||
264 | + URI *uri = NULL; | ||
265 | + char *port; | ||
266 | + int ret = 0; | ||
267 | + | ||
268 | + trace_vxhs_parse_uri_filename(filename); | ||
269 | + uri = uri_parse(filename); | ||
270 | + if (!uri || !uri->server || !uri->path) { | ||
271 | + uri_free(uri); | ||
272 | + return -EINVAL; | ||
273 | + } | ||
274 | + | ||
275 | + qdict_put(options, VXHS_OPT_SERVER".host", qstring_from_str(uri->server)); | ||
276 | + | ||
277 | + if (uri->port) { | ||
278 | + port = g_strdup_printf("%d", uri->port); | ||
279 | + qdict_put(options, VXHS_OPT_SERVER".port", qstring_from_str(port)); | ||
280 | + g_free(port); | ||
281 | + } | ||
282 | + | ||
283 | + qdict_put(options, "vdisk-id", qstring_from_str(uri->path)); | ||
284 | + | ||
285 | + trace_vxhs_parse_uri_hostinfo(uri->server, uri->port); | ||
286 | + uri_free(uri); | ||
287 | + | ||
639 | + return ret; | 288 | + return ret; |
640 | +} | 289 | +} |
641 | + | 290 | + |
642 | +static const FDMonOps fdmon_epoll_ops = { | 291 | +static void vxhs_parse_filename(const char *filename, QDict *options, |
643 | + .update = fdmon_epoll_update, | 292 | + Error **errp) |
644 | + .wait = fdmon_epoll_wait, | 293 | +{ |
294 | + if (qdict_haskey(options, "vdisk-id") || qdict_haskey(options, "server")) { | ||
295 | + error_setg(errp, "vdisk-id/server and a file name may not be specified " | ||
296 | + "at the same time"); | ||
297 | + return; | ||
298 | + } | ||
299 | + | ||
300 | + if (strstr(filename, "://")) { | ||
301 | + int ret = vxhs_parse_uri(filename, options); | ||
302 | + if (ret < 0) { | ||
303 | + error_setg(errp, "Invalid URI. URI should be of the form " | ||
304 | + " vxhs://<host_ip>:<port>/<vdisk-id>"); | ||
305 | + } | ||
306 | + } | ||
307 | +} | ||
308 | + | ||
309 | +static int vxhs_init_and_ref(void) | ||
310 | +{ | ||
311 | + if (vxhs_ref++ == 0) { | ||
312 | + if (iio_init(QNIO_VERSION, vxhs_iio_callback)) { | ||
313 | + return -ENODEV; | ||
314 | + } | ||
315 | + } | ||
316 | + return 0; | ||
317 | +} | ||
318 | + | ||
319 | +static void vxhs_unref(void) | ||
320 | +{ | ||
321 | + if (--vxhs_ref == 0) { | ||
322 | + iio_fini(); | ||
323 | + } | ||
324 | +} | ||
325 | + | ||
326 | +static void vxhs_get_tls_creds(const char *id, char **cacert, | ||
327 | + char **key, char **cert, Error **errp) | ||
328 | +{ | ||
329 | + Object *obj; | ||
330 | + QCryptoTLSCreds *creds; | ||
331 | + QCryptoTLSCredsX509 *creds_x509; | ||
332 | + | ||
333 | + obj = object_resolve_path_component( | ||
334 | + object_get_objects_root(), id); | ||
335 | + | ||
336 | + if (!obj) { | ||
337 | + error_setg(errp, "No TLS credentials with id '%s'", | ||
338 | + id); | ||
339 | + return; | ||
340 | + } | ||
341 | + | ||
342 | + creds_x509 = (QCryptoTLSCredsX509 *) | ||
343 | + object_dynamic_cast(obj, TYPE_QCRYPTO_TLS_CREDS_X509); | ||
344 | + | ||
345 | + if (!creds_x509) { | ||
346 | + error_setg(errp, "Object with id '%s' is not TLS credentials", | ||
347 | + id); | ||
348 | + return; | ||
349 | + } | ||
350 | + | ||
351 | + creds = &creds_x509->parent_obj; | ||
352 | + | ||
353 | + if (creds->endpoint != QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT) { | ||
354 | + error_setg(errp, | ||
355 | + "Expecting TLS credentials with a client endpoint"); | ||
356 | + return; | ||
357 | + } | ||
358 | + | ||
359 | + /* | ||
360 | + * Get the cacert, client_cert and client_key file names. | ||
361 | + */ | ||
362 | + if (!creds->dir) { | ||
363 | + error_setg(errp, "TLS object missing 'dir' property value"); | ||
364 | + return; | ||
365 | + } | ||
366 | + | ||
367 | + *cacert = g_strdup_printf("%s/%s", creds->dir, | ||
368 | + QCRYPTO_TLS_CREDS_X509_CA_CERT); | ||
369 | + *cert = g_strdup_printf("%s/%s", creds->dir, | ||
370 | + QCRYPTO_TLS_CREDS_X509_CLIENT_CERT); | ||
371 | + *key = g_strdup_printf("%s/%s", creds->dir, | ||
372 | + QCRYPTO_TLS_CREDS_X509_CLIENT_KEY); | ||
373 | +} | ||
374 | + | ||
375 | +static int vxhs_open(BlockDriverState *bs, QDict *options, | ||
376 | + int bdrv_flags, Error **errp) | ||
377 | +{ | ||
378 | + BDRVVXHSState *s = bs->opaque; | ||
379 | + void *dev_handlep; | ||
380 | + QDict *backing_options = NULL; | ||
381 | + QemuOpts *opts = NULL; | ||
382 | + QemuOpts *tcp_opts = NULL; | ||
383 | + char *of_vsa_addr = NULL; | ||
384 | + Error *local_err = NULL; | ||
385 | + const char *vdisk_id_opt; | ||
386 | + const char *server_host_opt; | ||
387 | + int ret = 0; | ||
388 | + char *cacert = NULL; | ||
389 | + char *client_key = NULL; | ||
390 | + char *client_cert = NULL; | ||
391 | + | ||
392 | + ret = vxhs_init_and_ref(); | ||
393 | + if (ret < 0) { | ||
394 | + ret = -EINVAL; | ||
395 | + goto out; | ||
396 | + } | ||
397 | + | ||
398 | + /* Create opts info from runtime_opts and runtime_tcp_opts list */ | ||
399 | + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); | ||
400 | + tcp_opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort); | ||
401 | + | ||
402 | + qemu_opts_absorb_qdict(opts, options, &local_err); | ||
403 | + if (local_err) { | ||
404 | + ret = -EINVAL; | ||
405 | + goto out; | ||
406 | + } | ||
407 | + | ||
408 | + /* vdisk-id is the disk UUID */ | ||
409 | + vdisk_id_opt = qemu_opt_get(opts, VXHS_OPT_VDISK_ID); | ||
410 | + if (!vdisk_id_opt) { | ||
411 | + error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID); | ||
412 | + ret = -EINVAL; | ||
413 | + goto out; | ||
414 | + } | ||
415 | + | ||
416 | + /* vdisk-id may contain a leading '/' */ | ||
417 | + if (strlen(vdisk_id_opt) > UUID_FMT_LEN + 1) { | ||
418 | + error_setg(&local_err, "vdisk-id cannot be more than %d characters", | ||
419 | + UUID_FMT_LEN); | ||
420 | + ret = -EINVAL; | ||
421 | + goto out; | ||
422 | + } | ||
423 | + | ||
424 | + s->vdisk_guid = g_strdup(vdisk_id_opt); | ||
425 | + trace_vxhs_open_vdiskid(vdisk_id_opt); | ||
426 | + | ||
427 | + /* get the 'server.' arguments */ | ||
428 | + qdict_extract_subqdict(options, &backing_options, VXHS_OPT_SERVER"."); | ||
429 | + | ||
430 | + qemu_opts_absorb_qdict(tcp_opts, backing_options, &local_err); | ||
431 | + if (local_err != NULL) { | ||
432 | + ret = -EINVAL; | ||
433 | + goto out; | ||
434 | + } | ||
435 | + | ||
436 | + server_host_opt = qemu_opt_get(tcp_opts, VXHS_OPT_HOST); | ||
437 | + if (!server_host_opt) { | ||
438 | + error_setg(&local_err, QERR_MISSING_PARAMETER, | ||
439 | + VXHS_OPT_SERVER"."VXHS_OPT_HOST); | ||
440 | + ret = -EINVAL; | ||
441 | + goto out; | ||
442 | + } | ||
443 | + | ||
444 | + if (strlen(server_host_opt) > MAXHOSTNAMELEN) { | ||
445 | + error_setg(&local_err, "server.host cannot be more than %d characters", | ||
446 | + MAXHOSTNAMELEN); | ||
447 | + ret = -EINVAL; | ||
448 | + goto out; | ||
449 | + } | ||
450 | + | ||
451 | + /* check if we got tls-creds via the --object argument */ | ||
452 | + s->tlscredsid = g_strdup(qemu_opt_get(opts, "tls-creds")); | ||
453 | + if (s->tlscredsid) { | ||
454 | + vxhs_get_tls_creds(s->tlscredsid, &cacert, &client_key, | ||
455 | + &client_cert, &local_err); | ||
456 | + if (local_err != NULL) { | ||
457 | + ret = -EINVAL; | ||
458 | + goto out; | ||
459 | + } | ||
460 | + trace_vxhs_get_creds(cacert, client_key, client_cert); | ||
461 | + } | ||
462 | + | ||
463 | + s->vdisk_hostinfo.host = g_strdup(server_host_opt); | ||
464 | + s->vdisk_hostinfo.port = g_ascii_strtoll(qemu_opt_get(tcp_opts, | ||
465 | + VXHS_OPT_PORT), | ||
466 | + NULL, 0); | ||
467 | + | ||
468 | + trace_vxhs_open_hostinfo(s->vdisk_hostinfo.host, | ||
469 | + s->vdisk_hostinfo.port); | ||
470 | + | ||
471 | + of_vsa_addr = g_strdup_printf("of://%s:%d", | ||
472 | + s->vdisk_hostinfo.host, | ||
473 | + s->vdisk_hostinfo.port); | ||
474 | + | ||
475 | + /* | ||
476 | + * Open qnio channel to storage agent if not opened before | ||
477 | + */ | ||
478 | + dev_handlep = iio_open(of_vsa_addr, s->vdisk_guid, 0, | ||
479 | + cacert, client_key, client_cert); | ||
480 | + if (dev_handlep == NULL) { | ||
481 | + trace_vxhs_open_iio_open(of_vsa_addr); | ||
482 | + ret = -ENODEV; | ||
483 | + goto out; | ||
484 | + } | ||
485 | + s->vdisk_hostinfo.dev_handle = dev_handlep; | ||
486 | + | ||
487 | +out: | ||
488 | + g_free(of_vsa_addr); | ||
489 | + QDECREF(backing_options); | ||
490 | + qemu_opts_del(tcp_opts); | ||
491 | + qemu_opts_del(opts); | ||
492 | + g_free(cacert); | ||
493 | + g_free(client_key); | ||
494 | + g_free(client_cert); | ||
495 | + | ||
496 | + if (ret < 0) { | ||
497 | + vxhs_unref(); | ||
498 | + error_propagate(errp, local_err); | ||
499 | + g_free(s->vdisk_hostinfo.host); | ||
500 | + g_free(s->vdisk_guid); | ||
501 | + g_free(s->tlscredsid); | ||
502 | + s->vdisk_guid = NULL; | ||
503 | + } | ||
504 | + | ||
505 | + return ret; | ||
506 | +} | ||
507 | + | ||
508 | +static const AIOCBInfo vxhs_aiocb_info = { | ||
509 | + .aiocb_size = sizeof(VXHSAIOCB) | ||
645 | +}; | 510 | +}; |
646 | + | 511 | + |
647 | +static bool fdmon_epoll_try_enable(AioContext *ctx) | 512 | +/* |
648 | +{ | 513 | + * This allocates QEMU-VXHS callback for each IO |
649 | + AioHandler *node; | 514 | + * and is passed to QNIO. When QNIO completes the work, |
650 | + struct epoll_event event; | 515 | + * it will be passed back through the callback. |
651 | + | 516 | + */ |
652 | + QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | 517 | +static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num, |
653 | + int r; | 518 | + QEMUIOVector *qiov, int nb_sectors, |
654 | + if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) { | 519 | + BlockCompletionFunc *cb, void *opaque, |
655 | + continue; | 520 | + VDISKAIOCmd iodir) |
656 | + } | 521 | +{ |
657 | + event.events = epoll_events_from_pfd(node->pfd.events); | 522 | + VXHSAIOCB *acb = NULL; |
658 | + event.data.ptr = node; | 523 | + BDRVVXHSState *s = bs->opaque; |
659 | + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event); | 524 | + size_t size; |
660 | + if (r) { | 525 | + uint64_t offset; |
661 | + return false; | 526 | + int iio_flags = 0; |
662 | + } | 527 | + int ret = 0; |
663 | + } | 528 | + void *dev_handle = s->vdisk_hostinfo.dev_handle; |
664 | + | 529 | + |
665 | + ctx->fdmon_ops = &fdmon_epoll_ops; | 530 | + offset = sector_num * BDRV_SECTOR_SIZE; |
666 | + return true; | 531 | + size = nb_sectors * BDRV_SECTOR_SIZE; |
667 | +} | 532 | + acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque); |
668 | + | 533 | + |
669 | +bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd) | 534 | + /* |
670 | +{ | 535 | + * Initialize VXHSAIOCB. |
671 | + if (ctx->epollfd < 0) { | 536 | + */ |
672 | + return false; | 537 | + acb->err = 0; |
673 | + } | 538 | + |
674 | + | 539 | + iio_flags = IIO_FLAG_ASYNC; |
675 | + /* Do not upgrade while external clients are disabled */ | 540 | + |
676 | + if (atomic_read(&ctx->external_disable_cnt)) { | 541 | + switch (iodir) { |
677 | + return false; | 542 | + case VDISK_AIO_WRITE: |
678 | + } | 543 | + ret = iio_writev(dev_handle, acb, qiov->iov, qiov->niov, |
679 | + | 544 | + offset, (uint64_t)size, iio_flags); |
680 | + if (npfd >= EPOLL_ENABLE_THRESHOLD) { | 545 | + break; |
681 | + if (fdmon_epoll_try_enable(ctx)) { | 546 | + case VDISK_AIO_READ: |
682 | + return true; | 547 | + ret = iio_readv(dev_handle, acb, qiov->iov, qiov->niov, |
683 | + } else { | 548 | + offset, (uint64_t)size, iio_flags); |
684 | + fdmon_epoll_disable(ctx); | 549 | + break; |
685 | + } | 550 | + default: |
686 | + } | 551 | + trace_vxhs_aio_rw_invalid(iodir); |
687 | + return false; | 552 | + goto errout; |
688 | +} | 553 | + } |
689 | + | 554 | + |
690 | +void fdmon_epoll_setup(AioContext *ctx) | 555 | + if (ret != 0) { |
691 | +{ | 556 | + trace_vxhs_aio_rw_ioerr(s->vdisk_guid, iodir, size, offset, |
692 | + ctx->epollfd = epoll_create1(EPOLL_CLOEXEC); | 557 | + acb, ret, errno); |
693 | + if (ctx->epollfd == -1) { | 558 | + goto errout; |
694 | + fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno)); | 559 | + } |
695 | + } | 560 | + return &acb->common; |
696 | +} | 561 | + |
697 | diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c | 562 | +errout: |
698 | new file mode 100644 | 563 | + qemu_aio_unref(acb); |
699 | index XXXXXXX..XXXXXXX | 564 | + return NULL; |
700 | --- /dev/null | 565 | +} |
701 | +++ b/util/fdmon-poll.c | 566 | + |
567 | +static BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, | ||
568 | + int64_t sector_num, QEMUIOVector *qiov, | ||
569 | + int nb_sectors, | ||
570 | + BlockCompletionFunc *cb, void *opaque) | ||
571 | +{ | ||
572 | + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, cb, | ||
573 | + opaque, VDISK_AIO_READ); | ||
574 | +} | ||
575 | + | ||
576 | +static BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, | ||
577 | + int64_t sector_num, QEMUIOVector *qiov, | ||
578 | + int nb_sectors, | ||
579 | + BlockCompletionFunc *cb, void *opaque) | ||
580 | +{ | ||
581 | + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, | ||
582 | + cb, opaque, VDISK_AIO_WRITE); | ||
583 | +} | ||
584 | + | ||
585 | +static void vxhs_close(BlockDriverState *bs) | ||
586 | +{ | ||
587 | + BDRVVXHSState *s = bs->opaque; | ||
588 | + | ||
589 | + trace_vxhs_close(s->vdisk_guid); | ||
590 | + | ||
591 | + g_free(s->vdisk_guid); | ||
592 | + s->vdisk_guid = NULL; | ||
593 | + | ||
594 | + /* | ||
595 | + * Close vDisk device | ||
596 | + */ | ||
597 | + if (s->vdisk_hostinfo.dev_handle) { | ||
598 | + iio_close(s->vdisk_hostinfo.dev_handle); | ||
599 | + s->vdisk_hostinfo.dev_handle = NULL; | ||
600 | + } | ||
601 | + | ||
602 | + vxhs_unref(); | ||
603 | + | ||
604 | + /* | ||
605 | + * Free the dynamically allocated host string etc | ||
606 | + */ | ||
607 | + g_free(s->vdisk_hostinfo.host); | ||
608 | + g_free(s->tlscredsid); | ||
609 | + s->tlscredsid = NULL; | ||
610 | + s->vdisk_hostinfo.host = NULL; | ||
611 | + s->vdisk_hostinfo.port = 0; | ||
612 | +} | ||
613 | + | ||
614 | +static int64_t vxhs_get_vdisk_stat(BDRVVXHSState *s) | ||
615 | +{ | ||
616 | + int64_t vdisk_size = -1; | ||
617 | + int ret = 0; | ||
618 | + void *dev_handle = s->vdisk_hostinfo.dev_handle; | ||
619 | + | ||
620 | + ret = iio_ioctl(dev_handle, IOR_VDISK_STAT, &vdisk_size, 0); | ||
621 | + if (ret < 0) { | ||
622 | + trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno); | ||
623 | + return -EIO; | ||
624 | + } | ||
625 | + | ||
626 | + trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size); | ||
627 | + return vdisk_size; | ||
628 | +} | ||
629 | + | ||
630 | +/* | ||
631 | + * Returns the size of vDisk in bytes. This is required | ||
632 | + * by QEMU block upper block layer so that it is visible | ||
633 | + * to guest. | ||
634 | + */ | ||
635 | +static int64_t vxhs_getlength(BlockDriverState *bs) | ||
636 | +{ | ||
637 | + BDRVVXHSState *s = bs->opaque; | ||
638 | + int64_t vdisk_size; | ||
639 | + | ||
640 | + vdisk_size = vxhs_get_vdisk_stat(s); | ||
641 | + if (vdisk_size < 0) { | ||
642 | + return -EIO; | ||
643 | + } | ||
644 | + | ||
645 | + return vdisk_size; | ||
646 | +} | ||
647 | + | ||
648 | +static BlockDriver bdrv_vxhs = { | ||
649 | + .format_name = "vxhs", | ||
650 | + .protocol_name = "vxhs", | ||
651 | + .instance_size = sizeof(BDRVVXHSState), | ||
652 | + .bdrv_file_open = vxhs_open, | ||
653 | + .bdrv_parse_filename = vxhs_parse_filename, | ||
654 | + .bdrv_close = vxhs_close, | ||
655 | + .bdrv_getlength = vxhs_getlength, | ||
656 | + .bdrv_aio_readv = vxhs_aio_readv, | ||
657 | + .bdrv_aio_writev = vxhs_aio_writev, | ||
658 | +}; | ||
659 | + | ||
660 | +static void bdrv_vxhs_init(void) | ||
661 | +{ | ||
662 | + bdrv_register(&bdrv_vxhs); | ||
663 | +} | ||
664 | + | ||
665 | +block_init(bdrv_vxhs_init); | ||
666 | diff --git a/configure b/configure | ||
667 | index XXXXXXX..XXXXXXX 100755 | ||
668 | --- a/configure | ||
669 | +++ b/configure | ||
670 | @@ -XXX,XX +XXX,XX @@ numa="" | ||
671 | tcmalloc="no" | ||
672 | jemalloc="no" | ||
673 | replication="yes" | ||
674 | +vxhs="" | ||
675 | |||
676 | supported_cpu="no" | ||
677 | supported_os="no" | ||
678 | @@ -XXX,XX +XXX,XX @@ for opt do | ||
679 | ;; | ||
680 | --enable-replication) replication="yes" | ||
681 | ;; | ||
682 | + --disable-vxhs) vxhs="no" | ||
683 | + ;; | ||
684 | + --enable-vxhs) vxhs="yes" | ||
685 | + ;; | ||
686 | *) | ||
687 | echo "ERROR: unknown option $opt" | ||
688 | echo "Try '$0 --help' for more information" | ||
689 | @@ -XXX,XX +XXX,XX @@ disabled with --disable-FEATURE, default is enabled if available: | ||
690 | xfsctl xfsctl support | ||
691 | qom-cast-debug cast debugging support | ||
692 | tools build qemu-io, qemu-nbd and qemu-image tools | ||
693 | + vxhs Veritas HyperScale vDisk backend support | ||
694 | |||
695 | NOTE: The object files are built at the place where configure is launched | ||
696 | EOF | ||
697 | @@ -XXX,XX +XXX,XX @@ if compile_prog "" "" ; then | ||
698 | fi | ||
699 | |||
700 | ########################################## | ||
701 | +# Veritas HyperScale block driver VxHS | ||
702 | +# Check if libvxhs is installed | ||
703 | + | ||
704 | +if test "$vxhs" != "no" ; then | ||
705 | + cat > $TMPC <<EOF | ||
706 | +#include <stdint.h> | ||
707 | +#include <qnio/qnio_api.h> | ||
708 | + | ||
709 | +void *vxhs_callback; | ||
710 | + | ||
711 | +int main(void) { | ||
712 | + iio_init(QNIO_VERSION, vxhs_callback); | ||
713 | + return 0; | ||
714 | +} | ||
715 | +EOF | ||
716 | + vxhs_libs="-lvxhs -lssl" | ||
717 | + if compile_prog "" "$vxhs_libs" ; then | ||
718 | + vxhs=yes | ||
719 | + else | ||
720 | + if test "$vxhs" = "yes" ; then | ||
721 | + feature_not_found "vxhs block device" "Install libvxhs See github" | ||
722 | + fi | ||
723 | + vxhs=no | ||
724 | + fi | ||
725 | +fi | ||
726 | + | ||
727 | +########################################## | ||
728 | # End of CC checks | ||
729 | # After here, no more $cc or $ld runs | ||
730 | |||
731 | @@ -XXX,XX +XXX,XX @@ echo "tcmalloc support $tcmalloc" | ||
732 | echo "jemalloc support $jemalloc" | ||
733 | echo "avx2 optimization $avx2_opt" | ||
734 | echo "replication support $replication" | ||
735 | +echo "VxHS block device $vxhs" | ||
736 | |||
737 | if test "$sdl_too_old" = "yes"; then | ||
738 | echo "-> Your SDL version is too old - please upgrade to have SDL support" | ||
739 | @@ -XXX,XX +XXX,XX @@ if test "$pthread_setname_np" = "yes" ; then | ||
740 | echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak | ||
741 | fi | ||
742 | |||
743 | +if test "$vxhs" = "yes" ; then | ||
744 | + echo "CONFIG_VXHS=y" >> $config_host_mak | ||
745 | + echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak | ||
746 | +fi | ||
747 | + | ||
748 | if test "$tcg_interpreter" = "yes"; then | ||
749 | QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES" | ||
750 | elif test "$ARCH" = "sparc64" ; then | ||
751 | diff --git a/qapi/block-core.json b/qapi/block-core.json | ||
752 | index XXXXXXX..XXXXXXX 100644 | ||
753 | --- a/qapi/block-core.json | ||
754 | +++ b/qapi/block-core.json | ||
702 | @@ -XXX,XX +XXX,XX @@ | 755 | @@ -XXX,XX +XXX,XX @@ |
703 | +/* SPDX-License-Identifier: GPL-2.0-or-later */ | 756 | # |
704 | +/* | 757 | # Drivers that are supported in block device operations. |
705 | + * poll(2) file descriptor monitoring | 758 | # |
706 | + * | 759 | +# @vxhs: Since 2.10 |
707 | + * Uses ppoll(2) when available, g_poll() otherwise. | 760 | +# |
708 | + */ | 761 | # Since: 2.9 |
709 | + | 762 | ## |
710 | +#include "qemu/osdep.h" | 763 | { 'enum': 'BlockdevDriver', |
711 | +#include "aio-posix.h" | 764 | @@ -XXX,XX +XXX,XX @@ |
712 | +#include "qemu/rcu_queue.h" | 765 | 'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs', |
713 | + | 766 | 'null-aio', 'null-co', 'parallels', 'qcow', 'qcow2', 'qed', |
714 | +/* | 767 | 'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh', |
715 | + * These thread-local variables are used only in fdmon_poll_wait() around the | 768 | - 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } |
716 | + * call to the poll() system call. In particular they are not used while | 769 | + 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] } |
717 | + * aio_poll is performing callbacks, which makes it much easier to think about | 770 | |
718 | + * reentrancy! | 771 | ## |
719 | + * | 772 | # @BlockdevOptionsFile: |
720 | + * Stack-allocated arrays would be perfect but they have size limitations; | 773 | @@ -XXX,XX +XXX,XX @@ |
721 | + * heap allocation is expensive enough that we want to reuse arrays across | 774 | 'data': { '*offset': 'int', '*size': 'int' } } |
722 | + * calls to aio_poll(). And because poll() has to be called without holding | 775 | |
723 | + * any lock, the arrays cannot be stored in AioContext. Thread-local data | 776 | ## |
724 | + * has none of the disadvantages of these three options. | 777 | +# @BlockdevOptionsVxHS: |
725 | + */ | 778 | +# |
726 | +static __thread GPollFD *pollfds; | 779 | +# Driver specific block device options for VxHS |
727 | +static __thread AioHandler **nodes; | 780 | +# |
728 | +static __thread unsigned npfd, nalloc; | 781 | +# @vdisk-id: UUID of VxHS volume |
729 | +static __thread Notifier pollfds_cleanup_notifier; | 782 | +# @server: vxhs server IP, port |
730 | + | 783 | +# @tls-creds: TLS credentials ID |
731 | +static void pollfds_cleanup(Notifier *n, void *unused) | 784 | +# |
732 | +{ | 785 | +# Since: 2.10 |
733 | + g_assert(npfd == 0); | 786 | +## |
734 | + g_free(pollfds); | 787 | +{ 'struct': 'BlockdevOptionsVxHS', |
735 | + g_free(nodes); | 788 | + 'data': { 'vdisk-id': 'str', |
736 | + nalloc = 0; | 789 | + 'server': 'InetSocketAddressBase', |
737 | +} | 790 | + '*tls-creds': 'str' } } |
738 | + | 791 | + |
739 | +static void add_pollfd(AioHandler *node) | 792 | +## |
740 | +{ | 793 | # @BlockdevOptions: |
741 | + if (npfd == nalloc) { | 794 | # |
742 | + if (nalloc == 0) { | 795 | # Options for creating a block device. Many options are available for all |
743 | + pollfds_cleanup_notifier.notify = pollfds_cleanup; | 796 | @@ -XXX,XX +XXX,XX @@ |
744 | + qemu_thread_atexit_add(&pollfds_cleanup_notifier); | 797 | 'vhdx': 'BlockdevOptionsGenericFormat', |
745 | + nalloc = 8; | 798 | 'vmdk': 'BlockdevOptionsGenericCOWFormat', |
746 | + } else { | 799 | 'vpc': 'BlockdevOptionsGenericFormat', |
747 | + g_assert(nalloc <= INT_MAX); | 800 | - 'vvfat': 'BlockdevOptionsVVFAT' |
748 | + nalloc *= 2; | 801 | + 'vvfat': 'BlockdevOptionsVVFAT', |
749 | + } | 802 | + 'vxhs': 'BlockdevOptionsVxHS' |
750 | + pollfds = g_renew(GPollFD, pollfds, nalloc); | 803 | } } |
751 | + nodes = g_renew(AioHandler *, nodes, nalloc); | 804 | |
752 | + } | 805 | ## |
753 | + nodes[npfd] = node; | ||
754 | + pollfds[npfd] = (GPollFD) { | ||
755 | + .fd = node->pfd.fd, | ||
756 | + .events = node->pfd.events, | ||
757 | + }; | ||
758 | + npfd++; | ||
759 | +} | ||
760 | + | ||
761 | +static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list, | ||
762 | + int64_t timeout) | ||
763 | +{ | ||
764 | + AioHandler *node; | ||
765 | + int ret; | ||
766 | + | ||
767 | + assert(npfd == 0); | ||
768 | + | ||
769 | + QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
770 | + if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events | ||
771 | + && aio_node_check(ctx, node->is_external)) { | ||
772 | + add_pollfd(node); | ||
773 | + } | ||
774 | + } | ||
775 | + | ||
776 | + /* epoll(7) is faster above a certain number of fds */ | ||
777 | + if (fdmon_epoll_try_upgrade(ctx, npfd)) { | ||
778 | + return ctx->fdmon_ops->wait(ctx, ready_list, timeout); | ||
779 | + } | ||
780 | + | ||
781 | + ret = qemu_poll_ns(pollfds, npfd, timeout); | ||
782 | + if (ret > 0) { | ||
783 | + int i; | ||
784 | + | ||
785 | + for (i = 0; i < npfd; i++) { | ||
786 | + int revents = pollfds[i].revents; | ||
787 | + | ||
788 | + if (revents) { | ||
789 | + aio_add_ready_handler(ready_list, nodes[i], revents); | ||
790 | + } | ||
791 | + } | ||
792 | + } | ||
793 | + | ||
794 | + npfd = 0; | ||
795 | + return ret; | ||
796 | +} | ||
797 | + | ||
798 | +static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new) | ||
799 | +{ | ||
800 | + /* Do nothing, AioHandler already contains the state we'll need */ | ||
801 | +} | ||
802 | + | ||
803 | +const FDMonOps fdmon_poll_ops = { | ||
804 | + .update = fdmon_poll_update, | ||
805 | + .wait = fdmon_poll_wait, | ||
806 | +}; | ||
807 | -- | 806 | -- |
808 | 2.24.1 | 807 | 2.9.3 |
809 | 808 | ||
809 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | From: Ashish Mittal <ashmit602@gmail.com> | ||
1 | 2 | ||
3 | These changes use a vxhs test server that is a part of the following | ||
4 | repository: | ||
5 | https://github.com/VeritasHyperScale/libqnio.git | ||
6 | |||
7 | Signed-off-by: Ashish Mittal <Ashish.Mittal@veritas.com> | ||
8 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | Reviewed-by: Jeff Cody <jcody@redhat.com> | ||
10 | Signed-off-by: Jeff Cody <jcody@redhat.com> | ||
11 | Message-id: 1491277689-24949-3-git-send-email-Ashish.Mittal@veritas.com | ||
12 | --- | ||
13 | tests/qemu-iotests/common | 6 ++++++ | ||
14 | tests/qemu-iotests/common.config | 13 +++++++++++++ | ||
15 | tests/qemu-iotests/common.filter | 1 + | ||
16 | tests/qemu-iotests/common.rc | 19 +++++++++++++++++++ | ||
17 | 4 files changed, 39 insertions(+) | ||
18 | |||
19 | diff --git a/tests/qemu-iotests/common b/tests/qemu-iotests/common | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/tests/qemu-iotests/common | ||
22 | +++ b/tests/qemu-iotests/common | ||
23 | @@ -XXX,XX +XXX,XX @@ check options | ||
24 | -ssh test ssh | ||
25 | -nfs test nfs | ||
26 | -luks test luks | ||
27 | + -vxhs test vxhs | ||
28 | -xdiff graphical mode diff | ||
29 | -nocache use O_DIRECT on backing file | ||
30 | -misalign misalign memory allocations | ||
31 | @@ -XXX,XX +XXX,XX @@ testlist options | ||
32 | xpand=false | ||
33 | ;; | ||
34 | |||
35 | + -vxhs) | ||
36 | + IMGPROTO=vxhs | ||
37 | + xpand=false | ||
38 | + ;; | ||
39 | + | ||
40 | -ssh) | ||
41 | IMGPROTO=ssh | ||
42 | xpand=false | ||
43 | diff --git a/tests/qemu-iotests/common.config b/tests/qemu-iotests/common.config | ||
44 | index XXXXXXX..XXXXXXX 100644 | ||
45 | --- a/tests/qemu-iotests/common.config | ||
46 | +++ b/tests/qemu-iotests/common.config | ||
47 | @@ -XXX,XX +XXX,XX @@ if [ -z "$QEMU_NBD_PROG" ]; then | ||
48 | export QEMU_NBD_PROG="`set_prog_path qemu-nbd`" | ||
49 | fi | ||
50 | |||
51 | +if [ -z "$QEMU_VXHS_PROG" ]; then | ||
52 | + export QEMU_VXHS_PROG="`set_prog_path qnio_server`" | ||
53 | +fi | ||
54 | + | ||
55 | _qemu_wrapper() | ||
56 | { | ||
57 | ( | ||
58 | @@ -XXX,XX +XXX,XX @@ _qemu_nbd_wrapper() | ||
59 | ) | ||
60 | } | ||
61 | |||
62 | +_qemu_vxhs_wrapper() | ||
63 | +{ | ||
64 | + ( | ||
65 | + echo $BASHPID > "${TEST_DIR}/qemu-vxhs.pid" | ||
66 | + exec "$QEMU_VXHS_PROG" $QEMU_VXHS_OPTIONS "$@" | ||
67 | + ) | ||
68 | +} | ||
69 | + | ||
70 | export QEMU=_qemu_wrapper | ||
71 | export QEMU_IMG=_qemu_img_wrapper | ||
72 | export QEMU_IO=_qemu_io_wrapper | ||
73 | export QEMU_NBD=_qemu_nbd_wrapper | ||
74 | +export QEMU_VXHS=_qemu_vxhs_wrapper | ||
75 | |||
76 | QEMU_IMG_EXTRA_ARGS= | ||
77 | if [ "$IMGOPTSSYNTAX" = "true" ]; then | ||
78 | diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter | ||
79 | index XXXXXXX..XXXXXXX 100644 | ||
80 | --- a/tests/qemu-iotests/common.filter | ||
81 | +++ b/tests/qemu-iotests/common.filter | ||
82 | @@ -XXX,XX +XXX,XX @@ _filter_img_info() | ||
83 | -e "s#$TEST_DIR#TEST_DIR#g" \ | ||
84 | -e "s#$IMGFMT#IMGFMT#g" \ | ||
85 | -e 's#nbd://127.0.0.1:10810$#TEST_DIR/t.IMGFMT#g' \ | ||
86 | + -e 's#json.*vdisk-id.*vxhs"}}#TEST_DIR/t.IMGFMT#' \ | ||
87 | -e "/encrypted: yes/d" \ | ||
88 | -e "/cluster_size: [0-9]\\+/d" \ | ||
89 | -e "/table_size: [0-9]\\+/d" \ | ||
90 | diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc | ||
91 | index XXXXXXX..XXXXXXX 100644 | ||
92 | --- a/tests/qemu-iotests/common.rc | ||
93 | +++ b/tests/qemu-iotests/common.rc | ||
94 | @@ -XXX,XX +XXX,XX @@ else | ||
95 | elif [ "$IMGPROTO" = "nfs" ]; then | ||
96 | TEST_DIR="nfs://127.0.0.1/$TEST_DIR" | ||
97 | TEST_IMG=$TEST_DIR/t.$IMGFMT | ||
98 | + elif [ "$IMGPROTO" = "vxhs" ]; then | ||
99 | + TEST_IMG_FILE=$TEST_DIR/t.$IMGFMT | ||
100 | + TEST_IMG="vxhs://127.0.0.1:9999/t.$IMGFMT" | ||
101 | else | ||
102 | TEST_IMG=$IMGPROTO:$TEST_DIR/t.$IMGFMT | ||
103 | fi | ||
104 | @@ -XXX,XX +XXX,XX @@ _make_test_img() | ||
105 | eval "$QEMU_NBD -v -t -b 127.0.0.1 -p 10810 -f $IMGFMT $TEST_IMG_FILE >/dev/null &" | ||
106 | sleep 1 # FIXME: qemu-nbd needs to be listening before we continue | ||
107 | fi | ||
108 | + | ||
109 | + # Start QNIO server on image directory for vxhs protocol | ||
110 | + if [ $IMGPROTO = "vxhs" ]; then | ||
111 | + eval "$QEMU_VXHS -d $TEST_DIR > /dev/null &" | ||
112 | + sleep 1 # Wait for server to come up. | ||
113 | + fi | ||
114 | } | ||
115 | |||
116 | _rm_test_img() | ||
117 | @@ -XXX,XX +XXX,XX @@ _cleanup_test_img() | ||
118 | fi | ||
119 | rm -f "$TEST_IMG_FILE" | ||
120 | ;; | ||
121 | + vxhs) | ||
122 | + if [ -f "${TEST_DIR}/qemu-vxhs.pid" ]; then | ||
123 | + local QEMU_VXHS_PID | ||
124 | + read QEMU_VXHS_PID < "${TEST_DIR}/qemu-vxhs.pid" | ||
125 | + kill ${QEMU_VXHS_PID} >/dev/null 2>&1 | ||
126 | + rm -f "${TEST_DIR}/qemu-vxhs.pid" | ||
127 | + fi | ||
128 | + rm -f "$TEST_IMG_FILE" | ||
129 | + ;; | ||
130 | + | ||
131 | file) | ||
132 | _rm_test_img "$TEST_DIR/t.$IMGFMT" | ||
133 | _rm_test_img "$TEST_DIR/t.$IMGFMT.orig" | ||
134 | -- | ||
135 | 2.9.3 | ||
136 | |||
137 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | The protocol VXHS does not support image creation. Some tests expect | ||
2 | to be able to create images through the protocol. Exclude VXHS from | ||
3 | these tests. | ||
1 | 4 | ||
5 | Signed-off-by: Jeff Cody <jcody@redhat.com> | ||
6 | --- | ||
7 | tests/qemu-iotests/017 | 1 + | ||
8 | tests/qemu-iotests/020 | 1 + | ||
9 | tests/qemu-iotests/029 | 1 + | ||
10 | tests/qemu-iotests/073 | 1 + | ||
11 | tests/qemu-iotests/114 | 1 + | ||
12 | tests/qemu-iotests/130 | 1 + | ||
13 | tests/qemu-iotests/134 | 1 + | ||
14 | tests/qemu-iotests/156 | 1 + | ||
15 | tests/qemu-iotests/158 | 1 + | ||
16 | 9 files changed, 9 insertions(+) | ||
17 | |||
18 | diff --git a/tests/qemu-iotests/017 b/tests/qemu-iotests/017 | ||
19 | index XXXXXXX..XXXXXXX 100755 | ||
20 | --- a/tests/qemu-iotests/017 | ||
21 | +++ b/tests/qemu-iotests/017 | ||
22 | @@ -XXX,XX +XXX,XX @@ trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
23 | # Any format supporting backing files | ||
24 | _supported_fmt qcow qcow2 vmdk qed | ||
25 | _supported_proto generic | ||
26 | +_unsupported_proto vxhs | ||
27 | _supported_os Linux | ||
28 | _unsupported_imgopts "subformat=monolithicFlat" "subformat=twoGbMaxExtentFlat" | ||
29 | |||
30 | diff --git a/tests/qemu-iotests/020 b/tests/qemu-iotests/020 | ||
31 | index XXXXXXX..XXXXXXX 100755 | ||
32 | --- a/tests/qemu-iotests/020 | ||
33 | +++ b/tests/qemu-iotests/020 | ||
34 | @@ -XXX,XX +XXX,XX @@ trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
35 | # Any format supporting backing files | ||
36 | _supported_fmt qcow qcow2 vmdk qed | ||
37 | _supported_proto generic | ||
38 | +_unsupported_proto vxhs | ||
39 | _supported_os Linux | ||
40 | _unsupported_imgopts "subformat=monolithicFlat" \ | ||
41 | "subformat=twoGbMaxExtentFlat" \ | ||
42 | diff --git a/tests/qemu-iotests/029 b/tests/qemu-iotests/029 | ||
43 | index XXXXXXX..XXXXXXX 100755 | ||
44 | --- a/tests/qemu-iotests/029 | ||
45 | +++ b/tests/qemu-iotests/029 | ||
46 | @@ -XXX,XX +XXX,XX @@ trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
47 | # Any format supporting intenal snapshots | ||
48 | _supported_fmt qcow2 | ||
49 | _supported_proto generic | ||
50 | +_unsupported_proto vxhs | ||
51 | _supported_os Linux | ||
52 | # Internal snapshots are (currently) impossible with refcount_bits=1 | ||
53 | _unsupported_imgopts 'refcount_bits=1[^0-9]' | ||
54 | diff --git a/tests/qemu-iotests/073 b/tests/qemu-iotests/073 | ||
55 | index XXXXXXX..XXXXXXX 100755 | ||
56 | --- a/tests/qemu-iotests/073 | ||
57 | +++ b/tests/qemu-iotests/073 | ||
58 | @@ -XXX,XX +XXX,XX @@ trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
59 | |||
60 | _supported_fmt qcow2 | ||
61 | _supported_proto generic | ||
62 | +_unsupported_proto vxhs | ||
63 | _supported_os Linux | ||
64 | |||
65 | CLUSTER_SIZE=64k | ||
66 | diff --git a/tests/qemu-iotests/114 b/tests/qemu-iotests/114 | ||
67 | index XXXXXXX..XXXXXXX 100755 | ||
68 | --- a/tests/qemu-iotests/114 | ||
69 | +++ b/tests/qemu-iotests/114 | ||
70 | @@ -XXX,XX +XXX,XX @@ trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
71 | |||
72 | _supported_fmt qcow2 | ||
73 | _supported_proto generic | ||
74 | +_unsupported_proto vxhs | ||
75 | _supported_os Linux | ||
76 | |||
77 | |||
78 | diff --git a/tests/qemu-iotests/130 b/tests/qemu-iotests/130 | ||
79 | index XXXXXXX..XXXXXXX 100755 | ||
80 | --- a/tests/qemu-iotests/130 | ||
81 | +++ b/tests/qemu-iotests/130 | ||
82 | @@ -XXX,XX +XXX,XX @@ trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
83 | |||
84 | _supported_fmt qcow2 | ||
85 | _supported_proto generic | ||
86 | +_unsupported_proto vxhs | ||
87 | _supported_os Linux | ||
88 | |||
89 | qemu_comm_method="monitor" | ||
90 | diff --git a/tests/qemu-iotests/134 b/tests/qemu-iotests/134 | ||
91 | index XXXXXXX..XXXXXXX 100755 | ||
92 | --- a/tests/qemu-iotests/134 | ||
93 | +++ b/tests/qemu-iotests/134 | ||
94 | @@ -XXX,XX +XXX,XX @@ trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
95 | |||
96 | _supported_fmt qcow2 | ||
97 | _supported_proto generic | ||
98 | +_unsupported_proto vxhs | ||
99 | _supported_os Linux | ||
100 | |||
101 | |||
102 | diff --git a/tests/qemu-iotests/156 b/tests/qemu-iotests/156 | ||
103 | index XXXXXXX..XXXXXXX 100755 | ||
104 | --- a/tests/qemu-iotests/156 | ||
105 | +++ b/tests/qemu-iotests/156 | ||
106 | @@ -XXX,XX +XXX,XX @@ trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
107 | |||
108 | _supported_fmt qcow2 qed | ||
109 | _supported_proto generic | ||
110 | +_unsupported_proto vxhs | ||
111 | _supported_os Linux | ||
112 | |||
113 | # Create source disk | ||
114 | diff --git a/tests/qemu-iotests/158 b/tests/qemu-iotests/158 | ||
115 | index XXXXXXX..XXXXXXX 100755 | ||
116 | --- a/tests/qemu-iotests/158 | ||
117 | +++ b/tests/qemu-iotests/158 | ||
118 | @@ -XXX,XX +XXX,XX @@ trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
119 | |||
120 | _supported_fmt qcow2 | ||
121 | _supported_proto generic | ||
122 | +_unsupported_proto vxhs | ||
123 | _supported_os Linux | ||
124 | |||
125 | |||
126 | -- | ||
127 | 2.9.3 | ||
128 | |||
129 | diff view generated by jsdifflib |
1 | Now that run_poll_handlers_once() is only called by run_poll_handlers() | 1 | We have a helper wrapper for checking for the BDS read_only flag; |
---|---|---|---|
2 | we can improve the CPU time profile by moving the expensive | 2 | add a helper wrapper to set the read_only flag as well. |
3 | RCU_READ_LOCK() out of the polling loop. | ||
4 | 3 | ||
5 | This reduces run_poll_handlers() from 40% CPU to 10% CPU in perf's | 4 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
6 | sampling profiler output. | 5 | Signed-off-by: Jeff Cody <jcody@redhat.com> |
6 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
7 | Message-id: 9b18972d05f5fa2ac16c014f0af98d680553048d.1491597120.git.jcody@redhat.com | ||
8 | --- | ||
9 | block.c | 5 +++++ | ||
10 | block/bochs.c | 2 +- | ||
11 | block/cloop.c | 2 +- | ||
12 | block/dmg.c | 2 +- | ||
13 | block/rbd.c | 2 +- | ||
14 | block/vvfat.c | 4 ++-- | ||
15 | include/block/block.h | 1 + | ||
16 | 7 files changed, 12 insertions(+), 6 deletions(-) | ||
7 | 17 | ||
8 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 18 | diff --git a/block.c b/block.c |
9 | Link: https://lore.kernel.org/r/20200305170806.1313245-3-stefanha@redhat.com | 19 | index XXXXXXX..XXXXXXX 100644 |
10 | Message-Id: <20200305170806.1313245-3-stefanha@redhat.com> | 20 | --- a/block.c |
11 | --- | 21 | +++ b/block.c |
12 | util/aio-posix.c | 20 ++++++++++---------- | 22 | @@ -XXX,XX +XXX,XX @@ void path_combine(char *dest, int dest_size, |
13 | 1 file changed, 10 insertions(+), 10 deletions(-) | 23 | } |
24 | } | ||
25 | |||
26 | +void bdrv_set_read_only(BlockDriverState *bs, bool read_only) | ||
27 | +{ | ||
28 | + bs->read_only = read_only; | ||
29 | +} | ||
30 | + | ||
31 | void bdrv_get_full_backing_filename_from_filename(const char *backed, | ||
32 | const char *backing, | ||
33 | char *dest, size_t sz, | ||
34 | diff --git a/block/bochs.c b/block/bochs.c | ||
35 | index XXXXXXX..XXXXXXX 100644 | ||
36 | --- a/block/bochs.c | ||
37 | +++ b/block/bochs.c | ||
38 | @@ -XXX,XX +XXX,XX @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags, | ||
39 | return -EINVAL; | ||
40 | } | ||
41 | |||
42 | - bs->read_only = true; /* no write support yet */ | ||
43 | + bdrv_set_read_only(bs, true); /* no write support yet */ | ||
44 | |||
45 | ret = bdrv_pread(bs->file, 0, &bochs, sizeof(bochs)); | ||
46 | if (ret < 0) { | ||
47 | diff --git a/block/cloop.c b/block/cloop.c | ||
48 | index XXXXXXX..XXXXXXX 100644 | ||
49 | --- a/block/cloop.c | ||
50 | +++ b/block/cloop.c | ||
51 | @@ -XXX,XX +XXX,XX @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, | ||
52 | return -EINVAL; | ||
53 | } | ||
54 | |||
55 | - bs->read_only = true; | ||
56 | + bdrv_set_read_only(bs, true); | ||
57 | |||
58 | /* read header */ | ||
59 | ret = bdrv_pread(bs->file, 128, &s->block_size, 4); | ||
60 | diff --git a/block/dmg.c b/block/dmg.c | ||
61 | index XXXXXXX..XXXXXXX 100644 | ||
62 | --- a/block/dmg.c | ||
63 | +++ b/block/dmg.c | ||
64 | @@ -XXX,XX +XXX,XX @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, | ||
65 | } | ||
66 | |||
67 | block_module_load_one("dmg-bz2"); | ||
68 | - bs->read_only = true; | ||
69 | + bdrv_set_read_only(bs, true); | ||
70 | |||
71 | s->n_chunks = 0; | ||
72 | s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; | ||
73 | diff --git a/block/rbd.c b/block/rbd.c | ||
74 | index XXXXXXX..XXXXXXX 100644 | ||
75 | --- a/block/rbd.c | ||
76 | +++ b/block/rbd.c | ||
77 | @@ -XXX,XX +XXX,XX @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, | ||
78 | goto failed_open; | ||
79 | } | ||
80 | |||
81 | - bs->read_only = (s->snap != NULL); | ||
82 | + bdrv_set_read_only(bs, (s->snap != NULL)); | ||
83 | |||
84 | qemu_opts_del(opts); | ||
85 | return 0; | ||
86 | diff --git a/block/vvfat.c b/block/vvfat.c | ||
87 | index XXXXXXX..XXXXXXX 100644 | ||
88 | --- a/block/vvfat.c | ||
89 | +++ b/block/vvfat.c | ||
90 | @@ -XXX,XX +XXX,XX @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, | ||
91 | s->current_cluster=0xffffffff; | ||
92 | |||
93 | /* read only is the default for safety */ | ||
94 | - bs->read_only = true; | ||
95 | + bdrv_set_read_only(bs, true); | ||
96 | s->qcow = NULL; | ||
97 | s->qcow_filename = NULL; | ||
98 | s->fat2 = NULL; | ||
99 | @@ -XXX,XX +XXX,XX @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, | ||
100 | if (ret < 0) { | ||
101 | goto fail; | ||
102 | } | ||
103 | - bs->read_only = false; | ||
104 | + bdrv_set_read_only(bs, false); | ||
105 | } | ||
106 | |||
107 | bs->total_sectors = cyls * heads * secs; | ||
108 | diff --git a/include/block/block.h b/include/block/block.h | ||
109 | index XXXXXXX..XXXXXXX 100644 | ||
110 | --- a/include/block/block.h | ||
111 | +++ b/include/block/block.h | ||
112 | @@ -XXX,XX +XXX,XX @@ int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, | ||
113 | int64_t sector_num, int nb_sectors, int *pnum); | ||
114 | |||
115 | bool bdrv_is_read_only(BlockDriverState *bs); | ||
116 | +void bdrv_set_read_only(BlockDriverState *bs, bool read_only); | ||
117 | bool bdrv_is_sg(BlockDriverState *bs); | ||
118 | bool bdrv_is_inserted(BlockDriverState *bs); | ||
119 | int bdrv_media_changed(BlockDriverState *bs); | ||
120 | -- | ||
121 | 2.9.3 | ||
14 | 122 | ||
15 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/util/aio-posix.c | ||
18 | +++ b/util/aio-posix.c | ||
19 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) | ||
20 | bool progress = false; | ||
21 | AioHandler *node; | ||
22 | |||
23 | - /* | ||
24 | - * Optimization: ->io_poll() handlers often contain RCU read critical | ||
25 | - * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock() | ||
26 | - * -> rcu_read_lock() -> ... sequences with expensive memory | ||
27 | - * synchronization primitives. Make the entire polling loop an RCU | ||
28 | - * critical section because nested rcu_read_lock()/rcu_read_unlock() calls | ||
29 | - * are cheap. | ||
30 | - */ | ||
31 | - RCU_READ_LOCK_GUARD(); | ||
32 | - | ||
33 | QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
34 | if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll && | ||
35 | aio_node_check(ctx, node->is_external) && | ||
36 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
37 | |||
38 | trace_run_poll_handlers_begin(ctx, max_ns, *timeout); | ||
39 | |||
40 | + /* | ||
41 | + * Optimization: ->io_poll() handlers often contain RCU read critical | ||
42 | + * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock() | ||
43 | + * -> rcu_read_lock() -> ... sequences with expensive memory | ||
44 | + * synchronization primitives. Make the entire polling loop an RCU | ||
45 | + * critical section because nested rcu_read_lock()/rcu_read_unlock() calls | ||
46 | + * are cheap. | ||
47 | + */ | ||
48 | + RCU_READ_LOCK_GUARD(); | ||
49 | + | ||
50 | start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); | ||
51 | do { | ||
52 | progress = run_poll_handlers_once(ctx, timeout); | ||
53 | -- | ||
54 | 2.24.1 | ||
55 | 123 | diff view generated by jsdifflib |
1 | When there are many poll handlers it's likely that some of them are idle | 1 | A few block drivers will set the BDS read_only flag from their |
---|---|---|---|
2 | most of the time. Remove handlers that haven't had activity recently so | 2 | .bdrv_open() function. This means the bs->read_only flag could |
3 | that the polling loop scales better for guests with a large number of | 3 | be set after we enable copy_on_read, as the BDRV_O_COPY_ON_READ |
4 | devices. | 4 | flag check occurs prior to the call to bdrv->bdrv_open(). |
5 | 5 | ||
6 | This feature only takes effect for the Linux io_uring fd monitoring | 6 | This adds an error return to bdrv_set_read_only(), and an error will be |
7 | implementation because it is capable of combining fd monitoring with | 7 | returned if we try to set the BDS to read_only while copy_on_read is |
8 | userspace polling. The other implementations can't do that and risk | 8 | enabled. |
9 | starving fds in favor of poll handlers, so don't try this optimization | ||
10 | when they are in use. | ||
11 | 9 | ||
12 | IOPS improves from 10k to 105k when the guest has 100 | 10 | This patch also changes the behavior of vvfat. Before, vvfat could |
13 | virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1 | 11 | override the drive 'readonly' flag with its own, internal 'rw' flag. |
14 | device for rw=randread,iodepth=1,bs=4k,ioengine=libaio on NVMe. | ||
15 | 12 | ||
16 | [Clarified aio_poll_handlers locking discipline explanation in comment | 13 | For instance, this -drive parameter would result in a writable image: |
17 | after discussion with Paolo Bonzini <pbonzini@redhat.com>. | ||
18 | --Stefan] | ||
19 | 14 | ||
20 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 15 | "-drive format=vvfat,dir=/tmp/vvfat,rw,if=virtio,readonly=on" |
21 | Link: https://lore.kernel.org/r/20200305170806.1313245-8-stefanha@redhat.com | 16 | |
22 | Message-Id: <20200305170806.1313245-8-stefanha@redhat.com> | 17 | This is not correct. Now, attempting to use the above -drive parameter |
18 | will result in an error (i.e., 'rw' is incompatible with 'readonly=on'). | ||
19 | |||
20 | Signed-off-by: Jeff Cody <jcody@redhat.com> | ||
21 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
22 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
23 | Message-id: 0c5b4c1cc2c651471b131f21376dfd5ea24d2196.1491597120.git.jcody@redhat.com | ||
23 | --- | 24 | --- |
24 | include/block/aio.h | 8 ++++ | 25 | block.c | 10 +++++++++- |
25 | util/aio-posix.c | 93 +++++++++++++++++++++++++++++++++++++++++---- | 26 | block/bochs.c | 5 ++++- |
26 | util/aio-posix.h | 2 + | 27 | block/cloop.c | 5 ++++- |
27 | util/trace-events | 2 + | 28 | block/dmg.c | 6 +++++- |
28 | 4 files changed, 98 insertions(+), 7 deletions(-) | 29 | block/rbd.c | 11 ++++++++++- |
30 | block/vvfat.c | 19 +++++++++++++++---- | ||
31 | include/block/block.h | 2 +- | ||
32 | 7 files changed, 48 insertions(+), 10 deletions(-) | ||
29 | 33 | ||
30 | diff --git a/include/block/aio.h b/include/block/aio.h | 34 | diff --git a/block.c b/block.c |
31 | index XXXXXXX..XXXXXXX 100644 | 35 | index XXXXXXX..XXXXXXX 100644 |
32 | --- a/include/block/aio.h | 36 | --- a/block.c |
33 | +++ b/include/block/aio.h | 37 | +++ b/block.c |
34 | @@ -XXX,XX +XXX,XX @@ struct AioContext { | 38 | @@ -XXX,XX +XXX,XX @@ void path_combine(char *dest, int dest_size, |
35 | int64_t poll_grow; /* polling time growth factor */ | 39 | } |
36 | int64_t poll_shrink; /* polling time shrink factor */ | 40 | } |
37 | 41 | ||
38 | + /* | 42 | -void bdrv_set_read_only(BlockDriverState *bs, bool read_only) |
39 | + * List of handlers participating in userspace polling. Protected by | 43 | +int bdrv_set_read_only(BlockDriverState *bs, bool read_only, Error **errp) |
40 | + * ctx->list_lock. Iterated and modified mostly by the event loop thread | ||
41 | + * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler() | ||
42 | + * only touches the list to delete nodes if ctx->list_lock's count is zero. | ||
43 | + */ | ||
44 | + AioHandlerList poll_aio_handlers; | ||
45 | + | ||
46 | /* Are we in polling mode or monitoring file descriptors? */ | ||
47 | bool poll_started; | ||
48 | |||
49 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
50 | index XXXXXXX..XXXXXXX 100644 | ||
51 | --- a/util/aio-posix.c | ||
52 | +++ b/util/aio-posix.c | ||
53 | @@ -XXX,XX +XXX,XX @@ | ||
54 | #include "trace.h" | ||
55 | #include "aio-posix.h" | ||
56 | |||
57 | +/* Stop userspace polling on a handler if it isn't active for some time */ | ||
58 | +#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND) | ||
59 | + | ||
60 | bool aio_poll_disabled(AioContext *ctx) | ||
61 | { | 44 | { |
62 | return atomic_read(&ctx->poll_disable_cnt); | 45 | + /* Do not set read_only if copy_on_read is enabled */ |
63 | @@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node) | 46 | + if (bs->copy_on_read && read_only) { |
64 | * deleted because deleted nodes are only cleaned up while | 47 | + error_setg(errp, "Can't set node '%s' to r/o with copy-on-read enabled", |
65 | * no one is walking the handlers list. | 48 | + bdrv_get_device_or_node_name(bs)); |
66 | */ | 49 | + return -EINVAL; |
67 | + QLIST_SAFE_REMOVE(node, node_poll); | ||
68 | QLIST_REMOVE(node, node); | ||
69 | return true; | ||
70 | } | ||
71 | @@ -XXX,XX +XXX,XX @@ static bool poll_set_started(AioContext *ctx, bool started) | ||
72 | ctx->poll_started = started; | ||
73 | |||
74 | qemu_lockcnt_inc(&ctx->list_lock); | ||
75 | - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | ||
76 | + QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) { | ||
77 | IOHandler *fn; | ||
78 | |||
79 | if (QLIST_IS_INSERTED(node, node_deleted)) { | ||
80 | @@ -XXX,XX +XXX,XX @@ static void aio_free_deleted_handlers(AioContext *ctx) | ||
81 | while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) { | ||
82 | QLIST_REMOVE(node, node); | ||
83 | QLIST_REMOVE(node, node_deleted); | ||
84 | + QLIST_SAFE_REMOVE(node, node_poll); | ||
85 | g_free(node); | ||
86 | } | ||
87 | |||
88 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node) | ||
89 | revents = node->pfd.revents & node->pfd.events; | ||
90 | node->pfd.revents = 0; | ||
91 | |||
92 | + /* | ||
93 | + * Start polling AioHandlers when they become ready because activity is | ||
94 | + * likely to continue. Note that starvation is theoretically possible when | ||
95 | + * fdmon_supports_polling(), but only until the fd fires for the first | ||
96 | + * time. | ||
97 | + */ | ||
98 | + if (!QLIST_IS_INSERTED(node, node_deleted) && | ||
99 | + !QLIST_IS_INSERTED(node, node_poll) && | ||
100 | + node->io_poll) { | ||
101 | + trace_poll_add(ctx, node, node->pfd.fd, revents); | ||
102 | + if (ctx->poll_started && node->io_poll_begin) { | ||
103 | + node->io_poll_begin(node->opaque); | ||
104 | + } | ||
105 | + QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll); | ||
106 | + } | 50 | + } |
107 | + | 51 | + |
108 | if (!QLIST_IS_INSERTED(node, node_deleted) && | 52 | bs->read_only = read_only; |
109 | (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) && | 53 | + return 0; |
110 | aio_node_check(ctx, node->is_external) && | ||
111 | @@ -XXX,XX +XXX,XX @@ void aio_dispatch(AioContext *ctx) | ||
112 | timerlistgroup_run_timers(&ctx->tlg); | ||
113 | } | 54 | } |
114 | 55 | ||
115 | -static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) | 56 | void bdrv_get_full_backing_filename_from_filename(const char *backed, |
116 | +static bool run_poll_handlers_once(AioContext *ctx, | 57 | diff --git a/block/bochs.c b/block/bochs.c |
117 | + int64_t now, | 58 | index XXXXXXX..XXXXXXX 100644 |
118 | + int64_t *timeout) | 59 | --- a/block/bochs.c |
119 | { | 60 | +++ b/block/bochs.c |
120 | bool progress = false; | 61 | @@ -XXX,XX +XXX,XX @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags, |
121 | AioHandler *node; | 62 | return -EINVAL; |
122 | + AioHandler *tmp; | 63 | } |
123 | 64 | ||
124 | - QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { | 65 | - bdrv_set_read_only(bs, true); /* no write support yet */ |
125 | - if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll && | 66 | + ret = bdrv_set_read_only(bs, true, errp); /* no write support yet */ |
126 | - aio_node_check(ctx, node->is_external) && | 67 | + if (ret < 0) { |
127 | + QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) { | 68 | + return ret; |
128 | + if (aio_node_check(ctx, node->is_external) && | 69 | + } |
129 | node->io_poll(node->opaque)) { | 70 | |
130 | + node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; | 71 | ret = bdrv_pread(bs->file, 0, &bochs, sizeof(bochs)); |
131 | + | 72 | if (ret < 0) { |
132 | /* | 73 | diff --git a/block/cloop.c b/block/cloop.c |
133 | * Polling was successful, exit try_poll_mode immediately | 74 | index XXXXXXX..XXXXXXX 100644 |
134 | * to adjust the next polling time. | 75 | --- a/block/cloop.c |
135 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) | 76 | +++ b/block/cloop.c |
136 | return progress; | 77 | @@ -XXX,XX +XXX,XX @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, |
137 | } | 78 | return -EINVAL; |
138 | 79 | } | |
139 | +static bool fdmon_supports_polling(AioContext *ctx) | 80 | |
140 | +{ | 81 | - bdrv_set_read_only(bs, true); |
141 | + return ctx->fdmon_ops->need_wait != aio_poll_disabled; | 82 | + ret = bdrv_set_read_only(bs, true, errp); |
142 | +} | 83 | + if (ret < 0) { |
143 | + | 84 | + return ret; |
144 | +static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now) | 85 | + } |
145 | +{ | 86 | |
146 | + AioHandler *node; | 87 | /* read header */ |
147 | + AioHandler *tmp; | 88 | ret = bdrv_pread(bs->file, 128, &s->block_size, 4); |
148 | + bool progress = false; | 89 | diff --git a/block/dmg.c b/block/dmg.c |
149 | + | 90 | index XXXXXXX..XXXXXXX 100644 |
150 | + /* | 91 | --- a/block/dmg.c |
151 | + * File descriptor monitoring implementations without userspace polling | 92 | +++ b/block/dmg.c |
152 | + * support suffer from starvation when a subset of handlers is polled | 93 | @@ -XXX,XX +XXX,XX @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, |
153 | + * because fds will not be processed in a timely fashion. Don't remove | 94 | return -EINVAL; |
154 | + * idle poll handlers. | 95 | } |
155 | + */ | 96 | |
156 | + if (!fdmon_supports_polling(ctx)) { | 97 | + ret = bdrv_set_read_only(bs, true, errp); |
157 | + return false; | 98 | + if (ret < 0) { |
99 | + return ret; | ||
158 | + } | 100 | + } |
159 | + | 101 | + |
160 | + QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) { | 102 | block_module_load_one("dmg-bz2"); |
161 | + if (node->poll_idle_timeout == 0LL) { | 103 | - bdrv_set_read_only(bs, true); |
162 | + node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; | 104 | |
163 | + } else if (now >= node->poll_idle_timeout) { | 105 | s->n_chunks = 0; |
164 | + trace_poll_remove(ctx, node, node->pfd.fd); | 106 | s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; |
165 | + node->poll_idle_timeout = 0LL; | 107 | diff --git a/block/rbd.c b/block/rbd.c |
166 | + QLIST_SAFE_REMOVE(node, node_poll); | 108 | index XXXXXXX..XXXXXXX 100644 |
167 | + if (ctx->poll_started && node->io_poll_end) { | 109 | --- a/block/rbd.c |
168 | + node->io_poll_end(node->opaque); | 110 | +++ b/block/rbd.c |
169 | + | 111 | @@ -XXX,XX +XXX,XX @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, |
170 | + /* | 112 | goto failed_shutdown; |
171 | + * Final poll in case ->io_poll_end() races with an event. | 113 | } |
172 | + * Nevermind about re-adding the handler in the rare case where | 114 | |
173 | + * this causes progress. | 115 | + /* rbd_open is always r/w */ |
174 | + */ | 116 | r = rbd_open(s->io_ctx, s->name, &s->image, s->snap); |
175 | + progress = node->io_poll(node->opaque) || progress; | 117 | if (r < 0) { |
176 | + } | 118 | error_setg_errno(errp, -r, "error reading header from %s", s->name); |
119 | goto failed_open; | ||
120 | } | ||
121 | |||
122 | - bdrv_set_read_only(bs, (s->snap != NULL)); | ||
123 | + /* If we are using an rbd snapshot, we must be r/o, otherwise | ||
124 | + * leave as-is */ | ||
125 | + if (s->snap != NULL) { | ||
126 | + r = bdrv_set_read_only(bs, true, &local_err); | ||
127 | + if (r < 0) { | ||
128 | + error_propagate(errp, local_err); | ||
129 | + goto failed_open; | ||
177 | + } | 130 | + } |
178 | + } | 131 | + } |
179 | + | 132 | |
180 | + return progress; | 133 | qemu_opts_del(opts); |
181 | +} | 134 | return 0; |
182 | + | 135 | diff --git a/block/vvfat.c b/block/vvfat.c |
183 | /* run_poll_handlers: | ||
184 | * @ctx: the AioContext | ||
185 | * @max_ns: maximum time to poll for, in nanoseconds | ||
186 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
187 | |||
188 | start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); | ||
189 | do { | ||
190 | - progress = run_poll_handlers_once(ctx, timeout); | ||
191 | + progress = run_poll_handlers_once(ctx, start_time, timeout); | ||
192 | elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time; | ||
193 | max_ns = qemu_soonest_timeout(*timeout, max_ns); | ||
194 | assert(!(max_ns && progress)); | ||
195 | } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx)); | ||
196 | |||
197 | + if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) { | ||
198 | + *timeout = 0; | ||
199 | + progress = true; | ||
200 | + } | ||
201 | + | ||
202 | /* If time has passed with no successful polling, adjust *timeout to | ||
203 | * keep the same ending time. | ||
204 | */ | ||
205 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
206 | */ | ||
207 | static bool try_poll_mode(AioContext *ctx, int64_t *timeout) | ||
208 | { | ||
209 | - int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); | ||
210 | + int64_t max_ns; | ||
211 | + | ||
212 | + if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) { | ||
213 | + return false; | ||
214 | + } | ||
215 | |||
216 | + max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); | ||
217 | if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) { | ||
218 | poll_set_started(ctx, true); | ||
219 | |||
220 | diff --git a/util/aio-posix.h b/util/aio-posix.h | ||
221 | index XXXXXXX..XXXXXXX 100644 | 136 | index XXXXXXX..XXXXXXX 100644 |
222 | --- a/util/aio-posix.h | 137 | --- a/block/vvfat.c |
223 | +++ b/util/aio-posix.h | 138 | +++ b/block/vvfat.c |
224 | @@ -XXX,XX +XXX,XX @@ struct AioHandler { | 139 | @@ -XXX,XX +XXX,XX @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, |
225 | QLIST_ENTRY(AioHandler) node; | 140 | |
226 | QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ | 141 | s->current_cluster=0xffffffff; |
227 | QLIST_ENTRY(AioHandler) node_deleted; | 142 | |
228 | + QLIST_ENTRY(AioHandler) node_poll; | 143 | - /* read only is the default for safety */ |
229 | #ifdef CONFIG_LINUX_IO_URING | 144 | - bdrv_set_read_only(bs, true); |
230 | QSLIST_ENTRY(AioHandler) node_submitted; | 145 | s->qcow = NULL; |
231 | unsigned flags; /* see fdmon-io_uring.c */ | 146 | s->qcow_filename = NULL; |
232 | #endif | 147 | s->fat2 = NULL; |
233 | + int64_t poll_idle_timeout; /* when to stop userspace polling */ | 148 | @@ -XXX,XX +XXX,XX @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, |
234 | bool is_external; | 149 | s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1); |
235 | }; | 150 | |
236 | 151 | if (qemu_opt_get_bool(opts, "rw", false)) { | |
237 | diff --git a/util/trace-events b/util/trace-events | 152 | - ret = enable_write_target(bs, errp); |
153 | + if (!bdrv_is_read_only(bs)) { | ||
154 | + ret = enable_write_target(bs, errp); | ||
155 | + if (ret < 0) { | ||
156 | + goto fail; | ||
157 | + } | ||
158 | + } else { | ||
159 | + ret = -EPERM; | ||
160 | + error_setg(errp, | ||
161 | + "Unable to set VVFAT to 'rw' when drive is read-only"); | ||
162 | + goto fail; | ||
163 | + } | ||
164 | + } else { | ||
165 | + /* read only is the default for safety */ | ||
166 | + ret = bdrv_set_read_only(bs, true, &local_err); | ||
167 | if (ret < 0) { | ||
168 | + error_propagate(errp, local_err); | ||
169 | goto fail; | ||
170 | } | ||
171 | - bdrv_set_read_only(bs, false); | ||
172 | } | ||
173 | |||
174 | bs->total_sectors = cyls * heads * secs; | ||
175 | diff --git a/include/block/block.h b/include/block/block.h | ||
238 | index XXXXXXX..XXXXXXX 100644 | 176 | index XXXXXXX..XXXXXXX 100644 |
239 | --- a/util/trace-events | 177 | --- a/include/block/block.h |
240 | +++ b/util/trace-events | 178 | +++ b/include/block/block.h |
241 | @@ -XXX,XX +XXX,XX @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_ | 179 | @@ -XXX,XX +XXX,XX @@ int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, |
242 | run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64 | 180 | int64_t sector_num, int nb_sectors, int *pnum); |
243 | poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 | 181 | |
244 | poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 | 182 | bool bdrv_is_read_only(BlockDriverState *bs); |
245 | +poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x" | 183 | -void bdrv_set_read_only(BlockDriverState *bs, bool read_only); |
246 | +poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d" | 184 | +int bdrv_set_read_only(BlockDriverState *bs, bool read_only, Error **errp); |
247 | 185 | bool bdrv_is_sg(BlockDriverState *bs); | |
248 | # async.c | 186 | bool bdrv_is_inserted(BlockDriverState *bs); |
249 | aio_co_schedule(void *ctx, void *co) "ctx %p co %p" | 187 | int bdrv_media_changed(BlockDriverState *bs); |
250 | -- | 188 | -- |
251 | 2.24.1 | 189 | 2.9.3 |
252 | 190 | ||
191 | diff view generated by jsdifflib |
1 | The AioHandler *node, bool is_new arguments are more complicated to | 1 | The BDRV_O_ALLOW_RDWR flag allows / prohibits the changing of |
---|---|---|---|
2 | think about than simply being given AioHandler *old_node, AioHandler | 2 | the BDS 'read_only' state, but there are a few places where it |
3 | *new_node. | 3 | is ignored. In the bdrv_set_read_only() helper, make sure to |
4 | honor the flag. | ||
4 | 5 | ||
5 | Furthermore, the new Linux io_uring file descriptor monitoring mechanism | 6 | Signed-off-by: Jeff Cody <jcody@redhat.com> |
6 | added by the next patch requires access to both the old and the new | 7 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
7 | nodes. Make this change now in preparation. | 8 | Reviewed-by: John Snow <jsnow@redhat.com> |
9 | Message-id: be2e5fb2d285cbece2b6d06bed54a6f56520d251.1491597120.git.jcody@redhat.com | ||
10 | --- | ||
11 | block.c | 7 +++++++ | ||
12 | 1 file changed, 7 insertions(+) | ||
8 | 13 | ||
9 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 14 | diff --git a/block.c b/block.c |
10 | Link: https://lore.kernel.org/r/20200305170806.1313245-5-stefanha@redhat.com | ||
11 | Message-Id: <20200305170806.1313245-5-stefanha@redhat.com> | ||
12 | --- | ||
13 | include/block/aio.h | 13 ++++++------- | ||
14 | util/aio-posix.c | 7 +------ | ||
15 | util/fdmon-epoll.c | 21 ++++++++++++--------- | ||
16 | util/fdmon-poll.c | 4 +++- | ||
17 | 4 files changed, 22 insertions(+), 23 deletions(-) | ||
18 | |||
19 | diff --git a/include/block/aio.h b/include/block/aio.h | ||
20 | index XXXXXXX..XXXXXXX 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
21 | --- a/include/block/aio.h | 16 | --- a/block.c |
22 | +++ b/include/block/aio.h | 17 | +++ b/block.c |
23 | @@ -XXX,XX +XXX,XX @@ typedef struct { | 18 | @@ -XXX,XX +XXX,XX @@ int bdrv_set_read_only(BlockDriverState *bs, bool read_only, Error **errp) |
24 | /* | 19 | return -EINVAL; |
25 | * update: | ||
26 | * @ctx: the AioContext | ||
27 | - * @node: the handler | ||
28 | - * @is_new: is the file descriptor already being monitored? | ||
29 | + * @old_node: the existing handler or NULL if this file descriptor is being | ||
30 | + * monitored for the first time | ||
31 | + * @new_node: the new handler or NULL if this file descriptor is being | ||
32 | + * removed | ||
33 | * | ||
34 | - * Add/remove/modify a monitored file descriptor. There are three cases: | ||
35 | - * 1. node->pfd.events == 0 means remove the file descriptor. | ||
36 | - * 2. !is_new means modify an already monitored file descriptor. | ||
37 | - * 3. is_new means add a new file descriptor. | ||
38 | + * Add/remove/modify a monitored file descriptor. | ||
39 | * | ||
40 | * Called with ctx->list_lock acquired. | ||
41 | */ | ||
42 | - void (*update)(AioContext *ctx, AioHandler *node, bool is_new); | ||
43 | + void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node); | ||
44 | |||
45 | /* | ||
46 | * wait: | ||
47 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
48 | index XXXXXXX..XXXXXXX 100644 | ||
49 | --- a/util/aio-posix.c | ||
50 | +++ b/util/aio-posix.c | ||
51 | @@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx, | ||
52 | atomic_set(&ctx->poll_disable_cnt, | ||
53 | atomic_read(&ctx->poll_disable_cnt) + poll_disable_change); | ||
54 | |||
55 | - if (new_node) { | ||
56 | - ctx->fdmon_ops->update(ctx, new_node, is_new); | ||
57 | - } else if (node) { | ||
58 | - /* Unregister deleted fd_handler */ | ||
59 | - ctx->fdmon_ops->update(ctx, node, false); | ||
60 | - } | ||
61 | + ctx->fdmon_ops->update(ctx, node, new_node); | ||
62 | qemu_lockcnt_unlock(&ctx->list_lock); | ||
63 | aio_notify(ctx); | ||
64 | |||
65 | diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c | ||
66 | index XXXXXXX..XXXXXXX 100644 | ||
67 | --- a/util/fdmon-epoll.c | ||
68 | +++ b/util/fdmon-epoll.c | ||
69 | @@ -XXX,XX +XXX,XX @@ static inline int epoll_events_from_pfd(int pfd_events) | ||
70 | (pfd_events & G_IO_ERR ? EPOLLERR : 0); | ||
71 | } | ||
72 | |||
73 | -static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) | ||
74 | +static void fdmon_epoll_update(AioContext *ctx, | ||
75 | + AioHandler *old_node, | ||
76 | + AioHandler *new_node) | ||
77 | { | ||
78 | - struct epoll_event event; | ||
79 | + struct epoll_event event = { | ||
80 | + .data.ptr = new_node, | ||
81 | + .events = new_node ? epoll_events_from_pfd(new_node->pfd.events) : 0, | ||
82 | + }; | ||
83 | int r; | ||
84 | - int ctl; | ||
85 | |||
86 | - if (!node->pfd.events) { | ||
87 | - ctl = EPOLL_CTL_DEL; | ||
88 | + if (!new_node) { | ||
89 | + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, &event); | ||
90 | + } else if (!old_node) { | ||
91 | + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, new_node->pfd.fd, &event); | ||
92 | } else { | ||
93 | - event.data.ptr = node; | ||
94 | - event.events = epoll_events_from_pfd(node->pfd.events); | ||
95 | - ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD; | ||
96 | + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, new_node->pfd.fd, &event); | ||
97 | } | 20 | } |
98 | 21 | ||
99 | - r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event); | 22 | + /* Do not clear read_only if it is prohibited */ |
100 | if (r) { | 23 | + if (!read_only && !(bs->open_flags & BDRV_O_ALLOW_RDWR)) { |
101 | fdmon_epoll_disable(ctx); | 24 | + error_setg(errp, "Node '%s' is read only", |
102 | } | 25 | + bdrv_get_device_or_node_name(bs)); |
103 | diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c | 26 | + return -EPERM; |
104 | index XXXXXXX..XXXXXXX 100644 | 27 | + } |
105 | --- a/util/fdmon-poll.c | 28 | + |
106 | +++ b/util/fdmon-poll.c | 29 | bs->read_only = read_only; |
107 | @@ -XXX,XX +XXX,XX @@ static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list, | 30 | return 0; |
108 | return ret; | ||
109 | } | ||
110 | |||
111 | -static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new) | ||
112 | +static void fdmon_poll_update(AioContext *ctx, | ||
113 | + AioHandler *old_node, | ||
114 | + AioHandler *new_node) | ||
115 | { | ||
116 | /* Do nothing, AioHandler already contains the state we'll need */ | ||
117 | } | 31 | } |
118 | -- | 32 | -- |
119 | 2.24.1 | 33 | 2.9.3 |
120 | 34 | ||
35 | diff view generated by jsdifflib |
1 | Unlike ppoll(2) and epoll(7), Linux io_uring completions can be polled | 1 | Move bdrv_is_read_only() up with its friends. |
---|---|---|---|
2 | from userspace. Previously userspace polling was only allowed when all | ||
3 | AioHandlers had an ->io_poll() callback. This prevented starvation of | ||
4 | fds by userspace pollable handlers. | ||
5 | 2 | ||
6 | Add the FDMonOps->need_wait() callback that enables userspace polling | 3 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
7 | even when some AioHandlers lack ->io_poll(). | 4 | Reviewed-by: John Snow <jsnow@redhat.com> |
5 | Signed-off-by: Jeff Cody <jcody@redhat.com> | ||
6 | Message-id: 73b2399459760c32506f9407efb9dddb3a2789de.1491597120.git.jcody@redhat.com | ||
7 | --- | ||
8 | block.c | 10 +++++----- | ||
9 | 1 file changed, 5 insertions(+), 5 deletions(-) | ||
8 | 10 | ||
9 | For example, it's now possible to do userspace polling when a TCP/IP | 11 | diff --git a/block.c b/block.c |
10 | socket is monitored thanks to Linux io_uring. | ||
11 | |||
12 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
13 | Link: https://lore.kernel.org/r/20200305170806.1313245-7-stefanha@redhat.com | ||
14 | Message-Id: <20200305170806.1313245-7-stefanha@redhat.com> | ||
15 | --- | ||
16 | include/block/aio.h | 19 +++++++++++++++++++ | ||
17 | util/aio-posix.c | 11 ++++++++--- | ||
18 | util/fdmon-epoll.c | 1 + | ||
19 | util/fdmon-io_uring.c | 6 ++++++ | ||
20 | util/fdmon-poll.c | 1 + | ||
21 | 5 files changed, 35 insertions(+), 3 deletions(-) | ||
22 | |||
23 | diff --git a/include/block/aio.h b/include/block/aio.h | ||
24 | index XXXXXXX..XXXXXXX 100644 | 12 | index XXXXXXX..XXXXXXX 100644 |
25 | --- a/include/block/aio.h | 13 | --- a/block.c |
26 | +++ b/include/block/aio.h | 14 | +++ b/block.c |
27 | @@ -XXX,XX +XXX,XX @@ struct ThreadPool; | 15 | @@ -XXX,XX +XXX,XX @@ void path_combine(char *dest, int dest_size, |
28 | struct LinuxAioState; | 16 | } |
29 | struct LuringState; | 17 | } |
30 | 18 | ||
31 | +/* Is polling disabled? */ | 19 | +bool bdrv_is_read_only(BlockDriverState *bs) |
32 | +bool aio_poll_disabled(AioContext *ctx); | ||
33 | + | ||
34 | /* Callbacks for file descriptor monitoring implementations */ | ||
35 | typedef struct { | ||
36 | /* | ||
37 | @@ -XXX,XX +XXX,XX @@ typedef struct { | ||
38 | * Returns: number of ready file descriptors. | ||
39 | */ | ||
40 | int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout); | ||
41 | + | ||
42 | + /* | ||
43 | + * need_wait: | ||
44 | + * @ctx: the AioContext | ||
45 | + * | ||
46 | + * Tell aio_poll() when to stop userspace polling early because ->wait() | ||
47 | + * has fds ready. | ||
48 | + * | ||
49 | + * File descriptor monitoring implementations that cannot poll fd readiness | ||
50 | + * from userspace should use aio_poll_disabled() here. This ensures that | ||
51 | + * file descriptors are not starved by handlers that frequently make | ||
52 | + * progress via userspace polling. | ||
53 | + * | ||
54 | + * Returns: true if ->wait() should be called, false otherwise. | ||
55 | + */ | ||
56 | + bool (*need_wait)(AioContext *ctx); | ||
57 | } FDMonOps; | ||
58 | |||
59 | /* | ||
60 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
61 | index XXXXXXX..XXXXXXX 100644 | ||
62 | --- a/util/aio-posix.c | ||
63 | +++ b/util/aio-posix.c | ||
64 | @@ -XXX,XX +XXX,XX @@ | ||
65 | #include "trace.h" | ||
66 | #include "aio-posix.h" | ||
67 | |||
68 | +bool aio_poll_disabled(AioContext *ctx) | ||
69 | +{ | 20 | +{ |
70 | + return atomic_read(&ctx->poll_disable_cnt); | 21 | + return bs->read_only; |
71 | +} | 22 | +} |
72 | + | 23 | + |
73 | void aio_add_ready_handler(AioHandlerList *ready_list, | 24 | int bdrv_set_read_only(BlockDriverState *bs, bool read_only, Error **errp) |
74 | AioHandler *node, | ||
75 | int revents) | ||
76 | @@ -XXX,XX +XXX,XX @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) | ||
77 | elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time; | ||
78 | max_ns = qemu_soonest_timeout(*timeout, max_ns); | ||
79 | assert(!(max_ns && progress)); | ||
80 | - } while (elapsed_time < max_ns && !atomic_read(&ctx->poll_disable_cnt)); | ||
81 | + } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx)); | ||
82 | |||
83 | /* If time has passed with no successful polling, adjust *timeout to | ||
84 | * keep the same ending time. | ||
85 | @@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) | ||
86 | { | 25 | { |
87 | int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); | 26 | /* Do not set read_only if copy_on_read is enabled */ |
88 | 27 | @@ -XXX,XX +XXX,XX @@ void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) | |
89 | - if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) { | 28 | *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors; |
90 | + if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) { | ||
91 | poll_set_started(ctx, true); | ||
92 | |||
93 | if (run_poll_handlers(ctx, max_ns, timeout)) { | ||
94 | @@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking) | ||
95 | /* If polling is allowed, non-blocking aio_poll does not need the | ||
96 | * system call---a single round of run_poll_handlers_once suffices. | ||
97 | */ | ||
98 | - if (timeout || atomic_read(&ctx->poll_disable_cnt)) { | ||
99 | + if (timeout || ctx->fdmon_ops->need_wait(ctx)) { | ||
100 | ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout); | ||
101 | } | ||
102 | |||
103 | diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c | ||
104 | index XXXXXXX..XXXXXXX 100644 | ||
105 | --- a/util/fdmon-epoll.c | ||
106 | +++ b/util/fdmon-epoll.c | ||
107 | @@ -XXX,XX +XXX,XX @@ out: | ||
108 | static const FDMonOps fdmon_epoll_ops = { | ||
109 | .update = fdmon_epoll_update, | ||
110 | .wait = fdmon_epoll_wait, | ||
111 | + .need_wait = aio_poll_disabled, | ||
112 | }; | ||
113 | |||
114 | static bool fdmon_epoll_try_enable(AioContext *ctx) | ||
115 | diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c | ||
116 | index XXXXXXX..XXXXXXX 100644 | ||
117 | --- a/util/fdmon-io_uring.c | ||
118 | +++ b/util/fdmon-io_uring.c | ||
119 | @@ -XXX,XX +XXX,XX @@ static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list, | ||
120 | return process_cq_ring(ctx, ready_list); | ||
121 | } | 29 | } |
122 | 30 | ||
123 | +static bool fdmon_io_uring_need_wait(AioContext *ctx) | 31 | -bool bdrv_is_read_only(BlockDriverState *bs) |
124 | +{ | 32 | -{ |
125 | + return io_uring_cq_ready(&ctx->fdmon_io_uring); | 33 | - return bs->read_only; |
126 | +} | 34 | -} |
127 | + | 35 | - |
128 | static const FDMonOps fdmon_io_uring_ops = { | 36 | bool bdrv_is_sg(BlockDriverState *bs) |
129 | .update = fdmon_io_uring_update, | 37 | { |
130 | .wait = fdmon_io_uring_wait, | 38 | return bs->sg; |
131 | + .need_wait = fdmon_io_uring_need_wait, | ||
132 | }; | ||
133 | |||
134 | bool fdmon_io_uring_setup(AioContext *ctx) | ||
135 | diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c | ||
136 | index XXXXXXX..XXXXXXX 100644 | ||
137 | --- a/util/fdmon-poll.c | ||
138 | +++ b/util/fdmon-poll.c | ||
139 | @@ -XXX,XX +XXX,XX @@ static void fdmon_poll_update(AioContext *ctx, | ||
140 | const FDMonOps fdmon_poll_ops = { | ||
141 | .update = fdmon_poll_update, | ||
142 | .wait = fdmon_poll_wait, | ||
143 | + .need_wait = aio_poll_disabled, | ||
144 | }; | ||
145 | -- | 39 | -- |
146 | 2.24.1 | 40 | 2.9.3 |
147 | 41 | ||
42 | diff view generated by jsdifflib |
1 | Do not leave stale linked list pointers around after removal. It's | 1 | Introduce check function for setting read_only flags. Will return < 0 on |
---|---|---|---|
2 | safer to set them to NULL so that use-after-removal results in an | 2 | error, with appropriate Error value set. Does not alter any flags. |
3 | immediate segfault. | ||
4 | 3 | ||
5 | The RCU queue removal macros are unchanged since nodes may still be | 4 | Signed-off-by: Jeff Cody <jcody@redhat.com> |
6 | traversed after removal. | 5 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
6 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
7 | Message-id: e2bba34ac3bc76a0c42adc390413f358ae0566e8.1491597120.git.jcody@redhat.com | ||
8 | --- | ||
9 | block.c | 14 +++++++++++++- | ||
10 | include/block/block.h | 1 + | ||
11 | 2 files changed, 14 insertions(+), 1 deletion(-) | ||
7 | 12 | ||
8 | Suggested-by: Paolo Bonzini <pbonzini@redhat.com> | 13 | diff --git a/block.c b/block.c |
9 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 14 | index XXXXXXX..XXXXXXX 100644 |
10 | Link: https://lore.kernel.org/r/20200224103406.1894923-2-stefanha@redhat.com | 15 | --- a/block.c |
11 | Message-Id: <20200224103406.1894923-2-stefanha@redhat.com> | 16 | +++ b/block.c |
12 | --- | 17 | @@ -XXX,XX +XXX,XX @@ bool bdrv_is_read_only(BlockDriverState *bs) |
13 | include/qemu/queue.h | 19 +++++++++++++++---- | 18 | return bs->read_only; |
14 | 1 file changed, 15 insertions(+), 4 deletions(-) | 19 | } |
20 | |||
21 | -int bdrv_set_read_only(BlockDriverState *bs, bool read_only, Error **errp) | ||
22 | +int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only, Error **errp) | ||
23 | { | ||
24 | /* Do not set read_only if copy_on_read is enabled */ | ||
25 | if (bs->copy_on_read && read_only) { | ||
26 | @@ -XXX,XX +XXX,XX @@ int bdrv_set_read_only(BlockDriverState *bs, bool read_only, Error **errp) | ||
27 | return -EPERM; | ||
28 | } | ||
29 | |||
30 | + return 0; | ||
31 | +} | ||
32 | + | ||
33 | +int bdrv_set_read_only(BlockDriverState *bs, bool read_only, Error **errp) | ||
34 | +{ | ||
35 | + int ret = 0; | ||
36 | + | ||
37 | + ret = bdrv_can_set_read_only(bs, read_only, errp); | ||
38 | + if (ret < 0) { | ||
39 | + return ret; | ||
40 | + } | ||
41 | + | ||
42 | bs->read_only = read_only; | ||
43 | return 0; | ||
44 | } | ||
45 | diff --git a/include/block/block.h b/include/block/block.h | ||
46 | index XXXXXXX..XXXXXXX 100644 | ||
47 | --- a/include/block/block.h | ||
48 | +++ b/include/block/block.h | ||
49 | @@ -XXX,XX +XXX,XX @@ int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, | ||
50 | int64_t sector_num, int nb_sectors, int *pnum); | ||
51 | |||
52 | bool bdrv_is_read_only(BlockDriverState *bs); | ||
53 | +int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only, Error **errp); | ||
54 | int bdrv_set_read_only(BlockDriverState *bs, bool read_only, Error **errp); | ||
55 | bool bdrv_is_sg(BlockDriverState *bs); | ||
56 | bool bdrv_is_inserted(BlockDriverState *bs); | ||
57 | -- | ||
58 | 2.9.3 | ||
15 | 59 | ||
16 | diff --git a/include/qemu/queue.h b/include/qemu/queue.h | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/include/qemu/queue.h | ||
19 | +++ b/include/qemu/queue.h | ||
20 | @@ -XXX,XX +XXX,XX @@ struct { \ | ||
21 | (elm)->field.le_next->field.le_prev = \ | ||
22 | (elm)->field.le_prev; \ | ||
23 | *(elm)->field.le_prev = (elm)->field.le_next; \ | ||
24 | + (elm)->field.le_next = NULL; \ | ||
25 | + (elm)->field.le_prev = NULL; \ | ||
26 | } while (/*CONSTCOND*/0) | ||
27 | |||
28 | /* | ||
29 | @@ -XXX,XX +XXX,XX @@ struct { \ | ||
30 | } while (/*CONSTCOND*/0) | ||
31 | |||
32 | #define QSLIST_REMOVE_HEAD(head, field) do { \ | ||
33 | - (head)->slh_first = (head)->slh_first->field.sle_next; \ | ||
34 | + typeof((head)->slh_first) elm = (head)->slh_first; \ | ||
35 | + (head)->slh_first = elm->field.sle_next; \ | ||
36 | + elm->field.sle_next = NULL; \ | ||
37 | } while (/*CONSTCOND*/0) | ||
38 | |||
39 | #define QSLIST_REMOVE_AFTER(slistelm, field) do { \ | ||
40 | - (slistelm)->field.sle_next = \ | ||
41 | - QSLIST_NEXT(QSLIST_NEXT((slistelm), field), field); \ | ||
42 | + typeof(slistelm) next = (slistelm)->field.sle_next; \ | ||
43 | + (slistelm)->field.sle_next = next->field.sle_next; \ | ||
44 | + next->field.sle_next = NULL; \ | ||
45 | } while (/*CONSTCOND*/0) | ||
46 | |||
47 | #define QSLIST_REMOVE(head, elm, type, field) do { \ | ||
48 | @@ -XXX,XX +XXX,XX @@ struct { \ | ||
49 | while (curelm->field.sle_next != (elm)) \ | ||
50 | curelm = curelm->field.sle_next; \ | ||
51 | curelm->field.sle_next = curelm->field.sle_next->field.sle_next; \ | ||
52 | + (elm)->field.sle_next = NULL; \ | ||
53 | } \ | ||
54 | } while (/*CONSTCOND*/0) | ||
55 | |||
56 | @@ -XXX,XX +XXX,XX @@ struct { \ | ||
57 | } while (/*CONSTCOND*/0) | ||
58 | |||
59 | #define QSIMPLEQ_REMOVE_HEAD(head, field) do { \ | ||
60 | - if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL)\ | ||
61 | + typeof((head)->sqh_first) elm = (head)->sqh_first; \ | ||
62 | + if (((head)->sqh_first = elm->field.sqe_next) == NULL) \ | ||
63 | (head)->sqh_last = &(head)->sqh_first; \ | ||
64 | + elm->field.sqe_next = NULL; \ | ||
65 | } while (/*CONSTCOND*/0) | ||
66 | |||
67 | #define QSIMPLEQ_SPLIT_AFTER(head, elm, field, removed) do { \ | ||
68 | @@ -XXX,XX +XXX,XX @@ struct { \ | ||
69 | if ((curelm->field.sqe_next = \ | ||
70 | curelm->field.sqe_next->field.sqe_next) == NULL) \ | ||
71 | (head)->sqh_last = &(curelm)->field.sqe_next; \ | ||
72 | + (elm)->field.sqe_next = NULL; \ | ||
73 | } \ | ||
74 | } while (/*CONSTCOND*/0) | ||
75 | |||
76 | @@ -XXX,XX +XXX,XX @@ union { \ | ||
77 | (head)->tqh_circ.tql_prev = (elm)->field.tqe_circ.tql_prev; \ | ||
78 | (elm)->field.tqe_circ.tql_prev->tql_next = (elm)->field.tqe_next; \ | ||
79 | (elm)->field.tqe_circ.tql_prev = NULL; \ | ||
80 | + (elm)->field.tqe_circ.tql_next = NULL; \ | ||
81 | + (elm)->field.tqe_next = NULL; \ | ||
82 | } while (/*CONSTCOND*/0) | ||
83 | |||
84 | /* remove @left, @right and all elements in between from @head */ | ||
85 | -- | ||
86 | 2.24.1 | ||
87 | 60 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Signed-off-by: Jeff Cody <jcody@redhat.com> | ||
2 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
3 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
4 | Message-id: 00aed7ffdd7be4b9ed9ce1007d50028a72b34ebe.1491597120.git.jcody@redhat.com | ||
5 | --- | ||
6 | block.c | 14 ++++++++------ | ||
7 | 1 file changed, 8 insertions(+), 6 deletions(-) | ||
1 | 8 | ||
9 | diff --git a/block.c b/block.c | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/block.c | ||
12 | +++ b/block.c | ||
13 | @@ -XXX,XX +XXX,XX @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, | ||
14 | BlockDriver *drv; | ||
15 | QemuOpts *opts; | ||
16 | const char *value; | ||
17 | + bool read_only; | ||
18 | |||
19 | assert(reopen_state != NULL); | ||
20 | assert(reopen_state->bs->drv != NULL); | ||
21 | @@ -XXX,XX +XXX,XX @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, | ||
22 | qdict_put(reopen_state->options, "driver", qstring_from_str(value)); | ||
23 | } | ||
24 | |||
25 | - /* if we are to stay read-only, do not allow permission change | ||
26 | - * to r/w */ | ||
27 | - if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) && | ||
28 | - reopen_state->flags & BDRV_O_RDWR) { | ||
29 | - error_setg(errp, "Node '%s' is read only", | ||
30 | - bdrv_get_device_or_node_name(reopen_state->bs)); | ||
31 | + /* If we are to stay read-only, do not allow permission change | ||
32 | + * to r/w. Attempting to set to r/w may fail if either BDRV_O_ALLOW_RDWR is | ||
33 | + * not set, or if the BDS still has copy_on_read enabled */ | ||
34 | + read_only = !(reopen_state->flags & BDRV_O_RDWR); | ||
35 | + ret = bdrv_can_set_read_only(reopen_state->bs, read_only, &local_err); | ||
36 | + if (local_err) { | ||
37 | + error_propagate(errp, local_err); | ||
38 | goto error; | ||
39 | } | ||
40 | |||
41 | -- | ||
42 | 2.9.3 | ||
43 | |||
44 | diff view generated by jsdifflib |
1 | QLIST_SAFE_REMOVE() is confusing here because the node must be on the | 1 | Update 'clientname' to be 'user', which tracks better with both |
---|---|---|---|
2 | list. We actually just wanted to clear the linked list pointers when | 2 | the QAPI and rados variable naming. |
3 | removing it from the list. QLIST_REMOVE() now does this, so switch to | ||
4 | it. | ||
5 | 3 | ||
6 | Suggested-by: Paolo Bonzini <pbonzini@redhat.com> | 4 | Update 'name' to be 'image_name', as it indicates the rbd image. |
7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 5 | Naming it 'image' would have been ideal, but we are using that for |
8 | Link: https://lore.kernel.org/r/20200224103406.1894923-3-stefanha@redhat.com | 6 | the rados_image_t value returned by rbd_open(). |
9 | Message-Id: <20200224103406.1894923-3-stefanha@redhat.com> | 7 | |
8 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
9 | Signed-off-by: Jeff Cody <jcody@redhat.com> | ||
10 | Reviewed-by: John Snow <jsnow@redhat.com> | ||
11 | Message-id: b7ec1fb2e1cf36f9b6911631447a5b0422590b7d.1491597120.git.jcody@redhat.com | ||
10 | --- | 12 | --- |
11 | util/aio-posix.c | 2 +- | 13 | block/rbd.c | 33 +++++++++++++++++---------------- |
12 | 1 file changed, 1 insertion(+), 1 deletion(-) | 14 | 1 file changed, 17 insertions(+), 16 deletions(-) |
13 | 15 | ||
14 | diff --git a/util/aio-posix.c b/util/aio-posix.c | 16 | diff --git a/block/rbd.c b/block/rbd.c |
15 | index XXXXXXX..XXXXXXX 100644 | 17 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/util/aio-posix.c | 18 | --- a/block/rbd.c |
17 | +++ b/util/aio-posix.c | 19 | +++ b/block/rbd.c |
18 | @@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_ready_handlers(AioContext *ctx, | 20 | @@ -XXX,XX +XXX,XX @@ typedef struct BDRVRBDState { |
19 | AioHandler *node; | 21 | rados_t cluster; |
20 | 22 | rados_ioctx_t io_ctx; | |
21 | while ((node = QLIST_FIRST(ready_list))) { | 23 | rbd_image_t image; |
22 | - QLIST_SAFE_REMOVE(node, node_ready); | 24 | - char *name; |
23 | + QLIST_REMOVE(node, node_ready); | 25 | + char *image_name; |
24 | progress = aio_dispatch_handler(ctx, node) || progress; | 26 | char *snap; |
27 | } BDRVRBDState; | ||
28 | |||
29 | @@ -XXX,XX +XXX,XX @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) | ||
30 | int64_t bytes = 0; | ||
31 | int64_t objsize; | ||
32 | int obj_order = 0; | ||
33 | - const char *pool, *name, *conf, *clientname, *keypairs; | ||
34 | + const char *pool, *image_name, *conf, *user, *keypairs; | ||
35 | const char *secretid; | ||
36 | rados_t cluster; | ||
37 | rados_ioctx_t io_ctx; | ||
38 | @@ -XXX,XX +XXX,XX @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) | ||
39 | */ | ||
40 | pool = qdict_get_try_str(options, "pool"); | ||
41 | conf = qdict_get_try_str(options, "conf"); | ||
42 | - clientname = qdict_get_try_str(options, "user"); | ||
43 | - name = qdict_get_try_str(options, "image"); | ||
44 | + user = qdict_get_try_str(options, "user"); | ||
45 | + image_name = qdict_get_try_str(options, "image"); | ||
46 | keypairs = qdict_get_try_str(options, "=keyvalue-pairs"); | ||
47 | |||
48 | - ret = rados_create(&cluster, clientname); | ||
49 | + ret = rados_create(&cluster, user); | ||
50 | if (ret < 0) { | ||
51 | error_setg_errno(errp, -ret, "error initializing"); | ||
52 | goto exit; | ||
53 | @@ -XXX,XX +XXX,XX @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) | ||
54 | goto shutdown; | ||
25 | } | 55 | } |
26 | 56 | ||
57 | - ret = rbd_create(io_ctx, name, bytes, &obj_order); | ||
58 | + ret = rbd_create(io_ctx, image_name, bytes, &obj_order); | ||
59 | if (ret < 0) { | ||
60 | error_setg_errno(errp, -ret, "error rbd create"); | ||
61 | } | ||
62 | @@ -XXX,XX +XXX,XX @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, | ||
63 | Error **errp) | ||
64 | { | ||
65 | BDRVRBDState *s = bs->opaque; | ||
66 | - const char *pool, *snap, *conf, *clientname, *name, *keypairs; | ||
67 | + const char *pool, *snap, *conf, *user, *image_name, *keypairs; | ||
68 | const char *secretid; | ||
69 | QemuOpts *opts; | ||
70 | Error *local_err = NULL; | ||
71 | @@ -XXX,XX +XXX,XX @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, | ||
72 | pool = qemu_opt_get(opts, "pool"); | ||
73 | conf = qemu_opt_get(opts, "conf"); | ||
74 | snap = qemu_opt_get(opts, "snapshot"); | ||
75 | - clientname = qemu_opt_get(opts, "user"); | ||
76 | - name = qemu_opt_get(opts, "image"); | ||
77 | + user = qemu_opt_get(opts, "user"); | ||
78 | + image_name = qemu_opt_get(opts, "image"); | ||
79 | keypairs = qemu_opt_get(opts, "=keyvalue-pairs"); | ||
80 | |||
81 | - if (!pool || !name) { | ||
82 | + if (!pool || !image_name) { | ||
83 | error_setg(errp, "Parameters 'pool' and 'image' are required"); | ||
84 | r = -EINVAL; | ||
85 | goto failed_opts; | ||
86 | } | ||
87 | |||
88 | - r = rados_create(&s->cluster, clientname); | ||
89 | + r = rados_create(&s->cluster, user); | ||
90 | if (r < 0) { | ||
91 | error_setg_errno(errp, -r, "error initializing"); | ||
92 | goto failed_opts; | ||
93 | } | ||
94 | |||
95 | s->snap = g_strdup(snap); | ||
96 | - s->name = g_strdup(name); | ||
97 | + s->image_name = g_strdup(image_name); | ||
98 | |||
99 | /* try default location when conf=NULL, but ignore failure */ | ||
100 | r = rados_conf_read_file(s->cluster, conf); | ||
101 | @@ -XXX,XX +XXX,XX @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, | ||
102 | } | ||
103 | |||
104 | /* rbd_open is always r/w */ | ||
105 | - r = rbd_open(s->io_ctx, s->name, &s->image, s->snap); | ||
106 | + r = rbd_open(s->io_ctx, s->image_name, &s->image, s->snap); | ||
107 | if (r < 0) { | ||
108 | - error_setg_errno(errp, -r, "error reading header from %s", s->name); | ||
109 | + error_setg_errno(errp, -r, "error reading header from %s", | ||
110 | + s->image_name); | ||
111 | goto failed_open; | ||
112 | } | ||
113 | |||
114 | @@ -XXX,XX +XXX,XX @@ failed_open: | ||
115 | failed_shutdown: | ||
116 | rados_shutdown(s->cluster); | ||
117 | g_free(s->snap); | ||
118 | - g_free(s->name); | ||
119 | + g_free(s->image_name); | ||
120 | failed_opts: | ||
121 | qemu_opts_del(opts); | ||
122 | g_free(mon_host); | ||
123 | @@ -XXX,XX +XXX,XX @@ static void qemu_rbd_close(BlockDriverState *bs) | ||
124 | rbd_close(s->image); | ||
125 | rados_ioctx_destroy(s->io_ctx); | ||
126 | g_free(s->snap); | ||
127 | - g_free(s->name); | ||
128 | + g_free(s->image_name); | ||
129 | rados_shutdown(s->cluster); | ||
130 | } | ||
131 | |||
27 | -- | 132 | -- |
28 | 2.24.1 | 133 | 2.9.3 |
29 | 134 | ||
135 | diff view generated by jsdifflib |
1 | The recent Linux io_uring API has several advantages over ppoll(2) and | 1 | This adds support for reopen in rbd, for changing between r/w and r/o. |
---|---|---|---|
2 | epoll(7). Details are given in the source code. | ||
3 | 2 | ||
4 | Add an io_uring implementation and make it the default on Linux. | 3 | Note that this is only a flag change, but we will block a change from |
5 | Performance is the same as with epoll(7) but later patches add | 4 | r/o to r/w if we are using an RBD internal snapshot. |
6 | optimizations that take advantage of io_uring. | ||
7 | 5 | ||
8 | It is necessary to change how aio_set_fd_handler() deals with deleting | 6 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> |
9 | AioHandlers since removing monitored file descriptors is asynchronous in | 7 | Signed-off-by: Jeff Cody <jcody@redhat.com> |
10 | io_uring. fdmon_io_uring_remove() marks the AioHandler deleted and | 8 | Reviewed-by: John Snow <jsnow@redhat.com> |
11 | aio_set_fd_handler() will let it handle deletion in that case. | 9 | Message-id: d4e87539167ec6527d44c97b164eabcccf96e4f3.1491597120.git.jcody@redhat.com |
10 | --- | ||
11 | block/rbd.c | 21 +++++++++++++++++++++ | ||
12 | 1 file changed, 21 insertions(+) | ||
12 | 13 | ||
13 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 14 | diff --git a/block/rbd.c b/block/rbd.c |
14 | Link: https://lore.kernel.org/r/20200305170806.1313245-6-stefanha@redhat.com | 15 | index XXXXXXX..XXXXXXX 100644 |
15 | Message-Id: <20200305170806.1313245-6-stefanha@redhat.com> | 16 | --- a/block/rbd.c |
16 | --- | 17 | +++ b/block/rbd.c |
17 | configure | 5 + | 18 | @@ -XXX,XX +XXX,XX @@ failed_opts: |
18 | include/block/aio.h | 9 ++ | 19 | return r; |
19 | util/Makefile.objs | 1 + | 20 | } |
20 | util/aio-posix.c | 20 ++- | 21 | |
21 | util/aio-posix.h | 20 ++- | ||
22 | util/fdmon-io_uring.c | 326 ++++++++++++++++++++++++++++++++++++++++++ | ||
23 | 6 files changed, 376 insertions(+), 5 deletions(-) | ||
24 | create mode 100644 util/fdmon-io_uring.c | ||
25 | |||
26 | diff --git a/configure b/configure | ||
27 | index XXXXXXX..XXXXXXX 100755 | ||
28 | --- a/configure | ||
29 | +++ b/configure | ||
30 | @@ -XXX,XX +XXX,XX @@ if test "$linux_io_uring" != "no" ; then | ||
31 | linux_io_uring_cflags=$($pkg_config --cflags liburing) | ||
32 | linux_io_uring_libs=$($pkg_config --libs liburing) | ||
33 | linux_io_uring=yes | ||
34 | + | 22 | + |
35 | + # io_uring is used in libqemuutil.a where per-file -libs variables are not | 23 | +/* Since RBD is currently always opened R/W via the API, |
36 | + # seen by programs linking the archive. It's not ideal, but just add the | 24 | + * we just need to check if we are using a snapshot or not, in |
37 | + # library dependency globally. | 25 | + * order to determine if we will allow it to be R/W */ |
38 | + LIBS="$linux_io_uring_libs $LIBS" | 26 | +static int qemu_rbd_reopen_prepare(BDRVReopenState *state, |
39 | else | 27 | + BlockReopenQueue *queue, Error **errp) |
40 | if test "$linux_io_uring" = "yes" ; then | 28 | +{ |
41 | feature_not_found "linux io_uring" "Install liburing devel" | 29 | + BDRVRBDState *s = state->bs->opaque; |
42 | diff --git a/include/block/aio.h b/include/block/aio.h | 30 | + int ret = 0; |
43 | index XXXXXXX..XXXXXXX 100644 | ||
44 | --- a/include/block/aio.h | ||
45 | +++ b/include/block/aio.h | ||
46 | @@ -XXX,XX +XXX,XX @@ | ||
47 | #ifndef QEMU_AIO_H | ||
48 | #define QEMU_AIO_H | ||
49 | |||
50 | +#ifdef CONFIG_LINUX_IO_URING | ||
51 | +#include <liburing.h> | ||
52 | +#endif | ||
53 | #include "qemu/queue.h" | ||
54 | #include "qemu/event_notifier.h" | ||
55 | #include "qemu/thread.h" | ||
56 | @@ -XXX,XX +XXX,XX @@ struct BHListSlice { | ||
57 | QSIMPLEQ_ENTRY(BHListSlice) next; | ||
58 | }; | ||
59 | |||
60 | +typedef QSLIST_HEAD(, AioHandler) AioHandlerSList; | ||
61 | + | 31 | + |
62 | struct AioContext { | 32 | + if (s->snap && state->flags & BDRV_O_RDWR) { |
63 | GSource source; | 33 | + error_setg(errp, |
64 | 34 | + "Cannot change node '%s' to r/w when using RBD snapshot", | |
65 | @@ -XXX,XX +XXX,XX @@ struct AioContext { | 35 | + bdrv_get_device_or_node_name(state->bs)); |
66 | * locking. | 36 | + ret = -EINVAL; |
67 | */ | ||
68 | struct LuringState *linux_io_uring; | ||
69 | + | ||
70 | + /* State for file descriptor monitoring using Linux io_uring */ | ||
71 | + struct io_uring fdmon_io_uring; | ||
72 | + AioHandlerSList submit_list; | ||
73 | #endif | ||
74 | |||
75 | /* TimerLists for calling timers - one per clock type. Has its own | ||
76 | diff --git a/util/Makefile.objs b/util/Makefile.objs | ||
77 | index XXXXXXX..XXXXXXX 100644 | ||
78 | --- a/util/Makefile.objs | ||
79 | +++ b/util/Makefile.objs | ||
80 | @@ -XXX,XX +XXX,XX @@ util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o | ||
81 | util-obj-$(CONFIG_POSIX) += aio-posix.o | ||
82 | util-obj-$(CONFIG_POSIX) += fdmon-poll.o | ||
83 | util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o | ||
84 | +util-obj-$(CONFIG_LINUX_IO_URING) += fdmon-io_uring.o | ||
85 | util-obj-$(CONFIG_POSIX) += compatfd.o | ||
86 | util-obj-$(CONFIG_POSIX) += event_notifier-posix.o | ||
87 | util-obj-$(CONFIG_POSIX) += mmap-alloc.o | ||
88 | diff --git a/util/aio-posix.c b/util/aio-posix.c | ||
89 | index XXXXXXX..XXXXXXX 100644 | ||
90 | --- a/util/aio-posix.c | ||
91 | +++ b/util/aio-posix.c | ||
92 | @@ -XXX,XX +XXX,XX @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node) | ||
93 | g_source_remove_poll(&ctx->source, &node->pfd); | ||
94 | } | ||
95 | |||
96 | + node->pfd.revents = 0; | ||
97 | + | ||
98 | + /* If the fd monitor has already marked it deleted, leave it alone */ | ||
99 | + if (QLIST_IS_INSERTED(node, node_deleted)) { | ||
100 | + return false; | ||
101 | + } | 37 | + } |
102 | + | 38 | + |
103 | /* If a read is in progress, just mark the node as deleted */ | 39 | + return ret; |
104 | if (qemu_lockcnt_count(&ctx->list_lock)) { | ||
105 | QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted); | ||
106 | - node->pfd.revents = 0; | ||
107 | return false; | ||
108 | } | ||
109 | /* Otherwise, delete it for real. We can't just mark it as | ||
110 | @@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx, | ||
111 | |||
112 | QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node); | ||
113 | } | ||
114 | - if (node) { | ||
115 | - deleted = aio_remove_fd_handler(ctx, node); | ||
116 | - } | ||
117 | |||
118 | /* No need to order poll_disable_cnt writes against other updates; | ||
119 | * the counter is only used to avoid wasting time and latency on | ||
120 | @@ -XXX,XX +XXX,XX @@ void aio_set_fd_handler(AioContext *ctx, | ||
121 | atomic_read(&ctx->poll_disable_cnt) + poll_disable_change); | ||
122 | |||
123 | ctx->fdmon_ops->update(ctx, node, new_node); | ||
124 | + if (node) { | ||
125 | + deleted = aio_remove_fd_handler(ctx, node); | ||
126 | + } | ||
127 | qemu_lockcnt_unlock(&ctx->list_lock); | ||
128 | aio_notify(ctx); | ||
129 | |||
130 | @@ -XXX,XX +XXX,XX @@ void aio_context_setup(AioContext *ctx) | ||
131 | ctx->fdmon_ops = &fdmon_poll_ops; | ||
132 | ctx->epollfd = -1; | ||
133 | |||
134 | + /* Use the fastest fd monitoring implementation if available */ | ||
135 | + if (fdmon_io_uring_setup(ctx)) { | ||
136 | + return; | ||
137 | + } | ||
138 | + | ||
139 | fdmon_epoll_setup(ctx); | ||
140 | } | ||
141 | |||
142 | void aio_context_destroy(AioContext *ctx) | ||
143 | { | ||
144 | + fdmon_io_uring_destroy(ctx); | ||
145 | fdmon_epoll_disable(ctx); | ||
146 | } | ||
147 | |||
148 | diff --git a/util/aio-posix.h b/util/aio-posix.h | ||
149 | index XXXXXXX..XXXXXXX 100644 | ||
150 | --- a/util/aio-posix.h | ||
151 | +++ b/util/aio-posix.h | ||
152 | @@ -XXX,XX +XXX,XX @@ struct AioHandler { | ||
153 | IOHandler *io_poll_begin; | ||
154 | IOHandler *io_poll_end; | ||
155 | void *opaque; | ||
156 | - bool is_external; | ||
157 | QLIST_ENTRY(AioHandler) node; | ||
158 | QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ | ||
159 | QLIST_ENTRY(AioHandler) node_deleted; | ||
160 | +#ifdef CONFIG_LINUX_IO_URING | ||
161 | + QSLIST_ENTRY(AioHandler) node_submitted; | ||
162 | + unsigned flags; /* see fdmon-io_uring.c */ | ||
163 | +#endif | ||
164 | + bool is_external; | ||
165 | }; | ||
166 | |||
167 | /* Add a handler to a ready list */ | ||
168 | @@ -XXX,XX +XXX,XX @@ static inline void fdmon_epoll_disable(AioContext *ctx) | ||
169 | } | ||
170 | #endif /* !CONFIG_EPOLL_CREATE1 */ | ||
171 | |||
172 | +#ifdef CONFIG_LINUX_IO_URING | ||
173 | +bool fdmon_io_uring_setup(AioContext *ctx); | ||
174 | +void fdmon_io_uring_destroy(AioContext *ctx); | ||
175 | +#else | ||
176 | +static inline bool fdmon_io_uring_setup(AioContext *ctx) | ||
177 | +{ | ||
178 | + return false; | ||
179 | +} | 40 | +} |
180 | + | 41 | + |
181 | +static inline void fdmon_io_uring_destroy(AioContext *ctx) | 42 | static void qemu_rbd_close(BlockDriverState *bs) |
182 | +{ | 43 | { |
183 | +} | 44 | BDRVRBDState *s = bs->opaque; |
184 | +#endif /* !CONFIG_LINUX_IO_URING */ | 45 | @@ -XXX,XX +XXX,XX @@ static BlockDriver bdrv_rbd = { |
185 | + | 46 | .bdrv_parse_filename = qemu_rbd_parse_filename, |
186 | #endif /* AIO_POSIX_H */ | 47 | .bdrv_file_open = qemu_rbd_open, |
187 | diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c | 48 | .bdrv_close = qemu_rbd_close, |
188 | new file mode 100644 | 49 | + .bdrv_reopen_prepare = qemu_rbd_reopen_prepare, |
189 | index XXXXXXX..XXXXXXX | 50 | .bdrv_create = qemu_rbd_create, |
190 | --- /dev/null | 51 | .bdrv_has_zero_init = bdrv_has_zero_init_1, |
191 | +++ b/util/fdmon-io_uring.c | 52 | .bdrv_get_info = qemu_rbd_getinfo, |
192 | @@ -XXX,XX +XXX,XX @@ | ||
193 | +/* SPDX-License-Identifier: GPL-2.0-or-later */ | ||
194 | +/* | ||
195 | + * Linux io_uring file descriptor monitoring | ||
196 | + * | ||
197 | + * The Linux io_uring API supports file descriptor monitoring with a few | ||
198 | + * advantages over existing APIs like poll(2) and epoll(7): | ||
199 | + * | ||
200 | + * 1. Userspace polling of events is possible because the completion queue (cq | ||
201 | + * ring) is shared between the kernel and userspace. This allows | ||
202 | + * applications that rely on userspace polling to also monitor file | ||
203 | + * descriptors in the same userspace polling loop. | ||
204 | + * | ||
205 | + * 2. Submission and completion is batched and done together in a single system | ||
206 | + * call. This minimizes the number of system calls. | ||
207 | + * | ||
208 | + * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than | ||
209 | + * poll(2). | ||
210 | + * | ||
211 | + * 4. Nanosecond timeouts are supported so it requires fewer syscalls than | ||
212 | + * epoll(7). | ||
213 | + * | ||
214 | + * This code only monitors file descriptors and does not do asynchronous disk | ||
215 | + * I/O. Implementing disk I/O efficiently has other requirements and should | ||
216 | + * use a separate io_uring so it does not make sense to unify the code. | ||
217 | + * | ||
218 | + * File descriptor monitoring is implemented using the following operations: | ||
219 | + * | ||
220 | + * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored. | ||
221 | + * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When | ||
222 | + * the poll mask changes for a file descriptor it is first removed and then | ||
223 | + * re-added with the new poll mask, so this operation is also used as part | ||
224 | + * of modifying an existing monitored file descriptor. | ||
225 | + * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait | ||
226 | + * for events. This operation self-cancels if another event completes | ||
227 | + * before the timeout. | ||
228 | + * | ||
229 | + * io_uring calls the submission queue the "sq ring" and the completion queue | ||
230 | + * the "cq ring". Ring entries are called "sqe" and "cqe", respectively. | ||
231 | + * | ||
232 | + * The code is structured so that sq/cq rings are only modified within | ||
233 | + * fdmon_io_uring_wait(). Changes to AioHandlers are made by enqueuing them on | ||
234 | + * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD | ||
235 | + * and/or IORING_OP_POLL_REMOVE sqes for them. | ||
236 | + */ | ||
237 | + | ||
238 | +#include "qemu/osdep.h" | ||
239 | +#include <poll.h> | ||
240 | +#include "qemu/rcu_queue.h" | ||
241 | +#include "aio-posix.h" | ||
242 | + | ||
243 | +enum { | ||
244 | + FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */ | ||
245 | + | ||
246 | + /* AioHandler::flags */ | ||
247 | + FDMON_IO_URING_PENDING = (1 << 0), | ||
248 | + FDMON_IO_URING_ADD = (1 << 1), | ||
249 | + FDMON_IO_URING_REMOVE = (1 << 2), | ||
250 | +}; | ||
251 | + | ||
252 | +static inline int poll_events_from_pfd(int pfd_events) | ||
253 | +{ | ||
254 | + return (pfd_events & G_IO_IN ? POLLIN : 0) | | ||
255 | + (pfd_events & G_IO_OUT ? POLLOUT : 0) | | ||
256 | + (pfd_events & G_IO_HUP ? POLLHUP : 0) | | ||
257 | + (pfd_events & G_IO_ERR ? POLLERR : 0); | ||
258 | +} | ||
259 | + | ||
260 | +static inline int pfd_events_from_poll(int poll_events) | ||
261 | +{ | ||
262 | + return (poll_events & POLLIN ? G_IO_IN : 0) | | ||
263 | + (poll_events & POLLOUT ? G_IO_OUT : 0) | | ||
264 | + (poll_events & POLLHUP ? G_IO_HUP : 0) | | ||
265 | + (poll_events & POLLERR ? G_IO_ERR : 0); | ||
266 | +} | ||
267 | + | ||
268 | +/* | ||
269 | + * Returns an sqe for submitting a request. Must only be called within | ||
270 | + * fdmon_io_uring_wait(). | ||
271 | + */ | ||
272 | +static struct io_uring_sqe *get_sqe(AioContext *ctx) | ||
273 | +{ | ||
274 | + struct io_uring *ring = &ctx->fdmon_io_uring; | ||
275 | + struct io_uring_sqe *sqe = io_uring_get_sqe(ring); | ||
276 | + int ret; | ||
277 | + | ||
278 | + if (likely(sqe)) { | ||
279 | + return sqe; | ||
280 | + } | ||
281 | + | ||
282 | + /* No free sqes left, submit pending sqes first */ | ||
283 | + ret = io_uring_submit(ring); | ||
284 | + assert(ret > 1); | ||
285 | + sqe = io_uring_get_sqe(ring); | ||
286 | + assert(sqe); | ||
287 | + return sqe; | ||
288 | +} | ||
289 | + | ||
290 | +/* Atomically enqueue an AioHandler for sq ring submission */ | ||
291 | +static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags) | ||
292 | +{ | ||
293 | + unsigned old_flags; | ||
294 | + | ||
295 | + old_flags = atomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags); | ||
296 | + if (!(old_flags & FDMON_IO_URING_PENDING)) { | ||
297 | + QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted); | ||
298 | + } | ||
299 | +} | ||
300 | + | ||
301 | +/* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */ | ||
302 | +static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags) | ||
303 | +{ | ||
304 | + AioHandler *node = QSLIST_FIRST(head); | ||
305 | + | ||
306 | + if (!node) { | ||
307 | + return NULL; | ||
308 | + } | ||
309 | + | ||
310 | + /* Doesn't need to be atomic since fill_sq_ring() moves the list */ | ||
311 | + QSLIST_REMOVE_HEAD(head, node_submitted); | ||
312 | + | ||
313 | + /* | ||
314 | + * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two | ||
315 | + * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and | ||
316 | + * telling process_cqe() to delete the AioHandler when its | ||
317 | + * IORING_OP_POLL_ADD completes. | ||
318 | + */ | ||
319 | + *flags = atomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING | | ||
320 | + FDMON_IO_URING_ADD)); | ||
321 | + return node; | ||
322 | +} | ||
323 | + | ||
324 | +static void fdmon_io_uring_update(AioContext *ctx, | ||
325 | + AioHandler *old_node, | ||
326 | + AioHandler *new_node) | ||
327 | +{ | ||
328 | + if (new_node) { | ||
329 | + enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD); | ||
330 | + } | ||
331 | + | ||
332 | + if (old_node) { | ||
333 | + /* | ||
334 | + * Deletion is tricky because IORING_OP_POLL_ADD and | ||
335 | + * IORING_OP_POLL_REMOVE are async. We need to wait for the original | ||
336 | + * IORING_OP_POLL_ADD to complete before this handler can be freed | ||
337 | + * safely. | ||
338 | + * | ||
339 | + * It's possible that the file descriptor becomes ready and the | ||
340 | + * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is | ||
341 | + * submitted, too. | ||
342 | + * | ||
343 | + * Mark this handler deleted right now but don't place it on | ||
344 | + * ctx->deleted_aio_handlers yet. Instead, manually fudge the list | ||
345 | + * entry to make QLIST_IS_INSERTED() think this handler has been | ||
346 | + * inserted and other code recognizes this AioHandler as deleted. | ||
347 | + * | ||
348 | + * Once the original IORING_OP_POLL_ADD completes we enqueue the | ||
349 | + * handler on the real ctx->deleted_aio_handlers list to be freed. | ||
350 | + */ | ||
351 | + assert(!QLIST_IS_INSERTED(old_node, node_deleted)); | ||
352 | + old_node->node_deleted.le_prev = &old_node->node_deleted.le_next; | ||
353 | + | ||
354 | + enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE); | ||
355 | + } | ||
356 | +} | ||
357 | + | ||
358 | +static void add_poll_add_sqe(AioContext *ctx, AioHandler *node) | ||
359 | +{ | ||
360 | + struct io_uring_sqe *sqe = get_sqe(ctx); | ||
361 | + int events = poll_events_from_pfd(node->pfd.events); | ||
362 | + | ||
363 | + io_uring_prep_poll_add(sqe, node->pfd.fd, events); | ||
364 | + io_uring_sqe_set_data(sqe, node); | ||
365 | +} | ||
366 | + | ||
367 | +static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node) | ||
368 | +{ | ||
369 | + struct io_uring_sqe *sqe = get_sqe(ctx); | ||
370 | + | ||
371 | + io_uring_prep_poll_remove(sqe, node); | ||
372 | +} | ||
373 | + | ||
374 | +/* Add a timeout that self-cancels when another cqe becomes ready */ | ||
375 | +static void add_timeout_sqe(AioContext *ctx, int64_t ns) | ||
376 | +{ | ||
377 | + struct io_uring_sqe *sqe; | ||
378 | + struct __kernel_timespec ts = { | ||
379 | + .tv_sec = ns / NANOSECONDS_PER_SECOND, | ||
380 | + .tv_nsec = ns % NANOSECONDS_PER_SECOND, | ||
381 | + }; | ||
382 | + | ||
383 | + sqe = get_sqe(ctx); | ||
384 | + io_uring_prep_timeout(sqe, &ts, 1, 0); | ||
385 | +} | ||
386 | + | ||
387 | +/* Add sqes from ctx->submit_list for submission */ | ||
388 | +static void fill_sq_ring(AioContext *ctx) | ||
389 | +{ | ||
390 | + AioHandlerSList submit_list; | ||
391 | + AioHandler *node; | ||
392 | + unsigned flags; | ||
393 | + | ||
394 | + QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list); | ||
395 | + | ||
396 | + while ((node = dequeue(&submit_list, &flags))) { | ||
397 | + /* Order matters, just in case both flags were set */ | ||
398 | + if (flags & FDMON_IO_URING_ADD) { | ||
399 | + add_poll_add_sqe(ctx, node); | ||
400 | + } | ||
401 | + if (flags & FDMON_IO_URING_REMOVE) { | ||
402 | + add_poll_remove_sqe(ctx, node); | ||
403 | + } | ||
404 | + } | ||
405 | +} | ||
406 | + | ||
407 | +/* Returns true if a handler became ready */ | ||
408 | +static bool process_cqe(AioContext *ctx, | ||
409 | + AioHandlerList *ready_list, | ||
410 | + struct io_uring_cqe *cqe) | ||
411 | +{ | ||
412 | + AioHandler *node = io_uring_cqe_get_data(cqe); | ||
413 | + unsigned flags; | ||
414 | + | ||
415 | + /* poll_timeout and poll_remove have a zero user_data field */ | ||
416 | + if (!node) { | ||
417 | + return false; | ||
418 | + } | ||
419 | + | ||
420 | + /* | ||
421 | + * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race | ||
422 | + * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE | ||
423 | + * bit before IORING_OP_POLL_REMOVE is submitted. | ||
424 | + */ | ||
425 | + flags = atomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE); | ||
426 | + if (flags & FDMON_IO_URING_REMOVE) { | ||
427 | + QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted); | ||
428 | + return false; | ||
429 | + } | ||
430 | + | ||
431 | + aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res)); | ||
432 | + | ||
433 | + /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */ | ||
434 | + add_poll_add_sqe(ctx, node); | ||
435 | + return true; | ||
436 | +} | ||
437 | + | ||
438 | +static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list) | ||
439 | +{ | ||
440 | + struct io_uring *ring = &ctx->fdmon_io_uring; | ||
441 | + struct io_uring_cqe *cqe; | ||
442 | + unsigned num_cqes = 0; | ||
443 | + unsigned num_ready = 0; | ||
444 | + unsigned head; | ||
445 | + | ||
446 | + io_uring_for_each_cqe(ring, head, cqe) { | ||
447 | + if (process_cqe(ctx, ready_list, cqe)) { | ||
448 | + num_ready++; | ||
449 | + } | ||
450 | + | ||
451 | + num_cqes++; | ||
452 | + } | ||
453 | + | ||
454 | + io_uring_cq_advance(ring, num_cqes); | ||
455 | + return num_ready; | ||
456 | +} | ||
457 | + | ||
458 | +static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list, | ||
459 | + int64_t timeout) | ||
460 | +{ | ||
461 | + unsigned wait_nr = 1; /* block until at least one cqe is ready */ | ||
462 | + int ret; | ||
463 | + | ||
464 | + /* Fall back while external clients are disabled */ | ||
465 | + if (atomic_read(&ctx->external_disable_cnt)) { | ||
466 | + return fdmon_poll_ops.wait(ctx, ready_list, timeout); | ||
467 | + } | ||
468 | + | ||
469 | + if (timeout == 0) { | ||
470 | + wait_nr = 0; /* non-blocking */ | ||
471 | + } else if (timeout > 0) { | ||
472 | + add_timeout_sqe(ctx, timeout); | ||
473 | + } | ||
474 | + | ||
475 | + fill_sq_ring(ctx); | ||
476 | + | ||
477 | + ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr); | ||
478 | + assert(ret >= 0); | ||
479 | + | ||
480 | + return process_cq_ring(ctx, ready_list); | ||
481 | +} | ||
482 | + | ||
483 | +static const FDMonOps fdmon_io_uring_ops = { | ||
484 | + .update = fdmon_io_uring_update, | ||
485 | + .wait = fdmon_io_uring_wait, | ||
486 | +}; | ||
487 | + | ||
488 | +bool fdmon_io_uring_setup(AioContext *ctx) | ||
489 | +{ | ||
490 | + int ret; | ||
491 | + | ||
492 | + ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0); | ||
493 | + if (ret != 0) { | ||
494 | + return false; | ||
495 | + } | ||
496 | + | ||
497 | + QSLIST_INIT(&ctx->submit_list); | ||
498 | + ctx->fdmon_ops = &fdmon_io_uring_ops; | ||
499 | + return true; | ||
500 | +} | ||
501 | + | ||
502 | +void fdmon_io_uring_destroy(AioContext *ctx) | ||
503 | +{ | ||
504 | + if (ctx->fdmon_ops == &fdmon_io_uring_ops) { | ||
505 | + AioHandler *node; | ||
506 | + | ||
507 | + io_uring_queue_exit(&ctx->fdmon_io_uring); | ||
508 | + | ||
509 | + /* No need to submit these anymore, just free them. */ | ||
510 | + while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) { | ||
511 | + QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted); | ||
512 | + QLIST_REMOVE(node, node); | ||
513 | + g_free(node); | ||
514 | + } | ||
515 | + | ||
516 | + ctx->fdmon_ops = &fdmon_poll_ops; | ||
517 | + } | ||
518 | +} | ||
519 | -- | 53 | -- |
520 | 2.24.1 | 54 | 2.9.3 |
521 | 55 | ||
56 | diff view generated by jsdifflib |
1 | One iteration of polling is always performed even when polling is | 1 | For the tests that use the common.qemu functions for running a QEMU |
---|---|---|---|
2 | disabled. This is done because: | 2 | process, _cleanup_qemu must be called in the exit function. |
3 | 1. Userspace polling is cheaper than making a syscall. We might get | ||
4 | lucky. | ||
5 | 2. We must poll once more after polling has stopped in case an event | ||
6 | occurred while stopping polling. | ||
7 | 3 | ||
8 | However, there are downsides: | 4 | If it is not and the qemu process aborts, then not all of the droppings |
9 | 1. Polling becomes a bottleneck when the number of event sources is very | 5 | are cleaned up (e.g. pidfile, fifos). |
10 | high. It's more efficient to monitor fds in that case. | ||
11 | 2. A high-frequency polling event source can starve non-polling event | ||
12 | sources because ppoll(2)/epoll(7) is never invoked. | ||
13 | 6 | ||
14 | This patch removes the forced polling iteration so that poll_ns=0 really | 7 | This updates those tests that did not have a cleanup in qemu-iotests. |
15 | means no polling. | ||
16 | 8 | ||
17 | IOPS increases from 10k to 60k when the guest has 100 | 9 | (I swapped spaces for tabs in test 102 as well) |
18 | virtio-blk-pci,num-queues=32 devices and 1 virtio-blk-pci,num-queues=1 | ||
19 | device because the large number of event sources being polled slows down | ||
20 | the event loop. | ||
21 | 10 | ||
22 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 11 | Reported-by: Eric Blake <eblake@redhat.com> |
23 | Link: https://lore.kernel.org/r/20200305170806.1313245-2-stefanha@redhat.com | 12 | Reviewed-by: Eric Blake <eblake@redhat.com> |
24 | Message-Id: <20200305170806.1313245-2-stefanha@redhat.com> | 13 | Signed-off-by: Jeff Cody <jcody@redhat.com> |
14 | Message-id: d59c2f6ad6c1da8b9b3c7f357c94a7122ccfc55a.1492544096.git.jcody@redhat.com | ||
25 | --- | 15 | --- |
26 | util/aio-posix.c | 22 +++++++++++++++------- | 16 | tests/qemu-iotests/028 | 1 + |
27 | 1 file changed, 15 insertions(+), 7 deletions(-) | 17 | tests/qemu-iotests/094 | 11 ++++++++--- |
18 | tests/qemu-iotests/102 | 5 +++-- | ||
19 | tests/qemu-iotests/109 | 1 + | ||
20 | tests/qemu-iotests/117 | 1 + | ||
21 | tests/qemu-iotests/130 | 1 + | ||
22 | tests/qemu-iotests/140 | 1 + | ||
23 | tests/qemu-iotests/141 | 1 + | ||
24 | tests/qemu-iotests/143 | 1 + | ||
25 | tests/qemu-iotests/156 | 1 + | ||
26 | 10 files changed, 19 insertions(+), 5 deletions(-) | ||
28 | 27 | ||
29 | diff --git a/util/aio-posix.c b/util/aio-posix.c | 28 | diff --git a/tests/qemu-iotests/028 b/tests/qemu-iotests/028 |
30 | index XXXXXXX..XXXXXXX 100644 | 29 | index XXXXXXX..XXXXXXX 100755 |
31 | --- a/util/aio-posix.c | 30 | --- a/tests/qemu-iotests/028 |
32 | +++ b/util/aio-posix.c | 31 | +++ b/tests/qemu-iotests/028 |
33 | @@ -XXX,XX +XXX,XX @@ void aio_set_event_notifier_poll(AioContext *ctx, | 32 | @@ -XXX,XX +XXX,XX @@ status=1 # failure is the default! |
34 | (IOHandler *)io_poll_end); | 33 | |
34 | _cleanup() | ||
35 | { | ||
36 | + _cleanup_qemu | ||
37 | rm -f "${TEST_IMG}.copy" | ||
38 | _cleanup_test_img | ||
35 | } | 39 | } |
36 | 40 | diff --git a/tests/qemu-iotests/094 b/tests/qemu-iotests/094 | |
37 | -static void poll_set_started(AioContext *ctx, bool started) | 41 | index XXXXXXX..XXXXXXX 100755 |
38 | +static bool poll_set_started(AioContext *ctx, bool started) | 42 | --- a/tests/qemu-iotests/094 |
43 | +++ b/tests/qemu-iotests/094 | ||
44 | @@ -XXX,XX +XXX,XX @@ echo "QA output created by $seq" | ||
45 | here="$PWD" | ||
46 | status=1 # failure is the default! | ||
47 | |||
48 | -trap "exit \$status" 0 1 2 3 15 | ||
49 | +_cleanup() | ||
50 | +{ | ||
51 | + _cleanup_qemu | ||
52 | + _cleanup_test_img | ||
53 | + rm -f "$TEST_DIR/source.$IMGFMT" | ||
54 | +} | ||
55 | + | ||
56 | +trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
57 | |||
58 | # get standard environment, filters and checks | ||
59 | . ./common.rc | ||
60 | @@ -XXX,XX +XXX,XX @@ _send_qemu_cmd $QEMU_HANDLE \ | ||
61 | |||
62 | wait=1 _cleanup_qemu | ||
63 | |||
64 | -_cleanup_test_img | ||
65 | -rm -f "$TEST_DIR/source.$IMGFMT" | ||
66 | |||
67 | # success, all done | ||
68 | echo '*** done' | ||
69 | diff --git a/tests/qemu-iotests/102 b/tests/qemu-iotests/102 | ||
70 | index XXXXXXX..XXXXXXX 100755 | ||
71 | --- a/tests/qemu-iotests/102 | ||
72 | +++ b/tests/qemu-iotests/102 | ||
73 | @@ -XXX,XX +XXX,XX @@ seq=$(basename $0) | ||
74 | echo "QA output created by $seq" | ||
75 | |||
76 | here=$PWD | ||
77 | -status=1 # failure is the default! | ||
78 | +status=1 # failure is the default! | ||
79 | |||
80 | _cleanup() | ||
39 | { | 81 | { |
40 | AioHandler *node; | 82 | - _cleanup_test_img |
41 | + bool progress = false; | 83 | + _cleanup_qemu |
42 | 84 | + _cleanup_test_img | |
43 | if (started == ctx->poll_started) { | ||
44 | - return; | ||
45 | + return false; | ||
46 | } | ||
47 | |||
48 | ctx->poll_started = started; | ||
49 | @@ -XXX,XX +XXX,XX @@ static void poll_set_started(AioContext *ctx, bool started) | ||
50 | if (fn) { | ||
51 | fn(node->opaque); | ||
52 | } | ||
53 | + | ||
54 | + /* Poll one last time in case ->io_poll_end() raced with the event */ | ||
55 | + if (!started) { | ||
56 | + progress = node->io_poll(node->opaque) || progress; | ||
57 | + } | ||
58 | } | ||
59 | qemu_lockcnt_dec(&ctx->list_lock); | ||
60 | + | ||
61 | + return progress; | ||
62 | } | 85 | } |
63 | 86 | trap "_cleanup; exit \$status" 0 1 2 3 15 | |
64 | 87 | ||
65 | @@ -XXX,XX +XXX,XX @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) | 88 | diff --git a/tests/qemu-iotests/109 b/tests/qemu-iotests/109 |
66 | } | 89 | index XXXXXXX..XXXXXXX 100755 |
67 | } | 90 | --- a/tests/qemu-iotests/109 |
68 | 91 | +++ b/tests/qemu-iotests/109 | |
69 | - poll_set_started(ctx, false); | 92 | @@ -XXX,XX +XXX,XX @@ status=1 # failure is the default! |
70 | + if (poll_set_started(ctx, false)) { | 93 | |
71 | + *timeout = 0; | 94 | _cleanup() |
72 | + return true; | 95 | { |
73 | + } | 96 | + _cleanup_qemu |
74 | 97 | rm -f $TEST_IMG.src | |
75 | - /* Even if we don't run busy polling, try polling once in case it can make | 98 | _cleanup_test_img |
76 | - * progress and the caller will be able to avoid ppoll(2)/epoll_wait(2). | ||
77 | - */ | ||
78 | - return run_poll_handlers_once(ctx, timeout); | ||
79 | + return false; | ||
80 | } | 99 | } |
81 | 100 | diff --git a/tests/qemu-iotests/117 b/tests/qemu-iotests/117 | |
82 | bool aio_poll(AioContext *ctx, bool blocking) | 101 | index XXXXXXX..XXXXXXX 100755 |
102 | --- a/tests/qemu-iotests/117 | ||
103 | +++ b/tests/qemu-iotests/117 | ||
104 | @@ -XXX,XX +XXX,XX @@ status=1 # failure is the default! | ||
105 | |||
106 | _cleanup() | ||
107 | { | ||
108 | + _cleanup_qemu | ||
109 | _cleanup_test_img | ||
110 | } | ||
111 | trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
112 | diff --git a/tests/qemu-iotests/130 b/tests/qemu-iotests/130 | ||
113 | index XXXXXXX..XXXXXXX 100755 | ||
114 | --- a/tests/qemu-iotests/130 | ||
115 | +++ b/tests/qemu-iotests/130 | ||
116 | @@ -XXX,XX +XXX,XX @@ status=1 # failure is the default! | ||
117 | |||
118 | _cleanup() | ||
119 | { | ||
120 | + _cleanup_qemu | ||
121 | _cleanup_test_img | ||
122 | } | ||
123 | trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
124 | diff --git a/tests/qemu-iotests/140 b/tests/qemu-iotests/140 | ||
125 | index XXXXXXX..XXXXXXX 100755 | ||
126 | --- a/tests/qemu-iotests/140 | ||
127 | +++ b/tests/qemu-iotests/140 | ||
128 | @@ -XXX,XX +XXX,XX @@ status=1 # failure is the default! | ||
129 | |||
130 | _cleanup() | ||
131 | { | ||
132 | + _cleanup_qemu | ||
133 | _cleanup_test_img | ||
134 | rm -f "$TEST_DIR/nbd" | ||
135 | } | ||
136 | diff --git a/tests/qemu-iotests/141 b/tests/qemu-iotests/141 | ||
137 | index XXXXXXX..XXXXXXX 100755 | ||
138 | --- a/tests/qemu-iotests/141 | ||
139 | +++ b/tests/qemu-iotests/141 | ||
140 | @@ -XXX,XX +XXX,XX @@ status=1 # failure is the default! | ||
141 | |||
142 | _cleanup() | ||
143 | { | ||
144 | + _cleanup_qemu | ||
145 | _cleanup_test_img | ||
146 | rm -f "$TEST_DIR/{b,m,o}.$IMGFMT" | ||
147 | } | ||
148 | diff --git a/tests/qemu-iotests/143 b/tests/qemu-iotests/143 | ||
149 | index XXXXXXX..XXXXXXX 100755 | ||
150 | --- a/tests/qemu-iotests/143 | ||
151 | +++ b/tests/qemu-iotests/143 | ||
152 | @@ -XXX,XX +XXX,XX @@ status=1 # failure is the default! | ||
153 | |||
154 | _cleanup() | ||
155 | { | ||
156 | + _cleanup_qemu | ||
157 | rm -f "$TEST_DIR/nbd" | ||
158 | } | ||
159 | trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
160 | diff --git a/tests/qemu-iotests/156 b/tests/qemu-iotests/156 | ||
161 | index XXXXXXX..XXXXXXX 100755 | ||
162 | --- a/tests/qemu-iotests/156 | ||
163 | +++ b/tests/qemu-iotests/156 | ||
164 | @@ -XXX,XX +XXX,XX @@ status=1 # failure is the default! | ||
165 | |||
166 | _cleanup() | ||
167 | { | ||
168 | + _cleanup_qemu | ||
169 | rm -f "$TEST_IMG{,.target}{,.backing,.overlay}" | ||
170 | } | ||
171 | trap "_cleanup; exit \$status" 0 1 2 3 15 | ||
83 | -- | 172 | -- |
84 | 2.24.1 | 173 | 2.9.3 |
85 | 174 | ||
175 | diff view generated by jsdifflib |