From: John G Johnson <john.g.johnson@oracle.com>
Add user.c and user.h files for vfio-user with the basic
send and receive functions.
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
hw/vfio/user.h | 120 ++++++++++++++
include/hw/vfio/vfio-common.h | 2 +
hw/vfio/user.c | 286 ++++++++++++++++++++++++++++++++++
MAINTAINERS | 4 +
hw/vfio/meson.build | 1 +
5 files changed, 413 insertions(+)
create mode 100644 hw/vfio/user.h
create mode 100644 hw/vfio/user.c
diff --git a/hw/vfio/user.h b/hw/vfio/user.h
new file mode 100644
index 0000000000..cdbc074579
--- /dev/null
+++ b/hw/vfio/user.h
@@ -0,0 +1,120 @@
+#ifndef VFIO_USER_H
+#define VFIO_USER_H
+
+/*
+ * vfio protocol over a UNIX socket.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Each message has a standard header that describes the command
+ * being sent, which is almost always a VFIO ioctl().
+ *
+ * The header may be followed by command-specific data, such as the
+ * region and offset info for read and write commands.
+ */
+
+/* commands */
+enum vfio_user_command {
+ VFIO_USER_VERSION = 1,
+ VFIO_USER_DMA_MAP = 2,
+ VFIO_USER_DMA_UNMAP = 3,
+ VFIO_USER_DEVICE_GET_INFO = 4,
+ VFIO_USER_DEVICE_GET_REGION_INFO = 5,
+ VFIO_USER_DEVICE_GET_REGION_IO_FDS = 6,
+ VFIO_USER_DEVICE_GET_IRQ_INFO = 7,
+ VFIO_USER_DEVICE_SET_IRQS = 8,
+ VFIO_USER_REGION_READ = 9,
+ VFIO_USER_REGION_WRITE = 10,
+ VFIO_USER_DMA_READ = 11,
+ VFIO_USER_DMA_WRITE = 12,
+ VFIO_USER_DEVICE_RESET = 13,
+ VFIO_USER_DIRTY_PAGES = 14,
+ VFIO_USER_MAX,
+};
+
+/* flags */
+#define VFIO_USER_REQUEST 0x0
+#define VFIO_USER_REPLY 0x1
+#define VFIO_USER_TYPE 0xF
+
+#define VFIO_USER_NO_REPLY 0x10
+#define VFIO_USER_ERROR 0x20
+
+typedef struct vfio_user_hdr {
+ uint16_t id;
+ uint16_t command;
+ uint32_t size;
+ uint32_t flags;
+ uint32_t error_reply;
+} vfio_user_hdr_t;
+
+/*
+ * VFIO_USER_VERSION
+ */
+#define VFIO_USER_MAJOR_VER 0
+#define VFIO_USER_MINOR_VER 0
+
+struct vfio_user_version {
+ vfio_user_hdr_t hdr;
+ uint16_t major;
+ uint16_t minor;
+ char capabilities[];
+};
+
+#define VFIO_USER_DEF_MAX_FDS 8
+#define VFIO_USER_MAX_MAX_FDS 16
+
+#define VFIO_USER_DEF_MAX_XFER (1024 * 1024)
+#define VFIO_USER_MAX_MAX_XFER (64 * 1024 * 1024)
+
+typedef struct VFIOUserFDs {
+ int send_fds;
+ int recv_fds;
+ int *fds;
+} VFIOUserFDs;
+
+typedef struct VFIOUserReply {
+ QTAILQ_ENTRY(VFIOUserReply) next;
+ vfio_user_hdr_t *msg;
+ VFIOUserFDs *fds;
+ int rsize;
+ uint32_t id;
+ QemuCond cv;
+ uint8_t complete;
+} VFIOUserReply;
+
+enum proxy_state {
+ CONNECTED = 1,
+ RECV_ERROR = 2,
+ CLOSING = 3,
+ CLOSED = 4,
+};
+
+typedef struct VFIOProxy {
+ QLIST_ENTRY(VFIOProxy) next;
+ char *sockname;
+ struct QIOChannel *ioc;
+ int (*request)(void *opaque, char *buf, VFIOUserFDs *fds);
+ void *reqarg;
+ int flags;
+ QemuCond close_cv;
+
+ /*
+ * above only changed when iolock is held
+ * below are protected by per-proxy lock
+ */
+ QemuMutex lock;
+ QTAILQ_HEAD(, VFIOUserReply) free;
+ QTAILQ_HEAD(, VFIOUserReply) pending;
+ enum proxy_state state;
+ int close_wait;
+} VFIOProxy;
+
+#define VFIO_PROXY_CLIENT 0x1
+
+void vfio_user_recv(void *opaque);
+void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret);
+#endif /* VFIO_USER_H */
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 8af11b0a76..f43dc6e5d0 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -75,6 +75,7 @@ typedef struct VFIOAddressSpace {
} VFIOAddressSpace;
struct VFIOGroup;
+typedef struct VFIOProxy VFIOProxy;
typedef struct VFIOContainer {
VFIOAddressSpace *space;
@@ -143,6 +144,7 @@ typedef struct VFIODevice {
VFIOMigration *migration;
Error *migration_blocker;
OnOffAuto pre_copy_dirty_page_tracking;
+ VFIOProxy *proxy;
} VFIODevice;
struct VFIODeviceOps {
diff --git a/hw/vfio/user.c b/hw/vfio/user.c
new file mode 100644
index 0000000000..021d5540e0
--- /dev/null
+++ b/hw/vfio/user.c
@@ -0,0 +1,286 @@
+/*
+ * vfio protocol over a UNIX socket.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include <linux/vfio.h>
+#include <sys/ioctl.h>
+
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qemu/main-loop.h"
+#include "hw/hw.h"
+#include "hw/vfio/vfio-common.h"
+#include "hw/vfio/vfio.h"
+#include "qemu/sockets.h"
+#include "io/channel.h"
+#include "io/channel-util.h"
+#include "sysemu/iothread.h"
+#include "user.h"
+
+static uint64_t max_xfer_size = VFIO_USER_DEF_MAX_XFER;
+static IOThread *vfio_user_iothread;
+static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg,
+ VFIOUserFDs *fds);
+static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg,
+ VFIOUserFDs *fds);
+static void vfio_user_shutdown(VFIOProxy *proxy);
+
+static void vfio_user_shutdown(VFIOProxy *proxy)
+{
+ qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL);
+ qio_channel_set_aio_fd_handler(proxy->ioc,
+ iothread_get_aio_context(vfio_user_iothread),
+ NULL, NULL, NULL);
+}
+
+void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret)
+{
+ vfio_user_hdr_t *hdr = (vfio_user_hdr_t *)buf;
+
+ /*
+ * convert header to associated reply
+ * positive ret is reply size, negative is error code
+ */
+ hdr->flags = VFIO_USER_REPLY;
+ if (ret > 0) {
+ hdr->size = ret;
+ } else if (ret < 0) {
+ hdr->flags |= VFIO_USER_ERROR;
+ hdr->error_reply = -ret;
+ hdr->size = sizeof(*hdr);
+ }
+ vfio_user_send(proxy, hdr, NULL);
+}
+
+void vfio_user_recv(void *opaque)
+{
+ VFIODevice *vbasedev = opaque;
+ VFIOProxy *proxy = vbasedev->proxy;
+ VFIOUserReply *reply = NULL;
+ g_autofree int *fdp = NULL;
+ VFIOUserFDs reqfds = { 0, 0, fdp };
+ vfio_user_hdr_t msg;
+ struct iovec iov = {
+ .iov_base = &msg,
+ .iov_len = sizeof(msg),
+ };
+ int isreply, i, ret;
+ size_t msgleft, numfds = 0;
+ char *data = NULL;
+ g_autofree char *buf = NULL;
+ Error *local_err = NULL;
+
+ qemu_mutex_lock(&proxy->lock);
+ if (proxy->state == CLOSING) {
+ qemu_mutex_unlock(&proxy->lock);
+ return;
+ }
+
+ ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds,
+ &local_err);
+ if (ret <= 0) {
+ /* read error or other side closed connection */
+ error_setg_errno(&local_err, errno, "vfio_user_recv read error");
+ goto fatal;
+ }
+
+ if (ret < sizeof(msg)) {
+ error_setg(&local_err, "vfio_user_recv short read of header");
+ goto err;
+ }
+
+ /*
+ * For replies, find the matching pending request
+ */
+ switch (msg.flags & VFIO_USER_TYPE) {
+ case VFIO_USER_REQUEST:
+ isreply = 0;
+ break;
+ case VFIO_USER_REPLY:
+ isreply = 1;
+ break;
+ default:
+ error_setg(&local_err, "vfio_user_recv unknown message type");
+ goto err;
+ }
+
+ if (isreply) {
+ QTAILQ_FOREACH(reply, &proxy->pending, next) {
+ if (msg.id == reply->id) {
+ break;
+ }
+ }
+ if (reply == NULL) {
+ error_setg(&local_err, "vfio_user_recv unexpected reply");
+ goto err;
+ }
+ QTAILQ_REMOVE(&proxy->pending, reply, next);
+
+ /*
+ * Process any received FDs
+ */
+ if (numfds != 0) {
+ if (reply->fds == NULL || reply->fds->recv_fds < numfds) {
+ error_setg(&local_err, "vfio_user_recv unexpected FDs");
+ goto err;
+ }
+ reply->fds->recv_fds = numfds;
+ memcpy(reply->fds->fds, fdp, numfds * sizeof(int));
+ }
+
+ } else {
+ /*
+ * The client doesn't expect any FDs in requests, but
+ * they will be expected on the server
+ */
+ if (numfds != 0 && (proxy->flags & VFIO_PROXY_CLIENT)) {
+ error_setg(&local_err, "vfio_user_recv fd in client reply");
+ goto err;
+ }
+ reqfds.recv_fds = numfds;
+ }
+
+ /*
+ * put the whole message into a single buffer
+ */
+ msgleft = msg.size - sizeof(msg);
+ if (isreply) {
+ if (msg.size > reply->rsize) {
+ error_setg(&local_err,
+ "vfio_user_recv reply larger than recv buffer");
+ goto fatal;
+ }
+ *reply->msg = msg;
+ data = (char *)reply->msg + sizeof(msg);
+ } else {
+ if (msg.size > max_xfer_size) {
+ error_setg(&local_err, "vfio_user_recv request larger than max");
+ goto fatal;
+ }
+ buf = g_malloc0(msg.size);
+ memcpy(buf, &msg, sizeof(msg));
+ data = buf + sizeof(msg);
+ }
+
+ if (msgleft != 0) {
+ ret = qio_channel_read(proxy->ioc, data, msgleft, &local_err);
+ if (ret < 0) {
+ goto fatal;
+ }
+ if (ret != msgleft) {
+ error_setg(&local_err, "vfio_user_recv short read of msg body");
+ goto err;
+ }
+ }
+
+ /*
+ * Replies signal a waiter, requests get processed by vfio code
+ * that may assume the iothread lock is held.
+ */
+ qemu_mutex_unlock(&proxy->lock);
+ if (isreply) {
+ reply->complete = 1;
+ qemu_cond_signal(&reply->cv);
+ } else {
+ qemu_mutex_lock_iothread();
+ /*
+ * make sure proxy wasn't closed while we waited
+ * checking without holding the proxy lock is safe
+ * since state is only set to CLOSING when iolock is held
+ */
+ if (proxy->state != CLOSING) {
+ ret = proxy->request(proxy->reqarg, buf, &reqfds);
+ if (ret < 0 && !(msg.flags & VFIO_USER_NO_REPLY)) {
+ vfio_user_send_reply(proxy, buf, ret);
+ }
+ }
+ qemu_mutex_unlock_iothread();
+ }
+
+ return;
+ fatal:
+ vfio_user_shutdown(proxy);
+ proxy->state = RECV_ERROR;
+
+ err:
+ qemu_mutex_unlock(&proxy->lock);
+ for (i = 0; i < numfds; i++) {
+ close(fdp[i]);
+ }
+ if (reply != NULL) {
+ /* force an error to keep sending thread from hanging */
+ reply->msg->flags |= VFIO_USER_ERROR;
+ reply->msg->error_reply = EINVAL;
+ reply->complete = 1;
+ qemu_cond_signal(&reply->cv);
+ }
+ error_report_err(local_err);
+}
+
+static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg,
+ VFIOUserFDs *fds)
+{
+ struct iovec iov = {
+ .iov_base = msg,
+ .iov_len = msg->size,
+ };
+ size_t numfds = 0;
+ int msgleft, ret, *fdp = NULL;
+ char *buf;
+ Error *local_err = NULL;
+
+ if (proxy->state != CONNECTED) {
+ msg->flags |= VFIO_USER_ERROR;
+ msg->error_reply = ECONNRESET;
+ return;
+ }
+
+ if (fds != NULL && fds->send_fds != 0) {
+ numfds = fds->send_fds;
+ fdp = fds->fds;
+ }
+ ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, &local_err);
+ if (ret < 0) {
+ goto err;
+ }
+ if (ret == msg->size) {
+ return;
+ }
+
+ buf = iov.iov_base + ret;
+ msgleft = iov.iov_len - ret;
+ do {
+ ret = qio_channel_write(proxy->ioc, buf, msgleft, &local_err);
+ if (ret < 0) {
+ goto err;
+ }
+ buf += ret, msgleft -= ret;
+ } while (msgleft != 0);
+ return;
+
+ err:
+ error_report_err(local_err);
+}
+
+static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg,
+ VFIOUserFDs *fds)
+{
+ bool iolock = qemu_mutex_iothread_locked();
+
+ if (iolock) {
+ qemu_mutex_unlock_iothread();
+ }
+ qemu_mutex_lock(&proxy->lock);
+ vfio_user_send_locked(proxy, msg, fds);
+ qemu_mutex_unlock(&proxy->lock);
+ if (iolock) {
+ qemu_mutex_lock_iothread();
+ }
+}
diff --git a/MAINTAINERS b/MAINTAINERS
index 12d69f3a45..aa4df6c418 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1883,8 +1883,12 @@ L: qemu-s390x@nongnu.org
vfio-user
M: John G Johnson <john.g.johnson@oracle.com>
M: Thanos Makatos <thanos.makatos@nutanix.com>
+M: Elena Ufimtseva <elena.ufimtseva@oracle.com>
+M: Jagannathan Raman <jag.raman@oracle.com>
S: Supported
F: docs/devel/vfio-user.rst
+F: hw/vfio/user.c
+F: hw/vfio/user.h
vhost
M: Michael S. Tsirkin <mst@redhat.com>
diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
index da9af297a0..739b30be73 100644
--- a/hw/vfio/meson.build
+++ b/hw/vfio/meson.build
@@ -8,6 +8,7 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
'display.c',
'pci-quirks.c',
'pci.c',
+ 'user.c',
))
vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c'))
vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c'))
--
2.25.1
On Sun, Jul 18, 2021 at 11:27:42PM -0700, Elena Ufimtseva wrote:
> From: John G Johnson <john.g.johnson@oracle.com>
>
> Add user.c and user.h files for vfio-user with the basic
> send and receive functions.
>
> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> ---
> hw/vfio/user.h | 120 ++++++++++++++
> include/hw/vfio/vfio-common.h | 2 +
> hw/vfio/user.c | 286 ++++++++++++++++++++++++++++++++++
> MAINTAINERS | 4 +
> hw/vfio/meson.build | 1 +
> 5 files changed, 413 insertions(+)
> create mode 100644 hw/vfio/user.h
> create mode 100644 hw/vfio/user.c
The multi-threading, coroutine, and blocking I/O requirements of
vfio_user_recv() and vfio_user_send_reply() are unclear to me. Please
document them so it's clear what environment they can be called from. I
guess they are not called from coroutines and proxy->ioc is a blocking
IOChannel?
>
> diff --git a/hw/vfio/user.h b/hw/vfio/user.h
> new file mode 100644
> index 0000000000..cdbc074579
> --- /dev/null
> +++ b/hw/vfio/user.h
> @@ -0,0 +1,120 @@
> +#ifndef VFIO_USER_H
> +#define VFIO_USER_H
> +
> +/*
> + * vfio protocol over a UNIX socket.
> + *
> + * Copyright © 2018, 2021 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2. See
> + * the COPYING file in the top-level directory.
> + *
> + * Each message has a standard header that describes the command
> + * being sent, which is almost always a VFIO ioctl().
> + *
> + * The header may be followed by command-specfic data, such as the
> + * region and offset info for read and write commands.
> + */
> +
> +/* commands */
> +enum vfio_user_command {
> + VFIO_USER_VERSION = 1,
> + VFIO_USER_DMA_MAP = 2,
> + VFIO_USER_DMA_UNMAP = 3,
> + VFIO_USER_DEVICE_GET_INFO = 4,
> + VFIO_USER_DEVICE_GET_REGION_INFO = 5,
> + VFIO_USER_DEVICE_GET_REGION_IO_FDS = 6,
> + VFIO_USER_DEVICE_GET_IRQ_INFO = 7,
> + VFIO_USER_DEVICE_SET_IRQS = 8,
> + VFIO_USER_REGION_READ = 9,
> + VFIO_USER_REGION_WRITE = 10,
> + VFIO_USER_DMA_READ = 11,
> + VFIO_USER_DMA_WRITE = 12,
> + VFIO_USER_DEVICE_RESET = 13,
> + VFIO_USER_DIRTY_PAGES = 14,
> + VFIO_USER_MAX,
> +};
> +
> +/* flags */
> +#define VFIO_USER_REQUEST 0x0
> +#define VFIO_USER_REPLY 0x1
> +#define VFIO_USER_TYPE 0xF
> +
> +#define VFIO_USER_NO_REPLY 0x10
> +#define VFIO_USER_ERROR 0x20
> +
> +typedef struct vfio_user_hdr {
> + uint16_t id;
> + uint16_t command;
> + uint32_t size;
> + uint32_t flags;
> + uint32_t error_reply;
> +} vfio_user_hdr_t;
Please use QEMU coding style in QEMU code (i.e. not shared with Linux or
external libraries):
typedef struct {
...
} VfioUserHdr;
You can also specify the struct VfioUserHdr tag if you want but it's
only necessary to reference the struct before the end of the typedef
definition.
https://qemu-project.gitlab.io/qemu/devel/style.html
> +
> +/*
> + * VFIO_USER_VERSION
> + */
> +#define VFIO_USER_MAJOR_VER 0
> +#define VFIO_USER_MINOR_VER 0
> +
> +struct vfio_user_version {
> + vfio_user_hdr_t hdr;
> + uint16_t major;
> + uint16_t minor;
> + char capabilities[];
> +};
> +
> +#define VFIO_USER_DEF_MAX_FDS 8
> +#define VFIO_USER_MAX_MAX_FDS 16
> +
> +#define VFIO_USER_DEF_MAX_XFER (1024 * 1024)
> +#define VFIO_USER_MAX_MAX_XFER (64 * 1024 * 1024)
> +
> +typedef struct VFIOUserFDs {
> + int send_fds;
> + int recv_fds;
> + int *fds;
> +} VFIOUserFDs;
I think around here we switch from vfio-user spec definitions to QEMU
implementation details. It might be nice to keep the vfio-user spec
definitions in a separate header file so the boundary is clear.
> +
> +typedef struct VFIOUserReply {
> + QTAILQ_ENTRY(VFIOUserReply) next;
> + vfio_user_hdr_t *msg;
> + VFIOUserFDs *fds;
> + int rsize;
> + uint32_t id;
> + QemuCond cv;
> + uint8_t complete;
Please use bool.
> +} VFIOUserReply;
> +
> +enum proxy_state {
> + CONNECTED = 1,
> + RECV_ERROR = 2,
> + CLOSING = 3,
> + CLOSED = 4,
> +};
These enum values probably need a prefix (VFIO_PROXY_*). Generic short
names like CONNECTED, CLOSED, etc could lead to namespace collisions.
Enum constants are in the global namespace.
> +
> +typedef struct VFIOProxy {
> + QLIST_ENTRY(VFIOProxy) next;
> + char *sockname;
> + struct QIOChannel *ioc;
> + int (*request)(void *opaque, char *buf, VFIOUserFDs *fds);
> + void *reqarg;
> + int flags;
> + QemuCond close_cv;
> +
> + /*
> + * above only changed when iolock is held
Please use "BQL" instead of "iolock". git grep shows many results for
BQL and the only result for iolock is in mpqemu code.
> + * below are protected by per-proxy lock
> + */
> + QemuMutex lock;
> + QTAILQ_HEAD(, VFIOUserReply) free;
> + QTAILQ_HEAD(, VFIOUserReply) pending;
> + enum proxy_state state;
> + int close_wait;
Is this a bool? Please use bool.
> +} VFIOProxy;
> +
> +#define VFIO_PROXY_CLIENT 0x1
A comment that shows which field VFIO_PROXY_CLIENT relates to would make this clearer:
/* VFIOProxy->flags */
#define VFIO_PROXY_CLIENT 0x1
> +
> +void vfio_user_recv(void *opaque);
> +void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret);
> +#endif /* VFIO_USER_H */
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 8af11b0a76..f43dc6e5d0 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -75,6 +75,7 @@ typedef struct VFIOAddressSpace {
> } VFIOAddressSpace;
>
> struct VFIOGroup;
> +typedef struct VFIOProxy VFIOProxy;
>
> typedef struct VFIOContainer {
> VFIOAddressSpace *space;
> @@ -143,6 +144,7 @@ typedef struct VFIODevice {
> VFIOMigration *migration;
> Error *migration_blocker;
> OnOffAuto pre_copy_dirty_page_tracking;
> + VFIOProxy *proxy;
> } VFIODevice;
>
> struct VFIODeviceOps {
> diff --git a/hw/vfio/user.c b/hw/vfio/user.c
> new file mode 100644
> index 0000000000..021d5540e0
> --- /dev/null
> +++ b/hw/vfio/user.c
> @@ -0,0 +1,286 @@
> +/*
> + * vfio protocol over a UNIX socket.
> + *
> + * Copyright © 2018, 2021 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu/osdep.h"
> +#include <linux/vfio.h>
> +#include <sys/ioctl.h>
> +
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
> +#include "qemu/main-loop.h"
> +#include "hw/hw.h"
> +#include "hw/vfio/vfio-common.h"
> +#include "hw/vfio/vfio.h"
> +#include "qemu/sockets.h"
> +#include "io/channel.h"
> +#include "io/channel-util.h"
> +#include "sysemu/iothread.h"
> +#include "user.h"
> +
> +static uint64_t max_xfer_size = VFIO_USER_DEF_MAX_XFER;
> +static IOThread *vfio_user_iothread;
> +static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg,
> + VFIOUserFDs *fds);
> +static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg,
> + VFIOUserFDs *fds);
> +static void vfio_user_shutdown(VFIOProxy *proxy);
> +
> +static void vfio_user_shutdown(VFIOProxy *proxy)
> +{
> + qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL);
> + qio_channel_set_aio_fd_handler(proxy->ioc,
> + iothread_get_aio_context(vfio_user_iothread),
> + NULL, NULL, NULL);
There is no other qio_channel_set_aio_fd_handler() call in this patch.
Why is this one necessary?
> +}
> +
> +void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret)
> +{
> + vfio_user_hdr_t *hdr = (vfio_user_hdr_t *)buf;
> +
> + /*
> + * convert header to associated reply
> + * positive ret is reply size, negative is error code
> + */
> + hdr->flags = VFIO_USER_REPLY;
> + if (ret > 0) {
> + hdr->size = ret;
> + } else if (ret < 0) {
> + hdr->flags |= VFIO_USER_ERROR;
> + hdr->error_reply = -ret;
> + hdr->size = sizeof(*hdr);
> + }
assert(ret != 0)? That case doesn't seem to be defined so maybe an
assertion is worthwhile.
> + vfio_user_send(proxy, hdr, NULL);
> +}
> +
> +void vfio_user_recv(void *opaque)
> +{
> + VFIODevice *vbasedev = opaque;
> + VFIOProxy *proxy = vbasedev->proxy;
> + VFIOUserReply *reply = NULL;
> + g_autofree int *fdp = NULL;
> + VFIOUserFDs reqfds = { 0, 0, fdp };
> + vfio_user_hdr_t msg;
> + struct iovec iov = {
> + .iov_base = &msg,
> + .iov_len = sizeof(msg),
> + };
> + int isreply, i, ret;
> + size_t msgleft, numfds = 0;
> + char *data = NULL;
> + g_autofree char *buf = NULL;
> + Error *local_err = NULL;
> +
> + qemu_mutex_lock(&proxy->lock);
> + if (proxy->state == CLOSING) {
> + qemu_mutex_unlock(&proxy->lock);
QEMU_LOCK_GUARD() automatically unlocks mutexes when the function
returns and is less error-prone than manual lock/unlock calls.
> + return;
> + }
> +
> + ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds,
> + &local_err);
> + if (ret <= 0) {
> + /* read error or other side closed connection */
> + error_setg_errno(&local_err, errno, "vfio_user_recv read error");
This will trigger an assertion failure when local_err was already set by
qio_channel_readv_full():
static void error_setv(Error **errp,
const char *src, int line, const char *func,
ErrorClass err_class, const char *fmt, va_list ap,
const char *suffix)
{
Error *err;
int saved_errno = errno;
if (errp == NULL) {
return;
}
assert(*errp == NULL);
^^^^^^^^^^^^^^^^^^^^^^
I think this error_setg_errno() call should be dropped. You can use
error_prepend() if you'd like to add more information to the error
message from qio_channel_readv_full().
> + goto fatal;
> + }
> +
> + if (ret < sizeof(msg)) {
> + error_setg(&local_err, "vfio_user_recv short read of header");
> + goto err;
> + }
> +
> + /*
> + * For replies, find the matching pending request
> + */
> + switch (msg.flags & VFIO_USER_TYPE) {
> + case VFIO_USER_REQUEST:
> + isreply = 0;
> + break;
> + case VFIO_USER_REPLY:
> + isreply = 1;
> + break;
> + default:
> + error_setg(&local_err, "vfio_user_recv unknown message type");
> + goto err;
> + }
> +
> + if (isreply) {
> + QTAILQ_FOREACH(reply, &proxy->pending, next) {
> + if (msg.id == reply->id) {
> + break;
> + }
> + }
I'm surprised to see this loop since proxy->lock prevents additional
requests from being sent while we're trying to receive a message. Can
there really be multiple replies pending with this locking scheme?
> + if (reply == NULL) {
> + error_setg(&local_err, "vfio_user_recv unexpected reply");
> + goto err;
> + }
> + QTAILQ_REMOVE(&proxy->pending, reply, next);
> +
> + /*
> + * Process any received FDs
> + */
> + if (numfds != 0) {
> + if (reply->fds == NULL || reply->fds->recv_fds < numfds) {
> + error_setg(&local_err, "vfio_user_recv unexpected FDs");
> + goto err;
> + }
> + reply->fds->recv_fds = numfds;
> + memcpy(reply->fds->fds, fdp, numfds * sizeof(int));
> + }
> +
> + } else {
> + /*
> + * The client doesn't expect any FDs in requests, but
> + * they will be expected on the server
> + */
> + if (numfds != 0 && (proxy->flags & VFIO_PROXY_CLIENT)) {
> + error_setg(&local_err, "vfio_user_recv fd in client reply");
> + goto err;
> + }
> + reqfds.recv_fds = numfds;
> + }
> +
> + /*
> + * put the whole message into a single buffer
> + */
> + msgleft = msg.size - sizeof(msg);
msg.size has not been validated so this could underflow. Please validate
all inputs so malicious servers/clients cannot crash or compromise the
program.
> + if (isreply) {
> + if (msg.size > reply->rsize) {
rsize is an int. Should it be uint32_t like msg.size?
> + error_setg(&local_err,
> + "vfio_user_recv reply larger than recv buffer");
> + goto fatal;
> + }
> + *reply->msg = msg;
> + data = (char *)reply->msg + sizeof(msg);
> + } else {
> + if (msg.size > max_xfer_size) {
> + error_setg(&local_err, "vfio_user_recv request larger than max");
> + goto fatal;
> + }
Missing check to prevent buffer overflow:
if (msg.size < sizeof(msg)) {
error_setg(&local_err, "vfio_user_recv request too small");
goto fatal;
}
> + buf = g_malloc0(msg.size);
> + memcpy(buf, &msg, sizeof(msg));
> + data = buf + sizeof(msg);
> + }
> +
> + if (msgleft != 0) {
> + ret = qio_channel_read(proxy->ioc, data, msgleft, &local_err);
> + if (ret < 0) {
> + goto fatal;
> + }
> + if (ret != msgleft) {
> + error_setg(&local_err, "vfio_user_recv short read of msg body");
> + goto err;
> + }
> + }
> +
> + /*
> + * Replies signal a waiter, requests get processed by vfio code
> + * that may assume the iothread lock is held.
> + */
> + qemu_mutex_unlock(&proxy->lock);
> + if (isreply) {
> + reply->complete = 1;
> + qemu_cond_signal(&reply->cv);
qemu_cond_signal() must be called with the mutex held to avoid a race
condition: if the waiter acquires the lock and sees complete == 0, but we
signal before the waiter enters qemu_cond_wait(), the signal is missed and
the waiter is stuck.
> + } else {
> + qemu_mutex_lock_iothread();
> + /*
> + * make sure proxy wasn't closed while we waited
> + * checking without holding the proxy lock is safe
> + * since state is only set to CLOSING when iolock is held
s/iolock/the BQL/
> + */
> + if (proxy->state != CLOSING) {
> + ret = proxy->request(proxy->reqarg, buf, &reqfds);
> + if (ret < 0 && !(msg.flags & VFIO_USER_NO_REPLY)) {
> + vfio_user_send_reply(proxy, buf, ret);
> + }
> + }
> + qemu_mutex_unlock_iothread();
> + }
> +
> + return;
> + fatal:
> + vfio_user_shutdown(proxy);
> + proxy->state = RECV_ERROR;
> +
> + err:
> + qemu_mutex_unlock(&proxy->lock);
> + for (i = 0; i < numfds; i++) {
> + close(fdp[i]);
> + }
> + if (reply != NULL) {
> + /* force an error to keep sending thread from hanging */
> + reply->msg->flags |= VFIO_USER_ERROR;
> + reply->msg->error_reply = EINVAL;
> + reply->complete = 1;
> + qemu_cond_signal(&reply->cv);
This has the race condition too.
> + }
> + error_report_err(local_err);
> +}
> +
> +static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg,
> + VFIOUserFDs *fds)
> +{
> + struct iovec iov = {
> + .iov_base = msg,
> + .iov_len = msg->size,
> + };
> + size_t numfds = 0;
> + int msgleft, ret, *fdp = NULL;
> + char *buf;
> + Error *local_err = NULL;
> +
> + if (proxy->state != CONNECTED) {
> + msg->flags |= VFIO_USER_ERROR;
> + msg->error_reply = ECONNRESET;
> + return;
> + }
> +
> + if (fds != NULL && fds->send_fds != 0) {
> + numfds = fds->send_fds;
> + fdp = fds->fds;
> + }
> + ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, &local_err);
> + if (ret < 0) {
> + goto err;
> + }
> + if (ret == msg->size) {
> + return;
> + }
> +
> + buf = iov.iov_base + ret;
> + msgleft = iov.iov_len - ret;
> + do {
> + ret = qio_channel_write(proxy->ioc, buf, msgleft, &local_err);
> + if (ret < 0) {
> + goto err;
> + }
> + buf += ret, msgleft -= ret;
Please use semicolon. Comma operators are rare, requiring readers to
check their exact semantics. There is no need to use comma here.
> + } while (msgleft != 0);
> + return;
> +
> + err:
> + error_report_err(local_err);
State remains unchanged and msg->error_reply isn't set?
> +}
> +
> +static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg,
> + VFIOUserFDs *fds)
> +{
> + bool iolock = qemu_mutex_iothread_locked();
> +
> + if (iolock) {
> + qemu_mutex_unlock_iothread();
> + }
> + qemu_mutex_lock(&proxy->lock);
> + vfio_user_send_locked(proxy, msg, fds);
> + qemu_mutex_unlock(&proxy->lock);
> + if (iolock) {
> + qemu_mutex_lock_iothread();
> + }
> +}
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 12d69f3a45..aa4df6c418 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1883,8 +1883,12 @@ L: qemu-s390x@nongnu.org
> vfio-user
> M: John G Johnson <john.g.johnson@oracle.com>
> M: Thanos Makatos <thanos.makatos@nutanix.com>
> +M: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> +M: Jagannathan Raman <jag.raman@oracle.com>
> S: Supported
> F: docs/devel/vfio-user.rst
> +F: hw/vfio/user.c
> +F: hw/vfio/user.h
>
> vhost
> M: Michael S. Tsirkin <mst@redhat.com>
> diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
> index da9af297a0..739b30be73 100644
> --- a/hw/vfio/meson.build
> +++ b/hw/vfio/meson.build
> @@ -8,6 +8,7 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
> 'display.c',
> 'pci-quirks.c',
> 'pci.c',
> + 'user.c',
> ))
> vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c'))
> vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c'))
> --
> 2.25.1
>
> On Jul 27, 2021, at 9:34 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> On Sun, Jul 18, 2021 at 11:27:42PM -0700, Elena Ufimtseva wrote:
>> From: John G Johnson <john.g.johnson@oracle.com>
>>
>> Add user.c and user.h files for vfio-user with the basic
>> send and receive functions.
>>
>> Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
>> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
>> ---
>> hw/vfio/user.h | 120 ++++++++++++++
>> include/hw/vfio/vfio-common.h | 2 +
>> hw/vfio/user.c | 286 ++++++++++++++++++++++++++++++++++
>> MAINTAINERS | 4 +
>> hw/vfio/meson.build | 1 +
>> 5 files changed, 413 insertions(+)
>> create mode 100644 hw/vfio/user.h
>> create mode 100644 hw/vfio/user.c
>
> The multi-threading, coroutine, and blocking I/O requirements of
> vfio_user_recv() and vfio_user_send_reply() are unclear to me. Please
> document them so it's clear what environment they can be called from. I
> guess they are not called from coroutines and proxy->ioc is a blocking
> IOChannel?
>
Yes to both; moreover, a block comment above vfio_user_recv() would
be useful. The call to set up vfio_user_recv() as the socket handler isn't
in this patch; do you want the series re-org'd?
>>
>> diff --git a/hw/vfio/user.h b/hw/vfio/user.h
>> new file mode 100644
>> index 0000000000..cdbc074579
>> --- /dev/null
>> +++ b/hw/vfio/user.h
>> @@ -0,0 +1,120 @@
>> +#ifndef VFIO_USER_H
>> +#define VFIO_USER_H
>> +
>> +/*
>> + * vfio protocol over a UNIX socket.
>> + *
>> + * Copyright © 2018, 2021 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2. See
>> + * the COPYING file in the top-level directory.
>> + *
>> + * Each message has a standard header that describes the command
>> + * being sent, which is almost always a VFIO ioctl().
>> + *
>> + * The header may be followed by command-specfic data, such as the
>> + * region and offset info for read and write commands.
>> + */
>> +
>> +/* commands */
>> +enum vfio_user_command {
>> + VFIO_USER_VERSION = 1,
>> + VFIO_USER_DMA_MAP = 2,
>> + VFIO_USER_DMA_UNMAP = 3,
>> + VFIO_USER_DEVICE_GET_INFO = 4,
>> + VFIO_USER_DEVICE_GET_REGION_INFO = 5,
>> + VFIO_USER_DEVICE_GET_REGION_IO_FDS = 6,
>> + VFIO_USER_DEVICE_GET_IRQ_INFO = 7,
>> + VFIO_USER_DEVICE_SET_IRQS = 8,
>> + VFIO_USER_REGION_READ = 9,
>> + VFIO_USER_REGION_WRITE = 10,
>> + VFIO_USER_DMA_READ = 11,
>> + VFIO_USER_DMA_WRITE = 12,
>> + VFIO_USER_DEVICE_RESET = 13,
>> + VFIO_USER_DIRTY_PAGES = 14,
>> + VFIO_USER_MAX,
>> +};
>> +
>> +/* flags */
>> +#define VFIO_USER_REQUEST 0x0
>> +#define VFIO_USER_REPLY 0x1
>> +#define VFIO_USER_TYPE 0xF
>> +
>> +#define VFIO_USER_NO_REPLY 0x10
>> +#define VFIO_USER_ERROR 0x20
>> +
>> +typedef struct vfio_user_hdr {
>> + uint16_t id;
>> + uint16_t command;
>> + uint32_t size;
>> + uint32_t flags;
>> + uint32_t error_reply;
>> +} vfio_user_hdr_t;
>
> Please use QEMU coding style in QEMU code (i.e. not shared with Linux or
> external libraries):
>
> typedef struct {
> ...
> } VfioUserHdr;
>
> You can also specify the struct VfioUserHdr tag if you want but it's
> only necessary to reference the struct before the end of the typedef
> definition.
>
> https://qemu-project.gitlab.io/qemu/devel/style.html
>
OK
>> +
>> +/*
>> + * VFIO_USER_VERSION
>> + */
>> +#define VFIO_USER_MAJOR_VER 0
>> +#define VFIO_USER_MINOR_VER 0
>> +
>> +struct vfio_user_version {
>> + vfio_user_hdr_t hdr;
>> + uint16_t major;
>> + uint16_t minor;
>> + char capabilities[];
>> +};
>> +
>> +#define VFIO_USER_DEF_MAX_FDS 8
>> +#define VFIO_USER_MAX_MAX_FDS 16
>> +
>> +#define VFIO_USER_DEF_MAX_XFER (1024 * 1024)
>> +#define VFIO_USER_MAX_MAX_XFER (64 * 1024 * 1024)
>> +
>> +typedef struct VFIOUserFDs {
>> + int send_fds;
>> + int recv_fds;
>> + int *fds;
>> +} VFIOUserFDs;
>
> I think around here we switch from vfio-user spec definitions to QEMU
> implementation details. It might be nice to keep the vfio-user spec
> definitions in a separate header file so the boundary is clear.
>
OK
>> +
>> +typedef struct VFIOUserReply {
>> + QTAILQ_ENTRY(VFIOUserReply) next;
>> + vfio_user_hdr_t *msg;
>> + VFIOUserFDs *fds;
>> + int rsize;
>> + uint32_t id;
>> + QemuCond cv;
>> + uint8_t complete;
>
> Please use bool.
>
OK
>> +} VFIOUserReply;
>> +
>> +enum proxy_state {
>> + CONNECTED = 1,
>> + RECV_ERROR = 2,
>> + CLOSING = 3,
>> + CLOSED = 4,
>> +};
>
> These enum values probably need a prefix (VFIO_PROXY_*). Generic short
> names like CONNECTED, CLOSED, etc could lead to namespace collisions.
> Enum constants are in the global namespace.
>
OK
>> +
>> +typedef struct VFIOProxy {
>> + QLIST_ENTRY(VFIOProxy) next;
>> + char *sockname;
>> + struct QIOChannel *ioc;
>> + int (*request)(void *opaque, char *buf, VFIOUserFDs *fds);
>> + void *reqarg;
>> + int flags;
>> + QemuCond close_cv;
>> +
>> + /*
>> + * above only changed when iolock is held
>
> Please use "BQL" instead of "iolock". git grep shows many results for
> BQL and the only result for iolock is in mpqemu code.
>
OK
>> + * below are protected by per-proxy lock
>> + */
>> + QemuMutex lock;
>> + QTAILQ_HEAD(, VFIOUserReply) free;
>> + QTAILQ_HEAD(, VFIOUserReply) pending;
>> + enum proxy_state state;
>> + int close_wait;
>
> Is this a bool? Please use bool.
yes it’s a bool
>
>> +} VFIOProxy;
>> +
>> +#define VFIO_PROXY_CLIENT 0x1
>
> A comment that shows which field VFIO_PROXY_CLIENT relates to would make this clearer:
>
> /* VFIOProxy->flags */
> #define VFIO_PROXY_CLIENT 0x1
>
OK
>> +
>> +void vfio_user_recv(void *opaque);
>> +void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret);
>> +#endif /* VFIO_USER_H */
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 8af11b0a76..f43dc6e5d0 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -75,6 +75,7 @@ typedef struct VFIOAddressSpace {
>> } VFIOAddressSpace;
>>
>> struct VFIOGroup;
>> +typedef struct VFIOProxy VFIOProxy;
>>
>> typedef struct VFIOContainer {
>> VFIOAddressSpace *space;
>> @@ -143,6 +144,7 @@ typedef struct VFIODevice {
>> VFIOMigration *migration;
>> Error *migration_blocker;
>> OnOffAuto pre_copy_dirty_page_tracking;
>> + VFIOProxy *proxy;
>> } VFIODevice;
>>
>> struct VFIODeviceOps {
>> diff --git a/hw/vfio/user.c b/hw/vfio/user.c
>> new file mode 100644
>> index 0000000000..021d5540e0
>> --- /dev/null
>> +++ b/hw/vfio/user.c
>> @@ -0,0 +1,286 @@
>> +/*
>> + * vfio protocol over a UNIX socket.
>> + *
>> + * Copyright © 2018, 2021 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include <linux/vfio.h>
>> +#include <sys/ioctl.h>
>> +
>> +#include "qemu/error-report.h"
>> +#include "qapi/error.h"
>> +#include "qemu/main-loop.h"
>> +#include "hw/hw.h"
>> +#include "hw/vfio/vfio-common.h"
>> +#include "hw/vfio/vfio.h"
>> +#include "qemu/sockets.h"
>> +#include "io/channel.h"
>> +#include "io/channel-util.h"
>> +#include "sysemu/iothread.h"
>> +#include "user.h"
>> +
>> +static uint64_t max_xfer_size = VFIO_USER_DEF_MAX_XFER;
>> +static IOThread *vfio_user_iothread;
>> +static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg,
>> + VFIOUserFDs *fds);
>> +static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg,
>> + VFIOUserFDs *fds);
>> +static void vfio_user_shutdown(VFIOProxy *proxy);
>> +
>> +static void vfio_user_shutdown(VFIOProxy *proxy)
>> +{
>> + qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL);
>> + qio_channel_set_aio_fd_handler(proxy->ioc,
>> + iothread_get_aio_context(vfio_user_iothread),
>> + NULL, NULL, NULL);
>
> There is no other qio_channel_set_aio_fd_handler() call in this patch.
> Why is this one necessary?
>
See first comment.
>> +}
>> +
>> +void vfio_user_send_reply(VFIOProxy *proxy, char *buf, int ret)
>> +{
>> + vfio_user_hdr_t *hdr = (vfio_user_hdr_t *)buf;
>> +
>> + /*
>> + * convert header to associated reply
>> + * positive ret is reply size, negative is error code
>> + */
>> + hdr->flags = VFIO_USER_REPLY;
>> + if (ret > 0) {
>> + hdr->size = ret;
>> + } else if (ret < 0) {
>> + hdr->flags |= VFIO_USER_ERROR;
>> + hdr->error_reply = -ret;
>> + hdr->size = sizeof(*hdr);
>> + }
>
> assert(ret != 0)? That case doesn't seem to be defined so maybe an
> assertion is worthwhile.
>
I should also treat a positive size smaller than the header size as an error.
>> + vfio_user_send(proxy, hdr, NULL);
>> +}
>> +
>> +void vfio_user_recv(void *opaque)
>> +{
>> + VFIODevice *vbasedev = opaque;
>> + VFIOProxy *proxy = vbasedev->proxy;
>> + VFIOUserReply *reply = NULL;
>> + g_autofree int *fdp = NULL;
>> + VFIOUserFDs reqfds = { 0, 0, fdp };
>> + vfio_user_hdr_t msg;
>> + struct iovec iov = {
>> + .iov_base = &msg,
>> + .iov_len = sizeof(msg),
>> + };
>> + int isreply, i, ret;
>> + size_t msgleft, numfds = 0;
>> + char *data = NULL;
>> + g_autofree char *buf = NULL;
>> + Error *local_err = NULL;
>> +
>> + qemu_mutex_lock(&proxy->lock);
>> + if (proxy->state == CLOSING) {
>> + qemu_mutex_unlock(&proxy->lock);
>
> QEMU_LOCK_GUARD() automatically unlocks mutexes when the function
> returns and is less error-prone than manual lock/unlock calls.
>
will look into it
>> + return;
>> + }
>> +
>> + ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds,
>> + &local_err);
>> + if (ret <= 0) {
>> + /* read error or other side closed connection */
>> + error_setg_errno(&local_err, errno, "vfio_user_recv read error");
>
> This will trigger an assertion failure when local_err was already set by
> qio_channel_readv_full():
>
> static void error_setv(Error **errp,
> const char *src, int line, const char *func,
> ErrorClass err_class, const char *fmt, va_list ap,
> const char *suffix)
> {
> Error *err;
> int saved_errno = errno;
>
> if (errp == NULL) {
> return;
> }
> assert(*errp == NULL);
> ^^^^^^^^^^^^^^^^^^^^^^
>
> I think this error_setg_errno() call should be dropped. You can use
> error_prepend() if you'd like to add more information to the error
> message from qio_channel_readv_full().
>
OK
>> + goto fatal;
>> + }
>> +
>> + if (ret < sizeof(msg)) {
>> + error_setg(&local_err, "vfio_user_recv short read of header");
>> + goto err;
>> + }
>> +
>> + /*
>> + * For replies, find the matching pending request
>> + */
>> + switch (msg.flags & VFIO_USER_TYPE) {
>> + case VFIO_USER_REQUEST:
>> + isreply = 0;
>> + break;
>> + case VFIO_USER_REPLY:
>> + isreply = 1;
>> + break;
>> + default:
>> + error_setg(&local_err, "vfio_user_recv unknown message type");
>> + goto err;
>> + }
>> +
>> + if (isreply) {
>> + QTAILQ_FOREACH(reply, &proxy->pending, next) {
>> + if (msg.id == reply->id) {
>> + break;
>> + }
>> + }
>
> I'm surprised to see this loop since proxy->lock prevents additional
> requests from being sent while we're trying to receive a message. Can
> there really be multiple replies pending with this locking scheme?
>
I didn’t want to assume that was always true. Note an email
exchange with Peter Xu where I can drop the BQL in the middle of a memory
region transaction that causes dma_map/unmap messages to be sent. The
fix to that issue will be to send the messages async, then wait for the
youngest reply when the transaction commits.
>> + if (reply == NULL) {
>> + error_setg(&local_err, "vfio_user_recv unexpected reply");
>> + goto err;
>> + }
>> + QTAILQ_REMOVE(&proxy->pending, reply, next);
>> +
>> + /*
>> + * Process any received FDs
>> + */
>> + if (numfds != 0) {
>> + if (reply->fds == NULL || reply->fds->recv_fds < numfds) {
>> + error_setg(&local_err, "vfio_user_recv unexpected FDs");
>> + goto err;
>> + }
>> + reply->fds->recv_fds = numfds;
>> + memcpy(reply->fds->fds, fdp, numfds * sizeof(int));
>> + }
>> +
>> + } else {
>> + /*
>> + * The client doesn't expect any FDs in requests, but
>> + * they will be expected on the server
>> + */
>> + if (numfds != 0 && (proxy->flags & VFIO_PROXY_CLIENT)) {
>> + error_setg(&local_err, "vfio_user_recv fd in client reply");
>> + goto err;
>> + }
>> + reqfds.recv_fds = numfds;
>> + }
>> +
>> + /*
>> + * put the whole message into a single buffer
>> + */
>> + msgleft = msg.size - sizeof(msg);
>
> msg.size has not been validated so this could underflow. Please validate
> all inputs so malicious servers/clients cannot crash or compromise the
> program.
>
OK
>> + if (isreply) {
>> + if (msg.size > reply->rsize) {
>
> rsize is an int. Should it be uint32_t like msg.size?
>
OK
>> + error_setg(&local_err,
>> + "vfio_user_recv reply larger than recv buffer");
>> + goto fatal;
>> + }
>> + *reply->msg = msg;
>> + data = (char *)reply->msg + sizeof(msg);
>> + } else {
>> + if (msg.size > max_xfer_size) {
>> + error_setg(&local_err, "vfio_user_recv request larger than max");
>> + goto fatal;
>> + }
>
> Missing check to prevent buffer overflow:
>
> if (msg.size < sizeof(msg)) {
> error_setg(&local_err, "vfio_user_recv request too small");
> goto fatal;
> }
>
I will put this check up before the msgleft calculation in
the review comment above.
>> + buf = g_malloc0(msg.size);
>> + memcpy(buf, &msg, sizeof(msg));
>> + data = buf + sizeof(msg);
>> + }
>> +
>> + if (msgleft != 0) {
>> + ret = qio_channel_read(proxy->ioc, data, msgleft, &local_err);
>> + if (ret < 0) {
>> + goto fatal;
>> + }
>> + if (ret != msgleft) {
>> + error_setg(&local_err, "vfio_user_recv short read of msg body");
>> + goto err;
>> + }
>> + }
>> +
>> + /*
>> + * Replies signal a waiter, requests get processed by vfio code
>> + * that may assume the iothread lock is held.
>> + */
>> + qemu_mutex_unlock(&proxy->lock);
>> + if (isreply) {
>> + reply->complete = 1;
>> + qemu_cond_signal(&reply->cv);
>
> signal must be called with the mutex held to avoid race conditions. If
> the waiter acquires the lock and still sees complete == 0, then we
> signal before wait is entered, the signal is missed and the waiter is
> stuck.
>
Yes, this is a bug
>> + } else {
>> + qemu_mutex_lock_iothread();
>> + /*
>> + * make sure proxy wasn't closed while we waited
>> + * checking without holding the proxy lock is safe
>> + * since state is only set to CLOSING when iolock is held
>
> s/iolock/the BQL/
>
OK
>> + */
>> + if (proxy->state != CLOSING) {
>> + ret = proxy->request(proxy->reqarg, buf, &reqfds);
>> + if (ret < 0 && !(msg.flags & VFIO_USER_NO_REPLY)) {
>> + vfio_user_send_reply(proxy, buf, ret);
>> + }
>> + }
>> + qemu_mutex_unlock_iothread();
>> + }
>> +
>> + return;
>> + fatal:
>> + vfio_user_shutdown(proxy);
>> + proxy->state = RECV_ERROR;
>> +
>> + err:
>> + qemu_mutex_unlock(&proxy->lock);
>> + for (i = 0; i < numfds; i++) {
>> + close(fdp[i]);
>> + }
>> + if (reply != NULL) {
>> + /* force an error to keep sending thread from hanging */
>> + reply->msg->flags |= VFIO_USER_ERROR;
>> + reply->msg->error_reply = EINVAL;
>> + reply->complete = 1;
>> + qemu_cond_signal(&reply->cv);
>
> This has the race condition too.
>
Yes
>> + }
>> + error_report_err(local_err);
>> +}
>> +
>> +static void vfio_user_send_locked(VFIOProxy *proxy, vfio_user_hdr_t *msg,
>> + VFIOUserFDs *fds)
>> +{
>> + struct iovec iov = {
>> + .iov_base = msg,
>> + .iov_len = msg->size,
>> + };
>> + size_t numfds = 0;
>> + int msgleft, ret, *fdp = NULL;
>> + char *buf;
>> + Error *local_err = NULL;
>> +
>> + if (proxy->state != CONNECTED) {
>> + msg->flags |= VFIO_USER_ERROR;
>> + msg->error_reply = ECONNRESET;
>> + return;
>> + }
>> +
>> + if (fds != NULL && fds->send_fds != 0) {
>> + numfds = fds->send_fds;
>> + fdp = fds->fds;
>> + }
>> + ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, &local_err);
>> + if (ret < 0) {
>> + goto err;
>> + }
>> + if (ret == msg->size) {
>> + return;
>> + }
>> +
>> + buf = iov.iov_base + ret;
>> + msgleft = iov.iov_len - ret;
>> + do {
>> + ret = qio_channel_write(proxy->ioc, buf, msgleft, &local_err);
>> + if (ret < 0) {
>> + goto err;
>> + }
>> + buf += ret, msgleft -= ret;
>
> Please use semicolon. Comma operators are rare, requiring readers to
> check their exact semantics. There is no need to use comma here.
>
OK
>> + } while (msgleft != 0);
>> + return;
>> +
>> + err:
>> + error_report_err(local_err);
>
> State remains unchanged and msg->error_reply isn't set?
>
They should be set.
JJ
>> +}
>> +
>> +static void vfio_user_send(VFIOProxy *proxy, vfio_user_hdr_t *msg,
>> + VFIOUserFDs *fds)
>> +{
>> + bool iolock = qemu_mutex_iothread_locked();
>> +
>> + if (iolock) {
>> + qemu_mutex_unlock_iothread();
>> + }
>> + qemu_mutex_lock(&proxy->lock);
>> + vfio_user_send_locked(proxy, msg, fds);
>> + qemu_mutex_unlock(&proxy->lock);
>> + if (iolock) {
>> + qemu_mutex_lock_iothread();
>> + }
>> +}
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 12d69f3a45..aa4df6c418 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -1883,8 +1883,12 @@ L: qemu-s390x@nongnu.org
>> vfio-user
>> M: John G Johnson <john.g.johnson@oracle.com>
>> M: Thanos Makatos <thanos.makatos@nutanix.com>
>> +M: Elena Ufimtseva <elena.ufimtseva@oracle.com>
>> +M: Jagannathan Raman <jag.raman@oracle.com>
>> S: Supported
>> F: docs/devel/vfio-user.rst
>> +F: hw/vfio/user.c
>> +F: hw/vfio/user.h
>>
>> vhost
>> M: Michael S. Tsirkin <mst@redhat.com>
>> diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
>> index da9af297a0..739b30be73 100644
>> --- a/hw/vfio/meson.build
>> +++ b/hw/vfio/meson.build
>> @@ -8,6 +8,7 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
>> 'display.c',
>> 'pci-quirks.c',
>> 'pci.c',
>> + 'user.c',
>> ))
>> vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c'))
>> vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c'))
>> --
>> 2.25.1
>>
On Wed, Jul 28, 2021 at 06:08:26PM +0000, John Johnson wrote: > > > > On Jul 27, 2021, at 9:34 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote: > > > > On Sun, Jul 18, 2021 at 11:27:42PM -0700, Elena Ufimtseva wrote: > >> From: John G Johnson <john.g.johnson@oracle.com> > >> > >> Add user.c and user.h files for vfio-user with the basic > >> send and receive functions. > >> > >> Signed-off-by: John G Johnson <john.g.johnson@oracle.com> > >> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com> > >> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com> > >> --- > >> hw/vfio/user.h | 120 ++++++++++++++ > >> include/hw/vfio/vfio-common.h | 2 + > >> hw/vfio/user.c | 286 ++++++++++++++++++++++++++++++++++ > >> MAINTAINERS | 4 + > >> hw/vfio/meson.build | 1 + > >> 5 files changed, 413 insertions(+) > >> create mode 100644 hw/vfio/user.h > >> create mode 100644 hw/vfio/user.c > > > > The multi-threading, coroutine, and blocking I/O requirements of > > vfio_user_recv() and vfio_user_send_reply() are unclear to me. Please > > document them so it's clear what environment they can be called from. I > > guess they are not called from coroutines and proxy->ioc is a blocking > > IOChannel? > > > > Yes to both, moreover, a block comment above vfio_user_recv() would > be useful. The call to setup vfio_user_recv() as the socket handler isn’t > in this patch, do you want the series re-org’d? That would help with review, thanks! Stefan
© 2016 - 2026 Red Hat, Inc.