[PATCH 13/14] vfio: add device IO ops vector

John Levon posted 14 patches 7 months, 1 week ago
There is a newer version of this series
[PATCH 13/14] vfio: add device IO ops vector
Posted by John Levon 7 months, 1 week ago
For vfio-user, device operations such as IRQ handling and region
read/writes are implemented in userspace over the control socket, not
ioctl() or read()/write() to the vfio kernel driver; add an ops vector
to generalize this, and implement vfio_device_io_ops_ioctl for
interacting with the kernel vfio driver.

Originally-by: John Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John Levon <john.levon@nutanix.com>
---
 hw/vfio/ap.c                  |   2 +-
 hw/vfio/ccw.c                 |   2 +-
 hw/vfio/container-base.c      |   6 +-
 hw/vfio/device.c              | 102 ++++++++++++++++++++++++++++++----
 hw/vfio/listener.c            |  13 +++--
 hw/vfio/pci.c                 |  40 +++++++------
 hw/vfio/platform.c            |   2 +-
 hw/vfio/region.c              |  17 ++++--
 include/hw/vfio/vfio-device.h |  24 +++++++-
 9 files changed, 155 insertions(+), 53 deletions(-)

diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index f311bca5b6..b6233b2107 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -229,7 +229,7 @@ static void vfio_ap_instance_init(Object *obj)
      * handle ram_block_discard_disable().
      */
     vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops,
-                     DEVICE(vapdev), true);
+                     &vfio_device_io_ops_ioctl, DEVICE(vapdev), true);
 
     /* AP device is mdev type device */
     vbasedev->mdev = true;
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 14dee7cd19..aee52b5a8d 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -676,7 +676,7 @@ static void vfio_ccw_instance_init(Object *obj)
      * ram_block_discard_disable().
      */
     vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops,
-                     DEVICE(vcdev), true);
+                     &vfio_device_io_ops_ioctl, DEVICE(vcdev), true);
 }
 
 #ifdef CONFIG_IOMMUFD
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 613fe1a00d..16fe5f79d2 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -198,11 +198,7 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
     feature->flags = VFIO_DEVICE_FEATURE_GET |
                      VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
 
-    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
-        return -errno;
-    }
-
-    return 0;
+    return vbasedev->io_ops->device_feature(vbasedev, feature);
 }
 
 static int vfio_container_iommu_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/device.c b/hw/vfio/device.c
index 102fa5a9b4..545d9f1faf 100644
--- a/hw/vfio/device.c
+++ b/hw/vfio/device.c
@@ -82,7 +82,7 @@ void vfio_device_irq_disable(VFIODevice *vbasedev, int index)
         .count = 0,
     };
 
-    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+    vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
 }
 
 void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
@@ -95,7 +95,7 @@ void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
         .count = 1,
     };
 
-    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+    vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
 }
 
 void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
@@ -108,7 +108,7 @@ void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
         .count = 1,
     };
 
-    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+    vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
 }
 
 static inline const char *action_to_str(int action)
@@ -155,6 +155,7 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
     int argsz;
     const char *name;
     int32_t *pfd;
+    int ret;
 
     argsz = sizeof(*irq_set) + sizeof(*pfd);
 
@@ -167,7 +168,9 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
     pfd = (int32_t *)&irq_set->data;
     *pfd = fd;
 
-    if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
+    ret = vbasedev->io_ops->set_irqs(vbasedev, irq_set);
+
+    if (!ret) {
         return true;
     }
 
@@ -188,22 +191,19 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
 int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
                              struct vfio_irq_info *info)
 {
-    int ret;
-
     memset(info, 0, sizeof(*info));
 
     info->argsz = sizeof(*info);
     info->index = index;
 
-    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
-
-    return ret < 0 ? -errno : ret;
+    return vbasedev->io_ops->get_irq_info(vbasedev, info);
 }
 
 int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
                                 struct vfio_region_info **info)
 {
     size_t argsz = sizeof(struct vfio_region_info);
+    int ret;
 
     /* create region info cache */
     if (vbasedev->reginfo == NULL) {
@@ -222,10 +222,11 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
 retry:
     (*info)->argsz = argsz;
 
-    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
+    ret = vbasedev->io_ops->get_region_info(vbasedev, *info);
+    if (ret != 0) {
         g_free(*info);
         *info = NULL;
-        return -errno;
+        return ret;
     }
 
     if ((*info)->argsz > argsz) {
@@ -332,10 +333,12 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
 }
 
 void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
-                      DeviceState *dev, bool ram_discard)
+                      VFIODeviceIOOps *io_ops, DeviceState *dev,
+                      bool ram_discard)
 {
     vbasedev->type = type;
     vbasedev->ops = ops;
+    vbasedev->io_ops = io_ops;
     vbasedev->dev = dev;
     vbasedev->fd = -1;
 
@@ -463,3 +466,78 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
 
     vfio_device_get_all_region_info(vbasedev);
 }
+
+/*
+ * Traditional ioctl() based io
+ */
+
+static int vfio_device_io_device_feature(VFIODevice *vbasedev,
+                                         struct vfio_device_feature *feature)
+{
+    int ret;
+
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
+
+    return ret < 0 ? -errno : ret;
+}
+
+static int vfio_device_io_get_region_info(VFIODevice *vbasedev,
+                                          struct vfio_region_info *info)
+{
+    int ret;
+
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
+
+    return ret < 0 ? -errno : ret;
+}
+
+static int vfio_device_io_get_irq_info(VFIODevice *vbasedev,
+                                       struct vfio_irq_info *info)
+{
+    int ret;
+
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
+
+    return ret < 0 ? -errno : ret;
+}
+
+static int vfio_device_io_set_irqs(VFIODevice *vbasedev,
+                                   struct vfio_irq_set *irqs)
+{
+    int ret;
+
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs);
+
+    return ret < 0 ? -errno : ret;
+}
+
+static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index,
+                                      off_t off, uint32_t size, void *data)
+{
+    struct vfio_region_info *info = vbasedev->reginfo[index];
+    int ret;
+
+    ret = pread(vbasedev->fd, data, size, info->offset + off);
+
+    return ret < 0 ? -errno : ret;
+}
+
+static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index,
+                                       off_t off, uint32_t size, void *data)
+{
+    struct vfio_region_info *info = vbasedev->reginfo[index];
+    int ret;
+
+    ret = pwrite(vbasedev->fd, data, size, info->offset + off);
+
+    return ret < 0 ? -errno : ret;
+}
+
+VFIODeviceIOOps vfio_device_io_ops_ioctl = {
+    .device_feature = vfio_device_io_device_feature,
+    .get_region_info = vfio_device_io_get_region_info,
+    .get_irq_info = vfio_device_io_get_irq_info,
+    .set_irqs = vfio_device_io_set_irqs,
+    .region_read = vfio_device_io_region_read,
+    .region_write = vfio_device_io_region_write,
+};
diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
index bcf2b98e79..7ea9e0dfb7 100644
--- a/hw/vfio/listener.c
+++ b/hw/vfio/listener.c
@@ -821,13 +821,17 @@ static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
                      VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
 
     QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
+        int ret;
+
         if (!vbasedev->dirty_tracking) {
             continue;
         }
 
-        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+        ret = vbasedev->io_ops->device_feature(vbasedev, feature);
+
+        if (ret != 0) {
             warn_report("%s: Failed to stop DMA logging, err %d (%s)",
-                        vbasedev->name, -errno, strerror(errno));
+                        vbasedev->name, -ret, strerror(-ret));
         }
         vbasedev->dirty_tracking = false;
     }
@@ -928,10 +932,9 @@ static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer,
             continue;
         }
 
-        ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
+        ret = vbasedev->io_ops->device_feature(vbasedev, feature);
         if (ret) {
-            ret = -errno;
-            error_setg_errno(errp, errno, "%s: Failed to start DMA logging",
+            error_setg_errno(errp, -ret, "%s: Failed to start DMA logging",
                              vbasedev->name);
             goto out;
         }
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index b40d5abdfd..ff2b15ff02 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -381,7 +381,7 @@ static void vfio_msi_interrupt(void *opaque)
 static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
 {
     g_autofree struct vfio_irq_set *irq_set = NULL;
-    int ret = 0, argsz;
+    int argsz;
     int32_t *fd;
 
     argsz = sizeof(*irq_set) + sizeof(*fd);
@@ -396,9 +396,7 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
     fd = (int32_t *)&irq_set->data;
     *fd = -1;
 
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-    return ret < 0 ? -errno : ret;
+    return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
 }
 
 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
@@ -455,11 +453,11 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
         fds[i] = fd;
     }
 
-    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+    ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
 
     g_free(irq_set);
 
-    return ret < 0 ? -errno : ret;
+    return ret;
 }
 
 static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
@@ -917,18 +915,22 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
     memset(vdev->rom, 0xff, size);
 
     while (size) {
-        bytes = pread(vbasedev->fd, vdev->rom + off,
-                      size, vdev->rom_offset + off);
+        bytes = vbasedev->io_ops->region_read(vbasedev,
+                                              VFIO_PCI_ROM_REGION_INDEX,
+                                              off, size, vdev->rom + off);
+
         if (bytes == 0) {
             break;
         } else if (bytes > 0) {
             off += bytes;
             size -= bytes;
         } else {
-            if (errno == EINTR || errno == EAGAIN) {
+            if (bytes == -EINTR || bytes == -EAGAIN) {
                 continue;
             }
-            error_report("vfio: Error reading device ROM: %m");
+            error_report("vfio: Error reading device ROM: %s",
+                         strerror(-bytes));
+
             break;
         }
     }
@@ -968,22 +970,18 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
 static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
                                       uint32_t size, void *data)
 {
-    ssize_t ret;
-
-    ret = pread(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
-
-    return ret < 0 ? -errno : (int)ret;
+    return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev,
+                                              VFIO_PCI_CONFIG_REGION_INDEX,
+                                              offset, size, data);
 }
 
 /* "Raw" write of underlying config space. */
 static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset,
                                        uint32_t size, void *data)
 {
-    ssize_t ret;
-
-    ret = pwrite(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
-
-    return ret < 0 ? -errno : (int)ret;
+    return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev,
+                                               VFIO_PCI_CONFIG_REGION_INDEX,
+                                               offset, size, data);
 }
 
 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
@@ -3405,7 +3403,7 @@ static void vfio_instance_init(Object *obj)
     vdev->host.function = ~0U;
 
     vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
-                     DEVICE(vdev), false);
+                     &vfio_device_io_ops_ioctl, DEVICE(vdev), false);
 
     vdev->nv_gpudirect_clique = 0xFF;
 
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index fd176c18a4..28eedfa571 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -650,7 +650,7 @@ static void vfio_platform_instance_init(Object *obj)
     VFIODevice *vbasedev = &vdev->vbasedev;
 
     vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops,
-                     DEVICE(vdev), false);
+                     &vfio_device_io_ops_ioctl, DEVICE(vdev), false);
 }
 
 #ifdef CONFIG_IOMMUFD
diff --git a/hw/vfio/region.c b/hw/vfio/region.c
index ef2630cac3..35fb81c04a 100644
--- a/hw/vfio/region.c
+++ b/hw/vfio/region.c
@@ -45,6 +45,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
         uint32_t dword;
         uint64_t qword;
     } buf;
+    int ret;
 
     switch (size) {
     case 1:
@@ -64,11 +65,13 @@ void vfio_region_write(void *opaque, hwaddr addr,
         break;
     }
 
-    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
+    ret = vbasedev->io_ops->region_write(vbasedev, region->nr,
+                                         addr, size, &buf);
+    if (ret != size) {
         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
-                     ",%d) failed: %m",
+                     ",%d) failed: %s",
                      __func__, vbasedev->name, region->nr,
-                     addr, data, size);
+                     addr, data, size, ret < 0 ? strerror(ret) : "short write");
     }
 
     trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
@@ -96,11 +99,13 @@ uint64_t vfio_region_read(void *opaque,
         uint64_t qword;
     } buf;
     uint64_t data = 0;
+    int ret;
 
-    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
-        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
+    ret = vbasedev->io_ops->region_read(vbasedev, region->nr, addr, size, &buf);
+    if (ret != size) {
+        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s",
                      __func__, vbasedev->name, region->nr,
-                     addr, size);
+                     addr, size, ret < 0 ? strerror(ret) : "short read");
         return (uint64_t)-1;
     }
     switch (size) {
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
index 967b07cd89..cb2f581826 100644
--- a/include/hw/vfio/vfio-device.h
+++ b/include/hw/vfio/vfio-device.h
@@ -41,6 +41,7 @@ enum {
 };
 
 typedef struct VFIODeviceOps VFIODeviceOps;
+typedef struct VFIODeviceIOOps VFIODeviceIOOps;
 typedef struct VFIOMigration VFIOMigration;
 
 typedef struct IOMMUFDBackend IOMMUFDBackend;
@@ -66,6 +67,7 @@ typedef struct VFIODevice {
     OnOffAuto migration_multifd_transfer;
     bool migration_events;
     VFIODeviceOps *ops;
+    VFIODeviceIOOps *io_ops;
     unsigned int num_irqs;
     unsigned int num_regions;
     unsigned int flags;
@@ -140,6 +142,25 @@ typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
 extern VFIODeviceList vfio_device_list;
 
 #ifdef CONFIG_LINUX
+/*
+ * How devices communicate with the server.  The default option is through
+ * ioctl() to the kernel VFIO driver, but vfio-user can use a socket to a remote
+ * process.
+ */
+struct VFIODeviceIOOps {
+    int (*device_feature)(VFIODevice *vdev, struct vfio_device_feature *);
+    int (*get_region_info)(VFIODevice *vdev,
+                           struct vfio_region_info *info);
+    int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq);
+    int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs);
+    int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
+                       void *data);
+    int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
+                        void *data);
+};
+
+extern VFIODeviceIOOps vfio_device_io_ops_ioctl;
+
 int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
                                 struct vfio_region_info **info);
 int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
@@ -154,6 +175,7 @@ int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
 bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
 void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
 void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
-                      DeviceState *dev, bool ram_discard);
+                      VFIODeviceIOOps *io_ops, DeviceState *dev,
+                      bool ram_discard);
 int vfio_device_get_aw_bits(VFIODevice *vdev);
 #endif /* HW_VFIO_VFIO_COMMON_H */
-- 
2.34.1
Re: [PATCH 13/14] vfio: add device IO ops vector
Posted by Cédric Le Goater 6 months, 3 weeks ago
On 4/9/25 15:48, John Levon wrote:
> For vfio-user, device operations such as IRQ handling and region
> read/writes are implemented in userspace over the control socket, not
> ioctl() or read()/write() to the vfio kernel driver; add an ops vector
> to generalize this, and implement vfio_device_io_ops_ioctl for
> interacting with the kernel vfio driver.
> 
> Originally-by: John Johnson <john.g.johnson@oracle.com>
> Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
> Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
> Signed-off-by: John Levon <john.levon@nutanix.com>
> ---
>   hw/vfio/ap.c                  |   2 +-
>   hw/vfio/ccw.c                 |   2 +-
>   hw/vfio/container-base.c      |   6 +-
>   hw/vfio/device.c              | 102 ++++++++++++++++++++++++++++++----
>   hw/vfio/listener.c            |  13 +++--
>   hw/vfio/pci.c                 |  40 +++++++------
>   hw/vfio/platform.c            |   2 +-
>   hw/vfio/region.c              |  17 ++++--
>   include/hw/vfio/vfio-device.h |  24 +++++++-
>   9 files changed, 155 insertions(+), 53 deletions(-)
> 
> diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
> index f311bca5b6..b6233b2107 100644
> --- a/hw/vfio/ap.c
> +++ b/hw/vfio/ap.c
> @@ -229,7 +229,7 @@ static void vfio_ap_instance_init(Object *obj)
>        * handle ram_block_discard_disable().
>        */
>       vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops,
> -                     DEVICE(vapdev), true);
> +                     &vfio_device_io_ops_ioctl, DEVICE(vapdev), true);


We only have one io_ops implementation currently. Please drop all
the vfio_device_init() changes and keep vfio_device_io_ops_ioctl
static.

>   
>       /* AP device is mdev type device */
>       vbasedev->mdev = true;
> diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
> index 14dee7cd19..aee52b5a8d 100644
> --- a/hw/vfio/ccw.c
> +++ b/hw/vfio/ccw.c
> @@ -676,7 +676,7 @@ static void vfio_ccw_instance_init(Object *obj)
>        * ram_block_discard_disable().
>        */
>       vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops,
> -                     DEVICE(vcdev), true);
> +                     &vfio_device_io_ops_ioctl, DEVICE(vcdev), true);
>   }
>   
>   #ifdef CONFIG_IOMMUFD
> diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
> index 613fe1a00d..16fe5f79d2 100644
> --- a/hw/vfio/container-base.c
> +++ b/hw/vfio/container-base.c
> @@ -198,11 +198,7 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
>       feature->flags = VFIO_DEVICE_FEATURE_GET |
>                        VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
>   
> -    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
> -        return -errno;
> -    }
> -
> -    return 0;
> +    return vbasedev->io_ops->device_feature(vbasedev, feature);
>   }
>   
>   static int vfio_container_iommu_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
> diff --git a/hw/vfio/device.c b/hw/vfio/device.c
> index 102fa5a9b4..545d9f1faf 100644
> --- a/hw/vfio/device.c
> +++ b/hw/vfio/device.c
> @@ -82,7 +82,7 @@ void vfio_device_irq_disable(VFIODevice *vbasedev, int index)
>           .count = 0,
>       };
>   
> -    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> +    vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
>   }
>   
>   void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
> @@ -95,7 +95,7 @@ void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
>           .count = 1,
>       };
>   
> -    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> +    vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
>   }
>   
>   void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
> @@ -108,7 +108,7 @@ void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
>           .count = 1,
>       };
>   
> -    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
> +    vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
>   }
>   
>   static inline const char *action_to_str(int action)
> @@ -155,6 +155,7 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
>       int argsz;
>       const char *name;
>       int32_t *pfd;
> +    int ret;
>   
>       argsz = sizeof(*irq_set) + sizeof(*pfd);
>   
> @@ -167,7 +168,9 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
>       pfd = (int32_t *)&irq_set->data;
>       *pfd = fd;
>   
> -    if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
> +    ret = vbasedev->io_ops->set_irqs(vbasedev, irq_set);
> +
> +    if (!ret) {
>           return true;
>       }
>   
> @@ -188,22 +191,19 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex
>   int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
>                                struct vfio_irq_info *info)
>   {
> -    int ret;
> -
>       memset(info, 0, sizeof(*info));
>   
>       info->argsz = sizeof(*info);
>       info->index = index;
>   
> -    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
> -
> -    return ret < 0 ? -errno : ret;
> +    return vbasedev->io_ops->get_irq_info(vbasedev, info);
>   }
>   
>   int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
>                                   struct vfio_region_info **info)
>   {
>       size_t argsz = sizeof(struct vfio_region_info);
> +    int ret;
>   
>       /* create region info cache */
>       if (vbasedev->reginfo == NULL) {
> @@ -222,10 +222,11 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
>   retry:
>       (*info)->argsz = argsz;
>   
> -    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
> +    ret = vbasedev->io_ops->get_region_info(vbasedev, *info);
> +    if (ret != 0) {
>           g_free(*info);
>           *info = NULL;
> -        return -errno;
> +        return ret;
>       }
>   
>       if ((*info)->argsz > argsz) {
> @@ -332,10 +333,12 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
>   }
>   
>   void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
> -                      DeviceState *dev, bool ram_discard)
> +                      VFIODeviceIOOps *io_ops, DeviceState *dev,
> +                      bool ram_discard)
>   {
>       vbasedev->type = type;
>       vbasedev->ops = ops;
> +    vbasedev->io_ops = io_ops;
>       vbasedev->dev = dev;
>       vbasedev->fd = -1;
>   
> @@ -463,3 +466,78 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
>   
>       vfio_device_get_all_region_info(vbasedev);
>   }
> +
> +/*
> + * Traditional ioctl() based io
> + */
> +
> +static int vfio_device_io_device_feature(VFIODevice *vbasedev,
> +                                         struct vfio_device_feature *feature)
> +{
> +    int ret;
> +
> +    ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
> +
> +    return ret < 0 ? -errno : ret;
> +}
> +
> +static int vfio_device_io_get_region_info(VFIODevice *vbasedev,
> +                                          struct vfio_region_info *info)
> +{
> +    int ret;
> +
> +    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
> +
> +    return ret < 0 ? -errno : ret;
> +}
> +
> +static int vfio_device_io_get_irq_info(VFIODevice *vbasedev,
> +                                       struct vfio_irq_info *info)
> +{
> +    int ret;
> +
> +    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
> +
> +    return ret < 0 ? -errno : ret;
> +}
> +
> +static int vfio_device_io_set_irqs(VFIODevice *vbasedev,
> +                                   struct vfio_irq_set *irqs)
> +{
> +    int ret;
> +
> +    ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs);
> +
> +    return ret < 0 ? -errno : ret;
> +}
> +
> +static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index,
> +                                      off_t off, uint32_t size, void *data)
> +{
> +    struct vfio_region_info *info = vbasedev->reginfo[index];
> +    int ret;
> +
> +    ret = pread(vbasedev->fd, data, size, info->offset + off);
> +
> +    return ret < 0 ? -errno : ret;
> +}
> +
> +static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index,
> +                                       off_t off, uint32_t size, void *data)
> +{
> +    struct vfio_region_info *info = vbasedev->reginfo[index];
> +    int ret;
> +
> +    ret = pwrite(vbasedev->fd, data, size, info->offset + off);
> +
> +    return ret < 0 ? -errno : ret;
> +}
> +
> +VFIODeviceIOOps vfio_device_io_ops_ioctl = {
> +    .device_feature = vfio_device_io_device_feature,
> +    .get_region_info = vfio_device_io_get_region_info,
> +    .get_irq_info = vfio_device_io_get_irq_info,
> +    .set_irqs = vfio_device_io_set_irqs,
> +    .region_read = vfio_device_io_region_read,
> +    .region_write = vfio_device_io_region_write,
> +};
> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
> index bcf2b98e79..7ea9e0dfb7 100644
> --- a/hw/vfio/listener.c
> +++ b/hw/vfio/listener.c
> @@ -821,13 +821,17 @@ static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
>                        VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
>   
>       QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
> +        int ret;
> +
>           if (!vbasedev->dirty_tracking) {
>               continue;
>           }
>   
> -        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
> +        ret = vbasedev->io_ops->device_feature(vbasedev, feature);
> +
> +        if (ret != 0) {
>               warn_report("%s: Failed to stop DMA logging, err %d (%s)",
> -                        vbasedev->name, -errno, strerror(errno));
> +                        vbasedev->name, -ret, strerror(-ret));
>           }
>           vbasedev->dirty_tracking = false;
>       }
> @@ -928,10 +932,9 @@ static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer,
>               continue;
>           }
>   
> -        ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
> +        ret = vbasedev->io_ops->device_feature(vbasedev, feature);
>           if (ret) {
> -            ret = -errno;
> -            error_setg_errno(errp, errno, "%s: Failed to start DMA logging",
> +            error_setg_errno(errp, -ret, "%s: Failed to start DMA logging",
>                                vbasedev->name);
>               goto out;
>           }
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index b40d5abdfd..ff2b15ff02 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -381,7 +381,7 @@ static void vfio_msi_interrupt(void *opaque)
>   static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
>   {
>       g_autofree struct vfio_irq_set *irq_set = NULL;
> -    int ret = 0, argsz;
> +    int argsz;
>       int32_t *fd;
>   
>       argsz = sizeof(*irq_set) + sizeof(*fd);
> @@ -396,9 +396,7 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
>       fd = (int32_t *)&irq_set->data;
>       *fd = -1;
>   
> -    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> -
> -    return ret < 0 ? -errno : ret;
> +    return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
>   }
>   
>   static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
> @@ -455,11 +453,11 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
>           fds[i] = fd;
>       }
>   
> -    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
> +    ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set);
>   
>       g_free(irq_set);
>   
> -    return ret < 0 ? -errno : ret;
> +    return ret;
>   }
>   
>   static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
> @@ -917,18 +915,22 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
>       memset(vdev->rom, 0xff, size);
>   
>       while (size) {
> -        bytes = pread(vbasedev->fd, vdev->rom + off,
> -                      size, vdev->rom_offset + off);
> +        bytes = vbasedev->io_ops->region_read(vbasedev,
> +                                              VFIO_PCI_ROM_REGION_INDEX,
> +                                              off, size, vdev->rom + off);
> +
>           if (bytes == 0) {
>               break;
>           } else if (bytes > 0) {
>               off += bytes;
>               size -= bytes;
>           } else {
> -            if (errno == EINTR || errno == EAGAIN) {
> +            if (bytes == -EINTR || bytes == -EAGAIN) {
>                   continue;
>               }
> -            error_report("vfio: Error reading device ROM: %m");
> +            error_report("vfio: Error reading device ROM: %s",
> +                         strerror(-bytes));
> +
>               break;
>           }
>       }
> @@ -968,22 +970,18 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
>   static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset,
>                                         uint32_t size, void *data)
>   {
> -    ssize_t ret;
> -
> -    ret = pread(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
> -
> -    return ret < 0 ? -errno : (int)ret;
> +    return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev,
> +                                              VFIO_PCI_CONFIG_REGION_INDEX,
> +                                              offset, size, data);
>   }
>   
>   /* "Raw" write of underlying config space. */
>   static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset,
>                                          uint32_t size, void *data)
>   {
> -    ssize_t ret;
> -
> -    ret = pwrite(vdev->vbasedev.fd, data, size, vdev->config_offset + offset);
> -
> -    return ret < 0 ? -errno : (int)ret;
> +    return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev,
> +                                               VFIO_PCI_CONFIG_REGION_INDEX,
> +                                               offset, size, data);
>   }
>   
>   static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
> @@ -3405,7 +3403,7 @@ static void vfio_instance_init(Object *obj)
>       vdev->host.function = ~0U;
>   
>       vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
> -                     DEVICE(vdev), false);
> +                     &vfio_device_io_ops_ioctl, DEVICE(vdev), false);
>   
>       vdev->nv_gpudirect_clique = 0xFF;
>   
> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
> index fd176c18a4..28eedfa571 100644
> --- a/hw/vfio/platform.c
> +++ b/hw/vfio/platform.c
> @@ -650,7 +650,7 @@ static void vfio_platform_instance_init(Object *obj)
>       VFIODevice *vbasedev = &vdev->vbasedev;
>   
>       vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops,
> -                     DEVICE(vdev), false);
> +                     &vfio_device_io_ops_ioctl, DEVICE(vdev), false);
>   }
>   
>   #ifdef CONFIG_IOMMUFD
> diff --git a/hw/vfio/region.c b/hw/vfio/region.c
> index ef2630cac3..35fb81c04a 100644
> --- a/hw/vfio/region.c
> +++ b/hw/vfio/region.c
> @@ -45,6 +45,7 @@ void vfio_region_write(void *opaque, hwaddr addr,
>           uint32_t dword;
>           uint64_t qword;
>       } buf;
> +    int ret;
>   
>       switch (size) {
>       case 1:
> @@ -64,11 +65,13 @@ void vfio_region_write(void *opaque, hwaddr addr,
>           break;
>       }
>   
> -    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
> +    ret = vbasedev->io_ops->region_write(vbasedev, region->nr,
> +                                         addr, size, &buf);
> +    if (ret != size) {
>           error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
> -                     ",%d) failed: %m",
> +                     ",%d) failed: %s",
>                        __func__, vbasedev->name, region->nr,
> -                     addr, data, size);
> +                     addr, data, size, ret < 0 ? strerror(ret) : "short write");
>       }
>   
>       trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
> @@ -96,11 +99,13 @@ uint64_t vfio_region_read(void *opaque,
>           uint64_t qword;
>       } buf;
>       uint64_t data = 0;
> +    int ret;
>   
> -    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
> -        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
> +    ret = vbasedev->io_ops->region_read(vbasedev, region->nr, addr, size, &buf);
> +    if (ret != size) {
> +        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s",
>                        __func__, vbasedev->name, region->nr,
> -                     addr, size);
> +                     addr, size, ret < 0 ? strerror(ret) : "short read");
>           return (uint64_t)-1;
>       }
>       switch (size) {
> diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
> index 967b07cd89..cb2f581826 100644
> --- a/include/hw/vfio/vfio-device.h
> +++ b/include/hw/vfio/vfio-device.h
> @@ -41,6 +41,7 @@ enum {
>   };
>   
>   typedef struct VFIODeviceOps VFIODeviceOps;
> +typedef struct VFIODeviceIOOps VFIODeviceIOOps;
>   typedef struct VFIOMigration VFIOMigration;
>   
>   typedef struct IOMMUFDBackend IOMMUFDBackend;
> @@ -66,6 +67,7 @@ typedef struct VFIODevice {
>       OnOffAuto migration_multifd_transfer;
>       bool migration_events;
>       VFIODeviceOps *ops;
> +    VFIODeviceIOOps *io_ops;
>       unsigned int num_irqs;
>       unsigned int num_regions;
>       unsigned int flags;
> @@ -140,6 +142,25 @@ typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
>   extern VFIODeviceList vfio_device_list;
>   
>   #ifdef CONFIG_LINUX
> +/*
> + * How devices communicate with the server.  The default option is through
> + * ioctl() to the kernel VFIO driver, but vfio-user can use a socket to a remote
> + * process.
> + */
> +struct VFIODeviceIOOps {
> +    int (*device_feature)(VFIODevice *vdev, struct vfio_device_feature *);
> +    int (*get_region_info)(VFIODevice *vdev,
> +                           struct vfio_region_info *info);
> +    int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq);
> +    int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs);
> +    int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
> +                       void *data);
> +    int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size,
> +                        void *data);
> +};


Could you please add documentation for this new struct.


Thanks,

C.




> +extern VFIODeviceIOOps vfio_device_io_ops_ioctl;
> +
>   int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
>                                   struct vfio_region_info **info);
>   int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
> @@ -154,6 +175,7 @@ int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
>   bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
>   void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
>   void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
> -                      DeviceState *dev, bool ram_discard);
> +                      VFIODeviceIOOps *io_ops, DeviceState *dev,
> +                      bool ram_discard);
>   int vfio_device_get_aw_bits(VFIODevice *vdev);
>   #endif /* HW_VFIO_VFIO_COMMON_H */