Wire up support for DMA for the case where the vfio-user client does not
provide mmap()-able file descriptors, so DMA requests must instead be
performed via the VFIO-user protocol. This installs an indirect memory
region, which already works for pci_dma_{read,write}, while pci_dma_map
works thanks to the existing DMA bounce buffering support.
Note that while simple scenarios work with this patch, there's a known
race condition in libvfio-user that will mess up the communication
channel. See https://github.com/nutanix/libvfio-user/issues/279 for
details as well as a proposed fix.
Reviewed-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
---
hw/remote/trace-events | 2 +
hw/remote/vfio-user-obj.c | 100 ++++++++++++++++++++++++++++++++------
2 files changed, 87 insertions(+), 15 deletions(-)
diff --git a/hw/remote/trace-events b/hw/remote/trace-events
index 0d1b7d56a5..358a68fb34 100644
--- a/hw/remote/trace-events
+++ b/hw/remote/trace-events
@@ -9,6 +9,8 @@ vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x -> 0x%x"
vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x <- 0x%x"
vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA 0x%"PRIx64", %zu bytes"
vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
+vfu_dma_read(uint64_t gpa, size_t len) "vfu: DMA read 0x%"PRIx64", %zu bytes"
+vfu_dma_write(uint64_t gpa, size_t len) "vfu: DMA write 0x%"PRIx64", %zu bytes"
vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index d9b879e056..a15e291c9a 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -300,6 +300,63 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
return count;
}
+static MemTxResult vfu_dma_read(void *opaque, hwaddr addr, uint64_t *val,
+ unsigned size, MemTxAttrs attrs)
+{
+ MemoryRegion *region = opaque;
+ vfu_ctx_t *vfu_ctx = VFU_OBJECT(region->owner)->vfu_ctx;
+ uint8_t buf[sizeof(uint64_t)];
+
+ trace_vfu_dma_read(region->addr + addr, size);
+
+ g_autofree dma_sg_t *sg = g_malloc0(dma_sg_size());
+ vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
+ if (vfu_addr_to_sgl(vfu_ctx, vfu_addr, size, sg, 1, PROT_READ) < 0 ||
+ vfu_sgl_read(vfu_ctx, sg, 1, buf) != 0) {
+ return MEMTX_ERROR;
+ }
+
+ *val = ldn_he_p(buf, size);
+
+ return MEMTX_OK;
+}
+
+static MemTxResult vfu_dma_write(void *opaque, hwaddr addr, uint64_t val,
+ unsigned size, MemTxAttrs attrs)
+{
+ MemoryRegion *region = opaque;
+ vfu_ctx_t *vfu_ctx = VFU_OBJECT(region->owner)->vfu_ctx;
+ uint8_t buf[sizeof(uint64_t)];
+
+ trace_vfu_dma_write(region->addr + addr, size);
+
+ stn_he_p(buf, size, val);
+
+ g_autofree dma_sg_t *sg = g_malloc0(dma_sg_size());
+ vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
+ if (vfu_addr_to_sgl(vfu_ctx, vfu_addr, size, sg, 1, PROT_WRITE) < 0 ||
+ vfu_sgl_write(vfu_ctx, sg, 1, buf) != 0) {
+ return MEMTX_ERROR;
+ }
+
+ return MEMTX_OK;
+}
+
+static const MemoryRegionOps vfu_dma_ops = {
+ .read_with_attrs = vfu_dma_read,
+ .write_with_attrs = vfu_dma_write,
+ .endianness = DEVICE_HOST_ENDIAN,
+ .valid = {
+ .min_access_size = 1,
+ .max_access_size = 8,
+ .unaligned = true,
+ },
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 8,
+ },
+};
+
static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
VfuObject *o = vfu_get_private(vfu_ctx);
@@ -308,17 +365,30 @@ static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
g_autofree char *name = NULL;
struct iovec *iov = &info->iova;
- if (!info->vaddr) {
- return;
- }
-
name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
- (uint64_t)info->vaddr);
+ (uint64_t)iov->iov_base);
subregion = g_new0(MemoryRegion, 1);
- memory_region_init_ram_ptr(subregion, NULL, name,
- iov->iov_len, info->vaddr);
+ if (info->vaddr) {
+ memory_region_init_ram_ptr(subregion, OBJECT(o), name,
+ iov->iov_len, info->vaddr);
+ } else {
+ /*
+ * Note that I/O regions' MemoryRegionOps handle accesses of at most 8
+ * bytes at a time, and larger accesses are broken down. However,
+ * many/most DMA accesses are larger than 8 bytes and VFIO-user can
+ * handle large DMA accesses just fine, thus this size restriction
+ * unnecessarily hurts performance, in particular given that each
+ * access causes a round trip on the VFIO-user socket.
+ *
+ * TODO: Investigate how to plumb larger accesses through memory
+ * regions, possibly by amending MemoryRegionOps or by creating a new
+ * memory region type.
+ */
+ memory_region_init_io(subregion, OBJECT(o), &vfu_dma_ops, subregion,
+ name, iov->iov_len);
+ }
dma_as = pci_device_iommu_address_space(o->pci_dev);
@@ -330,20 +400,22 @@ static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
VfuObject *o = vfu_get_private(vfu_ctx);
+ MemoryRegionSection mr_section;
AddressSpace *dma_as = NULL;
- MemoryRegion *mr = NULL;
- ram_addr_t offset;
- mr = memory_region_from_host(info->vaddr, &offset);
- if (!mr) {
+ dma_as = pci_device_iommu_address_space(o->pci_dev);
+
+ mr_section =
+ memory_region_find(dma_as->root, (hwaddr)info->iova.iov_base, 1);
+ if (!mr_section.mr) {
return;
}
- dma_as = pci_device_iommu_address_space(o->pci_dev);
-
- memory_region_del_subregion(dma_as->root, mr);
+ memory_region_del_subregion(dma_as->root, mr_section.mr);
+ /* Drop the reference that memory_region_find() took on the region. */
+ memory_region_unref(mr_section.mr);
- object_unparent((OBJECT(mr)));
+ object_unparent((OBJECT(mr_section.mr)));
trace_vfu_dma_unregister((uint64_t)info->iova.iov_base);
}
--
2.43.2