[PATCH v3 4/5] vfio-user: Message-based DMA support
Posted by Mattias Nissler 1 year, 2 months ago
Wire up support for DMA for the case where the vfio-user client does not
provide mmap()-able file descriptors, but DMA requests must be performed
via the VFIO-user protocol. This installs an indirect memory region,
which already works for pci_dma_{read,write}, and pci_dma_map works
thanks to the existing DMA bounce buffering support.
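
For illustration (hypothetical device-model code, not part of this
patch), DMA initiated through the usual PCI helpers needs no changes:

    dma_addr_t dma_addr = 0x1000;  /* example guest address */
    uint32_t val;

    /* With an mmap()-able mapping this is a direct memory access;
     * without one, the memory core dispatches the access to the
     * indirect memory region installed below, which turns it into a
     * VFIO-user DMA read message on the socket. */
    if (pci_dma_read(pci_dev, dma_addr, &val, sizeof(val)) != MEMTX_OK) {
        /* the client rejected or failed the DMA request */
    }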

Note that while simple scenarios work with this patch, there's a known
race condition in libvfio-user that will mess up the communication
channel. See https://github.com/nutanix/libvfio-user/issues/279 for
details as well as a proposed fix.

Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
---
 hw/remote/trace-events    |  2 +
 hw/remote/vfio-user-obj.c | 84 +++++++++++++++++++++++++++++++++++----
 2 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/hw/remote/trace-events b/hw/remote/trace-events
index 0d1b7d56a5..358a68fb34 100644
--- a/hw/remote/trace-events
+++ b/hw/remote/trace-events
@@ -9,6 +9,8 @@ vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x -> 0x%x"
 vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x <- 0x%x"
 vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA 0x%"PRIx64", %zu bytes"
 vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
+vfu_dma_read(uint64_t gpa, size_t len) "vfu: DMA read 0x%"PRIx64", %zu bytes"
+vfu_dma_write(uint64_t gpa, size_t len) "vfu: DMA write 0x%"PRIx64", %zu bytes"
 vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
 vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
 vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index 8b10c32a3c..cee5e615a9 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -300,6 +300,63 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
     return count;
 }
 
+static MemTxResult vfu_dma_read(void *opaque, hwaddr addr, uint64_t *val,
+                                unsigned size, MemTxAttrs attrs)
+{
+    MemoryRegion *region = opaque;
+    VfuObject *o = VFU_OBJECT(region->owner);
+    uint8_t buf[sizeof(uint64_t)];
+
+    trace_vfu_dma_read(region->addr + addr, size);
+
+    dma_sg_t *sg = alloca(dma_sg_size());
+    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
+    if (vfu_addr_to_sgl(o->vfu_ctx, vfu_addr, size, sg, 1, PROT_READ) < 0 ||
+        vfu_sgl_read(o->vfu_ctx, sg, 1, buf) != 0) {
+        return MEMTX_ERROR;
+    }
+
+    *val = ldn_he_p(buf, size);
+
+    return MEMTX_OK;
+}
+
+static MemTxResult vfu_dma_write(void *opaque, hwaddr addr, uint64_t val,
+                                 unsigned size, MemTxAttrs attrs)
+{
+    MemoryRegion *region = opaque;
+    VfuObject *o = VFU_OBJECT(region->owner);
+    uint8_t buf[sizeof(uint64_t)];
+
+    trace_vfu_dma_write(region->addr + addr, size);
+
+    stn_he_p(buf, size, val);
+
+    dma_sg_t *sg = alloca(dma_sg_size());
+    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
+    if (vfu_addr_to_sgl(o->vfu_ctx, vfu_addr, size, sg, 1, PROT_WRITE) < 0 ||
+        vfu_sgl_write(o->vfu_ctx, sg, 1, buf) != 0)  {
+        return MEMTX_ERROR;
+    }
+
+    return MEMTX_OK;
+}
+
+static const MemoryRegionOps vfu_dma_ops = {
+    .read_with_attrs = vfu_dma_read,
+    .write_with_attrs = vfu_dma_write,
+    .endianness = DEVICE_HOST_ENDIAN,
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = true,
+    },
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    },
+};
+
 static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
 {
     VfuObject *o = vfu_get_private(vfu_ctx);
@@ -308,17 +365,30 @@ static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
     g_autofree char *name = NULL;
     struct iovec *iov = &info->iova;
 
-    if (!info->vaddr) {
-        return;
-    }
-
     name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
-                           (uint64_t)info->vaddr);
+                           (uint64_t)iov->iov_base);
 
     subregion = g_new0(MemoryRegion, 1);
 
-    memory_region_init_ram_ptr(subregion, NULL, name,
-                               iov->iov_len, info->vaddr);
+    if (info->vaddr) {
+        memory_region_init_ram_ptr(subregion, OBJECT(o), name,
+                                   iov->iov_len, info->vaddr);
+    } else {
+        /*
+         * Note that I/O regions' MemoryRegionOps handle accesses of at most 8
+         * bytes at a time, and larger accesses are broken down. However,
+         * many/most DMA accesses are larger than 8 bytes and VFIO-user can
+         * handle large DMA accesses just fine, thus this size restriction
+         * unnecessarily hurts performance, in particular given that each
+         * access causes a round trip on the VFIO-user socket.
+         *
+         * TODO: Investigate how to plumb larger accesses through memory
+         * regions, possibly by amending MemoryRegionOps or by creating a new
+         * memory region type.
+         */
+        memory_region_init_io(subregion, OBJECT(o), &vfu_dma_ops, subregion,
+                              name, iov->iov_len);
+    }
 
     dma_as = pci_device_iommu_address_space(o->pci_dev);
 
-- 
2.34.1
Re: [PATCH v3 4/5] vfio-user: Message-based DMA support
Posted by Stefan Hajnoczi 1 year, 2 months ago
On Thu, Sep 07, 2023 at 06:04:09AM -0700, Mattias Nissler wrote:
> Wire up support for DMA for the case where the vfio-user client does not
> provide mmap()-able file descriptors, but DMA requests must be performed
> via the VFIO-user protocol. This installs an indirect memory region,
> which already works for pci_dma_{read,write}, and pci_dma_map works
> thanks to the existing DMA bounce buffering support.
> 
> Note that while simple scenarios work with this patch, there's a known
> race condition in libvfio-user that will mess up the communication
> channel. See https://github.com/nutanix/libvfio-user/issues/279 for
> details as well as a proposed fix.
> 
> Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> ---
>  hw/remote/trace-events    |  2 +
>  hw/remote/vfio-user-obj.c | 84 +++++++++++++++++++++++++++++++++++----
>  2 files changed, 79 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/remote/trace-events b/hw/remote/trace-events
> index 0d1b7d56a5..358a68fb34 100644
> --- a/hw/remote/trace-events
> +++ b/hw/remote/trace-events
> @@ -9,6 +9,8 @@ vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x -> 0x%x"
>  vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x <- 0x%x"
>  vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA 0x%"PRIx64", %zu bytes"
>  vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
> +vfu_dma_read(uint64_t gpa, size_t len) "vfu: DMA read 0x%"PRIx64", %zu bytes"
> +vfu_dma_write(uint64_t gpa, size_t len) "vfu: DMA write 0x%"PRIx64", %zu bytes"
>  vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
>  vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
>  vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
> diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
> index 8b10c32a3c..cee5e615a9 100644
> --- a/hw/remote/vfio-user-obj.c
> +++ b/hw/remote/vfio-user-obj.c
> @@ -300,6 +300,63 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
>      return count;
>  }
>  
> +static MemTxResult vfu_dma_read(void *opaque, hwaddr addr, uint64_t *val,
> +                                unsigned size, MemTxAttrs attrs)
> +{
> +    MemoryRegion *region = opaque;
> +    VfuObject *o = VFU_OBJECT(region->owner);
> +    uint8_t buf[sizeof(uint64_t)];
> +
> +    trace_vfu_dma_read(region->addr + addr, size);
> +
> +    dma_sg_t *sg = alloca(dma_sg_size());

Variable-length arrays have recently been removed from QEMU and
alloca(3) is a similar case. An example is commit
b3c8246750b7077add335559341268f2956f6470 ("hw/nvme: Avoid dynamic stack
allocation").

libvfio-user returns a sane sizeof(struct dma_sg) value, so we don't
need to worry about bogus values; the risk is low here.

However, it's hard to scan for and forbid the dangerous alloca(3) calls
when exceptions are made for some alloca(3) uses.

I would avoid alloca(3) and instead use:

  g_autofree dma_sg_t *sg = g_malloc(dma_sg_size());

> +    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> +    if (vfu_addr_to_sgl(o->vfu_ctx, vfu_addr, size, sg, 1, PROT_READ) < 0 ||
> +        vfu_sgl_read(o->vfu_ctx, sg, 1, buf) != 0) {
> +        return MEMTX_ERROR;
> +    }
> +
> +    *val = ldn_he_p(buf, size);
> +
> +    return MEMTX_OK;
> +}
> +
> +static MemTxResult vfu_dma_write(void *opaque, hwaddr addr, uint64_t val,
> +                                 unsigned size, MemTxAttrs attrs)
> +{
> +    MemoryRegion *region = opaque;
> +    VfuObject *o = VFU_OBJECT(region->owner);
> +    uint8_t buf[sizeof(uint64_t)];
> +
> +    trace_vfu_dma_write(region->addr + addr, size);
> +
> +    stn_he_p(buf, size, val);
> +
> +    dma_sg_t *sg = alloca(dma_sg_size());

Same here.

> +    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> +    if (vfu_addr_to_sgl(o->vfu_ctx, vfu_addr, size, sg, 1, PROT_WRITE) < 0 ||
> +        vfu_sgl_write(o->vfu_ctx, sg, 1, buf) != 0)  {
> +        return MEMTX_ERROR;
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +static const MemoryRegionOps vfu_dma_ops = {
> +    .read_with_attrs = vfu_dma_read,
> +    .write_with_attrs = vfu_dma_write,
> +    .endianness = DEVICE_HOST_ENDIAN,
> +    .valid = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +        .unaligned = true,
> +    },
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    },
> +};
> +
>  static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>  {
>      VfuObject *o = vfu_get_private(vfu_ctx);
> @@ -308,17 +365,30 @@ static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
>      g_autofree char *name = NULL;
>      struct iovec *iov = &info->iova;
>  
> -    if (!info->vaddr) {
> -        return;
> -    }
> -
>      name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
> -                           (uint64_t)info->vaddr);
> +                           (uint64_t)iov->iov_base);
>  
>      subregion = g_new0(MemoryRegion, 1);
>  
> -    memory_region_init_ram_ptr(subregion, NULL, name,
> -                               iov->iov_len, info->vaddr);
> +    if (info->vaddr) {
> +        memory_region_init_ram_ptr(subregion, OBJECT(o), name,
> +                                   iov->iov_len, info->vaddr);
> +    } else {
> +        /*
> +         * Note that I/O regions' MemoryRegionOps handle accesses of at most 8
> +         * bytes at a time, and larger accesses are broken down. However,
> +         * many/most DMA accesses are larger than 8 bytes and VFIO-user can
> +         * handle large DMA accesses just fine, thus this size restriction
> +         * unnecessarily hurts performance, in particular given that each
> +         * access causes a round trip on the VFIO-user socket.
> +         *
> +         * TODO: Investigate how to plumb larger accesses through memory
> +         * regions, possibly by amending MemoryRegionOps or by creating a new
> +         * memory region type.
> +         */
> +        memory_region_init_io(subregion, OBJECT(o), &vfu_dma_ops, subregion,
> +                              name, iov->iov_len);
> +    }
>  
>      dma_as = pci_device_iommu_address_space(o->pci_dev);
>  
> -- 
> 2.34.1
> 
Re: [PATCH v3 4/5] vfio-user: Message-based DMA support
Posted by Mattias Nissler 1 year, 2 months ago
On Thu, Sep 14, 2023 at 9:04 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> On Thu, Sep 07, 2023 at 06:04:09AM -0700, Mattias Nissler wrote:
> > Wire up support for DMA for the case where the vfio-user client does not
> > provide mmap()-able file descriptors, but DMA requests must be performed
> > via the VFIO-user protocol. This installs an indirect memory region,
> > which already works for pci_dma_{read,write}, and pci_dma_map works
> > thanks to the existing DMA bounce buffering support.
> >
> > Note that while simple scenarios work with this patch, there's a known
> > race condition in libvfio-user that will mess up the communication
> > channel. See https://github.com/nutanix/libvfio-user/issues/279 for
> > details as well as a proposed fix.
> >
> > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> > ---
> >  hw/remote/trace-events    |  2 +
> >  hw/remote/vfio-user-obj.c | 84 +++++++++++++++++++++++++++++++++++----
> >  2 files changed, 79 insertions(+), 7 deletions(-)
> >
> > diff --git a/hw/remote/trace-events b/hw/remote/trace-events
> > index 0d1b7d56a5..358a68fb34 100644
> > --- a/hw/remote/trace-events
> > +++ b/hw/remote/trace-events
> > @@ -9,6 +9,8 @@ vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x -> 0x%x"
> >  vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x <- 0x%x"
> >  vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA 0x%"PRIx64", %zu bytes"
> >  vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
> > +vfu_dma_read(uint64_t gpa, size_t len) "vfu: DMA read 0x%"PRIx64", %zu bytes"
> > +vfu_dma_write(uint64_t gpa, size_t len) "vfu: DMA write 0x%"PRIx64", %zu bytes"
> >  vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
> >  vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
> >  vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
> > diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
> > index 8b10c32a3c..cee5e615a9 100644
> > --- a/hw/remote/vfio-user-obj.c
> > +++ b/hw/remote/vfio-user-obj.c
> > @@ -300,6 +300,63 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
> >      return count;
> >  }
> >
> > +static MemTxResult vfu_dma_read(void *opaque, hwaddr addr, uint64_t *val,
> > +                                unsigned size, MemTxAttrs attrs)
> > +{
> > +    MemoryRegion *region = opaque;
> > +    VfuObject *o = VFU_OBJECT(region->owner);
> > +    uint8_t buf[sizeof(uint64_t)];
> > +
> > +    trace_vfu_dma_read(region->addr + addr, size);
> > +
> > +    dma_sg_t *sg = alloca(dma_sg_size());
>
> Variable-length arrays have recently been removed from QEMU and
> alloca(3) is a similar case. An example is commit
> b3c8246750b7077add335559341268f2956f6470 ("hw/nvme: Avoid dynamic stack
> allocation").
>
> libvfio-user returns a sane sizeof(struct dma_sg) value, so we don't
> need to worry about bogus values; the risk is low here.
>
> However, it's hard to scan for and forbid the dangerous alloca(3) calls
> when exceptions are made for some alloca(3) uses.
>
> I would avoid alloca(3) and instead use:
>
>   g_autofree dma_sg_t *sg = g_malloc(dma_sg_size());

Ok, changing. I personally dislike alloca anyway; I was just following
libvfio-user's example code. Plus there's really no valid performance
argument here, given that the IPC we're doing will dominate everything.
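
Concretely, I'm planning to rework vfu_dma_read along these lines
(sketch, untested; vfu_dma_write gets the same treatment):

  static MemTxResult vfu_dma_read(void *opaque, hwaddr addr, uint64_t *val,
                                  unsigned size, MemTxAttrs attrs)
  {
      MemoryRegion *region = opaque;
      VfuObject *o = VFU_OBJECT(region->owner);
      uint8_t buf[sizeof(uint64_t)];
      /* Heap-allocate the scatter-gather element instead of using
       * alloca(); g_autofree releases it on every return path, and
       * g_malloc() aborts on failure, so no NULL check is needed. */
      g_autofree dma_sg_t *sg = g_malloc(dma_sg_size());

      trace_vfu_dma_read(region->addr + addr, size);

      vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
      if (vfu_addr_to_sgl(o->vfu_ctx, vfu_addr, size, sg, 1, PROT_READ) < 0 ||
          vfu_sgl_read(o->vfu_ctx, sg, 1, buf) != 0) {
          return MEMTX_ERROR;
      }

      *val = ldn_he_p(buf, size);

      return MEMTX_OK;
  }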

>
> > +    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> > +    if (vfu_addr_to_sgl(o->vfu_ctx, vfu_addr, size, sg, 1, PROT_READ) < 0 ||
> > +        vfu_sgl_read(o->vfu_ctx, sg, 1, buf) != 0) {
> > +        return MEMTX_ERROR;
> > +    }
> > +
> > +    *val = ldn_he_p(buf, size);
> > +
> > +    return MEMTX_OK;
> > +}
> > +
> > +static MemTxResult vfu_dma_write(void *opaque, hwaddr addr, uint64_t val,
> > +                                 unsigned size, MemTxAttrs attrs)
> > +{
> > +    MemoryRegion *region = opaque;
> > +    VfuObject *o = VFU_OBJECT(region->owner);
> > +    uint8_t buf[sizeof(uint64_t)];
> > +
> > +    trace_vfu_dma_write(region->addr + addr, size);
> > +
> > +    stn_he_p(buf, size, val);
> > +
> > +    dma_sg_t *sg = alloca(dma_sg_size());
>
> Same here.
>
> > +    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> > +    if (vfu_addr_to_sgl(o->vfu_ctx, vfu_addr, size, sg, 1, PROT_WRITE) < 0 ||
> > +        vfu_sgl_write(o->vfu_ctx, sg, 1, buf) != 0)  {
> > +        return MEMTX_ERROR;
> > +    }
> > +
> > +    return MEMTX_OK;
> > +}
> > +
> > +static const MemoryRegionOps vfu_dma_ops = {
> > +    .read_with_attrs = vfu_dma_read,
> > +    .write_with_attrs = vfu_dma_write,
> > +    .endianness = DEVICE_HOST_ENDIAN,
> > +    .valid = {
> > +        .min_access_size = 1,
> > +        .max_access_size = 8,
> > +        .unaligned = true,
> > +    },
> > +    .impl = {
> > +        .min_access_size = 1,
> > +        .max_access_size = 8,
> > +    },
> > +};
> > +
> >  static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
> >  {
> >      VfuObject *o = vfu_get_private(vfu_ctx);
> > @@ -308,17 +365,30 @@ static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
> >      g_autofree char *name = NULL;
> >      struct iovec *iov = &info->iova;
> >
> > -    if (!info->vaddr) {
> > -        return;
> > -    }
> > -
> >      name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
> > -                           (uint64_t)info->vaddr);
> > +                           (uint64_t)iov->iov_base);
> >
> >      subregion = g_new0(MemoryRegion, 1);
> >
> > -    memory_region_init_ram_ptr(subregion, NULL, name,
> > -                               iov->iov_len, info->vaddr);
> > +    if (info->vaddr) {
> > +        memory_region_init_ram_ptr(subregion, OBJECT(o), name,
> > +                                   iov->iov_len, info->vaddr);
> > +    } else {
> > +        /*
> > +         * Note that I/O regions' MemoryRegionOps handle accesses of at most 8
> > +         * bytes at a time, and larger accesses are broken down. However,
> > +         * many/most DMA accesses are larger than 8 bytes and VFIO-user can
> > +         * handle large DMA accesses just fine, thus this size restriction
> > +         * unnecessarily hurts performance, in particular given that each
> > +         * access causes a round trip on the VFIO-user socket.
> > +         *
> > +         * TODO: Investigate how to plumb larger accesses through memory
> > +         * regions, possibly by amending MemoryRegionOps or by creating a new
> > +         * memory region type.
> > +         */
> > +        memory_region_init_io(subregion, OBJECT(o), &vfu_dma_ops, subregion,
> > +                              name, iov->iov_len);
> > +    }
> >
> >      dma_as = pci_device_iommu_address_space(o->pci_dev);
> >
> > --
> > 2.34.1
> >