[PATCH v2 4/4] hw/vfio/region: Create dmabuf for PCI BAR per region

Shameer Kolothum posted 4 patches 3 weeks, 5 days ago
There is a newer version of this series
[PATCH v2 4/4] hw/vfio/region: Create dmabuf for PCI BAR per region
Posted by Shameer Kolothum 3 weeks, 5 days ago
From: Nicolin Chen <nicolinc@nvidia.com>

Linux now provides a VFIO dmabuf exporter to expose PCI BAR memory for P2P
use cases. Create a dmabuf for each mapped BAR region after the mmap is set
up, and store the returned fd in the region’s RAMBlock. This allows QEMU to
pass the fd to dma_map_file(), enabling iommufd to import the dmabuf and map
the BAR correctly in the host IOMMU page table.

If the kernel lacks support or dmabuf setup fails, QEMU skips the setup
and continues with normal mmap handling.

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
 hw/vfio/region.c     | 69 +++++++++++++++++++++++++++++++++++++++++++-
 hw/vfio/trace-events |  1 +
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/region.c b/hw/vfio/region.c
index ca75ab1be4..8476a54dda 100644
--- a/hw/vfio/region.c
+++ b/hw/vfio/region.c
@@ -29,6 +29,7 @@
 #include "qemu/error-report.h"
 #include "qemu/units.h"
 #include "monitor/monitor.h"
+#include "system/ramblock.h"
 #include "vfio-helpers.h"
 
 /*
@@ -238,13 +239,53 @@ static void vfio_subregion_unmap(VFIORegion *region, int index)
     region->mmaps[index].mmap = NULL;
 }
 
+static int vfio_region_create_dma_buf(VFIORegion *region)
+{
+    g_autofree struct vfio_device_feature *feature = NULL;
+    VFIODevice *vbasedev = region->vbasedev;
+    struct vfio_device_feature_dma_buf *dma_buf;
+    size_t total_size;
+    int i, ret;
+
+    g_assert(region->nr_mmaps);
+
+    total_size = sizeof(*feature) + sizeof(*dma_buf) +
+                 sizeof(struct vfio_region_dma_range) * region->nr_mmaps;
+    feature = g_malloc0(total_size);
+    *feature = (struct vfio_device_feature) {
+        .argsz = total_size,
+        .flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_BUF,
+    };
+
+    dma_buf = (void *)feature->data;
+    *dma_buf = (struct vfio_device_feature_dma_buf) {
+        .region_index = region->nr,
+        .open_flags = O_RDWR,
+        .nr_ranges = region->nr_mmaps,
+    };
+
+    for (i = 0; i < region->nr_mmaps; i++) {
+        dma_buf->dma_ranges[i].offset = region->mmaps[i].offset;
+        dma_buf->dma_ranges[i].length = region->mmaps[i].size;
+    }
+
+    ret = vfio_device_get_feature(vbasedev, feature);
+    for (i = 0; i < region->nr_mmaps; i++) {
+        trace_vfio_region_dmabuf(region->vbasedev->name, ret, region->nr,
+                                 memory_region_name(region->mem),
+                                 region->mmaps[i].offset,
+                                 region->mmaps[i].size);
+    }
+    return ret;
+}
+
 int vfio_region_mmap(VFIORegion *region)
 {
     int i, ret, prot = 0;
     char *name;
     int fd;
 
-    if (!region->mem) {
+    if (!region->mem || !region->nr_mmaps) {
         return 0;
     }
 
@@ -305,6 +346,32 @@ int vfio_region_mmap(VFIORegion *region)
                                region->mmaps[i].size - 1);
     }
 
+    ret = vfio_region_create_dma_buf(region);
+    if (ret < 0) {
+        if (ret == -ENOTTY) {
+            warn_report_once("VFIO dma-buf not supported in kernel: "
+                             "PCI BAR IOMMU mappings may fail");
+        } else {
+            error_report("%s: failed to create dma-buf (%s): "
+                         "PCI BAR IOMMU mappings may fail",
+                         memory_region_name(region->mem), strerror(errno));
+        }
+        /*
+         * Return success as VFIO region mmap succeeded and dmabuf is optional.
+         * P2P DMA or exposing device memory use cases will not be supported.
+         */
+        return 0;
+    }
+
+    /* Assign the dmabuf fd to associated RAMBlock */
+    for (i = 0; i < region->nr_mmaps; i++) {
+        MemoryRegion *mr = &region->mmaps[i].mem;
+        RAMBlock *ram_block = mr->ram_block;
+
+        ram_block->fd = ret;
+        ram_block->fd_offset = region->mmaps[i].offset;
+    }
+
     return 0;
 
 no_mmap:
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 1e895448cd..592a0349d4 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -117,6 +117,7 @@ vfio_device_put(int fd) "close vdev->fd=%d"
 vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
 vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
 vfio_region_setup(const char *dev, int index, const char *name, unsigned long flags, unsigned long offset, unsigned long size) "Device %s, region %d \"%s\", flags: 0x%lx, offset: 0x%lx, size: 0x%lx"
+vfio_region_dmabuf(const char *dev, int fd, int index,  const char *name, unsigned long offset, unsigned long size) "Device %s, dmabuf fd %d region %d \"%s\", offset: 0x%lx, size: 0x%lx"
 vfio_region_mmap_fault(const char *name, int index, unsigned long offset, unsigned long size, int fault) "Region %s mmaps[%d], [0x%lx - 0x%lx], fault: %d"
 vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Region %s [0x%lx - 0x%lx]"
 vfio_region_exit(const char *name, int index) "Device %s, region %d"
-- 
2.43.0


Re: [PATCH v2 4/4] hw/vfio/region: Create dmabuf for PCI BAR per region
Posted by Duan, Zhenzhong 3 weeks, 4 days ago
On 1/13/2026 7:37 PM, Shameer Kolothum wrote:
> From: Nicolin Chen <nicolinc@nvidia.com>
>
> Linux now provides a VFIO dmabuf exporter to expose PCI BAR memory for P2P
> use cases. Create a dmabuf for each mapped BAR region after the mmap is set
> up, and store the returned fd in the region’s RAMBlock. This allows QEMU to
> pass the fd to dma_map_file(), enabling iommufd to import the dmabuf and map
> the BAR correctly in the host IOMMU page table.
>
> If the kernel lacks support or dmabuf setup fails, QEMU skips the setup
> and continues with normal mmap handling.
>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
>   hw/vfio/region.c     | 69 +++++++++++++++++++++++++++++++++++++++++++-
>   hw/vfio/trace-events |  1 +
>   2 files changed, 69 insertions(+), 1 deletion(-)
>
> diff --git a/hw/vfio/region.c b/hw/vfio/region.c
> index ca75ab1be4..8476a54dda 100644
> --- a/hw/vfio/region.c
> +++ b/hw/vfio/region.c
> @@ -29,6 +29,7 @@
>   #include "qemu/error-report.h"
>   #include "qemu/units.h"
>   #include "monitor/monitor.h"
> +#include "system/ramblock.h"
>   #include "vfio-helpers.h"
>   
>   /*
> @@ -238,13 +239,53 @@ static void vfio_subregion_unmap(VFIORegion *region, int index)
>       region->mmaps[index].mmap = NULL;
>   }
>   
> +static int vfio_region_create_dma_buf(VFIORegion *region)
> +{
> +    g_autofree struct vfio_device_feature *feature = NULL;
> +    VFIODevice *vbasedev = region->vbasedev;
> +    struct vfio_device_feature_dma_buf *dma_buf;
> +    size_t total_size;
> +    int i, ret;
> +
> +    g_assert(region->nr_mmaps);

It's redundant as you have the check in vfio_region_mmap(), it's trivial.

> +
> +    total_size = sizeof(*feature) + sizeof(*dma_buf) +
> +                 sizeof(struct vfio_region_dma_range) * region->nr_mmaps;
> +    feature = g_malloc0(total_size);
> +    *feature = (struct vfio_device_feature) {
> +        .argsz = total_size,
> +        .flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_BUF,
> +    };
> +
> +    dma_buf = (void *)feature->data;
> +    *dma_buf = (struct vfio_device_feature_dma_buf) {
> +        .region_index = region->nr,
> +        .open_flags = O_RDWR,
> +        .nr_ranges = region->nr_mmaps,
> +    };
> +
> +    for (i = 0; i < region->nr_mmaps; i++) {
> +        dma_buf->dma_ranges[i].offset = region->mmaps[i].offset;
> +        dma_buf->dma_ranges[i].length = region->mmaps[i].size;
> +    }
> +
> +    ret = vfio_device_get_feature(vbasedev, feature);
> +    for (i = 0; i < region->nr_mmaps; i++) {
> +        trace_vfio_region_dmabuf(region->vbasedev->name, ret, region->nr,
> +                                 memory_region_name(region->mem),
> +                                 region->mmaps[i].offset,
> +                                 region->mmaps[i].size);
> +    }
> +    return ret;
> +}
> +
>   int vfio_region_mmap(VFIORegion *region)
>   {
>       int i, ret, prot = 0;
>       char *name;
>       int fd;
>   
> -    if (!region->mem) {
> +    if (!region->mem || !region->nr_mmaps) {
>           return 0;
>       }
>   
> @@ -305,6 +346,32 @@ int vfio_region_mmap(VFIORegion *region)
>                                  region->mmaps[i].size - 1);
>       }
>   
> +    ret = vfio_region_create_dma_buf(region);
> +    if (ret < 0) {
> +        if (ret == -ENOTTY) {
> +            warn_report_once("VFIO dma-buf not supported in kernel: "
> +                             "PCI BAR IOMMU mappings may fail");
> +        } else {
> +            error_report("%s: failed to create dma-buf (%s): "
> +                         "PCI BAR IOMMU mappings may fail",
> +                         memory_region_name(region->mem), strerror(errno));
> +        }
> +        /*
> +         * Return success as VFIO region mmap succeeded and dmabuf is optional.
> +         * P2P DMA or exposing device memory use cases will not be supported.
> +         */
> +        return 0;
> +    }
> +
> +    /* Assign the dmabuf fd to associated RAMBlock */
> +    for (i = 0; i < region->nr_mmaps; i++) {
> +        MemoryRegion *mr = &region->mmaps[i].mem;
> +        RAMBlock *ram_block = mr->ram_block;
> +
> +        ram_block->fd = ret;
> +        ram_block->fd_offset = region->mmaps[i].offset;
> +    }
> +

If you have next respin, maybe cleaner to move above dmabuf related code 
into

vfio_region_create_dma_buf().Anyway LGTM,

Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>

>       return 0;
>   
>   no_mmap:
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 1e895448cd..592a0349d4 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -117,6 +117,7 @@ vfio_device_put(int fd) "close vdev->fd=%d"
>   vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
>   vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
>   vfio_region_setup(const char *dev, int index, const char *name, unsigned long flags, unsigned long offset, unsigned long size) "Device %s, region %d \"%s\", flags: 0x%lx, offset: 0x%lx, size: 0x%lx"
> +vfio_region_dmabuf(const char *dev, int fd, int index,  const char *name, unsigned long offset, unsigned long size) "Device %s, dmabuf fd %d region %d \"%s\", offset: 0x%lx, size: 0x%lx"
>   vfio_region_mmap_fault(const char *name, int index, unsigned long offset, unsigned long size, int fault) "Region %s mmaps[%d], [0x%lx - 0x%lx], fault: %d"
>   vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Region %s [0x%lx - 0x%lx]"
>   vfio_region_exit(const char *name, int index) "Device %s, region %d"

Re: [PATCH v2 4/4] hw/vfio/region: Create dmabuf for PCI BAR per region
Posted by Cédric Le Goater 3 weeks, 3 days ago
On 1/14/26 10:16, Duan, Zhenzhong wrote:
> 
> On 1/13/2026 7:37 PM, Shameer Kolothum wrote:
>> From: Nicolin Chen <nicolinc@nvidia.com>
>>
>> Linux now provides a VFIO dmabuf exporter to expose PCI BAR memory for P2P
>> use cases. Create a dmabuf for each mapped BAR region after the mmap is set
>> up, and store the returned fd in the region’s RAMBlock. This allows QEMU to
>> pass the fd to dma_map_file(), enabling iommufd to import the dmabuf and map
>> the BAR correctly in the host IOMMU page table.
>>
>> If the kernel lacks support or dmabuf setup fails, QEMU skips the setup
>> and continues with normal mmap handling.
>>
>> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
>> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
>> ---
>>   hw/vfio/region.c     | 69 +++++++++++++++++++++++++++++++++++++++++++-
>>   hw/vfio/trace-events |  1 +
>>   2 files changed, 69 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/vfio/region.c b/hw/vfio/region.c
>> index ca75ab1be4..8476a54dda 100644
>> --- a/hw/vfio/region.c
>> +++ b/hw/vfio/region.c
>> @@ -29,6 +29,7 @@
>>   #include "qemu/error-report.h"
>>   #include "qemu/units.h"
>>   #include "monitor/monitor.h"
>> +#include "system/ramblock.h"
>>   #include "vfio-helpers.h"
>>   /*
>> @@ -238,13 +239,53 @@ static void vfio_subregion_unmap(VFIORegion *region, int index)
>>       region->mmaps[index].mmap = NULL;
>>   }
>> +static int vfio_region_create_dma_buf(VFIORegion *region)
>> +{
>> +    g_autofree struct vfio_device_feature *feature = NULL;
>> +    VFIODevice *vbasedev = region->vbasedev;
>> +    struct vfio_device_feature_dma_buf *dma_buf;
>> +    size_t total_size;
>> +    int i, ret;
>> +
>> +    g_assert(region->nr_mmaps);
> 
> It's redundant as you have the check in vfio_region_mmap(), it's trivial.
> 
>> +
>> +    total_size = sizeof(*feature) + sizeof(*dma_buf) +
>> +                 sizeof(struct vfio_region_dma_range) * region->nr_mmaps;
>> +    feature = g_malloc0(total_size);
>> +    *feature = (struct vfio_device_feature) {
>> +        .argsz = total_size,
>> +        .flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_BUF,
>> +    };
>> +
>> +    dma_buf = (void *)feature->data;
>> +    *dma_buf = (struct vfio_device_feature_dma_buf) {
>> +        .region_index = region->nr,
>> +        .open_flags = O_RDWR,
>> +        .nr_ranges = region->nr_mmaps,
>> +    };
>> +
>> +    for (i = 0; i < region->nr_mmaps; i++) {
>> +        dma_buf->dma_ranges[i].offset = region->mmaps[i].offset;
>> +        dma_buf->dma_ranges[i].length = region->mmaps[i].size;
>> +    }
>> +
>> +    ret = vfio_device_get_feature(vbasedev, feature);
>> +    for (i = 0; i < region->nr_mmaps; i++) {
>> +        trace_vfio_region_dmabuf(region->vbasedev->name, ret, region->nr,
>> +                                 memory_region_name(region->mem),
>> +                                 region->mmaps[i].offset,
>> +                                 region->mmaps[i].size);
>> +    }
>> +    return ret;
>> +}
>> +
>>   int vfio_region_mmap(VFIORegion *region)
>>   {
>>       int i, ret, prot = 0;
>>       char *name;
>>       int fd;
>> -    if (!region->mem) {
>> +    if (!region->mem || !region->nr_mmaps) {
>>           return 0;
>>       }
>> @@ -305,6 +346,32 @@ int vfio_region_mmap(VFIORegion *region)
>>                                  region->mmaps[i].size - 1);
>>       }
>> +    ret = vfio_region_create_dma_buf(region);
>> +    if (ret < 0) {
>> +        if (ret == -ENOTTY) {
>> +            warn_report_once("VFIO dma-buf not supported in kernel: "
>> +                             "PCI BAR IOMMU mappings may fail");
>> +        } else {
>> +            error_report("%s: failed to create dma-buf (%s): "
>> +                         "PCI BAR IOMMU mappings may fail",
>> +                         memory_region_name(region->mem), strerror(errno));
>> +        }
>> +        /*
>> +         * Return success as VFIO region mmap succeeded and dmabuf is optional.
>> +         * P2P DMA or exposing device memory use cases will not be supported.
>> +         */
>> +        return 0;
>> +    }
>> +
>> +    /* Assign the dmabuf fd to associated RAMBlock */
>> +    for (i = 0; i < region->nr_mmaps; i++) {
>> +        MemoryRegion *mr = &region->mmaps[i].mem;
>> +        RAMBlock *ram_block = mr->ram_block;
>> +
>> +        ram_block->fd = ret;
>> +        ram_block->fd_offset = region->mmaps[i].offset;
>> +    }
>> +
> 
> If you have next respin, maybe cleaner to move above dmabuf related code into
> 
> vfio_region_create_dma_buf().

I agree. Can you please respin a v3 Shameer ?

Thanks,

C.


> Anyway LGTM,
> 
> Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> 
>>       return 0;
>>   no_mmap:
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 1e895448cd..592a0349d4 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -117,6 +117,7 @@ vfio_device_put(int fd) "close vdev->fd=%d"
>>   vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
>>   vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
>>   vfio_region_setup(const char *dev, int index, const char *name, unsigned long flags, unsigned long offset, unsigned long size) "Device %s, region %d \"%s\", flags: 0x%lx, offset: 0x%lx, size: 0x%lx"
>> +vfio_region_dmabuf(const char *dev, int fd, int index,  const char *name, unsigned long offset, unsigned long size) "Device %s, dmabuf fd %d region %d \"%s\", offset: 0x%lx, size: 0x%lx"
>>   vfio_region_mmap_fault(const char *name, int index, unsigned long offset, unsigned long size, int fault) "Region %s mmaps[%d], [0x%lx - 0x%lx], fault: %d"
>>   vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Region %s [0x%lx - 0x%lx]"
>>   vfio_region_exit(const char *name, int index) "Device %s, region %d"
> 


RE: [PATCH v2 4/4] hw/vfio/region: Create dmabuf for PCI BAR per region
Posted by Shameer Kolothum 3 weeks, 3 days ago


> -----Original Message-----
> From: Cédric Le Goater <clg@redhat.com>
> Sent: 15 January 2026 12:51
> To: Duan, Zhenzhong <zhenzhong.duan@intel.com>; Shameer Kolothum
> <skolothumtho@nvidia.com>; qemu-arm@nongnu.org; qemu-
> devel@nongnu.org
> Cc: eric.auger@redhat.com; alex@shazbot.org; cohuck@redhat.com;
> mst@redhat.com; Nicolin Chen <nicolinc@nvidia.com>; Nathan Chen
> <nathanc@nvidia.com>; Matt Ochs <mochs@nvidia.com>; Jason Gunthorpe
> <jgg@nvidia.com>; Krishnakant Jaju <kjaju@nvidia.com>
> Subject: Re: [PATCH v2 4/4] hw/vfio/region: Create dmabuf for PCI BAR per
> region
> 

[...]

> > If you have next respin, maybe cleaner to move above dmabuf related code
> into
> >
> > vfio_region_create_dma_buf().
> 
> I agree. Can you please respin a v3 Shameer ?

Ok. Just to confirm, does below look good:

Thanks,
Shameer

...
diff --git a/hw/vfio/region.c b/hw/vfio/region.c
index ca75ab1be4..f927e64365 100644
--- a/hw/vfio/region.c
+++ b/hw/vfio/region.c
@@ -29,6 +29,7 @@
 #include "qemu/error-report.h"
 #include "qemu/units.h"
 #include "monitor/monitor.h"
+#include "system/ramblock.h"
 #include "vfio-helpers.h"

 /*
@@ -238,13 +239,69 @@ static void vfio_subregion_unmap(VFIORegion *region, int index)
     region->mmaps[index].mmap = NULL;
 }

+static void vfio_region_create_dma_buf(VFIORegion *region)
+{
+    g_autofree struct vfio_device_feature *feature = NULL;
+    VFIODevice *vbasedev = region->vbasedev;
+    struct vfio_device_feature_dma_buf *dma_buf;
+    size_t total_size;
+    int i, ret;
+
+    total_size = sizeof(*feature) + sizeof(*dma_buf) +
+                 sizeof(struct vfio_region_dma_range) * region->nr_mmaps;
+    feature = g_malloc0(total_size);
+    *feature = (struct vfio_device_feature) {
+        .argsz = total_size,
+        .flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_BUF,
+    };
+
+    dma_buf = (void *)feature->data;
+    *dma_buf = (struct vfio_device_feature_dma_buf) {
+        .region_index = region->nr,
+        .open_flags = O_RDWR,
+        .nr_ranges = region->nr_mmaps,
+    };
+
+    for (i = 0; i < region->nr_mmaps; i++) {
+        dma_buf->dma_ranges[i].offset = region->mmaps[i].offset;
+        dma_buf->dma_ranges[i].length = region->mmaps[i].size;
+    }
+
+    ret = vfio_device_get_feature(vbasedev, feature);
+    if (ret < 0) {
+        if (ret == -ENOTTY) {
+            warn_report_once("VFIO dma-buf not supported in kernel: "
+                             "PCI BAR IOMMU mappings may fail");
+        } else {
+            error_report("%s: failed to create dma-buf (%s): "
+                         "PCI BAR IOMMU mappings may fail",
+                         memory_region_name(region->mem), strerror(errno));
+        }
+        /* P2P DMA or exposing device memory use cases are not supported. */
+        return;
+    }
+
+    /* Assign the dmabuf fd to associated RAMBlock */
+    for (i = 0; i < region->nr_mmaps; i++) {
+        MemoryRegion *mr = &region->mmaps[i].mem;
+        RAMBlock *ram_block = mr->ram_block;
+
+        ram_block->fd = ret;
+        ram_block->fd_offset = region->mmaps[i].offset;
+        trace_vfio_region_dmabuf(region->vbasedev->name, ret, region->nr,
+                                 memory_region_name(region->mem),
+                                 region->mmaps[i].offset,
+                                 region->mmaps[i].size);
+    }
+}
+
 int vfio_region_mmap(VFIORegion *region)
 {
     int i, ret, prot = 0;
     char *name;
     int fd;

-    if (!region->mem) {
+    if (!region->mem || !region->nr_mmaps) {
         return 0;
     }

@@ -305,6 +362,8 @@ int vfio_region_mmap(VFIORegion *region)
                                region->mmaps[i].size - 1);
     }

+    vfio_region_create_dma_buf(region);
+
     return 0;

 no_mmap:
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 180e3d526b..466695507b 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -118,6 +118,7 @@ vfio_device_put(int fd) "close vdev->fd=%d"
 vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
 vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
 vfio_region_setup(const char *dev, int index, const char *name, unsigned long flags, unsigned long offset, unsigned long size) "Device %s, region %d \"%s\", flags: 0x%lx, offset: 0x%lx, size: 0x%lx"
+vfio_region_dmabuf(const char *dev, int fd, int index,  const char *name, unsigned long offset, unsigned long size) "Device %s, dmabuf fd %d region %d \"%s\", offset: 0x%lx, size: 0x%lx"
 vfio_region_mmap_fault(const char *name, int index, unsigned long offset, unsigned long size, int fault) "Region %s mmaps[%d], [0x%lx - 0x%lx], fault: %d"
 vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Region %s [0x%lx - 0x%lx]"
 vfio_region_exit(const char *name, int index) "Device %s, region %d"


Re: [PATCH v2 4/4] hw/vfio/region: Create dmabuf for PCI BAR per region
Posted by Cédric Le Goater 3 weeks, 3 days ago
On 1/15/26 15:32, Shameer Kolothum wrote:
> 
> 
>> -----Original Message-----
>> From: Cédric Le Goater <clg@redhat.com>
>> Sent: 15 January 2026 12:51
>> To: Duan, Zhenzhong <zhenzhong.duan@intel.com>; Shameer Kolothum
>> <skolothumtho@nvidia.com>; qemu-arm@nongnu.org; qemu-
>> devel@nongnu.org
>> Cc: eric.auger@redhat.com; alex@shazbot.org; cohuck@redhat.com;
>> mst@redhat.com; Nicolin Chen <nicolinc@nvidia.com>; Nathan Chen
>> <nathanc@nvidia.com>; Matt Ochs <mochs@nvidia.com>; Jason Gunthorpe
>> <jgg@nvidia.com>; Krishnakant Jaju <kjaju@nvidia.com>
>> Subject: Re: [PATCH v2 4/4] hw/vfio/region: Create dmabuf for PCI BAR per
>> region
>>
> 
> [...]
> 
>>> If you have next respin, maybe cleaner to move above dmabuf related code
>> into
>>>
>>> vfio_region_create_dma_buf().
>>
>> I agree. Can you please respin a v3 Shameer ?
> 
> Ok. Just to confirm, does below look good:

it does.

> 
> Thanks,
> Shameer
> 
> ...
> diff --git a/hw/vfio/region.c b/hw/vfio/region.c
> index ca75ab1be4..f927e64365 100644
> --- a/hw/vfio/region.c
> +++ b/hw/vfio/region.c
> @@ -29,6 +29,7 @@
>   #include "qemu/error-report.h"
>   #include "qemu/units.h"
>   #include "monitor/monitor.h"
> +#include "system/ramblock.h"
>   #include "vfio-helpers.h"
> 
>   /*
> @@ -238,13 +239,69 @@ static void vfio_subregion_unmap(VFIORegion *region, int index)
>       region->mmaps[index].mmap = NULL;
>   }
> 
> +static void vfio_region_create_dma_buf(VFIORegion *region)
> +{


While at changing things, could you please add an 'Error** errp'
parameter to vfio_region_create_dma_buf() and make it return a
bool.

> +    g_autofree struct vfio_device_feature *feature = NULL;
> +    VFIODevice *vbasedev = region->vbasedev;
> +    struct vfio_device_feature_dma_buf *dma_buf;
> +    size_t total_size;
> +    int i, ret;
> +
> +    total_size = sizeof(*feature) + sizeof(*dma_buf) +
> +                 sizeof(struct vfio_region_dma_range) * region->nr_mmaps;
> +    feature = g_malloc0(total_size);
> +    *feature = (struct vfio_device_feature) {
> +        .argsz = total_size,
> +        .flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_BUF,
> +    };
> +
> +    dma_buf = (void *)feature->data;
> +    *dma_buf = (struct vfio_device_feature_dma_buf) {
> +        .region_index = region->nr,
> +        .open_flags = O_RDWR,
> +        .nr_ranges = region->nr_mmaps,
> +    };
> +
> +    for (i = 0; i < region->nr_mmaps; i++) {
> +        dma_buf->dma_ranges[i].offset = region->mmaps[i].offset;
> +        dma_buf->dma_ranges[i].length = region->mmaps[i].size;
> +    }
> +
> +    ret = vfio_device_get_feature(vbasedev, feature);
> +    if (ret < 0) {
> +        if (ret == -ENOTTY) {
> +            warn_report_once("VFIO dma-buf not supported in kernel: "
> +                             "PCI BAR IOMMU mappings may fail");
> +        } else {
> +            error_report("%s: failed to create dma-buf (%s): "
> +                         "PCI BAR IOMMU mappings may fail",
> +                         memory_region_name(region->mem), strerror(errno));
> +        }
> +        /* P2P DMA or exposing device memory use cases are not supported. */
> +        return;
> +    }
> +
> +    /* Assign the dmabuf fd to associated RAMBlock */
> +    for (i = 0; i < region->nr_mmaps; i++) {
> +        MemoryRegion *mr = &region->mmaps[i].mem;
> +        RAMBlock *ram_block = mr->ram_block;
> +
> +        ram_block->fd = ret;
> +        ram_block->fd_offset = region->mmaps[i].offset;
> +        trace_vfio_region_dmabuf(region->vbasedev->name, ret, region->nr,
> +                                 memory_region_name(region->mem),
> +                                 region->mmaps[i].offset,
> +                                 region->mmaps[i].size);
> +    }
> +}
> +
>   int vfio_region_mmap(VFIORegion *region)
>   {
>       int i, ret, prot = 0;
>       char *name;
>       int fd;
> 
> -    if (!region->mem) {
> +    if (!region->mem || !region->nr_mmaps) {
>           return 0;
>       }
> 
> @@ -305,6 +362,8 @@ int vfio_region_mmap(VFIORegion *region)
>                                  region->mmaps[i].size - 1);
>       }
> 
> +    vfio_region_create_dma_buf(region);
> +

vfio_region_mmap() would then test the returned value and call
error_report_err() on the error.

In follow-ups patches, we could propagate this error to callers
of vfio_region_mmap(). Can come later.

Thanks,

C.




>       return 0;
> 
>   no_mmap:
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 180e3d526b..466695507b 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -118,6 +118,7 @@ vfio_device_put(int fd) "close vdev->fd=%d"
>   vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
>   vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
>   vfio_region_setup(const char *dev, int index, const char *name, unsigned long flags, unsigned long offset, unsigned long size) "Device %s, region %d \"%s\", flags: 0x%lx, offset: 0x%lx, size: 0x%lx"
> +vfio_region_dmabuf(const char *dev, int fd, int index,  const char *name, unsigned long offset, unsigned long size) "Device %s, dmabuf fd %d region %d \"%s\", offset: 0x%lx, size: 0x%lx"
>   vfio_region_mmap_fault(const char *name, int index, unsigned long offset, unsigned long size, int fault) "Region %s mmaps[%d], [0x%lx - 0x%lx], fault: %d"
>   vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Region %s [0x%lx - 0x%lx]"
>   vfio_region_exit(const char *name, int index) "Device %s, region %d"
> 
>