[v1] vfio/pci: Allow MMIO regions to be exported through dma-buf

[PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO regions

Posted by Leon Romanovsky 2 months, 2 weeks ago

From: Leon Romanovsky <leonro@nvidia.com>

Add support for exporting PCI device MMIO regions through dma-buf,
enabling safe sharing of non-struct page memory with controlled
lifetime management. This allows RDMA and other subsystems to import
dma-buf FDs and build them into memory regions for PCI P2P operations.

The implementation provides a revocable attachment mechanism using
dma-buf move operations. MMIO regions are normally pinned as BARs
don't change physical addresses, but access is revoked when the VFIO
device is closed or a PCI reset is issued. This ensures kernel
self-defense against potentially hostile userspace.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/vfio/pci/Kconfig           |  20 ++
 drivers/vfio/pci/Makefile          |   2 +
 drivers/vfio/pci/vfio_pci_config.c |  22 +-
 drivers/vfio/pci/vfio_pci_core.c   |  25 ++-
 drivers/vfio/pci/vfio_pci_dmabuf.c | 321 +++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci_priv.h   |  23 +++
 include/linux/dma-buf.h            |   1 +
 include/linux/vfio_pci_core.h      |   3 +
 include/uapi/linux/vfio.h          |  19 ++
 9 files changed, 431 insertions(+), 5 deletions(-)
 create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 2b0172f546652..55ae888bf26ae 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -55,6 +55,26 @@ config VFIO_PCI_ZDEV_KVM
 
 	  To enable s390x KVM vfio-pci extensions, say Y.
 
+config VFIO_PCI_DMABUF
+	bool "VFIO PCI extensions for DMA-BUF"
+	depends on VFIO_PCI_CORE
+	depends on PCI_P2PDMA && DMA_SHARED_BUFFER
+	default y
+	help
+	  Enable support for VFIO PCI extensions that allow exporting
+	  device MMIO regions as DMA-BUFs for peer devices to access via
+	  peer-to-peer (P2P) DMA.
+
+	  This feature enables a VFIO-managed PCI device to export a portion
+	  of its MMIO BAR as a DMA-BUF file descriptor, which can be passed
+	  to other userspace drivers or kernel subsystems capable of
+	  initiating DMA to that region.
+
+	  Say Y here if you want to enable VFIO DMABUF-based MMIO export
+	  support for peer-to-peer DMA use cases.
+
+	  If unsure, say N.
+
 source "drivers/vfio/pci/mlx5/Kconfig"
 
 source "drivers/vfio/pci/hisilicon/Kconfig"
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index cf00c0a7e55c8..f9155e9c5f630 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -2,7 +2,9 @@
 
 vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
+
 obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
+vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
 
 vfio-pci-y := vfio_pci.o
 vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 8f02f236b5b4b..7e23387a43b4d 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
 		virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
 		new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
 
-		if (!new_mem)
+		if (!new_mem) {
 			vfio_pci_zap_and_down_write_memory_lock(vdev);
-		else
+			vfio_pci_dma_buf_move(vdev, true);
+		} else {
 			down_write(&vdev->memory_lock);
+		}
 
 		/*
 		 * If the user is writing mem/io enable (new_mem/io) and we
@@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
 		*virt_cmd &= cpu_to_le16(~mask);
 		*virt_cmd |= cpu_to_le16(new_cmd & mask);
 
+		if (__vfio_pci_memory_enabled(vdev))
+			vfio_pci_dma_buf_move(vdev, false);
 		up_write(&vdev->memory_lock);
 	}
 
@@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
 static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev,
 					  pci_power_t state)
 {
-	if (state >= PCI_D3hot)
+	if (state >= PCI_D3hot) {
 		vfio_pci_zap_and_down_write_memory_lock(vdev);
-	else
+		vfio_pci_dma_buf_move(vdev, true);
+	} else {
 		down_write(&vdev->memory_lock);
+	}
 
 	vfio_pci_set_power_state(vdev, state);
+	if (__vfio_pci_memory_enabled(vdev))
+		vfio_pci_dma_buf_move(vdev, false);
 	up_write(&vdev->memory_lock);
 }
 
@@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos,
 
 		if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) {
 			vfio_pci_zap_and_down_write_memory_lock(vdev);
+			vfio_pci_dma_buf_move(vdev, true);
 			pci_try_reset_function(vdev->pdev);
+			if (__vfio_pci_memory_enabled(vdev))
+				vfio_pci_dma_buf_move(vdev, true);
 			up_write(&vdev->memory_lock);
 		}
 	}
@@ -982,7 +993,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos,
 
 		if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) {
 			vfio_pci_zap_and_down_write_memory_lock(vdev);
+			vfio_pci_dma_buf_move(vdev, true);
 			pci_try_reset_function(vdev->pdev);
+			if (__vfio_pci_memory_enabled(vdev))
+				vfio_pci_dma_buf_move(vdev, true);
 			up_write(&vdev->memory_lock);
 		}
 	}
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 5512d13bb8899..e5ab5d1cafd9c 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -29,7 +29,9 @@
 #include <linux/nospec.h>
 #include <linux/sched/mm.h>
 #include <linux/iommufd.h>
+#ifdef CONFIG_VFIO_PCI_DMABUF
 #include <linux/pci-p2pdma.h>
+#endif
 #if IS_ENABLED(CONFIG_EEH)
 #include <asm/eeh.h>
 #endif
@@ -288,6 +290,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
 	 * semaphore.
 	 */
 	vfio_pci_zap_and_down_write_memory_lock(vdev);
+	vfio_pci_dma_buf_move(vdev, true);
+
 	if (vdev->pm_runtime_engaged) {
 		up_write(&vdev->memory_lock);
 		return -EINVAL;
@@ -371,6 +375,8 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
 	 */
 	down_write(&vdev->memory_lock);
 	__vfio_pci_runtime_pm_exit(vdev);
+	if (__vfio_pci_memory_enabled(vdev))
+		vfio_pci_dma_buf_move(vdev, false);
 	up_write(&vdev->memory_lock);
 }
 
@@ -691,6 +697,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
 #endif
 	vfio_pci_core_disable(vdev);
 
+	vfio_pci_dma_buf_cleanup(vdev);
+
 	mutex_lock(&vdev->igate);
 	if (vdev->err_trigger) {
 		eventfd_ctx_put(vdev->err_trigger);
@@ -1223,7 +1231,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
 	 */
 	vfio_pci_set_power_state(vdev, PCI_D0);
 
+	vfio_pci_dma_buf_move(vdev, true);
 	ret = pci_try_reset_function(vdev->pdev);
+	if (__vfio_pci_memory_enabled(vdev))
+		vfio_pci_dma_buf_move(vdev, false);
 	up_write(&vdev->memory_lock);
 
 	return ret;
@@ -1512,6 +1523,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 		return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
 		return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
+	case VFIO_DEVICE_FEATURE_DMA_BUF:
+		return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
 	default:
 		return -ENOTTY;
 	}
@@ -2088,9 +2101,13 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
 	INIT_LIST_HEAD(&vdev->dummy_resources_list);
 	INIT_LIST_HEAD(&vdev->ioeventfds_list);
 	INIT_LIST_HEAD(&vdev->sriov_pfs_item);
+#ifdef CONFIG_VFIO_PCI_DMABUF
 	vdev->provider = pci_p2pdma_enable(vdev->pdev);
 	if (IS_ERR(vdev->provider))
 		return PTR_ERR(vdev->provider);
+
+	INIT_LIST_HEAD(&vdev->dmabufs);
+#endif
 	init_rwsem(&vdev->memory_lock);
 	xa_init(&vdev->ctx);
 
@@ -2473,11 +2490,17 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 	 * cause the PCI config space reset without restoring the original
 	 * state (saved locally in 'vdev->pm_save').
 	 */
-	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
+		vfio_pci_dma_buf_move(vdev, true);
 		vfio_pci_set_power_state(vdev, PCI_D0);
+	}
 
 	ret = pci_reset_bus(pdev);
 
+	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+		if (__vfio_pci_memory_enabled(vdev))
+			vfio_pci_dma_buf_move(vdev, false);
+
 	vdev = list_last_entry(&dev_set->device_list,
 			       struct vfio_pci_core_device, vdev.dev_set_list);
 
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
new file mode 100644
index 0000000000000..5fefcdecd1329
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -0,0 +1,321 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ */
+#include <linux/dma-buf.h>
+#include <linux/pci-p2pdma.h>
+#include <linux/dma-resv.h>
+
+#include "vfio_pci_priv.h"
+
+MODULE_IMPORT_NS("DMA_BUF");
+
+struct vfio_pci_dma_buf {
+	struct dma_buf *dmabuf;
+	struct vfio_pci_core_device *vdev;
+	struct list_head dmabufs_elm;
+	struct phys_vec phys_vec;
+	u8 revoked : 1;
+};
+
+static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
+				   struct dma_buf_attachment *attachment)
+{
+	struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+	if (!attachment->peer2peer)
+		return -EOPNOTSUPP;
+
+	if (priv->revoked)
+		return -ENODEV;
+
+	switch (pci_p2pdma_map_type(priv->vdev->provider, attachment->dev)) {
+	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+		break;
+	case PCI_P2PDMA_MAP_BUS_ADDR:
+		/*
+		 * There is no need in IOVA at all for this flow.
+		 * We rely on attachment->priv == NULL as a marker
+		 * for this mode.
+		 */
+		return 0;
+	default:
+		return -EINVAL;
+	}
+
+	attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL);
+	if (!attachment->priv)
+		return -ENOMEM;
+
+	dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->phys_vec.len);
+	return 0;
+}
+
+static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf,
+				    struct dma_buf_attachment *attachment)
+{
+	kfree(attachment->priv);
+}
+
+static void fill_sg_entry(struct scatterlist *sgl, unsigned int length,
+			 dma_addr_t addr)
+{
+	sg_set_page(sgl, NULL, length, 0);
+	sg_dma_address(sgl) = addr;
+	sg_dma_len(sgl) = length;
+}
+
+static struct sg_table *
+vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
+		     enum dma_data_direction dir)
+{
+	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+	struct p2pdma_provider *provider = priv->vdev->provider;
+	struct dma_iova_state *state = attachment->priv;
+	struct phys_vec *phys_vec = &priv->phys_vec;
+	struct scatterlist *sgl;
+	struct sg_table *sgt;
+	dma_addr_t addr;
+	int ret;
+
+	dma_resv_assert_held(priv->dmabuf->resv);
+
+	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
+	if (!sgt)
+		return ERR_PTR(-ENOMEM);
+
+	ret = sg_alloc_table(sgt, 1, GFP_KERNEL | __GFP_ZERO);
+	if (ret)
+		goto err_kfree_sgt;
+
+	sgl = sgt->sgl;
+
+	if (!state) {
+		addr = pci_p2pdma_bus_addr_map(provider, phys_vec->paddr);
+	} else if (dma_use_iova(state)) {
+		ret = dma_iova_link(attachment->dev, state, phys_vec->paddr, 0,
+				    phys_vec->len, dir, DMA_ATTR_SKIP_CPU_SYNC);
+		if (ret)
+			goto err_free_table;
+
+		ret = dma_iova_sync(attachment->dev, state, 0, phys_vec->len);
+		if (ret)
+			goto err_unmap_dma;
+
+		addr = state->addr;
+	} else {
+		addr = dma_map_phys(attachment->dev, phys_vec->paddr,
+				    phys_vec->len, dir, DMA_ATTR_SKIP_CPU_SYNC);
+		ret = dma_mapping_error(attachment->dev, addr);
+		if (ret)
+			goto err_free_table;
+	}
+
+	fill_sg_entry(sgl, phys_vec->len, addr);
+	return sgt;
+
+err_unmap_dma:
+	dma_iova_destroy(attachment->dev, state, phys_vec->len, dir,
+			 DMA_ATTR_SKIP_CPU_SYNC);
+err_free_table:
+	sg_free_table(sgt);
+err_kfree_sgt:
+	kfree(sgt);
+	return ERR_PTR(ret);
+}
+
+static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
+				   struct sg_table *sgt,
+				   enum dma_data_direction dir)
+{
+	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+	struct dma_iova_state *state = attachment->priv;
+	struct scatterlist *sgl;
+	int i;
+
+	if (!state)
+		; /* Do nothing */
+	else if (dma_use_iova(state))
+		dma_iova_destroy(attachment->dev, state, priv->phys_vec.len,
+				 dir, DMA_ATTR_SKIP_CPU_SYNC);
+	else
+		for_each_sgtable_dma_sg(sgt, sgl, i)
+			dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
+				       sg_dma_len(sgl), dir,
+				       DMA_ATTR_SKIP_CPU_SYNC);
+
+	sg_free_table(sgt);
+	kfree(sgt);
+}
+
+static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
+{
+	struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+	/*
+	 * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
+	 * The refcount prevents both.
+	 */
+	if (priv->vdev) {
+		down_write(&priv->vdev->memory_lock);
+		list_del_init(&priv->dmabufs_elm);
+		up_write(&priv->vdev->memory_lock);
+		vfio_device_put_registration(&priv->vdev->vdev);
+	}
+	kfree(priv);
+}
+
+static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
+	.attach = vfio_pci_dma_buf_attach,
+	.detach = vfio_pci_dma_buf_detach,
+	.map_dma_buf = vfio_pci_dma_buf_map,
+	.release = vfio_pci_dma_buf_release,
+	.unmap_dma_buf = vfio_pci_dma_buf_unmap,
+};
+
+static void dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
+				   struct vfio_device_feature_dma_buf *dma_buf)
+{
+	struct pci_dev *pdev = priv->vdev->pdev;
+
+	priv->phys_vec.len = dma_buf->length;
+	priv->phys_vec.paddr = pci_resource_start(pdev, dma_buf->region_index);
+	priv->phys_vec.paddr += dma_buf->offset;
+}
+
+static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
+				 struct vfio_device_feature_dma_buf *dma_buf)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u32 bar = dma_buf->region_index;
+	u64 offset = dma_buf->offset;
+	u64 len = dma_buf->length;
+	resource_size_t bar_size;
+	u64 sum;
+
+	/*
+	 * For PCI the region_index is the BAR number like  everything else.
+	 */
+	if (bar >= VFIO_PCI_ROM_REGION_INDEX)
+		return -ENODEV;
+
+	if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
+		return -EINVAL;
+
+	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+		return -EINVAL;
+
+	bar_size = pci_resource_len(pdev, bar);
+	if (check_add_overflow(offset, len, &sum) || sum > bar_size)
+		return -EINVAL;
+
+	return 0;
+}
+
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+				  struct vfio_device_feature_dma_buf __user *arg,
+				  size_t argsz)
+{
+	struct vfio_device_feature_dma_buf get_dma_buf = {};
+	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+	struct vfio_pci_dma_buf *priv;
+	int ret;
+
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+				 sizeof(get_dma_buf));
+	if (ret != 1)
+		return ret;
+
+	if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
+		return -EFAULT;
+
+	ret = validate_dmabuf_input(vdev, &get_dma_buf);
+	if (ret)
+		return ret;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->vdev = vdev;
+	dma_ranges_to_p2p_phys(priv, &get_dma_buf);
+
+	if (!vfio_device_try_get_registration(&vdev->vdev)) {
+		ret = -ENODEV;
+		goto err_free_priv;
+	}
+
+	exp_info.ops = &vfio_pci_dmabuf_ops;
+	exp_info.size = priv->phys_vec.len;
+	exp_info.flags = get_dma_buf.open_flags;
+	exp_info.priv = priv;
+
+	priv->dmabuf = dma_buf_export(&exp_info);
+	if (IS_ERR(priv->dmabuf)) {
+		ret = PTR_ERR(priv->dmabuf);
+		goto err_dev_put;
+	}
+
+	/* dma_buf_put() now frees priv */
+	INIT_LIST_HEAD(&priv->dmabufs_elm);
+	down_write(&vdev->memory_lock);
+	dma_resv_lock(priv->dmabuf->resv, NULL);
+	priv->revoked = !__vfio_pci_memory_enabled(vdev);
+	list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
+	dma_resv_unlock(priv->dmabuf->resv);
+	up_write(&vdev->memory_lock);
+
+	/*
+	 * dma_buf_fd() consumes the reference, when the file closes the dmabuf
+	 * will be released.
+	 */
+	return dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
+
+err_dev_put:
+	vfio_device_put_registration(&vdev->vdev);
+err_free_priv:
+	kfree(priv);
+	return ret;
+}
+
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
+{
+	struct vfio_pci_dma_buf *priv;
+	struct vfio_pci_dma_buf *tmp;
+
+	lockdep_assert_held_write(&vdev->memory_lock);
+
+	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+		if (!get_file_active(&priv->dmabuf->file))
+			continue;
+
+		if (priv->revoked != revoked) {
+			dma_resv_lock(priv->dmabuf->resv, NULL);
+			priv->revoked = revoked;
+			dma_buf_move_notify(priv->dmabuf);
+			dma_resv_unlock(priv->dmabuf->resv);
+		}
+		dma_buf_put(priv->dmabuf);
+	}
+}
+
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_dma_buf *priv;
+	struct vfio_pci_dma_buf *tmp;
+
+	down_write(&vdev->memory_lock);
+	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+		if (!get_file_active(&priv->dmabuf->file))
+			continue;
+
+		dma_resv_lock(priv->dmabuf->resv, NULL);
+		list_del_init(&priv->dmabufs_elm);
+		priv->vdev = NULL;
+		priv->revoked = true;
+		dma_buf_move_notify(priv->dmabuf);
+		dma_resv_unlock(priv->dmabuf->resv);
+		vfio_device_put_registration(&vdev->vdev);
+		dma_buf_put(priv->dmabuf);
+	}
+	up_write(&vdev->memory_lock);
+}
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index a9972eacb2936..28a405f8b97c9 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -107,4 +107,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
 	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
 }
 
+#ifdef CONFIG_VFIO_PCI_DMABUF
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+				  struct vfio_device_feature_dma_buf __user *arg,
+				  size_t argsz);
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
+#else
+static inline int
+vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+			      struct vfio_device_feature_dma_buf __user *arg,
+			      size_t argsz)
+{
+	return -ENOTTY;
+}
+static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+}
+static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev,
+					 bool revoked)
+{
+}
+#endif
+
 #endif
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index d58e329ac0e71..f14b413aae48d 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -483,6 +483,7 @@ struct dma_buf_attach_ops {
  * @dev: device attached to the buffer.
  * @node: list of dma_buf_attachment, protected by dma_resv lock of the dmabuf.
  * @peer2peer: true if the importer can handle peer resources without pages.
+ * #state: DMA structure to provide support for physical addresses DMA interface
  * @priv: exporter specific attachment data.
  * @importer_ops: importer operations for this attachment, if provided
  * dma_buf_map/unmap_attachment() must be called with the dma_resv lock held.
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index b017fae251811..548cbb51bf146 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -94,7 +94,10 @@ struct vfio_pci_core_device {
 	struct vfio_pci_core_device	*sriov_pf_core_dev;
 	struct notifier_block	nb;
 	struct rw_semaphore	memory_lock;
+#ifdef CONFIG_VFIO_PCI_DMABUF
 	struct p2pdma_provider  *provider;
+	struct list_head	dmabufs;
+#endif
 };
 
 /* Will be exported for vfio pci drivers usage */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5764f315137f9..ad8e303697f97 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1468,6 +1468,25 @@ struct vfio_device_feature_bus_master {
 };
 #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
 
+/**
+ * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
+ * regions selected.
+ *
+ * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
+ * etc. offset/length specify a slice of the region to create the dmabuf from.
+ * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
+ *
+ * Return: The fd number on success, -1 and errno is set on failure.
+ */
+#define VFIO_DEVICE_FEATURE_DMA_BUF 11
+
+struct vfio_device_feature_dma_buf {
+	__u32	region_index;
+	__u32	open_flags;
+	__u64	offset;
+	__u64	length;
+};
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**
-- 
2.50.1

Re: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO regions

Posted by Robin Murphy 2 months, 1 week ago

On 2025-07-23 2:00 pm, Leon Romanovsky wrote:
[...]
> +static struct sg_table *
> +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> +		     enum dma_data_direction dir)
> +{
> +	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +	struct p2pdma_provider *provider = priv->vdev->provider;
> +	struct dma_iova_state *state = attachment->priv;
> +	struct phys_vec *phys_vec = &priv->phys_vec;
> +	struct scatterlist *sgl;
> +	struct sg_table *sgt;
> +	dma_addr_t addr;
> +	int ret;
> +
> +	dma_resv_assert_held(priv->dmabuf->resv);
> +
> +	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> +	if (!sgt)
> +		return ERR_PTR(-ENOMEM);
> +
> +	ret = sg_alloc_table(sgt, 1, GFP_KERNEL | __GFP_ZERO);
> +	if (ret)
> +		goto err_kfree_sgt;
> +
> +	sgl = sgt->sgl;
> +
> +	if (!state) {
> +		addr = pci_p2pdma_bus_addr_map(provider, phys_vec->paddr);
> +	} else if (dma_use_iova(state)) {
> +		ret = dma_iova_link(attachment->dev, state, phys_vec->paddr, 0,
> +				    phys_vec->len, dir, DMA_ATTR_SKIP_CPU_SYNC);

The supposed benefits of this API are only for replacing scatterlists 
where multiple disjoint pages are being mapped. In this case with just 
one single contiguous mapping, it is clearly objectively worse to have 
to bounce in and out of the IOMMU layer 3 separate times and store a 
dma_map_state, to achieve the exact same operations that a single call 
to iommu_dma_map_resource() will perform more efficiently and with no 
external state required.

Oh yeah, and mapping MMIO with regular memory attributes (IOMMU_CACHE) 
rather than appropriate ones (IOMMU_MMIO), as this will end up doing, 
isn't guaranteed not to end badly either (e.g. if the system 
interconnect ends up merging consecutive write bursts and exceeding the 
target root port's MPS.)

> +		if (ret)
> +			goto err_free_table;
> +
> +		ret = dma_iova_sync(attachment->dev, state, 0, phys_vec->len);
> +		if (ret)
> +			goto err_unmap_dma;
> +
> +		addr = state->addr;
> +	} else {
> +		addr = dma_map_phys(attachment->dev, phys_vec->paddr,
> +				    phys_vec->len, dir, DMA_ATTR_SKIP_CPU_SYNC);

And again, if the IOMMU is in bypass (the idea of P2P with vfio-noiommu 
simply isn't worth entertaining) then what purpose do you imagine this 
call serves at all, other than to hilariously crash under 
"swiotlb=force"? Even in the case that phys_to_dma(phys_vec->paddr) != 
phys_vec->paddr, in almost all circumstances (both hardware offsets and 
CoCo environments with address-based aliasing), it is more likely than 
not that the latter is still the address you want and the former is 
wrong (and liable to lead to corruption or fatal system errors), because 
MMIO and memory remain fundamentally different things.

AFAICS you're *depending* on this call being an effective no-op, and 
thus only demonstrating that the dma_map_phys() idea is still entirely 
unnecessary.

> +		ret = dma_mapping_error(attachment->dev, addr);
> +		if (ret)
> +			goto err_free_table;
> +	}
> +
> +	fill_sg_entry(sgl, phys_vec->len, addr);
> +	return sgt;
> +
> +err_unmap_dma:
> +	dma_iova_destroy(attachment->dev, state, phys_vec->len, dir,
> +			 DMA_ATTR_SKIP_CPU_SYNC);
> +err_free_table:
> +	sg_free_table(sgt);
> +err_kfree_sgt:
> +	kfree(sgt);
> +	return ERR_PTR(ret);
> +}
> +
> +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> +				   struct sg_table *sgt,
> +				   enum dma_data_direction dir)
> +{
> +	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +	struct dma_iova_state *state = attachment->priv;
> +	struct scatterlist *sgl;
> +	int i;
> +
> +	if (!state)
> +		; /* Do nothing */
> +	else if (dma_use_iova(state))
> +		dma_iova_destroy(attachment->dev, state, priv->phys_vec.len,
> +				 dir, DMA_ATTR_SKIP_CPU_SYNC);
> +	else
> +		for_each_sgtable_dma_sg(sgt, sgl, i)

The table always has exactly one entry...

Thanks,
Robin.

> +			dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
> +				       sg_dma_len(sgl), dir,
> +				       DMA_ATTR_SKIP_CPU_SYNC);
> +
> +	sg_free_table(sgt);
> +	kfree(sgt);
> +}

Re: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO regions

Posted by Jason Gunthorpe 2 months, 1 week ago

On Tue, Jul 29, 2025 at 08:44:21PM +0100, Robin Murphy wrote:

> In this case with just one single
> contiguous mapping, it is clearly objectively worse to have to bounce in and
> out of the IOMMU layer 3 separate times and store a dma_map_state,

The non-contiguous mappings are comming back, it was in earlier drafts
of this. Regardless, the point is to show how to use the general API
that we would want to bring into the DRM drivers that don't have
contiguity even though VFIO is a bit special.

> Oh yeah, and mapping MMIO with regular memory attributes (IOMMU_CACHE)
> rather than appropriate ones (IOMMU_MMIO), as this will end up doing, isn't
> guaranteed not to end badly either (e.g. if the system interconnect ends up
> merging consecutive write bursts and exceeding the target root port's MPS.)

Yes, I recently noticed this too, it should be fixed..

But so we are all on the same page, alot of the PCI P2P systems are
setup so P2P does not transit through the iommu. It either takes the
ACS path through a switch or it uses ATS and takes a different ACS
path through a switch. It only transits through the iommu in
misconfigured systems or in the rarer case of P2P between root ports.

> And again, if the IOMMU is in bypass (the idea of P2P with vfio-noiommu simply
> isn't worth entertaining) 

Not quite. DMABUF is sort of upside down.

For example if we are exporting a DMABUF from VFIO and importing it to
RDMA then RDMA will call VFIO to make an attachment and the above VFIO
code will perform the DMA map to the RDMA struct device. DMABUF
returns a dma mapped scatterlist back to the RDMA driver.

The above dma_map_phys(rdma_dev,...) can be in bypass because the rdma
device can legitimately be in bypass, or not have a iommu, or
whatever.

> AFAICS you're *depending* on this call being an effective no-op, and thus
> only demonstrating that the dma_map_phys() idea is still entirely
> unnecessary.

It should not be a full no-op, and it should be closer to
dma map resource to avoid the mmio issues.

It should be failing for cases where it is not supported (ie
swiotlb=force), it should still be calling the legacy dma_ops, and it
should be undoing any CC mangling with the address. (also the
pci_p2pdma_bus_addr_map() needs to deal with any CC issues too)

Jason

Re: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO regions

Posted by Robin Murphy 2 months, 1 week ago

On 2025-07-29 9:13 pm, Jason Gunthorpe wrote:
> On Tue, Jul 29, 2025 at 08:44:21PM +0100, Robin Murphy wrote:
> 
>> In this case with just one single
>> contiguous mapping, it is clearly objectively worse to have to bounce in and
>> out of the IOMMU layer 3 separate times and store a dma_map_state,
> 
> The non-contiguous mappings are comming back, it was in earlier drafts
> of this. Regardless, the point is to show how to use the general API
> that we would want to bring into the DRM drivers that don't have
> contiguity even though VFIO is a bit special.
> 
>> Oh yeah, and mapping MMIO with regular memory attributes (IOMMU_CACHE)
>> rather than appropriate ones (IOMMU_MMIO), as this will end up doing, isn't
>> guaranteed not to end badly either (e.g. if the system interconnect ends up
>> merging consecutive write bursts and exceeding the target root port's MPS.)
> 
> Yes, I recently noticed this too, it should be fixed..
> 
> But so we are all on the same page, alot of the PCI P2P systems are
> setup so P2P does not transit through the iommu. It either takes the
> ACS path through a switch or it uses ATS and takes a different ACS
> path through a switch. It only transits through the iommu in
> misconfigured systems or in the rarer case of P2P between root ports.

For non-ATS (and ATS Untranslated traffic), my understanding is that we 
rely on ACS upstream redirect to send transactions all the way up to the 
root port for translation (and without that then they are indeed pure 
bus addresses, take the pci_p2pdma_bus_addr_map() case, and the rest of 
this is all irrelevant). In Arm system terms, simpler root ports may 
well have to run that traffic out to an external SMMU TBU, at which 
point any P2P would loop back externally through the memory space window 
in the system interconnect PA space, as opposed to DTI-ATS root 
complexes that effectively implement their own internal translation 
agent on the PCIe side. Thus on some systems, even P2P behind a single 
root port may end up looking functionally the same as the cross-RP case, 
but in general cross-RP *is* something that people seem to care about as 
well. We're seeing more and more systems where each slot has its own RP 
as a separate segment, rather than giant root complexes with a host 
bridge and everyone on one big happy root bus together.

>> And again, if the IOMMU is in bypass (the idea of P2P with vfio-noiommu simply
>> isn't worth entertaining)
> 
> Not quite. DMABUF is sort of upside down.
> 
> For example if we are exporting a DMABUF from VFIO and importing it to
> RDMA then RDMA will call VFIO to make an attachment and the above VFIO
> code will perform the DMA map to the RDMA struct device. DMABUF
> returns a dma mapped scatterlist back to the RDMA driver.
> 
> The above dma_map_phys(rdma_dev,...) can be in bypass because the rdma
> device can legitimately be in bypass, or not have a iommu, or
> whatever.

I understand how dma-buf works - obviously DMA mapping for the VFIO 
device itself while it's not even attached to its default domain would 
be silly. I mean that any system that has 64-bit coherent PCIe behind an 
IOMMU such that this VFIO exporter could exist, is realistically going 
to have the same (or equivalent) IOMMU in front of any potential 
importers as well. *Especially* if you expect the normal case for P2P to 
be within a single hierarchy. Thus I was simply commenting that 
IOMMU_DOMAIN_IDENTITY is the *only* realistic reason to actually expect 
to interact with dma-direct here.

But of course, if it's not dma-direct because we're on POWER with TCE, 
rather than VFIO Type1 implying an iommu-dma/dma-direct arch, then who 
knows? I imagine the complete absence of any mention means this hasn't 
been tried, or possibly even considered?

>> AFAICS you're *depending* on this call being an effective no-op, and thus
>> only demonstrating that the dma_map_phys() idea is still entirely
>> unnecessary.
> 
> It should not be a full no-op, and it should be closer to
> dma map resource to avoid the mmio issues.

I don't get what you mean by "not be a full no-op", can you clarify 
exactly what you think it should be doing? Even if it's just the 
dma_capable() mask check equivalent to dma_direct_map_resource(), you 
don't actually want that here either - in that case you'd want to fail 
the entire attachment to begin with since it can never work.

> It should be failing for cases where it is not supported (ie
> swiotlb=force), it should still be calling the legacy dma_ops, and it
> should be undoing any CC mangling with the address. (also the
> pci_p2pdma_bus_addr_map() needs to deal with any CC issues too)

Um, my whole point is that the "legacy DMA ops" cannot be called, 
because they still assume page-backed memory, so at best are guaranteed 
to fail; any "CC mangling" assumed for memory is most likely wrong for 
MMIO, and there simply is no "deal with" at this point.

A device BAR is simply not under control of the trusted hypervisor the 
same way memory is; whatever (I/G)PA it is at must already be the 
correct address, if the aliasing scheme even applies at all. Sticking to 
Arm CCA terminology for example, if a device in shared state tries to 
import a BAR from a device in locked/private state, there is no notion 
of touching the shared alias and hoping it somehow magically works (at 
best it might throw the exporting device into TDISP error state 
terminally); that attachment simply cannot be allowed. If an shared 
resource exists in the shared IPA space to begin with, dma_to_phys() 
will do the wrong thing, and even phys_to_dma() would technically not 
walk dma_range_map correctly, because both assume "phys" represents 
kernel memory. However it's also all moot since any attempt at any 
combination will fail anyway due to SWIOTLB being forced by 
is_realm_world().

(OK, I admit "crash" wasn't strictly the right word to use there - I 
keep forgetting that some of the P2P scatterlist support in dma-direct 
ended up affecting the map_page path too, even though that was never 
really the functional intent - but hey, the overall result of failing to 
work as expected is the same.)

Thanks,
Robin.

Re: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO regions

Posted by Jason Gunthorpe 2 months, 1 week ago

On Wed, Jul 30, 2025 at 03:49:45PM +0100, Robin Murphy wrote:
> On 2025-07-29 9:13 pm, Jason Gunthorpe wrote:
> > On Tue, Jul 29, 2025 at 08:44:21PM +0100, Robin Murphy wrote:
> > 
> > > In this case with just one single
> > > contiguous mapping, it is clearly objectively worse to have to bounce in and
> > > out of the IOMMU layer 3 separate times and store a dma_map_state,
> > 
> > The non-contiguous mappings are comming back, it was in earlier drafts
> > of this. Regardless, the point is to show how to use the general API
> > that we would want to bring into the DRM drivers that don't have
> > contiguity even though VFIO is a bit special.
> > 
> > > Oh yeah, and mapping MMIO with regular memory attributes (IOMMU_CACHE)
> > > rather than appropriate ones (IOMMU_MMIO), as this will end up doing, isn't
> > > guaranteed not to end badly either (e.g. if the system interconnect ends up
> > > merging consecutive write bursts and exceeding the target root port's MPS.)
> > 
> > Yes, I recently noticed this too, it should be fixed..
> > 
> > But so we are all on the same page, alot of the PCI P2P systems are
> > setup so P2P does not transit through the iommu. It either takes the
> > ACS path through a switch or it uses ATS and takes a different ACS
> > path through a switch. It only transits through the iommu in
> > misconfigured systems or in the rarer case of P2P between root ports.
> 
> For non-ATS (and ATS Untranslated traffic), my understanding is that we rely
> on ACS upstream redirect to send transactions all the way up to the root
> port for translation (and without that then they are indeed pure bus
> addresses, take the pci_p2pdma_bus_addr_map() case,

My point is it is common for real systems to take the pci_p2pdma_bus_addr_map()
path. Going through the RP is too slow.

> all irrelevant). In Arm system terms, simpler root ports may well have to
> run that traffic out to an external SMMU TBU, at which point any P2P would
> loop back externally through the memory space window in the system

Many real systems simply don't support this at all :(

> But of course, if it's not dma-direct because we're on POWER with TCE,
> rather than VFIO Type1 implying an iommu-dma/dma-direct arch, then who
> knows? I imagine the complete absence of any mention means this hasn't been
> tried, or possibly even considered?

POWER uses dma_ops and the point of this design is that dma_may_phys()
will still call the dma_ops. See below.

> I don't get what you mean by "not be a full no-op", can you clarify exactly
> what you think it should be doing? Even if it's just the dma_capable() mask
> check equivalent to dma_direct_map_resource(), you don't actually want that
> here either - in that case you'd want to fail the entire attachment to begin
> with since it can never work.

The expectation would be if the dma mapping can't succeed then the
phys map should fail. So if dma_capable() or whatever is not OK then
fail inside the loop and unwind back to failing the whole attach.

> > It should be failing for cases where it is not supported (ie
> > swiotlb=force), it should still be calling the legacy dma_ops, and it
> > should be undoing any CC mangling with the address. (also the
> > pci_p2pdma_bus_addr_map() needs to deal with any CC issues too)
> 
> Um, my whole point is that the "legacy DMA ops" cannot be called, because
> they still assume page-backed memory, so at best are guaranteed to fail; any
> "CC mangling" assumed for memory is most likely wrong for MMIO, and there
> simply is no "deal with" at this point.

I think we all agreed it should use the resource path. So legacy DMA
ops, including POWER, should end up calling

struct dma_map_ops {
	dma_addr_t (*map_resource)(struct device *dev, phys_addr_t phys_addr,
			size_t size, enum dma_data_direction dir,
			unsigned long attrs);

And if that is NULL it should fail.

> A device BAR is simply not under control of the trusted hypervisor the same
> way memory is;

I'm not sure what you mean? I think it is, at least for CC I expect
ACS to be setup to force translation and this squarly puts access to
the MMIO BAR under control of the the S2 translation.

In ARM terms I expect that the RMM's S2 will contain the MMIO BAR at
the shared IPA (ie top bit set), which will match where the CPU should
access it? Linux's IOMMU S2 should mirror this and put the MMIO BAR at
the shared IPA. Meaning upon locking the MMIO phys_addr_t effectively
moves?

At least I would be surprised to hear that shared MMIO was placed in
the private IPA space??

Outside CC we do have a rare configuration where the ACS is not
forcing translation and then your remarks are true. Hypervisor must
enfroce IPA == GPA == bus addr. It's a painful configuration to make
work.

> Sticking to Arm CCA terminology for example, if a device in shared
> state tries to import a BAR from a device in locked/private state,
> there is no notion of touching the shared alias and hoping it
> somehow magically works (at best it might throw the exporting device
> into TDISP error state terminally);

Right, we don't support T=1 DMA yet, or locked devices, but when we do
the p2pdma layer needs to be fixed up to catch this and reject it.

I think it is pretty easy, the p2pdma_provider struct can record if
the exporting struct device has shared or private MMIO. Then when
doing the mapping we require that private MMIO be accessed from T=1.

This should be addressed as part of enabling PCI T=1 support, eg in
ARM terms along with Aneesh's series "ARM CCA Device Assignment
support"

> simply cannot be allowed. If an shared resource exists in the shared IPA
> space to begin with, dma_to_phys() will do the wrong thing, and even
> phys_to_dma() would technically not walk dma_range_map correctly, because
> both assume "phys" represents kernel memory. 

As above for CC I am expecting that translation will always be
required. The S2 in both the RMM and hypervisor SMMUs should both have
shared accessiblity for whatever phys_addr the CPU is using.

So phys_to_dma() just needs to return the normal CPU phys_addr_t to
work, and this looks believable to me. ARM forces the shared IPA
through dma_addr_unencrypted(), but it is already wrong for the core
code to call that function for "encrypted" MMIO.

Not sure about the ranges or dma_to_phys(), I doubt anyone has ever
tested this so it probably doesn't work - but I don't see anything
architecturally catastrophic here, just some bugs.

> However it's also all moot since any attempt at any combination will
> fail anyway due to SWIOTLB being forced by is_realm_world().

Yep.

Basically P2P for ARM CCA today needs some bug fixing and testing -
not surprising. ARM CCA is already rare, and even we don't use P2P
under any CC architecture today.

I'm sure it will be fixed as a separate work, at least we will soon
care about P2P on ARM CCA working.

Regardless, from a driver perspective none of the CC detail should
leak into VFIO. The P2P APIs and the DMA APIs are the right place to
abstract it away, and yes they probably fail to do so right now.

I'm guessing that if DMA_ATTR_MMIO is agreed then a
DMA_ATTR_MMIO_ENCRYPTED would be the logical step. That should provide
enough detail that the DMA API can compute correct addressing.

Maybe this whole discussion improves the case for DMA_ATTR_MMIO.

Jason

Re: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO regions

Posted by Leon Romanovsky 2 months, 1 week ago

On Tue, Jul 29, 2025 at 05:13:51PM -0300, Jason Gunthorpe wrote:
> On Tue, Jul 29, 2025 at 08:44:21PM +0100, Robin Murphy wrote:
> 
> > In this case with just one single
> > contiguous mapping, it is clearly objectively worse to have to bounce in and
> > out of the IOMMU layer 3 separate times and store a dma_map_state,
> 
> The non-contiguous mappings are comming back, it was in earlier drafts
> of this. Regardless, the point is to show how to use the general API
> that we would want to bring into the DRM drivers that don't have
> contiguity even though VFIO is a bit special.

Yes, we will see comeback of DMA ranges in v2.

Thanks

RE: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO regions

Posted by Kasireddy, Vivek 2 months, 2 weeks ago

Hi Leon,

> Subject: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO
> regions
> 
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> Add support for exporting PCI device MMIO regions through dma-buf,
> enabling safe sharing of non-struct page memory with controlled
> lifetime management. This allows RDMA and other subsystems to import
> dma-buf FDs and build them into memory regions for PCI P2P operations.
> 
> The implementation provides a revocable attachment mechanism using
> dma-buf move operations. MMIO regions are normally pinned as BARs
> don't change physical addresses, but access is revoked when the VFIO
> device is closed or a PCI reset is issued. This ensures kernel
> self-defense against potentially hostile userspace.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  drivers/vfio/pci/Kconfig           |  20 ++
>  drivers/vfio/pci/Makefile          |   2 +
>  drivers/vfio/pci/vfio_pci_config.c |  22 +-
>  drivers/vfio/pci/vfio_pci_core.c   |  25 ++-
>  drivers/vfio/pci/vfio_pci_dmabuf.c | 321 +++++++++++++++++++++++++++++
>  drivers/vfio/pci/vfio_pci_priv.h   |  23 +++
>  include/linux/dma-buf.h            |   1 +
>  include/linux/vfio_pci_core.h      |   3 +
>  include/uapi/linux/vfio.h          |  19 ++
>  9 files changed, 431 insertions(+), 5 deletions(-)
>  create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c
> 
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 2b0172f546652..55ae888bf26ae 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -55,6 +55,26 @@ config VFIO_PCI_ZDEV_KVM
> 
>  	  To enable s390x KVM vfio-pci extensions, say Y.
> 
> +config VFIO_PCI_DMABUF
> +	bool "VFIO PCI extensions for DMA-BUF"
> +	depends on VFIO_PCI_CORE
> +	depends on PCI_P2PDMA && DMA_SHARED_BUFFER
> +	default y
> +	help
> +	  Enable support for VFIO PCI extensions that allow exporting
> +	  device MMIO regions as DMA-BUFs for peer devices to access via
> +	  peer-to-peer (P2P) DMA.
> +
> +	  This feature enables a VFIO-managed PCI device to export a portion
> +	  of its MMIO BAR as a DMA-BUF file descriptor, which can be passed
> +	  to other userspace drivers or kernel subsystems capable of
> +	  initiating DMA to that region.
> +
> +	  Say Y here if you want to enable VFIO DMABUF-based MMIO export
> +	  support for peer-to-peer DMA use cases.
> +
> +	  If unsure, say N.
> +
>  source "drivers/vfio/pci/mlx5/Kconfig"
> 
>  source "drivers/vfio/pci/hisilicon/Kconfig"
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index cf00c0a7e55c8..f9155e9c5f630 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -2,7 +2,9 @@
> 
>  vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o
> vfio_pci_config.o
>  vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
> +
>  obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
> +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
> 
>  vfio-pci-y := vfio_pci.o
>  vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
> diff --git a/drivers/vfio/pci/vfio_pci_config.c
> b/drivers/vfio/pci/vfio_pci_config.c
> index 8f02f236b5b4b..7e23387a43b4d 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct
> vfio_pci_core_device *vdev, int pos,
>  		virt_mem = !!(le16_to_cpu(*virt_cmd) &
> PCI_COMMAND_MEMORY);
>  		new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
> 
> -		if (!new_mem)
> +		if (!new_mem) {
>  			vfio_pci_zap_and_down_write_memory_lock(vdev);
> -		else
> +			vfio_pci_dma_buf_move(vdev, true);
> +		} else {
>  			down_write(&vdev->memory_lock);
> +		}
> 
>  		/*
>  		 * If the user is writing mem/io enable (new_mem/io) and we
> @@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct
> vfio_pci_core_device *vdev, int pos,
>  		*virt_cmd &= cpu_to_le16(~mask);
>  		*virt_cmd |= cpu_to_le16(new_cmd & mask);
> 
> +		if (__vfio_pci_memory_enabled(vdev))
> +			vfio_pci_dma_buf_move(vdev, false);
>  		up_write(&vdev->memory_lock);
>  	}
> 
> @@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct
> perm_bits *perm)
>  static void vfio_lock_and_set_power_state(struct vfio_pci_core_device
> *vdev,
>  					  pci_power_t state)
>  {
> -	if (state >= PCI_D3hot)
> +	if (state >= PCI_D3hot) {
>  		vfio_pci_zap_and_down_write_memory_lock(vdev);
> -	else
> +		vfio_pci_dma_buf_move(vdev, true);
> +	} else {
>  		down_write(&vdev->memory_lock);
> +	}
> 
>  	vfio_pci_set_power_state(vdev, state);
> +	if (__vfio_pci_memory_enabled(vdev))
> +		vfio_pci_dma_buf_move(vdev, false);
>  	up_write(&vdev->memory_lock);
>  }
> 
> @@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct
> vfio_pci_core_device *vdev, int pos,
> 
>  		if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) {
>  			vfio_pci_zap_and_down_write_memory_lock(vdev);
> +			vfio_pci_dma_buf_move(vdev, true);
>  			pci_try_reset_function(vdev->pdev);
> +			if (__vfio_pci_memory_enabled(vdev))
> +				vfio_pci_dma_buf_move(vdev, true);
>  			up_write(&vdev->memory_lock);
>  		}
>  	}
> @@ -982,7 +993,10 @@ static int vfio_af_config_write(struct
> vfio_pci_core_device *vdev, int pos,
> 
>  		if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP))
> {
>  			vfio_pci_zap_and_down_write_memory_lock(vdev);
> +			vfio_pci_dma_buf_move(vdev, true);
>  			pci_try_reset_function(vdev->pdev);
> +			if (__vfio_pci_memory_enabled(vdev))
> +				vfio_pci_dma_buf_move(vdev, true);
>  			up_write(&vdev->memory_lock);
>  		}
>  	}
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 5512d13bb8899..e5ab5d1cafd9c 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -29,7 +29,9 @@
>  #include <linux/nospec.h>
>  #include <linux/sched/mm.h>
>  #include <linux/iommufd.h>
> +#ifdef CONFIG_VFIO_PCI_DMABUF
>  #include <linux/pci-p2pdma.h>
> +#endif
>  #if IS_ENABLED(CONFIG_EEH)
>  #include <asm/eeh.h>
>  #endif
> @@ -288,6 +290,8 @@ static int vfio_pci_runtime_pm_entry(struct
> vfio_pci_core_device *vdev,
>  	 * semaphore.
>  	 */
>  	vfio_pci_zap_and_down_write_memory_lock(vdev);
> +	vfio_pci_dma_buf_move(vdev, true);
> +
>  	if (vdev->pm_runtime_engaged) {
>  		up_write(&vdev->memory_lock);
>  		return -EINVAL;
> @@ -371,6 +375,8 @@ static void vfio_pci_runtime_pm_exit(struct
> vfio_pci_core_device *vdev)
>  	 */
>  	down_write(&vdev->memory_lock);
>  	__vfio_pci_runtime_pm_exit(vdev);
> +	if (__vfio_pci_memory_enabled(vdev))
> +		vfio_pci_dma_buf_move(vdev, false);
>  	up_write(&vdev->memory_lock);
>  }
> 
> @@ -691,6 +697,8 @@ void vfio_pci_core_close_device(struct vfio_device
> *core_vdev)
>  #endif
>  	vfio_pci_core_disable(vdev);
> 
> +	vfio_pci_dma_buf_cleanup(vdev);
> +
>  	mutex_lock(&vdev->igate);
>  	if (vdev->err_trigger) {
>  		eventfd_ctx_put(vdev->err_trigger);
> @@ -1223,7 +1231,10 @@ static int vfio_pci_ioctl_reset(struct
> vfio_pci_core_device *vdev,
>  	 */
>  	vfio_pci_set_power_state(vdev, PCI_D0);
> 
> +	vfio_pci_dma_buf_move(vdev, true);
>  	ret = pci_try_reset_function(vdev->pdev);
> +	if (__vfio_pci_memory_enabled(vdev))
> +		vfio_pci_dma_buf_move(vdev, false);
>  	up_write(&vdev->memory_lock);
> 
>  	return ret;
> @@ -1512,6 +1523,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device
> *device, u32 flags,
>  		return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
>  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>  		return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> +	case VFIO_DEVICE_FEATURE_DMA_BUF:
> +		return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
>  	default:
>  		return -ENOTTY;
>  	}
> @@ -2088,9 +2101,13 @@ int vfio_pci_core_init_dev(struct vfio_device
> *core_vdev)
>  	INIT_LIST_HEAD(&vdev->dummy_resources_list);
>  	INIT_LIST_HEAD(&vdev->ioeventfds_list);
>  	INIT_LIST_HEAD(&vdev->sriov_pfs_item);
> +#ifdef CONFIG_VFIO_PCI_DMABUF
>  	vdev->provider = pci_p2pdma_enable(vdev->pdev);
>  	if (IS_ERR(vdev->provider))
>  		return PTR_ERR(vdev->provider);
> +
> +	INIT_LIST_HEAD(&vdev->dmabufs);
> +#endif
>  	init_rwsem(&vdev->memory_lock);
>  	xa_init(&vdev->ctx);
> 
> @@ -2473,11 +2490,17 @@ static int vfio_pci_dev_set_hot_reset(struct
> vfio_device_set *dev_set,
>  	 * cause the PCI config space reset without restoring the original
>  	 * state (saved locally in 'vdev->pm_save').
>  	 */
> -	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> +	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
> +		vfio_pci_dma_buf_move(vdev, true);
>  		vfio_pci_set_power_state(vdev, PCI_D0);
> +	}
> 
>  	ret = pci_reset_bus(pdev);
> 
> +	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> +		if (__vfio_pci_memory_enabled(vdev))
> +			vfio_pci_dma_buf_move(vdev, false);
> +
>  	vdev = list_last_entry(&dev_set->device_list,
>  			       struct vfio_pci_core_device, vdev.dev_set_list);
> 
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c
> b/drivers/vfio/pci/vfio_pci_dmabuf.c
> new file mode 100644
> index 0000000000000..5fefcdecd1329
> --- /dev/null
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -0,0 +1,321 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
> + */
> +#include <linux/dma-buf.h>
> +#include <linux/pci-p2pdma.h>
> +#include <linux/dma-resv.h>
> +
> +#include "vfio_pci_priv.h"
> +
> +MODULE_IMPORT_NS("DMA_BUF");
> +
> +struct vfio_pci_dma_buf {
> +	struct dma_buf *dmabuf;
> +	struct vfio_pci_core_device *vdev;
> +	struct list_head dmabufs_elm;
> +	struct phys_vec phys_vec;
> +	u8 revoked : 1;
> +};
> +
> +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> +				   struct dma_buf_attachment *attachment)
> +{
> +	struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> +	if (!attachment->peer2peer)
> +		return -EOPNOTSUPP;
> +
> +	if (priv->revoked)
> +		return -ENODEV;
> +
> +	switch (pci_p2pdma_map_type(priv->vdev->provider, attachment-
> >dev)) {
> +	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
> +		break;
> +	case PCI_P2PDMA_MAP_BUS_ADDR:
> +		/*
> +		 * There is no need in IOVA at all for this flow.
> +		 * We rely on attachment->priv == NULL as a marker
> +		 * for this mode.
> +		 */
> +		return 0;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	attachment->priv = kzalloc(sizeof(struct dma_iova_state),
> GFP_KERNEL);
> +	if (!attachment->priv)
> +		return -ENOMEM;
> +
> +	dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv-
> >phys_vec.len);
> +	return 0;
> +}
> +
> +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf,
> +				    struct dma_buf_attachment *attachment)
> +{
> +	kfree(attachment->priv);
> +}
> +
> +static void fill_sg_entry(struct scatterlist *sgl, unsigned int length,
> +			 dma_addr_t addr)
> +{
> +	sg_set_page(sgl, NULL, length, 0);
> +	sg_dma_address(sgl) = addr;
> +	sg_dma_len(sgl) = length;
> +}
> +
> +static struct sg_table *
> +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> +		     enum dma_data_direction dir)
> +{
> +	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +	struct p2pdma_provider *provider = priv->vdev->provider;
> +	struct dma_iova_state *state = attachment->priv;
> +	struct phys_vec *phys_vec = &priv->phys_vec;
> +	struct scatterlist *sgl;
> +	struct sg_table *sgt;
> +	dma_addr_t addr;
> +	int ret;
> +
> +	dma_resv_assert_held(priv->dmabuf->resv);
> +
> +	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> +	if (!sgt)
> +		return ERR_PTR(-ENOMEM);
> +
> +	ret = sg_alloc_table(sgt, 1, GFP_KERNEL | __GFP_ZERO);
> +	if (ret)
> +		goto err_kfree_sgt;
> +
> +	sgl = sgt->sgl;
> +
> +	if (!state) {
> +		addr = pci_p2pdma_bus_addr_map(provider, phys_vec-
> >paddr);
> +	} else if (dma_use_iova(state)) {
> +		ret = dma_iova_link(attachment->dev, state, phys_vec->paddr,
> 0,
> +				    phys_vec->len, dir,
> DMA_ATTR_SKIP_CPU_SYNC);
> +		if (ret)
> +			goto err_free_table;
> +
> +		ret = dma_iova_sync(attachment->dev, state, 0, phys_vec-
> >len);
> +		if (ret)
> +			goto err_unmap_dma;
> +
> +		addr = state->addr;
> +	} else {
> +		addr = dma_map_phys(attachment->dev, phys_vec->paddr,
> +				    phys_vec->len, dir,
> DMA_ATTR_SKIP_CPU_SYNC);
> +		ret = dma_mapping_error(attachment->dev, addr);
> +		if (ret)
> +			goto err_free_table;
> +	}
> +
> +	fill_sg_entry(sgl, phys_vec->len, addr);
> +	return sgt;
> +
> +err_unmap_dma:
> +	dma_iova_destroy(attachment->dev, state, phys_vec->len, dir,
> +			 DMA_ATTR_SKIP_CPU_SYNC);
> +err_free_table:
> +	sg_free_table(sgt);
> +err_kfree_sgt:
> +	kfree(sgt);
> +	return ERR_PTR(ret);
> +}
> +
> +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment
> *attachment,
> +				   struct sg_table *sgt,
> +				   enum dma_data_direction dir)
> +{
> +	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +	struct dma_iova_state *state = attachment->priv;
> +	struct scatterlist *sgl;
> +	int i;
> +
> +	if (!state)
> +		; /* Do nothing */
> +	else if (dma_use_iova(state))
> +		dma_iova_destroy(attachment->dev, state, priv->phys_vec.len,
> +				 dir, DMA_ATTR_SKIP_CPU_SYNC);
> +	else
> +		for_each_sgtable_dma_sg(sgt, sgl, i)
> +			dma_unmap_phys(attachment->dev,
> sg_dma_address(sgl),
> +				       sg_dma_len(sgl), dir,
> +				       DMA_ATTR_SKIP_CPU_SYNC);
> +
> +	sg_free_table(sgt);
> +	kfree(sgt);
> +}
> +
> +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> +{
> +	struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> +	/*
> +	 * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
> +	 * The refcount prevents both.
> +	 */
> +	if (priv->vdev) {
> +		down_write(&priv->vdev->memory_lock);
> +		list_del_init(&priv->dmabufs_elm);
> +		up_write(&priv->vdev->memory_lock);
> +		vfio_device_put_registration(&priv->vdev->vdev);
> +	}
> +	kfree(priv);
> +}
> +
> +static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
> +	.attach = vfio_pci_dma_buf_attach,
> +	.detach = vfio_pci_dma_buf_detach,
> +	.map_dma_buf = vfio_pci_dma_buf_map,
> +	.release = vfio_pci_dma_buf_release,
> +	.unmap_dma_buf = vfio_pci_dma_buf_unmap,
> +};
> +
> +static void dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
> +				   struct vfio_device_feature_dma_buf
> *dma_buf)
> +{
> +	struct pci_dev *pdev = priv->vdev->pdev;
> +
> +	priv->phys_vec.len = dma_buf->length;
> +	priv->phys_vec.paddr = pci_resource_start(pdev, dma_buf-
> >region_index);
> +	priv->phys_vec.paddr += dma_buf->offset;
> +}
> +
> +static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
> +				 struct vfio_device_feature_dma_buf *dma_buf)
> +{
> +	struct pci_dev *pdev = vdev->pdev;
> +	u32 bar = dma_buf->region_index;
> +	u64 offset = dma_buf->offset;
> +	u64 len = dma_buf->length;
> +	resource_size_t bar_size;
> +	u64 sum;
> +
> +	/*
> +	 * For PCI the region_index is the BAR number like  everything else.
> +	 */
> +	if (bar >= VFIO_PCI_ROM_REGION_INDEX)
> +		return -ENODEV;
> +
> +	if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
> +		return -EINVAL;
> +
> +	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> +		return -EINVAL;
> +
> +	bar_size = pci_resource_len(pdev, bar);
> +	if (check_add_overflow(offset, len, &sum) || sum > bar_size)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32
> flags,
> +				  struct vfio_device_feature_dma_buf __user
> *arg,
> +				  size_t argsz)
> +{
> +	struct vfio_device_feature_dma_buf get_dma_buf = {};
> +	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> +	struct vfio_pci_dma_buf *priv;
> +	int ret;
> +
> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(get_dma_buf));
> +	if (ret != 1)
> +		return ret;
> +
> +	if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
> +		return -EFAULT;
> +
> +	ret = validate_dmabuf_input(vdev, &get_dma_buf);
> +	if (ret)
> +		return ret;
> +
> +	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> +	if (!priv)
> +		return -ENOMEM;
> +
> +	priv->vdev = vdev;
> +	dma_ranges_to_p2p_phys(priv, &get_dma_buf);
> +
> +	if (!vfio_device_try_get_registration(&vdev->vdev)) {
> +		ret = -ENODEV;
> +		goto err_free_priv;
> +	}
> +
> +	exp_info.ops = &vfio_pci_dmabuf_ops;
> +	exp_info.size = priv->phys_vec.len;
> +	exp_info.flags = get_dma_buf.open_flags;
> +	exp_info.priv = priv;
> +
> +	priv->dmabuf = dma_buf_export(&exp_info);
> +	if (IS_ERR(priv->dmabuf)) {
> +		ret = PTR_ERR(priv->dmabuf);
> +		goto err_dev_put;
> +	}
> +
> +	/* dma_buf_put() now frees priv */
> +	INIT_LIST_HEAD(&priv->dmabufs_elm);
> +	down_write(&vdev->memory_lock);
> +	dma_resv_lock(priv->dmabuf->resv, NULL);
> +	priv->revoked = !__vfio_pci_memory_enabled(vdev);
> +	list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
> +	dma_resv_unlock(priv->dmabuf->resv);
> +	up_write(&vdev->memory_lock);
> +
> +	/*
> +	 * dma_buf_fd() consumes the reference, when the file closes the
> dmabuf
> +	 * will be released.
> +	 */
> +	return dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
> +
> +err_dev_put:
> +	vfio_device_put_registration(&vdev->vdev);
> +err_free_priv:
> +	kfree(priv);
> +	return ret;
> +}
> +
> +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool
> revoked)
> +{
> +	struct vfio_pci_dma_buf *priv;
> +	struct vfio_pci_dma_buf *tmp;
> +
> +	lockdep_assert_held_write(&vdev->memory_lock);
> +
> +	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> +		if (!get_file_active(&priv->dmabuf->file))
> +			continue;
> +
> +		if (priv->revoked != revoked) {
> +			dma_resv_lock(priv->dmabuf->resv, NULL);
> +			priv->revoked = revoked;
> +			dma_buf_move_notify(priv->dmabuf);
> +			dma_resv_unlock(priv->dmabuf->resv);
> +		}
> +		dma_buf_put(priv->dmabuf);
> +	}
> +}
> +
> +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> +{
> +	struct vfio_pci_dma_buf *priv;
> +	struct vfio_pci_dma_buf *tmp;
> +
> +	down_write(&vdev->memory_lock);
> +	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> +		if (!get_file_active(&priv->dmabuf->file))
> +			continue;
> +
> +		dma_resv_lock(priv->dmabuf->resv, NULL);
> +		list_del_init(&priv->dmabufs_elm);
> +		priv->vdev = NULL;
> +		priv->revoked = true;
> +		dma_buf_move_notify(priv->dmabuf);
> +		dma_resv_unlock(priv->dmabuf->resv);
> +		vfio_device_put_registration(&vdev->vdev);
> +		dma_buf_put(priv->dmabuf);
> +	}
> +	up_write(&vdev->memory_lock);
> +}
> diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
> index a9972eacb2936..28a405f8b97c9 100644
> --- a/drivers/vfio/pci/vfio_pci_priv.h
> +++ b/drivers/vfio/pci/vfio_pci_priv.h
> @@ -107,4 +107,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev
> *pdev)
>  	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
>  }
> 
> +#ifdef CONFIG_VFIO_PCI_DMABUF
> +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32
> flags,
> +				  struct vfio_device_feature_dma_buf __user
> *arg,
> +				  size_t argsz);
> +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
> +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool
> revoked);
> +#else
> +static inline int
> +vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32
> flags,
> +			      struct vfio_device_feature_dma_buf __user *arg,
> +			      size_t argsz)
> +{
> +	return -ENOTTY;
> +}
> +static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device
> *vdev)
> +{
> +}
> +static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device
> *vdev,
> +					 bool revoked)
> +{
> +}
> +#endif
> +
>  #endif
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> index d58e329ac0e71..f14b413aae48d 100644
> --- a/include/linux/dma-buf.h
> +++ b/include/linux/dma-buf.h
> @@ -483,6 +483,7 @@ struct dma_buf_attach_ops {
>   * @dev: device attached to the buffer.
>   * @node: list of dma_buf_attachment, protected by dma_resv lock of the
> dmabuf.
>   * @peer2peer: true if the importer can handle peer resources without pages.
> + * #state: DMA structure to provide support for physical addresses DMA
> interface
>   * @priv: exporter specific attachment data.
>   * @importer_ops: importer operations for this attachment, if provided
>   * dma_buf_map/unmap_attachment() must be called with the dma_resv
> lock held.
> diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
> index b017fae251811..548cbb51bf146 100644
> --- a/include/linux/vfio_pci_core.h
> +++ b/include/linux/vfio_pci_core.h
> @@ -94,7 +94,10 @@ struct vfio_pci_core_device {
>  	struct vfio_pci_core_device	*sriov_pf_core_dev;
>  	struct notifier_block	nb;
>  	struct rw_semaphore	memory_lock;
> +#ifdef CONFIG_VFIO_PCI_DMABUF
>  	struct p2pdma_provider  *provider;
> +	struct list_head	dmabufs;
> +#endif
>  };
> 
>  /* Will be exported for vfio pci drivers usage */
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 5764f315137f9..ad8e303697f97 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1468,6 +1468,25 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
> 
> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> + * regions selected.
> + *
> + * open_flags are the typical flags passed to open(2), eg O_RDWR,
> O_CLOEXEC,
> + * etc. offset/length specify a slice of the region to create the dmabuf from.
> + * nr_ranges is the total number of (P2P DMA) ranges that comprise the
> dmabuf.
Any particular reason why you dropped the option (nr_ranges) of creating a
single dmabuf from multiple ranges of an MMIO region?

Restricting the dmabuf to a single range (or having to create multiple dmabufs
to represent multiple regions/ranges associated with a single scattered buffer)
would be very limiting and may not work in all cases. For instance, in my use-case,
I am trying to share a large (4k mode) framebuffer (FB) located in GPU's VRAM
between two (p2p compatible) GPU devices. And, this would probably not work
given that allocating a large contiguous FB (nr_ranges = 1) in VRAM may not be
feasible when there is memory pressure.

Furthermore, since you are adding a new UAPI with this patch/feature, as you know,
we cannot go back and tweak it (to add support for nr_ranges > 1) should there
be a need in the future, but you can always use nr_ranges = 1 anytime. Therefore,
I think it makes sense to be flexible in terms of the number of ranges to include
while creating a dmabuf instead of restricting ourselves to one range.

Thanks,
Vivek

> + *
> + * Return: The fd number on success, -1 and errno is set on failure.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> +
> +struct vfio_device_feature_dma_buf {
> +	__u32	region_index;
> +	__u32	open_flags;
> +	__u64	offset;
> +	__u64	length;
> +};
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
> 
>  /**
> --
> 2.50.1

Re: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO regions

Posted by Leon Romanovsky 2 months, 2 weeks ago

On Thu, Jul 24, 2025 at 05:13:49AM +0000, Kasireddy, Vivek wrote:
> Hi Leon,
> 
> > Subject: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO
> > regions
> > 
> > From: Leon Romanovsky <leonro@nvidia.com>
> > 
> > Add support for exporting PCI device MMIO regions through dma-buf,
> > enabling safe sharing of non-struct page memory with controlled
> > lifetime management. This allows RDMA and other subsystems to import
> > dma-buf FDs and build them into memory regions for PCI P2P operations.
> > 
> > The implementation provides a revocable attachment mechanism using
> > dma-buf move operations. MMIO regions are normally pinned as BARs
> > don't change physical addresses, but access is revoked when the VFIO
> > device is closed or a PCI reset is issued. This ensures kernel
> > self-defense against potentially hostile userspace.
> > 
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> >  drivers/vfio/pci/Kconfig           |  20 ++
> >  drivers/vfio/pci/Makefile          |   2 +
> >  drivers/vfio/pci/vfio_pci_config.c |  22 +-
> >  drivers/vfio/pci/vfio_pci_core.c   |  25 ++-
> >  drivers/vfio/pci/vfio_pci_dmabuf.c | 321 +++++++++++++++++++++++++++++
> >  drivers/vfio/pci/vfio_pci_priv.h   |  23 +++
> >  include/linux/dma-buf.h            |   1 +
> >  include/linux/vfio_pci_core.h      |   3 +
> >  include/uapi/linux/vfio.h          |  19 ++
> >  9 files changed, 431 insertions(+), 5 deletions(-)
> >  create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c

<...>

> > +static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
> > +				 struct vfio_device_feature_dma_buf *dma_buf)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	u32 bar = dma_buf->region_index;
> > +	u64 offset = dma_buf->offset;
> > +	u64 len = dma_buf->length;
> > +	resource_size_t bar_size;
> > +	u64 sum;
> > +
> > +	/*
> > +	 * For PCI the region_index is the BAR number like  everything else.
> > +	 */
> > +	if (bar >= VFIO_PCI_ROM_REGION_INDEX)
> > +		return -ENODEV;

<...>

> > +/**
> > + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> > + * regions selected.
> > + *
> > + * open_flags are the typical flags passed to open(2), eg O_RDWR,
> > O_CLOEXEC,
> > + * etc. offset/length specify a slice of the region to create the dmabuf from.
> > + * nr_ranges is the total number of (P2P DMA) ranges that comprise the
> > dmabuf.
> Any particular reason why you dropped the option (nr_ranges) of creating a
> single dmabuf from multiple ranges of an MMIO region?

I did it for two reasons. First, I wanted to simplify the code in order
to speed-up discussion over the patchset itself. Second, I failed to
find justification for need of multiple ranges, as the number of BARs
are limited by VFIO_PCI_ROM_REGION_INDEX (6) and same functionality
can be achieved by multiple calls to DMABUF import.

> 
> Restricting the dmabuf to a single range (or having to create multiple dmabufs
> to represent multiple regions/ranges associated with a single scattered buffer)
> would be very limiting and may not work in all cases. For instance, in my use-case,
> I am trying to share a large (4k mode) framebuffer (FB) located in GPU's VRAM
> between two (p2p compatible) GPU devices. And, this would probably not work
> given that allocating a large contiguous FB (nr_ranges = 1) in VRAM may not be
> feasible when there is memory pressure.

Can you please help me and point to the place in the code where this can fail?
I'm probably missing something basic as there are no large allocations
in the current patchset.

> 
> Furthermore, since you are adding a new UAPI with this patch/feature, as you know,
> we cannot go back and tweak it (to add support for nr_ranges > 1) should there
> be a need in the future, but you can always use nr_ranges = 1 anytime. Therefore,
> I think it makes sense to be flexible in terms of the number of ranges to include
> while creating a dmabuf instead of restricting ourselves to one range.

I'm not a big fan of over-engineering. Let's first understand if this
case is needed.

Thanks

> 
> Thanks,
> Vivek
> 
> > + *
> > + * Return: The fd number on success, -1 and errno is set on failure.
> > + */
> > +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> > +
> > +struct vfio_device_feature_dma_buf {
> > +	__u32	region_index;
> > +	__u32	open_flags;
> > +	__u64	offset;
> > +	__u64	length;
> > +};
> > +
> >  /* -------- API for Type1 VFIO IOMMU -------- */
> > 
> >  /**
> > --
> > 2.50.1
>

RE: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO regions

Posted by Kasireddy, Vivek 2 months, 1 week ago

Hi Leon,

> Subject: Re: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO
> regions
> 
> > >
> > > From: Leon Romanovsky <leonro@nvidia.com>
> > >
> > > Add support for exporting PCI device MMIO regions through dma-buf,
> > > enabling safe sharing of non-struct page memory with controlled
> > > lifetime management. This allows RDMA and other subsystems to
> import
> > > dma-buf FDs and build them into memory regions for PCI P2P
> operations.
> > >
> > > The implementation provides a revocable attachment mechanism using
> > > dma-buf move operations. MMIO regions are normally pinned as BARs
> > > don't change physical addresses, but access is revoked when the VFIO
> > > device is closed or a PCI reset is issued. This ensures kernel
> > > self-defense against potentially hostile userspace.
> > >
> > > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > > ---
> > >  drivers/vfio/pci/Kconfig           |  20 ++
> > >  drivers/vfio/pci/Makefile          |   2 +
> > >  drivers/vfio/pci/vfio_pci_config.c |  22 +-
> > >  drivers/vfio/pci/vfio_pci_core.c   |  25 ++-
> > >  drivers/vfio/pci/vfio_pci_dmabuf.c | 321
> +++++++++++++++++++++++++++++
> > >  drivers/vfio/pci/vfio_pci_priv.h   |  23 +++
> > >  include/linux/dma-buf.h            |   1 +
> > >  include/linux/vfio_pci_core.h      |   3 +
> > >  include/uapi/linux/vfio.h          |  19 ++
> > >  9 files changed, 431 insertions(+), 5 deletions(-)
> > >  create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c
> 
> <...>
> 
> > > +static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
> > > +				 struct vfio_device_feature_dma_buf
> *dma_buf)
> > > +{
> > > +	struct pci_dev *pdev = vdev->pdev;
> > > +	u32 bar = dma_buf->region_index;
> > > +	u64 offset = dma_buf->offset;
> > > +	u64 len = dma_buf->length;
> > > +	resource_size_t bar_size;
> > > +	u64 sum;
> > > +
> > > +	/*
> > > +	 * For PCI the region_index is the BAR number like  everything else.
> > > +	 */
> > > +	if (bar >= VFIO_PCI_ROM_REGION_INDEX)
> > > +		return -ENODEV;
> 
> <...>
> 
> > > +/**
> > > + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> > > + * regions selected.
> > > + *
> > > + * open_flags are the typical flags passed to open(2), eg O_RDWR,
> > > O_CLOEXEC,
> > > + * etc. offset/length specify a slice of the region to create the dmabuf
> from.
> > > + * nr_ranges is the total number of (P2P DMA) ranges that comprise the
> > > dmabuf.
> > Any particular reason why you dropped the option (nr_ranges) of creating
> a
> > single dmabuf from multiple ranges of an MMIO region?
> 
> I did it for two reasons. First, I wanted to simplify the code in order
> to speed-up discussion over the patchset itself. Second, I failed to
> find justification for need of multiple ranges, as the number of BARs
> are limited by VFIO_PCI_ROM_REGION_INDEX (6) and same functionality
> can be achieved by multiple calls to DMABUF import.
I don't think the same functionality can be achieved by multiple calls to
dmabuf import. AFAIU, a dmabuf (as of today) is backed by a SGL that can
have multiple entries because it represents a scattered buffer (multiple
non-contiguous entries in System RAM or an MMIO region). But in this
patch you are constraining it such that only one entry associated with a
buffer would be included, which effectively means that we cannot create
a dmabuf to represent scattered buffers (located in a single MMIO region
such as VRAM or other device memory) anymore. 

> 
> >
> > Restricting the dmabuf to a single range (or having to create multiple
> dmabufs
> > to represent multiple regions/ranges associated with a single scattered
> buffer)
> > would be very limiting and may not work in all cases. For instance, in my
> use-case,
> > I am trying to share a large (4k mode) framebuffer (FB) located in GPU's
> VRAM
> > between two (p2p compatible) GPU devices. And, this would probably not
> work
> > given that allocating a large contiguous FB (nr_ranges = 1) in VRAM may
> not be
> > feasible when there is memory pressure.
> 
> Can you please help me and point to the place in the code where this can
> fail?
> I'm probably missing something basic as there are no large allocations
> in the current patchset.
Sorry, I was not very clear. What I meant is that it is not prudent to assume that
there will only be one range associated with an MMIO region which we need to
consider while creating a dmabuf. And, I was pointing out my use-case as an
example where vfio-pci needs to create a dmabuf for a large buffer (FB) that
would likely be scattered (and not contiguous) in an MMIO region (such as VRAM).

Let me further explain with my use-case. Here is a link to my Qemu-based test:
https://gitlab.freedesktop.org/Vivek/qemu/-/commit/b2bdb16d9cfaf55384c95b1f060f175ad1c89e95#81dc845f0babf39649c4e086e173375614111b4a_29_46

While exhaustively testing this case, I noticed that the Guest VM (GPU driver)
would occasionally create the buffer (represented by virtio_gpu_simple_resource,
for which we need to create a dmabuf) in such a way that there are multiple
entries (indicated by res->iov_cnt) that need to be included. This is the main
reason why I added support for nr_ranges > 1 to this patch/feature.

Furthermore, creating multiple dmabufs to represent each range of the same
buffer, like you suggest IIUC is suboptimal and does not align with how dmabuf
works currently.

> 
> >
> > Furthermore, since you are adding a new UAPI with this patch/feature, as
> you know,
> > we cannot go back and tweak it (to add support for nr_ranges > 1) should
> there
> > be a need in the future, but you can always use nr_ranges = 1 anytime.
> Therefore,
> > I think it makes sense to be flexible in terms of the number of ranges to
> include
> > while creating a dmabuf instead of restricting ourselves to one range.
> 
> I'm not a big fan of over-engineering. Let's first understand if this
> case is needed.
As explained above with my use-case, having support for nr_ranges > 1 is not
just nice to have but absolutely necessary. Otherwise, this feature would be
constrained to creating dmabufs for contiguous buffers (nr_ranges = 1) only,
which would limit its effectiveness as most GPU buffers are rarely contiguous.

Thanks,
Vivek

> 
> Thanks
> 
> >
> > Thanks,
> > Vivek
> >
> > > + *
> > > + * Return: The fd number on success, -1 and errno is set on failure.
> > > + */
> > > +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> > > +
> > > +struct vfio_device_feature_dma_buf {
> > > +	__u32	region_index;
> > > +	__u32	open_flags;
> > > +	__u64	offset;
> > > +	__u64	length;
> > > +};
> > > +
> > >  /* -------- API for Type1 VFIO IOMMU -------- */
> > >
> > >  /**
> > > --
> > > 2.50.1
> >

Re: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO regions

Posted by Leon Romanovsky 2 months, 1 week ago

On Fri, Jul 25, 2025 at 05:34:40AM +0000, Kasireddy, Vivek wrote:
> Hi Leon,
> 
> > Subject: Re: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO
> > regions
> > 
> > > >
> > > > From: Leon Romanovsky <leonro@nvidia.com>
> > > >
> > > > Add support for exporting PCI device MMIO regions through dma-buf,
> > > > enabling safe sharing of non-struct page memory with controlled
> > > > lifetime management. This allows RDMA and other subsystems to
> > import
> > > > dma-buf FDs and build them into memory regions for PCI P2P
> > operations.
> > > >
> > > > The implementation provides a revocable attachment mechanism using
> > > > dma-buf move operations. MMIO regions are normally pinned as BARs
> > > > don't change physical addresses, but access is revoked when the VFIO
> > > > device is closed or a PCI reset is issued. This ensures kernel
> > > > self-defense against potentially hostile userspace.
> > > >
> > > > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > > > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> > > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > > > ---
> > > >  drivers/vfio/pci/Kconfig           |  20 ++
> > > >  drivers/vfio/pci/Makefile          |   2 +
> > > >  drivers/vfio/pci/vfio_pci_config.c |  22 +-
> > > >  drivers/vfio/pci/vfio_pci_core.c   |  25 ++-
> > > >  drivers/vfio/pci/vfio_pci_dmabuf.c | 321
> > +++++++++++++++++++++++++++++
> > > >  drivers/vfio/pci/vfio_pci_priv.h   |  23 +++
> > > >  include/linux/dma-buf.h            |   1 +
> > > >  include/linux/vfio_pci_core.h      |   3 +
> > > >  include/uapi/linux/vfio.h          |  19 ++
> > > >  9 files changed, 431 insertions(+), 5 deletions(-)
> > > >  create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c
> > 
> > <...>
> > 
> > > > +static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
> > > > +				 struct vfio_device_feature_dma_buf
> > *dma_buf)
> > > > +{
> > > > +	struct pci_dev *pdev = vdev->pdev;
> > > > +	u32 bar = dma_buf->region_index;
> > > > +	u64 offset = dma_buf->offset;
> > > > +	u64 len = dma_buf->length;
> > > > +	resource_size_t bar_size;
> > > > +	u64 sum;
> > > > +
> > > > +	/*
> > > > +	 * For PCI the region_index is the BAR number like  everything else.
> > > > +	 */
> > > > +	if (bar >= VFIO_PCI_ROM_REGION_INDEX)
> > > > +		return -ENODEV;
> > 
> > <...>
> > 
> > > > +/**
> > > > + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> > > > + * regions selected.
> > > > + *
> > > > + * open_flags are the typical flags passed to open(2), eg O_RDWR,
> > > > O_CLOEXEC,
> > > > + * etc. offset/length specify a slice of the region to create the dmabuf
> > from.
> > > > + * nr_ranges is the total number of (P2P DMA) ranges that comprise the
> > > > dmabuf.
> > > Any particular reason why you dropped the option (nr_ranges) of creating
> > a
> > > single dmabuf from multiple ranges of an MMIO region?
> > 
> > I did it for two reasons. First, I wanted to simplify the code in order
> > to speed-up discussion over the patchset itself. Second, I failed to
> > find justification for need of multiple ranges, as the number of BARs
> > are limited by VFIO_PCI_ROM_REGION_INDEX (6) and same functionality
> > can be achieved by multiple calls to DMABUF import.
> I don't think the same functionality can be achieved by multiple calls to
> dmabuf import. AFAIU, a dmabuf (as of today) is backed by a SGL that can
> have multiple entries because it represents a scattered buffer (multiple
> non-contiguous entries in System RAM or an MMIO region). 

I don't know all the reasons why SG was chosen, but one of the main
reasons is that DMA SG API was the only one possible way to handle p2p
transfers (peer2peer flag).


> But in this patch you are constraining it such that only one entry associated with a
> buffer would be included, which effectively means that we cannot create
> a dmabuf to represent scattered buffers (located in a single MMIO region
> such as VRAM or other device memory) anymore. 

Yes

> 
> > 
> > >
> > > Restricting the dmabuf to a single range (or having to create multiple
> > dmabufs
> > > to represent multiple regions/ranges associated with a single scattered
> > buffer)
> > > would be very limiting and may not work in all cases. For instance, in my
> > use-case,
> > > I am trying to share a large (4k mode) framebuffer (FB) located in GPU's
> > VRAM
> > > between two (p2p compatible) GPU devices. And, this would probably not
> > work
> > > given that allocating a large contiguous FB (nr_ranges = 1) in VRAM may
> > not be
> > > feasible when there is memory pressure.
> > 
> > Can you please help me and point to the place in the code where this can
> > fail?
> > I'm probably missing something basic as there are no large allocations
> > in the current patchset.
> Sorry, I was not very clear. What I meant is that it is not prudent to assume that
> there will only be one range associated with an MMIO region which we need to
> consider while creating a dmabuf. And, I was pointing out my use-case as an
> example where vfio-pci needs to create a dmabuf for a large buffer (FB) that
> would likely be scattered (and not contiguous) in an MMIO region (such as VRAM).
> 
> Let me further explain with my use-case. Here is a link to my Qemu-based test:
> https://gitlab.freedesktop.org/Vivek/qemu/-/commit/b2bdb16d9cfaf55384c95b1f060f175ad1c89e95#81dc845f0babf39649c4e086e173375614111b4a_29_46

Ohh, thanks. I'll add nr_ranges in next version. I see that you are
using same region_index for all ranges and this is how I would like to
keep it: "multiple nr_ranges, same region_index".

Thanks