From: Leon Romanovsky <leonro@nvidia.com>
Add support for exporting PCI device MMIO regions through dma-buf,
enabling safe sharing of non-struct page memory with controlled
lifetime management. This allows RDMA and other subsystems to import
dma-buf FDs and build them into memory regions for PCI P2P operations.
The implementation provides a revocable attachment mechanism using
dma-buf move operations. MMIO regions are normally pinned as BARs
don't change physical addresses, but access is revoked when the VFIO
device is closed or a PCI reset is issued. This ensures kernel
self-defense against potentially hostile userspace.
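
A hypothetical userspace flow for creating such a dmabuf (illustrative
only, not part of this patch; the struct layout follows the uapi added
below, and a 4 KiB page size is assumed):

#include <fcntl.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int get_bar0_dmabuf(int device_fd)
{
	struct vfio_device_feature *feature;
	struct vfio_device_feature_dma_buf *get_dma_buf;
	size_t argsz = sizeof(*feature) + sizeof(*get_dma_buf) +
		       2 * sizeof(struct vfio_region_dma_range);
	int fd;

	feature = calloc(1, argsz);
	if (!feature)
		return -1;

	feature->argsz = argsz;
	feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_BUF;

	get_dma_buf = (void *)feature->data;
	get_dma_buf->region_index = 0;		/* BAR 0 */
	get_dma_buf->open_flags = O_RDWR | O_CLOEXEC;
	get_dma_buf->nr_ranges = 2;
	/* Two page-aligned slices of the BAR, assuming 4 KiB pages */
	get_dma_buf->dma_ranges[0].offset = 0;
	get_dma_buf->dma_ranges[0].length = 4096;
	get_dma_buf->dma_ranges[1].offset = 4096;
	get_dma_buf->dma_ranges[1].length = 4096;

	/* Returns the dmabuf fd on success, -1 with errno set on failure */
	fd = ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
	free(feature);
	return fd;
}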
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/vfio/pci/Makefile | 2 +
drivers/vfio/pci/vfio_pci_config.c | 22 +-
drivers/vfio/pci/vfio_pci_core.c | 17 ++
drivers/vfio/pci/vfio_pci_dmabuf.c | 398 +++++++++++++++++++++++++++++
drivers/vfio/pci/vfio_pci_priv.h | 23 ++
include/linux/vfio_pci_core.h | 3 +
include/uapi/linux/vfio.h | 25 ++
7 files changed, 486 insertions(+), 4 deletions(-)
create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index cf00c0a7e55c..f9155e9c5f63 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -2,7 +2,9 @@
vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
+vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
+
obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
vfio-pci-y := vfio_pci.o
vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 8f02f236b5b4..1f6008eabf23 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
- if (!new_mem)
+ if (!new_mem) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
- else
+ vfio_pci_dma_buf_move(vdev, true);
+ } else {
down_write(&vdev->memory_lock);
+ }
/*
* If the user is writing mem/io enable (new_mem/io) and we
@@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
*virt_cmd &= cpu_to_le16(~mask);
*virt_cmd |= cpu_to_le16(new_cmd & mask);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
@@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev,
pci_power_t state)
{
- if (state >= PCI_D3hot)
+ if (state >= PCI_D3hot) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
- else
+ vfio_pci_dma_buf_move(vdev, true);
+ } else {
down_write(&vdev->memory_lock);
+ }
vfio_pci_set_power_state(vdev, state);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
@@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos,
if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
+ vfio_pci_dma_buf_move(vdev, true);
pci_try_reset_function(vdev->pdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
}
@@ -982,7 +993,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos,
if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
+ vfio_pci_dma_buf_move(vdev, true);
pci_try_reset_function(vdev->pdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
}
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 0c39368280d7..aa88c42db69b 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -289,6 +289,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
* semaphore.
*/
vfio_pci_zap_and_down_write_memory_lock(vdev);
+ vfio_pci_dma_buf_move(vdev, true);
+
if (vdev->pm_runtime_engaged) {
up_write(&vdev->memory_lock);
return -EINVAL;
@@ -372,6 +374,8 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
*/
down_write(&vdev->memory_lock);
__vfio_pci_runtime_pm_exit(vdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
@@ -692,6 +696,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
#endif
vfio_pci_core_disable(vdev);
+ vfio_pci_dma_buf_cleanup(vdev);
+
mutex_lock(&vdev->igate);
if (vdev->err_trigger) {
eventfd_ctx_put(vdev->err_trigger);
@@ -1224,7 +1230,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
*/
vfio_pci_set_power_state(vdev, PCI_D0);
+ vfio_pci_dma_buf_move(vdev, true);
ret = pci_try_reset_function(vdev->pdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
return ret;
@@ -1513,6 +1522,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
+ case VFIO_DEVICE_FEATURE_DMA_BUF:
+ return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
default:
return -ENOTTY;
}
@@ -2098,6 +2109,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
ret = pcim_p2pdma_init(vdev->pdev);
if (ret)
return ret;
+ INIT_LIST_HEAD(&vdev->dmabufs);
#endif
init_rwsem(&vdev->memory_lock);
xa_init(&vdev->ctx);
@@ -2463,6 +2475,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
break;
}
+ vfio_pci_dma_buf_move(vdev, true);
vfio_pci_zap_bars(vdev);
}
@@ -2486,6 +2499,10 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
ret = pci_reset_bus(pdev);
+ list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
+
vdev = list_last_entry(&dev_set->device_list,
struct vfio_pci_core_device, vdev.dev_set_list);
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
new file mode 100644
index 000000000000..838619f812aa
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -0,0 +1,398 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ */
+#include <linux/dma-buf.h>
+#include <linux/pci-p2pdma.h>
+#include <linux/dma-resv.h>
+
+#include "vfio_pci_priv.h"
+
+MODULE_IMPORT_NS("DMA_BUF");
+
+struct vfio_pci_dma_buf {
+ struct dma_buf *dmabuf;
+ struct vfio_pci_core_device *vdev;
+ struct list_head dmabufs_elm;
+ size_t size;
+ struct phys_vec *phys_vec;
+ struct p2pdma_provider *provider;
+ u32 nr_ranges;
+ u8 revoked : 1;
+};
+
+static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+ if (!attachment->peer2peer)
+ return -EOPNOTSUPP;
+
+ if (priv->revoked)
+ return -ENODEV;
+
+ switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) {
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ /*
+ * There is no need for an IOVA at all in this flow.
+ * We rely on attachment->priv == NULL as a marker
+ * for this mode.
+ */
+ return 0;
+ default:
+ return -EINVAL;
+ }
+
+ attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL);
+ if (!attachment->priv)
+ return -ENOMEM;
+
+ dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->size);
+ return 0;
+}
+
+static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+ kfree(attachment->priv);
+}
+
+static void fill_sg_entry(struct scatterlist *sgl, unsigned int length,
+ dma_addr_t addr)
+{
+ /*
+ * Follow the DMABUF rules for scatterlist, the struct page can be
+ * NULL'd for MMIO only memort.
+ */
+ sg_set_page(sgl, NULL, length, 0);
+ sg_dma_address(sgl) = addr;
+ sg_dma_len(sgl) = length;
+}
+
+static struct sg_table *
+vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
+ enum dma_data_direction dir)
+{
+ struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+ struct dma_iova_state *state = attachment->priv;
+ struct phys_vec *phys_vec = priv->phys_vec;
+ unsigned long attrs = DMA_ATTR_MMIO;
+ unsigned int mapped_len = 0;
+ struct scatterlist *sgl;
+ struct sg_table *sgt;
+ dma_addr_t addr;
+ int ret, i;
+
+ dma_resv_assert_held(priv->dmabuf->resv);
+
+ if (priv->revoked)
+ return ERR_PTR(-ENODEV);
+
+ sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
+ if (!sgt)
+ return ERR_PTR(-ENOMEM);
+
+ /* The IOVA path collapses all ranges into a single SG entry */
+ ret = sg_alloc_table(sgt, (state && dma_use_iova(state)) ?
+ 1 : priv->nr_ranges, GFP_KERNEL | __GFP_ZERO);
+ if (ret)
+ goto err_kfree_sgt;
+
+ sgl = sgt->sgl;
+
+ for (i = 0; i < priv->nr_ranges; i++) {
+ if (!state) {
+ addr = pci_p2pdma_bus_addr_map(priv->provider,
+ phys_vec[i].paddr);
+ } else if (dma_use_iova(state)) {
+ ret = dma_iova_link(attachment->dev, state,
+ phys_vec[i].paddr, 0,
+ phys_vec[i].len, dir, attrs);
+ if (ret)
+ goto err_unmap_dma;
+
+ mapped_len += phys_vec[i].len;
+ } else {
+ addr = dma_map_phys(attachment->dev, phys_vec[i].paddr,
+ phys_vec[i].len, dir, attrs);
+ ret = dma_mapping_error(attachment->dev, addr);
+ if (ret)
+ goto err_unmap_dma;
+ }
+
+ if (!state || !dma_use_iova(state)) {
+ /*
+ * In the IOVA case, there is only one SG entry, which
+ * spans the whole IOVA range, so there is no need to
+ * call sg_next() here.
+ */
+ fill_sg_entry(sgl, phys_vec[i].len, addr);
+ sgl = sg_next(sgl);
+ }
+ }
+
+ if (state && dma_use_iova(state)) {
+ WARN_ON_ONCE(mapped_len != priv->size);
+ ret = dma_iova_sync(attachment->dev, state, 0, mapped_len);
+ if (ret)
+ goto err_unmap_dma;
+ fill_sg_entry(sgl, mapped_len, state->addr);
+ }
+
+ return sgt;
+
+err_unmap_dma:
+ if (!i || !state)
+ ; /* Do nothing */
+ else if (dma_use_iova(state))
+ dma_iova_destroy(attachment->dev, state, mapped_len, dir,
+ attrs);
+ else
+ for_each_sgtable_dma_sg(sgt, sgl, i)
+ dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
+ sg_dma_len(sgl), dir, attrs);
+ sg_free_table(sgt);
+err_kfree_sgt:
+ kfree(sgt);
+ return ERR_PTR(ret);
+}
+
+static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
+ struct sg_table *sgt,
+ enum dma_data_direction dir)
+{
+ struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+ struct dma_iova_state *state = attachment->priv;
+ unsigned long attrs = DMA_ATTR_MMIO;
+ struct scatterlist *sgl;
+ int i;
+
+ if (!state)
+ ; /* Do nothing */
+ else if (dma_use_iova(state))
+ dma_iova_destroy(attachment->dev, state, priv->size, dir,
+ attrs);
+ else
+ for_each_sgtable_dma_sg(sgt, sgl, i)
+ dma_unmap_phys(attachment->dev, sg_dma_address(sgl),
+ sg_dma_len(sgl), dir, attrs);
+
+ sg_free_table(sgt);
+ kfree(sgt);
+}
+
+static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+ /*
+ * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
+ * The refcount prevents both.
+ */
+ if (priv->vdev) {
+ down_write(&priv->vdev->memory_lock);
+ list_del_init(&priv->dmabufs_elm);
+ up_write(&priv->vdev->memory_lock);
+ vfio_device_put_registration(&priv->vdev->vdev);
+ }
+ kfree(priv->phys_vec);
+ kfree(priv);
+}
+
+static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
+ .attach = vfio_pci_dma_buf_attach,
+ .detach = vfio_pci_dma_buf_detach,
+ .map_dma_buf = vfio_pci_dma_buf_map,
+ .release = vfio_pci_dma_buf_release,
+ .unmap_dma_buf = vfio_pci_dma_buf_unmap,
+};
+
+static void dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
+ struct vfio_device_feature_dma_buf *dma_buf,
+ struct vfio_region_dma_range *dma_ranges,
+ struct p2pdma_provider *provider)
+{
+ struct pci_dev *pdev = priv->vdev->pdev;
+ phys_addr_t pci_start;
+ int i;
+
+ pci_start = pci_resource_start(pdev, dma_buf->region_index);
+ for (i = 0; i < dma_buf->nr_ranges; i++) {
+ priv->phys_vec[i].len = dma_ranges[i].length;
+ priv->phys_vec[i].paddr = pci_start + dma_ranges[i].offset;
+ priv->size += priv->phys_vec[i].len;
+ }
+ priv->nr_ranges = dma_buf->nr_ranges;
+ priv->provider = provider;
+}
+
+static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
+ struct vfio_device_feature_dma_buf *dma_buf,
+ struct vfio_region_dma_range *dma_ranges,
+ struct p2pdma_provider **provider)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ u32 bar = dma_buf->region_index;
+ resource_size_t bar_size;
+ u64 sum;
+ int i;
+
+ if (dma_buf->flags)
+ return -EINVAL;
+ /*
+ * For PCI the region_index is the BAR number like everything else.
+ */
+ if (bar >= VFIO_PCI_ROM_REGION_INDEX)
+ return -ENODEV;
+
+ *provider = pcim_p2pdma_provider(pdev, bar);
+ if (!provider)
+ return -EINVAL;
+
+ bar_size = pci_resource_len(pdev, bar);
+ for (i = 0; i < dma_buf->nr_ranges; i++) {
+ u64 offset = dma_ranges[i].offset;
+ u64 len = dma_ranges[i].length;
+
+ if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+ return -EINVAL;
+
+ if (check_add_overflow(offset, len, &sum) || sum > bar_size)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf __user *arg,
+ size_t argsz)
+{
+ struct vfio_device_feature_dma_buf get_dma_buf = {};
+ struct vfio_region_dma_range *dma_ranges;
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+ struct p2pdma_provider *provider;
+ struct vfio_pci_dma_buf *priv;
+ int ret;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+ sizeof(get_dma_buf));
+ if (ret != 1)
+ return ret;
+
+ if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
+ return -EFAULT;
+
+ if (!get_dma_buf.nr_ranges)
+ return -EINVAL;
+
+ dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges,
+ sizeof(*dma_ranges));
+ if (IS_ERR(dma_ranges))
+ return PTR_ERR(dma_ranges);
+
+ ret = validate_dmabuf_input(vdev, &get_dma_buf, dma_ranges, &provider);
+ if (ret)
+ return ret;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv) {
+ ret = -ENOMEM;
+ goto err_free_ranges;
+ }
+ priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec),
+ GFP_KERNEL);
+ if (!priv->phys_vec) {
+ ret = -ENOMEM;
+ goto err_free_priv;
+ }
+
+ priv->vdev = vdev;
+ dma_ranges_to_p2p_phys(priv, &get_dma_buf, dma_ranges, provider);
+ kfree(dma_ranges);
+ dma_ranges = NULL;
+
+ if (!vfio_device_try_get_registration(&vdev->vdev)) {
+ ret = -ENODEV;
+ goto err_free_phys;
+ }
+
+ exp_info.ops = &vfio_pci_dmabuf_ops;
+ exp_info.size = priv->size;
+ exp_info.flags = get_dma_buf.open_flags;
+ exp_info.priv = priv;
+
+ priv->dmabuf = dma_buf_export(&exp_info);
+ if (IS_ERR(priv->dmabuf)) {
+ ret = PTR_ERR(priv->dmabuf);
+ goto err_dev_put;
+ }
+
+ /* dma_buf_put() now frees priv */
+ INIT_LIST_HEAD(&priv->dmabufs_elm);
+ down_write(&vdev->memory_lock);
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ priv->revoked = !__vfio_pci_memory_enabled(vdev);
+ list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
+ dma_resv_unlock(priv->dmabuf->resv);
+ up_write(&vdev->memory_lock);
+
+ /*
+ * dma_buf_fd() consumes the reference, when the file closes the dmabuf
+ * will be released.
+ */
+ return dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
+
+err_dev_put:
+ vfio_device_put_registration(&vdev->vdev);
+err_free_phys:
+ kfree(priv->phys_vec);
+err_free_priv:
+ kfree(priv);
+err_free_ranges:
+ kfree(dma_ranges);
+ return ret;
+}
+
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
+{
+ struct vfio_pci_dma_buf *priv;
+ struct vfio_pci_dma_buf *tmp;
+
+ lockdep_assert_held_write(&vdev->memory_lock);
+
+ list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+ if (!get_file_active(&priv->dmabuf->file))
+ continue;
+
+ if (priv->revoked != revoked) {
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ priv->revoked = revoked;
+ dma_buf_move_notify(priv->dmabuf);
+ dma_resv_unlock(priv->dmabuf->resv);
+ }
+ dma_buf_put(priv->dmabuf);
+ }
+}
+
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+ struct vfio_pci_dma_buf *priv;
+ struct vfio_pci_dma_buf *tmp;
+
+ down_write(&vdev->memory_lock);
+ list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+ if (!get_file_active(&priv->dmabuf->file))
+ continue;
+
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ list_del_init(&priv->dmabufs_elm);
+ priv->vdev = NULL;
+ priv->revoked = true;
+ dma_buf_move_notify(priv->dmabuf);
+ dma_resv_unlock(priv->dmabuf->resv);
+ vfio_device_put_registration(&vdev->vdev);
+ dma_buf_put(priv->dmabuf);
+ }
+ up_write(&vdev->memory_lock);
+}
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index a9972eacb293..28a405f8b97c 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -107,4 +107,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
}
+#ifdef CONFIG_VFIO_PCI_DMABUF
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf __user *arg,
+ size_t argsz);
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
+#else
+static inline int
+vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
+ struct vfio_device_feature_dma_buf __user *arg,
+ size_t argsz)
+{
+ return -ENOTTY;
+}
+static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+}
+static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev,
+ bool revoked)
+{
+}
+#endif
+
#endif
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index f541044e42a2..68afa18630d4 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -94,6 +94,9 @@ struct vfio_pci_core_device {
struct vfio_pci_core_device *sriov_pf_core_dev;
struct notifier_block nb;
struct rw_semaphore memory_lock;
+#ifdef CONFIG_VFIO_PCI_DMABUF
+ struct list_head dmabufs;
+#endif
};
/* Will be exported for vfio pci drivers usage */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 75100bf009ba..63214467c875 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1478,6 +1478,31 @@ struct vfio_device_feature_bus_master {
};
#define VFIO_DEVICE_FEATURE_BUS_MASTER 10
+/**
+ * Upon VFIO_DEVICE_FEATURE_GET, create a dma_buf fd for the
+ * selected region.
+ *
+ * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
+ * etc. Each dma_ranges[] offset/length pair selects a slice of the region;
+ * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
+ *
+ * Return: The fd number on success, -1 and errno is set on failure.
+ */
+#define VFIO_DEVICE_FEATURE_DMA_BUF 11
+
+struct vfio_region_dma_range {
+ __u64 offset;
+ __u64 length;
+};
+
+struct vfio_device_feature_dma_buf {
+ __u32 region_index;
+ __u32 open_flags;
+ __u32 flags;
+ __u32 nr_ranges;
+ struct vfio_region_dma_range dma_ranges[];
+};
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**
--
2.51.0
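
For context, a hypothetical sketch of the importer side — how an RDMA
driver might attach to such a dmabuf and honor revocation. The
my_importer_* names are invented for illustration; the dma-buf calls
are the dynamic-importer API this exporter expects:

#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <linux/dma-direction.h>

static void my_importer_move_notify(struct dma_buf_attachment *attach)
{
	/*
	 * Called with the dmabuf's resv lock held whenever VFIO revokes
	 * or restores access; quiesce HW and drop the old mapping here.
	 */
}

static const struct dma_buf_attach_ops my_importer_attach_ops = {
	.allow_peer2peer = true,	/* the exporter rejects !peer2peer */
	.move_notify = my_importer_move_notify,
};

static struct sg_table *my_importer_map(struct device *dev,
					struct dma_buf *dmabuf)
{
	struct dma_buf_attachment *attach;
	struct sg_table *sgt;

	attach = dma_buf_dynamic_attach(dmabuf, dev,
					&my_importer_attach_ops, NULL);
	if (IS_ERR(attach))
		return ERR_CAST(attach);

	/* Dynamic importers must map under the reservation lock */
	dma_resv_lock(dmabuf->resv, NULL);
	sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
	dma_resv_unlock(dmabuf->resv);
	if (IS_ERR(sgt))
		dma_buf_detach(dmabuf, attach);

	/* On success, program HW from sg_dma_address()/sg_dma_len() */
	return sgt;
}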
On Sun, 28 Sep 2025 17:50:20 +0300
Leon Romanovsky <leon@kernel.org> wrote:

> +static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
> +				 struct vfio_device_feature_dma_buf *dma_buf,
> +				 struct vfio_region_dma_range *dma_ranges,
> +				 struct p2pdma_provider **provider)
> +{
> +	struct pci_dev *pdev = vdev->pdev;
> +	u32 bar = dma_buf->region_index;
> +	resource_size_t bar_size;
> +	u64 sum;
> +	int i;
> +
> +	if (dma_buf->flags)
> +		return -EINVAL;
> +	/*
> +	 * For PCI the region_index is the BAR number like everything else.
> +	 */
> +	if (bar >= VFIO_PCI_ROM_REGION_INDEX)
> +		return -ENODEV;
> +
> +	*provider = pcim_p2pdma_provider(pdev, bar);
> +	if (!provider)

This needs to be IS_ERR_OR_NULL() or the function needs to settle on a
consistent error return value regardless of CONFIG_PCI_P2PDMA.

> +		return -EINVAL;
> +
> +	bar_size = pci_resource_len(pdev, bar);

We get to this feature via vfio_pci_core_ioctl_feature(), which is used
by several variant drivers, some of which mangle the BAR size exposed
to the user, ex. hisi_acc. I'm afraid this might actually be giving
dmabuf access to a portion of the BAR that isn't exposed otherwise.

> +	for (i = 0; i < dma_buf->nr_ranges; i++) {
> +		u64 offset = dma_ranges[i].offset;
> +		u64 len = dma_ranges[i].length;
> +
> +		if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> +			return -EINVAL;
> +
> +		if (check_add_overflow(offset, len, &sum) || sum > bar_size)
> +			return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> +				  struct vfio_device_feature_dma_buf __user *arg,
> +				  size_t argsz)
> +{
> +	struct vfio_device_feature_dma_buf get_dma_buf = {};
> +	struct vfio_region_dma_range *dma_ranges;
> +	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> +	struct p2pdma_provider *provider;
> +	struct vfio_pci_dma_buf *priv;
> +	int ret;
> +
> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(get_dma_buf));
> +	if (ret != 1)
> +		return ret;
> +
> +	if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
> +		return -EFAULT;
> +
> +	if (!get_dma_buf.nr_ranges)
> +		return -EINVAL;
> +
> +	dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges,
> +				       sizeof(*dma_ranges));
> +	if (IS_ERR(dma_ranges))
> +		return PTR_ERR(dma_ranges);
> +
> +	ret = validate_dmabuf_input(vdev, &get_dma_buf, dma_ranges, &provider);
> +	if (ret)
> +		return ret;

goto err_free_ranges;

Thanks,
Alex
On Mon, Sep 29, 2025 at 03:17:49PM -0600, Alex Williamson wrote:
> On Sun, 28 Sep 2025 17:50:20 +0300
> Leon Romanovsky <leon@kernel.org> wrote:
> > +static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
> > +				 struct vfio_device_feature_dma_buf *dma_buf,
> > +				 struct vfio_region_dma_range *dma_ranges,
> > +				 struct p2pdma_provider **provider)
> > +{
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	u32 bar = dma_buf->region_index;
> > +	resource_size_t bar_size;
> > +	u64 sum;
> > +	int i;
> > +
> > +	if (dma_buf->flags)
> > +		return -EINVAL;
> > +	/*
> > +	 * For PCI the region_index is the BAR number like everything else.
> > +	 */
> > +	if (bar >= VFIO_PCI_ROM_REGION_INDEX)
> > +		return -ENODEV;
> > +
> > +	*provider = pcim_p2pdma_provider(pdev, bar);
> > +	if (!provider)
>
> This needs to be IS_ERR_OR_NULL() or the function needs to settle on a
> consistent error return value regardless of CONFIG_PCI_P2PDMA.

pcim_p2pdma_provider() doesn't return errors after the split to _init()
and _get(). The more accurate check needs to be if (!*provider) and not
what is written.

>
> > +		return -EINVAL;
> > +
> > +	bar_size = pci_resource_len(pdev, bar);
>
> We get to this feature via vfio_pci_core_ioctl_feature(), which is used
> by several variant drivers, some of which mangle the BAR size exposed
> to the user, ex. hisi_acc. I'm afraid this might actually be giving
> dmabuf access to a portion of the BAR that isn't exposed otherwise.

Do you mean that part?

1185 static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
1186 {

...

1204	 * Also the HiSilicon ACC VF devices supported by this driver on
1205	 * HiSilicon hardware platforms are integrated end point devices
1206	 * and the platform lacks the capability to perform any PCIe P2P
1207	 * between these devices.
1208	 */
1209
1210	vf_qm->io_base =
1211		ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX),
1212			pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX));
1213	if (!vf_qm->io_base)
1214		return -EIO;
1215

According to the comment, it doesn't support p2p and in any case we will
fail that platform in vfio_pci_dma_buf_attach() by taking the "default"
case:

  34	switch (pci_p2pdma_map_type(priv->provider, attachment->dev)) {
  35	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
  36		break;
  37	case PCI_P2PDMA_MAP_BUS_ADDR:
  38		/*
  39		 * There is no need in IOVA at all for this flow.
  40		 * We rely on attachment->priv == NULL as a marker
  41		 * for this mode.
  42		 */
  43		return 0;
  44	default:
  45		return -EINVAL;
  46	}
  47

>
> > +	for (i = 0; i < dma_buf->nr_ranges; i++) {
> > +		u64 offset = dma_ranges[i].offset;
> > +		u64 len = dma_ranges[i].length;
> > +
> > +		if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> > +			return -EINVAL;
> > +
> > +		if (check_add_overflow(offset, len, &sum) || sum > bar_size)
> > +			return -EINVAL;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> > +				  struct vfio_device_feature_dma_buf __user *arg,
> > +				  size_t argsz)
> > +{
> > +	struct vfio_device_feature_dma_buf get_dma_buf = {};
> > +	struct vfio_region_dma_range *dma_ranges;
> > +	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> > +	struct p2pdma_provider *provider;
> > +	struct vfio_pci_dma_buf *priv;
> > +	int ret;
> > +
> > +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> > +				 sizeof(get_dma_buf));
> > +	if (ret != 1)
> > +		return ret;
> > +
> > +	if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
> > +		return -EFAULT;
> > +
> > +	if (!get_dma_buf.nr_ranges)
> > +		return -EINVAL;
> > +
> > +	dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges,
> > +				       sizeof(*dma_ranges));
> > +	if (IS_ERR(dma_ranges))
> > +		return PTR_ERR(dma_ranges);
> > +
> > +	ret = validate_dmabuf_input(vdev, &get_dma_buf, dma_ranges, &provider);
> > +	if (ret)
> > +		return ret;
>
> goto err_free_ranges;

Thanks

>
> Thanks,
> Alex
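
For reference, an untested sketch of the two fixes discussed in this
thread (checking the dereferenced *provider, and freeing dma_ranges on
the error path) as an incremental diff on top of this patch:

--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ static int validate_dmabuf_input(...)
 	*provider = pcim_p2pdma_provider(pdev, bar);
-	if (!provider)
+	if (!*provider)
 		return -EINVAL;
@@ int vfio_pci_core_feature_dma_buf(...)
 	ret = validate_dmabuf_input(vdev, &get_dma_buf, dma_ranges, &provider);
 	if (ret)
-		return ret;
+		goto err_free_ranges;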