A VFIO DMABUF can export a subset of a BAR to userspace by fd; add
support for mmap() of this fd. This provides another route for a
process to map BARs, except one where the process can only map a specific
subset of a BAR represented by the exported DMABUF.
mmap() support enables userspace driver designs that safely delegate
access to BAR sub-ranges to other client processes by sharing a DMABUF
fd, without having to share the (omnipotent) VFIO device fd with them.
The mmap callback installs vm_ops callbacks for .fault and .huge_fault;
they find a PFN by searching the DMABUF's physical ranges. That is,
DMABUFs with multiple ranges are supported for mmap().
Signed-off-by: Matt Evans <mattev@meta.com>
---
drivers/vfio/pci/vfio_pci_dmabuf.c | 219 +++++++++++++++++++++++++++++
1 file changed, 219 insertions(+)
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index 46ab64fbeb19..bebb496bd0f2 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -85,6 +85,209 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
kfree(priv);
}
+static int vfio_pci_dma_buf_find_pfn(struct device *dev,
+ struct vfio_pci_dma_buf *vpdmabuf,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ unsigned int order,
+ unsigned long *out_pfn)
+{
+ /*
+ * Given a VMA (start, end, pgoffs) and a fault address,
+ * search phys_vec[] to find the range representing the
+ * address's offset into the VMA (and so a PFN).
+ *
+ * The phys_vec ranges represent contiguous spans of VAs
+ * upwards from the buffer offset 0; the actual PFNs might be
+ * in any order, overlap/alias, etc. Calculate an offset of
+ * the desired page given VMA start/pgoff and address, then
+ * search upwards from 0 to find which span contains it.
+ *
+ * On success, a valid PFN for a page sized by 'order' is
+ * returned into out_pfn.
+ *
+ * Failure occurs if:
+ * - The page would cross the edge of the VMA
+ * - The page isn't entirely contained within a range
+ * - We find a range, but the final PFN isn't aligned to the
+ * requested order.
+ *
+ * (Upon failure, the caller is expected to try again with a
+ * smaller order; the tests above will always succeed for
+ * order=0 as the limit case.)
+ *
+ * It's suboptimal if DMABUFs are created with neighbouring
+ * ranges that are physically contiguous, since hugepages
+ * can't straddle range boundaries. (The construction of the
+ * ranges vector should merge such ranges.)
+ */
+
+ unsigned long rounded_page_addr = address & ~((PAGE_SIZE << order) - 1);
+ unsigned long rounded_page_end = rounded_page_addr + (PAGE_SIZE << order);
+ unsigned long buf_page_offset;
+ unsigned long buf_offset = 0;
+ unsigned int i;
+
+ if (rounded_page_addr < vma->vm_start || rounded_page_end > vma->vm_end)
+ return -EAGAIN;
+
+ if (unlikely(check_add_overflow(rounded_page_addr - vma->vm_start,
+ vma->vm_pgoff << PAGE_SHIFT, &buf_page_offset)))
+ return -EFAULT;
+
+ for (i = 0; i < vpdmabuf->nr_ranges; i++) {
+ unsigned long range_len = vpdmabuf->phys_vec[i].len;
+ unsigned long range_start = vpdmabuf->phys_vec[i].paddr;
+
+ if (buf_page_offset >= buf_offset &&
+ buf_page_offset + (PAGE_SIZE << order) <= buf_offset + range_len) {
+ /*
+ * The faulting page is wholly contained
+ * within the span represented by the range.
+ * Validate PFN alignment for the order:
+ */
+ unsigned long pfn = (range_start >> PAGE_SHIFT) +
+ ((buf_page_offset - buf_offset) >> PAGE_SHIFT);
+
+ if (IS_ALIGNED(pfn, 1 << order)) {
+ *out_pfn = pfn;
+ return 0;
+ }
+ /* Retry with smaller order */
+ return -EAGAIN;
+ }
+ buf_offset += range_len;
+ }
+
+ /*
+ * If we get here, the address fell outside of the span
+ * represented by the (concatenated) ranges. This can
+ * never happen because vfio_pci_dma_buf_mmap() checks that
+ * the VMA is <= the total size of the ranges.
+ *
+ * But if it does, force SIGBUS for the access, and warn.
+ */
+ WARN_ONCE(1, "No range for addr 0x%lx, order %d: VMA 0x%lx-0x%lx pgoff 0x%lx, %d ranges, size 0x%lx\n",
+ address, order, vma->vm_start, vma->vm_end, vma->vm_pgoff,
+ vpdmabuf->nr_ranges, vpdmabuf->size);
+
+ return -EFAULT;
+}
+
+static vm_fault_t vfio_pci_dma_buf_mmap_huge_fault(struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct vfio_pci_dma_buf *priv = vma->vm_private_data;
+ struct vfio_pci_core_device *vdev;
+ unsigned long pfn = 0; /* logged below even when the lookup fails */
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+
+ vdev = READ_ONCE(priv->vdev);
+
+ /*
+ * A fault for an existing mmap might occur after
+ * vfio_pci_dma_buf_cleanup() has revoked and destroyed the
+ * vdev's DMABUFs, and annulled vdev. After creation, vdev is
+ * only ever written in cleanup.
+ */
+ if (!vdev)
+ return VM_FAULT_SIGBUS;
+
+ int r = vfio_pci_dma_buf_find_pfn(&vdev->pdev->dev, priv, vma,
+ vmf->address, order, &pfn);
+
+ if (r == 0) {
+ scoped_guard(rwsem_read, &vdev->memory_lock) {
+ /* Deal with the possibility of a fault racing
+ * with vfio_pci_dma_buf_move() revoking and
+ * then unmapping the buffer. The
+ * revocation/unmap and status change occurs
+ * whilst holding memory_lock.
+ */
+ if (priv->revoked)
+ ret = VM_FAULT_SIGBUS;
+ else
+ ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
+ }
+ } else if (r != -EAGAIN) {
+ ret = VM_FAULT_SIGBUS;
+ }
+
+ dev_dbg_ratelimited(&vdev->pdev->dev,
+ "%s(order = %d) PFN 0x%lx, VA 0x%lx, pgoff 0x%lx: 0x%x\n",
+ __func__, order, pfn, vmf->address, vma->vm_pgoff, (unsigned int)ret);
+
+ return ret;
+}
+
+static vm_fault_t vfio_pci_dma_buf_mmap_page_fault(struct vm_fault *vmf)
+{
+ return vfio_pci_dma_buf_mmap_huge_fault(vmf, 0);
+}
+
+static const struct vm_operations_struct vfio_pci_dma_buf_mmap_ops = {
+ .fault = vfio_pci_dma_buf_mmap_page_fault,
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+ .huge_fault = vfio_pci_dma_buf_mmap_huge_fault,
+#endif
+};
+
+static bool vfio_pci_dma_buf_is_mappable(struct dma_buf *dmabuf)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+ /*
+ * Sanity checks at mmap() time; alignment has already been
+ * asserted by validate_dmabuf_input().
+ *
+ * Although the revoked state is transient, refuse to map a
+ * revoked buffer to flag early that something odd is going
+ * on: for example, users should not be mmap()ing a buffer
+ * that's being moved [by a user-triggered activity].
+ */
+ if (priv->revoked)
+ return false;
+
+ return true;
+}
+
+/*
+ * Similar to vfio_pci_core_mmap() for a regular VFIO device fd, but
+ * differs by pre-checks performed and ultimately the vm_ops installed.
+ */
+static int vfio_pci_dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+ u64 req_len, req_start;
+
+ if (!vfio_pci_dma_buf_is_mappable(dmabuf))
+ return -ENODEV;
+ if ((vma->vm_flags & VM_SHARED) == 0)
+ return -EINVAL;
+
+ req_len = vma->vm_end - vma->vm_start;
+ req_start = vma->vm_pgoff << PAGE_SHIFT;
+
+ if (req_start + req_len > priv->size)
+ return -EINVAL;
+
+ vma->vm_private_data = priv;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
+ /*
+ * See comments in vfio_pci_core_mmap() re VM_ALLOW_ANY_UNCACHED.
+ *
+ * FIXME: get mapping attributes from dmabuf?
+ */
+ vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+ VM_DONTEXPAND | VM_DONTDUMP);
+ vma->vm_ops = &vfio_pci_dma_buf_mmap_ops;
+
+ return 0;
+}
+
static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
.pin = vfio_pci_dma_buf_pin,
.unpin = vfio_pci_dma_buf_unpin,
@@ -92,6 +295,7 @@ static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
.map_dma_buf = vfio_pci_dma_buf_map,
.unmap_dma_buf = vfio_pci_dma_buf_unmap,
.release = vfio_pci_dma_buf_release,
+ .mmap = vfio_pci_dma_buf_mmap,
};
/*
@@ -335,6 +539,11 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
struct vfio_pci_dma_buf *tmp;
lockdep_assert_held_write(&vdev->memory_lock);
+ /*
+ * Holding memory_lock ensures a racing
+ * vfio_pci_dma_buf_mmap_*_fault() observes priv->revoked
+ * properly.
+ */
list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
if (!get_file_active(&priv->dmabuf->file))
@@ -345,6 +554,14 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
priv->revoked = revoked;
dma_buf_move_notify(priv->dmabuf);
dma_resv_unlock(priv->dmabuf->resv);
+
+ /*
+ * Unmap any possible userspace mappings for a
+ * now-revoked DMABUF:
+ */
+ if (revoked)
+ unmap_mapping_range(priv->dmabuf->file->f_mapping,
+ 0, priv->size, 1);
}
fput(priv->dmabuf->file);
}
@@ -366,6 +583,8 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
priv->revoked = true;
dma_buf_move_notify(priv->dmabuf);
dma_resv_unlock(priv->dmabuf->resv);
+ unmap_mapping_range(priv->dmabuf->file->f_mapping,
+ 0, priv->size, 1);
vfio_device_put_registration(&vdev->vdev);
fput(priv->dmabuf->file);
}
--
2.47.3
On 2/26/26 21:21, Matt Evans wrote:
> A VFIO DMABUF can export a subset of a BAR to userspace by fd; add
> support for mmap() of this fd. This provides another route for a
> process to map BARs, except one where the process can only map a specific
> subset of a BAR represented by the exported DMABUF.
>
> mmap() support enables userspace driver designs that safely delegate
> access to BAR sub-ranges to other client processes by sharing a DMABUF
> fd, without having to share the (omnipotent) VFIO device fd with them.
>
> The mmap callback installs vm_ops callbacks for .fault and .huge_fault;
> they find a PFN by searching the DMABUF's physical ranges. That is,
> DMABUFs with multiple ranges are supported for mmap().
In general this sounds like a good idea, but this approach here doesn't look good at all.
Especially how you call unmap_mapping_range() from your DMA-buf cleanup path looks extremely questionable.
...
> +/*
> + * Similar to vfio_pci_core_mmap() for a regular VFIO device fd, but
> + * differs by pre-checks performed and ultimately the vm_ops installed.
> + */
> +static int vfio_pci_dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
> +{
> + struct vfio_pci_dma_buf *priv = dmabuf->priv;
> + u64 req_len, req_start;
> +
> + if (!vfio_pci_dma_buf_is_mappable(dmabuf))
> + return -ENODEV;
> + if ((vma->vm_flags & VM_SHARED) == 0)
> + return -EINVAL;
> +
> + req_len = vma->vm_end - vma->vm_start;
> + req_start = vma->vm_pgoff << PAGE_SHIFT;
> +
> + if (req_start + req_len > priv->size)
> + return -EINVAL;
> +
> + vma->vm_private_data = priv;
> + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
> +
> + /*
> + * See comments in vfio_pci_core_mmap() re VM_ALLOW_ANY_UNCACHED.
> + *
> + * FIXME: get mapping attributes from dmabuf?
> + */
> + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
> + VM_DONTEXPAND | VM_DONTDUMP);
> + vma->vm_ops = &vfio_pci_dma_buf_mmap_ops;
> +
> + return 0;
Let's start with this here, it just looks horrible over complicated.
When a DMA-buf just represents a linear piece of BAR which is map-able through the VFIO FD anyway then the right approach is to just re-direct the mapping to this VFIO FD.
Roughly something like this here should do it:
vma->vm_pgoff += offset_which_your_dma_buf_represents;
vma_set_file(vma, core_dev->file);
vfio_pci_core_mmap(core_dev, vma);
It can be that you want additional checks (e.g. if the DMA-buf is revoked) in which case you would need to override the vma->vm_ops, but then just do the access checks and call the vfio_pci_mmap_ops to get the actual page fault handling done.
>+ unmap_mapping_range(priv->dmabuf->file->f_mapping,
>+ 0, priv->size, 1);
When you need to use unmap_mapping_range() then you usually share the address space object between the file descriptor exporting the DMA-buf and the DMA-buf fd itself.
Otherwise functions like vfio_pci_zap_bars() don't work correctly any more, and that usually creates a huge bunch of problems.
Regards,
Christian.
On Fri, Feb 27, 2026 at 11:09:31AM +0100, Christian König wrote: > When a DMA-buf just represents a linear piece of BAR which is > map-able through the VFIO FD anyway then the right approach is to > just re-direct the mapping to this VFIO FD. I actually would like to go the other way and have VFIO always have a DMABUF under the VMA's it mmaps because that will make it easy to finish the type1 emulation which requires finding dmabufs for the VMAs. > It can be that you want additional checks (e.g. if the DMA-buf is > revoked) in which case you would need to override the vma->vm_ops, > but then just do the access checks and call the vfio_pci_mmap_ops to > get the actually page fault handling done. It isn't that simple, the vm_ops won't have a way to get back to the dmabuf from the vma to find the per-fd revoke flag to check it. > >+ unmap_mapping_range(priv->dmabuf->file->f_mapping, > >+ 0, priv->size, 1); > > When you need to use unmap_mapping_range() then you usually share > the address space object between the file descriptor exporting the > DMA-buf and the DMA-buf fd itself. Yeah, this becomes problematic. Right now there is a single address space per vfio-device and the invalidation is global. Possibly for this use case you can keep that and do a global unmap and rely on fault to restore the mmaps that were not revoked. Jason
Hi Jason + Christian, On 27/02/2026 12:51, Jason Gunthorpe wrote: > On Fri, Feb 27, 2026 at 11:09:31AM +0100, Christian König wrote: > >> When a DMA-buf just represents a linear piece of BAR which is >> map-able through the VFIO FD anyway then the right approach is to >> just re-direct the mapping to this VFIO FD. We think limiting this to one range per DMABUF isn't enough, i.e. supporting multiple ranges will be a benefit. Bumping vm_pgoff to then reuse vfio_pci_mmap_ops is a really nice suggestion for the simplest case, but can't support multiple ranges; the .fault() needs to be aware of the non-linear DMABUF layout. > I actually would like to go the other way and have VFIO always have a > DMABUF under the VMA's it mmaps because that will make it easy to > finish the type1 emulation which requires finding dmabufs for the > VMAs. > >> It can be that you want additional checks (e.g. if the DMA-buf is >> revoked) in which case you would need to override the vma->vm_ops, >> but then just do the access checks and call the vfio_pci_mmap_ops to >> get the actually page fault handling done. > > It isn't that simple, the vm_ops won't have a way to get back to the > dmabuf from the vma to find the per-fd revoke flag to check it. Sounds like the suggestion is just to reuse vfio_pci_mmap_*fault(), i.e. install "interposer" vm_ops for some new 'fault_but_check_revoke()' to then call down to the existing vfio_pci_mmap_*fault(), after fishing the DMABUF out of vm_private_data. (Like the proposed vfio_pci_dma_buf_mmap_huge_fault() does.) Putting aside the above point of needing a new .fault() able to find a PFN for >1 range for a mo, how would the test of the revoked flag work w.r.t. synchronisation and protecting against a racing revoke? It's not safe to take memory_lock, test revoked, unlock, then hand over to the existing vfio_pci_mmap_*fault() -- which re-takes the lock. I'm not quite seeing how we could reuse the existing vfio_pci_mmap_*fault(), TBH. 
I did briefly consider refactoring that existing .fault() code, but that makes both paths uglier. To summarise, I think we still - need a new fops->mmap() to link vfio_pci_dma_buf into vm_private_data, and determine WC attrs - need a new vm_ops->fault() to test dmabuf->revoked/status and determine map vs fault with memory_lock held, and to determine the PFN from >1 DMABUF ranges >>> + unmap_mapping_range(priv->dmabuf->file->f_mapping, >>> + 0, priv->size, 1); >> >> When you need to use unmap_mapping_range() then you usually share >> the address space object between the file descriptor exporting the >> DMA-buf and the DMA-buf fd itself. > > Yeah, this becomes problematic. Right now there is a single address > space per vfio-device and the invalidation is global. > > Possibly for this use case you can keep that and do a global unmap and > rely on fault to restore the mmaps that were not revoked. Hm, that'd be functional, but we should consider huge BARs with a lot of PTEs (even huge ones); zapping all BARs might noticeably disturb other clients. But see my query below please, if we could zap just the resource being reclaimed that would be preferable. >> Otherwise functions like vfio_pci_zap_bars() doesn't work correctly >> any more and that usually creates a huge bunch of problems. I'd reasoned it was OK for the DMABUF to have its own unique address space -- even though IIUC that means an unmap_mapping_range() by vfio_pci_core_device won't affect a DMABUF's mappings -- because anything that needs to zap a BAR _also_ must already plan to notify DMABUF importers via vfio_pci_dma_buf_move(). And then, vfio_pci_dma_buf_move() will zap the mappings. Are there paths that _don't_ always pair vfio_pci_zap_bars() with a vfio_pci_dma_buf_move()? 
I'm sure I'm missing something, so question phrased as a statement: The only way that mappings could be missed would be if some path forgets to ...buf_move() when zapping the BARs, but that'd be a problem for importers regardless of whether they can now also be mmap()ed, no? I don't want to flout convention for the sake of it, and am keen to learn more, so please gently explain in more detail: Why must we associate the DMABUFs with the VFIO address space [by sharing the AS object between the VFIO fd exporting the DMABUF and the DMABUF fd]? Many thanks, Matt
On Fri, Feb 27, 2026 at 07:42:08PM +0000, Matt Evans wrote: > Hi Jason + Christian, > > On 27/02/2026 12:51, Jason Gunthorpe wrote: > > On Fri, Feb 27, 2026 at 11:09:31AM +0100, Christian König wrote: > > > >> When a DMA-buf just represents a linear piece of BAR which is > >> map-able through the VFIO FD anyway then the right approach is to > >> just re-direct the mapping to this VFIO FD. > > We think limiting this to one range per DMABUF isn't enough, > i.e. supporting multiple ranges will be a benefit. > > Bumping vm_pgoff to then reuse vfio_pci_mmap_ops is a really nice > suggestion for the simplest case, but can't support multiple ranges; > the .fault() needs to be aware of the non-linear DMABUF layout. Sigh, yes that's right we have the non-linear thing, and if you need that to work it can't use the existing code. > > I actually would like to go the other way and have VFIO always have a > > DMABUF under the VMA's it mmaps because that will make it easy to > > finish the type1 emulation which requires finding dmabufs for the > > VMAs. This is a still better idea since it avoid duplicating the VMA flow into two parts.. > Putting aside the above point of needing a new .fault() able to find a > PFN for >1 range for a mo, how would the test of the revoked flag work > w.r.t. synchronisation and protecting against a racing revoke? It's not > safe to take memory_lock, test revoked, unlock, then hand over to the > existing vfio_pci_mmap_*fault() -- which re-takes the lock. I'm not > quite seeing how we could reuse the existing vfio_pci_mmap_*fault(), > TBH. I did briefly consider refactoring that existing .fault() code, > but that makes both paths uglier. More reasons to do the above.. > > Possibly for this use case you can keep that and do a global unmap and > > rely on fault to restore the mmaps that were not revoked. 
> > Hm, that'd be functional, but we should consider huge BARs with a lot of > PTEs (even huge ones); zapping all BARs might noticeably disturb other > clients. But see my query below please, if we could zap just the > resource being reclaimed that would be preferable. Hurm. Otherwise you have to create a bunch of address spaces and juggle them. > >> Otherwise functions like vfio_pci_zap_bars() doesn't work correctly > >> any more and that usually creates a huge bunch of problems. > > I'd reasoned it was OK for the DMABUF to have its own unique address > space -- even though IIUC that means an unmap_mapping_range() by > vfio_pci_core_device won't affect a DMABUF's mappings -- because > anything that needs to zap a BAR _also_ must already plan to notify > DMABUF importers via vfio_pci_dma_buf_move(). And then, > vfio_pci_dma_buf_move() will zap the mappings. That might be correct, but if then it is yet another reason to do the first point and remove the shared address_space fully. Basically one mmap flow that always uses dma-buf and always uses a per-dma-buf address space with a per-FD revoke and so on and so forth. This way there is still one of everything, we just pay a bit of cost to automatically create a dmabuf file * in the existing path. > Are there paths that _don't_ always pair vfio_pci_zap_bars() with a > vfio_pci_dma_buf_move()? There should not be. Jason
On Fri, Feb 27, 2026 at 03:48:07PM -0400, Jason Gunthorpe wrote: > > > I actually would like to go the other way and have VFIO always have a > > > DMABUF under the VMA's it mmaps because that will make it easy to > > > finish the type1 emulation which requires finding dmabufs for the > > > VMAs. > > This is a still better idea since it avoid duplicating the VMA flow > into two parts.. I suppose this would also compose with your idea to use dma-buf for iommufd_compat support of VFIO_IOMMU_MAP_DMA of vfio device fd-backed mmap()s [1]? Instead of needing to materialize a new dma-buf, you could use the existing backing one? [1] https://lore.kernel.org/all/20260108141044.GC545276@ziepe.ca/
On Fri, Feb 27, 2026 at 01:52:15PM -0800, Alex Mastro wrote: > On Fri, Feb 27, 2026 at 03:48:07PM -0400, Jason Gunthorpe wrote: > > > > I actually would like to go the other way and have VFIO always have a > > > > DMABUF under the VMA's it mmaps because that will make it easy to > > > > finish the type1 emulation which requires finding dmabufs for the > > > > VMAs. > > > > This is a still better idea since it avoid duplicating the VMA flow > > into two parts.. > > I suppose this would also compose with your idea to use dma-buf for > iommufd_compat support of VFIO_IOMMU_MAP_DMA of vfio device fd-backed mmap()s > [1]? Instead of needing to materialize a new dma-buf, you could use the existing > backing one? Yeah, that too I think it is a fairly easy progression: 1) mmap_prepare() allocates a new dmabuf file * and sticks it in desc->vm_file. Rework so all the vma_ops are using vm_file that is a dmabuf. The allocated dmabuf has a singleton range 2) Teach the fault handlers to support full range semantics 3) Use dmabuf revoke variables/etc in the mmap fault handlers 4) Move the address space from the vfio to the dmabuf 5) Allow mmaping the dmabuf fd directly which is now only a couple lines I forget how all the different mmap implementations in vfio interact though - but I think the above is good for vfio-pci Jason
On 2/27/26 23:04, Jason Gunthorpe wrote: > On Fri, Feb 27, 2026 at 01:52:15PM -0800, Alex Mastro wrote: >> On Fri, Feb 27, 2026 at 03:48:07PM -0400, Jason Gunthorpe wrote: >>>>> I actually would like to go the other way and have VFIO always have a >>>>> DMABUF under the VMA's it mmaps because that will make it easy to >>>>> finish the type1 emulation which requires finding dmabufs for the >>>>> VMAs. >>> >>> This is a still better idea since it avoid duplicating the VMA flow >>> into two parts.. >> >> I suppose this would also compose with your idea to use dma-buf for >> iommufd_compat support of VFIO_IOMMU_MAP_DMA of vfio device fd-backed mmap()s >> [1]? Instead of needing to materialize a new dma-buf, you could use the existing >> backing one? > > Yeah, that too > > I think it is a fairly easy progression: > > 1) mmap_prepare() allocates a new dmabuf file * and sticks it in > desc->vm_file. Rework so all the vma_ops are using vm_file that is > a dmabuf. The allocated dmabuf has a singleton range Interesting approach to fix this, but I would suggest something even simpler: Use the same structure as base class for the VFIO and DMA-buf file for your vma->vm_file->private_data object. The DMA-buf file actually contains the real ranges exposed by it and pointing to the exporting VFIO, while the one for the VFIO is just a dummy covering the whole range and pointing to itself. This way you should be able to use the same vm_operations_struct for VMAs mapped through both DMA-buf and the VFIO file descriptors. Independent of how you implement this just one additional warning: huge_fault has caused a number of really hard to debug problems on x86. As far as I know background is that on x86 pte_special() only works on true leave pte but not pmd/pud. That in turn results in some nasty surprises when your PFNs are potentially backed by struct pages, e.g. for direct I/O. For example on the resulting mmap() get_user_pages_fast() works, but get_user_pages() doesn't. 
I hope that those problems aren't applicable here, but if it is Thomas from the Intel XE team can give you more details on that stuff. Regards, Christian. > 2) Teach the fault handlers to support full range semantics > 3) Use dmabuf revoke variables/etc in the mmap fault handlers > 4) Move the address space from the vfio to the dmabuf > 5) Allow mmaping the dmabuf fd directly which is now only a couple lines > > I forget how all the different mmap implementations in vfio interact > though - but I think the above is good for vfio-pci > > Jason
On Mon, Mar 02, 2026 at 11:07:41AM +0100, Christian König wrote:
> As far as I know background is that on x86 pte_special() only works
> on true leave pte but not pmd/pud.
This is not the case, there are pmd and pud_special as well, protected
by CONFIG_xx
The arch should not define CONFIG_ARCH_SUPPORTS_PMD_PFNMAP if
vmf_insert_pfn_pmd() doesn't result in pmd_special() working, for
example.
eg:
vmf_insert_pfn_pmd()
insert_pmd()
if (fop.is_folio) {
// Not Taken
} else {
entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
entry = pmd_mkspecial(entry);
This stuff was all put together by Peter specifically for VFIO to use,
AFAIK it is correct.
IDK what Thomas was using, but if you tried to do huge faults before
all of this was built it definitely would not work right as it only
supported a folio backed path.
Jason
On 3/2/26 13:54, Jason Gunthorpe wrote:
> On Mon, Mar 02, 2026 at 11:07:41AM +0100, Christian König wrote:
>
>> As far as I know background is that on x86 pte_special() only works
>> on true leave pte but not pmd/pud.
>
> This is not the case, there are pmd and pud_special as well, protected
> by CONFIG_xx
>
> The arch should not define CONFIG_ARCH_SUPPORTS_PMD_PFNMAP if
> vmf_insert_pfn_pmd() doesn't result in pmd_special() working, for
> example.
>
> eg:
>
> vmf_insert_pfn_pmd()
> insert_pmd()
>
> if (fop.is_folio) {
> // Not Taken
> } else {
> entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
> entry = pmd_mkspecial(entry);
>
> This stuff was all put together by Peter specifically for VFIO to use,
> AFAIK it is correct.
Oh that is really nice to know, thanks for that information. It means we could give that approach another try.
> IDK what Thomas was using, but if you tried to do huge faults before
> all of this was built it definitely would not work right as it only
> supported a folio backed path.
Yeah, Thomas tried that ~6 years ago and my educated guess is that the whole infrastructure was just not there at that time.
Christian.
>
> Jason
On Fri, Feb 27, 2026 at 01:52:15PM -0800, Alex Mastro wrote: > On Fri, Feb 27, 2026 at 03:48:07PM -0400, Jason Gunthorpe wrote: > > > > I actually would like to go the other way and have VFIO always have a > > > > DMABUF under the VMA's it mmaps because that will make it easy to > > > > finish the type1 emulation which requires finding dmabufs for the > > > > VMAs. > > > > This is a still better idea since it avoid duplicating the VMA flow > > into two parts.. > > I suppose this would also compose with your idea to use dma-buf for > iommufd_compat support of VFIO_IOMMU_MAP_DMA of vfio device fd-backed mmap()s > [1]? Instead of needing to materialize a new dma-buf, you could use the existing > backing one? > > [1] https://lore.kernel.org/all/20260108141044.GC545276@ziepe.ca/ Sorry, I can't read. That's literally what you said!
© 2016 - 2026 Red Hat, Inc.