From: Manish Honap <mhonap@nvidia.com>
Wire the CXL DPA range up as a VFIO demand-paged region so QEMU can
mmap guest device memory directly. Faults call vmf_insert_pfn() to
insert one PFN at a time rather than mapping the full range upfront.
CXL region lifecycle:
- The CXL memory region is registered with the VFIO layer during
  vfio_pci_open_device()
- mmap() establishes the VMA with vm_ops but inserts no PTEs
- Each guest page fault calls vfio_cxl_region_page_fault() which
inserts a single PFN under the memory_lock read side
- On device reset, vfio_cxl_zap_region_locked() sets region_active=false
and calls unmap_mapping_range() to invalidate all DPA PTEs atomically
while holding memory_lock for writing
- Faults racing with reset see region_active==false and return
VM_FAULT_SIGBUS
- vfio_cxl_reactivate_region() restores region_active after successful
hardware reset
Also integrate the zap/reactivate calls into vfio_pci_ioctl_reset() so
that FLR correctly invalidates DPA mappings and restores them on success.
Co-developed-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Manish Honap <mhonap@nvidia.com>
---
drivers/vfio/pci/cxl/vfio_cxl_core.c | 187 +++++++++++++++++++++++++++
drivers/vfio/pci/cxl/vfio_cxl_emu.c | 2 +-
drivers/vfio/pci/cxl/vfio_cxl_priv.h | 3 +
drivers/vfio/pci/vfio_pci_core.c | 11 ++
drivers/vfio/pci/vfio_pci_priv.h | 6 +
5 files changed, 208 insertions(+), 1 deletion(-)
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c
index 30b365b91903..19d3dc205f99 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_core.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c
@@ -435,4 +435,191 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev)
vfio_cxl_destroy_cxl_region(cxl);
}
+static vm_fault_t vfio_cxl_region_vm_fault(struct vm_fault *vmf)
+{
+ struct vfio_pci_region *region = vmf->vma->vm_private_data;
+ struct vfio_pci_cxl_state *cxl = region->data;
+ unsigned long pgoff;
+ unsigned long pfn;
+
+ if (!READ_ONCE(cxl->region_active))
+ return VM_FAULT_SIGBUS;
+
+ pgoff = vmf->pgoff &
+ ((1UL << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+ if (pgoff >= (cxl->region_size >> PAGE_SHIFT))
+ return VM_FAULT_SIGBUS;
+
+ pfn = PHYS_PFN(cxl->region_hpa) + pgoff;
+
+ return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
+}
+
+static const struct vm_operations_struct vfio_cxl_region_vm_ops = {
+ .fault = vfio_cxl_region_vm_fault,
+};
+
+static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_region *region,
+ struct vm_area_struct *vma)
+{
+ struct vfio_pci_cxl_state *cxl = vdev->cxl;
+ u64 req_len, pgoff, end;
+
+ if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+ return -EINVAL;
+
+ if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) &&
+ (vma->vm_flags & VM_READ))
+ return -EPERM;
+
+ if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) &&
+ (vma->vm_flags & VM_WRITE))
+ return -EPERM;
+
+ pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+ if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
+ check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
+ return -EOVERFLOW;
+
+ if (end > cxl->region_size)
+ return -EINVAL;
+
+ vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
+ vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+ VM_DONTEXPAND | VM_DONTDUMP);
+
+ vma->vm_ops = &vfio_cxl_region_vm_ops;
+ vma->vm_private_data = region;
+
+ return 0;
+}
+
+/*
+ * vfio_cxl_zap_region_locked - Invalidate all DPA region PTEs.
+ *
+ * Must be called with vdev->memory_lock held for writing. Sets
+ * region_active=false before zapping so any subsequent I/O to the region
+ * sees the inactive state and returns an error rather than accessing
+ * stale mappings.
+ */
+void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev)
+{
+ struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+ lockdep_assert_held_write(&vdev->memory_lock);
+
+ if (!cxl)
+ return;
+
+ WRITE_ONCE(cxl->region_active, false);
+}
+
+/*
+ * vfio_cxl_reactivate_region - Re-enable DPA region after successful reset.
+ *
+ * Must be called with vdev->memory_lock held for writing. Re-reads the
+ * HDM decoder state from hardware (FLR cleared it) and sets region_active
+ * so that subsequent I/O to the region is permitted again.
+ */
+void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev)
+{
+ struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+ lockdep_assert_held_write(&vdev->memory_lock);
+
+ if (!cxl)
+ return;
+ /*
+ * Re-initialise the emulated HDM comp_reg_virt[] from hardware.
+ * After FLR the decoder registers read as zero; mirror that in
+ * the emulated state so QEMU sees a clean slate.
+ */
+ vfio_cxl_reinit_comp_regs(cxl);
+
+ /*
+ * Only re-enable the DPA mmap if the hardware has actually
+ * re-committed decoder 0 after FLR. Read the COMMITTED bit from the
+ * freshly-re-snapshotted comp_reg_virt[] so we check the post-FLR
+ * hardware state, not stale pre-reset state.
+ *
+ * If COMMITTED is 0 (slow firmware re-commit path), leave
+ * region_active=false. Guest faults will return VM_FAULT_SIGBUS
+ * until the decoder is re-committed and the region is re-enabled.
+ */
+ if (cxl->precommitted && cxl->comp_reg_virt) {
+ /*
+ * Read CTRL via the full CXL.mem-relative index: hdm_reg_offset
+ * (now CXL.mem-relative) plus the within-HDM-block offset.
+ */
+ u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl,
+ CXL_HDM_DECODER0_CTRL_OFFSET(0)));
+
+ if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)
+ WRITE_ONCE(cxl->region_active, true);
+ }
+}
+
+static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
+ char __user *buf, size_t count, loff_t *ppos,
+ bool iswrite)
+{
+ unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+ struct vfio_pci_cxl_state *cxl = core_dev->region[i].data;
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+ if (!count || pos >= cxl->region_size)
+ return 0;
+
+ /*
+ * Guard against access after a failed reset (region_active=false)
+ * or a release race (region_vaddr=NULL). Either condition means
+ * the memremap'd window is no longer valid; touching it would produce
+ * a Synchronous External Abort. Return -EIO so the caller gets a
+ * clean error rather than a kernel oops.
+ */
+ if (!READ_ONCE(cxl->region_active) || !cxl->region_vaddr)
+ return -EIO;
+
+ count = min(count, (size_t)(cxl->region_size - pos));
+
+ if (iswrite) {
+ if (copy_from_user(cxl->region_vaddr + pos, buf, count))
+ return -EFAULT;
+ } else {
+ if (copy_to_user(buf, cxl->region_vaddr + pos, count))
+ return -EFAULT;
+ }
+
+ return count;
+}
+
+static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_region *region)
+{
+ struct vfio_pci_cxl_state *cxl = region->data;
+
+ /*
+ * Deactivate the region before removing user mappings so that any
+ * fault handler racing the release returns VM_FAULT_SIGBUS rather
+ * than inserting a PFN into an unmapped region.
+ */
+ WRITE_ONCE(cxl->region_active, false);
+
+ if (cxl->region_vaddr) {
+ memunmap(cxl->region_vaddr);
+ cxl->region_vaddr = NULL;
+ }
+}
+
+static const struct vfio_pci_regops vfio_cxl_regops = {
+ .rw = vfio_cxl_region_rw,
+ .mmap = vfio_cxl_region_mmap,
+ .release = vfio_cxl_region_release,
+};
+
MODULE_IMPORT_NS("CXL");
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c
index 11195e8c21d7..781328a79b43 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c
@@ -33,7 +33,7 @@
* +0x1c: (reserved)
*/
-static inline __le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off)
+__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off)
{
/*
* hdm_off is a byte offset within the HDM decoder block.
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
index 72a0d7d7e183..3458768445af 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h
+++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
@@ -33,6 +33,7 @@ struct vfio_pci_cxl_state {
u8 comp_reg_bar;
bool cache_capable;
bool precommitted;
+ bool region_active;
};
/* Register access sizes */
@@ -96,4 +97,6 @@ int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl,
resource_size_t size);
void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl);
+__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off);
+
#endif /* __LINUX_VFIO_CXL_PRIV_H */
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index b7364178e23d..48e0274c19aa 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1223,6 +1223,9 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
vfio_pci_zap_and_down_write_memory_lock(vdev);
+ /* Zap CXL DPA region PTEs before hardware reset clears HDM state */
+ vfio_cxl_zap_region_locked(vdev);
+
/*
* This function can be invoked while the power state is non-D0. If
* pci_try_reset_function() has been called while the power state is
@@ -1236,6 +1239,14 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
vfio_pci_dma_buf_move(vdev, true);
ret = pci_try_reset_function(vdev->pdev);
+
+ /*
+ * Re-enable DPA region if reset succeeded; fault handler will
+ * re-insert PFNs on next access without requiring a new mmap.
+ */
+ if (!ret)
+ vfio_cxl_reactivate_region(vdev);
+
if (__vfio_pci_memory_enabled(vdev))
vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 1082ba43bafe..726063b6ff70 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -145,6 +145,8 @@ static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev,
void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev);
void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev);
+void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev);
+void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev);
#else
@@ -152,6 +154,10 @@ static inline void
vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { }
static inline void
vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { }
+static inline void
+vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { }
+static inline void
+vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { }
#endif /* CONFIG_VFIO_CXL_CORE */
--
2.25.1