[PATCH v2 14/20] vfio/cxl: DPA VFIO region with demand fault mmap and reset zap

mhonap@nvidia.com posted 20 patches 6 hours ago
[PATCH v2 14/20] vfio/cxl: DPA VFIO region with demand fault mmap and reset zap
Posted by mhonap@nvidia.com 5 hours ago
From: Manish Honap <mhonap@nvidia.com>

Wire the CXL DPA range up as a VFIO demand-paged region so QEMU can
mmap guest device memory directly. Faults call vmf_insert_pfn() to
insert one PFN at a time rather than mapping the full range upfront.

CXL region lifecycle:
- The CXL memory region is registered with the VFIO layer during
  vfio_pci_open_device
- mmap() establishes the VMA with vm_ops but inserts no PTEs
- Each guest page fault calls vfio_cxl_region_page_fault() which
  inserts a single PFN under the memory_lock read side
- On device reset, vfio_cxl_zap_region_locked() sets region_active=false
  and calls unmap_mapping_range() to invalidate all DPA PTEs atomically
  while holding memory_lock for writing
- Faults racing with reset see region_active==false and return
  VM_FAULT_SIGBUS
- vfio_cxl_reactivate_region() restores region_active after successful
  hardware reset

Also integrate the zap/reactivate calls into vfio_pci_ioctl_reset() so
that FLR correctly invalidates DPA mappings and restores them on success.

Co-developed-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Manish Honap <mhonap@nvidia.com>
---
 drivers/vfio/pci/cxl/vfio_cxl_core.c | 187 +++++++++++++++++++++++++++
 drivers/vfio/pci/cxl/vfio_cxl_emu.c  |   2 +-
 drivers/vfio/pci/cxl/vfio_cxl_priv.h |   3 +
 drivers/vfio/pci/vfio_pci_core.c     |  11 ++
 drivers/vfio/pci/vfio_pci_priv.h     |   6 +
 5 files changed, 208 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c
index 30b365b91903..19d3dc205f99 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_core.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c
@@ -435,4 +435,191 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev)
 	vfio_cxl_destroy_cxl_region(cxl);
 }
 
+/*
+ * Demand-fault handler for the DPA region: inserts exactly one PFN per
+ * fault instead of prefaulting the whole range at mmap() time.
+ *
+ * NOTE(review): the commit message says faults insert PFNs "under the
+ * memory_lock read side", but no lock is taken here.  A fault that has
+ * already passed the region_active check can race
+ * vfio_cxl_zap_region_locked() and insert a PFN after the zap.  Either
+ * take down_read(&vdev->memory_lock) around the insert (as
+ * vfio_pci_mmap_huge_fault() does) or fix the commit message — confirm.
+ */
+static vm_fault_t vfio_cxl_region_vm_fault(struct vm_fault *vmf)
+{
+	struct vfio_pci_region *region = vmf->vma->vm_private_data;
+	struct vfio_pci_cxl_state *cxl = region->data;
+	unsigned long pgoff;
+	unsigned long pfn;
+
+	/* Reset/release paths clear this first; fail fast if inactive. */
+	if (!READ_ONCE(cxl->region_active))
+		return VM_FAULT_SIGBUS;
+
+	/* Strip the VFIO region-index bits; keep the in-region page offset. */
+	pgoff = vmf->pgoff &
+		((1UL << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+	/* Refuse faults beyond the DPA window. */
+	if (pgoff >= (cxl->region_size >> PAGE_SHIFT))
+		return VM_FAULT_SIGBUS;
+
+	pfn = PHYS_PFN(cxl->region_hpa) + pgoff;
+
+	/* Insert a single PTE; vmf_insert_pfn() returns the fault status. */
+	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
+}
+
+static const struct vm_operations_struct vfio_cxl_region_vm_ops = {
+	.fault = vfio_cxl_region_vm_fault,
+};
+
+/*
+ * mmap handler for the DPA region.  Validates permissions and bounds,
+ * then installs vm_ops without inserting any PTEs — all population is
+ * deferred to vfio_cxl_region_vm_fault().
+ *
+ * Returns 0 on success, -EINVAL/-EPERM/-EOVERFLOW on invalid requests.
+ */
+static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev,
+				struct vfio_pci_region *region,
+				struct vm_area_struct *vma)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	u64 req_len, pgoff, end;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+		return -EINVAL;
+
+	/* Reject a VMA requesting more access than the region advertises. */
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) &&
+	    (vma->vm_flags & VM_READ))
+		return -EPERM;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) &&
+	    (vma->vm_flags & VM_WRITE))
+		return -EPERM;
+
+	/*
+	 * Page offset within the region (index bits masked off).
+	 * NOTE(review): the fault handler builds the same mask with 1UL;
+	 * harmless here since the shift is < 32, but make them consistent.
+	 */
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+	/* Overflow-safe check that [pgoff, pgoff+len) fits in the region. */
+	if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
+	    check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
+		return -EOVERFLOW;
+
+	if (end > cxl->region_size)
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
+	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+		     VM_DONTEXPAND | VM_DONTDUMP);
+
+	/* vm_private_data carries the region; fault handler derives cxl. */
+	vma->vm_ops = &vfio_cxl_region_vm_ops;
+	vma->vm_private_data = region;
+
+	return 0;
+}
+
+/*
+ * vfio_cxl_zap_region_locked - Deactivate the DPA region and invalidate
+ * all of its user PTEs.
+ *
+ * Must be called with vdev->memory_lock held for writing.  Sets
+ * region_active=false before zapping so any subsequent I/O to the region
+ * sees the inactive state and returns an error rather than accessing
+ * stale mappings.
+ *
+ * The core vfio_pci_zap_and_down_write_memory_lock() only unmaps BAR
+ * region indexes; the device-specific DPA region starts at
+ * VFIO_PCI_NUM_REGIONS and must be unmapped here explicitly, otherwise
+ * stale PTEs survive the hardware reset and the guest can keep touching
+ * post-reset device memory through pre-reset translations.
+ */
+void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	unsigned int i;
+
+	lockdep_assert_held_write(&vdev->memory_lock);
+
+	if (!cxl)
+		return;
+
+	WRITE_ONCE(cxl->region_active, false);
+
+	/*
+	 * Drop every PTE covering the DPA region.  Locate it by its data
+	 * pointer (the DPA region is registered with cxl as region data);
+	 * device-specific region file offsets begin at VFIO_PCI_NUM_REGIONS.
+	 */
+	for (i = 0; i < vdev->num_regions; i++) {
+		if (vdev->region[i].data != cxl)
+			continue;
+		unmap_mapping_range(vdev->vdev.inode->i_mapping,
+				    VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS + i),
+				    cxl->region_size, true);
+	}
+}
+
+/*
+ * vfio_cxl_reactivate_region - Re-enable DPA region after successful reset.
+ *
+ * Must be called with vdev->memory_lock held for writing.  Re-reads the
+ * HDM decoder state from hardware (FLR cleared it) and sets region_active
+ * so that subsequent I/O to the region is permitted again.
+ *
+ * No-op when the device has no CXL state.
+ */
+void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+	lockdep_assert_held_write(&vdev->memory_lock);
+
+	if (!cxl)
+		return;
+	/*
+	 * Re-initialise the emulated HDM comp_reg_virt[] from hardware.
+	 * After FLR the decoder registers read as zero; mirror that in
+	 * the emulated state so QEMU sees a clean slate.
+	 */
+	vfio_cxl_reinit_comp_regs(cxl);
+
+	/*
+	 * Only re-enable the DPA mmap if the hardware has actually
+	 * re-committed decoder 0 after FLR.  Read the COMMITTED bit from the
+	 * freshly-re-snapshotted comp_reg_virt[] so we check the post-FLR
+	 * hardware state, not stale pre-reset state.
+	 *
+	 * If COMMITTED is 0 (slow firmware re-commit path), leave
+	 * region_active=false.  Guest faults will return VM_FAULT_SIGBUS
+	 * until the decoder is re-committed and the region is re-enabled.
+	 */
+	if (cxl->precommitted && cxl->comp_reg_virt) {
+		/*
+		 * Read CTRL via the full CXL.mem-relative index: hdm_reg_offset
+		 * (now CXL.mem-relative) plus the within-HDM-block offset.
+		 * hdm_reg_ptr() was made non-static in vfio_cxl_emu.c to
+		 * allow this cross-file read.
+		 */
+		u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl,
+					    CXL_HDM_DECODER0_CTRL_OFFSET(0)));
+
+		if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)
+			WRITE_ONCE(cxl->region_active, true);
+	}
+}
+
+/*
+ * read()/write() handler for the DPA region.  Copies between the user
+ * buffer and the memremap'd DPA window, clamped to the region size.
+ *
+ * Returns the number of bytes transferred, 0 at/after EOF, or a
+ * negative errno.
+ */
+static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
+				  char __user *buf, size_t count, loff_t *ppos,
+				  bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct vfio_pci_cxl_state *cxl = core_dev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+	if (!count || pos >= cxl->region_size)
+		return 0;
+
+	/*
+	 * Guard against access after a failed reset (region_active=false)
+	 * or a release race (region_vaddr=NULL).  Either condition means
+	 * the memremap'd window is no longer valid; touching it would produce
+	 * a Synchronous External Abort.  Return -EIO so the caller gets a
+	 * clean error rather than a kernel oops.
+	 */
+	if (!READ_ONCE(cxl->region_active) || !cxl->region_vaddr)
+		return -EIO;
+
+	/* Clamp the transfer to the end of the region. */
+	count = min(count, (size_t)(cxl->region_size - pos));
+
+	if (iswrite) {
+		if (copy_from_user(cxl->region_vaddr + pos, buf, count))
+			return -EFAULT;
+	} else {
+		if (copy_to_user(buf, cxl->region_vaddr + pos, count))
+			return -EFAULT;
+	}
+
+	/*
+	 * Advance the file position like the other in-tree VFIO region rw
+	 * handlers (e.g. vfio_pci_bar_rw(), vfio_pci_igd_rw()).  Without
+	 * this, sequential read()/write() calls keep hitting offset 0.
+	 */
+	*ppos += count;
+
+	return count;
+}
+
+/*
+ * Region release callback: tear down the kernel mapping of the DPA
+ * window when the region is destroyed.
+ */
+static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
+				    struct vfio_pci_region *region)
+{
+	struct vfio_pci_cxl_state *cxl = region->data;
+
+	/*
+	 * Deactivate the region before removing user mappings so that any
+	 * fault handler racing the release returns VM_FAULT_SIGBUS rather
+	 * than inserting a PFN into an unmapped region.
+	 *
+	 * NOTE(review): a fault that passed the region_active check before
+	 * this store can still insert a PFN — presumably all user VMAs are
+	 * already gone by the time release runs (device close path), so the
+	 * window is benign; confirm the teardown ordering guarantees this.
+	 */
+	WRITE_ONCE(cxl->region_active, false);
+
+	/* Unmap the kernel window and clear the pointer so rw sees -EIO. */
+	if (cxl->region_vaddr) {
+		memunmap(cxl->region_vaddr);
+		cxl->region_vaddr = NULL;
+	}
+}
+
+static const struct vfio_pci_regops vfio_cxl_regops = {
+	.rw		= vfio_cxl_region_rw,
+	.mmap		= vfio_cxl_region_mmap,
+	.release	= vfio_cxl_region_release,
+};
+
 MODULE_IMPORT_NS("CXL");
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c
index 11195e8c21d7..781328a79b43 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c
@@ -33,7 +33,7 @@
  *     +0x1c: (reserved)
  */
 
-static inline __le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off)
+__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off)
 {
 	/*
 	 * hdm_off is a byte offset within the HDM decoder block.
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
index 72a0d7d7e183..3458768445af 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h
+++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
@@ -33,6 +33,7 @@ struct vfio_pci_cxl_state {
 	u8                           comp_reg_bar;
 	bool                         cache_capable;
 	bool                         precommitted;
+	bool                         region_active;
 };
 
 /* Register access sizes */
@@ -96,4 +97,6 @@ int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl,
 			       resource_size_t size);
 void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl);
 
+__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off);
+
 #endif /* __LINUX_VFIO_CXL_PRIV_H */
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index b7364178e23d..48e0274c19aa 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1223,6 +1223,9 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
 
 	vfio_pci_zap_and_down_write_memory_lock(vdev);
 
+	/* Zap CXL DPA region PTEs before hardware reset clears HDM state */
+	vfio_cxl_zap_region_locked(vdev);
+
 	/*
 	 * This function can be invoked while the power state is non-D0. If
 	 * pci_try_reset_function() has been called while the power state is
@@ -1236,6 +1239,14 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
 
 	vfio_pci_dma_buf_move(vdev, true);
 	ret = pci_try_reset_function(vdev->pdev);
+
+	/*
+	 * Re-enable DPA region if reset succeeded; fault handler will
+	 * re-insert PFNs on next access without requiring a new mmap.
+	 */
+	if (!ret)
+		vfio_cxl_reactivate_region(vdev);
+
 	if (__vfio_pci_memory_enabled(vdev))
 		vfio_pci_dma_buf_move(vdev, false);
 	up_write(&vdev->memory_lock);
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 1082ba43bafe..726063b6ff70 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -145,6 +145,8 @@ static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev,
 
 void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev);
 void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev);
+void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev);
+void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev);
 
 #else
 
@@ -152,6 +154,10 @@ static inline void
 vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { }
 static inline void
 vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { }
+static inline void
+vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { }
+static inline void
+vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { }
 
 #endif /* CONFIG_VFIO_CXL_CORE */
 
-- 
2.25.1