[PATCH 11/20] vfio/cxl: Expose DPA memory region to userspace with fault+zap mmap

mhonap@nvidia.com posted 20 patches 3 weeks, 5 days ago
There is a newer version of this series
[PATCH 11/20] vfio/cxl: Expose DPA memory region to userspace with fault+zap mmap
Posted by mhonap@nvidia.com 3 weeks, 5 days ago
From: Manish Honap <mhonap@nvidia.com>

To directly access the device memory, a CXL region is required. For
userspace (e.g. QEMU) to access that CXL region, it must be exposed
through the VFIO interfaces.

Introduce a new VFIO device region and region ops to expose the created
CXL region. Introduce a new sub region type for userspace to identify
a CXL region.

CXL region lifecycle:
- The CXL memory region is registered with VFIO layer during
  vfio_pci_open_device
- mmap() establishes the VMA with vm_ops but inserts no PTEs
- Each guest page fault calls vfio_cxl_region_page_fault() which
  inserts a single PFN under the memory_lock read side
- On device reset, vfio_cxl_zap_region_locked() sets region_active=false
  and calls unmap_mapping_range() to invalidate all DPA PTEs atomically
  while holding memory_lock for writing
- Faults racing with reset see region_active==false and return
  VM_FAULT_SIGBUS
- vfio_cxl_reactivate_region() restores region_active after successful
  hardware reset

Also integrate the zap/reactivate calls into vfio_pci_ioctl_reset() so
that FLR correctly invalidates DPA mappings and restores them on success.

Co-developed-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Manish Honap <mhonap@nvidia.com>
---
 drivers/vfio/pci/cxl/vfio_cxl_core.c | 222 +++++++++++++++++++++++++++
 drivers/vfio/pci/cxl/vfio_cxl_priv.h |   2 +
 drivers/vfio/pci/vfio_pci.c          |   9 ++
 drivers/vfio/pci/vfio_pci_core.c     |  11 ++
 drivers/vfio/pci/vfio_pci_priv.h     |  13 ++
 5 files changed, 257 insertions(+)

diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c
index 9c71f592e74e..03846bd11c8a 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_core.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c
@@ -44,6 +44,7 @@ static int vfio_cxl_create_device_state(struct vfio_pci_core_device *vdev,
 
 	cxl = vdev->cxl;
 	cxl->dvsec = dvsec;
+	cxl->dpa_region_idx = -1;
 
 	pci_read_config_word(pdev, dvsec + CXL_DVSEC_CAPABILITY_OFFSET,
 			     &cap_word);
@@ -300,3 +301,224 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev)
 
 	vfio_cxl_destroy_cxl_region(vdev);
 }
+
+/*
+ * Fault handler for the DPA region VMA.  Called with mm->mmap_lock held
+ * for reading.  Taking memory_lock read side here excludes the write
+ * side held by vfio_cxl_zap_region_locked() during reset, so a fault
+ * either observes region_active==false or inserts a PFN that the zap
+ * will tear down.
+ */
+static vm_fault_t vfio_cxl_region_page_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct vfio_pci_core_device *vdev = vma->vm_private_data;
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	unsigned long pfn;
+
+	guard(rwsem_read)(&vdev->memory_lock);
+
+	if (!READ_ONCE(cxl->region_active))
+		return VM_FAULT_SIGBUS;
+
+	/*
+	 * Do not scrub the page here.  A per-fault memset_io() is racy:
+	 * two threads faulting the same page concurrently can interleave
+	 * so that the second scrub zeroes data the first thread's user
+	 * already wrote through the freshly inserted PTE, and every
+	 * re-fault after a reset zap would wipe guest memory that is
+	 * expected to survive.  Stale-data scrubbing across device
+	 * open/close boundaries belongs at region registration time,
+	 * where it runs exactly once with no user mapping live.
+	 */
+	pfn = PHYS_PFN(cxl->region_hpa) +
+		((vmf->address - vma->vm_start) >> PAGE_SHIFT);
+
+	return vmf_insert_pfn(vma, vmf->address, pfn);
+}
+
+static const struct vm_operations_struct vfio_cxl_region_vm_ops = {
+	.fault = vfio_cxl_region_page_fault,
+};
+
+/*
+ * Set up the VMA for the DPA region.  No PTEs are inserted here (no
+ * remap_pfn_range); they are populated lazily by
+ * vfio_cxl_region_page_fault() so that vfio_cxl_zap_region_locked()
+ * can invalidate them during device reset without userspace
+ * cooperation.  vm_page_prot is left at its default.
+ */
+static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev,
+				struct vfio_pci_region *region,
+				struct vm_area_struct *vma)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	unsigned long req_len, pgoff;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+		return -EINVAL;
+
+	if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len))
+		return -EOVERFLOW;
+
+	/*
+	 * The fault handler translates (address - vm_start) directly into
+	 * an offset from region_hpa, i.e. it assumes the VMA begins at the
+	 * start of the region.  Reject any sub-region offset so that
+	 * assumption holds, and reject lengths reaching past the region.
+	 */
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	if (pgoff || req_len > cxl->region_size)
+		return -EINVAL;
+
+	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+	vma->vm_private_data = vdev;
+	vma->vm_ops = &vfio_cxl_region_vm_ops;
+
+	return 0;
+}
+
+/*
+ * vfio_cxl_zap_region_locked - Invalidate every PTE of the DPA region.
+ *
+ * Caller must hold vdev->memory_lock for writing.  region_active is
+ * cleared before the PTEs are torn down, so any fault racing with the
+ * zap observes the inactive state and returns VM_FAULT_SIGBUS instead
+ * of re-inserting a stale PFN.
+ */
+void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	loff_t base;
+
+	lockdep_assert_held_write(&vdev->memory_lock);
+
+	if (!cxl)
+		return;
+	if (cxl->dpa_region_idx < 0)
+		return;
+
+	WRITE_ONCE(cxl->region_active, false);
+
+	/* Offset of the device region inside the VFIO device file. */
+	base = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS +
+					cxl->dpa_region_idx);
+	unmap_mapping_range(vdev->vdev.inode->i_mapping, base,
+			    cxl->region_size, true);
+}
+
+/*
+ * vfio_cxl_reactivate_region - Re-enable DPA region after successful reset.
+ *
+ * Must be called with vdev->memory_lock held for writing.  Sets
+ * region_active so that subsequent faults can re-insert PFNs without a
+ * new mmap.
+ *
+ * NOTE(review): FLR also clears the HDM decoder state; presumably the
+ * decoders must be re-programmed before user faults touch device memory
+ * again -- confirm where that re-programming happens relative to this
+ * reactivation.
+ */
+void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+	lockdep_assert_held_write(&vdev->memory_lock);
+
+	if (!cxl || cxl->dpa_region_idx < 0)
+		return;
+
+	/*
+	 * Pair with vfio_cxl_zap_region_locked(): without this store the
+	 * region stays dead after reset and every fault SIGBUSes forever.
+	 */
+	WRITE_ONCE(cxl->region_active, true);
+}
+
+/*
+ * read()/write() access to the DPA region through the kernel mapping.
+ * memory_lock read side excludes a concurrent reset zap; region_active
+ * gates access the same way the fault handler does.
+ */
+static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
+				  char __user *buf, size_t count, loff_t *ppos,
+				  bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct vfio_pci_cxl_state *cxl = core_dev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+	if (!count)
+		return 0;
+
+	/*
+	 * Bound the access to the region: pos/count come from userspace
+	 * and must not read or write past the end of the kernel mapping.
+	 */
+	if (pos >= cxl->region_size)
+		return -EINVAL;
+	count = min_t(size_t, count, cxl->region_size - pos);
+
+	guard(rwsem_read)(&core_dev->memory_lock);
+
+	if (!READ_ONCE(cxl->region_active))
+		return -EIO;
+
+	return vfio_pci_core_do_io_rw(core_dev, false,
+				      cxl->region_vaddr,
+				      buf, pos, count,
+				      0, 0, iswrite, VFIO_PCI_IO_WIDTH_8);
+}
+
+/* Tear down the kernel mapping of the DPA region; safe to call twice. */
+static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
+				    struct vfio_pci_region *region)
+{
+	struct vfio_pci_cxl_state *cxl = region->data;
+
+	if (!cxl->region_vaddr)
+		return;
+
+	iounmap(cxl->region_vaddr);
+	cxl->region_vaddr = NULL;
+}
+
+static const struct vfio_pci_regops vfio_cxl_regops = {
+	.rw		= vfio_cxl_region_rw,
+	.mmap		= vfio_cxl_region_mmap,
+	.release	= vfio_cxl_region_release,
+};
+
+/*
+ * Map the CXL region into the kernel and expose it as a VFIO device
+ * region.  Called from vfio_pci_open_device(); no user mapping exists
+ * yet, so this is the one race-free point to scrub stale device data.
+ */
+int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	u32 flags;
+	int ret;
+
+	if (!cxl)
+		return -ENODEV;
+
+	if (!cxl->region || cxl->region_vaddr)
+		return -ENODEV;
+
+	/*
+	 * NOTE(review): CXL DPA is cacheable system memory rather than
+	 * MMIO; memremap()/memremap_pages() (as DAX does) may be the more
+	 * appropriate mapping primitive than ioremap_cache() -- confirm.
+	 */
+	cxl->region_vaddr = ioremap_cache(cxl->region_hpa, cxl->region_size);
+	if (!cxl->region_vaddr)
+		return -ENOMEM;
+
+	/*
+	 * Scrub the whole region once at registration so stale device
+	 * data cannot leak across VFIO device open/close boundaries.
+	 * Doing it here (not per fault) keeps the fault path free of the
+	 * scrub race between concurrent faults on the same page.
+	 */
+	memset_io(cxl->region_vaddr, 0, cxl->region_size);
+
+	flags = VFIO_REGION_INFO_FLAG_READ |
+		VFIO_REGION_INFO_FLAG_WRITE |
+		VFIO_REGION_INFO_FLAG_MMAP;
+
+	ret = vfio_pci_core_register_dev_region(vdev,
+						PCI_VENDOR_ID_CXL |
+						VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+						VFIO_REGION_SUBTYPE_CXL,
+						&vfio_cxl_regops,
+						cxl->region_size, flags,
+						cxl);
+	if (ret) {
+		iounmap(cxl->region_vaddr);
+		cxl->region_vaddr = NULL;
+		return ret;
+	}
+
+	/*
+	 * Cache the vdev->region[] index before activating the region.
+	 * vfio_pci_core_register_dev_region() placed the new entry at
+	 * vdev->region[num_regions - 1] and incremented num_regions.
+	 * vfio_cxl_zap_region_locked() uses this to avoid scanning
+	 * vdev->region[] on every FLR.
+	 */
+	cxl->dpa_region_idx = vdev->num_regions - 1;
+	WRITE_ONCE(cxl->region_active, true);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_cxl_register_cxl_region);
+
+/**
+ * vfio_cxl_unregister_cxl_region - Undo vfio_cxl_register_cxl_region()
+ * @vdev: VFIO PCI device
+ *
+ * Marks the DPA region inactive so any racing fault returns VM_FAULT_SIGBUS
+ * and resets dpa_region_idx.  Does NOT call release() or touch num_regions;
+ * vfio_pci_core_disable() will call the idempotent release() callback as
+ * normal during device close.
+ *
+ * Does NOT touch CXL subsystem state (cxl->region, cxl->cxled, cxl->cxlrd).
+ * The caller must call vfio_cxl_destroy_cxl_region() separately to release
+ * those objects.
+ *
+ * NOTE(review): region_active is cleared here without holding
+ * vdev->memory_lock, so a fault that already passed its region_active
+ * check can still insert a PFN after this returns; PTEs inserted before
+ * this call are not zapped here either.  Presumably the vfio core's
+ * mapping teardown on device close covers both -- confirm, or take
+ * memory_lock and call vfio_cxl_zap_region_locked() instead.
+ */
+void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+	if (!cxl || cxl->dpa_region_idx < 0)
+		return;
+
+	WRITE_ONCE(cxl->region_active, false);
+
+	cxl->dpa_region_idx = -1;
+}
+EXPORT_SYMBOL_GPL(vfio_cxl_unregister_cxl_region);
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
index 985680842a13..b870926bfb19 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h
+++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
@@ -26,9 +26,11 @@ struct vfio_pci_cxl_state {
 	resource_size_t              comp_reg_offset;
 	size_t                       comp_reg_size;
 	u32                          hdm_count;
+	int                          dpa_region_idx;
 	u16                          dvsec;
 	u8                           comp_reg_bar;
 	bool                         precommitted;
+	bool                         region_active;
 };
 
 /*
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 0c771064c0b8..d3138badeaa6 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -120,6 +120,15 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev)
 		}
 	}
 
+	if (vdev->cxl) {
+		ret = vfio_cxl_register_cxl_region(vdev);
+		if (ret) {
+			pci_warn(pdev, "Failed to setup CXL region\n");
+			vfio_pci_core_disable(vdev);
+			return ret;
+		}
+	}
+
 	vfio_pci_core_finish_enable(vdev);
 
 	return 0;
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index b7364178e23d..48e0274c19aa 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1223,6 +1223,9 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
 
 	vfio_pci_zap_and_down_write_memory_lock(vdev);
 
+	/* Zap CXL DPA region PTEs before hardware reset clears HDM state */
+	vfio_cxl_zap_region_locked(vdev);
+
 	/*
 	 * This function can be invoked while the power state is non-D0. If
 	 * pci_try_reset_function() has been called while the power state is
@@ -1236,6 +1239,14 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
 
 	vfio_pci_dma_buf_move(vdev, true);
 	ret = pci_try_reset_function(vdev->pdev);
+
+	/*
+	 * Re-enable DPA region if reset succeeded; fault handler will
+	 * re-insert PFNs on next access without requiring a new mmap.
+	 */
+	if (!ret)
+		vfio_cxl_reactivate_region(vdev);
+
 	if (__vfio_pci_memory_enabled(vdev))
 		vfio_pci_dma_buf_move(vdev, false);
 	up_write(&vdev->memory_lock);
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 818d99f098bf..441b4a47637a 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -140,6 +140,10 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev);
 int vfio_cxl_create_cxl_region(struct vfio_pci_core_device *vdev,
 			       resource_size_t size);
 void vfio_cxl_destroy_cxl_region(struct vfio_pci_core_device *vdev);
+int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev);
+void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev);
+void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev);
+void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev);
 
 #else
 
@@ -152,6 +156,15 @@ static inline int vfio_cxl_create_cxl_region(struct vfio_pci_core_device *vdev,
 { return 0; }
 static inline void
 vfio_cxl_destroy_cxl_region(struct vfio_pci_core_device *vdev) { }
+static inline int
+vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev)
+{ return 0; }
+static inline void
+vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) { }
+static inline void
+vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { }
+static inline void
+vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { }
 
 #endif /* CONFIG_VFIO_CXL_CORE */
 
-- 
2.25.1
Re: [PATCH 11/20] vfio/cxl: Expose DPA memory region to userspace with fault+zap mmap
Posted by Dave Jiang 3 weeks, 4 days ago

On 3/11/26 1:34 PM, mhonap@nvidia.com wrote:
> From: Manish Honap <mhonap@nvidia.com> 
< --snip-- >

> +int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev)
> +{
> +	struct vfio_pci_cxl_state *cxl = vdev->cxl;
> +	u32 flags;
> +	int ret;
> +
> +	if (!cxl)
> +		return -ENODEV;
> +
> +	if (!cxl->region || cxl->region_vaddr)
> +		return -ENODEV;
> +
> +	cxl->region_vaddr = ioremap_cache(cxl->region_hpa, cxl->region_size);

Should this be using memremap_pages() family of call rather than ioremap() like how DAX does it? CXL mem regions are not MMIO regions.

DJ