From: Manish Honap <mhonap@nvidia.com>
Expose CXL device capability through the VFIO device info ioctl and give
userspace access to the GPU/accelerator register windows in the component
BAR while protecting the CXL component register block.
vfio_cxl_get_info() fills VFIO_DEVICE_INFO_CAP_CXL with the HDM register
BAR index and byte offset, commit flags, and VFIO region indices for the
DPA and COMP_REGS regions. HDM decoder count and the HDM block offset
within COMP_REGS are not populated; both are derivable from the CXL
Capability Array in the COMP_REGS region itself.
vfio_cxl_get_region_info() handles VFIO_DEVICE_GET_REGION_INFO for the
component register BAR. It builds a sparse-mmap capability that advertises
only the GPU/accelerator register windows, carving out the CXL component
register block. Three physical layouts are handled:
Topology A - comp block at BAR end: one area [0, comp_reg_offset)
Topology B - comp block at BAR start: one area [comp_end, bar_len)
Topology C - comp block in the middle: two areas, one on each side of the block
vfio_cxl_mmap_overlaps_comp_regs() checks whether an mmap request overlaps
[comp_reg_offset, comp_reg_offset + comp_reg_size). vfio_pci_core_mmap()
calls it to reject access to the component register block while allowing
mmap of the GPU register windows in the sparse capability. This replaces
the earlier blanket rejection of any mmap on the component BAR index.
Hook both helpers into vfio_pci_ioctl_get_info() and
vfio_pci_ioctl_get_region_info() in vfio_pci_core.c.
The component BAR cannot be claimed exclusively since the CXL subsystem
holds persistent sub-range iomem claims during HDM decoder setup.
pci_request_selected_regions() therefore fails with -EBUSY; pass bars=0 to
make the request a no-op and map directly via pci_iomap(). Exclusive
physical ownership is still assured by driver binding.
Signed-off-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Manish Honap <mhonap@nvidia.com>
---
drivers/vfio/pci/cxl/vfio_cxl_core.c | 155 +++++++++++++++++++++++++++
drivers/vfio/pci/vfio_pci_core.c | 31 +++++-
drivers/vfio/pci/vfio_pci_priv.h | 24 +++++
drivers/vfio/pci/vfio_pci_rdwr.c | 16 ++-
4 files changed, 221 insertions(+), 5 deletions(-)
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c
index b38a04301660..46430cbfa962 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_core.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c
@@ -21,6 +21,161 @@
#include "../vfio_pci_priv.h"
#include "vfio_cxl_priv.h"
+u8 vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev)
+{
+ return vdev->cxl->comp_reg_bar;
+}
+
+int vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
+{
+ unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ struct vfio_region_info_cap_sparse_mmap *sparse;
+ struct vfio_pci_cxl_state *cxl = vdev->cxl;
+ resource_size_t bar_len, comp_end;
+ u32 nr_areas, cap_size;
+ int ret;
+
+ if (!cxl)
+ return -ENOTTY;
+
+ if (!info)
+ return -ENOTTY;
+
+ if (info->argsz < minsz)
+ return -EINVAL;
+
+ if (info->index != cxl->comp_reg_bar)
+ return -ENOTTY;
+
+ /*
+ * The device state is not fully initialised;
+ * fall through to the default BAR handler.
+ */
+ if (!cxl->comp_reg_size)
+ return -ENOTTY;
+
+ bar_len = pci_resource_len(vdev->pdev, info->index);
+ comp_end = cxl->comp_reg_offset + cxl->comp_reg_size;
+
+ /*
+ * Advertise the GPU/accelerator register windows as mmappable by
+ * carving the CXL component register block out of the BAR. The
+ * number of sparse areas depends on where the block sits:
+ *
+ * [A] comp block at BAR end [gpu_regs | comp_regs]:
+ * comp_reg_offset > 0 && comp_end == bar_len
+ * = 1 area: [0, comp_reg_offset)
+ *
+ * [B] comp block at BAR start [comp_regs | gpu_regs]:
+ * comp_reg_offset == 0 && comp_end < bar_len
+ * = 1 area: [comp_end, bar_len)
+ *
+ * [C] comp block in middle [gpu_regs | comp_regs | gpu_regs]:
+ * comp_reg_offset > 0 && comp_end < bar_len
+ * = 2 areas: [0, comp_reg_offset) and [comp_end, bar_len)
+ */
+ if (cxl->comp_reg_offset > 0 && comp_end < bar_len)
+ nr_areas = 2;
+ else
+ nr_areas = 1;
+
+ cap_size = struct_size(sparse, areas, nr_areas);
+ sparse = kzalloc(cap_size, GFP_KERNEL);
+ if (!sparse)
+ return -ENOMEM;
+
+ sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+ sparse->header.version = 1;
+ sparse->nr_areas = nr_areas;
+
+ if (nr_areas == 2) {
+ /* [C]: window before and after comp block */
+ sparse->areas[0].offset = 0;
+ sparse->areas[0].size = cxl->comp_reg_offset;
+ sparse->areas[1].offset = comp_end;
+ sparse->areas[1].size = bar_len - comp_end;
+ } else if (cxl->comp_reg_offset == 0) {
+ /* [B]: comp block at BAR start, window follows */
+ sparse->areas[0].offset = comp_end;
+ sparse->areas[0].size = bar_len - comp_end;
+ } else {
+ /* [A]: comp block at BAR end, window precedes */
+ sparse->areas[0].offset = 0;
+ sparse->areas[0].size = cxl->comp_reg_offset;
+ }
+
+ ret = vfio_info_add_capability(caps, &sparse->header, cap_size);
+ kfree(sparse);
+ if (ret)
+ return ret;
+
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = bar_len;
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP;
+
+ return 0;
+}
+
+bool vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev,
+ u64 req_start, u64 req_len)
+{
+ struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+ if (!cxl->comp_reg_size)
+ return false;
+
+ return req_start < cxl->comp_reg_offset + cxl->comp_reg_size &&
+ req_start + req_len > cxl->comp_reg_offset;
+}
+
+int vfio_cxl_get_info(struct vfio_pci_core_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_pci_cxl_state *cxl = vdev->cxl;
+ struct vfio_device_info_cap_cxl cxl_cap = {0};
+
+ if (!cxl)
+ return 0;
+
+ /*
+ * Device is not fully initialised?
+ */
+ if (WARN_ON(cxl->dpa_region_idx < 0 || cxl->comp_reg_region_idx < 0))
+ return -ENODEV;
+
+ /* Fill in from CXL device structure */
+ cxl_cap.header.id = VFIO_DEVICE_INFO_CAP_CXL;
+ cxl_cap.header.version = 1;
+ /*
+ * COMP_REGS region starts at comp_reg_offset + CXL_CM_OFFSET within
+ * the BAR. This is the byte offset of the CXL.mem register area (where
+ * the CXL Capability Array Header lives) within the component register
+ * block. Userspace derives hdm_decoder_offset and hdm_count from the
+ * COMP_REGS region itself (CXL Capability Array traversal + HDMC read).
+ */
+ cxl_cap.hdm_regs_offset = cxl->comp_reg_offset + CXL_CM_OFFSET;
+ cxl_cap.hdm_regs_bar_index = cxl->comp_reg_bar;
+
+ if (cxl->precommitted)
+ cxl_cap.flags |= VFIO_CXL_CAP_FIRMWARE_COMMITTED;
+ if (cxl->cache_capable)
+ cxl_cap.flags |= VFIO_CXL_CAP_CACHE_CAPABLE;
+
+ /*
+ * Populate absolute VFIO region indices so userspace can query them
+ * directly with VFIO_DEVICE_GET_REGION_INFO.
+ */
+ cxl_cap.dpa_region_index = VFIO_PCI_NUM_REGIONS + cxl->dpa_region_idx;
+ cxl_cap.comp_regs_region_index =
+ VFIO_PCI_NUM_REGIONS + cxl->comp_reg_region_idx;
+
+ return vfio_info_add_capability(caps, &cxl_cap.header, sizeof(cxl_cap));
+}
+
/*
* Scope-based cleanup wrappers for the CXL resource APIs
*/
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 48e0274c19aa..570775cc8711 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -591,7 +591,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_dummy_resource *dummy_res, *tmp;
struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
- int i, bar;
+ int i, bar, bars;
/* For needs_reset */
lockdep_assert_held(&vdev->vdev.dev_set->lock);
@@ -650,8 +650,10 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
bar = i + PCI_STD_RESOURCES;
if (!vdev->barmap[bar])
continue;
+ bars = (vdev->cxl && i == vfio_cxl_get_component_reg_bar(vdev)) ?
+ 0 : (1 << bar);
pci_iounmap(pdev, vdev->barmap[bar]);
- pci_release_selected_regions(pdev, 1 << bar);
+ pci_release_selected_regions(pdev, bars);
vdev->barmap[bar] = NULL;
}
@@ -989,6 +991,13 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
if (vdev->reset_works)
info.flags |= VFIO_DEVICE_FLAGS_RESET;
+ if (vdev->cxl) {
+ ret = vfio_cxl_get_info(vdev, &caps);
+ if (ret)
+ return ret;
+ info.flags |= VFIO_DEVICE_FLAGS_CXL;
+ }
+
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
info.num_irqs = VFIO_PCI_NUM_IRQS;
@@ -1034,6 +1043,12 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
struct pci_dev *pdev = vdev->pdev;
int i, ret;
+ if (vdev->cxl) {
+ ret = vfio_cxl_get_region_info(vdev, info, caps);
+ if (ret != -ENOTTY)
+ return ret;
+ }
+
switch (info->index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
@@ -1768,6 +1783,18 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
if (req_start + req_len > phys_len)
return -EINVAL;
+ /*
+ * CXL devices: mmap is permitted for the GPU/accelerator register
+ * windows listed in the sparse-mmap capability. Block any request
+ * that overlaps the CXL component register block
+ * [comp_reg_offset, comp_reg_offset + comp_reg_size); those registers
+ * must be accessed exclusively through the COMP_REGS device region so
+ * that the emulation layer (notify_change) intercepts every write.
+ */
+ if (vdev->cxl && index == vfio_cxl_get_component_reg_bar(vdev) &&
+ vfio_cxl_mmap_overlaps_comp_regs(vdev, req_start, req_len))
+ return -EINVAL;
+
/*
* Even though we don't make use of the barmap for the mmap,
* we need to request the region and the barmap tracks that.
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index ae0091d5096c..2d4aadd1b35a 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -151,6 +151,14 @@ void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev);
int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev);
void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev);
int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev);
+int vfio_cxl_get_info(struct vfio_pci_core_device *vdev,
+ struct vfio_info_cap *caps);
+int vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps);
+u8 vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev);
+bool vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev,
+ u64 req_start, u64 req_len);
#else
@@ -172,6 +180,22 @@ vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) { }
static inline int
vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev)
{ return 0; }
+static inline int
+vfio_cxl_get_info(struct vfio_pci_core_device *vdev,
+ struct vfio_info_cap *caps)
+{ return -ENOTTY; }
+static inline int
+vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
+{ return -ENOTTY; }
+static inline u8
+vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev)
+{ return U8_MAX; }
+static inline bool
+vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev,
+ u64 req_start, u64 req_len)
+{ return false; }
#endif /* CONFIG_VFIO_CXL_CORE */
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index b38627b35c35..e95bdbdbcdb2 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -201,19 +201,29 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw);
int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar)
{
struct pci_dev *pdev = vdev->pdev;
- int ret;
+ int ret, bars;
void __iomem *io;
if (vdev->barmap[bar])
return 0;
- ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
+ /*
+ * The CXL component register BAR cannot be claimed exclusively: the
+ * CXL subsystem holds persistent sub-range iomem claims during HDM
+ * decoder setup. pci_request_selected_regions() for the full BAR
+ * fails with EBUSY. Pass bars=0 to make the request a no-op and map
+ * directly via pci_iomap().
+ */
+ bars = (vdev->cxl && bar == vfio_cxl_get_component_reg_bar(vdev)) ?
+ 0 : (1 << bar);
+
+ ret = pci_request_selected_regions(pdev, bars, "vfio");
if (ret)
return ret;
io = pci_iomap(pdev, bar, 0);
if (!io) {
- pci_release_selected_regions(pdev, 1 << bar);
+ pci_release_selected_regions(pdev, bars);
return -ENOMEM;
}
--
2.25.1