From: Manish Honap <mhonap@nvidia.com>
Detect a vendor-specific CXL device at vfio-pci bind time and probe
its HDM decoder register block.
vfio_cxl_create_device_state() allocates per-device state via devm and
reads MEM_CAPABLE and CACHE_CAPABLE from the CXL DVSEC.
vfio_cxl_setup_regs() locates the component register block, temporarily
maps it, calls cxl_probe_component_regs() to find the HDM block, then
releases the mapping.
vfio_pci_cxl_detect_and_init() chains these two steps. If either fails,
vdev->cxl stays NULL and the device falls back to plain vfio-pci.
Signed-off-by: Manish Honap <mhonap@nvidia.com>
---
drivers/vfio/pci/cxl/vfio_cxl_core.c | 217 +++++++++++++++++++++++++++
drivers/vfio/pci/cxl/vfio_cxl_priv.h | 12 ++
2 files changed, 229 insertions(+)
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c
index d12afec82ecd..b1c7603590b5 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_core.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c
@@ -21,6 +21,158 @@
#include "../vfio_pci_priv.h"
#include "vfio_cxl_priv.h"
+/*
+ * vfio_cxl_create_device_state - Allocate and validate CXL device state
+ *
+ * Returns the newly allocated vfio_pci_cxl_state on success, or an ERR_PTR
+ * on failure. The state is devres-managed; should any later setup step
+ * fail, the caller releases it early with devm_kfree(&pdev->dev, cxl).
+ * Undoing a devm allocation early this way is explicitly supported by the
+ * devres API.
+ *
+ * vdev->cxl is assigned by the caller only once every setup step has
+ * succeeded, so partially-initialised state is never reachable through
+ * vdev->cxl on any failure path.
+ */
+static struct vfio_pci_cxl_state *
+vfio_cxl_create_device_state(struct pci_dev *pdev, u16 dvsec)
+{
+	struct vfio_pci_cxl_state *cxl;
+	u32 header1;
+	u16 caps;
+
+	/* Released automatically with pdev->dev unless freed early. */
+	cxl = devm_cxl_dev_state_create(&pdev->dev,
+					CXL_DEVTYPE_DEVMEM,
+					pdev->dev.id, dvsec,
+					struct vfio_pci_cxl_state,
+					cxlds, false);
+	if (!cxl)
+		return ERR_PTR(-ENOMEM);
+
+	pci_read_config_dword(pdev, dvsec + PCI_DVSEC_HEADER1, &header1);
+	cxl->dvsec_len = PCI_DVSEC_HEADER1_LEN(header1);
+
+	pci_read_config_word(pdev, dvsec + CXL_DVSEC_CAPABILITY_OFFSET,
+			     &caps);
+
+	/*
+	 * Only handle vendor devices (class != 0x0502) with Mem_Capable set.
+	 * CACHE_CAPABLE is forwarded to the VMM so it knows whether a WBI
+	 * sequence is needed before FLR.
+	 */
+	if ((pdev->class >> 8) == PCI_CLASS_MEMORY_CXL ||
+	    !FIELD_GET(CXL_DVSEC_MEM_CAPABLE, caps)) {
+		devm_kfree(&pdev->dev, cxl);
+		return ERR_PTR(-ENODEV);
+	}
+
+	cxl->cache_capable = FIELD_GET(CXL_DVSEC_CACHE_CAPABLE, caps);
+
+	return cxl;
+}
+
+/*
+ * vfio_cxl_setup_regs - Locate and probe the CXL component register block
+ *
+ * Finds the component register block via the Register Locator DVSEC, maps
+ * it transiently, probes it for an HDM decoder capability, and records the
+ * HDM and component-register geometry in @cxl. The mapping and region
+ * reservation are released before returning on both the success and the
+ * error path (single goto-based unwind; the success path previously
+ * duplicated the cleanup statements).
+ *
+ * Requires the device to be enabled (pci_enable_device_mem() done by the
+ * caller). Returns 0 on success or a negative errno.
+ */
+static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev,
+			       struct vfio_pci_cxl_state *cxl)
+{
+	struct cxl_register_map *map = &cxl->cxlds.reg_map;
+	resource_size_t offset, bar_offset, size;
+	struct pci_dev *pdev = vdev->pdev;
+	void __iomem *base;
+	int ret;
+	u8 count;
+	u8 bar;
+
+	if (WARN_ON_ONCE(!pci_is_enabled(pdev)))
+		return -EINVAL;
+
+	/* Find component register block via Register Locator DVSEC */
+	ret = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, map);
+	if (ret)
+		return ret;
+
+	/*
+	 * Request the region and map. This is a transient mapping
+	 * used only to probe register capabilities; released by the
+	 * common unwind below once probing completes.
+	 */
+	if (!request_mem_region(map->resource, map->max_size, "vfio-cxl-probe"))
+		return -EBUSY;
+
+	base = ioremap(map->resource, map->max_size);
+	if (!base) {
+		ret = -ENOMEM;
+		goto out_release;
+	}
+
+	/* Probe component register capabilities */
+	cxl_probe_component_regs(&pdev->dev, base, &map->component_map);
+
+	/* Check if HDM decoder was found */
+	if (!map->component_map.hdm_decoder.valid) {
+		ret = -ENODEV;
+		goto out_unmap;
+	}
+
+	pci_dbg(pdev, "vfio_cxl: HDM decoder at offset=0x%lx, size=0x%lx\n",
+		map->component_map.hdm_decoder.offset,
+		map->component_map.hdm_decoder.size);
+
+	/* Get HDM register info */
+	ret = cxl_get_hdm_info(&cxl->cxlds, &count, &offset, &size);
+	if (ret)
+		goto out_unmap;
+
+	if (!count || !size) {
+		ret = -ENODEV;
+		goto out_unmap;
+	}
+
+	cxl->hdm_count = count;
+	/*
+	 * cxl_get_hdm_info() returns rmap->offset = CXL_CM_OFFSET + <hdm_within_cm>
+	 * (see cxl_probe_component_regs() which does base += CXL_CM_OFFSET before
+	 * reading caps and stores CXL_CM_OFFSET + cap_ptr as the offset).
+	 * Subtract CXL_CM_OFFSET so hdm_reg_offset is relative to the CXL.mem
+	 * register area start, which is where comp_reg_virt[0] is anchored.
+	 * The physical BAR address for hdm_iobase is recovered by adding
+	 * CXL_CM_OFFSET back in vfio_cxl_setup_virt_regs().
+	 */
+	cxl->hdm_reg_offset = offset - CXL_CM_OFFSET;
+	cxl->hdm_reg_size = size;
+
+	ret = cxl_regblock_get_bar_info(map, &bar, &bar_offset);
+	if (ret)
+		goto out_unmap;
+
+	cxl->comp_reg_bar = bar;
+	cxl->comp_reg_offset = bar_offset;
+	cxl->comp_reg_size = CXL_COMPONENT_REG_BLOCK_SIZE;
+
+	ret = 0;
+
+out_unmap:
+	iounmap(base);
+out_release:
+	release_mem_region(map->resource, map->max_size);
+
+	return ret;
+}
+
+/*
+ * vfio_cxl_dev_state_free - Release CXL state early on probe failure
+ *
+ * @cxl was allocated devres-managed by vfio_cxl_create_device_state().
+ * devm_kfree() on a live devres allocation removes it from the list
+ * immediately, so the normal devres teardown at unbind time won't
+ * double-free it.
+ */
+static void vfio_cxl_dev_state_free(struct pci_dev *pdev,
+				    struct vfio_pci_cxl_state *cxl)
+{
+	devm_kfree(&pdev->dev, cxl);
+}
+
/**
* vfio_pci_cxl_detect_and_init - Detect and initialize a vendor-specific
* CXL.mem device
@@ -32,10 +184,75 @@
*/
void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev)
{
+	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_cxl_state *cxl;
+	u16 dvsec;
+	int ret;
+
+	if (!pcie_is_cxl(pdev))
+		return;
+
+	dvsec = pci_find_dvsec_capability(pdev,
+					  PCI_VENDOR_ID_CXL,
+					  PCI_DVSEC_CXL_DEVICE);
+	if (!dvsec)
+		return;
+
+	/*
+	 * CXL DVSEC found: any failure from here is a hard probe error on
+	 * a confirmed CXL-capable device, not a silent non-CXL fallback.
+	 * Warn the operator so misconfiguration is visible.
+	 */
+	cxl = vfio_cxl_create_device_state(pdev, dvsec);
+	if (IS_ERR(cxl)) {
+		if (PTR_ERR(cxl) != -ENODEV)
+			pci_warn(pdev,
+				 "vfio-cxl: CXL device state allocation failed: %ld\n",
+				 PTR_ERR(cxl));
+		return;
+	}
+
+	/*
+	 * Required for ioremap of the component register block and
+	 * calls to cxl_probe_component_regs(). Disabled again as soon
+	 * as register probing completes, regardless of its outcome.
+	 */
+	ret = pci_enable_device_mem(pdev);
+	if (ret) {
+		pci_warn(pdev,
+			 "vfio-cxl: pci_enable_device_mem failed: %d\n", ret);
+		goto free_cxl;
+	}
+
+	ret = vfio_cxl_setup_regs(vdev, cxl);
+	pci_disable_device(pdev);
+	if (ret) {
+		pci_warn(pdev,
+			 "vfio-cxl: HDM register probing failed: %d\n", ret);
+		goto free_cxl;
+	}
+
+	/*
+	 * Register probing succeeded. Assign vdev->cxl only now, so
+	 * partially-initialised state is never visible through vdev->cxl
+	 * and all subsequent helpers can access state via vdev->cxl.
+	 */
+	vdev->cxl = cxl;
+
+	return;
+
+free_cxl:
+	vfio_cxl_dev_state_free(pdev, cxl);
}
void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev)
{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+	/* vdev->cxl is NULL when detect_and_init() bailed: nothing to do. */
+	if (!cxl)
+		return;
+	/*
+	 * NOTE(review): no teardown is performed here yet; presumably later
+	 * patches in this series add it (the devm allocation itself is
+	 * released automatically at unbind) — confirm against the full series.
+	 */
}
MODULE_IMPORT_NS("CXL");
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
index 4cecc25db410..54b1f6d885aa 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h
+++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
@@ -21,8 +21,20 @@ struct vfio_pci_cxl_state {
size_t hdm_reg_size;
resource_size_t comp_reg_offset;
size_t comp_reg_size;
+ u16 dvsec_len;
u8 hdm_count;
u8 comp_reg_bar;
+ bool cache_capable;
};
+/*
+ * CXL DVSEC for CXL Devices - register offsets within the DVSEC
+ * (spec section "DVSEC for CXL Devices", cited by the author as
+ * CXL 4.0 8.1.3 — confirm the exact revision/section number).
+ * Offsets are relative to the DVSEC capability base (cxl->dvsec).
+ */
+#define CXL_DVSEC_CAPABILITY_OFFSET 0xa
+#define CXL_DVSEC_MEM_CAPABLE BIT(2)
+/* DVSEC Capability bit 0: device supports the CXL.cache protocol */
+#define CXL_DVSEC_CACHE_CAPABLE BIT(0)
+
#endif /* __LINUX_VFIO_CXL_PRIV_H */
--
2.25.1