[PATCH v2 09/20] vfio/cxl: Detect CXL DVSEC and probe HDM block

mhonap@nvidia.com posted 20 patches 6 hours ago
[PATCH v2 09/20] vfio/cxl: Detect CXL DVSEC and probe HDM block
Posted by mhonap@nvidia.com 6 hours ago
From: Manish Honap <mhonap@nvidia.com>

Detect a vendor-specific CXL device at vfio-pci bind time and probe
its HDM decoder register block.

vfio_cxl_create_device_state() allocates per-device state via devm and
reads MEM_CAPABLE and CACHE_CAPABLE from the CXL DVSEC.

vfio_cxl_setup_regs() locates the component register block, temporarily
maps it, calls cxl_probe_component_regs() to find the HDM block, then
releases the mapping.

vfio_pci_cxl_detect_and_init() chains these two steps. If either fails,
vdev->cxl stays NULL and the device falls back to plain vfio-pci.

Signed-off-by: Manish Honap <mhonap@nvidia.com>
---
 drivers/vfio/pci/cxl/vfio_cxl_core.c | 217 +++++++++++++++++++++++++++
 drivers/vfio/pci/cxl/vfio_cxl_priv.h |  12 ++
 2 files changed, 229 insertions(+)

diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c
index d12afec82ecd..b1c7603590b5 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_core.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c
@@ -21,6 +21,158 @@
 #include "../vfio_pci_priv.h"
 #include "vfio_cxl_priv.h"
 
+/*
+ * vfio_cxl_create_device_state - Allocate and validate CXL device state
+ *
+ * Returns a pointer to the allocated vfio_pci_cxl_state on success, or
+ * ERR_PTR on failure.  The allocation uses devm; if a later setup step
+ * fails, the caller must call devm_kfree(&pdev->dev, cxl) to release the
+ * resource before device unbind.  Using devm_kfree() to undo a devm
+ * allocation early is explicitly supported by the devres API.
+ *
+ * The caller assigns vdev->cxl only after all setup steps succeed,
+ * preventing partially-initialised state from being visible through
+ * vdev->cxl on any failure path.
+ */
+static struct vfio_pci_cxl_state *
+vfio_cxl_create_device_state(struct pci_dev *pdev, u16 dvsec)
+{
+	struct vfio_pci_cxl_state *cxl;
+	u16 cap_word;
+	u32 hdr1;
+
+	/*
+	 * Only handle vendor devices (class != 0x0502) with Mem_Capable
+	 * set.  Check the DVSEC capability word before allocating so a
+	 * non-qualifying device costs no devm allocation/free churn and
+	 * this function needs no error-path devm_kfree() of its own.
+	 */
+	pci_read_config_word(pdev, dvsec + CXL_DVSEC_CAPABILITY_OFFSET,
+			     &cap_word);
+	if (!FIELD_GET(CXL_DVSEC_MEM_CAPABLE, cap_word) ||
+	    (pdev->class >> 8) == PCI_CLASS_MEMORY_CXL)
+		return ERR_PTR(-ENODEV);
+
+	/* Freed automatically when pdev->dev is released. */
+	cxl = devm_cxl_dev_state_create(&pdev->dev,
+					CXL_DEVTYPE_DEVMEM,
+					pdev->dev.id, dvsec,
+					struct vfio_pci_cxl_state,
+					cxlds, false);
+	if (!cxl)
+		return ERR_PTR(-ENOMEM);
+
+	pci_read_config_dword(pdev, dvsec + PCI_DVSEC_HEADER1, &hdr1);
+	cxl->dvsec_len = PCI_DVSEC_HEADER1_LEN(hdr1);
+
+	/*
+	 * CACHE_CAPABLE is forwarded to the VMM so it knows whether a WBI
+	 * sequence is needed before FLR.
+	 */
+	cxl->cache_capable = FIELD_GET(CXL_DVSEC_CACHE_CAPABLE, cap_word);
+
+	return cxl;
+}
+
+/*
+ * vfio_cxl_setup_regs - Locate and probe the HDM decoder register block
+ * @vdev: vfio-pci core device being set up
+ * @cxl: per-device CXL state; hdm_* and comp_reg_* fields are filled in
+ *
+ * Finds the component register block via the Register Locator DVSEC,
+ * transiently maps it, probes its capabilities, and records the HDM
+ * decoder and component register geometry in @cxl.  The mapping and the
+ * mem region are released before returning on every path, success or
+ * failure.  The caller must have PCI-enabled the device (checked by the
+ * WARN_ON_ONCE below).
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev,
+			       struct vfio_pci_cxl_state *cxl)
+{
+	struct cxl_register_map *map = &cxl->cxlds.reg_map;
+	resource_size_t offset, bar_offset, size;
+	struct pci_dev *pdev = vdev->pdev;
+	void __iomem *base;
+	int ret;
+	u8 count;
+	u8 bar;
+
+	if (WARN_ON_ONCE(!pci_is_enabled(pdev)))
+		return -EINVAL;
+
+	/* Find component register block via Register Locator DVSEC */
+	ret = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, map);
+	if (ret)
+		return ret;
+
+	/*
+	 * Request the region and map.  This is a transient mapping
+	 * used only to probe register capabilities; released immediately
+	 * after cxl_probe_component_regs() returns.
+	 */
+	if (!request_mem_region(map->resource, map->max_size, "vfio-cxl-probe"))
+		return -EBUSY;
+
+	base = ioremap(map->resource, map->max_size);
+	if (!base) {
+		ret = -ENOMEM;
+		goto failed_release;
+	}
+
+	/* Probe component register capabilities */
+	cxl_probe_component_regs(&pdev->dev, base, &map->component_map);
+
+	/* Check if HDM decoder was found */
+	if (!map->component_map.hdm_decoder.valid) {
+		ret = -ENODEV;
+		goto failed_unmap;
+	}
+
+	pci_dbg(pdev, "vfio_cxl: HDM decoder at offset=0x%lx, size=0x%lx\n",
+		map->component_map.hdm_decoder.offset,
+		map->component_map.hdm_decoder.size);
+
+	/* Get HDM register info */
+	ret = cxl_get_hdm_info(&cxl->cxlds, &count, &offset, &size);
+	if (ret)
+		goto failed_unmap;
+
+	/* A CXL.mem device with no usable decoders is not supported. */
+	if (!count || !size) {
+		ret = -ENODEV;
+		goto failed_unmap;
+	}
+
+	cxl->hdm_count = count;
+	/*
+	 * cxl_get_hdm_info() returns rmap->offset = CXL_CM_OFFSET + <hdm_within_cm>
+	 * (see cxl_probe_component_regs() which does base += CXL_CM_OFFSET before
+	 * reading caps and stores CXL_CM_OFFSET + cap_ptr as the offset).
+	 * Subtract CXL_CM_OFFSET so hdm_reg_offset is relative to the CXL.mem
+	 * register area start, which is where comp_reg_virt[0] is anchored.
+	 * The physical BAR address for hdm_iobase is recovered by adding
+	 * CXL_CM_OFFSET back in vfio_cxl_setup_virt_regs().
+	 */
+	cxl->hdm_reg_offset = offset - CXL_CM_OFFSET;
+	cxl->hdm_reg_size = size;
+
+	/* Record which BAR/offset holds the component block for later use. */
+	ret = cxl_regblock_get_bar_info(map, &bar, &bar_offset);
+	if (ret)
+		goto failed_unmap;
+
+	cxl->comp_reg_bar = bar;
+	cxl->comp_reg_offset = bar_offset;
+	cxl->comp_reg_size = CXL_COMPONENT_REG_BLOCK_SIZE;
+
+	iounmap(base);
+	release_mem_region(map->resource, map->max_size);
+
+	return 0;
+
+failed_unmap:
+	iounmap(base);
+failed_release:
+	release_mem_region(map->resource, map->max_size);
+
+	return ret;
+}
+
+/*
+ * vfio_cxl_dev_state_free - Free CXL state early on probe failure
+ *
+ * devm_kfree() on a live devres allocation removes it from the devres
+ * list immediately, so the normal devres teardown at unbind time won't
+ * double-free it.
+ */
+static void vfio_cxl_dev_state_free(struct pci_dev *pdev,
+				    struct vfio_pci_cxl_state *cxl)
+{
+	devm_kfree(&pdev->dev, cxl);
+}
+
 /**
  * vfio_pci_cxl_detect_and_init - Detect and initialize a vendor-specific
  *                                CXL.mem device
@@ -32,10 +184,75 @@
  */
 void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev)
 {
+	struct pci_dev *pdev = vdev->pdev;
+	struct vfio_pci_cxl_state *cxl;
+	u16 dvsec;
+	int ret;
+
+	if (!pcie_is_cxl(pdev))
+		return;
+
+	dvsec = pci_find_dvsec_capability(pdev,
+					  PCI_VENDOR_ID_CXL,
+					  PCI_DVSEC_CXL_DEVICE);
+	if (!dvsec)
+		return;
+
+	/*
+	 * CXL DVSEC found: any failure from here is a hard probe error on
+	 * a confirmed CXL-capable device, not a silent non-CXL fallback.
+	 * Warn the operator so misconfiguration is visible.
+	 */
+	cxl = vfio_cxl_create_device_state(pdev, dvsec);
+	if (IS_ERR(cxl)) {
+		if (PTR_ERR(cxl) != -ENODEV)
+			pci_warn(pdev,
+				 "vfio-cxl: CXL device state allocation failed: %ld\n",
+				 PTR_ERR(cxl));
+		return;
+	}
+
+	/*
+	 * Required for ioremap of the component register block and
+	 * calls to cxl_probe_component_regs().  Disabled again as soon
+	 * as register probing completes.
+	 */
+	ret = pci_enable_device_mem(pdev);
+	if (ret) {
+		pci_warn(pdev,
+			 "vfio-cxl: pci_enable_device_mem failed: %d\n", ret);
+		goto free_cxl;
+	}
+
+	ret = vfio_cxl_setup_regs(vdev, cxl);
+	pci_disable_device(pdev);
+	if (ret) {
+		pci_warn(pdev,
+			 "vfio-cxl: HDM register probing failed: %d\n", ret);
+		goto free_cxl;
+	}
+
+	/*
+	 * Register probing succeeded.  Assign vdev->cxl only now, after
+	 * every setup step has succeeded, so subsequent helpers never see
+	 * partially-initialised state through vdev->cxl.
+	 */
+	vdev->cxl = cxl;
+
+	return;
+
+free_cxl:
+	vfio_cxl_dev_state_free(pdev, cxl);
 }
 
 void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev)
 {
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+	/* Device never became CXL-enabled; nothing to tear down. */
+	if (!cxl)
+		return;
+
+	/*
+	 * NOTE(review): no explicit teardown here yet — the state itself
+	 * is devm-allocated and freed when pdev->dev is released.  Confirm
+	 * that any teardown added by later patches in this series lands in
+	 * this function.
+	 */
 }
 
 MODULE_IMPORT_NS("CXL");
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
index 4cecc25db410..54b1f6d885aa 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h
+++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
@@ -21,8 +21,20 @@ struct vfio_pci_cxl_state {
 	size_t                       hdm_reg_size;
 	resource_size_t              comp_reg_offset;
 	size_t                       comp_reg_size;
+	u16                          dvsec_len;
 	u8                           hdm_count;
 	u8                           comp_reg_bar;
+	bool                         cache_capable;
 };
 
+/*
+ * CXL DVSEC for CXL Devices - register offsets within the DVSEC
+ * (CXL 4.0 8.1.3).
+ * Offsets are relative to the DVSEC capability base (cxl->dvsec).
+ * The *_CAPABLE masks are single-bit fields in the DVSEC CXL Capability
+ * register, extracted with FIELD_GET() in vfio_cxl_create_device_state().
+ */
+#define CXL_DVSEC_CAPABILITY_OFFSET 0xa
+/* Bit 2: device supports CXL.mem (HDM) */
+#define CXL_DVSEC_MEM_CAPABLE	    BIT(2)
+/* CXL DVSEC Capability register bit 0: device supports CXL.cache (HDM-DB) */
+#define CXL_DVSEC_CACHE_CAPABLE	    BIT(0)
+
 #endif /* __LINUX_VFIO_CXL_PRIV_H */
-- 
2.25.1