[PATCH 06/20] vfio/cxl: Add UAPI for CXL Type-2 device passthrough

mhonap@nvidia.com posted 20 patches 3 weeks, 5 days ago
There is a newer version of this series
[PATCH 06/20] vfio/cxl: Add UAPI for CXL Type-2 device passthrough
Posted by mhonap@nvidia.com 3 weeks, 5 days ago
From: Manish Honap <mhonap@nvidia.com>

CXL capabilities include:
- hdm_count: Number of HDM decoders available
- dpa_size: Total device memory (DPA) size in bytes
- flags: COMMITTED, PRECOMMITTED

This UAPI enables VMMs like QEMU to pass through CXL Type-2 devices
(GPUs, accelerators) with coherent memory to VMs.

Also add user-kernel API definitions for CXL Type-2 device passthrough.
Document how VFIO_DEVICE_FLAGS_CXL relates to VFIO_DEVICE_FLAGS_PCI
and VFIO_DEVICE_FLAGS_CAPS, and add field and flag descriptions
for the CXL capability.

Signed-off-by: Manish Honap <mhonap@nvidia.com>
---
 include/uapi/linux/vfio.h | 52 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ac2329f24141..7ec0f96cc2d9 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -215,6 +215,13 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6)	/* vfio-fsl-mc device */
 #define VFIO_DEVICE_FLAGS_CAPS	(1 << 7)	/* Info supports caps */
 #define VFIO_DEVICE_FLAGS_CDX	(1 << 8)	/* vfio-cdx device */
+/*
+ * CXL Type-2 device (memory coherent; e.g. GPU, accelerator). When set,
+ * VFIO_DEVICE_FLAGS_PCI is also set (same device is a PCI device). The
+ * capability chain (VFIO_DEVICE_FLAGS_CAPS) contains VFIO_DEVICE_INFO_CAP_CXL
+ * describing HDM decoders, DPA size, and CXL-specific options.
+ */
+#define VFIO_DEVICE_FLAGS_CXL   (1 << 9)        /* Device supports CXL */
 	__u32	num_regions;	/* Max region index + 1 */
 	__u32	num_irqs;	/* Max IRQ index + 1 */
 	__u32   cap_offset;	/* Offset within info struct of first cap */
@@ -257,6 +264,39 @@ struct vfio_device_info_cap_pci_atomic_comp {
 	__u32 reserved;
 };
 
+/*
+ * VFIO_DEVICE_INFO_CAP_CXL - CXL Type-2 device capability
+ *
+ * Present in the device info capability chain when VFIO_DEVICE_FLAGS_CXL
+ * is set. Describes Host Managed Device Memory (HDM) layout and CXL
+ * memory options so that userspace (e.g. QEMU) can expose the CXL region
+ * and component registers correctly to the guest.
+ */
+#define VFIO_DEVICE_INFO_CAP_CXL		6
+struct vfio_device_info_cap_cxl {
+	struct vfio_info_cap_header header;
+	__u8  hdm_count; /* Number of HDM decoders */
+	__u8  hdm_regs_bar_index; /* PCI BAR containing HDM registers */
+	__u16 pad; /* Explicit padding between the __u8 fields and flags */
+	__u32 flags; /* Bitmask of the VFIO_CXL_CAP_* values below */
+/* Decoder was committed by host firmware/BIOS */
+#define VFIO_CXL_CAP_COMMITTED		(1 << 0)
+/*
+ * Memory was pre-committed (firmware-programmed); VMM need not allocate
+ * from CXL pool
+ */
+#define VFIO_CXL_CAP_PRECOMMITTED	(1 << 1)
+	__u64 hdm_regs_size; /* Size in bytes of HDM register block */
+	__u64 hdm_regs_offset; /* Byte offset of HDM block within the BAR */
+	__u64 dpa_size; /* Device Physical Address (DPA) size in bytes */
+	/*
+	 * Region indices for the two CXL VFIO device regions.
+	 * Avoids forcing userspace to scan all regions by type/subtype.
+	 */
+	__u32  dpa_region_index;       /* VFIO_REGION_SUBTYPE_CXL */
+	__u32  comp_regs_region_index; /* VFIO_REGION_SUBTYPE_CXL_COMP_REGS */
+};
+
 /**
  * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
  *				       struct vfio_region_info)
@@ -370,6 +410,18 @@ struct vfio_region_info_cap_type {
  */
 #define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD	(1)
 
+/* 1e98 vendor PCI sub-types (CXL Consortium) */
+/*
+ * CXL memory region. Use with region type
+ * (PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE).
+ * DPA memory region (fault+zap mmap)
+ */
+#define VFIO_REGION_SUBTYPE_CXL                 (1)
+/*
+ * HDM decoder register emulation region (read/write only, no mmap).
+ */
+#define VFIO_REGION_SUBTYPE_CXL_COMP_REGS       (2)
+
 /* sub-types for VFIO_REGION_TYPE_GFX */
 #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
 
-- 
2.25.1
Re: [PATCH 06/20] vfio/cxl: Add UAPI for CXL Type-2 device passthrough
Posted by Dave Jiang 3 weeks, 4 days ago

On 3/11/26 1:34 PM, mhonap@nvidia.com wrote:
> From: Manish Honap <mhonap@nvidia.com>
> 
> CXL capabilities include:
> - hdm_count: Number of HDM decoders available
> - capacity: Total device memory (DPA)
> - flags: COMMITTED, PRECOMMITTED
> 
> This UAPI enables VMMs like QEMU to passthrough CXL Type-2 devices
> (GPUs, accelerators) with coherent memory to VMs.
> 
> Also added user-kernel API definitions for CXL Type-2 device passthrough.
> Document how VFIO_DEVICE_FLAGS_CXL relates to VFIO_DEVICE_FLAGS_PCI
> and VFIO_DEVICE_FLAGS_CAPS, and add field and flag descriptions
> for the CXL capability.
> 
> Signed-off-by: Manish Honap <mhonap@nvidia.com>
> ---
>  include/uapi/linux/vfio.h | 52 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 52 insertions(+)
> 
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index ac2329f24141..7ec0f96cc2d9 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -215,6 +215,13 @@ struct vfio_device_info {
>  #define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6)	/* vfio-fsl-mc device */
>  #define VFIO_DEVICE_FLAGS_CAPS	(1 << 7)	/* Info supports caps */
>  #define VFIO_DEVICE_FLAGS_CDX	(1 << 8)	/* vfio-cdx device */
> +/*
> + * CXL Type-2 device (memory coherent; e.g. GPU, accelerator). When set,
> + * VFIO_DEVICE_FLAGS_PCI is also set (same device is a PCI device). The
> + * capability chain (VFIO_DEVICE_FLAGS_CAPS) contains VFIO_DEVICE_INFO_CAP_CXL
> + * describing HDM decoders, DPA size, and CXL-specific options.
> + */
> +#define VFIO_DEVICE_FLAGS_CXL   (1 << 9)        /* Device supports CXL */
>  	__u32	num_regions;	/* Max region index + 1 */
>  	__u32	num_irqs;	/* Max IRQ index + 1 */
>  	__u32   cap_offset;	/* Offset within info struct of first cap */
> @@ -257,6 +264,39 @@ struct vfio_device_info_cap_pci_atomic_comp {
>  	__u32 reserved;
>  };
>  
> +/*
> + * VFIO_DEVICE_INFO_CAP_CXL - CXL Type-2 device capability
> + *
> + * Present in the device info capability chain when VFIO_DEVICE_FLAGS_CXL
> + * is set. Describes Host Managed Device Memory (HDM) layout and CXL
> + * memory options so that userspace (e.g. QEMU) can expose the CXL region
> + * and component registers correctly to the guest.
> + */
> +#define VFIO_DEVICE_INFO_CAP_CXL		6
> +struct vfio_device_info_cap_cxl {
> +	struct vfio_info_cap_header header;
> +	__u8  hdm_count; /* Number of HDM decoders */
> +	__u8  hdm_regs_bar_index; /* PCI BAR containing HDM registers */
> +	__u16 pad;
> +	__u32 flags;
> +/* Decoder was committed by host firmware/BIOS */

I'm confused by COMMITTED vs PRECOMMITTED. Should it just say "Decoder is committed" here? Otherwise, what is the difference? Also, can you briefly explain the usage of COMMITTED vs PRECOMMITTED in the commit log please? i.e., why does VFIO CXL need to know a decoder is pre-committed?

DJ

> +#define VFIO_CXL_CAP_COMMITTED		(1 << 0)
> +/*
> + * Memory was pre-committed (firmware-programmed); VMM need not allocate
> + * from CXL pool
> + */
> +#define VFIO_CXL_CAP_PRECOMMITTED	(1 << 1)
> +	__u64 hdm_regs_size; /* Size in bytes of HDM register block */
> +	__u64 hdm_regs_offset; /* Byte offset within the BAR to the HDM decoder block */
> +	__u64 dpa_size; /* Device Physical Address (DPA) size in bytes */
> +	/*
> +	 * Region indices for the two CXL VFIO device regions.
> +	 * Avoids forcing userspace to scan all regions by type/subtype.
> +	 */
> +	__u32  dpa_region_index;       /* VFIO_REGION_SUBTYPE_CXL */
> +	__u32  comp_regs_region_index; /* VFIO_REGION_SUBTYPE_CXL_COMP_REGS */
> +};
> +
>  /**
>   * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
>   *				       struct vfio_region_info)
> @@ -370,6 +410,18 @@ struct vfio_region_info_cap_type {
>   */
>  #define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD	(1)
>  
> +/* 1e98 vendor PCI sub-types (CXL Consortium) */
> +/*
> + * CXL memory region. Use with region type
> + * (PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE).
> + * DPA memory region (fault+zap mmap)
> + */
> +#define VFIO_REGION_SUBTYPE_CXL                 (1)
> +/*
> + * HDM decoder register emulation region (read/write only, no mmap).
> + */
> +#define VFIO_REGION_SUBTYPE_CXL_COMP_REGS       (2)
> +
>  /* sub-types for VFIO_REGION_TYPE_GFX */
>  #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
>