[PATCH 13/20] vfio/cxl: Introduce HDM decoder register emulation framework

mhonap@nvidia.com posted 20 patches 3 weeks, 5 days ago
There is a newer version of this series
[PATCH 13/20] vfio/cxl: Introduce HDM decoder register emulation framework
Posted by mhonap@nvidia.com 3 weeks, 5 days ago
From: Manish Honap <mhonap@nvidia.com>

Introduce an emulation framework to handle CXL MMIO register emulation
for CXL devices passed through to a VM.

A single compact __le32 array (comp_reg_virt) covers only the HDM
decoder register block (hdm_reg_size bytes, typically 256-512 bytes).

A new VFIO device region VFIO_REGION_SUBTYPE_CXL_COMP_REGS exposes
this array to userspace (QEMU) as a read-write region:
  - Reads return the emulated state (comp_reg_virt[])
  - Writes go through the HDM register write handlers and are
    forwarded to hardware where appropriate

QEMU attaches a notify_change callback to this region. When the
COMMIT bit is written in a decoder CTRL register the callback
reads the BASE_LO/HI from the same region fd (emulated state) and
maps the DPA MemoryRegion at the correct GPA in system_memory.

Co-developed-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Manish Honap <mhonap@nvidia.com>
---
 drivers/vfio/pci/Makefile            |   2 +-
 drivers/vfio/pci/cxl/vfio_cxl_core.c |  36 ++-
 drivers/vfio/pci/cxl/vfio_cxl_emu.c  | 366 +++++++++++++++++++++++++++
 drivers/vfio/pci/cxl/vfio_cxl_priv.h |  41 +++
 drivers/vfio/pci/vfio_pci_priv.h     |   7 +
 5 files changed, 450 insertions(+), 2 deletions(-)
 create mode 100644 drivers/vfio/pci/cxl/vfio_cxl_emu.c

diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index ecb0eacbc089..bef916495eae 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
 vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
-vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o
+vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o cxl/vfio_cxl_emu.o
 vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
 vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
 obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c
index 03846bd11c8a..d2401871489d 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_core.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c
@@ -45,6 +45,7 @@ static int vfio_cxl_create_device_state(struct vfio_pci_core_device *vdev,
 	cxl = vdev->cxl;
 	cxl->dvsec = dvsec;
 	cxl->dpa_region_idx = -1;
+	cxl->comp_reg_region_idx = -1;
 
 	pci_read_config_word(pdev, dvsec + CXL_DVSEC_CAPABILITY_OFFSET,
 			     &cap_word);
@@ -124,6 +125,10 @@ static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev)
 	cxl->comp_reg_offset = bar_offset;
 	cxl->comp_reg_size = CXL_COMPONENT_REG_BLOCK_SIZE;
 
+	ret = vfio_cxl_setup_virt_regs(vdev);
+	if (ret)
+		return ret;
+
 	return 0;
 }
 
@@ -281,12 +286,14 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev)
 
 	ret = vfio_cxl_create_region_helper(vdev, SZ_256M);
 	if (ret)
-		goto failed;
+		goto regs_failed;
 
 	cxl->precommitted = true;
 
 	return;
 
+regs_failed:
+	vfio_cxl_clean_virt_regs(vdev);
 failed:
 	devm_kfree(&pdev->dev, vdev->cxl);
 	vdev->cxl = NULL;
@@ -299,6 +306,7 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev)
 	if (!cxl || !cxl->region)
 		return;
 
+	vfio_cxl_clean_virt_regs(vdev);
 	vfio_cxl_destroy_cxl_region(vdev);
 }
 
@@ -409,6 +417,32 @@ void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev)
 
 	if (!cxl)
 		return;
+
+	/*
+	 * Re-initialise the emulated HDM comp_reg_virt[] from hardware.
+	 * After FLR the decoder registers read as zero; mirror that in
+	 * the emulated state so QEMU sees a clean slate.
+	 */
+	vfio_cxl_reinit_comp_regs(vdev);
+
+	/*
+	 * Only re-enable the DPA mmap if the hardware has actually
+	 * re-committed decoder 0 after FLR.  Read the COMMITTED bit from the
+	 * freshly-re-snapshotted comp_reg_virt[] so we check the post-FLR
+	 * hardware state, not stale pre-reset state.
+	 *
+	 * If COMMITTED is 0 (slow firmware re-commit path), leave
+	 * region_active=false.  Guest faults will return VM_FAULT_SIGBUS
+	 * until the decoder is re-committed and the region is re-enabled.
+	 */
+	if (cxl->precommitted && cxl->comp_reg_virt) {
+		u32 ctrl = le32_to_cpu(cxl->comp_reg_virt[
+				       CXL_HDM_DECODER0_CTRL_OFFSET(0) /
+				       CXL_REG_SIZE_DWORD]);
+
+		if (ctrl & CXL_HDM_DECODER_CTRL_COMMITTED_BIT)
+			WRITE_ONCE(cxl->region_active, true);
+	}
 }
 
 static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c
new file mode 100644
index 000000000000..d5603c80fe51
--- /dev/null
+++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/bitops.h>
+#include <linux/vfio_pci_core.h>
+
+#include "../vfio_pci_priv.h"
+#include "vfio_cxl_priv.h"
+
+/*
+ * comp_reg_virt[] layout:
+ *   Index 0..N correspond to 32-bit registers at byte offset 0..hdm_reg_size-4
+ *   within the HDM decoder capability block.
+ *
+ * Register layout within the HDM block (CXL spec 8.2.5.19):
+ *   0x00: HDM Decoder Capability
+ *   0x04: HDM Decoder Global Control
+ *   0x08: HDM Decoder Global Status
+ *   0x0c: (reserved)
+ *   For each decoder N (N=0..hdm_count-1), at base 0x10 + N*0x20:
+ *     +0x00: BASE_LO
+ *     +0x04: BASE_HI
+ *     +0x08: SIZE_LO
+ *     +0x0c: SIZE_HI
+ *     +0x10: CTRL
+ *     +0x14: TARGET_LIST_LO
+ *     +0x18: TARGET_LIST_HI
+ *     +0x1c: (reserved)
+ */
+
+static inline __le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 off)
+{
+	/*
+	 * off is byte offset within the HDM block; comp_reg_virt is indexed
+	 * as an array of __le32.
+	 */
+	return &cxl->comp_reg_virt[off / sizeof(__le32)];
+}
+
+static ssize_t virt_hdm_rev_reg_write(struct vfio_pci_core_device *vdev,
+				      const __le32 *val32, u64 offset, u64 size)
+{
+	/* Discard writes on reserved registers. */
+	return size;
+}
+
+static ssize_t hdm_decoder_n_lo_write(struct vfio_pci_core_device *vdev,
+				      const __le32 *val32, u64 offset, u64 size)
+{
+	u32 new_val = le32_to_cpu(*val32);
+
+	if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD))
+		return -EINVAL;
+
+	/* Bit [27:0] are reserved. */
+	new_val &= ~CXL_HDM_DECODER_BASE_LO_RESERVED_MASK;
+
+	*hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val);
+
+	return size;
+}
+
+static ssize_t hdm_decoder_global_ctrl_write(struct vfio_pci_core_device *vdev,
+					     const __le32 *val32, u64 offset, u64 size)
+{
+	u32 hdm_decoder_global_cap;
+	u32 new_val = le32_to_cpu(*val32);
+
+	if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD))
+		return -EINVAL;
+
+	/* Bit [31:2] are reserved. */
+	new_val &= ~CXL_HDM_DECODER_GLOBAL_CTRL_RESERVED_MASK;
+
+	/* Poison On Decode Error Enable bit is 0 and RO if not supported. */
+	hdm_decoder_global_cap = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, 0));
+	if (!(hdm_decoder_global_cap & CXL_HDM_CAP_POISON_ON_DECODE_ERR_BIT))
+		new_val &= ~CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT;
+
+	*hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val);
+
+	return size;
+}
+
+/*
+ * hdm_decoder_n_ctrl_write - Write handler for HDM decoder CTRL register.
+ *
+ * The COMMIT bit (bit 9) is the key: setting it requests the hardware to
+ * lock the decoder.  The emulated COMMITTED bit (bit 10) mirrors COMMIT
+ * immediately to allow QEMU's notify_change to detect the transition and
+ * map/unmap the DPA MemoryRegion in the guest address space.
+ *
+ * Note: the actual hardware HDM decoder programming (writing the real
+ * BASE/SIZE with host physical addresses) happens in the QEMU notify_change
+ * callback BEFORE this write reaches the hardware.  This ordering is
+ * correct because vfio_region_write() calls notify_change() first.
+ */
+static ssize_t hdm_decoder_n_ctrl_write(struct vfio_pci_core_device *vdev,
+					const __le32 *val32, u64 offset, u64 size)
+{
+	u32 hdm_decoder_global_cap;
+	u32 ro_mask = CXL_HDM_DECODER_CTRL_RO_BITS_MASK;
+	u32 rev_mask = CXL_HDM_DECODER_CTRL_RESERVED_MASK;
+	u32 new_val = le32_to_cpu(*val32);
+	u32 cur_val;
+
+	if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD))
+		return -EINVAL;
+
+	cur_val = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, offset));
+	if (cur_val & CXL_HDM_DECODER_CTRL_COMMIT_LOCK_BIT)
+		return size;
+
+	hdm_decoder_global_cap = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, 0));
+	ro_mask |= CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO;
+	rev_mask |= CXL_HDM_DECODER_CTRL_DEVICE_RESERVED;
+	if (!(hdm_decoder_global_cap & CXL_HDM_CAP_UIO_SUPPORTED_BIT))
+		rev_mask |= CXL_HDM_DECODER_CTRL_UIO_RESERVED;
+
+	new_val &= ~rev_mask;
+	cur_val &= ro_mask;
+	new_val = (new_val & ~ro_mask) | cur_val;
+
+	/*
+	 * Mirror COMMIT → COMMITTED immediately in the emulated state.
+	 * QEMU's notify_change (called before this write reaches hardware)
+	 * reads COMMITTED from the region fd to detect commit transitions.
+	 */
+	if (new_val & CXL_HDM_DECODER_CTRL_COMMIT_BIT)
+		new_val |= CXL_HDM_DECODER_CTRL_COMMITTED_BIT;
+	else
+		new_val &= ~CXL_HDM_DECODER_CTRL_COMMITTED_BIT;
+
+	*hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val);
+
+	return size;
+}
+
+/*
+ * Dispatch table for COMP_REGS region writes.	Indexed by byte offset within
+ * the HDM decoder block.  Returns the appropriate write handler.
+ *
+ * Layout:
+ *   0x00	  HDM Decoder Capability  (RO)
+ *   0x04	  HDM Global Control	  (RW with reserved masking)
+ *   0x08	  HDM Global Status	  (RO)
+ *   0x0c	  (reserved)		  (ignored)
+ *   Per decoder N, base = 0x10 + N*0x20:
+ *     base+0x00  BASE_LO  (RW, [27:0] reserved)
+ *     base+0x04  BASE_HI  (RW)
+ *     base+0x08  SIZE_LO  (RW, [27:0] reserved)
+ *     base+0x0c  SIZE_HI  (RW)
+ *     base+0x10  CTRL	   (RW, complex rules)
+ *     base+0x14  TARGET_LIST_LO  (ignored for Type-2)
+ *     base+0x18  TARGET_LIST_HI  (ignored for Type-2)
+ *     base+0x1c  (reserved)	 (ignored)
+ */
+static ssize_t comp_regs_dispatch_write(struct vfio_pci_core_device *vdev,
+					u32 off, const __le32 *val32, u32 size)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	u32 dec_base, dec_off;
+
+	/* HDM Decoder Capability (0x00): RO */
+	if (off == 0x00)
+		return size;
+
+	/* HDM Global Control (0x04) */
+	if (off == CXL_HDM_DECODER_GLOBAL_CTRL_OFFSET)
+		return hdm_decoder_global_ctrl_write(vdev, val32, off, size);
+
+	/* HDM Global Status (0x08): RO */
+	if (off == 0x08)
+		return size;
+
+	/* Per-decoder registers start at 0x10, stride 0x20 */
+	if (off < CXL_HDM_DECODER_FIRST_BLOCK_OFFSET)
+		return size; /* reserved gap */
+
+	dec_base = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET;
+	dec_off	 = (off - dec_base) % CXL_HDM_DECODER_BLOCK_STRIDE;
+
+	switch (dec_off) {
+	case CXL_HDM_DECODER_N_BASE_LOW_OFFSET:	 /* BASE_LO */
+	case CXL_HDM_DECODER_N_SIZE_LOW_OFFSET:	 /* SIZE_LO */
+		return hdm_decoder_n_lo_write(vdev, val32, off, size);
+	case CXL_HDM_DECODER_N_BASE_HIGH_OFFSET: /* BASE_HI */
+	case CXL_HDM_DECODER_N_SIZE_HIGH_OFFSET: /* SIZE_HI */
+		/* Full 32-bit write, no reserved bits */
+		*hdm_reg_ptr(cxl, off) = *val32;
+		return size;
+	case CXL_HDM_DECODER_N_CTRL_OFFSET:	  /* CTRL */
+		return hdm_decoder_n_ctrl_write(vdev, val32, off, size);
+	case CXL_HDM_DECODER_N_TARGET_LIST_LOW_OFFSET:
+	case CXL_HDM_DECODER_N_TARGET_LIST_HIGH_OFFSET:
+	case CXL_HDM_DECODER_N_REV_OFFSET:
+		return virt_hdm_rev_reg_write(vdev, val32, off, size);
+	default:
+		return size;
+	}
+}
+
+/*
+ * vfio_cxl_comp_regs_rw - regops rw handler for VFIO_REGION_SUBTYPE_CXL_COMP_REGS.
+ *
+ * Reads return the emulated HDM state (comp_reg_virt[]).
+ * Writes go through comp_regs_dispatch_write() for bit-field enforcement.
+ * Only 4-byte aligned 4-byte accesses are supported (hardware requirement).
+ */
+static ssize_t vfio_cxl_comp_regs_rw(struct vfio_pci_core_device *vdev,
+				     char __user *buf, size_t count,
+				     loff_t *ppos, bool iswrite)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+	size_t done = 0;
+
+	if (!count)
+		return 0;
+
+	/* Clamp to region size */
+	if (pos >= cxl->hdm_reg_size)
+		return -EINVAL;
+	count = min(count, (size_t)(cxl->hdm_reg_size - pos));
+
+	while (done < count) {
+		u32 sz	 = min_t(u32, CXL_REG_SIZE_DWORD, count - done);
+		u32 off	 = pos + done;
+		__le32 v;
+
+		/* Enforce 4-byte alignment */
+		if (sz < CXL_REG_SIZE_DWORD || (off & 0x3))
+			return done ? (ssize_t)done : -EINVAL;
+
+		if (iswrite) {
+			if (copy_from_user(&v, buf + done, sizeof(v)))
+				return done ? (ssize_t)done : -EFAULT;
+			comp_regs_dispatch_write(vdev, off, &v, sizeof(v));
+		} else {
+			v = *hdm_reg_ptr(cxl, off);
+			if (copy_to_user(buf + done, &v, sizeof(v)))
+				return done ? (ssize_t)done : -EFAULT;
+		}
+		done += sizeof(v);
+	}
+
+	*ppos += done;
+	return done;
+}
+
+static void vfio_cxl_comp_regs_release(struct vfio_pci_core_device *vdev,
+				       struct vfio_pci_region *region)
+{
+	/* comp_reg_virt is freed in vfio_cxl_clean_virt_regs(), not here. */
+}
+
+static const struct vfio_pci_regops vfio_cxl_comp_regs_ops = {
+	.rw	 = vfio_cxl_comp_regs_rw,
+	.release = vfio_cxl_comp_regs_release,
+};
+
+/*
+ * vfio_cxl_setup_virt_regs - Allocate emulated HDM register state.
+ *
+ * Allocates comp_reg_virt as a compact __le32 array covering only
+ * hdm_reg_size bytes of HDM decoder registers. The initial values
+ * are read from hardware via the BAR ioremap established by the caller.
+ *
+ * DVSEC state is accessed via vdev->vconfig (see the following patch).
+ */
+int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	size_t nregs;
+
+	if (WARN_ON(!cxl->hdm_reg_size))
+		return -EINVAL;
+
+	if (pci_resource_len(vdev->pdev, cxl->comp_reg_bar) <
+	    cxl->comp_reg_offset + cxl->hdm_reg_offset + cxl->hdm_reg_size)
+		return -ENODEV;
+
+	nregs = cxl->hdm_reg_size / sizeof(__le32);
+	cxl->comp_reg_virt = kcalloc(nregs, sizeof(__le32), GFP_KERNEL);
+	if (!cxl->comp_reg_virt)
+		return -ENOMEM;
+
+	/* Establish persistent mapping; kept alive until vfio_cxl_clean_virt_regs(). */
+	cxl->hdm_iobase = ioremap(pci_resource_start(vdev->pdev, cxl->comp_reg_bar) +
+				  cxl->comp_reg_offset + cxl->hdm_reg_offset,
+				  cxl->hdm_reg_size);
+	if (!cxl->hdm_iobase) {
+		kfree(cxl->comp_reg_virt);
+		cxl->comp_reg_virt = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Called with memory_lock write side held (from vfio_cxl_reactivate_region).
+ * Uses the pre-established hdm_iobase, no ioremap() under the lock,
+ * which would deadlock on PREEMPT_RT where ioremap() can sleep.
+ */
+void vfio_cxl_reinit_comp_regs(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	size_t i, nregs;
+
+	if (!cxl || !cxl->comp_reg_virt || !cxl->hdm_iobase)
+		return;
+
+	nregs = cxl->hdm_reg_size / sizeof(__le32);
+
+	for (i = 0; i < nregs; i++)
+		cxl->comp_reg_virt[i] =
+			cpu_to_le32(readl(cxl->hdm_iobase + i * sizeof(__le32)));
+}
+
+void vfio_cxl_clean_virt_regs(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+	if (cxl->hdm_iobase) {
+		iounmap(cxl->hdm_iobase);
+		cxl->hdm_iobase = NULL;
+	}
+	kfree(cxl->comp_reg_virt);
+	cxl->comp_reg_virt = NULL;
+}
+
+/*
+ * vfio_cxl_register_comp_regs_region - Register the COMP_REGS device region.
+ *
+ * Exposes the emulated HDM decoder register state as a VFIO device region
+ * with type VFIO_REGION_SUBTYPE_CXL_COMP_REGS.	 QEMU attaches a
+ * notify_change callback to this region to intercept HDM COMMIT writes
+ * and map the DPA MemoryRegion at the appropriate GPA.
+ *
+ * The region is read+write only (no mmap) to ensure all accesses pass
+ * through comp_regs_dispatch_write() for proper bit-field enforcement.
+ */
+int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	u32 flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE;
+	int ret;
+
+	if (!cxl || !cxl->comp_reg_virt)
+		return -ENODEV;
+
+	ret = vfio_pci_core_register_dev_region(vdev,
+						PCI_VENDOR_ID_CXL |
+						VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+						VFIO_REGION_SUBTYPE_CXL_COMP_REGS,
+						&vfio_cxl_comp_regs_ops,
+						cxl->hdm_reg_size, flags, cxl);
+	if (!ret)
+		cxl->comp_reg_region_idx = vdev->num_regions - 1;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_cxl_register_comp_regs_region);
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
index b870926bfb19..4f2637874e9d 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h
+++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
@@ -25,14 +25,51 @@ struct vfio_pci_cxl_state {
 	size_t                       hdm_reg_size;
 	resource_size_t              comp_reg_offset;
 	size_t                       comp_reg_size;
+	__le32                      *comp_reg_virt;
+	void __iomem                *hdm_iobase;
 	u32                          hdm_count;
 	int                          dpa_region_idx;
+	int                          comp_reg_region_idx;
 	u16                          dvsec;
 	u8                           comp_reg_bar;
 	bool                         precommitted;
 	bool                         region_active;
 };
 
+/* Register access sizes */
+#define CXL_REG_SIZE_WORD  2
+#define CXL_REG_SIZE_DWORD 4
+
+/* HDM Decoder - register offsets (CXL 2.0 8.2.5.19) */
+#define CXL_HDM_DECODER_GLOBAL_CTRL_OFFSET	  0x4
+#define CXL_HDM_DECODER_FIRST_BLOCK_OFFSET	  0x10
+#define CXL_HDM_DECODER_BLOCK_STRIDE		  0x20
+#define CXL_HDM_DECODER_N_BASE_LOW_OFFSET	  0x0
+#define CXL_HDM_DECODER_N_BASE_HIGH_OFFSET	  0x4
+#define CXL_HDM_DECODER_N_SIZE_LOW_OFFSET	  0x8
+#define CXL_HDM_DECODER_N_SIZE_HIGH_OFFSET	  0xc
+#define CXL_HDM_DECODER_N_CTRL_OFFSET		  0x10
+#define CXL_HDM_DECODER_N_TARGET_LIST_LOW_OFFSET  0x14
+#define CXL_HDM_DECODER_N_TARGET_LIST_HIGH_OFFSET 0x18
+#define CXL_HDM_DECODER_N_REV_OFFSET		  0x1c
+
+/* HDM Decoder Global Capability / Control - bit definitions */
+#define CXL_HDM_CAP_POISON_ON_DECODE_ERR_BIT BIT(10)
+#define CXL_HDM_CAP_UIO_SUPPORTED_BIT	     BIT(13)
+
+/* HDM Decoder N Control */
+#define CXL_HDM_DECODER_CTRL_COMMIT_LOCK_BIT	  BIT(8)
+#define CXL_HDM_DECODER_CTRL_COMMIT_BIT		  BIT(9)
+#define CXL_HDM_DECODER_CTRL_COMMITTED_BIT	  BIT(10)
+#define CXL_HDM_DECODER_CTRL_RO_BITS_MASK	  (BIT(10) | BIT(11))
+#define CXL_HDM_DECODER_CTRL_RESERVED_MASK	  (BIT(15) | GENMASK(31, 28))
+#define CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO	  BIT(12)
+#define CXL_HDM_DECODER_CTRL_DEVICE_RESERVED	  (GENMASK(19, 16) | GENMASK(23, 20))
+#define CXL_HDM_DECODER_CTRL_UIO_RESERVED	  (BIT(14) | GENMASK(27, 24))
+#define CXL_HDM_DECODER_BASE_LO_RESERVED_MASK	  GENMASK(27, 0)
+#define CXL_HDM_DECODER_GLOBAL_CTRL_RESERVED_MASK GENMASK(31, 2)
+#define CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT BIT(0)
+
 /*
  * CXL DVSEC for CXL Devices - register offsets within the DVSEC
  * (CXL 2.0+ 8.1.3).
@@ -41,4 +78,8 @@ struct vfio_pci_cxl_state {
 #define CXL_DVSEC_CAPABILITY_OFFSET 0xa
 #define CXL_DVSEC_MEM_CAPABLE	    BIT(2)
 
+int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev);
+void vfio_cxl_clean_virt_regs(struct vfio_pci_core_device *vdev);
+void vfio_cxl_reinit_comp_regs(struct vfio_pci_core_device *vdev);
+
 #endif /* __LINUX_VFIO_CXL_PRIV_H */
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 8f440f9eaa0c..f8db9a05c033 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -152,6 +152,8 @@ int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev);
 void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev);
 void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev);
 void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev);
+int  vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev);
+void vfio_cxl_reinit_comp_regs(struct vfio_pci_core_device *vdev);
 
 #else
 
@@ -173,6 +175,11 @@ static inline void
 vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { }
 static inline void
 vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { }
+static inline int
+vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev)
+{ return 0; }
+static inline void
+vfio_cxl_reinit_comp_regs(struct vfio_pci_core_device *vdev) { }
 
 #endif /* CONFIG_VFIO_CXL_CORE */
 
-- 
2.25.1

Re: [PATCH 13/20] vfio/cxl: Introduce HDM decoder register emulation framework
Posted by Dave Jiang 3 weeks, 4 days ago

On 3/11/26 1:34 PM, mhonap@nvidia.com wrote:
> From: Manish Honap <mhonap@nvidia.com>
> 
> Introduce an emulation framework to handle CXL MMIO register emulation
> for CXL devices passed through to a VM.
> 
> A single compact __le32 array (comp_reg_virt) covers only the HDM
> decoder register block (hdm_reg_size bytes, typically 256-512 bytes).
> 
> A new VFIO device region VFIO_REGION_SUBTYPE_CXL_COMP_REGS exposes
> this array to userspace (QEMU) as a read-write region:
>   - Reads return the emulated state (comp_reg_virt[])
>   - Writes go through the HDM register write handlers and are
>     forwarded to hardware where appropriate
> 
> QEMU attaches a notify_change callback to this region. When the
> COMMIT bit is written in a decoder CTRL register the callback
> reads the BASE_LO/HI from the same region fd (emulated state) and
> maps the DPA MemoryRegion at the correct GPA in system_memory.
> 
> Co-developed-by: Zhi Wang <zhiw@nvidia.com>
> Signed-off-by: Zhi Wang <zhiw@nvidia.com>
> Signed-off-by: Manish Honap <mhonap@nvidia.com>
> ---
>  drivers/vfio/pci/Makefile            |   2 +-
>  drivers/vfio/pci/cxl/vfio_cxl_core.c |  36 ++-
>  drivers/vfio/pci/cxl/vfio_cxl_emu.c  | 366 +++++++++++++++++++++++++++
>  drivers/vfio/pci/cxl/vfio_cxl_priv.h |  41 +++
>  drivers/vfio/pci/vfio_pci_priv.h     |   7 +
>  5 files changed, 450 insertions(+), 2 deletions(-)
>  create mode 100644 drivers/vfio/pci/cxl/vfio_cxl_emu.c
> 
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index ecb0eacbc089..bef916495eae 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -1,7 +1,7 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>  
>  vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
> -vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o
> +vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o cxl/vfio_cxl_emu.o
>  vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
>  vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
>  obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
> diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c
> index 03846bd11c8a..d2401871489d 100644
> --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c
> +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c
> @@ -45,6 +45,7 @@ static int vfio_cxl_create_device_state(struct vfio_pci_core_device *vdev,
>  	cxl = vdev->cxl;
>  	cxl->dvsec = dvsec;
>  	cxl->dpa_region_idx = -1;
> +	cxl->comp_reg_region_idx = -1;
>  
>  	pci_read_config_word(pdev, dvsec + CXL_DVSEC_CAPABILITY_OFFSET,
>  			     &cap_word);
> @@ -124,6 +125,10 @@ static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev)
>  	cxl->comp_reg_offset = bar_offset;
>  	cxl->comp_reg_size = CXL_COMPONENT_REG_BLOCK_SIZE;
>  
> +	ret = vfio_cxl_setup_virt_regs(vdev);
> +	if (ret)
> +		return ret;
> +
>  	return 0;
>  }
>  
> @@ -281,12 +286,14 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev)
>  
>  	ret = vfio_cxl_create_region_helper(vdev, SZ_256M);
>  	if (ret)
> -		goto failed;
> +		goto regs_failed;
>  
>  	cxl->precommitted = true;
>  
>  	return;
>  
> +regs_failed:
> +	vfio_cxl_clean_virt_regs(vdev);
>  failed:
>  	devm_kfree(&pdev->dev, vdev->cxl);
>  	vdev->cxl = NULL;
> @@ -299,6 +306,7 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev)
>  	if (!cxl || !cxl->region)
>  		return;
>  
> +	vfio_cxl_clean_virt_regs(vdev);
>  	vfio_cxl_destroy_cxl_region(vdev);
>  }
>  
> @@ -409,6 +417,32 @@ void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev)
>  
>  	if (!cxl)
>  		return;
> +
> +	/*
> +	 * Re-initialise the emulated HDM comp_reg_virt[] from hardware.
> +	 * After FLR the decoder registers read as zero; mirror that in
> +	 * the emulated state so QEMU sees a clean slate.
> +	 */
> +	vfio_cxl_reinit_comp_regs(vdev);
> +
> +	/*
> +	 * Only re-enable the DPA mmap if the hardware has actually
> +	 * re-committed decoder 0 after FLR.  Read the COMMITTED bit from the
> +	 * freshly-re-snapshotted comp_reg_virt[] so we check the post-FLR
> +	 * hardware state, not stale pre-reset state.
> +	 *
> +	 * If COMMITTED is 0 (slow firmware re-commit path), leave
> +	 * region_active=false.  Guest faults will return VM_FAULT_SIGBUS
> +	 * until the decoder is re-committed and the region is re-enabled.
> +	 */
> +	if (cxl->precommitted && cxl->comp_reg_virt) {
> +		u32 ctrl = le32_to_cpu(cxl->comp_reg_virt[
> +				       CXL_HDM_DECODER0_CTRL_OFFSET(0) /
> +				       CXL_REG_SIZE_DWORD]);
> +
> +		if (ctrl & CXL_HDM_DECODER_CTRL_COMMITTED_BIT)
> +			WRITE_ONCE(cxl->region_active, true);
> +	}
>  }
>  
>  static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
> diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c
> new file mode 100644
> index 000000000000..d5603c80fe51
> --- /dev/null
> +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c
> @@ -0,0 +1,366 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved
> + */
> +
> +#include <linux/bitops.h>
> +#include <linux/vfio_pci_core.h>
> +
> +#include "../vfio_pci_priv.h"
> +#include "vfio_cxl_priv.h"
> +
> +/*
> + * comp_reg_virt[] layout:
> + *   Index 0..N correspond to 32-bit registers at byte offset 0..hdm_reg_size-4
> + *   within the HDM decoder capability block.
> + *
> + * Register layout within the HDM block (CXL spec 8.2.5.19):
> + *   0x00: HDM Decoder Capability
> + *   0x04: HDM Decoder Global Control
> + *   0x08: HDM Decoder Global Status
> + *   0x0c: (reserved)
> + *   For each decoder N (N=0..hdm_count-1), at base 0x10 + N*0x20:
> + *     +0x00: BASE_LO
> + *     +0x04: BASE_HI
> + *     +0x08: SIZE_LO
> + *     +0x0c: SIZE_HI
> + *     +0x10: CTRL
> + *     +0x14: TARGET_LIST_LO
> + *     +0x18: TARGET_LIST_HI
> + *     +0x1c: (reserved)
> + */
> +
> +static inline __le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 off)
> +{
> +	/*
> +	 * off is byte offset within the HDM block; comp_reg_virt is indexed
> +	 * as an array of __le32.
> +	 */
> +	return &cxl->comp_reg_virt[off / sizeof(__le32)];
> +}
> +
> +static ssize_t virt_hdm_rev_reg_write(struct vfio_pci_core_device *vdev,
> +				      const __le32 *val32, u64 offset, u64 size)
> +{
> +	/* Discard writes on reserved registers. */
> +	return size;
> +}
> +
> +static ssize_t hdm_decoder_n_lo_write(struct vfio_pci_core_device *vdev,
> +				      const __le32 *val32, u64 offset, u64 size)
> +{
> +	u32 new_val = le32_to_cpu(*val32);
> +
> +	if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD))
> +		return -EINVAL;
> +
> +	/* Bit [27:0] are reserved. */
> +	new_val &= ~CXL_HDM_DECODER_BASE_LO_RESERVED_MASK;
> +
> +	*hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val);
> +
> +	return size;
> +}
> +
> +static ssize_t hdm_decoder_global_ctrl_write(struct vfio_pci_core_device *vdev,
> +					     const __le32 *val32, u64 offset, u64 size)
Why offset? If the dispatch function already checked and confirmed this is the offset for the global ctrl register then there's no need to pass in the offset.

> +{
> +	u32 hdm_decoder_global_cap;
> +	u32 new_val = le32_to_cpu(*val32);
> +
> +	if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD))
> +		return -EINVAL;
> +
> +	/* Bit [31:2] are reserved. */
> +	new_val &= ~CXL_HDM_DECODER_GLOBAL_CTRL_RESERVED_MASK;
> +
> +	/* Poison On Decode Error Enable bit is 0 and RO if not supported. */
> +	hdm_decoder_global_cap = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, 0));
> +	if (!(hdm_decoder_global_cap & CXL_HDM_CAP_POISON_ON_DECODE_ERR_BIT))
> +		new_val &= ~CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT;
> +
> +	*hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val);
> +
> +	return size;
> +}
> +
> +/*
> + * hdm_decoder_n_ctrl_write - Write handler for HDM decoder CTRL register.

If we are going to start with kdoc style comment, may as well finish the kdoc block and provide parameters and return values

> + *
> + * The COMMIT bit (bit 9) is the key: setting it requests the hardware to
> + * lock the decoder.  The emulated COMMITTED bit (bit 10) mirrors COMMIT
> + * immediately to allow QEMU's notify_change to detect the transition and
> + * map/unmap the DPA MemoryRegion in the guest address space.
> + *
> + * Note: the actual hardware HDM decoder programming (writing the real
> + * BASE/SIZE with host physical addresses) happens in the QEMU notify_change
> + * callback BEFORE this write reaches the hardware.  This ordering is
> + * correct because vfio_region_write() calls notify_change() first.
> + */
> +static ssize_t hdm_decoder_n_ctrl_write(struct vfio_pci_core_device *vdev,
> +					const __le32 *val32, u64 offset, u64 size)
> +{
> +	u32 hdm_decoder_global_cap;
> +	u32 ro_mask = CXL_HDM_DECODER_CTRL_RO_BITS_MASK;
> +	u32 rev_mask = CXL_HDM_DECODER_CTRL_RESERVED_MASK;
> +	u32 new_val = le32_to_cpu(*val32);
> +	u32 cur_val;
> +
> +	if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD))
> +		return -EINVAL;
> +
> +	cur_val = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, offset));
> +	if (cur_val & CXL_HDM_DECODER_CTRL_COMMIT_LOCK_BIT)
> +		return size;
> +
> +	hdm_decoder_global_cap = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, 0));
> +	ro_mask |= CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO;
> +	rev_mask |= CXL_HDM_DECODER_CTRL_DEVICE_RESERVED;
> +	if (!(hdm_decoder_global_cap & CXL_HDM_CAP_UIO_SUPPORTED_BIT))
> +		rev_mask |= CXL_HDM_DECODER_CTRL_UIO_RESERVED;
> +
> +	new_val &= ~rev_mask;
> +	cur_val &= ro_mask;
> +	new_val = (new_val & ~ro_mask) | cur_val;
> +
> +	/*
> +	 * Mirror COMMIT → COMMITTED immediately in the emulated state.
> +	 * QEMU's notify_change (called before this write reaches hardware)
> +	 * reads COMMITTED from the region fd to detect commit transitions.
> +	 */
> +	if (new_val & CXL_HDM_DECODER_CTRL_COMMIT_BIT)
> +		new_val |= CXL_HDM_DECODER_CTRL_COMMITTED_BIT;
> +	else
> +		new_val &= ~CXL_HDM_DECODER_CTRL_COMMITTED_BIT;
> +
> +	*hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val);
> +
> +	return size;
> +}
> +
> +/*
> + * Dispatch table for COMP_REGS region writes.	Indexed by byte offset within
> + * the HDM decoder block.  Returns the appropriate write handler.
> + *
> + * Layout:
> + *   0x00	  HDM Decoder Capability  (RO)
> + *   0x04	  HDM Global Control	  (RW with reserved masking)
> + *   0x08	  HDM Global Status	  (RO)
> + *   0x0c	  (reserved)		  (ignored)
> + *   Per decoder N, base = 0x10 + N*0x20:
> + *     base+0x00  BASE_LO  (RW, [27:0] reserved)
> + *     base+0x04  BASE_HI  (RW)
> + *     base+0x08  SIZE_LO  (RW, [27:0] reserved)
> + *     base+0x0c  SIZE_HI  (RW)
> + *     base+0x10  CTRL	   (RW, complex rules)
> + *     base+0x14  TARGET_LIST_LO  (ignored for Type-2)
> + *     base+0x18  TARGET_LIST_HI  (ignored for Type-2)
> + *     base+0x1c  (reserved)	 (ignored)
> + */
> +static ssize_t comp_regs_dispatch_write(struct vfio_pci_core_device *vdev,
> +					u32 off, const __le32 *val32, u32 size)
> +{
> +	struct vfio_pci_cxl_state *cxl = vdev->cxl;
> +	u32 dec_base, dec_off;
> +
> +	/* HDM Decoder Capability (0x00): RO */
> +	if (off == 0x00)

define magic number

> +		return size;
> +
> +	/* HDM Global Control (0x04) */
> +	if (off == CXL_HDM_DECODER_GLOBAL_CTRL_OFFSET)
> +		return hdm_decoder_global_ctrl_write(vdev, val32, off, size);
> +
> +	/* HDM Global Status (0x08): RO */
> +	if (off == 0x08)

Same here: add CXL_HDM_DECODER_GLOBAL_STATUS_OFFSET (0x08) rather than the magic 0x08, so this case reads uniformly with the CXL_HDM_DECODER_GLOBAL_CTRL_OFFSET check above.

> +		return size;
> +
> +	/* Per-decoder registers start at 0x10, stride 0x20 */
> +	if (off < CXL_HDM_DECODER_FIRST_BLOCK_OFFSET)
> +		return size; /* reserved gap */
> +
> +	dec_base = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET;
> +	dec_off	 = (off - dec_base) % CXL_HDM_DECODER_BLOCK_STRIDE;

A bounds check is needed here: compute the decoder index as (off - dec_base) / CXL_HDM_DECODER_BLOCK_STRIDE and treat the write as reserved (return size) when that index is >= cxl->hdm_count. As written, an offset past the last supported decoder but still within hdm_reg_size is dispatched to the per-decoder handlers.

> +
> +	switch (dec_off) {
> +	case CXL_HDM_DECODER_N_BASE_LOW_OFFSET:	 /* BASE_LO */
> +	case CXL_HDM_DECODER_N_SIZE_LOW_OFFSET:	 /* SIZE_LO */
> +		return hdm_decoder_n_lo_write(vdev, val32, off, size);
> +	case CXL_HDM_DECODER_N_BASE_HIGH_OFFSET: /* BASE_HI */
> +	case CXL_HDM_DECODER_N_SIZE_HIGH_OFFSET: /* SIZE_HI */
> +		/* Full 32-bit write, no reserved bits */
> +		*hdm_reg_ptr(cxl, off) = *val32;
> +		return size;
> +	case CXL_HDM_DECODER_N_CTRL_OFFSET:	  /* CTRL */
> +		return hdm_decoder_n_ctrl_write(vdev, val32, off, size);
> +	case CXL_HDM_DECODER_N_TARGET_LIST_LOW_OFFSET:
> +	case CXL_HDM_DECODER_N_TARGET_LIST_HIGH_OFFSET:
> +	case CXL_HDM_DECODER_N_REV_OFFSET:
> +		return virt_hdm_rev_reg_write(vdev, val32, off, size);
> +	default:
> +		return size;
> +	}
> +}
> +
> +/*
> + * vfio_cxl_comp_regs_rw - regops rw handler for VFIO_REGION_SUBTYPE_CXL_COMP_REGS.
> + *
> + * Reads return the emulated HDM state (comp_reg_virt[]).
> + * Writes go through comp_regs_dispatch_write() for bit-field enforcement.
> + * Only 4-byte aligned 4-byte accesses are supported (hardware requirement).
> + */
> +static ssize_t vfio_cxl_comp_regs_rw(struct vfio_pci_core_device *vdev,
> +				     char __user *buf, size_t count,
> +				     loff_t *ppos, bool iswrite)
> +{
> +	struct vfio_pci_cxl_state *cxl = vdev->cxl;
> +	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
> +	size_t done = 0;
> +
> +	if (!count)
> +		return 0;
> +
> +	/* Clamp to region size */
> +	if (pos >= cxl->hdm_reg_size)
> +		return -EINVAL;
> +	count = min(count, (size_t)(cxl->hdm_reg_size - pos));
> +
> +	while (done < count) {
> +		u32 sz	 = min_t(u32, CXL_REG_SIZE_DWORD, count - done);
> +		u32 off	 = pos + done;
> +		__le32 v;
> +
> +		/* Enforce 4-byte alignment */
> +		if (sz < CXL_REG_SIZE_DWORD || (off & 0x3))
> +			return done ? (ssize_t)done : -EINVAL;
> +
> +		if (iswrite) {
> +			if (copy_from_user(&v, buf + done, sizeof(v)))
> +				return done ? (ssize_t)done : -EFAULT;
> +			comp_regs_dispatch_write(vdev, off, &v, sizeof(v));
> +		} else {
> +			v = *hdm_reg_ptr(cxl, off);
> +			if (copy_to_user(buf + done, &v, sizeof(v)))
> +				return done ? (ssize_t)done : -EFAULT;
> +		}
> +		done += sizeof(v);
> +	}
> +
> +	*ppos += done;
> +	return done;
> +}
> +
> +static void vfio_cxl_comp_regs_release(struct vfio_pci_core_device *vdev,
> +				       struct vfio_pci_region *region)
> +{
> +	/* comp_reg_virt is freed in vfio_cxl_clean_virt_regs(), not here. */
> +}
> +
> +static const struct vfio_pci_regops vfio_cxl_comp_regs_ops = {
> +	.rw	 = vfio_cxl_comp_regs_rw,
> +	.release = vfio_cxl_comp_regs_release,
> +};
> +
> +/*
> + * vfio_cxl_setup_virt_regs - Allocate emulated HDM register state.
> + *
> + * Allocates comp_reg_virt as a compact __le32 array covering only
> + * hdm_reg_size bytes of HDM decoder registers. The initial values
> + * are read from hardware via the BAR ioremap established by the caller.
> + *
> + * DVSEC state is accessed via vdev->vconfig (see the following patch).
> + */
> +int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev)
> +{
> +	struct vfio_pci_cxl_state *cxl = vdev->cxl;
> +	size_t nregs;
> +
> +	if (WARN_ON(!cxl->hdm_reg_size))
> +		return -EINVAL;
> +
> +	if (pci_resource_len(vdev->pdev, cxl->comp_reg_bar) <
> +	    cxl->comp_reg_offset + cxl->hdm_reg_offset + cxl->hdm_reg_size)
> +		return -ENODEV;
> +
> +	nregs = cxl->hdm_reg_size / sizeof(__le32);
> +	cxl->comp_reg_virt = kcalloc(nregs, sizeof(__le32), GFP_KERNEL);
> +	if (!cxl->comp_reg_virt)
> +		return -ENOMEM;
> +
> +	/* Establish persistent mapping; kept alive until vfio_cxl_clean_virt_regs(). */
> +	cxl->hdm_iobase = ioremap(pci_resource_start(vdev->pdev, cxl->comp_reg_bar) +
> +				  cxl->comp_reg_offset + cxl->hdm_reg_offset,
> +				  cxl->hdm_reg_size);
> +	if (!cxl->hdm_iobase) {
> +		kfree(cxl->comp_reg_virt);
> +		cxl->comp_reg_virt = NULL;
> +		return -ENOMEM;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Called with memory_lock write side held (from vfio_cxl_reactivate_region).
> + * Uses the pre-established hdm_iobase, no ioremap() under the lock,
> + * which would deadlock on PREEMPT_RT where ioremap() can sleep.
> + */
> +void vfio_cxl_reinit_comp_regs(struct vfio_pci_core_device *vdev)
> +{
> +	struct vfio_pci_cxl_state *cxl = vdev->cxl;
> +	size_t i, nregs;
> +
> +	if (!cxl || !cxl->comp_reg_virt || !cxl->hdm_iobase)
> +		return;
> +
> +	nregs = cxl->hdm_reg_size / sizeof(__le32);
> +
> +	for (i = 0; i < nregs; i++)
> +		cxl->comp_reg_virt[i] =
> +			cpu_to_le32(readl(cxl->hdm_iobase + i * sizeof(__le32)));
> +}
> +
> +void vfio_cxl_clean_virt_regs(struct vfio_pci_core_device *vdev)
> +{
> +	struct vfio_pci_cxl_state *cxl = vdev->cxl;
> +
> +	if (cxl->hdm_iobase) {
> +		iounmap(cxl->hdm_iobase);
> +		cxl->hdm_iobase = NULL;
> +	}
> +	kfree(cxl->comp_reg_virt);
> +	cxl->comp_reg_virt = NULL;
> +}
> +
> +/*
> + * vfio_cxl_register_comp_regs_region - Register the COMP_REGS device region.
> + *
> + * Exposes the emulated HDM decoder register state as a VFIO device region
> + * with type VFIO_REGION_SUBTYPE_CXL_COMP_REGS.	 QEMU attaches a
> + * notify_change callback to this region to intercept HDM COMMIT writes
> + * and map the DPA MemoryRegion at the appropriate GPA.
> + *
> + * The region is read+write only (no mmap) to ensure all accesses pass
> + * through comp_regs_dispatch_write() for proper bit-field enforcement.
> + */
> +int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev)
> +{
> +	struct vfio_pci_cxl_state *cxl = vdev->cxl;
> +	u32 flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE;
> +	int ret;
> +
> +	if (!cxl || !cxl->comp_reg_virt)
> +		return -ENODEV;
> +
> +	ret = vfio_pci_core_register_dev_region(vdev,
> +						PCI_VENDOR_ID_CXL |
> +						VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
> +						VFIO_REGION_SUBTYPE_CXL_COMP_REGS,
> +						&vfio_cxl_comp_regs_ops,
> +						cxl->hdm_reg_size, flags, cxl);
> +	if (!ret)
> +		cxl->comp_reg_region_idx = vdev->num_regions - 1;
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(vfio_cxl_register_comp_regs_region);
> diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
> index b870926bfb19..4f2637874e9d 100644
> --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h
> +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
> @@ -25,14 +25,51 @@ struct vfio_pci_cxl_state {
>  	size_t                       hdm_reg_size;
>  	resource_size_t              comp_reg_offset;
>  	size_t                       comp_reg_size;
> +	__le32                      *comp_reg_virt;
> +	void __iomem                *hdm_iobase;
>  	u32                          hdm_count;
>  	int                          dpa_region_idx;
> +	int                          comp_reg_region_idx;
>  	u16                          dvsec;
>  	u8                           comp_reg_bar;
>  	bool                         precommitted;
>  	bool                         region_active;
>  };
>  
> +/* Register access sizes */
> +#define CXL_REG_SIZE_WORD  2
> +#define CXL_REG_SIZE_DWORD 4
> +
> +/* HDM Decoder - register offsets (CXL 2.0 8.2.5.19) */
> +#define CXL_HDM_DECODER_GLOBAL_CTRL_OFFSET	  0x4
> +#define CXL_HDM_DECODER_FIRST_BLOCK_OFFSET	  0x10
> +#define CXL_HDM_DECODER_BLOCK_STRIDE		  0x20
> +#define CXL_HDM_DECODER_N_BASE_LOW_OFFSET	  0x0
> +#define CXL_HDM_DECODER_N_BASE_HIGH_OFFSET	  0x4
> +#define CXL_HDM_DECODER_N_SIZE_LOW_OFFSET	  0x8
> +#define CXL_HDM_DECODER_N_SIZE_HIGH_OFFSET	  0xc
> +#define CXL_HDM_DECODER_N_CTRL_OFFSET		  0x10
> +#define CXL_HDM_DECODER_N_TARGET_LIST_LOW_OFFSET  0x14
> +#define CXL_HDM_DECODER_N_TARGET_LIST_HIGH_OFFSET 0x18
> +#define CXL_HDM_DECODER_N_REV_OFFSET		  0x1c
> +
> +/* HDM Decoder Global Capability / Control - bit definitions */
> +#define CXL_HDM_CAP_POISON_ON_DECODE_ERR_BIT BIT(10)
> +#define CXL_HDM_CAP_UIO_SUPPORTED_BIT	     BIT(13)
> +
> +/* HDM Decoder N Control */
> +#define CXL_HDM_DECODER_CTRL_COMMIT_LOCK_BIT	  BIT(8)
> +#define CXL_HDM_DECODER_CTRL_COMMIT_BIT		  BIT(9)
> +#define CXL_HDM_DECODER_CTRL_COMMITTED_BIT	  BIT(10)
> +#define CXL_HDM_DECODER_CTRL_RO_BITS_MASK	  (BIT(10) | BIT(11))
> +#define CXL_HDM_DECODER_CTRL_RESERVED_MASK	  (BIT(15) | GENMASK(31, 28))
> +#define CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO	  BIT(12)
> +#define CXL_HDM_DECODER_CTRL_DEVICE_RESERVED	  (GENMASK(19, 16) | GENMASK(23, 20))
> +#define CXL_HDM_DECODER_CTRL_UIO_RESERVED	  (BIT(14) | GENMASK(27, 24))
> +#define CXL_HDM_DECODER_BASE_LO_RESERVED_MASK	  GENMASK(27, 0)
> +#define CXL_HDM_DECODER_GLOBAL_CTRL_RESERVED_MASK GENMASK(31, 2)
> +#define CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT BIT(0)

Several of these HDM decoder offset/bit definitions duplicate what drivers/cxl/ already has. Maybe the register defines should go in include/cxl/regs.h, or the shared definitions could be moved out of drivers/cxl/, so vfio doesn't carry a private copy of the HDM decoder register layout.

DJ

> +
>  /*
>   * CXL DVSEC for CXL Devices - register offsets within the DVSEC
>   * (CXL 2.0+ 8.1.3).
> @@ -41,4 +78,8 @@ struct vfio_pci_cxl_state {
>  #define CXL_DVSEC_CAPABILITY_OFFSET 0xa
>  #define CXL_DVSEC_MEM_CAPABLE	    BIT(2)
>  
> +int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev);
> +void vfio_cxl_clean_virt_regs(struct vfio_pci_core_device *vdev);
> +void vfio_cxl_reinit_comp_regs(struct vfio_pci_core_device *vdev);
> +
>  #endif /* __LINUX_VFIO_CXL_PRIV_H */
> diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
> index 8f440f9eaa0c..f8db9a05c033 100644
> --- a/drivers/vfio/pci/vfio_pci_priv.h
> +++ b/drivers/vfio/pci/vfio_pci_priv.h
> @@ -152,6 +152,8 @@ int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev);
>  void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev);
>  void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev);
>  void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev);
> +int  vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev);
> +void vfio_cxl_reinit_comp_regs(struct vfio_pci_core_device *vdev);
>  
>  #else
>  
> @@ -173,6 +175,11 @@ static inline void
>  vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { }
>  static inline void
>  vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { }
> +static inline int
> +vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev)
> +{ return 0; }
> +static inline void
> +vfio_cxl_reinit_comp_regs(struct vfio_pci_core_device *vdev) { }
>  
>  #endif /* CONFIG_VFIO_CXL_CORE */
>