[PATCH 4/5] PCI: Add cxl DVSEC state save/restore across resets

smadhavan@nvidia.com posted 5 patches 1 month ago
[PATCH 4/5] PCI: Add cxl DVSEC state save/restore across resets
Posted by smadhavan@nvidia.com 1 month ago
From: Srirangan Madhavan <smadhavan@nvidia.com>

Save and restore CXL DVSEC control registers (CTRL, CTRL2), range
base registers, and lock state across PCI resets.

When the DVSEC CONFIG_LOCK bit is set, certain DVSEC fields
become read-only and hardware may have updated them. Blindly
restoring saved values would be silently ignored or conflict
with hardware state. Instead, a read-merge-write approach is
used: current hardware values are read for the RWL
(Read-Write-Lockable) fields and merged with saved state,
so only writable bits are restored while locked bits retain
their hardware values.

Hooked into pci_save_state()/pci_restore_state() so all PCI reset
paths automatically preserve CXL DVSEC configuration.

Signed-off-by: Srirangan Madhavan <smadhavan@nvidia.com>
---
 drivers/pci/Kconfig  |   4 +
 drivers/pci/Makefile |   1 +
 drivers/pci/cxl.c    | 177 +++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.c    |   3 +
 4 files changed, 185 insertions(+)
 create mode 100644 drivers/pci/cxl.c

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index e3f848ffb52a..6b96650b3f31 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -119,6 +119,10 @@ config XEN_PCIDEV_FRONTEND
 	  The PCI device frontend driver allows the kernel to import arbitrary
 	  PCI devices from a PCI backend to support PCI driver domains.

+config PCI_CXL
+	bool
+	default y if CXL_BUS
+
 config PCI_ATS
 	bool

diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 41ebc3b9a518..a6168ecef9c1 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_PCI_TSM)		+= tsm.o
 obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o
 obj-$(CONFIG_PCI_NPEM)		+= npem.o
 obj-$(CONFIG_PCIE_TPH)		+= tph.o
+obj-$(CONFIG_PCI_CXL)		+= cxl.o
 obj-$(CONFIG_CARDBUS)		+= setup-cardbus.o

 # Endpoint library must be initialized before its users
diff --git a/drivers/pci/cxl.c b/drivers/pci/cxl.c
new file mode 100644
index 000000000000..abcf70de9171
--- /dev/null
+++ b/drivers/pci/cxl.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CXL PCI state save/restore support.
+ *
+ * Saves and restores CXL DVSEC registers across PCI resets and link
+ * disable/enable transitions. Hooked into pci_save_state() /
+ * pci_restore_state() via the PCI capability save chain.
+ */
+#include <linux/pci.h>
+#include <cxl/pci.h>
+#include "pci.h"
+
+/*
+ * struct cxl_pci_state - snapshot of the CXL device DVSEC registers.
+ *
+ * Stored in the PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL save buffer, which is
+ * allocated with sizeof(struct cxl_pci_state).
+ *
+ * @dvsec:         config-space offset of the CXL device DVSEC
+ * @dvsec_ctrl:    saved PCI_DVSEC_CXL_CTRL word
+ * @dvsec_ctrl2:   saved PCI_DVSEC_CXL_CTRL2 word
+ * @range_base_hi: saved PCI_DVSEC_CXL_RANGE_BASE_HIGH(i) dwords
+ * @range_base_lo: saved PCI_DVSEC_CXL_RANGE_BASE_LOW(i) dwords
+ * @dvsec_lock:    saved PCI_DVSEC_CXL_LOCK word
+ * @dvsec_valid:   set by cxl_save_dvsec() once a snapshot was captured;
+ *                 cleared by pci_save_cxl_state() before every save
+ *                 attempt and checked before any restore
+ */
+struct cxl_pci_state {
+	u16 dvsec;
+	u16 dvsec_ctrl;
+	u16 dvsec_ctrl2;
+	u32 range_base_hi[CXL_DVSEC_RANGE_MAX];
+	u32 range_base_lo[CXL_DVSEC_RANGE_MAX];
+	u16 dvsec_lock;
+	bool dvsec_valid;
+};
+
+/*
+ * cxl_save_dvsec() - snapshot the CXL device DVSEC registers of @pdev
+ * @pdev:  device to read from
+ * @state: snapshot to fill in
+ *
+ * Reads CTRL, CTRL2, every range base (high and low) and the LOCK
+ * register into @state.  @state->dvsec_valid is set only when the DVSEC
+ * exists and every config read succeeded; on any failure it is left
+ * untouched (pci_save_cxl_state() clears it before calling), so a
+ * partial snapshot is never written back to the device on restore.
+ */
+static void cxl_save_dvsec(struct pci_dev *pdev, struct cxl_pci_state *state)
+{
+	int rc_ctrl, rc_ctrl2;
+	u16 dvsec;
+	int rc;
+	int i;
+
+	dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
+					  PCI_DVSEC_CXL_DEVICE);
+	if (!dvsec)
+		return;
+
+	state->dvsec = dvsec;
+	rc_ctrl = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL,
+				       &state->dvsec_ctrl);
+	rc_ctrl2 = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2,
+					&state->dvsec_ctrl2);
+	if (rc_ctrl || rc_ctrl2) {
+		pci_warn(pdev,
+			 "CXL: DVSEC read failed (ctrl rc=%d, ctrl2 rc=%d)\n",
+			 rc_ctrl, rc_ctrl2);
+		return;
+	}
+
+	for (i = 0; i < CXL_DVSEC_RANGE_MAX; i++) {
+		rc = pci_read_config_dword(pdev,
+			dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i),
+			&state->range_base_hi[i]);
+		if (!rc)
+			rc = pci_read_config_dword(pdev,
+				dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i),
+				&state->range_base_lo[i]);
+		if (rc) {
+			pci_warn(pdev,
+				 "CXL: DVSEC range %d base read failed (rc=%d)\n",
+				 i, rc);
+			return;
+		}
+	}
+
+	/*
+	 * A bad LOCK value is the most harmful one to restore, so never
+	 * keep a snapshot whose LOCK read failed.
+	 */
+	rc = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_LOCK,
+				  &state->dvsec_lock);
+	if (rc) {
+		pci_warn(pdev, "CXL: DVSEC lock read failed (rc=%d)\n", rc);
+		return;
+	}
+
+	state->dvsec_valid = true;
+}
+
+/*
+ * Combine a saved register value with the value currently in hardware:
+ * bits selected by @rwl_mask are taken from @current_hw, every other bit
+ * is taken from @saved.
+ */
+static u32 cxl_merge_rwl(u32 saved, u32 current_hw, u32 rwl_mask)
+{
+	u32 from_hw = current_hw & rwl_mask;
+	u32 from_saved = saved & ~rwl_mask;
+
+	return from_hw | from_saved;
+}
+
+/*
+ * cxl_restore_dvsec() - write a saved DVSEC snapshot back to @pdev
+ * @pdev:  device to restore
+ * @state: snapshot previously filled in by cxl_save_dvsec()
+ *
+ * Does nothing unless @state->dvsec_valid is set.  The current LOCK
+ * register selects the strategy:
+ *  - CONFIG_LOCK clear: CTRL, CTRL2 and the range bases are written
+ *    verbatim and the saved LOCK value is written last.
+ *  - CONFIG_LOCK set: CTRL and the range bases are read back and merged
+ *    with the snapshot via cxl_merge_rwl() so the bits in the *_RWL
+ *    masks keep their hardware value; CTRL2 is written verbatim and
+ *    LOCK is not written.
+ */
+static void cxl_restore_dvsec(struct pci_dev *pdev,
+			      const struct cxl_pci_state *state)
+{
+	u32 cur_hi, cur_lo;
+	u16 cur_ctrl;
+	u16 lock = 0;
+	u16 base;
+	int n;
+
+	if (!state->dvsec_valid)
+		return;
+
+	base = state->dvsec;
+	pci_read_config_word(pdev, base + PCI_DVSEC_CXL_LOCK, &lock);
+
+	if (!(lock & PCI_DVSEC_CXL_LOCK_CONFIG)) {
+		/* Unlocked: the snapshot can be written back as-is. */
+		pci_write_config_word(pdev, base + PCI_DVSEC_CXL_CTRL,
+				      state->dvsec_ctrl);
+		pci_write_config_word(pdev, base + PCI_DVSEC_CXL_CTRL2,
+				      state->dvsec_ctrl2);
+
+		for (n = 0; n < CXL_DVSEC_RANGE_MAX; n++) {
+			pci_write_config_dword(pdev,
+				base + PCI_DVSEC_CXL_RANGE_BASE_HIGH(n),
+				state->range_base_hi[n]);
+			pci_write_config_dword(pdev,
+				base + PCI_DVSEC_CXL_RANGE_BASE_LOW(n),
+				state->range_base_lo[n]);
+		}
+
+		/* LOCK is restored only after all other registers. */
+		pci_write_config_word(pdev, base + PCI_DVSEC_CXL_LOCK,
+				      state->dvsec_lock);
+		return;
+	}
+
+	/* Locked: keep the hardware value of every bit in the RWL masks. */
+	pci_read_config_word(pdev, base + PCI_DVSEC_CXL_CTRL, &cur_ctrl);
+	pci_write_config_word(pdev, base + PCI_DVSEC_CXL_CTRL,
+			      cxl_merge_rwl(state->dvsec_ctrl, cur_ctrl,
+					    PCI_DVSEC_CXL_CTRL_RWL));
+
+	pci_write_config_word(pdev, base + PCI_DVSEC_CXL_CTRL2,
+			      state->dvsec_ctrl2);
+
+	for (n = 0; n < CXL_DVSEC_RANGE_MAX; n++) {
+		pci_read_config_dword(pdev,
+			base + PCI_DVSEC_CXL_RANGE_BASE_HIGH(n), &cur_hi);
+		pci_write_config_dword(pdev,
+			base + PCI_DVSEC_CXL_RANGE_BASE_HIGH(n),
+			cxl_merge_rwl(state->range_base_hi[n], cur_hi,
+				      PCI_DVSEC_CXL_RANGE_BASE_HI_RWL));
+
+		pci_read_config_dword(pdev,
+			base + PCI_DVSEC_CXL_RANGE_BASE_LOW(n), &cur_lo);
+		pci_write_config_dword(pdev,
+			base + PCI_DVSEC_CXL_RANGE_BASE_LOW(n),
+			cxl_merge_rwl(state->range_base_lo[n], cur_lo,
+				      PCI_DVSEC_CXL_RANGE_BASE_LO_RWL));
+	}
+}
+
+/*
+ * pci_allocate_cxl_save_buffer() - reserve save space for CXL DVSEC state
+ * @dev: device being set up
+ *
+ * For devices reported as CXL by pcie_is_cxl(), adds a save buffer keyed
+ * by PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL and sized for one
+ * struct cxl_pci_state.  Failure to add the buffer is only logged.
+ */
+void pci_allocate_cxl_save_buffer(struct pci_dev *dev)
+{
+	if (!pcie_is_cxl(dev))
+		return;
+
+	if (!pci_add_virtual_ext_cap_save_buffer(dev,
+			PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL,
+			sizeof(struct cxl_pci_state)))
+		return;
+
+	pci_err(dev, "unable to allocate CXL save buffer\n");
+}
+
+/*
+ * pci_save_cxl_state() - capture the CXL DVSEC registers of @pdev
+ * @pdev: device whose state is being saved
+ *
+ * Looks up the PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL save buffer and, when
+ * one exists, invalidates the previous snapshot before asking
+ * cxl_save_dvsec() to take a new one.
+ */
+void pci_save_cxl_state(struct pci_dev *pdev)
+{
+	struct pci_cap_saved_state *buf =
+		pci_find_saved_ext_cap(pdev, PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL);
+	struct cxl_pci_state *snap;
+
+	if (!buf)
+		return;
+
+	snap = (struct cxl_pci_state *)buf->cap.data;
+
+	/* Drop any stale snapshot; cxl_save_dvsec() re-validates it. */
+	snap->dvsec_valid = false;
+	cxl_save_dvsec(pdev, snap);
+}
+
+/*
+ * pci_restore_cxl_state() - write saved CXL DVSEC registers back to @pdev
+ * @pdev: device whose state is being restored
+ *
+ * Does nothing when the device has no PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL
+ * save buffer or when the buffer holds no valid snapshot.
+ */
+void pci_restore_cxl_state(struct pci_dev *pdev)
+{
+	struct pci_cap_saved_state *buf =
+		pci_find_saved_ext_cap(pdev, PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL);
+	struct cxl_pci_state *snap;
+
+	if (!buf)
+		return;
+
+	snap = (struct cxl_pci_state *)buf->cap.data;
+	if (snap->dvsec_valid)
+		cxl_restore_dvsec(pdev, snap);
+}
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index dc8181f13864..497720c64d6d 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1759,6 +1759,7 @@ int pci_save_state(struct pci_dev *dev)
 	pci_save_aer_state(dev);
 	pci_save_ptm_state(dev);
 	pci_save_tph_state(dev);
+	pci_save_cxl_state(dev);
 	return pci_save_vc_state(dev);
 }
 EXPORT_SYMBOL(pci_save_state);
@@ -1841,6 +1842,7 @@ void pci_restore_state(struct pci_dev *dev)
 	pci_restore_aer_state(dev);

 	pci_restore_config_space(dev);
+	pci_restore_cxl_state(dev);

 	pci_restore_pcix_state(dev);
 	pci_restore_msi_state(dev);
@@ -3489,6 +3491,7 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev)
 		pci_err(dev, "unable to allocate suspend buffer for LTR\n");

 	pci_allocate_vc_save_buffers(dev);
+	pci_allocate_cxl_save_buffer(dev);
 }

 void pci_free_cap_save_buffers(struct pci_dev *dev)
--
2.43.0
Re: [PATCH 4/5] PCI: Add cxl DVSEC state save/restore across resets
Posted by Jonathan Cameron 4 weeks ago
On Fri, 6 Mar 2026 08:00:18 +0000
smadhavan@nvidia.com wrote:

> From: Srirangan Madhavan <smadhavan@nvidia.com>
> 
> Save and restore CXL DVSEC control registers (CTRL, CTRL2), range
> base registers, and lock state across PCI resets.
> 
> When the DVSEC CONFIG_LOCK bit is set, certain DVSEC fields
> become read-only and hardware may have updated them. 

This I'm not following.  Can you give an example of which
fields the hardware is allowed to change after lock is set?

> Blindly
> restoring saved values would be silently ignored or conflict
> with hardware state. Instead, a read-merge-write approach is
> used: current hardware values are read for the RWL
> (read-write-when-locked) fields and merged with saved state,
> so only writable bits are restored while locked bits retain
> their hardware values.
> 
> Hooked into pci_save_state()/pci_restore_state() so all PCI reset
> paths automatically preserve CXL DVSEC configuration.
> 
> Signed-off-by: Srirangan Madhavan <smadhavan@nvidia.com>
Re: [PATCH 4/5] PCI: Add cxl DVSEC state save/restore across resets
Posted by Alex Williamson 1 month ago
On Fri, 6 Mar 2026 08:00:18 +0000
<smadhavan@nvidia.com> wrote:

> From: Srirangan Madhavan <smadhavan@nvidia.com>
> 
> Save and restore CXL DVSEC control registers (CTRL, CTRL2), range
> base registers, and lock state across PCI resets.
> 
> When the DVSEC CONFIG_LOCK bit is set, certain DVSEC fields
> become read-only and hardware may have updated them. Blindly
> restoring saved values would be silently ignored or conflict
> with hardware state. Instead, a read-merge-write approach is
> used: current hardware values are read for the RWL
> (read-write-when-locked) fields and merged with saved state,
> so only writable bits are restored while locked bits retain
> their hardware values.
> 
> Hooked into pci_save_state()/pci_restore_state() so all PCI reset
> paths automatically preserve CXL DVSEC configuration.
> 
> Signed-off-by: Srirangan Madhavan <smadhavan@nvidia.com>
> ---
>  drivers/pci/Kconfig  |   4 +
>  drivers/pci/Makefile |   1 +
>  drivers/pci/cxl.c    | 177 +++++++++++++++++++++++++++++++++++++++++++
>  drivers/pci/pci.c    |   3 +
>  4 files changed, 185 insertions(+)
>  create mode 100644 drivers/pci/cxl.c
> 
> diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
> index e3f848ffb52a..6b96650b3f31 100644
> --- a/drivers/pci/Kconfig
> +++ b/drivers/pci/Kconfig
> @@ -119,6 +119,10 @@ config XEN_PCIDEV_FRONTEND
>  	  The PCI device frontend driver allows the kernel to import arbitrary
>  	  PCI devices from a PCI backend to support PCI driver domains.
> 
> +config PCI_CXL
> +	bool
> +	default y if CXL_BUS
> +
>  config PCI_ATS
>  	bool
> 
> diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
> index 41ebc3b9a518..a6168ecef9c1 100644
> --- a/drivers/pci/Makefile
> +++ b/drivers/pci/Makefile
> @@ -39,6 +39,7 @@ obj-$(CONFIG_PCI_TSM)		+= tsm.o
>  obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o
>  obj-$(CONFIG_PCI_NPEM)		+= npem.o
>  obj-$(CONFIG_PCIE_TPH)		+= tph.o
> +obj-$(CONFIG_PCI_CXL)		+= cxl.o
>  obj-$(CONFIG_CARDBUS)		+= setup-cardbus.o
> 
>  # Endpoint library must be initialized before its users
> diff --git a/drivers/pci/cxl.c b/drivers/pci/cxl.c
> new file mode 100644
> index 000000000000..abcf70de9171
> --- /dev/null
> +++ b/drivers/pci/cxl.c
> @@ -0,0 +1,177 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * CXL PCI state save/restore support.
> + *
> + * Saves and restores CXL DVSEC registers across PCI resets and link
> + * disable/enable transitions. Hooked into pci_save_state() /
> + * pci_restore_state() via the PCI capability save chain.
> + */
> +#include <linux/pci.h>
> +#include <cxl/pci.h>
> +#include "pci.h"
> +
> +struct cxl_pci_state {
> +	u16 dvsec;
> +	u16 dvsec_ctrl;
> +	u16 dvsec_ctrl2;
> +	u32 range_base_hi[CXL_DVSEC_RANGE_MAX];
> +	u32 range_base_lo[CXL_DVSEC_RANGE_MAX];
> +	u16 dvsec_lock;
> +	bool dvsec_valid;
> +};
> +
> +static void cxl_save_dvsec(struct pci_dev *pdev, struct cxl_pci_state *state)
> +{
> +	int rc_ctrl, rc_ctrl2;
> +	u16 dvsec;
> +	int i;
> +
> +	dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
> +					  PCI_DVSEC_CXL_DEVICE);
> +	if (!dvsec)
> +		return;
> +
> +	state->dvsec = dvsec;
> +	rc_ctrl = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL,
> +				       &state->dvsec_ctrl);
> +	rc_ctrl2 = pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2,
> +					&state->dvsec_ctrl2);
> +	if (rc_ctrl || rc_ctrl2) {
> +		pci_warn(pdev,
> +			 "CXL: DVSEC read failed (ctrl rc=%d, ctrl2 rc=%d)\n",
> +			 rc_ctrl, rc_ctrl2);
> +		return;
> +	}
> +
> +	for (i = 0; i < CXL_DVSEC_RANGE_MAX; i++) {
> +		pci_read_config_dword(pdev,
> +			dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i),
> +			&state->range_base_hi[i]);
> +		pci_read_config_dword(pdev,
> +			dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i),
> +			&state->range_base_lo[i]);
> +	}

Shouldn't we also be handling the array of RANGE_SIZE registers here?
NB. the low bits indicating active/valid need special handling.  Thanks,

Alex

> +
> +	pci_read_config_word(pdev, dvsec + PCI_DVSEC_CXL_LOCK,
> +			     &state->dvsec_lock);
> +
> +	state->dvsec_valid = true;
> +}
> +
> +static u32 cxl_merge_rwl(u32 saved, u32 current_hw, u32 rwl_mask)
> +{
> +	return (current_hw & rwl_mask) | (saved & ~rwl_mask);
> +}
> +
> +static void cxl_restore_dvsec(struct pci_dev *pdev,
> +			      const struct cxl_pci_state *state)
> +{
> +	u16 lock_reg = 0;
> +	int i;
> +
> +	if (!state->dvsec_valid)
> +		return;
> +
> +	pci_read_config_word(pdev, state->dvsec + PCI_DVSEC_CXL_LOCK,
> +			     &lock_reg);
> +
> +	if (lock_reg & PCI_DVSEC_CXL_LOCK_CONFIG) {
> +		u16 hw_ctrl;
> +		u32 hw_range_hi, hw_range_lo;
> +
> +		pci_read_config_word(pdev,
> +				     state->dvsec + PCI_DVSEC_CXL_CTRL,
> +				     &hw_ctrl);
> +		pci_write_config_word(pdev,
> +			state->dvsec + PCI_DVSEC_CXL_CTRL,
> +			cxl_merge_rwl(state->dvsec_ctrl, hw_ctrl,
> +				      PCI_DVSEC_CXL_CTRL_RWL));
> +
> +		pci_write_config_word(pdev,
> +			state->dvsec + PCI_DVSEC_CXL_CTRL2,
> +			state->dvsec_ctrl2);
> +
> +		for (i = 0; i < CXL_DVSEC_RANGE_MAX; i++) {
> +			pci_read_config_dword(pdev,
> +				state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i),
> +				&hw_range_hi);
> +			pci_write_config_dword(pdev,
> +				state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i),
> +				cxl_merge_rwl(state->range_base_hi[i],
> +					      hw_range_hi,
> +					      PCI_DVSEC_CXL_RANGE_BASE_HI_RWL));
> +
> +			pci_read_config_dword(pdev,
> +				state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i),
> +				&hw_range_lo);
> +			pci_write_config_dword(pdev,
> +				state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i),
> +				cxl_merge_rwl(state->range_base_lo[i],
> +					      hw_range_lo,
> +					      PCI_DVSEC_CXL_RANGE_BASE_LO_RWL));
> +		}
> +	} else {
> +		pci_write_config_word(pdev,
> +				      state->dvsec + PCI_DVSEC_CXL_CTRL,
> +				      state->dvsec_ctrl);
> +		pci_write_config_word(pdev,
> +				      state->dvsec + PCI_DVSEC_CXL_CTRL2,
> +				      state->dvsec_ctrl2);
> +		for (i = 0; i < CXL_DVSEC_RANGE_MAX; i++) {
> +			pci_write_config_dword(pdev,
> +				state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i),
> +				state->range_base_hi[i]);
> +			pci_write_config_dword(pdev,
> +				state->dvsec + PCI_DVSEC_CXL_RANGE_BASE_LOW(i),
> +				state->range_base_lo[i]);
> +		}
> +
> +		pci_write_config_word(pdev,
> +			state->dvsec + PCI_DVSEC_CXL_LOCK,
> +			state->dvsec_lock);
> +	}
> +}
> +
> +void pci_allocate_cxl_save_buffer(struct pci_dev *dev)
> +{
> +	if (!pcie_is_cxl(dev))
> +		return;
> +
> +	if (pci_add_virtual_ext_cap_save_buffer(dev,
> +			PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL,
> +			sizeof(struct cxl_pci_state)))
> +		pci_err(dev, "unable to allocate CXL save buffer\n");
> +}
> +
> +void pci_save_cxl_state(struct pci_dev *pdev)
> +{
> +	struct pci_cap_saved_state *save_state;
> +	struct cxl_pci_state *state;
> +
> +	save_state = pci_find_saved_ext_cap(pdev,
> +					    PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL);
> +	if (!save_state)
> +		return;
> +
> +	state = (struct cxl_pci_state *)save_state->cap.data;
> +	state->dvsec_valid = false;
> +
> +	cxl_save_dvsec(pdev, state);
> +}
> +
> +void pci_restore_cxl_state(struct pci_dev *pdev)
> +{
> +	struct pci_cap_saved_state *save_state;
> +	struct cxl_pci_state *state;
> +
> +	save_state = pci_find_saved_ext_cap(pdev,
> +					    PCI_EXT_CAP_ID_CXL_DVSEC_VIRTUAL);
> +	if (!save_state)
> +		return;
> +
> +	state = (struct cxl_pci_state *)save_state->cap.data;
> +	if (!state->dvsec_valid)
> +		return;
> +
> +	cxl_restore_dvsec(pdev, state);
> +}
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index dc8181f13864..497720c64d6d 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -1759,6 +1759,7 @@ int pci_save_state(struct pci_dev *dev)
>  	pci_save_aer_state(dev);
>  	pci_save_ptm_state(dev);
>  	pci_save_tph_state(dev);
> +	pci_save_cxl_state(dev);
>  	return pci_save_vc_state(dev);
>  }
>  EXPORT_SYMBOL(pci_save_state);
> @@ -1841,6 +1842,7 @@ void pci_restore_state(struct pci_dev *dev)
>  	pci_restore_aer_state(dev);
> 
>  	pci_restore_config_space(dev);
> +	pci_restore_cxl_state(dev);
> 
>  	pci_restore_pcix_state(dev);
>  	pci_restore_msi_state(dev);
> @@ -3489,6 +3491,7 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev)
>  		pci_err(dev, "unable to allocate suspend buffer for LTR\n");
> 
>  	pci_allocate_vc_save_buffers(dev);
> +	pci_allocate_cxl_save_buffer(dev);
>  }
> 
>  void pci_free_cap_save_buffers(struct pci_dev *dev)
> --
> 2.43.0
>