[RESEND v13 21/25] PCI/AER: Dequeue forwarded CXL error
Posted by Terry Bowman 1 month, 1 week ago
The AER driver now forwards CXL protocol errors to the CXL driver via a
kfifo. The CXL driver must consume these work items, initiate protocol
error handling, and ensure RAS mappings remain valid throughout processing.

Implement cxl_proto_err_work_fn() to dequeue work items forwarded by the
AER service driver and begin protocol error processing by calling
cxl_handle_proto_error().

Add a PCI device lock on &pdev->dev within cxl_proto_err_work_fn() to
keep the PCI device structure valid during handling. Locking an Endpoint
will also defer RAS unmapping until the device is unlocked.

For Endpoints, add a lock on CXL memory device cxlds->dev. The CXL memory
device structure holds the RAS register reference needed during error
handling.

Add lock for the parent CXL Port for Root Ports, Downstream Ports, and
Upstream Ports to prevent destruction of structures holding mapped RAS
addresses while they are in use.

Invoke cxl_do_recovery() for uncorrectable errors. Treat this as a stub for
now; implement its functionality in a future patch.

Export pci_clean_device_status() to enable cleanup of AER status following
error handling.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>

---
Changes in v12->v13:
- Add cxlmd lock using guard() (Terry)
- Remove exporting of unused function, pci_aer_clear_fatal_status() (Dave Jiang)
- Change pr_err() calls to ratelimited. (Terry)
- Update commit message. (Terry)
- Remove namespace qualifier from pcie_clear_device_status()
  export (Dave Jiang)
- Move locks into cxl_proto_err_work_fn() (Dave)
- Update log messages in cxl_forward_error() (Ben)

Changes in v11->v12:
- Add guard for CE case in cxl_handle_proto_error() (Dave)

Changes in v10->v11:
- Reword patch commit message to remove RCiEP details (Jonathan)
- Add #include <linux/bitfield.h> (Terry)
- is_cxl_rcd() - Fix short comment message wrap  (Jonathan)
- is_cxl_rcd() - Combine return calls into 1  (Jonathan)
- cxl_handle_proto_error() - Move comment earlier  (Jonathan)
- Use FIELD_GET() in discovering class code (Jonathan)
- Remove BDF from cxl_proto_err_work_data. Use 'struct pci_dev *' (Dan)
---
 drivers/cxl/core/ras.c | 153 ++++++++++++++++++++++++++++++++++++++---
 drivers/pci/pci.c      |   1 +
 drivers/pci/pci.h      |   1 -
 include/linux/pci.h    |   2 +
 4 files changed, 145 insertions(+), 12 deletions(-)

diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 142ca8794107..5bc144cde0ee 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -117,17 +117,6 @@ static void cxl_cper_prot_err_work_fn(struct work_struct *work)
 }
 static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
 
-int cxl_ras_init(void)
-{
-	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
-}
-
-void cxl_ras_exit(void)
-{
-	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
-	cancel_work_sync(&cxl_cper_prot_err_work);
-}
-
 static bool is_pcie_endpoint(struct pci_dev *pdev)
 {
 	return pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT;
@@ -178,6 +167,51 @@ static void __iomem *cxl_get_ras_base(struct device *dev)
 	return NULL;
 }
 
+/*
+ * Return 'struct cxl_port *' parent CXL port of dev's
+ *
+ * Reference count increments on success
+ *
+ * dev: Find the parent port of this dev
+ */
+static struct cxl_port *get_cxl_port(struct pci_dev *pdev)
+{
+	switch (pci_pcie_type(pdev)) {
+	case PCI_EXP_TYPE_ROOT_PORT:
+	case PCI_EXP_TYPE_DOWNSTREAM:
+	{
+		struct cxl_dport *dport;
+		struct cxl_port *port = find_cxl_port(&pdev->dev, &dport);
+
+		if (!port) {
+			pci_err(pdev, "Failed to find the CXL device");
+			return NULL;
+		}
+		return port;
+	}
+	case PCI_EXP_TYPE_UPSTREAM:
+	{
+		struct cxl_port *port = find_cxl_port_by_uport(&pdev->dev);
+
+		if (!port) {
+			pci_err(pdev, "Failed to find the CXL device");
+			return NULL;
+		}
+		return port;
+	}
+	case PCI_EXP_TYPE_ENDPOINT:
+	{
+		struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+		struct cxl_port *port = cxlds->cxlmd->endpoint;
+
+		get_device(&port->dev);
+		return port;
+	}
+	}
+	pci_warn_once(pdev, "Error: Unsupported device type (%X)", pci_pcie_type(pdev));
+	return NULL;
+}
+
 /**
  * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport
  * @dport: the cxl_dport that needs to be initialized
@@ -212,6 +246,23 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port,
 }
 EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, "CXL");
 
+static bool device_lock_if(struct device *dev, bool cond)
+{
+	if (cond)
+		device_lock(dev);
+	return cond;
+}
+
+static void device_unlock_if(struct device *dev, bool take)
+{
+	if (take)
+		device_unlock(dev);
+}
+
+static void cxl_do_recovery(struct pci_dev *pdev)
+{
+}
+
 void cxl_handle_cor_ras(struct device *dev, u64 serial, void __iomem *ras_base)
 {
 	void __iomem *addr;
@@ -388,3 +439,83 @@ pci_ers_result_t pci_error_detected(struct pci_dev *pdev,
 	return rc;
 }
 EXPORT_SYMBOL_NS_GPL(pci_error_detected, "CXL");
+
+static void cxl_handle_proto_error(struct cxl_proto_err_work_data *err_info)
+{
+	struct pci_dev *pdev = err_info->pdev;
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+
+	if (err_info->severity == AER_CORRECTABLE) {
+
+		if (pdev->aer_cap)
+			pci_clear_and_set_config_dword(pdev,
+						       pdev->aer_cap + PCI_ERR_COR_STATUS,
+						       0, PCI_ERR_COR_INTERNAL);
+
+		if (is_pcie_endpoint(pdev))
+			cxl_cor_error_detected(&cxlds->cxlmd->dev);
+		else
+			cxl_port_cor_error_detected(&pdev->dev);
+
+		pcie_clear_device_status(pdev);
+	} else {
+		cxl_do_recovery(pdev);
+	}
+}
+
+static void cxl_proto_err_work_fn(struct work_struct *work)
+{
+	struct cxl_proto_err_work_data wd;
+
+	while (cxl_proto_err_kfifo_get(&wd)) {
+		struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(wd.pdev);
+		struct device *cxlmd_dev;
+
+		if (!pdev) {
+			pr_err_ratelimited("NULL PCI device passed in AER-CXL KFIFO\n");
+			continue;
+		}
+
+		guard(device)(&pdev->dev);
+		if (is_pcie_endpoint(pdev)) {
+			struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+
+			if (!cxl_pci_drv_bound(pdev))
+				return;
+			cxlmd_dev = &cxlds->cxlmd->dev;
+			device_lock_if(cxlmd_dev, cxlmd_dev);
+		} else {
+			cxlmd_dev = NULL;
+		}
+
+		struct cxl_port *port __free(put_cxl_port) = get_cxl_port(pdev);
+		if (!port)
+			return;
+		guard(device)(&port->dev);
+
+		cxl_handle_proto_error(&wd);
+		device_unlock_if(cxlmd_dev, cxlmd_dev);
+	}
+}
+
+static struct work_struct cxl_proto_err_work;
+static DECLARE_WORK(cxl_proto_err_work, cxl_proto_err_work_fn);
+
+int cxl_ras_init(void)
+{
+	if (cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work))
+		pr_err("Failed to initialize CXL RAS CPER\n");
+
+	cxl_register_proto_err_work(&cxl_proto_err_work);
+
+	return 0;
+}
+
+void cxl_ras_exit(void)
+{
+	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
+	cancel_work_sync(&cxl_cper_prot_err_work);
+
+	cxl_unregister_proto_err_work();
+	cancel_work_sync(&cxl_proto_err_work);
+}
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 53a49bb32514..6341ca6515a5 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2277,6 +2277,7 @@ void pcie_clear_device_status(struct pci_dev *dev)
 	pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta);
 	pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta);
 }
+EXPORT_SYMBOL_GPL(pcie_clear_device_status);
 #endif
 
 /**
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index a398e489318c..2af6ea82526d 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -229,7 +229,6 @@ void pci_refresh_power_state(struct pci_dev *dev);
 int pci_power_up(struct pci_dev *dev);
 void pci_disable_enabled_device(struct pci_dev *dev);
 int pci_finish_runtime_suspend(struct pci_dev *dev);
-void pcie_clear_device_status(struct pci_dev *dev);
 void pcie_clear_root_pme_status(struct pci_dev *dev);
 bool pci_check_pme_status(struct pci_dev *dev);
 void pci_pme_wakeup_bus(struct pci_bus *bus);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index cffa5535f28d..33d16b212e0d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1886,8 +1886,10 @@ static inline void pci_hp_unignore_link_change(struct pci_dev *pdev) { }
 
 #ifdef CONFIG_PCIEAER
 bool pci_aer_available(void);
+void pcie_clear_device_status(struct pci_dev *dev);
 #else
 static inline bool pci_aer_available(void) { return false; }
+static inline void pcie_clear_device_status(struct pci_dev *dev) { }
 #endif
 
 bool pci_ats_disabled(void);
-- 
2.34.1
Re: [RESEND v13 21/25] PCI/AER: Dequeue forwarded CXL error
Posted by dan.j.williams@intel.com 3 weeks, 6 days ago
Terry Bowman wrote:
> The AER driver now forwards CXL protocol errors to the CXL driver via a
> kfifo. The CXL driver must consume these work items, initiate protocol
> error handling, and ensure RAS mappings remain valid throughout processing.
> 
> Implement cxl_proto_err_work_fn() to dequeue work items forwarded by the
> AER service driver and begin protocol error processing by calling
> cxl_handle_proto_error().
> 
> Add a PCI device lock on &pdev->dev within cxl_proto_err_work_fn() to
> keep the PCI device structure valid during handling. Locking an Endpoint
> will also defer RAS unmapping until the device is unlocked.
> 
> For Endpoints, add a lock on CXL memory device cxlds->dev. The CXL memory
> device structure holds the RAS register reference needed during error
> handling.
> 
> Add lock for the parent CXL Port for Root Ports, Downstream Ports, and
> Upstream Ports to prevent destruction of structures holding mapped RAS
> addresses while they are in use.
> 
> Invoke cxl_do_recovery() for uncorrectable errors. Treat this as a stub for
> now; implement its functionality in a future patch.
> 
> Export pci_clean_device_status() to enable cleanup of AER status following
> error handling.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> 
> ---
> Changes in v12->v13:
> - Add cxlmd lock using guard() (Terry)
> - Remove exporting of unused function, pci_aer_clear_fatal_status() (Dave Jiang)
> - Change pr_err() calls to ratelimited. (Terry)
> - Update commit message. (Terry)
> - Remove namespace qualifier from pcie_clear_device_status()
>   export (Dave Jiang)
> - Move locks into cxl_proto_err_work_fn() (Dave)
> - Update log messages in cxl_forward_error() (Ben)
> 
> Changes in v11->v12:
> - Add guard for CE case in cxl_handle_proto_error() (Dave)
> 
> Changes in v10->v11:
> - Reword patch commit message to remove RCiEP details (Jonathan)
> - Add #include <linux/bitfield.h> (Terry)
> - is_cxl_rcd() - Fix short comment message wrap  (Jonathan)
> - is_cxl_rcd() - Combine return calls into 1  (Jonathan)
> - cxl_handle_proto_error() - Move comment earlier  (Jonathan)
> - Use FIELD_GET() in discovering class code (Jonathan)
> - Remove BDF from cxl_proto_err_work_data. Use 'struct
> pci_dev *' (Dan)
> ---
>  drivers/cxl/core/ras.c | 153 ++++++++++++++++++++++++++++++++++++++---
>  drivers/pci/pci.c      |   1 +
>  drivers/pci/pci.h      |   1 -
>  include/linux/pci.h    |   2 +
>  4 files changed, 145 insertions(+), 12 deletions(-)
[..]
> +static void cxl_proto_err_work_fn(struct work_struct *work)
> +{
> +	struct cxl_proto_err_work_data wd;
> +
> +	while (cxl_proto_err_kfifo_get(&wd)) {
> +		struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(wd.pdev);

Why does this function need its own device reference? I think this
handler should match PCI AER semantics where the device validity is
caller guaranteed.
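
i.e., something along these lines (untested, and assumes the forwarding
side takes the pdev reference before queueing it):

	while (cxl_proto_err_kfifo_get(&wd)) {
		/*
		 * Hypothetical: the producer pinned wd.pdev with
		 * pci_dev_get() before queueing, so validity here is
		 * caller guaranteed and the consumer only has to drop
		 * that reference once handling completes.
		 */
		struct pci_dev *pdev __free(pci_dev_put) = wd.pdev;

		guard(device)(&pdev->dev);
		cxl_handle_proto_error(&wd);
	}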

> +		struct device *cxlmd_dev;
> +
> +		if (!pdev) {
> +			pr_err_ratelimited("NULL PCI device passed in AER-CXL KFIFO\n");
> +			continue;
> +		}
> +
> +		guard(device)(&pdev->dev);
> +		if (is_pcie_endpoint(pdev)) {
> +			struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +
> +			if (!cxl_pci_drv_bound(pdev))
> +				return;
> +			cxlmd_dev = &cxlds->cxlmd->dev;
> +			device_lock_if(cxlmd_dev, cxlmd_dev);

Ok, I think this demonstrates the problematic usage of
cxl_pci_drv_bound(), and the presence of conditional locking is also a
tell that this is broken.

My expectation is that CXL protocol errors are exclusively reported to
cxl_ports. That means all RAS register mappings must be scoped strictly
to the cxl_port::probe()/cxl_port::remove() lifetime. Once that is in
place this endpoint case melts away. The endpoint's job is to register
an endpoint-port to get protocol error services.

Given time is short for v6.19 I might take a quick stab at this to
demonstrate the proposal (or otherwise try to quickly discover why the
suggestion cannot work).
Re: [RESEND v13 21/25] PCI/AER: Dequeue forwarded CXL error
Posted by Bjorn Helgaas 1 month, 1 week ago
On Tue, Nov 04, 2025 at 11:03:01AM -0600, Terry Bowman wrote:
> The AER driver now forwards CXL protocol errors to the CXL driver via a
> kfifo. The CXL driver must consume these work items, initiate protocol
> error handling, and ensure RAS mappings remain valid throughout processing.
> 
> Implement cxl_proto_err_work_fn() to dequeue work items forwarded by the
> AER service driver and begin protocol error processing by calling
> cxl_handle_proto_error().
> 
> Add a PCI device lock on &pdev->dev within cxl_proto_err_work_fn() to
> keep the PCI device structure valid during handling. Locking an Endpoint
> will also defer RAS unmapping until the device is unlocked.
> 
> For Endpoints, add a lock on CXL memory device cxlds->dev. The CXL memory
> device structure holds the RAS register reference needed during error
> handling.
> 
> Add lock for the parent CXL Port for Root Ports, Downstream Ports, and
> Upstream Ports to prevent destruction of structures holding mapped RAS
> addresses while they are in use.
> 
> Invoke cxl_do_recovery() for uncorrectable errors. Treat this as a stub for
> now; implement its functionality in a future patch.
> 
> Export pci_clean_device_status() to enable cleanup of AER status following
> error handling.

s/pci_clean_device_status/pcie_clear_device_status/

Acked-by: Bjorn Helgaas <bhelgaas@google.com>

>  drivers/cxl/core/ras.c | 153 ++++++++++++++++++++++++++++++++++++++---
>  drivers/pci/pci.c      |   1 +
>  drivers/pci/pci.h      |   1 -
>  include/linux/pci.h    |   2 +

Looks like this is primarily a CXL change, and the PCI part is
minimal, so I question the "PCI/AER:" prefix in the subject.

> +static struct cxl_port *get_cxl_port(struct pci_dev *pdev)
> +{
> +	switch (pci_pcie_type(pdev)) {
> +	case PCI_EXP_TYPE_ROOT_PORT:
> +	case PCI_EXP_TYPE_DOWNSTREAM:
> +	{
> +		struct cxl_dport *dport;
> +		struct cxl_port *port = find_cxl_port(&pdev->dev, &dport);
> +
> +		if (!port) {
> +			pci_err(pdev, "Failed to find the CXL device");
> +			return NULL;
> +		}
> +		return port;
> +	}
> +	case PCI_EXP_TYPE_UPSTREAM:
> +	{
> +		struct cxl_port *port = find_cxl_port_by_uport(&pdev->dev);
> +
> +		if (!port) {
> +			pci_err(pdev, "Failed to find the CXL device");
> +			return NULL;
> +		}
> +		return port;
> +	}
> +	case PCI_EXP_TYPE_ENDPOINT:
> +	{
> +		struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +		struct cxl_port *port = cxlds->cxlmd->endpoint;
> +
> +		get_device(&port->dev);
> +		return port;
> +	}
> +	}
> +	pci_warn_once(pdev, "Error: Unsupported device type (%X)", pci_pcie_type(pdev));

Maybe use "%#x" so it's clear that this is hex?  PCI typically uses
lower-case hex; maybe the CXL convention is different.
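
i.e., something like:

	pci_warn_once(pdev, "Error: Unsupported device type (%#x)",
		      pci_pcie_type(pdev));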

> +static void cxl_handle_proto_error(struct cxl_proto_err_work_data *err_info)
> +{
> +	struct pci_dev *pdev = err_info->pdev;
> +	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +
> +	if (err_info->severity == AER_CORRECTABLE) {
> +
> +		if (pdev->aer_cap)
> +			pci_clear_and_set_config_dword(pdev,
> +						       pdev->aer_cap + PCI_ERR_COR_STATUS,
> +						       0, PCI_ERR_COR_INTERNAL);
> +
> +		if (is_pcie_endpoint(pdev))
> +			cxl_cor_error_detected(&cxlds->cxlmd->dev);
> +		else
> +			cxl_port_cor_error_detected(&pdev->dev);
> +
> +		pcie_clear_device_status(pdev);

The AER clear above and pcie_clear_device_status() require
ownership of the PCIe Capability and the AER Capability, typically
granted by _OSC.

I suppose it's obvious that the OS does own these Capabilities if we
get here, but I'm not familiar with this code.
Re: [RESEND v13 21/25] PCI/AER: Dequeue forwarded CXL error
Posted by Jonathan Cameron 1 month, 1 week ago
On Tue, 4 Nov 2025 11:03:01 -0600
Terry Bowman <terry.bowman@amd.com> wrote:

> The AER driver now forwards CXL protocol errors to the CXL driver via a
> kfifo. The CXL driver must consume these work items, initiate protocol
> error handling, and ensure RAS mappings remain valid throughout processing.
> 
> Implement cxl_proto_err_work_fn() to dequeue work items forwarded by the
> AER service driver and begin protocol error processing by calling
> cxl_handle_proto_error().
> 
> Add a PCI device lock on &pdev->dev within cxl_proto_err_work_fn() to
> keep the PCI device structure valid during handling. Locking an Endpoint
> will also defer RAS unmapping until the device is unlocked.
> 
> For Endpoints, add a lock on CXL memory device cxlds->dev. The CXL memory
> device structure holds the RAS register reference needed during error
> handling.
> 
> Add lock for the parent CXL Port for Root Ports, Downstream Ports, and
> Upstream Ports to prevent destruction of structures holding mapped RAS
> addresses while they are in use.
> 
> Invoke cxl_do_recovery() for uncorrectable errors. Treat this as a stub for
> now; implement its functionality in a future patch.
> 
> Export pci_clean_device_status() to enable cleanup of AER status following
> error handling.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> 
Various comments inline.
> 
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 142ca8794107..5bc144cde0ee 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -117,17 +117,6 @@ static void cxl_cper_prot_err_work_fn(struct work_struct *work)
>  }
>  static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
>  
> -int cxl_ras_init(void)
> -{
> -	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> -}
> -
> -void cxl_ras_exit(void)
> -{
> -	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
> -	cancel_work_sync(&cxl_cper_prot_err_work);
> -}
> -
>  static bool is_pcie_endpoint(struct pci_dev *pdev)
>  {
>  	return pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT;
> @@ -178,6 +167,51 @@ static void __iomem *cxl_get_ras_base(struct device *dev)
>  	return NULL;
>  }
>  
> +/*
> + * Return 'struct cxl_port *' parent CXL port of dev's
> + *
> + * Reference count increments on success
> + *
> + * dev: Find the parent port of this dev

pdev. 

Generally I'd prefer kernel-doc style even for non exported
/ exposed functions.  Makes it easy to check for stuff like
this as the script will moan at you.

> + */
> +static struct cxl_port *get_cxl_port(struct pci_dev *pdev)
> +{
> +	switch (pci_pcie_type(pdev)) {
> +	case PCI_EXP_TYPE_ROOT_PORT:
> +	case PCI_EXP_TYPE_DOWNSTREAM:
> +	{
> +		struct cxl_dport *dport;
> +		struct cxl_port *port = find_cxl_port(&pdev->dev, &dport);
> +
> +		if (!port) {
> +			pci_err(pdev, "Failed to find the CXL device");
> +			return NULL;
> +		}
> +		return port;
> +	}
> +	case PCI_EXP_TYPE_UPSTREAM:
> +	{
> +		struct cxl_port *port = find_cxl_port_by_uport(&pdev->dev);
> +
> +		if (!port) {
> +			pci_err(pdev, "Failed to find the CXL device");
> +			return NULL;
> +		}
> +		return port;
> +	}
> +	case PCI_EXP_TYPE_ENDPOINT:
> +	{
> +		struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +		struct cxl_port *port = cxlds->cxlmd->endpoint;
> +
> +		get_device(&port->dev);
> +		return port;
> +	}
> +	}
> +	pci_warn_once(pdev, "Error: Unsupported device type (%X)", pci_pcie_type(pdev));
> +	return NULL;
> +}
> +
>  /**
>   * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport
>   * @dport: the cxl_dport that needs to be initialized
> @@ -212,6 +246,23 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port,
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, "CXL");
>  
> +static bool device_lock_if(struct device *dev, bool cond)
> +{
> +	if (cond)
> +		device_lock(dev);
> +	return cond;
> +}
> +
> +static void device_unlock_if(struct device *dev, bool take)
> +{
> +	if (take)
> +		device_unlock(dev);
> +}

See below. To me these are too weird to wrap up.  Open code them inline
where we can see what they are doing.
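
e.g. something along these lines in cxl_proto_err_work_fn() (completely
untested sketch):

	struct device *cxlmd_dev = NULL;

	if (is_pcie_endpoint(pdev)) {
		struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);

		if (!cxl_pci_drv_bound(pdev))
			return;
		cxlmd_dev = &cxlds->cxlmd->dev;
		/* pin the memdev so its RAS mapping stays valid */
		device_lock(cxlmd_dev);
	}

	/* ... port lookup and lock as before ... */
	cxl_handle_proto_error(&wd);

	if (cxlmd_dev)
		device_unlock(cxlmd_dev);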

> +static void cxl_proto_err_work_fn(struct work_struct *work)
> +{
> +	struct cxl_proto_err_work_data wd;
> +
> +	while (cxl_proto_err_kfifo_get(&wd)) {
> +		struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(wd.pdev);
> +		struct device *cxlmd_dev;
> +
> +		if (!pdev) {
> +			pr_err_ratelimited("NULL PCI device passed in AER-CXL KFIFO\n");
> +			continue;
> +		}
> +
> +		guard(device)(&pdev->dev);
> +		if (is_pcie_endpoint(pdev)) {
> +			struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +
> +			if (!cxl_pci_drv_bound(pdev))
> +				return;
> +			cxlmd_dev = &cxlds->cxlmd->dev;
> +			device_lock_if(cxlmd_dev, cxlmd_dev);

As below. Too odd.  Also needs comments to explain why conditionally locking it
would be useful.

> +		} else {
> +			cxlmd_dev = NULL;

Set it to NULL at declaration and drop this else leg.

> +		}
> +
> +		struct cxl_port *port __free(put_cxl_port) = get_cxl_port(pdev);
> +		if (!port)
> +			return;
> +		guard(device)(&port->dev);
> +
> +		cxl_handle_proto_error(&wd);
> +		device_unlock_if(cxlmd_dev, cxlmd_dev);

This is too odd to wrap up like that.  Particularly given the very
generic sounding device_unlock_if() naming.

> +	}
> +}