During CXL device cleanup the CXL PCIe Port device interrupts may remain
enabled. This can potentially allow unnecessary interrupt processing on
behalf of CXL errors while the device is destroyed.

Disable CXL protocol errors by setting the CXL devices' AER mask registers.

Introduce pci_aer_mask_internal_errors(), the counterpart of
pci_aer_unmask_internal_errors().

Next, introduce cxl_disable_prot_errors() to call
pci_aer_mask_internal_errors(). Register cxl_disable_prot_errors() to run
at CXL device cleanup for CXL Root Ports, CXL Downstream Ports, CXL
Upstream Ports, and CXL Endpoints.
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
drivers/cxl/port.c | 18 +++++++++++++++++-
drivers/pci/pcie/aer.c | 25 +++++++++++++++++++++++++
include/linux/aer.h | 1 +
3 files changed, 43 insertions(+), 1 deletion(-)
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index bb7a0526e609..7e3efd8be8eb 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -101,6 +101,19 @@ void cxl_enable_prot_errors(struct device *dev)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_enable_prot_errors, "CXL");
+void cxl_disable_prot_errors(void *_dev)
+{
+	struct device *dev = _dev;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct device *pci_dev __free(put_device) = get_device(&pdev->dev);
+
+	if (!pci_dev || !pdev->aer_cap)
+		return;
+
+	pci_aer_mask_internal_errors(pdev);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_disable_prot_errors, "CXL");
+
 static void cxl_dport_map_rch_aer(struct cxl_dport *dport)
 {
 	resource_size_t aer_phys;
@@ -166,6 +179,7 @@ static void cxl_uport_init_ras_reporting(struct cxl_port *port,
 	cxl_assign_error_handlers(&port->dev, &cxl_port_error_handlers);
 	cxl_enable_prot_errors(port->uport_dev);
+	devm_add_action_or_reset(host, cxl_disable_prot_errors, port->uport_dev);
 }
/**
@@ -197,6 +211,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host)
 	cxl_assign_error_handlers(dport->dport_dev, &cxl_port_error_handlers);
 	cxl_enable_prot_errors(dport->dport_dev);
+	devm_add_action_or_reset(host, cxl_disable_prot_errors, dport->dport_dev);
 }
EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
@@ -223,7 +238,7 @@ static void cxl_endpoint_port_init_ras(struct cxl_port *port)
 	struct device *cxlmd_dev __free(put_device) = &cxlmd->dev;
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
-	if (!dport || !dev_is_pci(dport->dport_dev)) {
+	if (!dport || !dev_is_pci(dport->dport_dev) || !dev_is_pci(cxlds->dev)) {
 		dev_err(&port->dev, "CXL port topology not found\n");
 		return;
 	}
@@ -232,6 +247,7 @@ static void cxl_endpoint_port_init_ras(struct cxl_port *port)
 	cxl_assign_error_handlers(cxlmd_dev, &cxl_ep_error_handlers);
 	cxl_enable_prot_errors(cxlds->dev);
+	devm_add_action_or_reset(cxlds->dev, cxl_disable_prot_errors, cxlds->dev);
 }
#else
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index d3068f5cc767..d1ef0c676ff8 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -977,6 +977,31 @@ void pci_aer_unmask_internal_errors(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_NS_GPL(pci_aer_unmask_internal_errors, "CXL");
+/**
+ * pci_aer_mask_internal_errors - mask internal errors
+ * @dev: pointer to the pcie_dev data structure
+ *
+ * Masks internal errors in the Uncorrectable and Correctable Error
+ * Mask registers.
+ *
+ * Note: AER must be enabled and supported by the device which must be
+ * checked in advance, e.g. with pcie_aer_is_native().
+ */
+void pci_aer_mask_internal_errors(struct pci_dev *dev)
+{
+	int aer = dev->aer_cap;
+	u32 mask;
+
+	pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask);
+	mask |= PCI_ERR_UNC_INTN;
+	pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask);
+
+	pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask);
+	mask |= PCI_ERR_COR_INTERNAL;
+	pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask);
+}
+EXPORT_SYMBOL_NS_GPL(pci_aer_mask_internal_errors, "CXL");
+
 static bool is_cxl_mem_dev(struct pci_dev *dev)
 {
 	/*
diff --git a/include/linux/aer.h b/include/linux/aer.h
index a65fe324fad2..f0c84db466e5 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -101,5 +101,6 @@ int cper_severity_to_aer(int cper_severity);
 void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
 		       int severity, struct aer_capability_regs *aer_regs);
 void pci_aer_unmask_internal_errors(struct pci_dev *dev);
+void pci_aer_mask_internal_errors(struct pci_dev *dev);
 #endif //_AER_H_
--
2.34.1
On Wed, 26 Mar 2025 20:47:17 -0500
Terry Bowman <terry.bowman@amd.com> wrote:
> During CXL device cleanup the CXL PCIe Port device interrupts may remain
> enabled. This can potentially allow unnecessary interrupt processing on
> behalf of CXL errors while the device is destroyed.
>
> Disable CXL protocol errors by setting the CXL devices' AER mask register.
>
> Introduce pci_aer_mask_internal_errors() similar to pci_aer_unmask_internal_errors().
>
> Next, introduce cxl_disable_prot_errors() to call pci_aer_mask_internal_errors().
> Register cxl_disable_prot_errors() to run at CXL device cleanup.
> Register for CXL Root Ports, CXL Downstream Ports, CXL Upstream Ports, and
> CXL Endpoints.
>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
A few small comments in here. I haven't looked through all the rest of the series
as I'm out of time today, but this one caught my eye.
>
> @@ -223,7 +238,7 @@ static void cxl_endpoint_port_init_ras(struct cxl_port *port)
> struct device *cxlmd_dev __free(put_device) = &cxlmd->dev;
> struct cxl_dev_state *cxlds = cxlmd->cxlds;
>
> - if (!dport || !dev_is_pci(dport->dport_dev)) {
> + if (!dport || !dev_is_pci(dport->dport_dev) || !dev_is_pci(cxlds->dev)) {
> dev_err(&port->dev, "CXL port topology not found\n");
> return;
> }
> @@ -232,6 +247,7 @@ static void cxl_endpoint_port_init_ras(struct cxl_port *port)
>
> cxl_assign_error_handlers(cxlmd_dev, &cxl_ep_error_handlers);
> cxl_enable_prot_errors(cxlds->dev);
> + devm_add_action_or_reset(cxlds->dev, cxl_disable_prot_errors, cxlds->dev);
This can fail (at least in theory). Should at least scream that oddly we've
disabled error handling interrupts if it is hard to return anything cleanly.
Same for all the other cases.
> }
>
> #else
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index d3068f5cc767..d1ef0c676ff8 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -977,6 +977,31 @@ void pci_aer_unmask_internal_errors(struct pci_dev *dev)
> }
> EXPORT_SYMBOL_NS_GPL(pci_aer_unmask_internal_errors, "CXL");
>
> +/**
> + * pci_aer_mask_internal_errors - mask internal errors
> + * @dev: pointer to the pcie_dev data structure
> + *
> + * Masks internal errors in the Uncorrectable and Correctable Error
> + * Mask registers.
> + *
> + * Note: AER must be enabled and supported by the device which must be
> + * checked in advance, e.g. with pcie_aer_is_native().
> + */
> +void pci_aer_mask_internal_errors(struct pci_dev *dev)
> +{
> + int aer = dev->aer_cap;
> + u32 mask;
> +
> + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask);
> + mask |= PCI_ERR_UNC_INTN;
> + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask);
> +
It does an extra clear we don't need, but....
	pci_clear_and_set_config_dword(dev, aer + PCI_ERR_UNCOR_MASK,
				       0, PCI_ERR_UNC_INTN);

is at the very least shorter than the above 3 lines.
> + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask);
> + mask |= PCI_ERR_COR_INTERNAL;
> + pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask);
> +}
> +EXPORT_SYMBOL_NS_GPL(pci_aer_mask_internal_errors, "CXL");
> +
> static bool is_cxl_mem_dev(struct pci_dev *dev)
> {
> /*
> diff --git a/include/linux/aer.h b/include/linux/aer.h
> index a65fe324fad2..f0c84db466e5 100644
> --- a/include/linux/aer.h
> +++ b/include/linux/aer.h
> @@ -101,5 +101,6 @@ int cper_severity_to_aer(int cper_severity);
> void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
> int severity, struct aer_capability_regs *aer_regs);
> void pci_aer_unmask_internal_errors(struct pci_dev *dev);
> +void pci_aer_mask_internal_errors(struct pci_dev *dev);
> #endif //_AER_H_
>
On 4/4/2025 12:04 PM, Jonathan Cameron wrote:
> On Wed, 26 Mar 2025 20:47:17 -0500
> Terry Bowman <terry.bowman@amd.com> wrote:
>
>> During CXL device cleanup the CXL PCIe Port device interrupts may remain
>> enabled. This can potentially allow unnecessary interrupt processing on
>> behalf of CXL errors while the device is destroyed.
>>
>> Disable CXL protocol errors by setting the CXL devices' AER mask register.
>>
>> Introduce pci_aer_mask_internal_errors() similar to pci_aer_unmask_internal_errors().
>>
>> Next, introduce cxl_disable_prot_errors() to call pci_aer_mask_internal_errors().
>> Register cxl_disable_prot_errors() to run at CXL device cleanup.
>> Register for CXL Root Ports, CXL Downstream Ports, CXL Upstream Ports, and
>> CXL Endpoints.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> A few small comments in here. I haven't looked through all the rest of the series
> as I'm out of time today, but this one caught my eye.
>>
>> @@ -223,7 +238,7 @@ static void cxl_endpoint_port_init_ras(struct cxl_port *port)
>> struct device *cxlmd_dev __free(put_device) = &cxlmd->dev;
>> struct cxl_dev_state *cxlds = cxlmd->cxlds;
>>
>> - if (!dport || !dev_is_pci(dport->dport_dev)) {
>> + if (!dport || !dev_is_pci(dport->dport_dev) || !dev_is_pci(cxlds->dev)) {
>> dev_err(&port->dev, "CXL port topology not found\n");
>> return;
>> }
>> @@ -232,6 +247,7 @@ static void cxl_endpoint_port_init_ras(struct cxl_port *port)
>>
>> cxl_assign_error_handlers(cxlmd_dev, &cxl_ep_error_handlers);
>> cxl_enable_prot_errors(cxlds->dev);
>> + devm_add_action_or_reset(cxlds->dev, cxl_disable_prot_errors, cxlds->dev);
> This can fail (at least in theory). Should at least scream that oddly we've
> disabled error handling interrupts if it is hard to return anything cleanly.
>
> Same for all the other cases.
Ok. I will add a dev_err() for errors returned by devm_add_action_or_reset().
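Roughly like this at each registration site (a quick sketch against this patch,
final message wording TBD; note that on failure devm_add_action_or_reset() has
already run cxl_disable_prot_errors(), so protocol errors end up masked either way):

	int rc;

	rc = devm_add_action_or_reset(host, cxl_disable_prot_errors,
				      dport->dport_dev);
	if (rc)
		dev_err(host, "Failed to register protocol error cleanup: %d\n", rc);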
>> }
>>
>> #else
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index d3068f5cc767..d1ef0c676ff8 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -977,6 +977,31 @@ void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>> }
>> EXPORT_SYMBOL_NS_GPL(pci_aer_unmask_internal_errors, "CXL");
>>
>> +/**
>> + * pci_aer_mask_internal_errors - mask internal errors
>> + * @dev: pointer to the pcie_dev data structure
>> + *
>> + * Masks internal errors in the Uncorrectable and Correctable Error
>> + * Mask registers.
>> + *
>> + * Note: AER must be enabled and supported by the device which must be
>> + * checked in advance, e.g. with pcie_aer_is_native().
>> + */
>> +void pci_aer_mask_internal_errors(struct pci_dev *dev)
>> +{
>> + int aer = dev->aer_cap;
>> + u32 mask;
>> +
>> + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask);
>> + mask |= PCI_ERR_UNC_INTN;
>> + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask);
>> +
> It does an extra clear we don't need, but....
> pci_clear_and_set_config_dword(dev, aer + PCI_ERR_UNCOR_MASK,
> 0, PCI_ERR_UNC_INTN);
>
> is at very least shorter than the above 3 lines.
Doing so will overwrite the existing mask. CXL normally only uses AER UIE/CIE, but if the device
happens to lose alternate training and no longer identifies as a CXL device, then this mask
value would be critical for reporting PCI AER errors and would need UCE/CE (other than
UIE/CIE) enabled.
-Terry
>> + pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask);
>> + mask |= PCI_ERR_COR_INTERNAL;
>> + pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask);
>> +}
>> +EXPORT_SYMBOL_NS_GPL(pci_aer_mask_internal_errors, "CXL");
>> +
>> static bool is_cxl_mem_dev(struct pci_dev *dev)
>> {
>> /*
>> diff --git a/include/linux/aer.h b/include/linux/aer.h
>> index a65fe324fad2..f0c84db466e5 100644
>> --- a/include/linux/aer.h
>> +++ b/include/linux/aer.h
>> @@ -101,5 +101,6 @@ int cper_severity_to_aer(int cper_severity);
>> void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
>> int severity, struct aer_capability_regs *aer_regs);
>> void pci_aer_unmask_internal_errors(struct pci_dev *dev);
>> +void pci_aer_mask_internal_errors(struct pci_dev *dev);
>> #endif //_AER_H_
>>
Hi Terry,
> >> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> >> index d3068f5cc767..d1ef0c676ff8 100644
> >> --- a/drivers/pci/pcie/aer.c
> >> +++ b/drivers/pci/pcie/aer.c
> >> @@ -977,6 +977,31 @@ void pci_aer_unmask_internal_errors(struct pci_dev *dev)
> >> }
> >> EXPORT_SYMBOL_NS_GPL(pci_aer_unmask_internal_errors, "CXL");
> >>
> >> +/**
> >> + * pci_aer_mask_internal_errors - mask internal errors
> >> + * @dev: pointer to the pcie_dev data structure
> >> + *
> >> + * Masks internal errors in the Uncorrectable and Correctable Error
> >> + * Mask registers.
> >> + *
> >> + * Note: AER must be enabled and supported by the device which must be
> >> + * checked in advance, e.g. with pcie_aer_is_native().
> >> + */
> >> +void pci_aer_mask_internal_errors(struct pci_dev *dev)
> >> +{
> >> + int aer = dev->aer_cap;
> >> + u32 mask;
> >> +
> >> + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask);
> >> + mask |= PCI_ERR_UNC_INTN;
> >> + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask);
> >> +
> > It does an extra clear we don't need, but....
> > pci_clear_and_set_config_dword(dev, aer + PCI_ERR_UNCOR_MASK,
> > 0, PCI_ERR_UNC_INTN);
> >
> > is at very least shorter than the above 3 lines.
> Doing so will overwrite the existing mask. CXL normally only uses AER UIE/CIE, but if the device
> happens to lose alternate training and no longer identifies as a CXL device, then this mask
> value would be critical for reporting PCI AER errors and would need UCE/CE (other than
> UIE/CIE) enabled.
I'm not seeing that. Implementation of pci_clear_and_set_config_dword() is:
void pci_clear_and_set_config_dword(const struct pci_dev *dev, int pos,
				    u32 clear, u32 set)
{
	u32 val;

	pci_read_config_dword(dev, pos, &val);
	val &= ~clear;
	val |= set;
	pci_write_config_dword(dev, pos, val);
}
With the clear parameter as zero it will do the same as the open coded
version you have above, as ~clear will be all 1s and hence the
&= ~clear has no effect.
Arguably we could add pci_clear_config_dword() and pci_set_config_dword()
that both take one fewer parameter but I guess that is not worth
the bother.
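If anyone ever did want them, they'd just be thin wrappers along these lines
(hypothetical helpers, not in the tree today):

static inline void pci_set_config_dword(const struct pci_dev *dev, int pos,
					u32 set)
{
	pci_clear_and_set_config_dword(dev, pos, 0, set);
}

static inline void pci_clear_config_dword(const struct pci_dev *dev, int pos,
					  u32 clear)
{
	pci_clear_and_set_config_dword(dev, pos, clear, 0);
}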
Jonathan
On 4/17/2025 5:13 AM, Jonathan Cameron wrote:
> Hi Terry,
>
>>>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>>>> index d3068f5cc767..d1ef0c676ff8 100644
>>>> --- a/drivers/pci/pcie/aer.c
>>>> +++ b/drivers/pci/pcie/aer.c
>>>> @@ -977,6 +977,31 @@ void pci_aer_unmask_internal_errors(struct pci_dev *dev)
>>>> }
>>>> EXPORT_SYMBOL_NS_GPL(pci_aer_unmask_internal_errors, "CXL");
>>>>
>>>> +/**
>>>> + * pci_aer_mask_internal_errors - mask internal errors
>>>> + * @dev: pointer to the pcie_dev data structure
>>>> + *
>>>> + * Masks internal errors in the Uncorrectable and Correctable Error
>>>> + * Mask registers.
>>>> + *
>>>> + * Note: AER must be enabled and supported by the device which must be
>>>> + * checked in advance, e.g. with pcie_aer_is_native().
>>>> + */
>>>> +void pci_aer_mask_internal_errors(struct pci_dev *dev)
>>>> +{
>>>> + int aer = dev->aer_cap;
>>>> + u32 mask;
>>>> +
>>>> + pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask);
>>>> + mask |= PCI_ERR_UNC_INTN;
>>>> + pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, mask);
>>>> +
>>> It does an extra clear we don't need, but....
>>> pci_clear_and_set_config_dword(dev, aer + PCI_ERR_UNCOR_MASK,
>>> 0, PCI_ERR_UNC_INTN);
>>>
>>> is at very least shorter than the above 3 lines.
>> Doing so will overwrite the existing mask. CXL normally only uses AER UIE/CIE, but if the device
>> happens to lose alternate training and no longer identifies as a CXL device, then this mask
>> value would be critical for reporting PCI AER errors and would need UCE/CE (other than
>> UIE/CIE) enabled.
> I'm not seeing that. Implementation of pci_clear_and_set_config_dword() is:
> void pci_clear_and_set_config_dword(const struct pci_dev *dev, int pos,
> 				    u32 clear, u32 set)
> {
> 	u32 val;
>
> 	pci_read_config_dword(dev, pos, &val);
> 	val &= ~clear;
> 	val |= set;
> 	pci_write_config_dword(dev, pos, val);
> }
>
> With the clear parameter as zero it will do the same as the open coded
> version you have above, as ~clear will be all 1s and hence the
> &= ~clear has no effect.
>
> Arguably we could add pci_clear_config_dword() and pci_set_config_dword()
> that both take one fewer parameter but I guess that is not worth
> the bother.
>
> Jonathan
>
Got it. I'll change to use pci_clear_and_set_config_dword().
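The masking helper then collapses to something like (untested sketch):

void pci_aer_mask_internal_errors(struct pci_dev *dev)
{
	int aer = dev->aer_cap;

	/* Set only the internal error mask bits, preserving the rest of the mask */
	pci_clear_and_set_config_dword(dev, aer + PCI_ERR_UNCOR_MASK,
				       0, PCI_ERR_UNC_INTN);
	pci_clear_and_set_config_dword(dev, aer + PCI_ERR_COR_MASK,
				       0, PCI_ERR_COR_INTERNAL);
}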
-Terry
>
Hi Terry,
kernel test robot noticed the following build warnings:
[auto build test WARNING on aae0594a7053c60b82621136257c8b648c67b512]
url: https://github.com/intel-lab-lkp/linux/commits/Terry-Bowman/PCI-CXL-Introduce-PCIe-helper-function-pcie_is_cxl/20250327-095738
base: aae0594a7053c60b82621136257c8b648c67b512
patch link: https://lore.kernel.org/r/20250327014717.2988633-17-terry.bowman%40amd.com
patch subject: [PATCH v8 16/16] CXL/PCI: Disable CXL protocol errors during CXL Port cleanup
config: csky-randconfig-r122-20250327 (https://download.01.org/0day-ci/archive/20250328/202503280816.M7DZmSDT-lkp@intel.com/config)
compiler: csky-linux-gcc (GCC) 12.4.0
reproduce: (https://download.01.org/0day-ci/archive/20250328/202503280816.M7DZmSDT-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503280816.M7DZmSDT-lkp@intel.com/
sparse warnings: (new ones prefixed by >>)
drivers/cxl/port.c:68:33: sparse: sparse: symbol 'cxl_ep_error_handlers' was not declared. Should it be static?
>> drivers/cxl/port.c:104:6: sparse: sparse: symbol 'cxl_disable_prot_errors' was not declared. Should it be static?
vim +/cxl_disable_prot_errors +104 drivers/cxl/port.c
    67	
  > 68	const struct cxl_error_handlers cxl_ep_error_handlers = {
    69		.error_detected = cxl_error_detected,
    70		.cor_error_detected = cxl_cor_error_detected,
    71	};
    72	
    73	static void cxl_assign_error_handlers(struct device *_dev,
    74					      const struct cxl_error_handlers *handlers)
    75	{
    76		struct device *dev __free(put_device) = get_device(_dev);
    77		struct cxl_driver *pdrv;
    78	
    79		if (!dev)
    80			return;
    81	
    82		pdrv = to_cxl_drv(dev->driver);
    83		pdrv->err_handler = handlers;
    84	}
    85	
    86	void cxl_enable_prot_errors(struct device *dev)
    87	{
    88		struct pci_dev *pdev = to_pci_dev(dev);
    89		struct device *pci_dev __free(put_device) = get_device(&pdev->dev);
    90	
    91		if (!pci_dev)
    92			return;
    93	
    94		if (!pdev->aer_cap) {
    95			pdev->aer_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR);
    96			if (!pdev->aer_cap)
    97				return;
    98		}
    99	
   100		pci_aer_unmask_internal_errors(pdev);
   101	}
   102	EXPORT_SYMBOL_NS_GPL(cxl_enable_prot_errors, "CXL");
   103	
 > 104	void cxl_disable_prot_errors(void *_dev)
   105	{
   106		struct device *dev = _dev;
   107		struct pci_dev *pdev = to_pci_dev(dev);
   108		struct device *pci_dev __free(put_device) = get_device(&pdev->dev);
   109	
   110		if (!pci_dev || !pdev->aer_cap)
   111			return;
   112	
   113		pci_aer_mask_internal_errors(pdev);
   114	}
   115	EXPORT_SYMBOL_NS_GPL(cxl_disable_prot_errors, "CXL");
   116	
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki