Introduce CXL error handlers for CXL Port devices.
Add functions cxl_port_cor_error_detected() and cxl_port_error_detected().
These will serve as the handlers for all CXL Port devices. Introduce
cxl_get_ras_base() to provide the RAS base address needed by the handlers.
Update cxl_handle_prot_error() to call the CXL Port or CXL Endpoint handler
depending on which CXL device reports the error.
Implement pci_get_ras_base() to return the cached RAS register address of a
CXL Root Port, CXL Downstream Port, or CXL Upstream Port.
Update the AER driver's is_cxl_error() to remove the filter PCI type check
because CXL Port devices are now supported.
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
drivers/cxl/core/core.h | 2 +
drivers/cxl/core/pci.c | 61 ++++++++++++++++++++++++++
drivers/cxl/core/port.c | 4 +-
drivers/cxl/core/ras.c | 96 ++++++++++++++++++++++++++++++++++++-----
drivers/cxl/cxl.h | 5 +++
drivers/pci/pcie/aer.c | 5 ---
6 files changed, 155 insertions(+), 18 deletions(-)
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index c73f39d14dd7..23d15eef01d2 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -122,6 +122,8 @@ void cxl_ras_exit(void);
int cxl_gpf_port_setup(struct cxl_dport *dport);
int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res,
int nid, resource_size_t *size);
+struct cxl_port *find_cxl_port(struct device *dport_dev,
+ struct cxl_dport **dport);
#ifdef CONFIG_CXL_FEATURES
size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid,
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index e094ef518e0a..b6836825e8df 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -753,6 +753,67 @@ static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
#ifdef CONFIG_PCIEAER_CXL
+static void __iomem *cxl_get_ras_base(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ void __iomem *ras_base;
+
+ switch (pci_pcie_type(pdev)) {
+ case PCI_EXP_TYPE_ROOT_PORT:
+ case PCI_EXP_TYPE_DOWNSTREAM:
+ {
+ struct cxl_dport *dport = NULL;
+ struct cxl_port *port __free(put_cxl_port) = find_cxl_port(&pdev->dev, &dport);
+
+ if (!dport || !dport->dport_dev) {
+ pci_err(pdev, "Failed to find the CXL device");
+ return NULL;
+ }
+
+ ras_base = dport ? dport->regs.ras : NULL;
+ break;
+ }
+ case PCI_EXP_TYPE_UPSTREAM:
+ {
+ struct cxl_port *port;
+ struct device *dev __free(put_device) = bus_find_device(&cxl_bus_type, NULL,
+ &pdev->dev, match_uport);
+
+ if (!dev || !is_cxl_port(dev)) {
+ pci_err(pdev, "Failed to find the CXL device");
+ return NULL;
+ }
+
+ port = to_cxl_port(dev);
+ ras_base = port ? port->uport_regs.ras : NULL;
+ break;
+ }
+ default:
+ {
+ pci_warn_once(pdev, "Error: Unsupported device type (%X)", pci_pcie_type(pdev));
+ return NULL;
+ }
+ }
+
+ return ras_base;
+}
+
+void cxl_port_cor_error_detected(struct device *dev)
+{
+ void __iomem *ras_base = cxl_get_ras_base(dev);
+
+ __cxl_handle_cor_ras(dev, 0, ras_base);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_cor_error_detected, "CXL");
+
+pci_ers_result_t cxl_port_error_detected(struct device *dev)
+{
+ void __iomem *ras_base = cxl_get_ras_base(dev);
+
+ return __cxl_handle_ras(dev, 0, ras_base);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_error_detected, "CXL");
+
static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds,
struct cxl_dport *dport)
{
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index eb46c6764d20..07b9bb0f601f 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -1341,8 +1341,8 @@ static struct cxl_port *__find_cxl_port(struct cxl_find_port_ctx *ctx)
return NULL;
}
-static struct cxl_port *find_cxl_port(struct device *dport_dev,
- struct cxl_dport **dport)
+struct cxl_port *find_cxl_port(struct device *dport_dev,
+ struct cxl_dport **dport)
{
struct cxl_find_port_ctx ctx = {
.dport_dev = dport_dev,
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 664f532cc838..6093e70ece37 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -140,20 +140,85 @@ static pci_ers_result_t cxl_merge_result(enum pci_ers_result orig,
return orig;
}
-static int cxl_report_error_detected(struct pci_dev *pdev, void *data)
+int match_uport(struct device *dev, const void *data)
{
- pci_ers_result_t vote, *result = data;
- struct cxl_dev_state *cxlds;
+ const struct device *uport_dev = data;
+ struct cxl_port *port;
- if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
- (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_END))
+ if (!is_cxl_port(dev))
return 0;
- cxlds = pci_get_drvdata(pdev);
- struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
+ port = to_cxl_port(dev);
+
+ return port->uport_dev == uport_dev;
+}
+EXPORT_SYMBOL_NS_GPL(match_uport, "CXL");
+
+/* Return 'struct device*' responsible for freeing pdev's CXL resources */
+static struct device *get_pci_cxl_host_dev(struct pci_dev *pdev)
+{
+ struct device *host_dev;
+
+ switch (pci_pcie_type(pdev)) {
+ case PCI_EXP_TYPE_ROOT_PORT:
+ case PCI_EXP_TYPE_DOWNSTREAM:
+ {
+ struct cxl_dport *dport = NULL;
+ struct cxl_port *port = find_cxl_port(&pdev->dev, &dport);
+
+ if (!dport || !dport->dport_dev)
+ return NULL;
+
+ host_dev = &port->dev;
+ break;
+ }
+ case PCI_EXP_TYPE_UPSTREAM:
+ {
+ struct cxl_port *port;
+ struct device *cxl_dev = bus_find_device(&cxl_bus_type, NULL,
+ &pdev->dev, match_uport);
+
+ if (!cxl_dev || !is_cxl_port(cxl_dev))
+ return NULL;
+
+ port = to_cxl_port(cxl_dev);
+ host_dev = &port->dev;
+ break;
+ }
+ case PCI_EXP_TYPE_ENDPOINT:
+ case PCI_EXP_TYPE_RC_END:
+ {
+ struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+
+ if (!cxlds)
+ return NULL;
+
+ host_dev = get_device(&cxlds->cxlmd->dev);
+ break;
+ }
+ default:
+ {
+ pci_warn_once(pdev, "Error: Unsupported device type (%X)", pci_pcie_type(pdev));
+ return NULL;
+ }
+ }
+
+ return host_dev;
+}
+
+static int cxl_report_error_detected(struct pci_dev *pdev, void *data)
+{
+ struct device *dev = &pdev->dev;
+ struct device *host_dev __free(put_device) = get_pci_cxl_host_dev(pdev);
+ pci_ers_result_t vote, *result = data;
device_lock(&pdev->dev);
- vote = cxl_error_detected(&pdev->dev);
+ if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT) ||
+ (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END)) {
+ vote = cxl_error_detected(dev);
+ } else {
+ vote = cxl_port_error_detected(dev);
+ }
*result = cxl_merge_result(*result, vote);
device_unlock(&pdev->dev);
@@ -244,14 +309,18 @@ static struct pci_dev *sbdf_to_pci(struct cxl_prot_error_info *err_info)
static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
{
struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(sbdf_to_pci(err_info));
- struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
- struct device *cxlmd_dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
if (!pdev) {
pr_err("Failed to find the CXL device\n");
return;
}
+ struct device *host_dev __free(put_device) = get_pci_cxl_host_dev(pdev);
+ if (!host_dev) {
+ pr_err("Failed to find the CXL device's host\n");
+ return;
+ }
+
/*
* Internal errors of an RCEC indicate an AER error in an
* RCH's downstream port. Check and handle them in the CXL.mem
@@ -261,6 +330,7 @@ static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
return pcie_walk_rcec(pdev, cxl_rch_handle_error_iter, err_info);
if (err_info->severity == AER_CORRECTABLE) {
+ struct device *dev = &pdev->dev;
int aer = pdev->aer_cap;
if (aer)
@@ -268,7 +338,11 @@ static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
aer + PCI_ERR_COR_STATUS,
0, PCI_ERR_COR_INTERNAL);
- cxl_cor_error_detected(&pdev->dev);
+ if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT) ||
+ (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END))
+ cxl_cor_error_detected(dev);
+ else
+ cxl_port_cor_error_detected(dev);
pcie_clear_device_status(pdev);
} else {
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 6fd9a42eb304..2c1c00466a25 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -801,6 +801,9 @@ int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint)
void cxl_cor_error_detected(struct device *dev);
pci_ers_result_t cxl_error_detected(struct device *dev);
+void cxl_port_cor_error_detected(struct device *dev);
+pci_ers_result_t cxl_port_error_detected(struct device *dev);
+
/**
* struct cxl_endpoint_dvsec_info - Cached DVSEC info
* @mem_enabled: cached value of mem_enabled in the DVSEC at init time
@@ -915,6 +918,8 @@ void cxl_coordinates_combine(struct access_coordinate *out,
bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
+int match_uport(struct device *dev, const void *data);
+
/*
* Unit test builds overrides this to __weak, find the 'strong' version
* of these symbols in tools/testing/cxl/.
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 6e88331c6303..5efe5a718960 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1018,11 +1018,6 @@ static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info)
if (!info || !info->is_cxl)
return false;
- /* Only CXL Endpoints are currently supported */
- if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
- (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_EC))
- return false;
-
return is_internal_error(info);
}
--
2.34.1
On Tue, 3 Jun 2025 12:22:36 -0500
Terry Bowman <terry.bowman@amd.com> wrote:
> Introduce CXL error handlers for CXL Port devices.
>
> Add functions cxl_port_cor_error_detected() and cxl_port_error_detected().
> These will serve as the handlers for all CXL Port devices. Introduce
> cxl_get_ras_base() to provide the RAS base address needed by the handlers.
>
> Update cxl_handle_prot_error() to call the CXL Port or CXL Endpoint handler
> depending on which CXL device reports the error.
>
> Implement pci_get_ras_base() to return the cached RAS register address of a
> CXL Root Port, CXL Downstream Port, or CXL Upstream Port.
>
> Update the AER driver's is_cxl_error() to remove the filter PCI type check
> because CXL Port devices are now supported.
>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
A few minor things on a fresh read.
> size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid,
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index e094ef518e0a..b6836825e8df 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -753,6 +753,67 @@ static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
>
> #ifdef CONFIG_PCIEAER_CXL
>
> +static void __iomem *cxl_get_ras_base(struct device *dev)
> +{
> + struct pci_dev *pdev = to_pci_dev(dev);
> + void __iomem *ras_base;
> +
> + switch (pci_pcie_type(pdev)) {
> + case PCI_EXP_TYPE_ROOT_PORT:
> + case PCI_EXP_TYPE_DOWNSTREAM:
> + {
> + struct cxl_dport *dport = NULL;
> + struct cxl_port *port __free(put_cxl_port) = find_cxl_port(&pdev->dev, &dport);
> +
> + if (!dport || !dport->dport_dev) {
> + pci_err(pdev, "Failed to find the CXL device");
> + return NULL;
> + }
> +
> + ras_base = dport ? dport->regs.ras : NULL;
As below - perhaps a sanity check for error and early return makes sense here.
> + break;
> + }
> + case PCI_EXP_TYPE_UPSTREAM:
> + {
> + struct cxl_port *port;
> + struct device *dev __free(put_device) = bus_find_device(&cxl_bus_type, NULL,
> + &pdev->dev, match_uport);
> +
> + if (!dev || !is_cxl_port(dev)) {
> + pci_err(pdev, "Failed to find the CXL device");
> + return NULL;
> + }
> +
> + port = to_cxl_port(dev);
> + ras_base = port ? port->uport_regs.ras : NULL;
I'd be tempted to return here to keep the flows simple. Maybe avoiding the ternary
if (!port)
return NULL;
return port->uport_regs.ras;
> + break;
> + }
> + default:
> + {
> + pci_warn_once(pdev, "Error: Unsupported device type (%X)", pci_pcie_type(pdev));
> + return NULL;
Better not to introduce scope {} when not needed.
> + }
> + }
> +
> + return ras_base;
> +}
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 664f532cc838..6093e70ece37 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> +
> +/* Return 'struct device*' responsible for freeing pdev's CXL resources */
> +static struct device *get_pci_cxl_host_dev(struct pci_dev *pdev)
> +{
> + struct device *host_dev;
> +
> + switch (pci_pcie_type(pdev)) {
> + case PCI_EXP_TYPE_ROOT_PORT:
> + case PCI_EXP_TYPE_DOWNSTREAM:
> + {
> + struct cxl_dport *dport = NULL;
> + struct cxl_port *port = find_cxl_port(&pdev->dev, &dport);
> +
> + if (!dport || !dport->dport_dev)
What does !dport->dprot_dev mean? I.e. how does that happen?
I can only find places where we set it just after allocating a dport.
Perhaps a comment?
> + return NULL;
> +
> + host_dev = &port->dev;
> + break;
> + }
> + case PCI_EXP_TYPE_UPSTREAM:
> + {
> + struct cxl_port *port;
> + struct device *cxl_dev = bus_find_device(&cxl_bus_type, NULL,
> + &pdev->dev, match_uport);
Doesn't his leave you holding a reference to a device different form
the one you return? Hence wrong one gets put in caller.
> +
> + if (!cxl_dev || !is_cxl_port(cxl_dev))
> + return NULL;
> +
> + port = to_cxl_port(cxl_dev);
> + host_dev = &port->dev;
Actually no. Isn't this a circle that lands you on cxl_dev again?
container_of(dev, struct cxl_port, dev)->dev
> + break;
> + }
> + case PCI_EXP_TYPE_ENDPOINT:
> + case PCI_EXP_TYPE_RC_END:
> + {
> + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +
> + if (!cxlds)
> + return NULL;
> +
> + host_dev = get_device(&cxlds->cxlmd->dev);
> + break;
Maybe just return it here? Similar for other cases.
Saves a reader keeping track of references if we get them roughly where
we return them.
> + }
> + default:
> + {
No need for scope on this one (or at least some of the others) so drop the {}
> + pci_warn_once(pdev, "Error: Unsupported device type (%X)", pci_pcie_type(pdev));
> + return NULL;
> + }
> + }
> +
> + return host_dev;
> +}
> +
> +static int cxl_report_error_detected(struct pci_dev *pdev, void *data)
> +{
> + struct device *dev = &pdev->dev;
> + struct device *host_dev __free(put_device) = get_pci_cxl_host_dev(pdev);
> + pci_ers_result_t vote, *result = data;
>
> device_lock(&pdev->dev);
> - vote = cxl_error_detected(&pdev->dev);
> + if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT) ||
> + (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END)) {
> + vote = cxl_error_detected(dev);
> + } else {
> + vote = cxl_port_error_detected(dev);
> + }
> *result = cxl_merge_result(*result, vote);
> device_unlock(&pdev->dev);
>
> @@ -244,14 +309,18 @@ static struct pci_dev *sbdf_to_pci(struct cxl_prot_error_info *err_info)
> static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
> {
> struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(sbdf_to_pci(err_info));
> - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> - struct device *cxlmd_dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
>
> if (!pdev) {
> pr_err("Failed to find the CXL device\n");
> return;
> }
>
> + struct device *host_dev __free(put_device) = get_pci_cxl_host_dev(pdev);
> + if (!host_dev) {
> + pr_err("Failed to find the CXL device's host\n");
> + return;
> + }
> +
On 6/12/2025 12:14 PM, Jonathan Cameron wrote:
> On Tue, 3 Jun 2025 12:22:36 -0500
> Terry Bowman <terry.bowman@amd.com> wrote:
>
>> Introduce CXL error handlers for CXL Port devices.
>>
>> Add functions cxl_port_cor_error_detected() and cxl_port_error_detected().
>> These will serve as the handlers for all CXL Port devices. Introduce
>> cxl_get_ras_base() to provide the RAS base address needed by the handlers.
>>
>> Update cxl_handle_prot_error() to call the CXL Port or CXL Endpoint handler
>> depending on which CXL device reports the error.
>>
>> Implement pci_get_ras_base() to return the cached RAS register address of a
>> CXL Root Port, CXL Downstream Port, or CXL Upstream Port.
>>
>> Update the AER driver's is_cxl_error() to remove the filter PCI type check
>> because CXL Port devices are now supported.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>
> A few minor things on a fresh read.
>
>> size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid,
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index e094ef518e0a..b6836825e8df 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -753,6 +753,67 @@ static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
>>
>> #ifdef CONFIG_PCIEAER_CXL
>>
>> +static void __iomem *cxl_get_ras_base(struct device *dev)
>> +{
>> + struct pci_dev *pdev = to_pci_dev(dev);
>> + void __iomem *ras_base;
>> +
>> + switch (pci_pcie_type(pdev)) {
>> + case PCI_EXP_TYPE_ROOT_PORT:
>> + case PCI_EXP_TYPE_DOWNSTREAM:
>> + {
>> + struct cxl_dport *dport = NULL;
>> + struct cxl_port *port __free(put_cxl_port) = find_cxl_port(&pdev->dev, &dport);
>> +
>> + if (!dport || !dport->dport_dev) {
>> + pci_err(pdev, "Failed to find the CXL device");
>> + return NULL;
>> + }
>> +
>> + ras_base = dport ? dport->regs.ras : NULL;
> As below - perhaps a sanity check for error and early return makes sense here.
>
Good idea.
>> + break;
>> + }
>> + case PCI_EXP_TYPE_UPSTREAM:
>> + {
>> + struct cxl_port *port;
>> + struct device *dev __free(put_device) = bus_find_device(&cxl_bus_type, NULL,
>> + &pdev->dev, match_uport);
>> +
>> + if (!dev || !is_cxl_port(dev)) {
>> + pci_err(pdev, "Failed to find the CXL device");
>> + return NULL;
>> + }
>> +
>> + port = to_cxl_port(dev);
>> + ras_base = port ? port->uport_regs.ras : NULL;
>
> I'd be tempted to return here to keep the flows simple. Maybe avoiding the ternary
> if (!port)
> return NULL;
>
> return port->uport_regs.ras;
>
>
Ok.
>> + break;
>> + }
>> + default:
>> + {
>> + pci_warn_once(pdev, "Error: Unsupported device type (%X)", pci_pcie_type(pdev));
>> + return NULL;
>
> Better not to introduce scope {} when not needed.
>
Ok.
>> + }
>> + }
>> +
>> + return ras_base;
>> +}
>
>> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
>> index 664f532cc838..6093e70ece37 100644
>> --- a/drivers/cxl/core/ras.c
>> +++ b/drivers/cxl/core/ras.c
>
>> +
>> +/* Return 'struct device*' responsible for freeing pdev's CXL resources */
>> +static struct device *get_pci_cxl_host_dev(struct pci_dev *pdev)
>> +{
>> + struct device *host_dev;
>> +
>> + switch (pci_pcie_type(pdev)) {
>> + case PCI_EXP_TYPE_ROOT_PORT:
>> + case PCI_EXP_TYPE_DOWNSTREAM:
>> + {
>> + struct cxl_dport *dport = NULL;
>> + struct cxl_port *port = find_cxl_port(&pdev->dev, &dport);
>> +
>> + if (!dport || !dport->dport_dev)
>
> What does !dport->dprot_dev mean? I.e. how does that happen?
> I can only find places where we set it just after allocating a dport.
> Perhaps a comment?
>
>
I'll remove the check for !dport->dport_dev.
>
>> + return NULL;
>> +
>> + host_dev = &port->dev;
>> + break;
>> + }
>> + case PCI_EXP_TYPE_UPSTREAM:
>> + {
>> + struct cxl_port *port;
>> + struct device *cxl_dev = bus_find_device(&cxl_bus_type, NULL,
>> + &pdev->dev, match_uport);
>
> Doesn't his leave you holding a reference to a device different form
> the one you return? Hence wrong one gets put in caller.
>
>> +
>> + if (!cxl_dev || !is_cxl_port(cxl_dev))
>> + return NULL;
>> +
>> + port = to_cxl_port(cxl_dev);
>> + host_dev = &port->dev;
> Actually no. Isn't this a circle that lands you on cxl_dev again?
>
> container_of(dev, struct cxl_port, dev)->dev
>
You're right, it is circular. I'll simplify it. Thanks.
>> + break;
>> + }
>> + case PCI_EXP_TYPE_ENDPOINT:
>> + case PCI_EXP_TYPE_RC_END:
>> + {
>> + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>> +
>> + if (!cxlds)
>> + return NULL;
>> +
>> + host_dev = get_device(&cxlds->cxlmd->dev);
>> + break;
>
> Maybe just return it here? Similar for other cases.
> Saves a reader keeping track of references if we get them roughly where
> we return them.
>
Ok.
>> + }
>> + default:
>> + {
> No need for scope on this one (or at least some of the others) so drop the {}
>
Ok.
-Terry
>> + pci_warn_once(pdev, "Error: Unsupported device type (%X)", pci_pcie_type(pdev));
>> + return NULL;
>> + }
>> + }
>> +
>> + return host_dev;
>> +}
>> +
>> +static int cxl_report_error_detected(struct pci_dev *pdev, void *data)
>> +{
>> + struct device *dev = &pdev->dev;
>> + struct device *host_dev __free(put_device) = get_pci_cxl_host_dev(pdev);
>> + pci_ers_result_t vote, *result = data;
>>
>> device_lock(&pdev->dev);
>> - vote = cxl_error_detected(&pdev->dev);
>> + if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT) ||
>> + (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END)) {
>> + vote = cxl_error_detected(dev);
>> + } else {
>> + vote = cxl_port_error_detected(dev);
>> + }
>> *result = cxl_merge_result(*result, vote);
>> device_unlock(&pdev->dev);
>>
>> @@ -244,14 +309,18 @@ static struct pci_dev *sbdf_to_pci(struct cxl_prot_error_info *err_info)
>> static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
>> {
>> struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(sbdf_to_pci(err_info));
>> - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
>> - struct device *cxlmd_dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
>>
>> if (!pdev) {
>> pr_err("Failed to find the CXL device\n");
>> return;
>> }
>>
>> + struct device *host_dev __free(put_device) = get_pci_cxl_host_dev(pdev);
>> + if (!host_dev) {
>> + pr_err("Failed to find the CXL device's host\n");
>> + return;
>> + }
>> +
>
On 6/3/25 10:22 AM, Terry Bowman wrote:
> Introduce CXL error handlers for CXL Port devices.
>
> Add functions cxl_port_cor_error_detected() and cxl_port_error_detected().
> These will serve as the handlers for all CXL Port devices. Introduce
> cxl_get_ras_base() to provide the RAS base address needed by the handlers.
>
> Update cxl_handle_prot_error() to call the CXL Port or CXL Endpoint handler
> depending on which CXL device reports the error.
>
> Implement pci_get_ras_base() to return the cached RAS register address of a
> CXL Root Port, CXL Downstream Port, or CXL Upstream Port.
>
> Update the AER driver's is_cxl_error() to remove the filter PCI type check
> because CXL Port devices are now supported.
>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> ---
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> drivers/cxl/core/core.h | 2 +
> drivers/cxl/core/pci.c | 61 ++++++++++++++++++++++++++
> drivers/cxl/core/port.c | 4 +-
> drivers/cxl/core/ras.c | 96 ++++++++++++++++++++++++++++++++++++-----
> drivers/cxl/cxl.h | 5 +++
> drivers/pci/pcie/aer.c | 5 ---
> 6 files changed, 155 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index c73f39d14dd7..23d15eef01d2 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -122,6 +122,8 @@ void cxl_ras_exit(void);
> int cxl_gpf_port_setup(struct cxl_dport *dport);
> int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res,
> int nid, resource_size_t *size);
> +struct cxl_port *find_cxl_port(struct device *dport_dev,
> + struct cxl_dport **dport);
>
> #ifdef CONFIG_CXL_FEATURES
> size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid,
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index e094ef518e0a..b6836825e8df 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -753,6 +753,67 @@ static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
>
> #ifdef CONFIG_PCIEAER_CXL
>
> +static void __iomem *cxl_get_ras_base(struct device *dev)
> +{
> + struct pci_dev *pdev = to_pci_dev(dev);
> + void __iomem *ras_base;
> +
> + switch (pci_pcie_type(pdev)) {
> + case PCI_EXP_TYPE_ROOT_PORT:
> + case PCI_EXP_TYPE_DOWNSTREAM:
> + {
> + struct cxl_dport *dport = NULL;
> + struct cxl_port *port __free(put_cxl_port) = find_cxl_port(&pdev->dev, &dport);
> +
> + if (!dport || !dport->dport_dev) {
> + pci_err(pdev, "Failed to find the CXL device");
> + return NULL;
> + }
> +
> + ras_base = dport ? dport->regs.ras : NULL;
> + break;
> + }
> + case PCI_EXP_TYPE_UPSTREAM:
> + {
> + struct cxl_port *port;
> + struct device *dev __free(put_device) = bus_find_device(&cxl_bus_type, NULL,
> + &pdev->dev, match_uport);
> +
> + if (!dev || !is_cxl_port(dev)) {
> + pci_err(pdev, "Failed to find the CXL device");
> + return NULL;
> + }
> +
> + port = to_cxl_port(dev);
> + ras_base = port ? port->uport_regs.ras : NULL;
> + break;
> + }
> + default:
> + {
> + pci_warn_once(pdev, "Error: Unsupported device type (%X)", pci_pcie_type(pdev));
> + return NULL;
> + }
> + }
> +
> + return ras_base;
> +}
> +
> +void cxl_port_cor_error_detected(struct device *dev)
> +{
> + void __iomem *ras_base = cxl_get_ras_base(dev);
> +
> + __cxl_handle_cor_ras(dev, 0, ras_base);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_port_cor_error_detected, "CXL");
> +
> +pci_ers_result_t cxl_port_error_detected(struct device *dev)
> +{
> + void __iomem *ras_base = cxl_get_ras_base(dev);
> +
> + return __cxl_handle_ras(dev, 0, ras_base);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_port_error_detected, "CXL");
> +
> static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds,
> struct cxl_dport *dport)
> {
> diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
> index eb46c6764d20..07b9bb0f601f 100644
> --- a/drivers/cxl/core/port.c
> +++ b/drivers/cxl/core/port.c
> @@ -1341,8 +1341,8 @@ static struct cxl_port *__find_cxl_port(struct cxl_find_port_ctx *ctx)
> return NULL;
> }
>
> -static struct cxl_port *find_cxl_port(struct device *dport_dev,
> - struct cxl_dport **dport)
> +struct cxl_port *find_cxl_port(struct device *dport_dev,
> + struct cxl_dport **dport)
> {
> struct cxl_find_port_ctx ctx = {
> .dport_dev = dport_dev,
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 664f532cc838..6093e70ece37 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -140,20 +140,85 @@ static pci_ers_result_t cxl_merge_result(enum pci_ers_result orig,
> return orig;
> }
>
> -static int cxl_report_error_detected(struct pci_dev *pdev, void *data)
> +int match_uport(struct device *dev, const void *data)
> {
> - pci_ers_result_t vote, *result = data;
> - struct cxl_dev_state *cxlds;
> + const struct device *uport_dev = data;
> + struct cxl_port *port;
>
> - if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
> - (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_END))
> + if (!is_cxl_port(dev))
> return 0;
>
> - cxlds = pci_get_drvdata(pdev);
> - struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
> + port = to_cxl_port(dev);
> +
> + return port->uport_dev == uport_dev;
> +}
> +EXPORT_SYMBOL_NS_GPL(match_uport, "CXL");
> +
> +/* Return 'struct device*' responsible for freeing pdev's CXL resources */
> +static struct device *get_pci_cxl_host_dev(struct pci_dev *pdev)
> +{
> + struct device *host_dev;
> +
> + switch (pci_pcie_type(pdev)) {
> + case PCI_EXP_TYPE_ROOT_PORT:
> + case PCI_EXP_TYPE_DOWNSTREAM:
> + {
> + struct cxl_dport *dport = NULL;
> + struct cxl_port *port = find_cxl_port(&pdev->dev, &dport);
> +
> + if (!dport || !dport->dport_dev)
> + return NULL;
> +
> + host_dev = &port->dev;
> + break;
> + }
> + case PCI_EXP_TYPE_UPSTREAM:
> + {
> + struct cxl_port *port;
> + struct device *cxl_dev = bus_find_device(&cxl_bus_type, NULL,
> + &pdev->dev, match_uport);
> +
> + if (!cxl_dev || !is_cxl_port(cxl_dev))
> + return NULL;
> +
> + port = to_cxl_port(cxl_dev);
> + host_dev = &port->dev;
> + break;
> + }
> + case PCI_EXP_TYPE_ENDPOINT:
> + case PCI_EXP_TYPE_RC_END:
> + {
> + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +
> + if (!cxlds)
> + return NULL;
> +
> + host_dev = get_device(&cxlds->cxlmd->dev);
> + break;
> + }
> + default:
> + {
> + pci_warn_once(pdev, "Error: Unsupported device type (%X)", pci_pcie_type(pdev));
> + return NULL;
> + }
> + }
> +
> + return host_dev;
> +}
> +
> +static int cxl_report_error_detected(struct pci_dev *pdev, void *data)
> +{
> + struct device *dev = &pdev->dev;
> + struct device *host_dev __free(put_device) = get_pci_cxl_host_dev(pdev);
> + pci_ers_result_t vote, *result = data;
>
> device_lock(&pdev->dev);
> - vote = cxl_error_detected(&pdev->dev);
> + if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT) ||
> + (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END)) {
> + vote = cxl_error_detected(dev);
> + } else {
> + vote = cxl_port_error_detected(dev);
> + }
> *result = cxl_merge_result(*result, vote);
> device_unlock(&pdev->dev);
>
> @@ -244,14 +309,18 @@ static struct pci_dev *sbdf_to_pci(struct cxl_prot_error_info *err_info)
> static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
> {
> struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(sbdf_to_pci(err_info));
> - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> - struct device *cxlmd_dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
>
> if (!pdev) {
> pr_err("Failed to find the CXL device\n");
> return;
> }
>
> + struct device *host_dev __free(put_device) = get_pci_cxl_host_dev(pdev);
> + if (!host_dev) {
> + pr_err("Failed to find the CXL device's host\n");
> + return;
> + }
> +
> /*
> * Internal errors of an RCEC indicate an AER error in an
> * RCH's downstream port. Check and handle them in the CXL.mem
> @@ -261,6 +330,7 @@ static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
> return pcie_walk_rcec(pdev, cxl_rch_handle_error_iter, err_info);
>
> if (err_info->severity == AER_CORRECTABLE) {
> + struct device *dev = &pdev->dev;
> int aer = pdev->aer_cap;
>
> if (aer)
> @@ -268,7 +338,11 @@ static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
> aer + PCI_ERR_COR_STATUS,
> 0, PCI_ERR_COR_INTERNAL);
>
> - cxl_cor_error_detected(&pdev->dev);
> + if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT) ||
> + (pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END))
> + cxl_cor_error_detected(dev);
> + else
> + cxl_port_cor_error_detected(dev);
>
> pcie_clear_device_status(pdev);
> } else {
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 6fd9a42eb304..2c1c00466a25 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -801,6 +801,9 @@ int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint)
> void cxl_cor_error_detected(struct device *dev);
> pci_ers_result_t cxl_error_detected(struct device *dev);
>
> +void cxl_port_cor_error_detected(struct device *dev);
> +pci_ers_result_t cxl_port_error_detected(struct device *dev);
> +
> /**
> * struct cxl_endpoint_dvsec_info - Cached DVSEC info
> * @mem_enabled: cached value of mem_enabled in the DVSEC at init time
> @@ -915,6 +918,8 @@ void cxl_coordinates_combine(struct access_coordinate *out,
>
> bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
>
> +int match_uport(struct device *dev, const void *data);
> +
> /*
> * Unit test builds overrides this to __weak, find the 'strong' version
> * of these symbols in tools/testing/cxl/.
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 6e88331c6303..5efe5a718960 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -1018,11 +1018,6 @@ static bool is_cxl_error(struct pci_dev *pdev, struct aer_err_info *info)
> if (!info || !info->is_cxl)
> return false;
>
> - /* Only CXL Endpoints are currently supported */
> - if ((pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) &&
> - (pci_pcie_type(pdev) != PCI_EXP_TYPE_RC_EC))
> - return false;
> -
> return is_internal_error(info);
> }
>
--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer
© 2016 - 2025 Red Hat, Inc.