CXL currently has separate trace routines for CXL Port errors and CXL
Endpoint errors. This is inconvenient for the user because they must enable
2 sets of trace routines. Make updates to the trace logging such that a
single trace routine logs both CXL Endpoint and CXL Port protocol errors.
Keep the trace log fields 'memdev' and 'host'. While these are not accurate
for non-Endpoints the fields will remain as-is to prevent breaking
userspace RAS trace consumers.
Add serial number parameter to the trace logging. This is used for EPs
and 0 is provided for CXL port devices without a serial number.
Leave the correctable and uncorrectable trace routines' TP_STRUCT__entry()
unchanged with respect to member data types and order.
Below is output of correctable and uncorrectable protocol error logging.
CXL Root Port and CXL Endpoint examples are included below.
Root Port:
cxl_aer_correctable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status='CRC Threshold Hit'
cxl_aer_uncorrectable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
Endpoint:
cxl_aer_correctable_error: memdev=mem3 host=0000:0f:00.0 serial=0 status='CRC Threshold Hit'
cxl_aer_uncorrectable_error: memdev=mem3 host=0000:0f:00.0 serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Shiju Jose <shiju.jose@huawei.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
---
Changes in v12->v13:
- Added Dave Jiang's review-by
Changes in v11 -> v12:
- Correct parameters to call trace_cxl_aer_correctable_error()
- Add reviewed-by for Jonathan and Shiju
Changes in v10->v11:
- Updated CE and UCE trace routines to maintain consistent TP_Struct ABI
and unchanged TP_printk() logging.
---
drivers/cxl/core/core.h | 4 +--
drivers/cxl/core/ras.c | 26 ++++++++-------
drivers/cxl/core/ras_rch.c | 4 +--
drivers/cxl/core/trace.h | 68 ++++++--------------------------------
4 files changed, 29 insertions(+), 73 deletions(-)
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 1a419b35fa59..e47ae7365ce0 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -149,8 +149,8 @@ int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
#ifdef CONFIG_CXL_RAS
int cxl_ras_init(void);
void cxl_ras_exit(void);
-bool cxl_handle_ras(struct device *dev, void __iomem *ras_base);
-void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base);
+bool cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base);
+void cxl_handle_cor_ras(struct device *dev, u64 serial, void __iomem *ras_base);
#else
static inline int cxl_ras_init(void)
{
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 0320c391f201..599c88f0b376 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -13,7 +13,7 @@ static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
{
u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
- trace_cxl_port_aer_correctable_error(&pdev->dev, status);
+ trace_cxl_aer_correctable_error(&pdev->dev, status, 0);
}
static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
@@ -28,8 +28,8 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
else
fe = status;
- trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
- ras_cap.header_log);
+ trace_cxl_aer_uncorrectable_error(&pdev->dev, status, fe,
+ ras_cap.header_log, 0);
}
static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
@@ -37,7 +37,7 @@ static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
{
u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
- trace_cxl_aer_correctable_error(cxlmd, status);
+ trace_cxl_aer_correctable_error(&cxlmd->dev, status, cxlmd->cxlds->serial);
}
static void
@@ -45,6 +45,7 @@ cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
struct cxl_ras_capability_regs ras_cap)
{
u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
u32 fe;
if (hweight32(status) > 1)
@@ -53,8 +54,9 @@ cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
else
fe = status;
- trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
- ras_cap.header_log);
+ trace_cxl_aer_uncorrectable_error(&cxlmd->dev, status, fe,
+ ras_cap.header_log,
+ cxlds->serial);
}
static int match_memdev_by_parent(struct device *dev, const void *uport)
@@ -160,7 +162,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host)
}
EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
-void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base)
+void cxl_handle_cor_ras(struct device *dev, u64 serial, void __iomem *ras_base)
{
void __iomem *addr;
u32 status;
@@ -174,7 +176,7 @@ void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base)
status = readl(addr);
if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
- trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status);
+ trace_cxl_aer_correctable_error(dev, status, serial);
}
}
@@ -199,7 +201,7 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
* Log the state of the RAS status registers and prepare them to log the
* next error status. Return 1 if reset needed.
*/
-bool cxl_handle_ras(struct device *dev, void __iomem *ras_base)
+bool cxl_handle_ras(struct device *dev, u64 serial, void __iomem *ras_base)
{
u32 hl[CXL_HEADERLOG_SIZE_U32];
void __iomem *addr;
@@ -228,7 +230,7 @@ bool cxl_handle_ras(struct device *dev, void __iomem *ras_base)
}
header_log_copy(ras_base, hl);
- trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl);
+ trace_cxl_aer_uncorrectable_error(dev, status, fe, hl, serial);
writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
return true;
@@ -250,7 +252,7 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
if (cxlds->rcd)
cxl_handle_rdport_errors(cxlds);
- cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+ cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
}
}
EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
@@ -279,7 +281,7 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
* chance the situation is recoverable dump the status of the RAS
* capability registers and bounce the active state of the memdev.
*/
- ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+ ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
}
diff --git a/drivers/cxl/core/ras_rch.c b/drivers/cxl/core/ras_rch.c
index 4d2babe8d206..421dd1bcfc9c 100644
--- a/drivers/cxl/core/ras_rch.c
+++ b/drivers/cxl/core/ras_rch.c
@@ -114,7 +114,7 @@ void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
pci_print_aer(pdev, severity, &aer_regs);
if (severity == AER_CORRECTABLE)
- cxl_handle_cor_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+ cxl_handle_cor_ras(&cxlds->cxlmd->dev, 0, dport->regs.ras);
else
- cxl_handle_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+ cxl_handle_ras(&cxlds->cxlmd->dev, 0, dport->regs.ras);
}
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index a972e4ef1936..69f8a0efd924 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -48,40 +48,13 @@
{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
)
-TRACE_EVENT(cxl_port_aer_uncorrectable_error,
- TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
- TP_ARGS(dev, status, fe, hl),
- TP_STRUCT__entry(
- __string(device, dev_name(dev))
- __string(host, dev_name(dev->parent))
- __field(u32, status)
- __field(u32, first_error)
- __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
- ),
- TP_fast_assign(
- __assign_str(device);
- __assign_str(host);
- __entry->status = status;
- __entry->first_error = fe;
- /*
- * Embed the 512B headerlog data for user app retrieval and
- * parsing, but no need to print this in the trace buffer.
- */
- memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
- ),
- TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
- __get_str(device), __get_str(host),
- show_uc_errs(__entry->status),
- show_uc_errs(__entry->first_error)
- )
-);
-
TRACE_EVENT(cxl_aer_uncorrectable_error,
- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
- TP_ARGS(cxlmd, status, fe, hl),
+ TP_PROTO(const struct device *cxlmd, u32 status, u32 fe, u32 *hl,
+ u64 serial),
+ TP_ARGS(cxlmd, status, fe, hl, serial),
TP_STRUCT__entry(
- __string(memdev, dev_name(&cxlmd->dev))
- __string(host, dev_name(cxlmd->dev.parent))
+ __string(memdev, dev_name(cxlmd))
+ __string(host, dev_name(cxlmd->parent))
__field(u64, serial)
__field(u32, status)
__field(u32, first_error)
@@ -90,7 +63,7 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
TP_fast_assign(
__assign_str(memdev);
__assign_str(host);
- __entry->serial = cxlmd->cxlds->serial;
+ __entry->serial = serial;
__entry->status = status;
__entry->first_error = fe;
/*
@@ -124,38 +97,19 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \
)
-TRACE_EVENT(cxl_port_aer_correctable_error,
- TP_PROTO(struct device *dev, u32 status),
- TP_ARGS(dev, status),
- TP_STRUCT__entry(
- __string(device, dev_name(dev))
- __string(host, dev_name(dev->parent))
- __field(u32, status)
- ),
- TP_fast_assign(
- __assign_str(device);
- __assign_str(host);
- __entry->status = status;
- ),
- TP_printk("device=%s host=%s status='%s'",
- __get_str(device), __get_str(host),
- show_ce_errs(__entry->status)
- )
-);
-
TRACE_EVENT(cxl_aer_correctable_error,
- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
- TP_ARGS(cxlmd, status),
+ TP_PROTO(const struct device *cxlmd, u32 status, u64 serial),
+ TP_ARGS(cxlmd, status, serial),
TP_STRUCT__entry(
- __string(memdev, dev_name(&cxlmd->dev))
- __string(host, dev_name(cxlmd->dev.parent))
+ __string(memdev, dev_name(cxlmd))
+ __string(host, dev_name(cxlmd->parent))
__field(u64, serial)
__field(u32, status)
),
TP_fast_assign(
__assign_str(memdev);
__assign_str(host);
- __entry->serial = cxlmd->cxlds->serial;
+ __entry->serial = serial;
__entry->status = status;
),
TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
--
2.34.1
Terry Bowman wrote: > CXL currently has separate trace routines for CXL Port errors and CXL > Endpoint errors. This is inconvenient for the user because they must enable > 2 sets of trace routines. Make updates to the trace logging such that a > single trace routine logs both CXL Endpoint and CXL Port protocol errors. No, this is not inconvient, this is required for compatible evolution of tracepoints. The change in this patch breaks compatibility as it violates the expectation the type and order of TP_ARGS does not change from one kernel to next. > Keep the trace log fields 'memdev' and 'host'. While these are not accurate > for non-Endpoints the fields will remain as-is to prevent breaking > userspace RAS trace consumers. > > Add serial number parameter to the trace logging. This is used for EPs > and 0 is provided for CXL port devices without a serial number. > > Leave the correctable and uncorrectable trace routines' TP_STRUCT__entry() > unchanged with respect to member data types and order. > > Below is output of correctable and uncorrectable protocol error logging. > CXL Root Port and CXL Endpoint examples are included below. > > Root Port: > cxl_aer_correctable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status='CRC Threshold Hit' > cxl_aer_uncorrectable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error' A root port is not a "memdev", another awkward side effect of trying to combine 2 trace points with different use cases. So a NAK from me for this change (unless there is an strong reason for Linux to inflict the compat breakage), please keep the separate tracepoints they are for distinctly different use cases. A memdev protocol error is contained to that memdev, a port protocol error implicates every CXL.cachemem descendant of that port.
On 11/19/2025 3:23 PM, dan.j.williams@intel.com wrote: > Terry Bowman wrote: >> CXL currently has separate trace routines for CXL Port errors and CXL >> Endpoint errors. This is inconvenient for the user because they must enable >> 2 sets of trace routines. Make updates to the trace logging such that a >> single trace routine logs both CXL Endpoint and CXL Port protocol errors. > No, this is not inconvient, this is required for compatible evolution of > tracepoints. The change in this patch breaks compatibility as it > violates the expectation the type and order of TP_ARGS does not change > from one kernel to next. > >> Keep the trace log fields 'memdev' and 'host'. While these are not accurate >> for non-Endpoints the fields will remain as-is to prevent breaking >> userspace RAS trace consumers. >> >> Add serial number parameter to the trace logging. This is used for EPs >> and 0 is provided for CXL port devices without a serial number. >> >> Leave the correctable and uncorrectable trace routines' TP_STRUCT__entry() >> unchanged with respect to member data types and order. >> >> Below is output of correctable and uncorrectable protocol error logging. >> CXL Root Port and CXL Endpoint examples are included below. >> >> Root Port: >> cxl_aer_correctable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status='CRC Threshold Hit' >> cxl_aer_uncorrectable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error' > A root port is not a "memdev", another awkward side effect of trying to > combine 2 trace points with different use cases. > > So a NAK from me for this change (unless there is an strong reason for > Linux to inflict the compat breakage), please keep the separate > tracepoints they are for distinctly different use cases. A memdev > protocol error is contained to that memdev, a port protocol error > implicates every CXL.cachemem descendant of that port. I misunderstood this comment from previous code review: https://lore.kernel.org/linux-cxl/67aea897cfe55_2d1e294ca@dwillia2-xfh.jf.intel.com.notmuch/#t Are you OK with the following format for Port devices? Or let me know what format is needed. cxl_port_aer_correctable_error: device=port1 parent=root0 status='Received Error From Physical Layer' cxl_port_aer_uncorrectable_error: device=port1 parent=root0 status: 'Memory Byte Enable Parity Error' first_error: 'Memory Byte Enable Parity Erro' Terry
Bowman, Terry wrote: > > > On 11/19/2025 3:23 PM, dan.j.williams@intel.com wrote: > > Terry Bowman wrote: > >> CXL currently has separate trace routines for CXL Port errors and CXL > >> Endpoint errors. This is inconvenient for the user because they must enable > >> 2 sets of trace routines. Make updates to the trace logging such that a > >> single trace routine logs both CXL Endpoint and CXL Port protocol errors. > > No, this is not inconvient, this is required for compatible evolution of > > tracepoints. The change in this patch breaks compatibility as it > > violates the expectation the type and order of TP_ARGS does not change > > from one kernel to next. > > > >> Keep the trace log fields 'memdev' and 'host'. While these are not accurate > >> for non-Endpoints the fields will remain as-is to prevent breaking > >> userspace RAS trace consumers. > >> > >> Add serial number parameter to the trace logging. This is used for EPs > >> and 0 is provided for CXL port devices without a serial number. > >> > >> Leave the correctable and uncorrectable trace routines' TP_STRUCT__entry() > >> unchanged with respect to member data types and order. > >> > >> Below is output of correctable and uncorrectable protocol error logging. > >> CXL Root Port and CXL Endpoint examples are included below. > >> > >> Root Port: > >> cxl_aer_correctable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status='CRC Threshold Hit' > >> cxl_aer_uncorrectable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error' > > A root port is not a "memdev", another awkward side effect of trying to > > combine 2 trace points with different use cases. > > > > So a NAK from me for this change (unless there is an strong reason for > > Linux to inflict the compat breakage), please keep the separate > > tracepoints they are for distinctly different use cases. A memdev > > protocol error is contained to that memdev, a port protocol error > > implicates every CXL.cachemem descendant of that port. > I misunderstood this comment from previous code review: > https://lore.kernel.org/linux-cxl/67aea897cfe55_2d1e294ca@dwillia2-xfh.jf.intel.com.notmuch/#t No, you did not misunderstand, I just did not realize at the time I was asking for compatibility breakage with that suggestion. Apologies for that thrash. > Are you OK with the following format for Port devices? Or let me know what format is needed. > cxl_port_aer_correctable_error: device=port1 parent=root0 status='Received Error From Physical Layer' > cxl_port_aer_uncorrectable_error: device=port1 parent=root0 status: 'Memory Byte Enable Parity Error' first_error: 'Memory Byte Enable Parity Erro' That looks good to me. Also, I realize this patch set has gone through many revisions. We really need to get at least some of these pre-req patches into a topic branch so they do not need to keep being sent out in this large series.
On 11/19/2025 5:40 PM, dan.j.williams@intel.com wrote: > Bowman, Terry wrote: >> >> On 11/19/2025 3:23 PM, dan.j.williams@intel.com wrote: >>> Terry Bowman wrote: >>>> CXL currently has separate trace routines for CXL Port errors and CXL >>>> Endpoint errors. This is inconvenient for the user because they must enable >>>> 2 sets of trace routines. Make updates to the trace logging such that a >>>> single trace routine logs both CXL Endpoint and CXL Port protocol errors. >>> No, this is not inconvient, this is required for compatible evolution of >>> tracepoints. The change in this patch breaks compatibility as it >>> violates the expectation the type and order of TP_ARGS does not change >>> from one kernel to next. >>> >>>> Keep the trace log fields 'memdev' and 'host'. While these are not accurate >>>> for non-Endpoints the fields will remain as-is to prevent breaking >>>> userspace RAS trace consumers. >>>> >>>> Add serial number parameter to the trace logging. This is used for EPs >>>> and 0 is provided for CXL port devices without a serial number. >>>> >>>> Leave the correctable and uncorrectable trace routines' TP_STRUCT__entry() >>>> unchanged with respect to member data types and order. >>>> >>>> Below is output of correctable and uncorrectable protocol error logging. >>>> CXL Root Port and CXL Endpoint examples are included below. >>>> >>>> Root Port: >>>> cxl_aer_correctable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status='CRC Threshold Hit' >>>> cxl_aer_uncorrectable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error' >>> A root port is not a "memdev", another awkward side effect of trying to >>> combine 2 trace points with different use cases. >>> >>> So a NAK from me for this change (unless there is an strong reason for >>> Linux to inflict the compat breakage), please keep the separate >>> tracepoints they are for distinctly different use cases. A memdev >>> protocol error is contained to that memdev, a port protocol error >>> implicates every CXL.cachemem descendant of that port. >> I misunderstood this comment from previous code review: >> https://lore.kernel.org/linux-cxl/67aea897cfe55_2d1e294ca@dwillia2-xfh.jf.intel.com.notmuch/#t > No, you did not misunderstand, I just did not realize at the time I was > asking for compatibility breakage with that suggestion. Apologies for > that thrash. > >> Are you OK with the following format for Port devices? Or let me know what format is needed. >> cxl_port_aer_correctable_error: device=port1 parent=root0 status='Received Error From Physical Layer' >> cxl_port_aer_uncorrectable_error: device=port1 parent=root0 status: 'Memory Byte Enable Parity Error' first_error: 'Memory Byte Enable Parity Erro' > That looks good to me. > > Also, I realize this patch set has gone through many revisions. We > really need to get at least some of these pre-req patches into a topic > branch so they do not need to keep being sent out in this large series. Do we want a serial number field in the port log (missing above)? CXL USP and DSP will have serial from the Identify Command. It would make the port logs look like: cxl_port_aer_correctable_error: device=port1 parent=root0 serial=0 status='Received Error From Physical Layer' cxl_port_aer_uncorrectable_error: device=port1 parent=root0 serial=0 status: 'Memory Byte Enable Parity Error' first_error: 'Memory Byte -Terry
© 2016 - 2025 Red Hat, Inc.