CXL currently has separate trace routines for CXL Port errors and CXL
Endpoint errors. This is inconvenient for the user because they must enable
2 sets of trace routines. Make updates to the trace logging such that a
single trace routine logs both CXL Endpoint and CXL Port protocol errors.
Rename the 'host' field from the CXL Endpoint trace to 'parent' in the
unified trace routines. 'host' does not correctly apply to CXL Port
devices. Parent is more general and applies to CXL Port devices and CXL
Endpoints.
Add serial number parameter to the trace logging. This is used for EPs
and 0 is provided for CXL port devices without a serial number.
Below is output of correctable and uncorrectable protocol error logging.
CXL Root Port and CXL Endpoint examples are included below.
Root Port:
cxl_aer_correctable_error: device=0000:0c:00.0 parent=pci0000:0c serial: 0 status='CRC Threshold Hit'
cxl_aer_uncorrectable_error: device=0000:0c:00.0 parent=pci0000:0c serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
Endpoint:
cxl_aer_correctable_error: device=mem3 parent=0000:0f:00.0 serial=0 status='CRC Threshold Hit'
cxl_aer_uncorrectable_error: device=mem3 parent=0000:0f:00.0 serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
drivers/cxl/core/pci.c | 18 +++++----
drivers/cxl/core/ras.c | 14 ++++---
drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
3 files changed, 37 insertions(+), 79 deletions(-)
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 186a5a20b951..0f4c07fd64a5 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -664,7 +664,7 @@ void read_cdat_data(struct cxl_port *port)
}
EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
-static void __cxl_handle_cor_ras(struct device *dev,
+static void __cxl_handle_cor_ras(struct device *dev, u64 serial,
void __iomem *ras_base)
{
void __iomem *addr;
@@ -679,13 +679,13 @@ static void __cxl_handle_cor_ras(struct device *dev,
status = readl(addr);
if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
- trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status);
+ trace_cxl_aer_correctable_error(dev, serial, status);
}
}
static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds)
{
- return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+ return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
}
/* CXL spec rev3.0 8.2.4.16.1 */
@@ -709,7 +709,8 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
* Log the state of the RAS status registers and prepare them to log the
* next error status. Return 1 if reset needed.
*/
-static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
+static bool __cxl_handle_ras(struct device *dev, u64 serial,
+ void __iomem *ras_base)
{
u32 hl[CXL_HEADERLOG_SIZE_U32];
void __iomem *addr;
@@ -738,7 +739,7 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
}
header_log_copy(ras_base, hl);
- trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl);
+ trace_cxl_aer_uncorrectable_error(dev, serial, status, fe, hl);
writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
return true;
@@ -746,7 +747,8 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
{
- return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+
+ return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
}
#ifdef CONFIG_PCIEAER_CXL
@@ -754,13 +756,13 @@ static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds,
struct cxl_dport *dport)
{
- return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+ return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial, dport->regs.ras);
}
static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds,
struct cxl_dport *dport)
{
- return __cxl_handle_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+ return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, dport->regs.ras);
}
/*
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 715f7221ea3a..0ef8c2068c0c 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -13,7 +13,7 @@ static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
{
u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
- trace_cxl_port_aer_correctable_error(&pdev->dev, status);
+ trace_cxl_aer_correctable_error(&pdev->dev, 0, status);
}
static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
@@ -28,8 +28,8 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
else
fe = status;
- trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
- ras_cap.header_log);
+ trace_cxl_aer_uncorrectable_error(&pdev->dev, 0, status, fe,
+ ras_cap.header_log);
}
static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
@@ -42,7 +42,8 @@ static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
if (!cxlds)
return;
- trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+ trace_cxl_aer_correctable_error(&cxlds->cxlmd->dev, cxlds->serial,
+ status);
}
static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
@@ -62,8 +63,9 @@ static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
else
fe = status;
- trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
- ras_cap.header_log);
+ trace_cxl_aer_uncorrectable_error(&cxlds->cxlmd->dev,
+ cxlds->serial, status,
+ fe, ras_cap.header_log);
}
static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index 25ebfbc1616c..8c91b0f3d165 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -48,49 +48,22 @@
{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
)
-TRACE_EVENT(cxl_port_aer_uncorrectable_error,
- TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
- TP_ARGS(dev, status, fe, hl),
- TP_STRUCT__entry(
- __string(device, dev_name(dev))
- __string(host, dev_name(dev->parent))
- __field(u32, status)
- __field(u32, first_error)
- __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
- ),
- TP_fast_assign(
- __assign_str(device);
- __assign_str(host);
- __entry->status = status;
- __entry->first_error = fe;
- /*
- * Embed the 512B headerlog data for user app retrieval and
- * parsing, but no need to print this in the trace buffer.
- */
- memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
- ),
- TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
- __get_str(device), __get_str(host),
- show_uc_errs(__entry->status),
- show_uc_errs(__entry->first_error)
- )
-);
-
TRACE_EVENT(cxl_aer_uncorrectable_error,
- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
- TP_ARGS(cxlmd, status, fe, hl),
+ TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
+ u32 *hl),
+ TP_ARGS(dev, serial, status, fe, hl),
TP_STRUCT__entry(
- __string(memdev, dev_name(&cxlmd->dev))
- __string(host, dev_name(cxlmd->dev.parent))
+ __string(name, dev_name(dev))
+ __string(parent, dev_name(dev->parent))
__field(u64, serial)
__field(u32, status)
__field(u32, first_error)
__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
),
TP_fast_assign(
- __assign_str(memdev);
- __assign_str(host);
- __entry->serial = cxlmd->cxlds->serial;
+ __assign_str(name);
+ __assign_str(parent);
+ __entry->serial = serial;
__entry->status = status;
__entry->first_error = fe;
/*
@@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
*/
memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
),
- TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error: '%s'",
- __get_str(memdev), __get_str(host), __entry->serial,
+ TP_printk("device=%s parent=%s serial=%lld status='%s' first_error='%s'",
+ __get_str(name), __get_str(parent), __entry->serial,
show_uc_errs(__entry->status),
show_uc_errs(__entry->first_error)
)
@@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \
)
-TRACE_EVENT(cxl_port_aer_correctable_error,
- TP_PROTO(struct device *dev, u32 status),
- TP_ARGS(dev, status),
- TP_STRUCT__entry(
- __string(device, dev_name(dev))
- __string(host, dev_name(dev->parent))
- __field(u32, status)
- ),
- TP_fast_assign(
- __assign_str(device);
- __assign_str(host);
- __entry->status = status;
- ),
- TP_printk("device=%s host=%s status='%s'",
- __get_str(device), __get_str(host),
- show_ce_errs(__entry->status)
- )
-);
-
TRACE_EVENT(cxl_aer_correctable_error,
- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
- TP_ARGS(cxlmd, status),
+ TP_PROTO(struct device *dev, u64 serial, u32 status),
+ TP_ARGS(dev, serial, status),
TP_STRUCT__entry(
- __string(memdev, dev_name(&cxlmd->dev))
- __string(host, dev_name(cxlmd->dev.parent))
+ __string(name, dev_name(dev))
+ __string(parent, dev_name(dev->parent))
__field(u64, serial)
__field(u32, status)
),
TP_fast_assign(
- __assign_str(memdev);
- __assign_str(host);
- __entry->serial = cxlmd->cxlds->serial;
+ __assign_str(name);
+ __assign_str(parent);
+ __entry->serial = serial;
__entry->status = status;
),
- TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
- __get_str(memdev), __get_str(host), __entry->serial,
+ TP_printk("device=%s parent=%s serial=%lld status='%s'",
+ __get_str(name), __get_str(parent), __entry->serial,
show_ce_errs(__entry->status)
)
);
--
2.34.1
>-----Original Message-----
>From: Terry Bowman <terry.bowman@amd.com>
>Sent: 03 June 2025 18:23
>To: PradeepVineshReddy.Kodamati@amd.com; dave@stgolabs.net; Jonathan
>Cameron <jonathan.cameron@huawei.com>; dave.jiang@intel.com;
>alison.schofield@intel.com; vishal.l.verma@intel.com; ira.weiny@intel.com;
>dan.j.williams@intel.com; bhelgaas@google.com; bp@alien8.de;
>ming.li@zohomail.com; Shiju Jose <shiju.jose@huawei.com>;
>dan.carpenter@linaro.org; Smita.KoralahalliChannabasappa@amd.com;
>kobayashi.da-06@fujitsu.com; terry.bowman@amd.com; yanfei.xu@intel.com;
>rrichter@amd.com; peterz@infradead.org; colyli@suse.de;
>uaisheng.ye@intel.com; fabio.m.de.francesco@linux.intel.com;
>ilpo.jarvinen@linux.intel.com; yazen.ghannam@amd.com; linux-
>cxl@vger.kernel.org; linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
>Subject: [PATCH v9 10/16] cxl/pci: Unify CXL trace logging for CXL Endpoints and
>CXL Ports
>
>CXL currently has separate trace routines for CXL Port errors and CXL Endpoint
>errors. This is inconvenient for the user because they must enable
>2 sets of trace routines. Make updates to the trace logging such that a single
>trace routine logs both CXL Endpoint and CXL Port protocol errors.
>
>Rename the 'host' field from the CXL Endpoint trace to 'parent' in the unified
>trace routines. 'host' does not correctly apply to CXL Port devices. Parent is more
>general and applies to CXL Port devices and CXL Endpoints.
>
>Add serial number parameter to the trace logging. This is used for EPs and 0 is
>provided for CXL port devices without a serial number.
>
>Below is output of correctable and uncorrectable protocol error logging.
>CXL Root Port and CXL Endpoint examples are included below.
>
>Root Port:
>cxl_aer_correctable_error: device=0000:0c:00.0 parent=pci0000:0c serial: 0
>status='CRC Threshold Hit'
>cxl_aer_uncorrectable_error: device=0000:0c:00.0 parent=pci0000:0c serial: 0
>status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
>Error'
>
>Endpoint:
>cxl_aer_correctable_error: device=mem3 parent=0000:0f:00.0 serial=0
>status='CRC Threshold Hit'
>cxl_aer_uncorrectable_error: device=mem3 parent=0000:0f:00.0 serial: 0
>status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
>Error'
>
>Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>---
> drivers/cxl/core/pci.c | 18 +++++----
> drivers/cxl/core/ras.c | 14 ++++---
> drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
> 3 files changed, 37 insertions(+), 79 deletions(-)
>
>diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index
>186a5a20b951..0f4c07fd64a5 100644
>--- a/drivers/cxl/core/pci.c
>+++ b/drivers/cxl/core/pci.c
>@@ -664,7 +664,7 @@ void read_cdat_data(struct cxl_port *port) }
>EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
>
[...]
> static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data
>*data) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index
>25ebfbc1616c..8c91b0f3d165 100644
>--- a/drivers/cxl/core/trace.h
>+++ b/drivers/cxl/core/trace.h
>@@ -48,49 +48,22 @@
> { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
> )
>
>-TRACE_EVENT(cxl_port_aer_uncorrectable_error,
>- TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
>- TP_ARGS(dev, status, fe, hl),
>- TP_STRUCT__entry(
>- __string(device, dev_name(dev))
>- __string(host, dev_name(dev->parent))
>- __field(u32, status)
>- __field(u32, first_error)
>- __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>- ),
>- TP_fast_assign(
>- __assign_str(device);
>- __assign_str(host);
>- __entry->status = status;
>- __entry->first_error = fe;
>- /*
>- * Embed the 512B headerlog data for user app retrieval and
>- * parsing, but no need to print this in the trace buffer.
>- */
>- memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>- ),
>- TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
>- __get_str(device), __get_str(host),
>- show_uc_errs(__entry->status),
>- show_uc_errs(__entry->first_error)
>- )
>-);
>-
> TRACE_EVENT(cxl_aer_uncorrectable_error,
>- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
>*hl),
>- TP_ARGS(cxlmd, status, fe, hl),
>+ TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
>+ u32 *hl),
>+ TP_ARGS(dev, serial, status, fe, hl),
> TP_STRUCT__entry(
>- __string(memdev, dev_name(&cxlmd->dev))
>- __string(host, dev_name(cxlmd->dev.parent))
>+ __string(name, dev_name(dev))
>+ __string(parent, dev_name(dev->parent))
Hi Terry,
As we pointed out in v8, renaming the fields "memdev" to "name" and "host" to "parent"
causes issues and failures in userspace rasdaemon while parsing the trace event data.
Additionally, we can't rename these fields in rasdaemon due to backward compatibility.
> __field(u64, serial)
> __field(u32, status)
> __field(u32, first_error)
> __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
> ),
> TP_fast_assign(
>- __assign_str(memdev);
>- __assign_str(host);
>- __entry->serial = cxlmd->cxlds->serial;
>+ __assign_str(name);
>+ __assign_str(parent);
>+ __entry->serial = serial;
> __entry->status = status;
> __entry->first_error = fe;
> /*
>@@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> */
> memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
> ),
>- TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
>'%s'",
>- __get_str(memdev), __get_str(host), __entry->serial,
>+ TP_printk("device=%s parent=%s serial=%lld status='%s'
>first_error='%s'",
>+ __get_str(name), __get_str(parent), __entry->serial,
> show_uc_errs(__entry->status),
> show_uc_errs(__entry->first_error)
> )
>@@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer"
>} \
> )
>
>-TRACE_EVENT(cxl_port_aer_correctable_error,
>- TP_PROTO(struct device *dev, u32 status),
>- TP_ARGS(dev, status),
>- TP_STRUCT__entry(
>- __string(device, dev_name(dev))
>- __string(host, dev_name(dev->parent))
>- __field(u32, status)
>- ),
>- TP_fast_assign(
>- __assign_str(device);
>- __assign_str(host);
>- __entry->status = status;
>- ),
>- TP_printk("device=%s host=%s status='%s'",
>- __get_str(device), __get_str(host),
>- show_ce_errs(__entry->status)
>- )
>-);
>-
> TRACE_EVENT(cxl_aer_correctable_error,
>- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
>- TP_ARGS(cxlmd, status),
>+ TP_PROTO(struct device *dev, u64 serial, u32 status),
>+ TP_ARGS(dev, serial, status),
> TP_STRUCT__entry(
>- __string(memdev, dev_name(&cxlmd->dev))
>- __string(host, dev_name(cxlmd->dev.parent))
>+ __string(name, dev_name(dev))
>+ __string(parent, dev_name(dev->parent))
Renaming these fields is an issue for userspace as mentioned above
in cxl_aer_uncorrectable_error.
> __field(u64, serial)
> __field(u32, status)
> ),
> TP_fast_assign(
>- __assign_str(memdev);
>- __assign_str(host);
>- __entry->serial = cxlmd->cxlds->serial;
>+ __assign_str(name);
>+ __assign_str(parent);
>+ __entry->serial = serial;
> __entry->status = status;
> ),
>- TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
>- __get_str(memdev), __get_str(host), __entry->serial,
>+ TP_printk("device=%s parent=%s serial=%lld status='%s'",
>+ __get_str(name), __get_str(parent), __entry->serial,
> show_ce_errs(__entry->status)
> )
> );
>--
>2.34.1
Thanks,
Shiju
On 6/6/2025 4:08 AM, Shiju Jose wrote:
>> -----Original Message-----
>> From: Terry Bowman <terry.bowman@amd.com>
>> Sent: 03 June 2025 18:23
>> To: PradeepVineshReddy.Kodamati@amd.com; dave@stgolabs.net; Jonathan
>> Cameron <jonathan.cameron@huawei.com>; dave.jiang@intel.com;
>> alison.schofield@intel.com; vishal.l.verma@intel.com; ira.weiny@intel.com;
>> dan.j.williams@intel.com; bhelgaas@google.com; bp@alien8.de;
>> ming.li@zohomail.com; Shiju Jose <shiju.jose@huawei.com>;
>> dan.carpenter@linaro.org; Smita.KoralahalliChannabasappa@amd.com;
>> kobayashi.da-06@fujitsu.com; terry.bowman@amd.com; yanfei.xu@intel.com;
>> rrichter@amd.com; peterz@infradead.org; colyli@suse.de;
>> uaisheng.ye@intel.com; fabio.m.de.francesco@linux.intel.com;
>> ilpo.jarvinen@linux.intel.com; yazen.ghannam@amd.com; linux-
>> cxl@vger.kernel.org; linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
>> Subject: [PATCH v9 10/16] cxl/pci: Unify CXL trace logging for CXL Endpoints and
>> CXL Ports
>>
>> CXL currently has separate trace routines for CXL Port errors and CXL Endpoint
>> errors. This is inconvenient for the user because they must enable
>> 2 sets of trace routines. Make updates to the trace logging such that a single
>> trace routine logs both CXL Endpoint and CXL Port protocol errors.
>>
>> Rename the 'host' field from the CXL Endpoint trace to 'parent' in the unified
>> trace routines. 'host' does not correctly apply to CXL Port devices. Parent is more
>> general and applies to CXL Port devices and CXL Endpoints.
>>
>> Add serial number parameter to the trace logging. This is used for EPs and 0 is
>> provided for CXL port devices without a serial number.
>>
>> Below is output of correctable and uncorrectable protocol error logging.
>> CXL Root Port and CXL Endpoint examples are included below.
>>
>> Root Port:
>> cxl_aer_correctable_error: device=0000:0c:00.0 parent=pci0000:0c serial: 0
>> status='CRC Threshold Hit'
>> cxl_aer_uncorrectable_error: device=0000:0c:00.0 parent=pci0000:0c serial: 0
>> status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
>> Error'
>>
>> Endpoint:
>> cxl_aer_correctable_error: device=mem3 parent=0000:0f:00.0 serial=0
>> status='CRC Threshold Hit'
>> cxl_aer_uncorrectable_error: device=mem3 parent=0000:0f:00.0 serial: 0
>> status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
>> Error'
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>> drivers/cxl/core/pci.c | 18 +++++----
>> drivers/cxl/core/ras.c | 14 ++++---
>> drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
>> 3 files changed, 37 insertions(+), 79 deletions(-)
>>
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index
>> 186a5a20b951..0f4c07fd64a5 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -664,7 +664,7 @@ void read_cdat_data(struct cxl_port *port) }
>> EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
>>
> [...]
>> static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data
>> *data) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index
>> 25ebfbc1616c..8c91b0f3d165 100644
>> --- a/drivers/cxl/core/trace.h
>> +++ b/drivers/cxl/core/trace.h
>> @@ -48,49 +48,22 @@
>> { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
>> )
>>
>> -TRACE_EVENT(cxl_port_aer_uncorrectable_error,
>> - TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
>> - TP_ARGS(dev, status, fe, hl),
>> - TP_STRUCT__entry(
>> - __string(device, dev_name(dev))
>> - __string(host, dev_name(dev->parent))
>> - __field(u32, status)
>> - __field(u32, first_error)
>> - __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>> - ),
>> - TP_fast_assign(
>> - __assign_str(device);
>> - __assign_str(host);
>> - __entry->status = status;
>> - __entry->first_error = fe;
>> - /*
>> - * Embed the 512B headerlog data for user app retrieval and
>> - * parsing, but no need to print this in the trace buffer.
>> - */
>> - memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>> - ),
>> - TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
>> - __get_str(device), __get_str(host),
>> - show_uc_errs(__entry->status),
>> - show_uc_errs(__entry->first_error)
>> - )
>> -);
>> -
>> TRACE_EVENT(cxl_aer_uncorrectable_error,
>> - TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
>> *hl),
>> - TP_ARGS(cxlmd, status, fe, hl),
>> + TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
>> + u32 *hl),
>> + TP_ARGS(dev, serial, status, fe, hl),
>> TP_STRUCT__entry(
>> - __string(memdev, dev_name(&cxlmd->dev))
>> - __string(host, dev_name(cxlmd->dev.parent))
>> + __string(name, dev_name(dev))
>> + __string(parent, dev_name(dev->parent))
> Hi Terry,
>
> As we pointed out in v8, renaming the fields "memdev" to "name" and "host" to "parent"
> causes issues and failures in userspace rasdaemon while parsing the trace event data.
> Additionally, we can't rename these fields in rasdaemon due to backward compatibility.
Yes, I remember but didn't understand why other SW couldn't be updated to handle. I will
change as you request but many people will be confused why a port device's name is labeled
as a memdev. memdev is only correct for EPs and does not correctly reflect *any* of the
other CXL device types (RP, USP, DSP).
>> __field(u64, serial)
>> __field(u32, status)
>> __field(u32, first_error)
>> __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>> ),
>> TP_fast_assign(
>> - __assign_str(memdev);
>> - __assign_str(host);
>> - __entry->serial = cxlmd->cxlds->serial;
>> + __assign_str(name);
>> + __assign_str(parent);
>> + __entry->serial = serial;
>> __entry->status = status;
>> __entry->first_error = fe;
>> /*
>> @@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
>> */
>> memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>> ),
>> - TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
>> '%s'",
>> - __get_str(memdev), __get_str(host), __entry->serial,
>> + TP_printk("device=%s parent=%s serial=%lld status='%s'
>> first_error='%s'",
>> + __get_str(name), __get_str(parent), __entry->serial,
>> show_uc_errs(__entry->status),
>> show_uc_errs(__entry->first_error)
>> )
>> @@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
>> { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer"
>> } \
>> )
>>
>> -TRACE_EVENT(cxl_port_aer_correctable_error,
>> - TP_PROTO(struct device *dev, u32 status),
>> - TP_ARGS(dev, status),
>> - TP_STRUCT__entry(
>> - __string(device, dev_name(dev))
>> - __string(host, dev_name(dev->parent))
>> - __field(u32, status)
>> - ),
>> - TP_fast_assign(
>> - __assign_str(device);
>> - __assign_str(host);
>> - __entry->status = status;
>> - ),
>> - TP_printk("device=%s host=%s status='%s'",
>> - __get_str(device), __get_str(host),
>> - show_ce_errs(__entry->status)
>> - )
>> -);
>> -
>> TRACE_EVENT(cxl_aer_correctable_error,
>> - TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
>> - TP_ARGS(cxlmd, status),
>> + TP_PROTO(struct device *dev, u64 serial, u32 status),
>> + TP_ARGS(dev, serial, status),
>> TP_STRUCT__entry(
>> - __string(memdev, dev_name(&cxlmd->dev))
>> - __string(host, dev_name(cxlmd->dev.parent))
>> + __string(name, dev_name(dev))
>> + __string(parent, dev_name(dev->parent))
> Renaming these fields is an issue for userspace as mentioned above
> in cxl_aer_uncorrectable_error.
I understand, I'll revert as you request.
Terry
>> __field(u64, serial)
>> __field(u32, status)
>> ),
>> TP_fast_assign(
>> - __assign_str(memdev);
>> - __assign_str(host);
>> - __entry->serial = cxlmd->cxlds->serial;
>> + __assign_str(name);
>> + __assign_str(parent);
>> + __entry->serial = serial;
>> __entry->status = status;
>> ),
>> - TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
>> - __get_str(memdev), __get_str(host), __entry->serial,
>> + TP_printk("device=%s parent=%s serial=%lld status='%s'",
>> + __get_str(name), __get_str(parent), __entry->serial,
>> show_ce_errs(__entry->status)
>> )
>> );
>> --
>> 2.34.1
>
> Thanks,
> Shiju
On 6/6/2025 9:41 AM, Bowman, Terry wrote:
>
> On 6/6/2025 4:08 AM, Shiju Jose wrote:
>>> -----Original Message-----
>>> From: Terry Bowman <terry.bowman@amd.com>
>>> Sent: 03 June 2025 18:23
>>> To: PradeepVineshReddy.Kodamati@amd.com; dave@stgolabs.net; Jonathan
>>> Cameron <jonathan.cameron@huawei.com>; dave.jiang@intel.com;
>>> alison.schofield@intel.com; vishal.l.verma@intel.com; ira.weiny@intel.com;
>>> dan.j.williams@intel.com; bhelgaas@google.com; bp@alien8.de;
>>> ming.li@zohomail.com; Shiju Jose <shiju.jose@huawei.com>;
>>> dan.carpenter@linaro.org; Smita.KoralahalliChannabasappa@amd.com;
>>> kobayashi.da-06@fujitsu.com; terry.bowman@amd.com; yanfei.xu@intel.com;
>>> rrichter@amd.com; peterz@infradead.org; colyli@suse.de;
>>> uaisheng.ye@intel.com; fabio.m.de.francesco@linux.intel.com;
>>> ilpo.jarvinen@linux.intel.com; yazen.ghannam@amd.com; linux-
>>> cxl@vger.kernel.org; linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
>>> Subject: [PATCH v9 10/16] cxl/pci: Unify CXL trace logging for CXL Endpoints and
>>> CXL Ports
>>>
>>> CXL currently has separate trace routines for CXL Port errors and CXL Endpoint
>>> errors. This is inconvenient for the user because they must enable
>>> 2 sets of trace routines. Make updates to the trace logging such that a single
>>> trace routine logs both CXL Endpoint and CXL Port protocol errors.
>>>
>>> Rename the 'host' field from the CXL Endpoint trace to 'parent' in the unified
>>> trace routines. 'host' does not correctly apply to CXL Port devices. Parent is more
>>> general and applies to CXL Port devices and CXL Endpoints.
>>>
>>> Add serial number parameter to the trace logging. This is used for EPs and 0 is
>>> provided for CXL port devices without a serial number.
>>>
>>> Below is output of correctable and uncorrectable protocol error logging.
>>> CXL Root Port and CXL Endpoint examples are included below.
>>>
>>> Root Port:
>>> cxl_aer_correctable_error: device=0000:0c:00.0 parent=pci0000:0c serial: 0
>>> status='CRC Threshold Hit'
>>> cxl_aer_uncorrectable_error: device=0000:0c:00.0 parent=pci0000:0c serial: 0
>>> status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
>>> Error'
>>>
>>> Endpoint:
>>> cxl_aer_correctable_error: device=mem3 parent=0000:0f:00.0 serial=0
>>> status='CRC Threshold Hit'
>>> cxl_aer_uncorrectable_error: device=mem3 parent=0000:0f:00.0 serial: 0
>>> status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
>>> Error'
>>>
>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>>> ---
>>> drivers/cxl/core/pci.c | 18 +++++----
>>> drivers/cxl/core/ras.c | 14 ++++---
>>> drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
>>> 3 files changed, 37 insertions(+), 79 deletions(-)
>>>
>>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index
>>> 186a5a20b951..0f4c07fd64a5 100644
>>> --- a/drivers/cxl/core/pci.c
>>> +++ b/drivers/cxl/core/pci.c
>>> @@ -664,7 +664,7 @@ void read_cdat_data(struct cxl_port *port) }
>>> EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
>>>
>> [...]
>>> static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data
>>> *data) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index
>>> 25ebfbc1616c..8c91b0f3d165 100644
>>> --- a/drivers/cxl/core/trace.h
>>> +++ b/drivers/cxl/core/trace.h
>>> @@ -48,49 +48,22 @@
>>> { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
>>> )
>>>
>>> -TRACE_EVENT(cxl_port_aer_uncorrectable_error,
>>> - TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
>>> - TP_ARGS(dev, status, fe, hl),
>>> - TP_STRUCT__entry(
>>> - __string(device, dev_name(dev))
>>> - __string(host, dev_name(dev->parent))
>>> - __field(u32, status)
>>> - __field(u32, first_error)
>>> - __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>>> - ),
>>> - TP_fast_assign(
>>> - __assign_str(device);
>>> - __assign_str(host);
>>> - __entry->status = status;
>>> - __entry->first_error = fe;
>>> - /*
>>> - * Embed the 512B headerlog data for user app retrieval and
>>> - * parsing, but no need to print this in the trace buffer.
>>> - */
>>> - memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>>> - ),
>>> - TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
>>> - __get_str(device), __get_str(host),
>>> - show_uc_errs(__entry->status),
>>> - show_uc_errs(__entry->first_error)
>>> - )
>>> -);
>>> -
>>> TRACE_EVENT(cxl_aer_uncorrectable_error,
>>> - TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
>>> *hl),
>>> - TP_ARGS(cxlmd, status, fe, hl),
>>> + TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
>>> + u32 *hl),
>>> + TP_ARGS(dev, serial, status, fe, hl),
>>> TP_STRUCT__entry(
>>> - __string(memdev, dev_name(&cxlmd->dev))
>>> - __string(host, dev_name(cxlmd->dev.parent))
>>> + __string(name, dev_name(dev))
>>> + __string(parent, dev_name(dev->parent))
>> Hi Terry,
>>
>> As we pointed out in v8, renaming the fields "memdev" to "name" and "host" to "parent"
>> causes issues and failures in userspace rasdaemon while parsing the trace event data.
>> Additionally, we can't rename these fields in rasdaemon due to backward compatibility.
> Yes, I remember but didn't understand why other SW couldn't be updated to handle. I will
> change as you request but many people will be confused why a port device's name is labeled
> as a memdev. memdev is only correct for EPs and does not correctly reflect *any* of the
> other CXL device types (RP, USP, DSP).
>
>>> __field(u64, serial)
>>> __field(u32, status)
>>> __field(u32, first_error)
>>> __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>>> ),
>>> TP_fast_assign(
>>> - __assign_str(memdev);
>>> - __assign_str(host);
>>> - __entry->serial = cxlmd->cxlds->serial;
>>> + __assign_str(name);
>>> + __assign_str(parent);
>>> + __entry->serial = serial;
>>> __entry->status = status;
>>> __entry->first_error = fe;
>>> /*
>>> @@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
>>> */
>>> memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>>> ),
>>> - TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
>>> '%s'",
>>> - __get_str(memdev), __get_str(host), __entry->serial,
>>> + TP_printk("device=%s parent=%s serial=%lld status='%s'
>>> first_error='%s'",
>>> + __get_str(name), __get_str(parent), __entry->serial,
>>> show_uc_errs(__entry->status),
>>> show_uc_errs(__entry->first_error)
>>> )
>>> @@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
>>> { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer"
>>> } \
>>> )
>>>
>>> -TRACE_EVENT(cxl_port_aer_correctable_error,
>>> - TP_PROTO(struct device *dev, u32 status),
>>> - TP_ARGS(dev, status),
>>> - TP_STRUCT__entry(
>>> - __string(device, dev_name(dev))
>>> - __string(host, dev_name(dev->parent))
>>> - __field(u32, status)
>>> - ),
>>> - TP_fast_assign(
>>> - __assign_str(device);
>>> - __assign_str(host);
>>> - __entry->status = status;
>>> - ),
>>> - TP_printk("device=%s host=%s status='%s'",
>>> - __get_str(device), __get_str(host),
>>> - show_ce_errs(__entry->status)
>>> - )
>>> -);
>>> -
>>> TRACE_EVENT(cxl_aer_correctable_error,
>>> - TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
>>> - TP_ARGS(cxlmd, status),
>>> + TP_PROTO(struct device *dev, u64 serial, u32 status),
>>> + TP_ARGS(dev, serial, status),
>>> TP_STRUCT__entry(
>>> - __string(memdev, dev_name(&cxlmd->dev))
>>> - __string(host, dev_name(cxlmd->dev.parent))
>>> + __string(name, dev_name(dev))
>>> + __string(parent, dev_name(dev->parent))
>> Renaming these fields is an issue for userspace as mentioned above
>> in cxl_aer_uncorrectable_error.
> I understand, I'll revert as you request.
>
> Terry
I'll update the commit message with explanation for leaving as-is.
Terry
>>> __field(u64, serial)
>>> __field(u32, status)
>>> ),
>>> TP_fast_assign(
>>> - __assign_str(memdev);
>>> - __assign_str(host);
>>> - __entry->serial = cxlmd->cxlds->serial;
>>> + __assign_str(name);
>>> + __assign_str(parent);
>>> + __entry->serial = serial;
>>> __entry->status = status;
>>> ),
>>> - TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
>>> - __get_str(memdev), __get_str(host), __entry->serial,
>>> + TP_printk("device=%s parent=%s serial=%lld status='%s'",
>>> + __get_str(name), __get_str(parent), __entry->serial,
>>> show_ce_errs(__entry->status)
>>> )
>>> );
>>> --
>>> 2.34.1
>> Thanks,
>> Shiju
On Fri, 6 Jun 2025 10:24:25 -0500
"Bowman, Terry" <terry.bowman@amd.com> wrote:
> On 6/6/2025 9:41 AM, Bowman, Terry wrote:
> >
> > On 6/6/2025 4:08 AM, Shiju Jose wrote:
> >>> -----Original Message-----
> >>> From: Terry Bowman <terry.bowman@amd.com>
> >>> Sent: 03 June 2025 18:23
> >>> To: PradeepVineshReddy.Kodamati@amd.com; dave@stgolabs.net; Jonathan
> >>> Cameron <jonathan.cameron@huawei.com>; dave.jiang@intel.com;
> >>> alison.schofield@intel.com; vishal.l.verma@intel.com; ira.weiny@intel.com;
> >>> dan.j.williams@intel.com; bhelgaas@google.com; bp@alien8.de;
> >>> ming.li@zohomail.com; Shiju Jose <shiju.jose@huawei.com>;
> >>> dan.carpenter@linaro.org; Smita.KoralahalliChannabasappa@amd.com;
> >>> kobayashi.da-06@fujitsu.com; terry.bowman@amd.com; yanfei.xu@intel.com;
> >>> rrichter@amd.com; peterz@infradead.org; colyli@suse.de;
> >>> uaisheng.ye@intel.com; fabio.m.de.francesco@linux.intel.com;
> >>> ilpo.jarvinen@linux.intel.com; yazen.ghannam@amd.com; linux-
> >>> cxl@vger.kernel.org; linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
> >>> Subject: [PATCH v9 10/16] cxl/pci: Unify CXL trace logging for CXL Endpoints and
> >>> CXL Ports
> >>>
> >>> CXL currently has separate trace routines for CXL Port errors and CXL Endpoint
> >>> errors. This is inconvenient for the user because they must enable
> >>> 2 sets of trace routines. Make updates to the trace logging such that a single
> >>> trace routine logs both CXL Endpoint and CXL Port protocol errors.
> >>>
> >>> Rename the 'host' field from the CXL Endpoint trace to 'parent' in the unified
> >>> trace routines. 'host' does not correctly apply to CXL Port devices. Parent is more
> >>> general and applies to CXL Port devices and CXL Endpoints.
> >>>
> >>> Add serial number parameter to the trace logging. This is used for EPs and 0 is
> >>> provided for CXL port devices without a serial number.
> >>>
> >>> Below is output of correctable and uncorrectable protocol error logging.
> >>> CXL Root Port and CXL Endpoint examples are included below.
> >>>
> >>> Root Port:
> >>> cxl_aer_correctable_error: device=0000:0c:00.0 parent=pci0000:0c serial: 0
> >>> status='CRC Threshold Hit'
> >>> cxl_aer_uncorrectable_error: device=0000:0c:00.0 parent=pci0000:0c serial: 0
> >>> status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
> >>> Error'
> >>>
> >>> Endpoint:
> >>> cxl_aer_correctable_error: device=mem3 parent=0000:0f:00.0 serial=0
> >>> status='CRC Threshold Hit'
> >>> cxl_aer_uncorrectable_error: device=mem3 parent=0000:0f:00.0 serial: 0
> >>> status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
> >>> Error'
> >>>
> >>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> >>> ---
> >>> drivers/cxl/core/pci.c | 18 +++++----
> >>> drivers/cxl/core/ras.c | 14 ++++---
> >>> drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
> >>> 3 files changed, 37 insertions(+), 79 deletions(-)
> >>>
> >>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index
> >>> 186a5a20b951..0f4c07fd64a5 100644
> >>> --- a/drivers/cxl/core/pci.c
> >>> +++ b/drivers/cxl/core/pci.c
> >>> @@ -664,7 +664,7 @@ void read_cdat_data(struct cxl_port *port) }
> >>> EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
> >>>
> >> [...]
> >>> static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data
> >>> *data) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index
> >>> 25ebfbc1616c..8c91b0f3d165 100644
> >>> --- a/drivers/cxl/core/trace.h
> >>> +++ b/drivers/cxl/core/trace.h
> >>> @@ -48,49 +48,22 @@
> >>> { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
> >>> )
> >>>
> >>> -TRACE_EVENT(cxl_port_aer_uncorrectable_error,
> >>> - TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
> >>> - TP_ARGS(dev, status, fe, hl),
> >>> - TP_STRUCT__entry(
> >>> - __string(device, dev_name(dev))
> >>> - __string(host, dev_name(dev->parent))
> >>> - __field(u32, status)
> >>> - __field(u32, first_error)
> >>> - __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
> >>> - ),
> >>> - TP_fast_assign(
> >>> - __assign_str(device);
> >>> - __assign_str(host);
> >>> - __entry->status = status;
> >>> - __entry->first_error = fe;
> >>> - /*
> >>> - * Embed the 512B headerlog data for user app retrieval and
> >>> - * parsing, but no need to print this in the trace buffer.
> >>> - */
> >>> - memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
> >>> - ),
> >>> - TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
> >>> - __get_str(device), __get_str(host),
> >>> - show_uc_errs(__entry->status),
> >>> - show_uc_errs(__entry->first_error)
> >>> - )
> >>> -);
> >>> -
> >>> TRACE_EVENT(cxl_aer_uncorrectable_error,
> >>> - TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
> >>> *hl),
> >>> - TP_ARGS(cxlmd, status, fe, hl),
> >>> + TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
> >>> + u32 *hl),
> >>> + TP_ARGS(dev, serial, status, fe, hl),
> >>> TP_STRUCT__entry(
> >>> - __string(memdev, dev_name(&cxlmd->dev))
> >>> - __string(host, dev_name(cxlmd->dev.parent))
> >>> + __string(name, dev_name(dev))
> >>> + __string(parent, dev_name(dev->parent))
> >> Hi Terry,
> >>
> >> As we pointed out in v8, renaming the fields "memdev" to "name" and "host" to "parent"
> >> causes issues and failures in userspace rasdaemon while parsing the trace event data.
> >> Additionally, we can't rename these fields in rasdaemon due to backward compatibility.
> > Yes, I remember but didn't understand why other SW couldn't be updated to handle. I will
> > change as you request but many people will be confused why a port device's name is labeled
> > as a memdev. memdev is only correct for EPs and does not correctly reflect *any* of the
> > other CXL device types (RP, USP, DSP).
RAS trace points are ABI so you can introduce whatever you like that is new, but you can't
remove or rename existing elements. If userspace breaks (which Shiju is confirming it will)
and anyone reports the regression, then you get grumpy Linus.
Test this stuff against upstream rasdaemon. It's easy and means things
like this don't sneak in - various cloud vendors have stated it's what their
stacks are based on, so they will see this in production if they get a mismatch.
https://github.com/mchehab/rasdaemon/blob/master/ras-cxl-handler.c#L354
It think this will error out if an AER UE does not contain the memdev field.
So your options are a new tracepoint, or leave those fields in place and just
let them contain whatever is most useful. If you go with new tracepoint you
still should generate the old one as well if it is enabled. Note that
even adding fields can be a bit painful as it requires some DB restructuring
on an upgrade of rasdaemon (or dumping existing logs and starting from scratch).
+CC Mauro in case I've missed any other ways to deal with this change.
> >
> >>> __field(u64, serial)
> >>> __field(u32, status)
> >>> __field(u32, first_error)
> >>> __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
> >>> ),
> >>> TP_fast_assign(
> >>> - __assign_str(memdev);
> >>> - __assign_str(host);
> >>> - __entry->serial = cxlmd->cxlds->serial;
> >>> + __assign_str(name);
> >>> + __assign_str(parent);
> >>> + __entry->serial = serial;
> >>> __entry->status = status;
> >>> __entry->first_error = fe;
> >>> /*
> >>> @@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> >>> */
> >>> memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
> >>> ),
> >>> - TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
> >>> '%s'",
> >>> - __get_str(memdev), __get_str(host), __entry->serial,
> >>> + TP_printk("device=%s parent=%s serial=%lld status='%s'
> >>> first_error='%s'",
> >>> + __get_str(name), __get_str(parent), __entry->serial,
> >>> show_uc_errs(__entry->status),
> >>> show_uc_errs(__entry->first_error)
> >>> )
> >>> @@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> >>> { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer"
> >>> } \
> >>> )
> >>>
> >>> -TRACE_EVENT(cxl_port_aer_correctable_error,
> >>> - TP_PROTO(struct device *dev, u32 status),
> >>> - TP_ARGS(dev, status),
> >>> - TP_STRUCT__entry(
> >>> - __string(device, dev_name(dev))
> >>> - __string(host, dev_name(dev->parent))
> >>> - __field(u32, status)
> >>> - ),
> >>> - TP_fast_assign(
> >>> - __assign_str(device);
> >>> - __assign_str(host);
> >>> - __entry->status = status;
> >>> - ),
> >>> - TP_printk("device=%s host=%s status='%s'",
> >>> - __get_str(device), __get_str(host),
> >>> - show_ce_errs(__entry->status)
> >>> - )
> >>> -);
> >>> -
> >>> TRACE_EVENT(cxl_aer_correctable_error,
> >>> - TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
> >>> - TP_ARGS(cxlmd, status),
> >>> + TP_PROTO(struct device *dev, u64 serial, u32 status),
> >>> + TP_ARGS(dev, serial, status),
> >>> TP_STRUCT__entry(
> >>> - __string(memdev, dev_name(&cxlmd->dev))
> >>> - __string(host, dev_name(cxlmd->dev.parent))
> >>> + __string(name, dev_name(dev))
> >>> + __string(parent, dev_name(dev->parent))
> >> Renaming these fields is an issue for userspace as mentioned above
> >> in cxl_aer_uncorrectable_error.
> > I understand, I'll revert as you request.
> >
> > Terry
>
> I'll update the commit message with explanation for leaving as-is.
>
> Terry
> >>> __field(u64, serial)
> >>> __field(u32, status)
> >>> ),
> >>> TP_fast_assign(
> >>> - __assign_str(memdev);
> >>> - __assign_str(host);
> >>> - __entry->serial = cxlmd->cxlds->serial;
> >>> + __assign_str(name);
> >>> + __assign_str(parent);
> >>> + __entry->serial = serial;
> >>> __entry->status = status;
> >>> ),
> >>> - TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
> >>> - __get_str(memdev), __get_str(host), __entry->serial,
> >>> + TP_printk("device=%s parent=%s serial=%lld status='%s'",
> >>> + __get_str(name), __get_str(parent), __entry->serial,
> >>> show_ce_errs(__entry->status)
> >>> )
> >>> );
> >>> --
> >>> 2.34.1
> >> Thanks,
> >> Shiju
>
On 6/3/25 10:22 AM, Terry Bowman wrote:
> CXL currently has separate trace routines for CXL Port errors and CXL
> Endpoint errors. This is inconvenient for the user because they must enable
> 2 sets of trace routines. Make updates to the trace logging such that a
> single trace routine logs both CXL Endpoint and CXL Port protocol errors.
>
> Rename the 'host' field from the CXL Endpoint trace to 'parent' in the
> unified trace routines. 'host' does not correctly apply to CXL Port
> devices. Parent is more general and applies to CXL Port devices and CXL
> Endpoints.
>
> Add serial number parameter to the trace logging. This is used for EPs
> and 0 is provided for CXL port devices without a serial number.
>
> Below is output of correctable and uncorrectable protocol error logging.
> CXL Root Port and CXL Endpoint examples are included below.
>
> Root Port:
> cxl_aer_correctable_error: device=0000:0c:00.0 parent=pci0000:0c serial: 0 status='CRC Threshold Hit'
> cxl_aer_uncorrectable_error: device=0000:0c:00.0 parent=pci0000:0c serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
>
> Endpoint:
> cxl_aer_correctable_error: device=mem3 parent=0000:0f:00.0 serial=0 status='CRC Threshold Hit'
> cxl_aer_uncorrectable_error: device=mem3 parent=0000:0f:00.0 serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> ---
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> drivers/cxl/core/pci.c | 18 +++++----
> drivers/cxl/core/ras.c | 14 ++++---
> drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
> 3 files changed, 37 insertions(+), 79 deletions(-)
>
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 186a5a20b951..0f4c07fd64a5 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -664,7 +664,7 @@ void read_cdat_data(struct cxl_port *port)
> }
> EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
>
> -static void __cxl_handle_cor_ras(struct device *dev,
> +static void __cxl_handle_cor_ras(struct device *dev, u64 serial,
> void __iomem *ras_base)
> {
> void __iomem *addr;
> @@ -679,13 +679,13 @@ static void __cxl_handle_cor_ras(struct device *dev,
> status = readl(addr);
> if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
> writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
> - trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status);
> + trace_cxl_aer_correctable_error(dev, serial, status);
> }
> }
>
> static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds)
> {
> - return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
> + return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
> }
>
> /* CXL spec rev3.0 8.2.4.16.1 */
> @@ -709,7 +709,8 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
> * Log the state of the RAS status registers and prepare them to log the
> * next error status. Return 1 if reset needed.
> */
> -static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
> +static bool __cxl_handle_ras(struct device *dev, u64 serial,
> + void __iomem *ras_base)
> {
> u32 hl[CXL_HEADERLOG_SIZE_U32];
> void __iomem *addr;
> @@ -738,7 +739,7 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
> }
>
> header_log_copy(ras_base, hl);
> - trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl);
> + trace_cxl_aer_uncorrectable_error(dev, serial, status, fe, hl);
> writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
>
> return true;
> @@ -746,7 +747,8 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>
> static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
> {
> - return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
> +
> + return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
> }
>
> #ifdef CONFIG_PCIEAER_CXL
> @@ -754,13 +756,13 @@ static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
> static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds,
> struct cxl_dport *dport)
> {
> - return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, dport->regs.ras);
> + return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial, dport->regs.ras);
> }
>
> static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds,
> struct cxl_dport *dport)
> {
> - return __cxl_handle_ras(&cxlds->cxlmd->dev, dport->regs.ras);
> + return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, dport->regs.ras);
> }
>
> /*
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 715f7221ea3a..0ef8c2068c0c 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -13,7 +13,7 @@ static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
> {
> u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
>
> - trace_cxl_port_aer_correctable_error(&pdev->dev, status);
> + trace_cxl_aer_correctable_error(&pdev->dev, 0, status);
> }
>
> static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
> @@ -28,8 +28,8 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
> else
> fe = status;
>
> - trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
> - ras_cap.header_log);
> + trace_cxl_aer_uncorrectable_error(&pdev->dev, 0, status, fe,
> + ras_cap.header_log);
> }
>
> static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
> @@ -42,7 +42,8 @@ static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
> if (!cxlds)
> return;
>
> - trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
> + trace_cxl_aer_correctable_error(&cxlds->cxlmd->dev, cxlds->serial,
> + status);
> }
>
> static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
> @@ -62,8 +63,9 @@ static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
> else
> fe = status;
>
> - trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
> - ras_cap.header_log);
> + trace_cxl_aer_uncorrectable_error(&cxlds->cxlmd->dev,
> + cxlds->serial, status,
> + fe, ras_cap.header_log);
> }
>
> static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
> diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
> index 25ebfbc1616c..8c91b0f3d165 100644
> --- a/drivers/cxl/core/trace.h
> +++ b/drivers/cxl/core/trace.h
> @@ -48,49 +48,22 @@
> { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
> )
>
> -TRACE_EVENT(cxl_port_aer_uncorrectable_error,
> - TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
> - TP_ARGS(dev, status, fe, hl),
> - TP_STRUCT__entry(
> - __string(device, dev_name(dev))
> - __string(host, dev_name(dev->parent))
> - __field(u32, status)
> - __field(u32, first_error)
> - __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
> - ),
> - TP_fast_assign(
> - __assign_str(device);
> - __assign_str(host);
> - __entry->status = status;
> - __entry->first_error = fe;
> - /*
> - * Embed the 512B headerlog data for user app retrieval and
> - * parsing, but no need to print this in the trace buffer.
> - */
> - memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
> - ),
> - TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
> - __get_str(device), __get_str(host),
> - show_uc_errs(__entry->status),
> - show_uc_errs(__entry->first_error)
> - )
> -);
> -
> TRACE_EVENT(cxl_aer_uncorrectable_error,
> - TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
> - TP_ARGS(cxlmd, status, fe, hl),
> + TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
> + u32 *hl),
> + TP_ARGS(dev, serial, status, fe, hl),
> TP_STRUCT__entry(
> - __string(memdev, dev_name(&cxlmd->dev))
> - __string(host, dev_name(cxlmd->dev.parent))
> + __string(name, dev_name(dev))
> + __string(parent, dev_name(dev->parent))
> __field(u64, serial)
> __field(u32, status)
> __field(u32, first_error)
> __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
> ),
> TP_fast_assign(
> - __assign_str(memdev);
> - __assign_str(host);
> - __entry->serial = cxlmd->cxlds->serial;
> + __assign_str(name);
> + __assign_str(parent);
> + __entry->serial = serial;
> __entry->status = status;
> __entry->first_error = fe;
> /*
> @@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> */
> memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
> ),
> - TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error: '%s'",
> - __get_str(memdev), __get_str(host), __entry->serial,
> + TP_printk("device=%s parent=%s serial=%lld status='%s' first_error='%s'",
> + __get_str(name), __get_str(parent), __entry->serial,
> show_uc_errs(__entry->status),
> show_uc_errs(__entry->first_error)
> )
> @@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \
> )
>
> -TRACE_EVENT(cxl_port_aer_correctable_error,
> - TP_PROTO(struct device *dev, u32 status),
> - TP_ARGS(dev, status),
> - TP_STRUCT__entry(
> - __string(device, dev_name(dev))
> - __string(host, dev_name(dev->parent))
> - __field(u32, status)
> - ),
> - TP_fast_assign(
> - __assign_str(device);
> - __assign_str(host);
> - __entry->status = status;
> - ),
> - TP_printk("device=%s host=%s status='%s'",
> - __get_str(device), __get_str(host),
> - show_ce_errs(__entry->status)
> - )
> -);
> -
> TRACE_EVENT(cxl_aer_correctable_error,
> - TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
> - TP_ARGS(cxlmd, status),
> + TP_PROTO(struct device *dev, u64 serial, u32 status),
> + TP_ARGS(dev, serial, status),
> TP_STRUCT__entry(
> - __string(memdev, dev_name(&cxlmd->dev))
> - __string(host, dev_name(cxlmd->dev.parent))
> + __string(name, dev_name(dev))
> + __string(parent, dev_name(dev->parent))
> __field(u64, serial)
> __field(u32, status)
> ),
> TP_fast_assign(
> - __assign_str(memdev);
> - __assign_str(host);
> - __entry->serial = cxlmd->cxlds->serial;
> + __assign_str(name);
> + __assign_str(parent);
> + __entry->serial = serial;
> __entry->status = status;
> ),
> - TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
> - __get_str(memdev), __get_str(host), __entry->serial,
> + TP_printk("device=%s parent=%s serial=%lld status='%s'",
> + __get_str(name), __get_str(parent), __entry->serial,
> show_ce_errs(__entry->status)
> )
> );
--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer
© 2016 - 2025 Red Hat, Inc.