CXL currently has separate trace routines for CXL Port errors and CXL
Endpoint errors. This is inconvenient for the user because they must enable
2 sets of trace routines. Make updates to the trace logging such that a
single trace routine logs both CXL Endpoint and CXL Port protocol errors.
Keep the trace log fields 'memdev' and 'host'. While these are not accurate
for non-Endpoints the fields will remain as-is to prevent breaking userspace
RAS trace consumers.
Add serial number parameter to the trace logging. This is used for EPs
and 0 is provided for CXL port devices without a serial number.
Below is output of correctable and uncorrectable protocol error logging.
CXL Root Port and CXL Endpoint examples are included below.
Root Port:
cxl_aer_correctable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status='CRC Threshold Hit'
cxl_aer_uncorrectable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
Endpoint:
cxl_aer_correctable_error: memdev=mem3 host=0000:0f:00.0 serial=0 status='CRC Threshold Hit'
cxl_aer_uncorrectable_error: memdev=mem3 host=0000:0f:00.0 serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
drivers/cxl/core/pci.c | 19 ++++-----
drivers/cxl/core/ras.c | 14 ++++---
drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
3 files changed, 37 insertions(+), 80 deletions(-)
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index c9a4b528e0b8..156ce094a8b9 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -664,8 +664,8 @@ void read_cdat_data(struct cxl_port *port)
}
EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
-static void cxl_handle_cor_ras(struct device *dev,
- void __iomem *ras_base)
+static void cxl_handle_cor_ras(struct device *dev, u64 serial,
+ void __iomem *ras_base)
{
void __iomem *addr;
u32 status;
@@ -679,7 +679,7 @@ static void cxl_handle_cor_ras(struct device *dev,
status = readl(addr);
if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
- trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status);
+ trace_cxl_aer_correctable_error(dev, serial, status);
}
}
@@ -704,7 +704,8 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
* Log the state of the RAS status registers and prepare them to log the
* next error status. Return 1 if reset needed.
*/
-static bool cxl_handle_ras(struct device *dev, void __iomem *ras_base)
+static bool cxl_handle_ras(struct device *dev, u64 serial,
+ void __iomem *ras_base)
{
u32 hl[CXL_HEADERLOG_SIZE_U32];
void __iomem *addr;
@@ -733,7 +734,7 @@ static bool cxl_handle_ras(struct device *dev, void __iomem *ras_base)
}
header_log_copy(ras_base, hl);
- trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl);
+ trace_cxl_aer_uncorrectable_error(dev, serial, status, fe, hl);
writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
return true;
@@ -744,13 +745,13 @@ static bool cxl_handle_ras(struct device *dev, void __iomem *ras_base)
static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds,
struct cxl_dport *dport)
{
- cxl_handle_cor_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+ cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial, dport->regs.ras);
}
static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds,
struct cxl_dport *dport)
{
- return cxl_handle_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+ return cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, dport->regs.ras);
}
/*
@@ -847,7 +848,7 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
if (cxlds->rcd)
cxl_handle_rdport_errors(cxlds);
- cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+ cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
}
}
EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
@@ -876,7 +877,7 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
* chance the situation is recoverable dump the status of the RAS
* capability registers and bounce the active state of the memdev.
*/
- ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+ ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
}
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 962dc94fed8c..9588b39faabd 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -13,7 +13,7 @@ static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
{
u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
- trace_cxl_port_aer_correctable_error(&pdev->dev, status);
+ trace_cxl_aer_correctable_error(&pdev->dev, 0, status);
}
static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
@@ -28,8 +28,8 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
else
fe = status;
- trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
- ras_cap.header_log);
+ trace_cxl_aer_uncorrectable_error(&pdev->dev, 0, status, fe,
+ ras_cap.header_log);
}
static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
@@ -42,7 +42,8 @@ static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
if (!cxlds)
return;
- trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+ trace_cxl_aer_correctable_error(&cxlds->cxlmd->dev, cxlds->serial,
+ status);
}
static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
@@ -62,8 +63,9 @@ static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
else
fe = status;
- trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
- ras_cap.header_log);
+ trace_cxl_aer_uncorrectable_error(&cxlds->cxlmd->dev,
+ cxlds->serial, status,
+ fe, ras_cap.header_log);
}
static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index 25ebfbc1616c..494d6db461a7 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -48,49 +48,22 @@
{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
)
-TRACE_EVENT(cxl_port_aer_uncorrectable_error,
- TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
- TP_ARGS(dev, status, fe, hl),
- TP_STRUCT__entry(
- __string(device, dev_name(dev))
- __string(host, dev_name(dev->parent))
- __field(u32, status)
- __field(u32, first_error)
- __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
- ),
- TP_fast_assign(
- __assign_str(device);
- __assign_str(host);
- __entry->status = status;
- __entry->first_error = fe;
- /*
- * Embed the 512B headerlog data for user app retrieval and
- * parsing, but no need to print this in the trace buffer.
- */
- memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
- ),
- TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
- __get_str(device), __get_str(host),
- show_uc_errs(__entry->status),
- show_uc_errs(__entry->first_error)
- )
-);
-
TRACE_EVENT(cxl_aer_uncorrectable_error,
- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
- TP_ARGS(cxlmd, status, fe, hl),
+ TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
+ u32 *hl),
+ TP_ARGS(dev, serial, status, fe, hl),
TP_STRUCT__entry(
- __string(memdev, dev_name(&cxlmd->dev))
- __string(host, dev_name(cxlmd->dev.parent))
+ __string(name, dev_name(dev))
+ __string(parent, dev_name(dev->parent))
__field(u64, serial)
__field(u32, status)
__field(u32, first_error)
__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
),
TP_fast_assign(
- __assign_str(memdev);
- __assign_str(host);
- __entry->serial = cxlmd->cxlds->serial;
+ __assign_str(name);
+ __assign_str(parent);
+ __entry->serial = serial;
__entry->status = status;
__entry->first_error = fe;
/*
@@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
*/
memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
),
- TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error: '%s'",
- __get_str(memdev), __get_str(host), __entry->serial,
+ TP_printk("memdev=%s host=%s serial=%lld status='%s' first_error='%s'",
+ __get_str(name), __get_str(parent), __entry->serial,
show_uc_errs(__entry->status),
show_uc_errs(__entry->first_error)
)
@@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \
)
-TRACE_EVENT(cxl_port_aer_correctable_error,
- TP_PROTO(struct device *dev, u32 status),
- TP_ARGS(dev, status),
- TP_STRUCT__entry(
- __string(device, dev_name(dev))
- __string(host, dev_name(dev->parent))
- __field(u32, status)
- ),
- TP_fast_assign(
- __assign_str(device);
- __assign_str(host);
- __entry->status = status;
- ),
- TP_printk("device=%s host=%s status='%s'",
- __get_str(device), __get_str(host),
- show_ce_errs(__entry->status)
- )
-);
-
TRACE_EVENT(cxl_aer_correctable_error,
- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
- TP_ARGS(cxlmd, status),
+ TP_PROTO(struct device *dev, u64 serial, u32 status),
+ TP_ARGS(dev, serial, status),
TP_STRUCT__entry(
- __string(memdev, dev_name(&cxlmd->dev))
- __string(host, dev_name(cxlmd->dev.parent))
+ __string(name, dev_name(dev))
+ __string(parent, dev_name(dev->parent))
__field(u64, serial)
__field(u32, status)
),
TP_fast_assign(
- __assign_str(memdev);
- __assign_str(host);
- __entry->serial = cxlmd->cxlds->serial;
+ __assign_str(name);
+ __assign_str(parent);
+ __entry->serial = serial;
__entry->status = status;
),
- TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
- __get_str(memdev), __get_str(host), __entry->serial,
+ TP_printk("memdev=%s host=%s serial=%lld status='%s'",
+ __get_str(name), __get_str(parent), __entry->serial,
show_ce_errs(__entry->status)
)
);
--
2.34.1
>-----Original Message-----
>From: Terry Bowman <terry.bowman@amd.com>
>Sent: 26 June 2025 23:43
>To: dave@stgolabs.net; Jonathan Cameron <jonathan.cameron@huawei.com>;
>dave.jiang@intel.com; alison.schofield@intel.com; dan.j.williams@intel.com;
>bhelgaas@google.com; Shiju Jose <shiju.jose@huawei.com>;
>ming.li@zohomail.com; Smita.KoralahalliChannabasappa@amd.com;
>rrichter@amd.com; dan.carpenter@linaro.org;
>PradeepVineshReddy.Kodamati@amd.com; lukas@wunner.de;
>Benjamin.Cheatham@amd.com;
>sathyanarayanan.kuppuswamy@linux.intel.com; terry.bowman@amd.com;
>linux-cxl@vger.kernel.org
>Cc: linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
>Subject: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints
>and CXL Ports
>
>CXL currently has separate trace routines for CXL Port errors and CXL Endpoint
>errors. This is inconvenient for the user because they must enable
>2 sets of trace routines. Make updates to the trace logging such that a single
>trace routine logs both CXL Endpoint and CXL Port protocol errors.
>
>Keep the trace log fields 'memdev' and 'host'. While these are not accurate for
>non-Endpoints the fields will remain as-is to prevent breaking userspace RAS
>trace consumers.
>
>Add serial number parameter to the trace logging. This is used for EPs and 0 is
>provided for CXL port devices without a serial number.
>
>Below is output of correctable and uncorrectable protocol error logging.
>CXL Root Port and CXL Endpoint examples are included below.
>
>Root Port:
>cxl_aer_correctable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0
>status='CRC Threshold Hit'
>cxl_aer_uncorrectable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0
>status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
>Error'
>
>Endpoint:
>cxl_aer_correctable_error: memdev=mem3 host=0000:0f:00.0 serial=0
>status='CRC Threshold Hit'
>cxl_aer_uncorrectable_error: memdev=mem3 host=0000:0f:00.0 serial: 0 status:
>'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
>
>Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>---
> drivers/cxl/core/pci.c | 19 ++++-----
> drivers/cxl/core/ras.c | 14 ++++---
> drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
> 3 files changed, 37 insertions(+), 80 deletions(-)
>
[...]
>
> static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data
>*data) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index
>25ebfbc1616c..494d6db461a7 100644
>--- a/drivers/cxl/core/trace.h
>+++ b/drivers/cxl/core/trace.h
>@@ -48,49 +48,22 @@
> { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
> )
>
>-TRACE_EVENT(cxl_port_aer_uncorrectable_error,
>- TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
>- TP_ARGS(dev, status, fe, hl),
>- TP_STRUCT__entry(
>- __string(device, dev_name(dev))
>- __string(host, dev_name(dev->parent))
>- __field(u32, status)
>- __field(u32, first_error)
>- __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>- ),
>- TP_fast_assign(
>- __assign_str(device);
>- __assign_str(host);
>- __entry->status = status;
>- __entry->first_error = fe;
>- /*
>- * Embed the 512B headerlog data for user app retrieval and
>- * parsing, but no need to print this in the trace buffer.
>- */
>- memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>- ),
>- TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
>- __get_str(device), __get_str(host),
>- show_uc_errs(__entry->status),
>- show_uc_errs(__entry->first_error)
>- )
>-);
>-
> TRACE_EVENT(cxl_aer_uncorrectable_error,
>- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
>*hl),
>- TP_ARGS(cxlmd, status, fe, hl),
>+ TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
>+ u32 *hl),
>+ TP_ARGS(dev, serial, status, fe, hl),
> TP_STRUCT__entry(
>- __string(memdev, dev_name(&cxlmd->dev))
>- __string(host, dev_name(cxlmd->dev.parent))
>+ __string(name, dev_name(dev))
>+ __string(parent, dev_name(dev->parent))
Hi Terry,
Thanks for considering the feedback given in v9 regarding the compatibility issue
with the rasdaemon.
https://lore.kernel.org/all/959acc682e6e4b52ac0283b37ee21026@huawei.com/
Probably some confusion w.r.t the feedback.
Unfortunately TP_printk(...) is not an ABI that we need to keep stable,
it's this structure, TP_STRUCT__entry(..) , that matters to the rasdaemon.
> __field(u64, serial)
> __field(u32, status)
> __field(u32, first_error)
> __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
> ),
> TP_fast_assign(
>- __assign_str(memdev);
>- __assign_str(host);
>- __entry->serial = cxlmd->cxlds->serial;
>+ __assign_str(name);
>+ __assign_str(parent);
>+ __entry->serial = serial;
> __entry->status = status;
> __entry->first_error = fe;
> /*
>@@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> */
> memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
> ),
>- TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
>'%s'",
>- __get_str(memdev), __get_str(host), __entry->serial,
>+ TP_printk("memdev=%s host=%s serial=%lld status='%s'
>first_error='%s'",
>+ __get_str(name), __get_str(parent), __entry->serial,
> show_uc_errs(__entry->status),
> show_uc_errs(__entry->first_error)
> )
>@@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer"
>} \
> )
>
>-TRACE_EVENT(cxl_port_aer_correctable_error,
>- TP_PROTO(struct device *dev, u32 status),
>- TP_ARGS(dev, status),
>- TP_STRUCT__entry(
>- __string(device, dev_name(dev))
>- __string(host, dev_name(dev->parent))
>- __field(u32, status)
>- ),
>- TP_fast_assign(
>- __assign_str(device);
>- __assign_str(host);
>- __entry->status = status;
>- ),
>- TP_printk("device=%s host=%s status='%s'",
>- __get_str(device), __get_str(host),
>- show_ce_errs(__entry->status)
>- )
>-);
>-
> TRACE_EVENT(cxl_aer_correctable_error,
>- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
>- TP_ARGS(cxlmd, status),
>+ TP_PROTO(struct device *dev, u64 serial, u32 status),
>+ TP_ARGS(dev, serial, status),
> TP_STRUCT__entry(
>- __string(memdev, dev_name(&cxlmd->dev))
>- __string(host, dev_name(cxlmd->dev.parent))
>+ __string(name, dev_name(dev))
>+ __string(parent, dev_name(dev->parent))
Same as above.
> __field(u64, serial)
> __field(u32, status)
> ),
> TP_fast_assign(
>- __assign_str(memdev);
>- __assign_str(host);
>- __entry->serial = cxlmd->cxlds->serial;
>+ __assign_str(name);
>+ __assign_str(parent);
>+ __entry->serial = serial;
> __entry->status = status;
> ),
>- TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
>- __get_str(memdev), __get_str(host), __entry->serial,
>+ TP_printk("memdev=%s host=%s serial=%lld status='%s'",
>+ __get_str(name), __get_str(parent), __entry->serial,
> show_ce_errs(__entry->status)
> )
> );
>--
>2.34.1
Thanks,
Shiju
On 6/27/2025 7:22 AM, Shiju Jose wrote:
>> -----Original Message-----
>> From: Terry Bowman <terry.bowman@amd.com>
>> Sent: 26 June 2025 23:43
>> To: dave@stgolabs.net; Jonathan Cameron <jonathan.cameron@huawei.com>;
>> dave.jiang@intel.com; alison.schofield@intel.com; dan.j.williams@intel.com;
>> bhelgaas@google.com; Shiju Jose <shiju.jose@huawei.com>;
>> ming.li@zohomail.com; Smita.KoralahalliChannabasappa@amd.com;
>> rrichter@amd.com; dan.carpenter@linaro.org;
>> PradeepVineshReddy.Kodamati@amd.com; lukas@wunner.de;
>> Benjamin.Cheatham@amd.com;
>> sathyanarayanan.kuppuswamy@linux.intel.com; terry.bowman@amd.com;
>> linux-cxl@vger.kernel.org
>> Cc: linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
>> Subject: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints
>> and CXL Ports
>>
>> CXL currently has separate trace routines for CXL Port errors and CXL Endpoint
>> errors. This is inconvenient for the user because they must enable
>> 2 sets of trace routines. Make updates to the trace logging such that a single
>> trace routine logs both CXL Endpoint and CXL Port protocol errors.
>>
>> Keep the trace log fields 'memdev' and 'host'. While these are not accurate for
>> non-Endpoints the fields will remain as-is to prevent breaking userspace RAS
>> trace consumers.
>>
>> Add serial number parameter to the trace logging. This is used for EPs and 0 is
>> provided for CXL port devices without a serial number.
>>
>> Below is output of correctable and uncorrectable protocol error logging.
>> CXL Root Port and CXL Endpoint examples are included below.
>>
>> Root Port:
>> cxl_aer_correctable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0
>> status='CRC Threshold Hit'
>> cxl_aer_uncorrectable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0
>> status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
>> Error'
>>
>> Endpoint:
>> cxl_aer_correctable_error: memdev=mem3 host=0000:0f:00.0 serial=0
>> status='CRC Threshold Hit'
>> cxl_aer_uncorrectable_error: memdev=mem3 host=0000:0f:00.0 serial: 0 status:
>> 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>> drivers/cxl/core/pci.c | 19 ++++-----
>> drivers/cxl/core/ras.c | 14 ++++---
>> drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
>> 3 files changed, 37 insertions(+), 80 deletions(-)
>>
> [...]
>> static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data
>> *data) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index
>> 25ebfbc1616c..494d6db461a7 100644
>> --- a/drivers/cxl/core/trace.h
>> +++ b/drivers/cxl/core/trace.h
>> @@ -48,49 +48,22 @@
>> { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
>> )
>>
>> -TRACE_EVENT(cxl_port_aer_uncorrectable_error,
>> - TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
>> - TP_ARGS(dev, status, fe, hl),
>> - TP_STRUCT__entry(
>> - __string(device, dev_name(dev))
>> - __string(host, dev_name(dev->parent))
>> - __field(u32, status)
>> - __field(u32, first_error)
>> - __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>> - ),
>> - TP_fast_assign(
>> - __assign_str(device);
>> - __assign_str(host);
>> - __entry->status = status;
>> - __entry->first_error = fe;
>> - /*
>> - * Embed the 512B headerlog data for user app retrieval and
>> - * parsing, but no need to print this in the trace buffer.
>> - */
>> - memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>> - ),
>> - TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
>> - __get_str(device), __get_str(host),
>> - show_uc_errs(__entry->status),
>> - show_uc_errs(__entry->first_error)
>> - )
>> -);
>> -
>> TRACE_EVENT(cxl_aer_uncorrectable_error,
>> - TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
>> *hl),
>> - TP_ARGS(cxlmd, status, fe, hl),
>> + TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
>> + u32 *hl),
>> + TP_ARGS(dev, serial, status, fe, hl),
>> TP_STRUCT__entry(
>> - __string(memdev, dev_name(&cxlmd->dev))
>> - __string(host, dev_name(cxlmd->dev.parent))
>> + __string(name, dev_name(dev))
>> + __string(parent, dev_name(dev->parent))
> Hi Terry,
>
> Thanks for considering the feedback given in v9 regarding the compatibility issue
> with the rasdaemon.
> https://lore.kernel.org/all/959acc682e6e4b52ac0283b37ee21026@huawei.com/
>
> Probably some confusion w.r.t the feedback.
> Unfortunately TP_printk(...) is not an ABI that we need to keep stable,
> it's this structure, TP_STRUCT__entry(..) , that matters to the rasdaemon.
>
>>
Oh. Apologies, I didn't realize TP_STRUCT was an ABI requirement. I will change back
the TP_STRUCT as well.
-Terry
>> __field(u64, serial)
>> __field(u32, status)
>> __field(u32, first_error)
>> __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>> ),
>> TP_fast_assign(
>> - __assign_str(memdev);
>> - __assign_str(host);
>> - __entry->serial = cxlmd->cxlds->serial;
>> + __assign_str(name);
>> + __assign_str(parent);
>> + __entry->serial = serial;
>> __entry->status = status;
>> __entry->first_error = fe;
>> /*
>> @@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
>> */
>> memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>> ),
>> - TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
>> '%s'",
>> - __get_str(memdev), __get_str(host), __entry->serial,
>> + TP_printk("memdev=%s host=%s serial=%lld status='%s'
>> first_error='%s'",
>> + __get_str(name), __get_str(parent), __entry->serial,
>> show_uc_errs(__entry->status),
>> show_uc_errs(__entry->first_error)
>> )
>> @@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
>> { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer"
>> } \
>> )
>>
>> -TRACE_EVENT(cxl_port_aer_correctable_error,
>> - TP_PROTO(struct device *dev, u32 status),
>> - TP_ARGS(dev, status),
>> - TP_STRUCT__entry(
>> - __string(device, dev_name(dev))
>> - __string(host, dev_name(dev->parent))
>> - __field(u32, status)
>> - ),
>> - TP_fast_assign(
>> - __assign_str(device);
>> - __assign_str(host);
>> - __entry->status = status;
>> - ),
>> - TP_printk("device=%s host=%s status='%s'",
>> - __get_str(device), __get_str(host),
>> - show_ce_errs(__entry->status)
>> - )
>> -);
>> -
>> TRACE_EVENT(cxl_aer_correctable_error,
>> - TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
>> - TP_ARGS(cxlmd, status),
>> + TP_PROTO(struct device *dev, u64 serial, u32 status),
>> + TP_ARGS(dev, serial, status),
>> TP_STRUCT__entry(
>> - __string(memdev, dev_name(&cxlmd->dev))
>> - __string(host, dev_name(cxlmd->dev.parent))
>> + __string(name, dev_name(dev))
>> + __string(parent, dev_name(dev->parent))
> Same as above.
>> __field(u64, serial)
>> __field(u32, status)
>> ),
>> TP_fast_assign(
>> - __assign_str(memdev);
>> - __assign_str(host);
>> - __entry->serial = cxlmd->cxlds->serial;
>> + __assign_str(name);
>> + __assign_str(parent);
>> + __entry->serial = serial;
>> __entry->status = status;
>> ),
>> - TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
>> - __get_str(memdev), __get_str(host), __entry->serial,
>> + TP_printk("memdev=%s host=%s serial=%lld status='%s'",
>> + __get_str(name), __get_str(parent), __entry->serial,
>> show_ce_errs(__entry->status)
>> )
>> );
>> --
>> 2.34.1
> Thanks,
> Shiju
On Fri, Jun 27, 2025 at 12:22:39PM +0000, Shiju Jose wrote:
> >-----Original Message-----
> >From: Terry Bowman <terry.bowman@amd.com>
> >Sent: 26 June 2025 23:43
> >To: dave@stgolabs.net; Jonathan Cameron <jonathan.cameron@huawei.com>;
> >dave.jiang@intel.com; alison.schofield@intel.com; dan.j.williams@intel.com;
> >bhelgaas@google.com; Shiju Jose <shiju.jose@huawei.com>;
> >ming.li@zohomail.com; Smita.KoralahalliChannabasappa@amd.com;
> >rrichter@amd.com; dan.carpenter@linaro.org;
> >PradeepVineshReddy.Kodamati@amd.com; lukas@wunner.de;
> >Benjamin.Cheatham@amd.com;
> >sathyanarayanan.kuppuswamy@linux.intel.com; terry.bowman@amd.com;
> >linux-cxl@vger.kernel.org
> >Cc: linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
> >Subject: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints
> >and CXL Ports
> >
big snip -
> >-);
> >-
> > TRACE_EVENT(cxl_aer_uncorrectable_error,
> >- TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
> >*hl),
> >- TP_ARGS(cxlmd, status, fe, hl),
> >+ TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
> >+ u32 *hl),
> >+ TP_ARGS(dev, serial, status, fe, hl),
> > TP_STRUCT__entry(
> >- __string(memdev, dev_name(&cxlmd->dev))
> >- __string(host, dev_name(cxlmd->dev.parent))
> >+ __string(name, dev_name(dev))
> >+ __string(parent, dev_name(dev->parent))
>
> Hi Terry,
>
> Thanks for considering the feedback given in v9 regarding the compatibility issue
> with the rasdaemon.
> https://lore.kernel.org/all/959acc682e6e4b52ac0283b37ee21026@huawei.com/
>
> Probably some confusion w.r.t the feedback.
> Unfortunately TP_printk(...) is not an ABI that we need to keep stable,
> it's this structure, TP_STRUCT__entry(..) , that matters to the rasdaemon.
>
I'm not so sure you should be letting him off the hook for TP_printk ;)
It seems TP_printk should be kept aligned w TP_STRUCT_entry(). As a
user who often looks at TP_printk output, I'd say keep them all in
sync, and consider them ABI - ie. add to but don't modify.
> > __field(u64, serial)
> > __field(u32, status)
> > __field(u32, first_error)
> > __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
> > ),
> > TP_fast_assign(
> >- __assign_str(memdev);
> >- __assign_str(host);
> >- __entry->serial = cxlmd->cxlds->serial;
> >+ __assign_str(name);
> >+ __assign_str(parent);
> >+ __entry->serial = serial;
> > __entry->status = status;
> > __entry->first_error = fe;
> > /*
> >@@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> > */
> > memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
> > ),
> >- TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
> >'%s'",
> >- __get_str(memdev), __get_str(host), __entry->serial,
> >+ TP_printk("memdev=%s host=%s serial=%lld status='%s'
> >first_error='%s'",
> >+ __get_str(name), __get_str(parent), __entry->serial,
> > show_uc_errs(__entry->status),
> > show_uc_errs(__entry->first_error)
> > )
snip
On 7/1/2025 8:18 PM, Alison Schofield wrote:
> On Fri, Jun 27, 2025 at 12:22:39PM +0000, Shiju Jose wrote:
>>> -----Original Message-----
>>> From: Terry Bowman <terry.bowman@amd.com>
>>> Sent: 26 June 2025 23:43
>>> To: dave@stgolabs.net; Jonathan Cameron <jonathan.cameron@huawei.com>;
>>> dave.jiang@intel.com; alison.schofield@intel.com; dan.j.williams@intel.com;
>>> bhelgaas@google.com; Shiju Jose <shiju.jose@huawei.com>;
>>> ming.li@zohomail.com; Smita.KoralahalliChannabasappa@amd.com;
>>> rrichter@amd.com; dan.carpenter@linaro.org;
>>> PradeepVineshReddy.Kodamati@amd.com; lukas@wunner.de;
>>> Benjamin.Cheatham@amd.com;
>>> sathyanarayanan.kuppuswamy@linux.intel.com; terry.bowman@amd.com;
>>> linux-cxl@vger.kernel.org
>>> Cc: linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
>>> Subject: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints
>>> and CXL Ports
>>>
> big snip -
>
>>> -);
>>> -
>>> TRACE_EVENT(cxl_aer_uncorrectable_error,
>>> - TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
>>> *hl),
>>> - TP_ARGS(cxlmd, status, fe, hl),
>>> + TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
>>> + u32 *hl),
>>> + TP_ARGS(dev, serial, status, fe, hl),
>>> TP_STRUCT__entry(
>>> - __string(memdev, dev_name(&cxlmd->dev))
>>> - __string(host, dev_name(cxlmd->dev.parent))
>>> + __string(name, dev_name(dev))
>>> + __string(parent, dev_name(dev->parent))
>> Hi Terry,
>>
>> Thanks for considering the feedback given in v9 regarding the compatibility issue
>> with the rasdaemon.
>> https://lore.kernel.org/all/959acc682e6e4b52ac0283b37ee21026@huawei.com/
>>
>> Probably some confusion w.r.t the feedback.
>> Unfortunately TP_printk(...) is not an ABI that we need to keep stable,
>> it's this structure, TP_STRUCT__entry(..) , that matters to the rasdaemon.
>>
> I'm not so sure you should be letting him off the hook for TP_printk ;)
> It seems TP_printk should be kept aligned w TP_STRUCT_entry(). As a
> user who often looks at TP_printk output, I'd say keep them all in
> sync, and consider them ABI - ie. add to but don't modify.
>
>
I agree. I will keep TP_printk and TP_STRUCT aligned by using 'memdev' and 'host'.
The only change here will be TP_PROTO and will be:
TP_PROTO(struct device *dev, u64 serial, u32 status),
Let me know if that's not ok.
-Terry
>>> __field(u64, serial)
>>> __field(u32, status)
>>> __field(u32, first_error)
>>> __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>>> ),
>>> TP_fast_assign(
>>> - __assign_str(memdev);
>>> - __assign_str(host);
>>> - __entry->serial = cxlmd->cxlds->serial;
>>> + __assign_str(name);
>>> + __assign_str(parent);
>>> + __entry->serial = serial;
>>> __entry->status = status;
>>> __entry->first_error = fe;
>>> /*
>>> @@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
>>> */
>>> memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>>> ),
>>> - TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
>>> '%s'",
>>> - __get_str(memdev), __get_str(host), __entry->serial,
>>> + TP_printk("memdev=%s host=%s serial=%lld status='%s'
>>> first_error='%s'",
>>> + __get_str(name), __get_str(parent), __entry->serial,
>>> show_uc_errs(__entry->status),
>>> show_uc_errs(__entry->first_error)
>>> )
> snip
>
© 2016 - 2026 Red Hat, Inc.