[PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints and CXL Ports

Terry Bowman posted 17 patches 3 months, 1 week ago
There is a newer version of this series
[PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints and CXL Ports
Posted by Terry Bowman 3 months, 1 week ago
CXL currently has separate trace routines for CXL Port errors and CXL
Endpoint errors. This is inconvenient for the user because they must enable
2 sets of trace routines. Make updates to the trace logging such that a
single trace routine logs both CXL Endpoint and CXL Port protocol errors.

Keep the trace log fields 'memdev' and 'host'. While these are not accurate
for non-Endpoints the fields will remain as-is to prevent breaking userspace
RAS trace consumers.

Add serial number parameter to the trace logging. This is used for EPs
and 0 is provided for CXL port devices without a serial number.

Below is output of correctable and uncorrectable protocol error logging.
CXL Root Port and CXL Endpoint examples are included below.

Root Port:
cxl_aer_correctable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status='CRC Threshold Hit'
cxl_aer_uncorrectable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'

Endpoint:
cxl_aer_correctable_error: memdev=mem3 host=0000:0f:00.0 serial=0 status='CRC Threshold Hit'
cxl_aer_uncorrectable_error: memdev=mem3 host=0000:0f:00.0 serial: 0 status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/cxl/core/pci.c   | 19 ++++-----
 drivers/cxl/core/ras.c   | 14 ++++---
 drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
 3 files changed, 37 insertions(+), 80 deletions(-)

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index c9a4b528e0b8..156ce094a8b9 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -664,8 +664,8 @@ void read_cdat_data(struct cxl_port *port)
 }
 EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
 
-static void cxl_handle_cor_ras(struct device *dev,
-				 void __iomem *ras_base)
+static void cxl_handle_cor_ras(struct device *dev, u64 serial,
+			       void __iomem *ras_base)
 {
 	void __iomem *addr;
 	u32 status;
@@ -679,7 +679,7 @@ static void cxl_handle_cor_ras(struct device *dev,
 	status = readl(addr);
 	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
 		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
-		trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status);
+		trace_cxl_aer_correctable_error(dev, serial, status);
 	}
 }
 
@@ -704,7 +704,8 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
  * Log the state of the RAS status registers and prepare them to log the
  * next error status. Return 1 if reset needed.
  */
-static bool cxl_handle_ras(struct device *dev, void __iomem *ras_base)
+static bool cxl_handle_ras(struct device *dev, u64 serial,
+			   void __iomem *ras_base)
 {
 	u32 hl[CXL_HEADERLOG_SIZE_U32];
 	void __iomem *addr;
@@ -733,7 +734,7 @@ static bool cxl_handle_ras(struct device *dev, void __iomem *ras_base)
 	}
 
 	header_log_copy(ras_base, hl);
-	trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl);
+	trace_cxl_aer_uncorrectable_error(dev, serial, status, fe, hl);
 	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
 
 	return true;
@@ -744,13 +745,13 @@ static bool cxl_handle_ras(struct device *dev, void __iomem *ras_base)
 static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds,
 					  struct cxl_dport *dport)
 {
-	cxl_handle_cor_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+	cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial, dport->regs.ras);
 }
 
 static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds,
 				       struct cxl_dport *dport)
 {
-	return cxl_handle_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+	return cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, dport->regs.ras);
 }
 
 /*
@@ -847,7 +848,7 @@ void cxl_cor_error_detected(struct pci_dev *pdev)
 		if (cxlds->rcd)
 			cxl_handle_rdport_errors(cxlds);
 
-		cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+		cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
 	}
 }
 EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
@@ -876,7 +877,7 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 		 * chance the situation is recoverable dump the status of the RAS
 		 * capability registers and bounce the active state of the memdev.
 		 */
-		ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+		ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
 	}
 
 
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 962dc94fed8c..9588b39faabd 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -13,7 +13,7 @@ static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
 {
 	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
 
-	trace_cxl_port_aer_correctable_error(&pdev->dev, status);
+	trace_cxl_aer_correctable_error(&pdev->dev, 0, status);
 }
 
 static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
@@ -28,8 +28,8 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
 	else
 		fe = status;
 
-	trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
-					       ras_cap.header_log);
+	trace_cxl_aer_uncorrectable_error(&pdev->dev, 0, status, fe,
+					  ras_cap.header_log);
 }
 
 static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
@@ -42,7 +42,8 @@ static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
 	if (!cxlds)
 		return;
 
-	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+	trace_cxl_aer_correctable_error(&cxlds->cxlmd->dev, cxlds->serial,
+					status);
 }
 
 static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
@@ -62,8 +63,9 @@ static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
 	else
 		fe = status;
 
-	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
-					  ras_cap.header_log);
+	trace_cxl_aer_uncorrectable_error(&cxlds->cxlmd->dev,
+					  cxlds->serial, status,
+					  fe, ras_cap.header_log);
 }
 
 static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index 25ebfbc1616c..494d6db461a7 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -48,49 +48,22 @@
 	{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" }			  \
 )
 
-TRACE_EVENT(cxl_port_aer_uncorrectable_error,
-	TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
-	TP_ARGS(dev, status, fe, hl),
-	TP_STRUCT__entry(
-		__string(device, dev_name(dev))
-		__string(host, dev_name(dev->parent))
-		__field(u32, status)
-		__field(u32, first_error)
-		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
-	),
-	TP_fast_assign(
-		__assign_str(device);
-		__assign_str(host);
-		__entry->status = status;
-		__entry->first_error = fe;
-		/*
-		 * Embed the 512B headerlog data for user app retrieval and
-		 * parsing, but no need to print this in the trace buffer.
-		 */
-		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
-	),
-	TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
-		  __get_str(device), __get_str(host),
-		  show_uc_errs(__entry->status),
-		  show_uc_errs(__entry->first_error)
-	)
-);
-
 TRACE_EVENT(cxl_aer_uncorrectable_error,
-	TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
-	TP_ARGS(cxlmd, status, fe, hl),
+	TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
+		 u32 *hl),
+	TP_ARGS(dev, serial, status, fe, hl),
 	TP_STRUCT__entry(
-		__string(memdev, dev_name(&cxlmd->dev))
-		__string(host, dev_name(cxlmd->dev.parent))
+		__string(name, dev_name(dev))
+		__string(parent, dev_name(dev->parent))
 		__field(u64, serial)
 		__field(u32, status)
 		__field(u32, first_error)
 		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
 	),
 	TP_fast_assign(
-		__assign_str(memdev);
-		__assign_str(host);
-		__entry->serial = cxlmd->cxlds->serial;
+		__assign_str(name);
+		__assign_str(parent);
+		__entry->serial = serial;
 		__entry->status = status;
 		__entry->first_error = fe;
 		/*
@@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
 		 */
 		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
 	),
-	TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error: '%s'",
-		  __get_str(memdev), __get_str(host), __entry->serial,
+	TP_printk("memdev=%s host=%s serial=%lld status='%s' first_error='%s'",
+		  __get_str(name), __get_str(parent), __entry->serial,
 		  show_uc_errs(__entry->status),
 		  show_uc_errs(__entry->first_error)
 	)
@@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
 	{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" }	\
 )
 
-TRACE_EVENT(cxl_port_aer_correctable_error,
-	TP_PROTO(struct device *dev, u32 status),
-	TP_ARGS(dev, status),
-	TP_STRUCT__entry(
-		__string(device, dev_name(dev))
-		__string(host, dev_name(dev->parent))
-		__field(u32, status)
-	),
-	TP_fast_assign(
-		__assign_str(device);
-		__assign_str(host);
-		__entry->status = status;
-	),
-	TP_printk("device=%s host=%s status='%s'",
-		  __get_str(device), __get_str(host),
-		  show_ce_errs(__entry->status)
-	)
-);
-
 TRACE_EVENT(cxl_aer_correctable_error,
-	TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
-	TP_ARGS(cxlmd, status),
+	TP_PROTO(struct device *dev, u64 serial, u32 status),
+	TP_ARGS(dev, serial, status),
 	TP_STRUCT__entry(
-		__string(memdev, dev_name(&cxlmd->dev))
-		__string(host, dev_name(cxlmd->dev.parent))
+		__string(name, dev_name(dev))
+		__string(parent, dev_name(dev->parent))
 		__field(u64, serial)
 		__field(u32, status)
 	),
 	TP_fast_assign(
-		__assign_str(memdev);
-		__assign_str(host);
-		__entry->serial = cxlmd->cxlds->serial;
+		__assign_str(name);
+		__assign_str(parent);
+		__entry->serial = serial;
 		__entry->status = status;
 	),
-	TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
-		  __get_str(memdev), __get_str(host), __entry->serial,
+	TP_printk("memdev=%s host=%s serial=%lld status='%s'",
+		  __get_str(name), __get_str(parent), __entry->serial,
 		  show_ce_errs(__entry->status)
 	)
 );
-- 
2.34.1
RE: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints and CXL Ports
Posted by Shiju Jose 3 months, 1 week ago
>-----Original Message-----
>From: Terry Bowman <terry.bowman@amd.com>
>Sent: 26 June 2025 23:43
>To: dave@stgolabs.net; Jonathan Cameron <jonathan.cameron@huawei.com>;
>dave.jiang@intel.com; alison.schofield@intel.com; dan.j.williams@intel.com;
>bhelgaas@google.com; Shiju Jose <shiju.jose@huawei.com>;
>ming.li@zohomail.com; Smita.KoralahalliChannabasappa@amd.com;
>rrichter@amd.com; dan.carpenter@linaro.org;
>PradeepVineshReddy.Kodamati@amd.com; lukas@wunner.de;
>Benjamin.Cheatham@amd.com;
>sathyanarayanan.kuppuswamy@linux.intel.com; terry.bowman@amd.com;
>linux-cxl@vger.kernel.org
>Cc: linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
>Subject: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints
>and CXL Ports
>
>CXL currently has separate trace routines for CXL Port errors and CXL Endpoint
>errors. This is inconvenient for the user because they must enable
>2 sets of trace routines. Make updates to the trace logging such that a single
>trace routine logs both CXL Endpoint and CXL Port protocol errors.
>
>Keep the trace log fields 'memdev' and 'host'. While these are not accurate for
>non-Endpoints the fields will remain as-is to prevent breaking userspace RAS
>trace consumers.
>
>Add serial number parameter to the trace logging. This is used for EPs and 0 is
>provided for CXL port devices without a serial number.
>
>Below is output of correctable and uncorrectable protocol error logging.
>CXL Root Port and CXL Endpoint examples are included below.
>
>Root Port:
>cxl_aer_correctable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0
>status='CRC Threshold Hit'
>cxl_aer_uncorrectable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0
>status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
>Error'
>
>Endpoint:
>cxl_aer_correctable_error: memdev=mem3 host=0000:0f:00.0 serial=0
>status='CRC Threshold Hit'
>cxl_aer_uncorrectable_error: memdev=mem3 host=0000:0f:00.0 serial: 0 status:
>'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
>
>Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>---
> drivers/cxl/core/pci.c   | 19 ++++-----
> drivers/cxl/core/ras.c   | 14 ++++---
> drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
> 3 files changed, 37 insertions(+), 80 deletions(-)
>
[...]
>
> static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data
>*data) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index
>25ebfbc1616c..494d6db461a7 100644
>--- a/drivers/cxl/core/trace.h
>+++ b/drivers/cxl/core/trace.h
>@@ -48,49 +48,22 @@
> 	{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" }			  \
> )
>
>-TRACE_EVENT(cxl_port_aer_uncorrectable_error,
>-	TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
>-	TP_ARGS(dev, status, fe, hl),
>-	TP_STRUCT__entry(
>-		__string(device, dev_name(dev))
>-		__string(host, dev_name(dev->parent))
>-		__field(u32, status)
>-		__field(u32, first_error)
>-		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>-	),
>-	TP_fast_assign(
>-		__assign_str(device);
>-		__assign_str(host);
>-		__entry->status = status;
>-		__entry->first_error = fe;
>-		/*
>-		 * Embed the 512B headerlog data for user app retrieval and
>-		 * parsing, but no need to print this in the trace buffer.
>-		 */
>-		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>-	),
>-	TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
>-		  __get_str(device), __get_str(host),
>-		  show_uc_errs(__entry->status),
>-		  show_uc_errs(__entry->first_error)
>-	)
>-);
>-
> TRACE_EVENT(cxl_aer_uncorrectable_error,
>-	TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
>*hl),
>-	TP_ARGS(cxlmd, status, fe, hl),
>+	TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
>+		 u32 *hl),
>+	TP_ARGS(dev, serial, status, fe, hl),
> 	TP_STRUCT__entry(
>-		__string(memdev, dev_name(&cxlmd->dev))
>-		__string(host, dev_name(cxlmd->dev.parent))
>+		__string(name, dev_name(dev))
>+		__string(parent, dev_name(dev->parent))

Hi Terry,

Thanks for considering the feedback given in v9 regarding the compatibility issue
with the rasdaemon.
https://lore.kernel.org/all/959acc682e6e4b52ac0283b37ee21026@huawei.com/

Probably some confusion w.r.t the feedback.
Unfortunately  TP_printk(...) is not an ABI that we need to keep stable, 
it's this structure, TP_STRUCT__entry(..) , that matters to the rasdaemon.

> 		__field(u64, serial)
> 		__field(u32, status)
> 		__field(u32, first_error)
> 		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
> 	),
> 	TP_fast_assign(
>-		__assign_str(memdev);
>-		__assign_str(host);
>-		__entry->serial = cxlmd->cxlds->serial;
>+		__assign_str(name);
>+		__assign_str(parent);
>+		__entry->serial = serial;
> 		__entry->status = status;
> 		__entry->first_error = fe;
> 		/*
>@@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> 		 */
> 		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
> 	),
>-	TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
>'%s'",
>-		  __get_str(memdev), __get_str(host), __entry->serial,
>+	TP_printk("memdev=%s host=%s serial=%lld status='%s'
>first_error='%s'",
>+		  __get_str(name), __get_str(parent), __entry->serial,
> 		  show_uc_errs(__entry->status),
> 		  show_uc_errs(__entry->first_error)
> 	)
>@@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> 	{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer"
>}	\
> )
>
>-TRACE_EVENT(cxl_port_aer_correctable_error,
>-	TP_PROTO(struct device *dev, u32 status),
>-	TP_ARGS(dev, status),
>-	TP_STRUCT__entry(
>-		__string(device, dev_name(dev))
>-		__string(host, dev_name(dev->parent))
>-		__field(u32, status)
>-	),
>-	TP_fast_assign(
>-		__assign_str(device);
>-		__assign_str(host);
>-		__entry->status = status;
>-	),
>-	TP_printk("device=%s host=%s status='%s'",
>-		  __get_str(device), __get_str(host),
>-		  show_ce_errs(__entry->status)
>-	)
>-);
>-
> TRACE_EVENT(cxl_aer_correctable_error,
>-	TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
>-	TP_ARGS(cxlmd, status),
>+	TP_PROTO(struct device *dev, u64 serial, u32 status),
>+	TP_ARGS(dev, serial, status),
> 	TP_STRUCT__entry(
>-		__string(memdev, dev_name(&cxlmd->dev))
>-		__string(host, dev_name(cxlmd->dev.parent))
>+		__string(name, dev_name(dev))
>+		__string(parent, dev_name(dev->parent))

Same as above.
> 		__field(u64, serial)
> 		__field(u32, status)
> 	),
> 	TP_fast_assign(
>-		__assign_str(memdev);
>-		__assign_str(host);
>-		__entry->serial = cxlmd->cxlds->serial;
>+		__assign_str(name);
>+		__assign_str(parent);
>+		__entry->serial = serial;
> 		__entry->status = status;
> 	),
>-	TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
>-		  __get_str(memdev), __get_str(host), __entry->serial,
>+	TP_printk("memdev=%s host=%s serial=%lld status='%s'",
>+		  __get_str(name), __get_str(parent), __entry->serial,
> 		  show_ce_errs(__entry->status)
> 	)
> );
>--
>2.34.1

Thanks,
Shiju
Re: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints and CXL Ports
Posted by Bowman, Terry 3 months, 1 week ago

On 6/27/2025 7:22 AM, Shiju Jose wrote:
>> -----Original Message-----
>> From: Terry Bowman <terry.bowman@amd.com>
>> Sent: 26 June 2025 23:43
>> To: dave@stgolabs.net; Jonathan Cameron <jonathan.cameron@huawei.com>;
>> dave.jiang@intel.com; alison.schofield@intel.com; dan.j.williams@intel.com;
>> bhelgaas@google.com; Shiju Jose <shiju.jose@huawei.com>;
>> ming.li@zohomail.com; Smita.KoralahalliChannabasappa@amd.com;
>> rrichter@amd.com; dan.carpenter@linaro.org;
>> PradeepVineshReddy.Kodamati@amd.com; lukas@wunner.de;
>> Benjamin.Cheatham@amd.com;
>> sathyanarayanan.kuppuswamy@linux.intel.com; terry.bowman@amd.com;
>> linux-cxl@vger.kernel.org
>> Cc: linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
>> Subject: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints
>> and CXL Ports
>>
>> CXL currently has separate trace routines for CXL Port errors and CXL Endpoint
>> errors. This is inconvenient for the user because they must enable
>> 2 sets of trace routines. Make updates to the trace logging such that a single
>> trace routine logs both CXL Endpoint and CXL Port protocol errors.
>>
>> Keep the trace log fields 'memdev' and 'host'. While these are not accurate for
>> non-Endpoints the fields will remain as-is to prevent breaking userspace RAS
>> trace consumers.
>>
>> Add serial number parameter to the trace logging. This is used for EPs and 0 is
>> provided for CXL port devices without a serial number.
>>
>> Below is output of correctable and uncorrectable protocol error logging.
>> CXL Root Port and CXL Endpoint examples are included below.
>>
>> Root Port:
>> cxl_aer_correctable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0
>> status='CRC Threshold Hit'
>> cxl_aer_uncorrectable_error: memdev=0000:0c:00.0 host=pci0000:0c serial: 0
>> status: 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity
>> Error'
>>
>> Endpoint:
>> cxl_aer_correctable_error: memdev=mem3 host=0000:0f:00.0 serial=0
>> status='CRC Threshold Hit'
>> cxl_aer_uncorrectable_error: memdev=mem3 host=0000:0f:00.0 serial: 0 status:
>> 'Cache Byte Enable Parity Error' first_error: 'Cache Byte Enable Parity Error'
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>> drivers/cxl/core/pci.c   | 19 ++++-----
>> drivers/cxl/core/ras.c   | 14 ++++---
>> drivers/cxl/core/trace.h | 84 +++++++++-------------------------------
>> 3 files changed, 37 insertions(+), 80 deletions(-)
>>
> [...]
>> static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data
>> *data) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index
>> 25ebfbc1616c..494d6db461a7 100644
>> --- a/drivers/cxl/core/trace.h
>> +++ b/drivers/cxl/core/trace.h
>> @@ -48,49 +48,22 @@
>> 	{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" }			  \
>> )
>>
>> -TRACE_EVENT(cxl_port_aer_uncorrectable_error,
>> -	TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
>> -	TP_ARGS(dev, status, fe, hl),
>> -	TP_STRUCT__entry(
>> -		__string(device, dev_name(dev))
>> -		__string(host, dev_name(dev->parent))
>> -		__field(u32, status)
>> -		__field(u32, first_error)
>> -		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>> -	),
>> -	TP_fast_assign(
>> -		__assign_str(device);
>> -		__assign_str(host);
>> -		__entry->status = status;
>> -		__entry->first_error = fe;
>> -		/*
>> -		 * Embed the 512B headerlog data for user app retrieval and
>> -		 * parsing, but no need to print this in the trace buffer.
>> -		 */
>> -		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>> -	),
>> -	TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
>> -		  __get_str(device), __get_str(host),
>> -		  show_uc_errs(__entry->status),
>> -		  show_uc_errs(__entry->first_error)
>> -	)
>> -);
>> -
>> TRACE_EVENT(cxl_aer_uncorrectable_error,
>> -	TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
>> *hl),
>> -	TP_ARGS(cxlmd, status, fe, hl),
>> +	TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
>> +		 u32 *hl),
>> +	TP_ARGS(dev, serial, status, fe, hl),
>> 	TP_STRUCT__entry(
>> -		__string(memdev, dev_name(&cxlmd->dev))
>> -		__string(host, dev_name(cxlmd->dev.parent))
>> +		__string(name, dev_name(dev))
>> +		__string(parent, dev_name(dev->parent))
> Hi Terry,
>
> Thanks for considering the feedback given in v9 regarding the compatibility issue
> with the rasdaemon.
> https://lore.kernel.org/all/959acc682e6e4b52ac0283b37ee21026@huawei.com/
>
> Probably some confusion w.r.t the feedback.
> Unfortunately  TP_printk(...) is not an ABI that we need to keep stable, 
> it's this structure, TP_STRUCT__entry(..) , that matters to the rasdaemon.
>
>> 		

Oh. Apologies, I didn't realize TP_STRUCT was an ABI requirement. I will change back
the TP_STRUCT as well.

-Terry

>> __field(u64, serial)
>> 		__field(u32, status)
>> 		__field(u32, first_error)
>> 		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>> 	),
>> 	TP_fast_assign(
>> -		__assign_str(memdev);
>> -		__assign_str(host);
>> -		__entry->serial = cxlmd->cxlds->serial;
>> +		__assign_str(name);
>> +		__assign_str(parent);
>> +		__entry->serial = serial;
>> 		__entry->status = status;
>> 		__entry->first_error = fe;
>> 		/*
>> @@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
>> 		 */
>> 		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>> 	),
>> -	TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
>> '%s'",
>> -		  __get_str(memdev), __get_str(host), __entry->serial,
>> +	TP_printk("memdev=%s host=%s serial=%lld status='%s'
>> first_error='%s'",
>> +		  __get_str(name), __get_str(parent), __entry->serial,
>> 		  show_uc_errs(__entry->status),
>> 		  show_uc_errs(__entry->first_error)
>> 	)
>> @@ -124,42 +97,23 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
>> 	{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer"
>> }	\
>> )
>>
>> -TRACE_EVENT(cxl_port_aer_correctable_error,
>> -	TP_PROTO(struct device *dev, u32 status),
>> -	TP_ARGS(dev, status),
>> -	TP_STRUCT__entry(
>> -		__string(device, dev_name(dev))
>> -		__string(host, dev_name(dev->parent))
>> -		__field(u32, status)
>> -	),
>> -	TP_fast_assign(
>> -		__assign_str(device);
>> -		__assign_str(host);
>> -		__entry->status = status;
>> -	),
>> -	TP_printk("device=%s host=%s status='%s'",
>> -		  __get_str(device), __get_str(host),
>> -		  show_ce_errs(__entry->status)
>> -	)
>> -);
>> -
>> TRACE_EVENT(cxl_aer_correctable_error,
>> -	TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
>> -	TP_ARGS(cxlmd, status),
>> +	TP_PROTO(struct device *dev, u64 serial, u32 status),
>> +	TP_ARGS(dev, serial, status),
>> 	TP_STRUCT__entry(
>> -		__string(memdev, dev_name(&cxlmd->dev))
>> -		__string(host, dev_name(cxlmd->dev.parent))
>> +		__string(name, dev_name(dev))
>> +		__string(parent, dev_name(dev->parent))
> Same as above.
>> 		__field(u64, serial)
>> 		__field(u32, status)
>> 	),
>> 	TP_fast_assign(
>> -		__assign_str(memdev);
>> -		__assign_str(host);
>> -		__entry->serial = cxlmd->cxlds->serial;
>> +		__assign_str(name);
>> +		__assign_str(parent);
>> +		__entry->serial = serial;
>> 		__entry->status = status;
>> 	),
>> -	TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
>> -		  __get_str(memdev), __get_str(host), __entry->serial,
>> +	TP_printk("memdev=%s host=%s serial=%lld status='%s'",
>> +		  __get_str(name), __get_str(parent), __entry->serial,
>> 		  show_ce_errs(__entry->status)
>> 	)
>> );
>> --
>> 2.34.1
> Thanks,
> Shiju
Re: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints and CXL Ports
Posted by Alison Schofield 3 months, 1 week ago
On Fri, Jun 27, 2025 at 12:22:39PM +0000, Shiju Jose wrote:
> >-----Original Message-----
> >From: Terry Bowman <terry.bowman@amd.com>
> >Sent: 26 June 2025 23:43
> >To: dave@stgolabs.net; Jonathan Cameron <jonathan.cameron@huawei.com>;
> >dave.jiang@intel.com; alison.schofield@intel.com; dan.j.williams@intel.com;
> >bhelgaas@google.com; Shiju Jose <shiju.jose@huawei.com>;
> >ming.li@zohomail.com; Smita.KoralahalliChannabasappa@amd.com;
> >rrichter@amd.com; dan.carpenter@linaro.org;
> >PradeepVineshReddy.Kodamati@amd.com; lukas@wunner.de;
> >Benjamin.Cheatham@amd.com;
> >sathyanarayanan.kuppuswamy@linux.intel.com; terry.bowman@amd.com;
> >linux-cxl@vger.kernel.org
> >Cc: linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
> >Subject: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints
> >and CXL Ports
> >

big snip -

> >-);
> >-
> > TRACE_EVENT(cxl_aer_uncorrectable_error,
> >-	TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
> >*hl),
> >-	TP_ARGS(cxlmd, status, fe, hl),
> >+	TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
> >+		 u32 *hl),
> >+	TP_ARGS(dev, serial, status, fe, hl),
> > 	TP_STRUCT__entry(
> >-		__string(memdev, dev_name(&cxlmd->dev))
> >-		__string(host, dev_name(cxlmd->dev.parent))
> >+		__string(name, dev_name(dev))
> >+		__string(parent, dev_name(dev->parent))
> 
> Hi Terry,
> 
> Thanks for considering the feedback given in v9 regarding the compatibility issue
> with the rasdaemon.
> https://lore.kernel.org/all/959acc682e6e4b52ac0283b37ee21026@huawei.com/
> 
> Probably some confusion w.r.t the feedback.
> Unfortunately  TP_printk(...) is not an ABI that we need to keep stable, 
> it's this structure, TP_STRUCT__entry(..) , that matters to the rasdaemon.
> 

I'm not so sure you should be letting him off the hook for TP_printk ;)
It seems TP_printk should be kept aligned w TP_STRUCT_entry(). As a
user who often looks at TP_printk output, I'd say keep them all in
sync, and consider them ABI - ie. add to but don't modify.



> > 		__field(u64, serial)
> > 		__field(u32, status)
> > 		__field(u32, first_error)
> > 		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
> > 	),
> > 	TP_fast_assign(
> >-		__assign_str(memdev);
> >-		__assign_str(host);
> >-		__entry->serial = cxlmd->cxlds->serial;
> >+		__assign_str(name);
> >+		__assign_str(parent);
> >+		__entry->serial = serial;
> > 		__entry->status = status;
> > 		__entry->first_error = fe;
> > 		/*
> >@@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
> > 		 */
> > 		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
> > 	),
> >-	TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
> >'%s'",
> >-		  __get_str(memdev), __get_str(host), __entry->serial,
> >+	TP_printk("memdev=%s host=%s serial=%lld status='%s'
> >first_error='%s'",
> >+		  __get_str(name), __get_str(parent), __entry->serial,
> > 		  show_uc_errs(__entry->status),
> > 		  show_uc_errs(__entry->first_error)
> > 	)

snip
Re: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints and CXL Ports
Posted by Bowman, Terry 3 months, 1 week ago

On 7/1/2025 8:18 PM, Alison Schofield wrote:
> On Fri, Jun 27, 2025 at 12:22:39PM +0000, Shiju Jose wrote:
>>> -----Original Message-----
>>> From: Terry Bowman <terry.bowman@amd.com>
>>> Sent: 26 June 2025 23:43
>>> To: dave@stgolabs.net; Jonathan Cameron <jonathan.cameron@huawei.com>;
>>> dave.jiang@intel.com; alison.schofield@intel.com; dan.j.williams@intel.com;
>>> bhelgaas@google.com; Shiju Jose <shiju.jose@huawei.com>;
>>> ming.li@zohomail.com; Smita.KoralahalliChannabasappa@amd.com;
>>> rrichter@amd.com; dan.carpenter@linaro.org;
>>> PradeepVineshReddy.Kodamati@amd.com; lukas@wunner.de;
>>> Benjamin.Cheatham@amd.com;
>>> sathyanarayanan.kuppuswamy@linux.intel.com; terry.bowman@amd.com;
>>> linux-cxl@vger.kernel.org
>>> Cc: linux-kernel@vger.kernel.org; linux-pci@vger.kernel.org
>>> Subject: [PATCH v10 12/17] cxl/pci: Unify CXL trace logging for CXL Endpoints
>>> and CXL Ports
>>>
> big snip -
>
>>> -);
>>> -
>>> TRACE_EVENT(cxl_aer_uncorrectable_error,
>>> -	TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32
>>> *hl),
>>> -	TP_ARGS(cxlmd, status, fe, hl),
>>> +	TP_PROTO(struct device *dev, u64 serial, u32 status, u32 fe,
>>> +		 u32 *hl),
>>> +	TP_ARGS(dev, serial, status, fe, hl),
>>> 	TP_STRUCT__entry(
>>> -		__string(memdev, dev_name(&cxlmd->dev))
>>> -		__string(host, dev_name(cxlmd->dev.parent))
>>> +		__string(name, dev_name(dev))
>>> +		__string(parent, dev_name(dev->parent))
>> Hi Terry,
>>
>> Thanks for considering the feedback given in v9 regarding the compatibility issue
>> with the rasdaemon.
>> https://lore.kernel.org/all/959acc682e6e4b52ac0283b37ee21026@huawei.com/
>>
>> Probably some confusion w.r.t the feedback.
>> Unfortunately  TP_printk(...) is not an ABI that we need to keep stable, 
>> it's this structure, TP_STRUCT__entry(..) , that matters to the rasdaemon.
>>
> I'm not so sure you should be letting him off the hook for TP_printk ;)
> It seems TP_printk should be kept aligned w TP_STRUCT_entry(). As a
> user who often looks at TP_printk output, I'd say keep them all in
> sync, and consider them ABI - ie. add to but don't modify.
>
>
I agree. I will keep TP_printk and TP_STRUCT aligned by using 'memdev' and 'host'.
The only change here will be TP_PROTO and will be:
TP_PROTO(struct device *dev, u64 serial, u32 status),

Let me know if that's not ok.

-Terry

>>> 		__field(u64, serial)
>>> 		__field(u32, status)
>>> 		__field(u32, first_error)
>>> 		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
>>> 	),
>>> 	TP_fast_assign(
>>> -		__assign_str(memdev);
>>> -		__assign_str(host);
>>> -		__entry->serial = cxlmd->cxlds->serial;
>>> +		__assign_str(name);
>>> +		__assign_str(parent);
>>> +		__entry->serial = serial;
>>> 		__entry->status = status;
>>> 		__entry->first_error = fe;
>>> 		/*
>>> @@ -99,8 +72,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
>>> 		 */
>>> 		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
>>> 	),
>>> -	TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error:
>>> '%s'",
>>> -		  __get_str(memdev), __get_str(host), __entry->serial,
>>> +	TP_printk("memdev=%s host=%s serial=%lld status='%s'
>>> first_error='%s'",
>>> +		  __get_str(name), __get_str(parent), __entry->serial,
>>> 		  show_uc_errs(__entry->status),
>>> 		  show_uc_errs(__entry->first_error)
>>> 	)
> snip
>