[RFC PATCH 4/4] cxl/events: Updates for CXL Memory Module Event Record

shiju.jose@huawei.com posted 4 patches 1 month, 1 week ago
[RFC PATCH 4/4] cxl/events: Updates for CXL Memory Module Event Record
Posted by shiju.jose@huawei.com 1 month, 1 week ago
From: Shiju Jose <shiju.jose@huawei.com>

CXL spec 3.1 section 8.2.9.2.1.3 Table 8-47, Memory Module Event Record,
has been updated with the following new fields, along with new values for
the Device Event Type and Device Health Information fields.
1. Validity Flags
2. Component Identifier
3. Device Event Sub-Type

Update the CXL event record definition and the CXL Memory Module trace
event implementation to reflect the above spec changes.

Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
---
 drivers/cxl/core/trace.h | 48 +++++++++++++++++++++++++++++++++++-----
 include/cxl/event.h      |  5 ++++-
 2 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index 20790dffa2b4..1ce43bff49c7 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -613,7 +613,7 @@ TRACE_EVENT(cxl_dram,
 /*
  * Memory Module Event Record - MMER
  *
- * CXL res 3.0 section 8.2.9.2.1.3; Table 8-45
+ * CXL res 3.1 section 8.2.9.2.1.3; Table 8-47
  */
 #define CXL_MMER_HEALTH_STATUS_CHANGE		0x00
 #define CXL_MMER_MEDIA_STATUS_CHANGE		0x01
@@ -621,27 +621,35 @@ TRACE_EVENT(cxl_dram,
 #define CXL_MMER_TEMP_CHANGE			0x03
 #define CXL_MMER_DATA_PATH_ERROR		0x04
 #define CXL_MMER_LSA_ERROR			0x05
+#define CXL_MMER_UNRECOV_SIDEBAND_BUS_ERROR	0x06
+#define CXL_MMER_MEMORY_MEDIA_FRU_ERROR		0x07
+#define CXL_MMER_POWER_MANAGEMENT_FAULT		0x08
 #define show_dev_evt_type(type)	__print_symbolic(type,			   \
 	{ CXL_MMER_HEALTH_STATUS_CHANGE,	"Health Status Change"	}, \
 	{ CXL_MMER_MEDIA_STATUS_CHANGE,		"Media Status Change"	}, \
 	{ CXL_MMER_LIFE_USED_CHANGE,		"Life Used Change"	}, \
 	{ CXL_MMER_TEMP_CHANGE,			"Temperature Change"	}, \
 	{ CXL_MMER_DATA_PATH_ERROR,		"Data Path Error"	}, \
-	{ CXL_MMER_LSA_ERROR,			"LSA Error"		}  \
+	{ CXL_MMER_LSA_ERROR,			"LSA Error"		}, \
+	{ CXL_MMER_UNRECOV_SIDEBAND_BUS_ERROR,	"Unrecoverable Internal Sideband Bus Error"	}, \
+	{ CXL_MMER_MEMORY_MEDIA_FRU_ERROR,	"Memory Media FRU Error"	}, \
+	{ CXL_MMER_POWER_MANAGEMENT_FAULT,	"Power Management Fault"	}  \
 )
 
 /*
  * Device Health Information - DHI
  *
- * CXL res 3.0 section 8.2.9.8.3.1; Table 8-100
+ * CXL res 3.1 section 8.2.9.9.3.1; Table 8-133
  */
 #define CXL_DHI_HS_MAINTENANCE_NEEDED				BIT(0)
 #define CXL_DHI_HS_PERFORMANCE_DEGRADED				BIT(1)
 #define CXL_DHI_HS_HW_REPLACEMENT_NEEDED			BIT(2)
+#define CXL_DHI_HS_MEM_CAPACITY_DEGRADED			BIT(3)
 #define show_health_status_flags(flags)	__print_flags(flags, "|",	   \
 	{ CXL_DHI_HS_MAINTENANCE_NEEDED,	"MAINTENANCE_NEEDED"	}, \
 	{ CXL_DHI_HS_PERFORMANCE_DEGRADED,	"PERFORMANCE_DEGRADED"	}, \
-	{ CXL_DHI_HS_HW_REPLACEMENT_NEEDED,	"REPLACEMENT_NEEDED"	}  \
+	{ CXL_DHI_HS_HW_REPLACEMENT_NEEDED,	"REPLACEMENT_NEEDED"	}, \
+	{ CXL_DHI_HS_MEM_CAPACITY_DEGRADED,	"MEM_CAPACITY_DEGRADED"	}  \
 )
 
 #define CXL_DHI_MS_NORMAL							0x00
@@ -695,6 +703,22 @@ TRACE_EVENT(cxl_dram,
 #define CXL_DHI_AS_COR_VOL_ERR_CNT(as)			((as & 0x10) >> 4)
 #define CXL_DHI_AS_COR_PER_ERR_CNT(as)			((as & 0x20) >> 5)
 
+#define CXL_MMER_VALID_COMPONENT			BIT(0)
+#define CXL_MMER_VALID_COMPONENT_ID_FORMAT		BIT(1)
+#define show_mem_module_valid_flags(flags)	__print_flags(flags, "|",	\
+	{ CXL_MMER_VALID_COMPONENT,	"COMPONENT"	}			\
+)
+#define CXL_MMER_DEV_EVT_SUB_TYPE_NOT_REPORTED			0x00
+#define CXL_MMER_DEV_EVT_SUB_TYPE_INVALID_CONFIG_DATA		0x01
+#define CXL_MMER_DEV_EVT_SUB_TYPE_UNSUPP_CONFIG_DATA		0x02
+#define CXL_MMER_DEV_EVT_SUB_TYPE_UNSUPP_MEM_MEDIA_FRU		0x03
+#define show_dev_event_sub_type(sub_type)	__print_symbolic(sub_type,	  \
+	{ CXL_MMER_DEV_EVT_SUB_TYPE_NOT_REPORTED,		"Not Reported" }, \
+	{ CXL_MMER_DEV_EVT_SUB_TYPE_INVALID_CONFIG_DATA,	"Invalid Config Data" }, \
+	{ CXL_MMER_DEV_EVT_SUB_TYPE_UNSUPP_CONFIG_DATA,		"Unsupported Config Data" }, \
+	{ CXL_MMER_DEV_EVT_SUB_TYPE_UNSUPP_MEM_MEDIA_FRU,	"Unsupported Memory Media FRU" } \
+)
+
 TRACE_EVENT(cxl_memory_module,
 
 	TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
@@ -717,6 +741,9 @@ TRACE_EVENT(cxl_memory_module,
 		__field(u32, cor_per_err_cnt)
 		__field(s16, device_temp)
 		__field(u8, add_status)
+		__field(u16, validity_flags)
+		__array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE)
+		__field(u8, sub_type)
 	),
 
 	TP_fast_assign(
@@ -735,12 +762,17 @@ TRACE_EVENT(cxl_memory_module,
 		__entry->cor_per_err_cnt = get_unaligned_le32(rec->info.cor_per_err_cnt);
 		__entry->device_temp = get_unaligned_le16(rec->info.device_temp);
 		__entry->add_status = rec->info.add_status;
+		__entry->validity_flags = get_unaligned_le16(rec->validity_flags);
+		memcpy(__entry->comp_id, &rec->component_id,
+		       CXL_EVENT_GEN_MED_COMP_ID_SIZE);
+		__entry->sub_type = rec->sub_type;
 	),
 
 	CXL_EVT_TP_printk("event_type='%s' health_status='%s' media_status='%s' " \
 		"as_life_used=%s as_dev_temp=%s as_cor_vol_err_cnt=%s " \
 		"as_cor_per_err_cnt=%s life_used=%u device_temp=%d " \
-		"dirty_shutdown_cnt=%u cor_vol_err_cnt=%u cor_per_err_cnt=%u",
+		"dirty_shutdown_cnt=%u cor_vol_err_cnt=%u cor_per_err_cnt=%u " \
+		"validity_flags='%s' comp_id=%s sub_type='%s'",
 		show_dev_evt_type(__entry->event_type),
 		show_health_status_flags(__entry->health_status),
 		show_media_status(__entry->media_status),
@@ -750,7 +782,11 @@ TRACE_EVENT(cxl_memory_module,
 		show_one_bit_status(CXL_DHI_AS_COR_PER_ERR_CNT(__entry->add_status)),
 		__entry->life_used, __entry->device_temp,
 		__entry->dirty_shutdown_cnt, __entry->cor_vol_err_cnt,
-		__entry->cor_per_err_cnt
+		__entry->cor_per_err_cnt,
+		show_mem_module_valid_flags(__entry->validity_flags),
+		cxl_print_component_id(__entry->validity_flags, CXL_MMER_VALID_COMPONENT,
+				       CXL_MMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
+		show_dev_event_sub_type(__entry->sub_type)
 	)
 );
 
diff --git a/include/cxl/event.h b/include/cxl/event.h
index 7e98492c85df..18b7f96dea77 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -102,7 +102,10 @@ struct cxl_event_mem_module {
 	struct cxl_event_record_hdr hdr;
 	u8 event_type;
 	struct cxl_get_health_info info;
-	u8 reserved[0x3d];
+	u8 validity_flags[2];
+	u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE];
+	u8 sub_type;
+	u8 reserved[0x2a];
 } __packed;
 
 union cxl_event {
-- 
2.34.1
Re: [RFC PATCH 4/4] cxl/events: Updates for CXL Memory Module Event Record
Posted by Jonathan Cameron 1 month, 1 week ago
On Wed, 16 Oct 2024 17:33:49 +0100
<shiju.jose@huawei.com> wrote:

> From: Shiju Jose <shiju.jose@huawei.com>
> 
> CXL spec 3.1 section 8.2.9.2.1.3 Table 8-47, Memory Module Event Record
> has updated with following new fields and new info for Device Event Type
> and Device Health Information fields.
> 1. Validity Flags
> 2. Component Identifier
> 3. Device Event Sub-Type
> 
> Add updates for the above spec changes in the CXL events record and CXL
> Memory Module trace event implementations.
> 
> Signed-off-by: Shiju Jose <shiju.jose@huawei.com>

A few minor things inline, but with the event_sub_type naming change, feel
free to add
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

>  
>  	CXL_EVT_TP_printk("event_type='%s' health_status='%s' media_status='%s' " \
>  		"as_life_used=%s as_dev_temp=%s as_cor_vol_err_cnt=%s " \
>  		"as_cor_per_err_cnt=%s life_used=%u device_temp=%d " \
> -		"dirty_shutdown_cnt=%u cor_vol_err_cnt=%u cor_per_err_cnt=%u",
> +		"dirty_shutdown_cnt=%u cor_vol_err_cnt=%u cor_per_err_cnt=%u " \
> +		"validity_flags='%s' comp_id=%s sub_type='%s'",
>  		show_dev_evt_type(__entry->event_type),
>  		show_health_status_flags(__entry->health_status),
>  		show_media_status(__entry->media_status),
> @@ -750,7 +782,11 @@ TRACE_EVENT(cxl_memory_module,
>  		show_one_bit_status(CXL_DHI_AS_COR_PER_ERR_CNT(__entry->add_status)),
>  		__entry->life_used, __entry->device_temp,
>  		__entry->dirty_shutdown_cnt, __entry->cor_vol_err_cnt,
> -		__entry->cor_per_err_cnt
> +		__entry->cor_per_err_cnt,
> +		show_mem_module_valid_flags(__entry->validity_flags),
> +		cxl_print_component_id(__entry->validity_flags, CXL_MMER_VALID_COMPONENT,
> +				       CXL_MMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
> +		show_dev_event_sub_type(__entry->sub_type)
If we are going to reorganize for the other patches, why not move this next to the event type
field?  There isn't a validity flag for this (0 means not specified)
so fine to move it earlier I think.
>  	)
>  );
>  
> diff --git a/include/cxl/event.h b/include/cxl/event.h
> index 7e98492c85df..18b7f96dea77 100644
> --- a/include/cxl/event.h
> +++ b/include/cxl/event.h
> @@ -102,7 +102,10 @@ struct cxl_event_mem_module {
>  	struct cxl_event_record_hdr hdr;
>  	u8 event_type;
>  	struct cxl_get_health_info info;
> -	u8 reserved[0x3d];
> +	u8 validity_flags[2];
> +	u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE];
> +	u8 sub_type;
maybe event_sub_type to match spec naming?

> +	u8 reserved[0x2a];
>  } __packed;
>  
>  union cxl_event {
RE: [RFC PATCH 4/4] cxl/events: Updates for CXL Memory Module Event Record
Posted by Shiju Jose 1 month, 1 week ago

>-----Original Message-----
>From: Jonathan Cameron <jonathan.cameron@huawei.com>
>Sent: 17 October 2024 13:44
>To: Shiju Jose <shiju.jose@huawei.com>
>Cc: dave.jiang@intel.com; dan.j.williams@intel.com; alison.schofield@intel.com;
>vishal.l.verma@intel.com; ira.weiny@intel.com; dave@stgolabs.net; linux-
>cxl@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
><linuxarm@huawei.com>; tanxiaofei <tanxiaofei@huawei.com>; Zengtao (B)
><prime.zeng@hisilicon.com>
>Subject: Re: [RFC PATCH 4/4] cxl/events: Updates for CXL Memory Module Event
>Record
>
>On Wed, 16 Oct 2024 17:33:49 +0100
><shiju.jose@huawei.com> wrote:
>
>> From: Shiju Jose <shiju.jose@huawei.com>
>>
>> CXL spec 3.1 section 8.2.9.2.1.3 Table 8-47, Memory Module Event
>> Record has updated with following new fields and new info for Device
>> Event Type and Device Health Information fields.
>> 1. Validity Flags
>> 2. Component Identifier
>> 3. Device Event Sub-Type
>>
>> Add updates for the above spec changes in the CXL events record and
>> CXL Memory Module trace event implementations.
>>
>> Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
>
>A few minor things inline, but with the event_sub_type naming feel free to add
>Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>
>>
>>  	CXL_EVT_TP_printk("event_type='%s' health_status='%s'
>media_status='%s' " \
>>  		"as_life_used=%s as_dev_temp=%s as_cor_vol_err_cnt=%s " \
>>  		"as_cor_per_err_cnt=%s life_used=%u device_temp=%d " \
>> -		"dirty_shutdown_cnt=%u cor_vol_err_cnt=%u
>cor_per_err_cnt=%u",
>> +		"dirty_shutdown_cnt=%u cor_vol_err_cnt=%u
>cor_per_err_cnt=%u " \
>> +		"validity_flags='%s' comp_id=%s sub_type='%s'",
>>  		show_dev_evt_type(__entry->event_type),
>>  		show_health_status_flags(__entry->health_status),
>>  		show_media_status(__entry->media_status),
>> @@ -750,7 +782,11 @@ TRACE_EVENT(cxl_memory_module,
>>  		show_one_bit_status(CXL_DHI_AS_COR_PER_ERR_CNT(__entry-
>>add_status)),
>>  		__entry->life_used, __entry->device_temp,
>>  		__entry->dirty_shutdown_cnt, __entry->cor_vol_err_cnt,
>> -		__entry->cor_per_err_cnt
>> +		__entry->cor_per_err_cnt,
>> +		show_mem_module_valid_flags(__entry->validity_flags),
>> +		cxl_print_component_id(__entry->validity_flags,
>CXL_MMER_VALID_COMPONENT,
>> +
>CXL_MMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
>> +		show_dev_event_sub_type(__entry->sub_type)
>If we are going to reorganize for the other patches, why not move this next to
>the event type field?  There isn't a validity flag for this (0 means not specified) so
>fine to move it earlier I think.
Will do.
>>  	)
>>  );
>>
>> diff --git a/include/cxl/event.h b/include/cxl/event.h index
>> 7e98492c85df..18b7f96dea77 100644
>> --- a/include/cxl/event.h
>> +++ b/include/cxl/event.h
>> @@ -102,7 +102,10 @@ struct cxl_event_mem_module {
>>  	struct cxl_event_record_hdr hdr;
>>  	u8 event_type;
>>  	struct cxl_get_health_info info;
>> -	u8 reserved[0x3d];
>> +	u8 validity_flags[2];
>> +	u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE];
>> +	u8 sub_type;
>maybe event_sub_type to match spec naming?
Will do.
>
>> +	u8 reserved[0x2a];
>>  } __packed;
>>
>>  union cxl_event {

Thanks,
Shiju