[PATCH] EDAC/versal: Report PFN and page offset for DDR errors

Shubhrajyoti Datta posted 1 patch 2 months ago
There is a newer version of this series
drivers/edac/versal_edac.c | 36 +++++++++++++++++-------------------
1 file changed, 17 insertions(+), 19 deletions(-)
[PATCH] EDAC/versal: Report PFN and page offset for DDR errors
Posted by Shubhrajyoti Datta 2 months ago
Currently, DDRMC correctable and uncorrectable error events are reported
to EDAC with the page frame number (pfn) and page offset set to zero, so
the report cannot be used to locate the address of a memory error.

Compute the physical address from the error information and extract
the page frame number and offset before calling edac_mc_handle_error().
This reports the actual location of the error to userspace.

Fixes: 6f15b178cd63 ("EDAC/versal: Add a Xilinx Versal memory controller driver")
Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
---

 drivers/edac/versal_edac.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/drivers/edac/versal_edac.c b/drivers/edac/versal_edac.c
index 5a43b5d43ca2..18045f96610e 100644
--- a/drivers/edac/versal_edac.c
+++ b/drivers/edac/versal_edac.c
@@ -414,34 +414,32 @@ static unsigned long convert_to_physical(struct edac_priv *priv, union ecc_error
 static void handle_error(struct mem_ctl_info *mci, struct ecc_status *stat)
 {
 	struct edac_priv *priv = mci->pvt_info;
+	enum hw_event_mc_err_type type;
 	union ecc_error_info pinf;
+	unsigned long pa, pfn;
 
 	if (stat->error_type == XDDR_ERR_TYPE_CE) {
 		priv->ce_cnt++;
 		pinf = stat->ceinfo[stat->channel];
-		snprintf(priv->message, XDDR_EDAC_MSG_SIZE,
-			 "Error type:%s MC ID: %d Addr at %lx Burst Pos: %d\n",
-			 "CE", priv->mc_id,
-			 convert_to_physical(priv, pinf), pinf.burstpos);
-
-		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
-				     1, 0, 0, 0, 0, 0, -1,
-				     priv->message, "");
-	}
-
-	if (stat->error_type == XDDR_ERR_TYPE_UE) {
+		type = HW_EVENT_ERR_CORRECTED;
+	} else if (stat->error_type == XDDR_ERR_TYPE_UE) {
 		priv->ue_cnt++;
 		pinf = stat->ueinfo[stat->channel];
-		snprintf(priv->message, XDDR_EDAC_MSG_SIZE,
-			 "Error type:%s MC ID: %d Addr at %lx Burst Pos: %d\n",
-			 "UE", priv->mc_id,
-			 convert_to_physical(priv, pinf), pinf.burstpos);
-
-		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
-				     1, 0, 0, 0, 0, 0, -1,
-				     priv->message, "");
+		type = HW_EVENT_ERR_UNCORRECTED;
+	} else {
+		return;
 	}
 
+	pa = convert_to_physical(priv, pinf);
+	pfn = PHYS_PFN(pa);
+	snprintf(priv->message, XDDR_EDAC_MSG_SIZE,
+		 "Error type:%s MC ID: %d Addr at %lx Burst Pos: %d\n",
+		 type == HW_EVENT_ERR_UNCORRECTED ? "UE" : "CE", priv->mc_id,
+		 pa, pinf.burstpos);
+	edac_mc_handle_error(type, mci,
+			     1, pfn, offset_in_page(pa), 0, 0, 0, -1,
+			     priv->message, "");
+
 	memset(stat, 0, sizeof(*stat));
 }
 
-- 
2.34.1
Re: [PATCH] EDAC/versal: Report PFN and page offset for DDR errors
Posted by Srivatsa S. Bhat 1 month, 3 weeks ago
On Wed, Apr 15, 2026 at 11:32:39AM +0530, Shubhrajyoti Datta wrote:
> Currently, DDRMC correctable and uncorrectable error events are reported
> to EDAC with page frame number (pfn) and offset set to zero.
> This information is not useful to locate the address for memory errors.
> 
> Compute the physical address from the error information and extract
> the page frame number and offset before calling edac_mc_handle_error().
> This provides the actual memory location information to the userspace.
> 
> Fixes: 6f15b178cd63 ("EDAC/versal: Add a Xilinx Versal memory controller driver")
> Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
> ---
> 
>  drivers/edac/versal_edac.c | 36 +++++++++++++++++-------------------
>  1 file changed, 17 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/edac/versal_edac.c b/drivers/edac/versal_edac.c
> index 5a43b5d43ca2..18045f96610e 100644
> --- a/drivers/edac/versal_edac.c
> +++ b/drivers/edac/versal_edac.c
> @@ -414,34 +414,32 @@ static unsigned long convert_to_physical(struct edac_priv *priv, union ecc_error
>  static void handle_error(struct mem_ctl_info *mci, struct ecc_status *stat)
>  {

[...]

>  	if (stat->error_type == XDDR_ERR_TYPE_CE) {

[...]

> +	} else if (stat->error_type == XDDR_ERR_TYPE_UE) {

[...]
> +	} else {
> +		return;

I like the cleanup contributed by this patch (in terms of reducing
code duplication) in addition to the actual fix. However, this patch
also introduces a subtle behavior change - the existing code calls
memset() to clear out the ecc_status struct unconditionally, but this
patch doesn't call memset if the error type is not CE or UE (i.e., in
the early return path).

Was this change intentional? Wouldn't it potentially cause stale data
to be left over in the ecc_status struct, affecting future reuse?

[...]

> +
>  	memset(stat, 0, sizeof(*stat));
>  }
>  

If the expectation is to actually clear it out unconditionally, it
would be great to document it in the comments (if not done already).

Thank you!

Regards,
Srivatsa
Microsoft Linux Systems Group
Re: [PATCH] EDAC/versal: Report PFN and page offset for DDR errors
Posted by Prasanna Kumar T S M 2 months ago

On 15-04-2026 11:32, Shubhrajyoti Datta wrote:
> Currently, DDRMC correctable and uncorrectable error events are reported
> to EDAC with page frame number (pfn) and offset set to zero.
> This information is not useful to locate the address for memory errors.
> 
> Compute the physical address from the error information and extract
> the page frame number and offset before calling edac_mc_handle_error().
> This provides the actual memory location information to the userspace.
> 
> Fixes: 6f15b178cd63 ("EDAC/versal: Add a Xilinx Versal memory controller driver")
> Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
> ---
> 
>   drivers/edac/versal_edac.c | 36 +++++++++++++++++-------------------
>   1 file changed, 17 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/edac/versal_edac.c b/drivers/edac/versal_edac.c
> index 5a43b5d43ca2..18045f96610e 100644
> --- a/drivers/edac/versal_edac.c
> +++ b/drivers/edac/versal_edac.c
> @@ -414,34 +414,32 @@ static unsigned long convert_to_physical(struct edac_priv *priv, union ecc_error
>   static void handle_error(struct mem_ctl_info *mci, struct ecc_status *stat)
>   {
>   	struct edac_priv *priv = mci->pvt_info;
> +	enum hw_event_mc_err_type type;
>   	union ecc_error_info pinf;
> +	unsigned long pa, pfn;
>   
>   	if (stat->error_type == XDDR_ERR_TYPE_CE) {
>   		priv->ce_cnt++;
>   		pinf = stat->ceinfo[stat->channel];
> -		snprintf(priv->message, XDDR_EDAC_MSG_SIZE,
> -			 "Error type:%s MC ID: %d Addr at %lx Burst Pos: %d\n",
> -			 "CE", priv->mc_id,
> -			 convert_to_physical(priv, pinf), pinf.burstpos);
> -
> -		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
> -				     1, 0, 0, 0, 0, 0, -1,
> -				     priv->message, "");
> -	}
> -
> -	if (stat->error_type == XDDR_ERR_TYPE_UE) {
> +		type = HW_EVENT_ERR_CORRECTED;
> +	} else if (stat->error_type == XDDR_ERR_TYPE_UE) {
>   		priv->ue_cnt++;
>   		pinf = stat->ueinfo[stat->channel];
> -		snprintf(priv->message, XDDR_EDAC_MSG_SIZE,
> -			 "Error type:%s MC ID: %d Addr at %lx Burst Pos: %d\n",
> -			 "UE", priv->mc_id,
> -			 convert_to_physical(priv, pinf), pinf.burstpos);
> -
> -		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
> -				     1, 0, 0, 0, 0, 0, -1,
> -				     priv->message, "");
> +		type = HW_EVENT_ERR_UNCORRECTED;
> +	} else {
> +		return;
>   	}
>   
> +	pa = convert_to_physical(priv, pinf);
> +	pfn = PHYS_PFN(pa);
> +	snprintf(priv->message, XDDR_EDAC_MSG_SIZE,
> +		 "Error type:%s MC ID: %d Addr at %lx Burst Pos: %d\n",
> +		 type == HW_EVENT_ERR_UNCORRECTED ? "UE" : "CE", priv->mc_id,
> +		 pa, pinf.burstpos);
> +	edac_mc_handle_error(type, mci,
> +			     1, pfn, offset_in_page(pa), 0, 0, 0, -1,
> +			     priv->message, "");
> +
>   	memset(stat, 0, sizeof(*stat));
>   }
>   

Hi Shubhrajyoti,

Looks good to me.

Reviewed-by: Prasanna Kumar T S M <ptsm@linux.microsoft.com>

Thanks,
Prasanna