drivers/edac/versalnet_edac.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-)
Populate the EDAC location fields when reporting DDRMC CE/UE events.
Compute the physical address from the decoded error info and pass the
PFN along with the offset-within-page to edac_mc_handle_error(). This
allows user space (mc_event consumers) to correlate memory errors with
a real address.
Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
---
Changes in v2:
Refactor the common code
drivers/edac/versalnet_edac.c | 34 ++++++++++++++++------------------
1 file changed, 16 insertions(+), 18 deletions(-)
diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c
index ec1315582414..85d1a829a8aa 100644
--- a/drivers/edac/versalnet_edac.c
+++ b/drivers/edac/versalnet_edac.c
@@ -429,6 +429,7 @@ static unsigned long convert_to_physical(struct mc_priv *priv,
static void handle_error(struct mc_priv *priv, struct ecc_status *stat,
int ctl_num, int *error_data)
{
+ enum hw_event_mc_err_type type;
union ecc_error_info pinf;
struct mem_ctl_info *mci;
unsigned long pa;
@@ -440,29 +441,26 @@ static void handle_error(struct mc_priv *priv, struct ecc_status *stat,
mci = priv->mci[ctl_num];
- if (stat->error_type == MC5_ERR_TYPE_CE) {
+ if (stat->error_type == MC5_ERR_TYPE_UE) {
+ pinf = stat->ueinfo[stat->channel];
+ type = HW_EVENT_ERR_UNCORRECTED;
+ } else {
pinf = stat->ceinfo[stat->channel];
- snprintf(priv->message, sizeof(priv->message),
- "Error type:%s Controller %d Addr at %lx\n",
- "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
-
- edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
- 1, 0, 0, 0, 0, 0, -1,
- priv->message, "");
+ type = HW_EVENT_ERR_CORRECTED;
}
- if (stat->error_type == MC5_ERR_TYPE_UE) {
- pinf = stat->ueinfo[stat->channel];
- snprintf(priv->message, sizeof(priv->message),
- "Error type:%s controller %d Addr at %lx\n",
- "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
+ pa = convert_to_physical(priv, pinf, ctl_num, error_data);
+ pfn = PHYS_PFN(pa);
+ snprintf(priv->message, sizeof(priv->message),
+ "Error type:%s Controller %d Addr at %lx\n",
+ type == HW_EVENT_ERR_UNCORRECTED ? "UE" : "CE",
+ ctl_num, pa);
- edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
- 1, 0, 0, 0, 0, 0, -1,
- priv->message, "");
- pa = convert_to_physical(priv, pinf, ctl_num, error_data);
- pfn = PHYS_PFN(pa);
+ edac_mc_handle_error(type, mci,
+ 1, pfn, offset_in_page(pa), 0, 0, 0, -1,
+ priv->message, "");
+ if (stat->error_type == MC5_ERR_TYPE_UE) {
if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) {
err = memory_failure(pfn, MF_ACTION_REQUIRED);
if (err)
--
2.49.1
Hi Shubhrajyoti,
> From: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
> Sent: Friday, May 8, 2026 6:42 PM
> To: linux-kernel@vger.kernel.org; linux-edac@vger.kernel.org
> Cc: git@amd.com; shubhrajyoti.datta@gmail.com; Borislav Petkov
> <bp@alien8.de>; Luck, Tony <tony.luck@intel.com>; Sai Krishna Potthuri
> <sai.krishna.potthuri@amd.com>; Shubhrajyoti Datta
> <shubhrajyoti.datta@amd.com>
> Subject: [PATCH v2] EDAC/versalnet: Report PFN and page offset for DDR
> errors
>
> Populate the EDAC location fields when reporting DDRMC CE/UE events.
>
> Compute the physical address from the decoded error info and pass the PFN
> along with the offset-within-page to edac_mc_handle_error(). This allows
> user space (mc_event consumers) to correlate memory errors with a real
> address.
>
> Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
> ---
>
> Changes in v2:
> Refactor the common code
>
> drivers/edac/versalnet_edac.c | 34 ++++++++++++++++------------------
> 1 file changed, 16 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c
> index ec1315582414..85d1a829a8aa 100644
> --- a/drivers/edac/versalnet_edac.c
> +++ b/drivers/edac/versalnet_edac.c
> @@ -429,6 +429,7 @@ static unsigned long convert_to_physical(struct
> mc_priv *priv, static void handle_error(struct mc_priv *priv, struct ecc_status
> *stat,
> int ctl_num, int *error_data)
> {
> + enum hw_event_mc_err_type type;
> union ecc_error_info pinf;
> struct mem_ctl_info *mci;
> unsigned long pa;
> @@ -440,29 +441,26 @@ static void handle_error(struct mc_priv *priv, struct
> ecc_status *stat,
>
> mci = priv->mci[ctl_num];
>
> - if (stat->error_type == MC5_ERR_TYPE_CE) {
> + if (stat->error_type == MC5_ERR_TYPE_UE) {
> + pinf = stat->ueinfo[stat->channel];
> + type = HW_EVENT_ERR_UNCORRECTED;
> + } else {
> pinf = stat->ceinfo[stat->channel];
> - snprintf(priv->message, sizeof(priv->message),
> - "Error type:%s Controller %d Addr at %lx\n",
> - "CE", ctl_num, convert_to_physical(priv, pinf,
> ctl_num, error_data));
> -
> - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
> - 1, 0, 0, 0, 0, 0, -1,
> - priv->message, "");
> + type = HW_EVENT_ERR_CORRECTED;
> }
>
> - if (stat->error_type == MC5_ERR_TYPE_UE) {
> - pinf = stat->ueinfo[stat->channel];
> - snprintf(priv->message, sizeof(priv->message),
> - "Error type:%s controller %d Addr at %lx\n",
> - "UE", ctl_num, convert_to_physical(priv, pinf,
> ctl_num, error_data));
> + pa = convert_to_physical(priv, pinf, ctl_num, error_data);
I noticed that the return comment of convert_to_physical() says "physical address of the
DDR memory" [1]. I'm just curious if this refers to:
- the DDR address space (DRAM-only, without MMIO holes), or
- the system physical address space (including MMIO holes)
If it's the former, it may need to convert it to the latter one by properly adding
back the MMIO hole(s), since the OS uses system physical addresses.
[1] https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/edac/versal_edac.c#n363
> + pfn = PHYS_PFN(pa);
[...]
On Fri, May 8, 2026 at 7:00 PM Zhuo, Qiuxu <qiuxu.zhuo@intel.com> wrote:
>
> Hi Shubhrajyoti,
>
> > From: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
> > Sent: Friday, May 8, 2026 6:42 PM
> > To: linux-kernel@vger.kernel.org; linux-edac@vger.kernel.org
> > Cc: git@amd.com; shubhrajyoti.datta@gmail.com; Borislav Petkov
> > <bp@alien8.de>; Luck, Tony <tony.luck@intel.com>; Sai Krishna Potthuri
> > <sai.krishna.potthuri@amd.com>; Shubhrajyoti Datta
> > <shubhrajyoti.datta@amd.com>
> > Subject: [PATCH v2] EDAC/versalnet: Report PFN and page offset for DDR
> > errors
> >
> > Populate the EDAC location fields when reporting DDRMC CE/UE events.
> >
> > Compute the physical address from the decoded error info and pass the PFN
> > along with the offset-within-page to edac_mc_handle_error(). This allows
> > user space (mc_event consumers) to correlate memory errors with a real
> > address.
> >
> > Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
> > ---
> >
> > Changes in v2:
> > Refactor the common code
> >
> > drivers/edac/versalnet_edac.c | 34 ++++++++++++++++------------------
> > 1 file changed, 16 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c
> > index ec1315582414..85d1a829a8aa 100644
> > --- a/drivers/edac/versalnet_edac.c
> > +++ b/drivers/edac/versalnet_edac.c
> > @@ -429,6 +429,7 @@ static unsigned long convert_to_physical(struct
> > mc_priv *priv, static void handle_error(struct mc_priv *priv, struct ecc_status
> > *stat,
> > int ctl_num, int *error_data)
> > {
> > + enum hw_event_mc_err_type type;
> > union ecc_error_info pinf;
> > struct mem_ctl_info *mci;
> > unsigned long pa;
> > @@ -440,29 +441,26 @@ static void handle_error(struct mc_priv *priv, struct
> > ecc_status *stat,
> >
> > mci = priv->mci[ctl_num];
> >
> > - if (stat->error_type == MC5_ERR_TYPE_CE) {
> > + if (stat->error_type == MC5_ERR_TYPE_UE) {
> > + pinf = stat->ueinfo[stat->channel];
> > + type = HW_EVENT_ERR_UNCORRECTED;
> > + } else {
> > pinf = stat->ceinfo[stat->channel];
> > - snprintf(priv->message, sizeof(priv->message),
> > - "Error type:%s Controller %d Addr at %lx\n",
> > - "CE", ctl_num, convert_to_physical(priv, pinf,
> > ctl_num, error_data));
> > -
> > - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
> > - 1, 0, 0, 0, 0, 0, -1,
> > - priv->message, "");
> > + type = HW_EVENT_ERR_CORRECTED;
> > }
> >
> > - if (stat->error_type == MC5_ERR_TYPE_UE) {
> > - pinf = stat->ueinfo[stat->channel];
> > - snprintf(priv->message, sizeof(priv->message),
> > - "Error type:%s controller %d Addr at %lx\n",
> > - "UE", ctl_num, convert_to_physical(priv, pinf,
> > ctl_num, error_data));
> > + pa = convert_to_physical(priv, pinf, ctl_num, error_data);
>
> I noticed that the return comment of convert_to_physical() says "physical address of the
> DDR memory" [1]. I'm just curious if this refers to:
>
> - the DDR address space (DRAM-only, without MMIO holes), or
> - the system physical address space (including MMIO holes)
It is the second one that it returns.
Let me know if the below comment should be updated
- * Returns physical address of the DDR memory
+ * Returns the system physical address corresponding to the reported DDR error.
>
> If it's the former, it may need to convert it to the latter one by properly adding
> back the MMIO hole(s), since the OS uses system physical addresses.
>
> [1] https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/edac/versal_edac.c#n363
>
> > + pfn = PHYS_PFN(pa);
>
> [...]
> From: Shubhrajyoti Datta <shubhrajyoti.datta@gmail.com>
> [...]
> Subject: Re: [PATCH v2] EDAC/versalnet: Report PFN and page offset for DDR
> [...]
> > > - if (stat->error_type == MC5_ERR_TYPE_UE) {
> > > - pinf = stat->ueinfo[stat->channel];
> > > - snprintf(priv->message, sizeof(priv->message),
> > > - "Error type:%s controller %d Addr at %lx\n",
> > > - "UE", ctl_num, convert_to_physical(priv, pinf,
> > > ctl_num, error_data));
> > > + pa = convert_to_physical(priv, pinf, ctl_num, error_data);
> >
> > I noticed that the return comment of convert_to_physical() says
> > "physical address of the DDR memory" [1]. I'm just curious if this refers to:
> >
> > - the DDR address space (DRAM-only, without MMIO holes), or
> > - the system physical address space (including MMIO holes)
>
> It is the second one that it returns.
> Let me know if the below comment should be updated
>
> - * Returns physical address of the DDR memory
> + * Returns the system physical address corresponding to the reported DDR
> error.
>
Yes, please update the return comment accordingly.
Other than that, LGTM. Thanks!
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
> >
> > If it's the former, it may need to convert it to the latter one by
> > properly adding back the MMIO hole(s), since the OS uses system physical
> addresses.
> >
© 2016 - 2026 Red Hat, Inc.