[PATCH] mtd: nand: brcmnand: fix mtd corrected bits stat

David Regan posted 1 patch 8 months, 2 weeks ago
There is a newer version of this series
drivers/mtd/nand/raw/brcmnand/brcmnand.c | 48 ++++++++++++++++++------
1 file changed, 37 insertions(+), 11 deletions(-)
[PATCH] mtd: nand: brcmnand: fix mtd corrected bits stat
Posted by David Regan 8 months, 2 weeks ago
Currently we attempt to get the amount of flipped bits from a hardware
location which is reset on every subpage. Instead obtain total flipped
bits stat from hardware accumulator. In addition identify the correct
maximum subpage corrected bits.

Signed-off-by: David Regan <dregan@broadcom.com>
Reviewed-by: William Zhang <william.zhang@broadcom.com>
Reviewed-by: Kamal Dasu <kamal.dasu@broadcom.com>
---
 drivers/mtd/nand/raw/brcmnand/brcmnand.c | 48 ++++++++++++++++++------
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
index 62bdda3be92f..43ab4aedda55 100644
--- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
+++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
@@ -361,6 +361,7 @@ enum brcmnand_reg {
 	BRCMNAND_CORR_COUNT,
 	BRCMNAND_CORR_EXT_ADDR,
 	BRCMNAND_CORR_ADDR,
+	BRCMNAND_READ_ERROR_COUNT,
 	BRCMNAND_UNCORR_EXT_ADDR,
 	BRCMNAND_UNCORR_ADDR,
 	BRCMNAND_SEMAPHORE,
@@ -389,6 +390,7 @@ static const u16 brcmnand_regs_v21[] = {
 	[BRCMNAND_CORR_THRESHOLD_EXT]	=     0,
 	[BRCMNAND_UNCORR_COUNT]		=     0,
 	[BRCMNAND_CORR_COUNT]		=     0,
+	[BRCMNAND_READ_ERROR_COUNT]	=     0,
 	[BRCMNAND_CORR_EXT_ADDR]	=  0x60,
 	[BRCMNAND_CORR_ADDR]		=  0x64,
 	[BRCMNAND_UNCORR_EXT_ADDR]	=  0x68,
@@ -419,6 +421,7 @@ static const u16 brcmnand_regs_v33[] = {
 	[BRCMNAND_CORR_THRESHOLD_EXT]	=     0,
 	[BRCMNAND_UNCORR_COUNT]		=     0,
 	[BRCMNAND_CORR_COUNT]		=     0,
+	[BRCMNAND_READ_ERROR_COUNT]	=     0,
 	[BRCMNAND_CORR_EXT_ADDR]	=  0x70,
 	[BRCMNAND_CORR_ADDR]		=  0x74,
 	[BRCMNAND_UNCORR_EXT_ADDR]	=  0x78,
@@ -449,6 +452,7 @@ static const u16 brcmnand_regs_v50[] = {
 	[BRCMNAND_CORR_THRESHOLD_EXT]	=     0,
 	[BRCMNAND_UNCORR_COUNT]		=     0,
 	[BRCMNAND_CORR_COUNT]		=     0,
+	[BRCMNAND_READ_ERROR_COUNT]	=     0,
 	[BRCMNAND_CORR_EXT_ADDR]	=  0x70,
 	[BRCMNAND_CORR_ADDR]		=  0x74,
 	[BRCMNAND_UNCORR_EXT_ADDR]	=  0x78,
@@ -479,6 +483,7 @@ static const u16 brcmnand_regs_v60[] = {
 	[BRCMNAND_CORR_THRESHOLD_EXT]	=  0xc4,
 	[BRCMNAND_UNCORR_COUNT]		=  0xfc,
 	[BRCMNAND_CORR_COUNT]		= 0x100,
+	[BRCMNAND_READ_ERROR_COUNT]	= 0x104,
 	[BRCMNAND_CORR_EXT_ADDR]	= 0x10c,
 	[BRCMNAND_CORR_ADDR]		= 0x110,
 	[BRCMNAND_UNCORR_EXT_ADDR]	= 0x114,
@@ -509,6 +514,7 @@ static const u16 brcmnand_regs_v71[] = {
 	[BRCMNAND_CORR_THRESHOLD_EXT]	=  0xe0,
 	[BRCMNAND_UNCORR_COUNT]		=  0xfc,
 	[BRCMNAND_CORR_COUNT]		= 0x100,
+	[BRCMNAND_READ_ERROR_COUNT]	= 0x104,
 	[BRCMNAND_CORR_EXT_ADDR]	= 0x10c,
 	[BRCMNAND_CORR_ADDR]		= 0x110,
 	[BRCMNAND_UNCORR_EXT_ADDR]	= 0x114,
@@ -539,6 +545,7 @@ static const u16 brcmnand_regs_v72[] = {
 	[BRCMNAND_CORR_THRESHOLD_EXT]	=  0xe0,
 	[BRCMNAND_UNCORR_COUNT]		=  0xfc,
 	[BRCMNAND_CORR_COUNT]		= 0x100,
+	[BRCMNAND_READ_ERROR_COUNT]	= 0x104,
 	[BRCMNAND_CORR_EXT_ADDR]	= 0x10c,
 	[BRCMNAND_CORR_ADDR]		= 0x110,
 	[BRCMNAND_UNCORR_EXT_ADDR]	= 0x114,
@@ -966,6 +973,13 @@ static inline u32 brcmnand_count_corrected(struct brcmnand_controller *ctrl)
 	return brcmnand_read_reg(ctrl, BRCMNAND_CORR_COUNT);
 }
 
+static inline u32 brcmnand_corr_total(struct brcmnand_controller *ctrl)
+{
+	if (ctrl->nand_version < 0x0600)
+		return 0;
+	return brcmnand_read_reg(ctrl, BRCMNAND_READ_ERROR_COUNT);
+}
+
 static void brcmnand_wr_corr_thresh(struct brcmnand_host *host, u8 val)
 {
 	struct brcmnand_controller *ctrl = host->ctrl;
@@ -2066,12 +2080,15 @@ static int brcmnand_dma_trans(struct brcmnand_host *host, u64 addr, u32 *buf,
  */
 static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
 				u64 addr, unsigned int trans, u32 *buf,
-				u8 *oob, u64 *err_addr)
+				u8 *oob, u64 *err_addr, unsigned int *corr)
 {
 	struct brcmnand_host *host = nand_get_controller_data(chip);
 	struct brcmnand_controller *ctrl = host->ctrl;
 	int i, ret = 0;
 
+	if (corr)
+		*corr = 0;
+
 	brcmnand_clear_ecc_addr(ctrl);
 
 	for (i = 0; i < trans; i++, addr += FC_BYTES) {
@@ -2099,13 +2116,16 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
 
 			if (*err_addr)
 				ret = -EBADMSG;
-		}
+			else {
+				*err_addr = brcmnand_get_correcc_addr(ctrl);
 
-		if (!ret) {
-			*err_addr = brcmnand_get_correcc_addr(ctrl);
+				if (*err_addr) {
+					ret = -EUCLEAN;
 
-			if (*err_addr)
-				ret = -EUCLEAN;
+					if (corr && brcmnand_count_corrected(ctrl) > *corr)
+						*corr = brcmnand_count_corrected(ctrl);
+				}
+			}
 		}
 	}
 
@@ -2173,6 +2193,8 @@ static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip,
 	int err;
 	bool retry = true;
 	bool edu_err = false;
+	unsigned int corr; /* max corrected bits per subpage */
+	bool dma = 0;
 
 	dev_dbg(ctrl->dev, "read %llx -> %p\n", (unsigned long long)addr, buf);
 
@@ -2195,14 +2217,17 @@ static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip,
 		if (has_edu(ctrl) && err_addr)
 			edu_err = true;
 
+		dma = 1;
 	} else {
 		if (oob)
 			memset(oob, 0x99, mtd->oobsize);
 
 		err = brcmnand_read_by_pio(mtd, chip, addr, trans, buf,
-					       oob, &err_addr);
+					   oob, &err_addr, &corr);
 	}
 
+	mtd->ecc_stats.corrected = brcmnand_corr_total(ctrl);
+
 	if (mtd_is_eccerr(err)) {
 		/*
 		 * On controller version and 7.0, 7.1 , DMA read after a
@@ -2240,18 +2265,19 @@ static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip,
 	}
 
 	if (mtd_is_bitflip(err)) {
-		unsigned int corrected = brcmnand_count_corrected(ctrl);
+		if (dma)
+			corr = brcmnand_count_corrected(ctrl);
 
 		/* in case of EDU correctable error we read again using PIO */
 		if (edu_err)
 			err = brcmnand_read_by_pio(mtd, chip, addr, trans, buf,
-						   oob, &err_addr);
+						   oob, &err_addr, &corr);
 
 		dev_dbg(ctrl->dev, "corrected error at 0x%llx\n",
 			(unsigned long long)err_addr);
-		mtd->ecc_stats.corrected += corrected;
+
 		/* Always exceed the software-imposed threshold */
-		return max(mtd->bitflip_threshold, corrected);
+		return max(mtd->bitflip_threshold, corr);
 	}
 
 	return 0;
-- 
2.43.5
Re: [PATCH] mtd: nand: brcmnand: fix mtd corrected bits stat
Posted by Miquel Raynal 8 months, 1 week ago
On 29/05/2025 at 20:46:59 -07, David Regan <dregan@broadcom.com> wrote:

> Currently we attempt to get the amount of flipped bits from a hardware
> location which is reset on every subpage. Instead obtain total flipped
> bits stat from hardware accumulator. In addition identify the correct
> maximum subpage corrected bits.

This change does not feel correct. We gather two statistics:
- the maximum number of bitflips corrected in a single ECC chunk
- the total number of bitflips among the whole page

The most important one is the former, because it may trigger wear
levelling from the top layer (UBI, usually). It feels like you are
breaking this, am I wrong? Would you mind to be more explicit otherwise?

Thanks,
Miquèl
Re: [PATCH] mtd: nand: brcmnand: fix mtd corrected bits stat
Posted by David Regan 8 months, 1 week ago
Hi Miquèl,

On Mon, Jun 2, 2025 at 7:19 AM Miquel Raynal <miquel.raynal@bootlin.com> wrote:
>
> On 29/05/2025 at 20:46:59 -07, David Regan <dregan@broadcom.com> wrote:
>
> > Currently we attempt to get the amount of flipped bits from a hardware
> > location which is reset on every subpage. Instead obtain total flipped
> > bits stat from hardware accumulator. In addition identify the correct
> > maximum subpage corrected bits.
>
> This change does not feel correct. We gather two statistics:
> - the maximum number of bitflips corrected in a single ECC chunk
> - the total number of bitflips among the whole page
>
> The most important one is the former, because it may trigger wear
> levelling from the top layer (UBI, usually). It feels like you are
> breaking this, am I wrong? Would you mind to be more explicit otherwise?

Detecting the maximum number of bitflips corrected in a single subpage
(ECC chunk) doesn't work correctly, however the code will still
trigger an EUCLEAN since we return max(mtd->bitflip_threshold,
corrected) . My fix is to accurately report the maximum number of bit
flips per subpage, in addition to counting the total bit flips per
whole page to update the mtd corrected bits stat.

>
> Thanks,
> Miquèl

-Dave
Re: [PATCH] mtd: nand: brcmnand: fix mtd corrected bits stat
Posted by Jonas Gorski 8 months, 2 weeks ago
Hi,

On Fri, May 30, 2025 at 5:48 AM David Regan <dregan@broadcom.com> wrote:
>
> Currently we attempt to get the amount of flipped bits from a hardware
> location which is reset on every subpage. Instead obtain total flipped
> bits stat from hardware accumulator. In addition identify the correct
> maximum subpage corrected bits.
>
> Signed-off-by: David Regan <dregan@broadcom.com>
> Reviewed-by: William Zhang <william.zhang@broadcom.com>
> Reviewed-by: Kamal Dasu <kamal.dasu@broadcom.com>
> ---
>  drivers/mtd/nand/raw/brcmnand/brcmnand.c | 48 ++++++++++++++++++------
>  1 file changed, 37 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> index 62bdda3be92f..43ab4aedda55 100644
> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> @@ -361,6 +361,7 @@ enum brcmnand_reg {
>         BRCMNAND_CORR_COUNT,
>         BRCMNAND_CORR_EXT_ADDR,
>         BRCMNAND_CORR_ADDR,
> +       BRCMNAND_READ_ERROR_COUNT,
>         BRCMNAND_UNCORR_EXT_ADDR,
>         BRCMNAND_UNCORR_ADDR,
>         BRCMNAND_SEMAPHORE,
> @@ -389,6 +390,7 @@ static const u16 brcmnand_regs_v21[] = {
>         [BRCMNAND_CORR_THRESHOLD_EXT]   =     0,
>         [BRCMNAND_UNCORR_COUNT]         =     0,
>         [BRCMNAND_CORR_COUNT]           =     0,
> +       [BRCMNAND_READ_ERROR_COUNT]     =     0,
>         [BRCMNAND_CORR_EXT_ADDR]        =  0x60,
>         [BRCMNAND_CORR_ADDR]            =  0x64,
>         [BRCMNAND_UNCORR_EXT_ADDR]      =  0x68,
> @@ -419,6 +421,7 @@ static const u16 brcmnand_regs_v33[] = {
>         [BRCMNAND_CORR_THRESHOLD_EXT]   =     0,
>         [BRCMNAND_UNCORR_COUNT]         =     0,
>         [BRCMNAND_CORR_COUNT]           =     0,
> +       [BRCMNAND_READ_ERROR_COUNT]     =     0,
>         [BRCMNAND_CORR_EXT_ADDR]        =  0x70,
>         [BRCMNAND_CORR_ADDR]            =  0x74,
>         [BRCMNAND_UNCORR_EXT_ADDR]      =  0x78,
> @@ -449,6 +452,7 @@ static const u16 brcmnand_regs_v50[] = {
>         [BRCMNAND_CORR_THRESHOLD_EXT]   =     0,
>         [BRCMNAND_UNCORR_COUNT]         =     0,
>         [BRCMNAND_CORR_COUNT]           =     0,
> +       [BRCMNAND_READ_ERROR_COUNT]     =     0,

I see this register in BCM63268's NAND controller at 0x80, which is a
v4.x one, so I'm surprised v5.0 doesn't have it. Or does it not work
there? I don't know if v3.3 also has it or if using this on v4.x would
require a separate brcmnand_regs entry.

Can't really comment on the remaining changes.

Regards,
Jonas
Re: [PATCH] mtd: nand: brcmnand: fix mtd corrected bits stat
Posted by David Regan 8 months, 1 week ago
Hi Jonas,

On Fri, May 30, 2025 at 1:27 AM Jonas Gorski <jonas.gorski@gmail.com> wrote:
>
> Hi,
>
> On Fri, May 30, 2025 at 5:48 AM David Regan <dregan@broadcom.com> wrote:
> >
> > Currently we attempt to get the amount of flipped bits from a hardware
> > location which is reset on every subpage. Instead obtain total flipped
> > bits stat from hardware accumulator. In addition identify the correct
> > maximum subpage corrected bits.
> >
> > Signed-off-by: David Regan <dregan@broadcom.com>
> > Reviewed-by: William Zhang <william.zhang@broadcom.com>
> > Reviewed-by: Kamal Dasu <kamal.dasu@broadcom.com>
> > ---
> >  drivers/mtd/nand/raw/brcmnand/brcmnand.c | 48 ++++++++++++++++++------
> >  1 file changed, 37 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> > index 62bdda3be92f..43ab4aedda55 100644
> > --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> > +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> > @@ -361,6 +361,7 @@ enum brcmnand_reg {
> >         BRCMNAND_CORR_COUNT,
> >         BRCMNAND_CORR_EXT_ADDR,
> >         BRCMNAND_CORR_ADDR,
> > +       BRCMNAND_READ_ERROR_COUNT,
> >         BRCMNAND_UNCORR_EXT_ADDR,
> >         BRCMNAND_UNCORR_ADDR,
> >         BRCMNAND_SEMAPHORE,
> > @@ -389,6 +390,7 @@ static const u16 brcmnand_regs_v21[] = {
> >         [BRCMNAND_CORR_THRESHOLD_EXT]   =     0,
> >         [BRCMNAND_UNCORR_COUNT]         =     0,
> >         [BRCMNAND_CORR_COUNT]           =     0,
> > +       [BRCMNAND_READ_ERROR_COUNT]     =     0,
> >         [BRCMNAND_CORR_EXT_ADDR]        =  0x60,
> >         [BRCMNAND_CORR_ADDR]            =  0x64,
> >         [BRCMNAND_UNCORR_EXT_ADDR]      =  0x68,
> > @@ -419,6 +421,7 @@ static const u16 brcmnand_regs_v33[] = {
> >         [BRCMNAND_CORR_THRESHOLD_EXT]   =     0,
> >         [BRCMNAND_UNCORR_COUNT]         =     0,
> >         [BRCMNAND_CORR_COUNT]           =     0,
> > +       [BRCMNAND_READ_ERROR_COUNT]     =     0,
> >         [BRCMNAND_CORR_EXT_ADDR]        =  0x70,
> >         [BRCMNAND_CORR_ADDR]            =  0x74,
> >         [BRCMNAND_UNCORR_EXT_ADDR]      =  0x78,
> > @@ -449,6 +452,7 @@ static const u16 brcmnand_regs_v50[] = {
> >         [BRCMNAND_CORR_THRESHOLD_EXT]   =     0,
> >         [BRCMNAND_UNCORR_COUNT]         =     0,
> >         [BRCMNAND_CORR_COUNT]           =     0,
> > +       [BRCMNAND_READ_ERROR_COUNT]     =     0,
>
> I see this register in BCM63268's NAND controller at 0x80, which is a
> v4.x one, so I'm surprised v5.0 doesn't have it. Or does it not work
> there? I don't know if v3.3 also has it or if using this on v4.x would
> require a separate brcmnand_regs entry.

Thank you for pointing this out, I did just verify the register at offset
0x80 does appear to work correctly on my 63268. I'll put some update
in the next patch revision to reflect this.

>
> Can't really comment on the remaining changes.
>
> Regards,
> Jonas

-Dave