In the current implementation, write failures are not retried on rdevs
with badblocks disabled. This is because narrow_write_error, which issues
retry bios, immediately returns when badblocks are disabled. As a result,
a single write failure on such an rdev will immediately mark it as Faulty.
The retry mechanism appears to have been implemented under the assumption
that a bad block is involved in the failure. However, the retry after an
MD_FAILFAST write failure depends on this code, and a failfast write
request may fail for reasons unrelated to bad blocks.
Consequently, if failfast is enabled and badblocks are disabled on all
rdevs, and all rdevs encounter a failfast write bio failure at the same
time, no retries will occur and the entire array can be lost.
This commit adds a path in narrow_write_error to retry writes even on rdevs
with badblocks disabled; failed bios marked with MD_FAILFAST will use this
path. For non-failfast cases, the behavior remains unchanged: no retry
writes are attempted on rdevs with badblocks disabled.
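
For reference, the pre-patch early return that causes this behaves roughly
as in the following simplified sketch (raid1 shown; raid10 is equivalent):

	static bool narrow_write_error(struct r1bio *r1_bio, int i)
	{
		struct mddev *mddev = r1_bio->mddev;
		struct r1conf *conf = mddev->private;
		struct md_rdev *rdev = conf->mirrors[i].rdev;

		/* badblocks disabled: give up before issuing any retry bio */
		if (rdev->badblocks.shift < 0)
			return false;
		/* ...otherwise split the failed range and resubmit it... */
	}

Since handle_write_finished() calls md_error() whenever this returns false,
a single failed failfast write is enough to fault the rdev.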
Fixes: 1919cbb23bf1 ("md/raid10: add failfast handling for writes.")
Fixes: 212e7eb7a340 ("md/raid1: add failfast handling for writes.")
Signed-off-by: Kenta Akagi <k@mgml.me>
---
drivers/md/raid1.c | 44 +++++++++++++++++++++++++++++---------------
drivers/md/raid10.c | 37 ++++++++++++++++++++++++-------------
2 files changed, 53 insertions(+), 28 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 806f5cb33a8e..55213bcd82f4 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2521,18 +2521,19 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
* narrow_write_error() - Retry write and set badblock
* @r1_bio: the r1bio containing the write error
* @i: which device to retry
+ * @force: Retry writing even if badblocks are disabled
*
* Rewrites the bio, splitting it at the least common multiple of the logical
* block size and the badblock size. Blocks that fail to be written are marked
- * as bad. If badblocks are disabled, no write is attempted and false is
- * returned immediately.
+ * as bad. If badblocks are disabled and @force is not set, no retry is
+ * attempted; if @force is set, the write is retried in the same way.
*
* Return:
* * %true - all blocks were written or marked bad successfully
* * %false - bbl disabled or
* one or more blocks write failed and could not be marked bad
*/
-static bool narrow_write_error(struct r1bio *r1_bio, int i)
+static bool narrow_write_error(struct r1bio *r1_bio, int i, bool force)
{
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
@@ -2553,13 +2554,17 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i)
sector_t sector;
int sectors;
int sect_to_write = r1_bio->sectors;
- bool ok = true;
+ bool write_ok = true;
+ bool setbad_ok = true;
+ bool bbl_enabled = !(rdev->badblocks.shift < 0);
- if (rdev->badblocks.shift < 0)
+ if (!force && !bbl_enabled)
return false;
- block_sectors = roundup(1 << rdev->badblocks.shift,
- bdev_logical_block_size(rdev->bdev) >> 9);
+ block_sectors = bdev_logical_block_size(rdev->bdev) >> 9;
+ if (bbl_enabled)
+ block_sectors = roundup(1 << rdev->badblocks.shift,
+ block_sectors);
sector = r1_bio->sector;
sectors = ((sector + block_sectors)
& ~(sector_t)(block_sectors - 1))
@@ -2587,18 +2592,22 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i)
bio_trim(wbio, sector - r1_bio->sector, sectors);
wbio->bi_iter.bi_sector += rdev->data_offset;
- if (submit_bio_wait(wbio) < 0)
+ if (submit_bio_wait(wbio) < 0) {
/* failure! */
- ok = rdev_set_badblocks(rdev, sector,
- sectors, 0)
- && ok;
+ write_ok = false;
+ if (bbl_enabled)
+ setbad_ok = rdev_set_badblocks(rdev, sector,
+ sectors, 0)
+ && setbad_ok;
+ }
bio_put(wbio);
sect_to_write -= sectors;
sector += sectors;
sectors = block_sectors;
}
- return ok;
+ return (write_ok ||
+ (bbl_enabled && setbad_ok));
}
static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
@@ -2631,18 +2640,23 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
for (m = 0; m < conf->raid_disks * 2 ; m++) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
- if (r1_bio->bios[m] == IO_MADE_GOOD) {
+ struct bio *bio = r1_bio->bios[m];
+
+ if (bio == IO_MADE_GOOD) {
rdev_clear_badblocks(rdev,
r1_bio->sector,
r1_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev);
- } else if (r1_bio->bios[m] != NULL) {
+ } else if (bio != NULL) {
/* This drive got a write error. We need to
* narrow down and record precise write
* errors.
*/
fail = true;
- if (!narrow_write_error(r1_bio, m))
+ if (!narrow_write_error(
+ r1_bio, m,
+ test_bit(FailFast, &rdev->flags) &&
+ (bio->bi_opf & MD_FAILFAST)))
md_error(conf->mddev, rdev);
/* an I/O failed, we can't clear the bitmap */
else if (test_bit(In_sync, &rdev->flags) &&
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 21c2821453e1..92cf3047dce6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2813,18 +2813,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
* narrow_write_error() - Retry write and set badblock
* @r10_bio: the r10bio containing the write error
* @i: which device to retry
+ * @force: Retry writing even if badblocks are disabled
*
* Rewrites the bio, splitting it at the least common multiple of the logical
* block size and the badblock size. Blocks that fail to be written are marked
- * as bad. If badblocks are disabled, no write is attempted and false is
- * returned immediately.
+ * as bad. If badblocks are disabled and @force is not set, no retry is attempted.
*
* Return:
* * %true - all blocks were written or marked bad successfully
* * %false - bbl disabled or
* one or more blocks write failed and could not be marked bad
*/
-static bool narrow_write_error(struct r10bio *r10_bio, int i)
+static bool narrow_write_error(struct r10bio *r10_bio, int i, bool force)
{
struct bio *bio = r10_bio->master_bio;
struct mddev *mddev = r10_bio->mddev;
@@ -2845,13 +2845,17 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i)
sector_t sector;
int sectors;
int sect_to_write = r10_bio->sectors;
- bool ok = true;
+ bool write_ok = true;
+ bool setbad_ok = true;
+ bool bbl_enabled = !(rdev->badblocks.shift < 0);
- if (rdev->badblocks.shift < 0)
+ if (!force && !bbl_enabled)
return false;
- block_sectors = roundup(1 << rdev->badblocks.shift,
- bdev_logical_block_size(rdev->bdev) >> 9);
+ block_sectors = bdev_logical_block_size(rdev->bdev) >> 9;
+ if (bbl_enabled)
+ block_sectors = roundup(1 << rdev->badblocks.shift,
+ block_sectors);
sector = r10_bio->sector;
sectors = ((r10_bio->sector + block_sectors)
& ~(sector_t)(block_sectors - 1))
@@ -2871,18 +2875,22 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i)
choose_data_offset(r10_bio, rdev);
wbio->bi_opf = REQ_OP_WRITE;
- if (submit_bio_wait(wbio) < 0)
+ if (submit_bio_wait(wbio) < 0) {
/* Failure! */
- ok = rdev_set_badblocks(rdev, wsector,
- sectors, 0)
- && ok;
+ write_ok = false;
+ if (bbl_enabled)
+ setbad_ok = rdev_set_badblocks(rdev, wsector,
+ sectors, 0)
+ && setbad_ok;
+ }
bio_put(wbio);
sect_to_write -= sectors;
sector += sectors;
sectors = block_sectors;
}
- return ok;
+ return (write_ok ||
+ (bbl_enabled && setbad_ok));
}
static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
@@ -2988,7 +2996,10 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev_dec_pending(rdev, conf->mddev);
} else if (bio != NULL && bio->bi_status) {
fail = true;
- if (!narrow_write_error(r10_bio, m))
+ if (!narrow_write_error(
+ r10_bio, m,
+ test_bit(FailFast, &rdev->flags) &&
+ (bio->bi_opf & MD_FAILFAST)))
md_error(conf->mddev, rdev);
else if (test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
--
2.50.1
On 2025/9/15 11:42, Kenta Akagi wrote:
> In the current implementation, write failures are not retried on rdevs
> with badblocks disabled. This is because narrow_write_error, which issues
> retry bios, immediately returns when badblocks are disabled. As a result,
> a single write failure on such an rdev will immediately mark it as Faulty.

IMO, there's no need to add extra logic for scenarios where badblocks
is not enabled. Do you have real-world scenarios where badblocks is
disabled?

> [...]

--
Thanks,
Nan
On 2025/09/17 19:06, Li Nan wrote:
> On 2025/9/15 11:42, Kenta Akagi wrote:
>> In the current implementation, write failures are not retried on rdevs
>> with badblocks disabled. This is because narrow_write_error, which issues
>> retry bios, immediately returns when badblocks are disabled. As a result,
>> a single write failure on such an rdev will immediately mark it as Faulty.
>
> IMO, there's no need to add extra logic for scenarios where badblocks
> is not enabled. Do you have real-world scenarios where badblocks is
> disabled?

No, badblocks are enabled in my environment.
I'm fine if it's not added, but I still think it's worth adding a WARN_ON
in handle_write_finished() like:

 			fail = true;
+			WARN_ON((bio->bi_opf & MD_FAILFAST) && (rdev->badblocks.shift < 0));
 			if (!narrow_write_error(r1_bio, m))

What do you think?

Thanks,
Akagi

>> [...]
>
> --
> Thanks,
> Nan
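
In full context, the suggested check would sit in the error branch of
handle_write_finished() shown in the patch above; roughly (the WARN_ON
line is the proposed addition, everything else is the existing loop):

	} else if (bio != NULL) {
		/* This drive got a write error. */
		fail = true;
		/* proposed: flag a failfast failure that cannot be retried */
		WARN_ON((bio->bi_opf & MD_FAILFAST) &&
			(rdev->badblocks.shift < 0));
		if (!narrow_write_error(r1_bio, m))
			md_error(conf->mddev, rdev);
		/* ... */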
On 2025/9/17 21:33, Kenta Akagi wrote:
> On 2025/09/17 19:06, Li Nan wrote:
>> IMO, there's no need to add extra logic for scenarios where badblocks
>> is not enabled. Do you have real-world scenarios where badblocks is
>> disabled?
>
> No, badblocks are enabled in my environment.
> I'm fine if it's not added, but I still think it's worth adding a WARN_ON
> in handle_write_finished() like:
>
> [...]
>
> What do you think?

How about this?

--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2522,10 +2522,11 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i)
 	bool ok = true;

 	if (rdev->badblocks.shift < 0)
-		return false;
+		block_sectors = bdev_logical_block_size(rdev->bdev) >> 9;
+	else
+		block_sectors = roundup(1 << rdev->badblocks.shift,
+					bdev_logical_block_size(rdev->bdev) >> 9);

-	block_sectors = roundup(1 << rdev->badblocks.shift,
-				bdev_logical_block_size(rdev->bdev) >> 9);
 	sector = r1_bio->sector;
 	sectors = ((sector + block_sectors)
 		   & ~(sector_t)(block_sectors - 1))

rdev_set_badblocks() checks shift, too. rdev is marked Faulty if setting
badblocks fails.

--
Thanks,
Nan
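
Under that change, the per-chunk retry flow would reduce to roughly the
following sketch (reusing the names from the existing function):

	/* retry in logical-block-sized chunks; round up to the badblock
	 * granularity only when a bad block log actually exists */
	block_sectors = bdev_logical_block_size(rdev->bdev) >> 9;
	if (rdev->badblocks.shift >= 0)
		block_sectors = roundup(1 << rdev->badblocks.shift,
					block_sectors);
	...
	if (submit_bio_wait(wbio) < 0)
		/*
		 * Retry failed. rdev_set_badblocks() rejects the request
		 * when badblocks are disabled, so "ok" goes false and the
		 * caller still escalates to md_error().
		 */
		ok = rdev_set_badblocks(rdev, sector, sectors, 0) && ok;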
On 2025/09/18 15:58, Li Nan wrote:
> How about this?
>
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> [...]
>
> rdev_set_badblocks() checks shift, too. rdev is marked Faulty if setting
> badblocks fails.

Sounds good. If badblocks are disabled, rdev_set_badblocks() returns false,
so there should be no problem.

Can this be included in the cleanup?
https://lore.kernel.org/linux-raid/20250917093508.456790-3-linan666@huaweicloud.com/T/#u

or should I resend this patch as proposed?

Thanks,
Akagi
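
The guard being relied on here lives in block/badblocks.c; schematically,
and assuming the current bool-returning badblocks_set():

	/* badblocks_set(), abridged */
	if (bb->shift < 0)
		/* badblocks disabled: record nothing, caller sees failure */
		return false;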
On 2025/9/19 0:23, Kenta Akagi wrote:
> Sounds good. If badblocks are disabled, rdev_set_badblocks() returns false,
> so there should be no problem.
>
> Can this be included in the cleanup?
> https://lore.kernel.org/linux-raid/20250917093508.456790-3-linan666@huaweicloud.com/T/#u
>
> or should I resend this patch as proposed?

Yes, please resend. I will rewrite my patch.

--
Thanks,
Nan