Currently, the LastDev flag is set on an rdev that failed a failfast
metadata write and called md_error, but did not become Faulty. It is
cleared when the metadata write retry succeeds. This has problems for
the following reasons:
* Despite its name, the flag is only set during a metadata write window.
* Unlike when LastDev and Failfast was introduced, md_error on the last
rdev of a RAID1/10 array now sets MD_BROKEN. Thus when LastDev is set,
the array is already unwritable.
A following commit will prevent failfast bios from breaking the array,
which requires knowing from outside the personality whether an rdev is
the last one. For that purpose, LastDev should be set on rdevs that must
not be lost.
This commit ensures that LastDev is set on the indispensable rdev in a
degraded RAID1/10 array.
Signed-off-by: Kenta Akagi <k@mgml.me>
---
drivers/md/md.c | 4 +---
drivers/md/md.h | 6 +++---
drivers/md/raid1.c | 34 +++++++++++++++++++++++++++++++++-
drivers/md/raid10.c | 34 +++++++++++++++++++++++++++++++++-
4 files changed, 70 insertions(+), 8 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4e033c26fdd4..268410b66b83 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1007,10 +1007,8 @@ static void super_written(struct bio *bio)
if (!test_bit(Faulty, &rdev->flags)
&& (bio->bi_opf & MD_FAILFAST)) {
set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
- set_bit(LastDev, &rdev->flags);
}
- } else
- clear_bit(LastDev, &rdev->flags);
+ }
bio_put(bio);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 51af29a03079..ec598f9a8381 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -281,9 +281,9 @@ enum flag_bits {
* It is expects that no bad block log
* is present.
*/
- LastDev, /* Seems to be the last working dev as
- * it didn't fail, so don't use FailFast
- * any more for metadata
+ LastDev, /* This is the last working rdev.
+ * so don't use FailFast any more for
+ * metadata.
*/
CollisionCheck, /*
* check if there is collision between raid1
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index bf44878ec640..32ad6b102ff7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1733,6 +1733,33 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, "]");
}
+/**
+ * update_lastdev - Set or clear LastDev flag for all rdevs in array
+ * @conf: pointer to r1conf
+ *
+ * Sets LastDev if the device is In_sync and cannot be lost for the array.
+ * Otherwise, clear it.
+ *
+ * Caller must hold ->device_lock.
+ */
+static void update_lastdev(struct r1conf *conf)
+{
+ int i;
+ int alive_disks = conf->raid_disks - conf->mddev->degraded;
+
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+
+ if (rdev) {
+ if (test_bit(In_sync, &rdev->flags) &&
+ alive_disks == 1)
+ set_bit(LastDev, &rdev->flags);
+ else
+ clear_bit(LastDev, &rdev->flags);
+ }
+ }
+}
+
/**
* raid1_error() - RAID1 error handler.
* @mddev: affected md device.
@@ -1767,8 +1794,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
}
}
set_bit(Blocked, &rdev->flags);
- if (test_and_clear_bit(In_sync, &rdev->flags))
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
mddev->degraded++;
+ update_lastdev(conf);
+ }
set_bit(Faulty, &rdev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
/*
@@ -1864,6 +1893,7 @@ static int raid1_spare_active(struct mddev *mddev)
}
}
mddev->degraded -= count;
+ update_lastdev(conf);
spin_unlock_irqrestore(&conf->device_lock, flags);
print_conf(conf);
@@ -3290,6 +3320,7 @@ static int raid1_run(struct mddev *mddev)
rcu_assign_pointer(conf->thread, NULL);
mddev->private = conf;
set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
+ update_lastdev(conf);
md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
@@ -3427,6 +3458,7 @@ static int raid1_reshape(struct mddev *mddev)
spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded += (raid_disks - conf->raid_disks);
+ update_lastdev(conf);
spin_unlock_irqrestore(&conf->device_lock, flags);
conf->raid_disks = mddev->raid_disks = raid_disks;
mddev->delta_disks = 0;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b60c30bfb6c7..dc4edd4689f8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1983,6 +1983,33 @@ static int enough(struct r10conf *conf, int ignore)
_enough(conf, 1, ignore);
}
+/**
+ * update_lastdev - Set or clear LastDev flag for all rdevs in array
+ * @conf: pointer to r10conf
+ *
+ * Sets LastDev if the device is In_sync and cannot be lost for the array.
+ * Otherwise, clear it.
+ *
+ * Caller must hold ->reconfig_mutex or ->device_lock.
+ */
+static void update_lastdev(struct r10conf *conf)
+{
+ int i;
+ int raid_disks = max(conf->geo.raid_disks, conf->prev.raid_disks);
+
+ for (i = 0; i < raid_disks; i++) {
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+
+ if (rdev) {
+ if (test_bit(In_sync, &rdev->flags) &&
+ !enough(conf, i))
+ set_bit(LastDev, &rdev->flags);
+ else
+ clear_bit(LastDev, &rdev->flags);
+ }
+ }
+}
+
/**
* raid10_error() - RAID10 error handler.
* @mddev: affected md device.
@@ -2013,8 +2040,10 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
return;
}
}
- if (test_and_clear_bit(In_sync, &rdev->flags))
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
mddev->degraded++;
+ update_lastdev(conf);
+ }
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(Blocked, &rdev->flags);
@@ -2102,6 +2131,7 @@ static int raid10_spare_active(struct mddev *mddev)
}
spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded -= count;
+ update_lastdev(conf);
spin_unlock_irqrestore(&conf->device_lock, flags);
print_conf(conf);
@@ -4159,6 +4189,7 @@ static int raid10_run(struct mddev *mddev)
md_set_array_sectors(mddev, size);
mddev->resync_max_sectors = size;
set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
+ update_lastdev(conf);
if (md_integrity_register(mddev))
goto out_free_conf;
@@ -4567,6 +4598,7 @@ static int raid10_start_reshape(struct mddev *mddev)
*/
spin_lock_irq(&conf->device_lock);
mddev->degraded = calc_degraded(conf);
+ update_lastdev(conf);
spin_unlock_irq(&conf->device_lock);
mddev->raid_disks = conf->geo.raid_disks;
mddev->reshape_position = conf->reshape_progress;
--
2.50.1
Hi Kenta On Mon, Sep 15, 2025 at 11:44 AM Kenta Akagi <k@mgml.me> wrote: > > Currently, the LastDev flag is set on an rdev that failed a failfast > metadata write and called md_error, but did not become Faulty. It is > cleared when the metadata write retry succeeds. This has problems for > the following reasons: > > * Despite its name, the flag is only set during a metadata write window. > * Unlike when LastDev and Failfast was introduced, md_error on the last > rdev of a RAID1/10 array now sets MD_BROKEN. Thus when LastDev is set, > the array is already unwritable. > > A following commit will prevent failfast bios from breaking the array, > which requires knowing from outside the personality whether an rdev is > the last one. For that purpose, LastDev should be set on rdevs that must > not be lost. > > This commit ensures that LastDev is set on the indispensable rdev in a > degraded RAID1/10 array. > > Signed-off-by: Kenta Akagi <k@mgml.me> > --- > drivers/md/md.c | 4 +--- > drivers/md/md.h | 6 +++--- > drivers/md/raid1.c | 34 +++++++++++++++++++++++++++++++++- > drivers/md/raid10.c | 34 +++++++++++++++++++++++++++++++++- > 4 files changed, 70 insertions(+), 8 deletions(-) > > diff --git a/drivers/md/md.c b/drivers/md/md.c > index 4e033c26fdd4..268410b66b83 100644 > --- a/drivers/md/md.c > +++ b/drivers/md/md.c > @@ -1007,10 +1007,8 @@ static void super_written(struct bio *bio) > if (!test_bit(Faulty, &rdev->flags) > && (bio->bi_opf & MD_FAILFAST)) { > set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); > - set_bit(LastDev, &rdev->flags); > } > - } else > - clear_bit(LastDev, &rdev->flags); > + } > > bio_put(bio); > > diff --git a/drivers/md/md.h b/drivers/md/md.h > index 51af29a03079..ec598f9a8381 100644 > --- a/drivers/md/md.h > +++ b/drivers/md/md.h > @@ -281,9 +281,9 @@ enum flag_bits { > * It is expects that no bad block log > * is present. > */ > - LastDev, /* Seems to be the last working dev as > - * it didn't fail, so don't use FailFast > - * any more for metadata > + LastDev, /* This is the last working rdev. > + * so don't use FailFast any more for > + * metadata. > */ > CollisionCheck, /* > * check if there is collision between raid1 > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c > index bf44878ec640..32ad6b102ff7 100644 > --- a/drivers/md/raid1.c > +++ b/drivers/md/raid1.c > @@ -1733,6 +1733,33 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) > seq_printf(seq, "]"); > } > > +/** > + * update_lastdev - Set or clear LastDev flag for all rdevs in array > + * @conf: pointer to r1conf > + * > + * Sets LastDev if the device is In_sync and cannot be lost for the array. > + * Otherwise, clear it. > + * > + * Caller must hold ->device_lock. > + */ > +static void update_lastdev(struct r1conf *conf) > +{ > + int i; > + int alive_disks = conf->raid_disks - conf->mddev->degraded; > + > + for (i = 0; i < conf->raid_disks; i++) { > + struct md_rdev *rdev = conf->mirrors[i].rdev; > + > + if (rdev) { > + if (test_bit(In_sync, &rdev->flags) && > + alive_disks == 1) > + set_bit(LastDev, &rdev->flags); > + else > + clear_bit(LastDev, &rdev->flags); > + } > + } > +} > + > /** > * raid1_error() - RAID1 error handler. > * @mddev: affected md device. > @@ -1767,8 +1794,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) > } > } > set_bit(Blocked, &rdev->flags); > - if (test_and_clear_bit(In_sync, &rdev->flags)) > + if (test_and_clear_bit(In_sync, &rdev->flags)) { > mddev->degraded++; > + update_lastdev(conf); > + } > set_bit(Faulty, &rdev->flags); > spin_unlock_irqrestore(&conf->device_lock, flags); > /* > @@ -1864,6 +1893,7 @@ static int raid1_spare_active(struct mddev *mddev) > } > } > mddev->degraded -= count; > + update_lastdev(conf); update_lastdev is called in raid1_spare_active, raid1_run and raid1_reshape. Could you explain the reason why it needs to call this function? Is it the reason you want to clear LastDev flag? If so, is it a right place to do it? As your commit message says, it will be cleared after retry metadata successfully. In raid1, is it the right place that fixes read/write successfully? Best Regards Xiao > spin_unlock_irqrestore(&conf->device_lock, flags); > > print_conf(conf); > @@ -3290,6 +3320,7 @@ static int raid1_run(struct mddev *mddev) > rcu_assign_pointer(conf->thread, NULL); > mddev->private = conf; > set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); > + update_lastdev(conf); > > md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); > > @@ -3427,6 +3458,7 @@ static int raid1_reshape(struct mddev *mddev) > > spin_lock_irqsave(&conf->device_lock, flags); > mddev->degraded += (raid_disks - conf->raid_disks); > + update_lastdev(conf); > spin_unlock_irqrestore(&conf->device_lock, flags); > conf->raid_disks = mddev->raid_disks = raid_disks; > mddev->delta_disks = 0; > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c > index b60c30bfb6c7..dc4edd4689f8 100644 > --- a/drivers/md/raid10.c > +++ b/drivers/md/raid10.c > @@ -1983,6 +1983,33 @@ static int enough(struct r10conf *conf, int ignore) > _enough(conf, 1, ignore); > } > > +/** > + * update_lastdev - Set or clear LastDev flag for all rdevs in array > + * @conf: pointer to r10conf > + * > + * Sets LastDev if the device is In_sync and cannot be lost for the array. > + * Otherwise, clear it. > + * > + * Caller must hold ->reconfig_mutex or ->device_lock. > + */ > +static void update_lastdev(struct r10conf *conf) > +{ > + int i; > + int raid_disks = max(conf->geo.raid_disks, conf->prev.raid_disks); > + > + for (i = 0; i < raid_disks; i++) { > + struct md_rdev *rdev = conf->mirrors[i].rdev; > + > + if (rdev) { > + if (test_bit(In_sync, &rdev->flags) && > + !enough(conf, i)) > + set_bit(LastDev, &rdev->flags); > + else > + clear_bit(LastDev, &rdev->flags); > + } > + } > +} > + > /** > * raid10_error() - RAID10 error handler. > * @mddev: affected md device. > @@ -2013,8 +2040,10 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) > return; > } > } > - if (test_and_clear_bit(In_sync, &rdev->flags)) > + if (test_and_clear_bit(In_sync, &rdev->flags)) { > mddev->degraded++; > + update_lastdev(conf); > + } > > set_bit(MD_RECOVERY_INTR, &mddev->recovery); > set_bit(Blocked, &rdev->flags); > @@ -2102,6 +2131,7 @@ static int raid10_spare_active(struct mddev *mddev) > } > spin_lock_irqsave(&conf->device_lock, flags); > mddev->degraded -= count; > + update_lastdev(conf); > spin_unlock_irqrestore(&conf->device_lock, flags); > > print_conf(conf); > @@ -4159,6 +4189,7 @@ static int raid10_run(struct mddev *mddev) > md_set_array_sectors(mddev, size); > mddev->resync_max_sectors = size; > set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); > + update_lastdev(conf); > > if (md_integrity_register(mddev)) > goto out_free_conf; > @@ -4567,6 +4598,7 @@ static int raid10_start_reshape(struct mddev *mddev) > */ > spin_lock_irq(&conf->device_lock); > mddev->degraded = calc_degraded(conf); > + update_lastdev(conf); > spin_unlock_irq(&conf->device_lock); > mddev->raid_disks = conf->geo.raid_disks; > mddev->reshape_position = conf->reshape_progress; > -- > 2.50.1 > >
Hi, Thank you for review. On 2025/09/21 16:54, Xiao Ni wrote: > Hi Kenta > > On Mon, Sep 15, 2025 at 11:44 AM Kenta Akagi <k@mgml.me> wrote: >> >> Currently, the LastDev flag is set on an rdev that failed a failfast >> metadata write and called md_error, but did not become Faulty. It is >> cleared when the metadata write retry succeeds. This has problems for >> the following reasons: >> >> * Despite its name, the flag is only set during a metadata write window. >> * Unlike when LastDev and Failfast was introduced, md_error on the last >> rdev of a RAID1/10 array now sets MD_BROKEN. Thus when LastDev is set, >> the array is already unwritable. >> >> A following commit will prevent failfast bios from breaking the array, >> which requires knowing from outside the personality whether an rdev is >> the last one. For that purpose, LastDev should be set on rdevs that must >> not be lost. >> >> This commit ensures that LastDev is set on the indispensable rdev in a >> degraded RAID1/10 array. >> >> Signed-off-by: Kenta Akagi <k@mgml.me> >> --- >> drivers/md/md.c | 4 +--- >> drivers/md/md.h | 6 +++--- >> drivers/md/raid1.c | 34 +++++++++++++++++++++++++++++++++- >> drivers/md/raid10.c | 34 +++++++++++++++++++++++++++++++++- >> 4 files changed, 70 insertions(+), 8 deletions(-) >> >> diff --git a/drivers/md/md.c b/drivers/md/md.c >> index 4e033c26fdd4..268410b66b83 100644 >> --- a/drivers/md/md.c >> +++ b/drivers/md/md.c >> @@ -1007,10 +1007,8 @@ static void super_written(struct bio *bio) >> if (!test_bit(Faulty, &rdev->flags) >> && (bio->bi_opf & MD_FAILFAST)) { >> set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); >> - set_bit(LastDev, &rdev->flags); >> } >> - } else >> - clear_bit(LastDev, &rdev->flags); >> + } >> >> bio_put(bio); >> >> diff --git a/drivers/md/md.h b/drivers/md/md.h >> index 51af29a03079..ec598f9a8381 100644 >> --- a/drivers/md/md.h >> +++ b/drivers/md/md.h >> @@ -281,9 +281,9 @@ enum flag_bits { >> * It is expects that no bad block log >> * is present. >> */ >> - LastDev, /* Seems to be the last working dev as >> - * it didn't fail, so don't use FailFast >> - * any more for metadata >> + LastDev, /* This is the last working rdev. >> + * so don't use FailFast any more for >> + * metadata. >> */ >> CollisionCheck, /* >> * check if there is collision between raid1 >> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c >> index bf44878ec640..32ad6b102ff7 100644 >> --- a/drivers/md/raid1.c >> +++ b/drivers/md/raid1.c >> @@ -1733,6 +1733,33 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) >> seq_printf(seq, "]"); >> } >> >> +/** >> + * update_lastdev - Set or clear LastDev flag for all rdevs in array >> + * @conf: pointer to r1conf >> + * >> + * Sets LastDev if the device is In_sync and cannot be lost for the array. >> + * Otherwise, clear it. >> + * >> + * Caller must hold ->device_lock. >> + */ >> +static void update_lastdev(struct r1conf *conf) >> +{ >> + int i; >> + int alive_disks = conf->raid_disks - conf->mddev->degraded; >> + >> + for (i = 0; i < conf->raid_disks; i++) { >> + struct md_rdev *rdev = conf->mirrors[i].rdev; >> + >> + if (rdev) { >> + if (test_bit(In_sync, &rdev->flags) && >> + alive_disks == 1) >> + set_bit(LastDev, &rdev->flags); >> + else >> + clear_bit(LastDev, &rdev->flags); >> + } >> + } >> +} >> + >> /** >> * raid1_error() - RAID1 error handler. >> * @mddev: affected md device. >> @@ -1767,8 +1794,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) >> } >> } >> set_bit(Blocked, &rdev->flags); >> - if (test_and_clear_bit(In_sync, &rdev->flags)) >> + if (test_and_clear_bit(In_sync, &rdev->flags)) { >> mddev->degraded++; >> + update_lastdev(conf); >> + } >> set_bit(Faulty, &rdev->flags); >> spin_unlock_irqrestore(&conf->device_lock, flags); >> /* >> @@ -1864,6 +1893,7 @@ static int raid1_spare_active(struct mddev *mddev) >> } >> } >> mddev->degraded -= count; >> + update_lastdev(conf); > > update_lastdev is called in raid1_spare_active, raid1_run and > raid1_reshape. Could you explain the reason why it needs to call this > function? Is it the reason you want to clear LastDev flag? If so, is > it a right place to do it? As your commit message says, it will be > cleared after retry metadata successfully. In raid1, is it the right > place that fixes read/write successfully? The intention was to set LastDev only when the rdev is the last device in the array. As suggested in review, I will check whether it is the last device when md_error is called, instead of using a flag. As a result, update_lastdev() will be removed, and the purpose and behavior of LastDev will remain unchanged in v5. Thanks, Akagi > > Best Regards > Xiao > >> spin_unlock_irqrestore(&conf->device_lock, flags); >> >> print_conf(conf); >> @@ -3290,6 +3320,7 @@ static int raid1_run(struct mddev *mddev) >> rcu_assign_pointer(conf->thread, NULL); >> mddev->private = conf; >> set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); >> + update_lastdev(conf); >> >> md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); >> >> @@ -3427,6 +3458,7 @@ static int raid1_reshape(struct mddev *mddev) >> >> spin_lock_irqsave(&conf->device_lock, flags); >> mddev->degraded += (raid_disks - conf->raid_disks); >> + update_lastdev(conf); >> spin_unlock_irqrestore(&conf->device_lock, flags); >> conf->raid_disks = mddev->raid_disks = raid_disks; >> mddev->delta_disks = 0; >> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c >> index b60c30bfb6c7..dc4edd4689f8 100644 >> --- a/drivers/md/raid10.c >> +++ b/drivers/md/raid10.c >> @@ -1983,6 +1983,33 @@ static int enough(struct r10conf *conf, int ignore) >> _enough(conf, 1, ignore); >> } >> >> +/** >> + * update_lastdev - Set or clear LastDev flag for all rdevs in array >> + * @conf: pointer to r10conf >> + * >> + * Sets LastDev if the device is In_sync and cannot be lost for the array. >> + * Otherwise, clear it. >> + * >> + * Caller must hold ->reconfig_mutex or ->device_lock. >> + */ >> +static void update_lastdev(struct r10conf *conf) >> +{ >> + int i; >> + int raid_disks = max(conf->geo.raid_disks, conf->prev.raid_disks); >> + >> + for (i = 0; i < raid_disks; i++) { >> + struct md_rdev *rdev = conf->mirrors[i].rdev; >> + >> + if (rdev) { >> + if (test_bit(In_sync, &rdev->flags) && >> + !enough(conf, i)) >> + set_bit(LastDev, &rdev->flags); >> + else >> + clear_bit(LastDev, &rdev->flags); >> + } >> + } >> +} >> + >> /** >> * raid10_error() - RAID10 error handler. >> * @mddev: affected md device. >> @@ -2013,8 +2040,10 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) >> return; >> } >> } >> - if (test_and_clear_bit(In_sync, &rdev->flags)) >> + if (test_and_clear_bit(In_sync, &rdev->flags)) { >> mddev->degraded++; >> + update_lastdev(conf); >> + } >> >> set_bit(MD_RECOVERY_INTR, &mddev->recovery); >> set_bit(Blocked, &rdev->flags); >> @@ -2102,6 +2131,7 @@ static int raid10_spare_active(struct mddev *mddev) >> } >> spin_lock_irqsave(&conf->device_lock, flags); >> mddev->degraded -= count; >> + update_lastdev(conf); >> spin_unlock_irqrestore(&conf->device_lock, flags); >> >> print_conf(conf); >> @@ -4159,6 +4189,7 @@ static int raid10_run(struct mddev *mddev) >> md_set_array_sectors(mddev, size); >> mddev->resync_max_sectors = size; >> set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); >> + update_lastdev(conf); >> >> if (md_integrity_register(mddev)) >> goto out_free_conf; >> @@ -4567,6 +4598,7 @@ static int raid10_start_reshape(struct mddev *mddev) >> */ >> spin_lock_irq(&conf->device_lock); >> mddev->degraded = calc_degraded(conf); >> + update_lastdev(conf); >> spin_unlock_irq(&conf->device_lock); >> mddev->raid_disks = conf->geo.raid_disks; >> mddev->reshape_position = conf->reshape_progress; >> -- >> 2.50.1 >> >> > >
Hi, 在 2025/09/15 11:42, Kenta Akagi 写道: > Currently, the LastDev flag is set on an rdev that failed a failfast > metadata write and called md_error, but did not become Faulty. It is > cleared when the metadata write retry succeeds. This has problems for > the following reasons: > > * Despite its name, the flag is only set during a metadata write window. > * Unlike when LastDev and Failfast was introduced, md_error on the last > rdev of a RAID1/10 array now sets MD_BROKEN. Thus when LastDev is set, > the array is already unwritable. > > A following commit will prevent failfast bios from breaking the array, > which requires knowing from outside the personality whether an rdev is > the last one. For that purpose, LastDev should be set on rdevs that must > not be lost. > > This commit ensures that LastDev is set on the indispensable rdev in a > degraded RAID1/10 array. > > Signed-off-by: Kenta Akagi <k@mgml.me> > --- > drivers/md/md.c | 4 +--- > drivers/md/md.h | 6 +++--- > drivers/md/raid1.c | 34 +++++++++++++++++++++++++++++++++- > drivers/md/raid10.c | 34 +++++++++++++++++++++++++++++++++- > 4 files changed, 70 insertions(+), 8 deletions(-) > After md_error() is serialized, why not check if rdev is the last rdev from md_bio_failure_error() in patch 3 directly? I think it is cleaner and code changes should be much less to add a new method like pers->lastdev. Thanks, Kuai > diff --git a/drivers/md/md.c b/drivers/md/md.c > index 4e033c26fdd4..268410b66b83 100644 > --- a/drivers/md/md.c > +++ b/drivers/md/md.c > @@ -1007,10 +1007,8 @@ static void super_written(struct bio *bio) > if (!test_bit(Faulty, &rdev->flags) > && (bio->bi_opf & MD_FAILFAST)) { > set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); > - set_bit(LastDev, &rdev->flags); > } > - } else > - clear_bit(LastDev, &rdev->flags); > + } > > bio_put(bio); > > diff --git a/drivers/md/md.h b/drivers/md/md.h > index 51af29a03079..ec598f9a8381 100644 > --- a/drivers/md/md.h > +++ b/drivers/md/md.h > @@ -281,9 +281,9 @@ enum flag_bits { > * It is expects that no bad block log > * is present. > */ > - LastDev, /* Seems to be the last working dev as > - * it didn't fail, so don't use FailFast > - * any more for metadata > + LastDev, /* This is the last working rdev. > + * so don't use FailFast any more for > + * metadata. > */ > CollisionCheck, /* > * check if there is collision between raid1 > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c > index bf44878ec640..32ad6b102ff7 100644 > --- a/drivers/md/raid1.c > +++ b/drivers/md/raid1.c > @@ -1733,6 +1733,33 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) > seq_printf(seq, "]"); > } > > +/** > + * update_lastdev - Set or clear LastDev flag for all rdevs in array > + * @conf: pointer to r1conf > + * > + * Sets LastDev if the device is In_sync and cannot be lost for the array. > + * Otherwise, clear it. > + * > + * Caller must hold ->device_lock. > + */ > +static void update_lastdev(struct r1conf *conf) > +{ > + int i; > + int alive_disks = conf->raid_disks - conf->mddev->degraded; > + > + for (i = 0; i < conf->raid_disks; i++) { > + struct md_rdev *rdev = conf->mirrors[i].rdev; > + > + if (rdev) { > + if (test_bit(In_sync, &rdev->flags) && > + alive_disks == 1) > + set_bit(LastDev, &rdev->flags); > + else > + clear_bit(LastDev, &rdev->flags); > + } > + } > +} > + > /** > * raid1_error() - RAID1 error handler. > * @mddev: affected md device. > @@ -1767,8 +1794,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) > } > } > set_bit(Blocked, &rdev->flags); > - if (test_and_clear_bit(In_sync, &rdev->flags)) > + if (test_and_clear_bit(In_sync, &rdev->flags)) { > mddev->degraded++; > + update_lastdev(conf); > + } > set_bit(Faulty, &rdev->flags); > spin_unlock_irqrestore(&conf->device_lock, flags); > /* > @@ -1864,6 +1893,7 @@ static int raid1_spare_active(struct mddev *mddev) > } > } > mddev->degraded -= count; > + update_lastdev(conf); > spin_unlock_irqrestore(&conf->device_lock, flags); > > print_conf(conf); > @@ -3290,6 +3320,7 @@ static int raid1_run(struct mddev *mddev) > rcu_assign_pointer(conf->thread, NULL); > mddev->private = conf; > set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); > + update_lastdev(conf); > > md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); > > @@ -3427,6 +3458,7 @@ static int raid1_reshape(struct mddev *mddev) > > spin_lock_irqsave(&conf->device_lock, flags); > mddev->degraded += (raid_disks - conf->raid_disks); > + update_lastdev(conf); > spin_unlock_irqrestore(&conf->device_lock, flags); > conf->raid_disks = mddev->raid_disks = raid_disks; > mddev->delta_disks = 0; > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c > index b60c30bfb6c7..dc4edd4689f8 100644 > --- a/drivers/md/raid10.c > +++ b/drivers/md/raid10.c > @@ -1983,6 +1983,33 @@ static int enough(struct r10conf *conf, int ignore) > _enough(conf, 1, ignore); > } > > +/** > + * update_lastdev - Set or clear LastDev flag for all rdevs in array > + * @conf: pointer to r10conf > + * > + * Sets LastDev if the device is In_sync and cannot be lost for the array. > + * Otherwise, clear it. > + * > + * Caller must hold ->reconfig_mutex or ->device_lock. > + */ > +static void update_lastdev(struct r10conf *conf) > +{ > + int i; > + int raid_disks = max(conf->geo.raid_disks, conf->prev.raid_disks); > + > + for (i = 0; i < raid_disks; i++) { > + struct md_rdev *rdev = conf->mirrors[i].rdev; > + > + if (rdev) { > + if (test_bit(In_sync, &rdev->flags) && > + !enough(conf, i)) > + set_bit(LastDev, &rdev->flags); > + else > + clear_bit(LastDev, &rdev->flags); > + } > + } > +} > + > /** > * raid10_error() - RAID10 error handler. > * @mddev: affected md device. > @@ -2013,8 +2040,10 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) > return; > } > } > - if (test_and_clear_bit(In_sync, &rdev->flags)) > + if (test_and_clear_bit(In_sync, &rdev->flags)) { > mddev->degraded++; > + update_lastdev(conf); > + } > > set_bit(MD_RECOVERY_INTR, &mddev->recovery); > set_bit(Blocked, &rdev->flags); > @@ -2102,6 +2131,7 @@ static int raid10_spare_active(struct mddev *mddev) > } > spin_lock_irqsave(&conf->device_lock, flags); > mddev->degraded -= count; > + update_lastdev(conf); > spin_unlock_irqrestore(&conf->device_lock, flags); > > print_conf(conf); > @@ -4159,6 +4189,7 @@ static int raid10_run(struct mddev *mddev) > md_set_array_sectors(mddev, size); > mddev->resync_max_sectors = size; > set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); > + update_lastdev(conf); > > if (md_integrity_register(mddev)) > goto out_free_conf; > @@ -4567,6 +4598,7 @@ static int raid10_start_reshape(struct mddev *mddev) > */ > spin_lock_irq(&conf->device_lock); > mddev->degraded = calc_degraded(conf); > + update_lastdev(conf); > spin_unlock_irq(&conf->device_lock); > mddev->raid_disks = conf->geo.raid_disks; > mddev->reshape_position = conf->reshape_progress; >
Hi, Thank you for reviewing. On 2025/09/18 10:00, Yu Kuai wrote: > Hi, > > 在 2025/09/15 11:42, Kenta Akagi 写道: >> Currently, the LastDev flag is set on an rdev that failed a failfast >> metadata write and called md_error, but did not become Faulty. It is >> cleared when the metadata write retry succeeds. This has problems for >> the following reasons: >> >> * Despite its name, the flag is only set during a metadata write window. >> * Unlike when LastDev and Failfast was introduced, md_error on the last >> rdev of a RAID1/10 array now sets MD_BROKEN. Thus when LastDev is set, >> the array is already unwritable. >> >> A following commit will prevent failfast bios from breaking the array, >> which requires knowing from outside the personality whether an rdev is >> the last one. For that purpose, LastDev should be set on rdevs that must >> not be lost. >> >> This commit ensures that LastDev is set on the indispensable rdev in a >> degraded RAID1/10 array. >> >> Signed-off-by: Kenta Akagi <k@mgml.me> >> --- >> drivers/md/md.c | 4 +--- >> drivers/md/md.h | 6 +++--- >> drivers/md/raid1.c | 34 +++++++++++++++++++++++++++++++++- >> drivers/md/raid10.c | 34 +++++++++++++++++++++++++++++++++- >> 4 files changed, 70 insertions(+), 8 deletions(-) >> > After md_error() is serialized, why not check if rdev is the last rdev > from md_bio_failure_error() in patch 3 directly? I think it is cleaner > and code changes should be much less to add a new method like > pers->lastdev. That makes sense. I'll proceed that way. Thanks, Akagi > > Thanks, > Kuai > >> diff --git a/drivers/md/md.c b/drivers/md/md.c >> index 4e033c26fdd4..268410b66b83 100644 >> --- a/drivers/md/md.c >> +++ b/drivers/md/md.c >> @@ -1007,10 +1007,8 @@ static void super_written(struct bio *bio) >> if (!test_bit(Faulty, &rdev->flags) >> && (bio->bi_opf & MD_FAILFAST)) { >> set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); >> - set_bit(LastDev, &rdev->flags); >> } >> - } else >> - clear_bit(LastDev, &rdev->flags); >> + } >> bio_put(bio); >> diff --git a/drivers/md/md.h b/drivers/md/md.h >> index 51af29a03079..ec598f9a8381 100644 >> --- a/drivers/md/md.h >> +++ b/drivers/md/md.h >> @@ -281,9 +281,9 @@ enum flag_bits { >> * It is expects that no bad block log >> * is present. >> */ >> - LastDev, /* Seems to be the last working dev as >> - * it didn't fail, so don't use FailFast >> - * any more for metadata >> + LastDev, /* This is the last working rdev. >> + * so don't use FailFast any more for >> + * metadata. >> */ >> CollisionCheck, /* >> * check if there is collision between raid1 >> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c >> index bf44878ec640..32ad6b102ff7 100644 >> --- a/drivers/md/raid1.c >> +++ b/drivers/md/raid1.c >> @@ -1733,6 +1733,33 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) >> seq_printf(seq, "]"); >> } >> +/** >> + * update_lastdev - Set or clear LastDev flag for all rdevs in array >> + * @conf: pointer to r1conf >> + * >> + * Sets LastDev if the device is In_sync and cannot be lost for the array. >> + * Otherwise, clear it. >> + * >> + * Caller must hold ->device_lock. >> + */ >> +static void update_lastdev(struct r1conf *conf) >> +{ >> + int i; >> + int alive_disks = conf->raid_disks - conf->mddev->degraded; >> + >> + for (i = 0; i < conf->raid_disks; i++) { >> + struct md_rdev *rdev = conf->mirrors[i].rdev; >> + >> + if (rdev) { >> + if (test_bit(In_sync, &rdev->flags) && >> + alive_disks == 1) >> + set_bit(LastDev, &rdev->flags); >> + else >> + clear_bit(LastDev, &rdev->flags); >> + } >> + } >> +} >> + >> /** >> * raid1_error() - RAID1 error handler. >> * @mddev: affected md device. >> @@ -1767,8 +1794,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) >> } >> } >> set_bit(Blocked, &rdev->flags); >> - if (test_and_clear_bit(In_sync, &rdev->flags)) >> + if (test_and_clear_bit(In_sync, &rdev->flags)) { >> mddev->degraded++; >> + update_lastdev(conf); >> + } >> set_bit(Faulty, &rdev->flags); >> spin_unlock_irqrestore(&conf->device_lock, flags); >> /* >> @@ -1864,6 +1893,7 @@ static int raid1_spare_active(struct mddev *mddev) >> } >> } >> mddev->degraded -= count; >> + update_lastdev(conf); >> spin_unlock_irqrestore(&conf->device_lock, flags); >> print_conf(conf); >> @@ -3290,6 +3320,7 @@ static int raid1_run(struct mddev *mddev) >> rcu_assign_pointer(conf->thread, NULL); >> mddev->private = conf; >> set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); >> + update_lastdev(conf); >> md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); >> @@ -3427,6 +3458,7 @@ static int raid1_reshape(struct mddev *mddev) >> spin_lock_irqsave(&conf->device_lock, flags); >> mddev->degraded += (raid_disks - conf->raid_disks); >> + update_lastdev(conf); >> spin_unlock_irqrestore(&conf->device_lock, flags); >> conf->raid_disks = mddev->raid_disks = raid_disks; >> mddev->delta_disks = 0; >> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c >> index b60c30bfb6c7..dc4edd4689f8 100644 >> --- a/drivers/md/raid10.c >> +++ b/drivers/md/raid10.c >> @@ -1983,6 +1983,33 @@ static int enough(struct r10conf *conf, int ignore) >> _enough(conf, 1, ignore); >> } >> +/** >> + * update_lastdev - Set or clear LastDev flag for all rdevs in array >> + * @conf: pointer to r10conf >> + * >> + * Sets LastDev if the device is In_sync and cannot be lost for the array. >> + * Otherwise, clear it. >> + * >> + * Caller must hold ->reconfig_mutex or ->device_lock. >> + */ >> +static void update_lastdev(struct r10conf *conf) >> +{ >> + int i; >> + int raid_disks = max(conf->geo.raid_disks, conf->prev.raid_disks); >> + >> + for (i = 0; i < raid_disks; i++) { >> + struct md_rdev *rdev = conf->mirrors[i].rdev; >> + >> + if (rdev) { >> + if (test_bit(In_sync, &rdev->flags) && >> + !enough(conf, i)) >> + set_bit(LastDev, &rdev->flags); >> + else >> + clear_bit(LastDev, &rdev->flags); >> + } >> + } >> +} >> + >> /** >> * raid10_error() - RAID10 error handler. >> * @mddev: affected md device. >> @@ -2013,8 +2040,10 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) >> return; >> } >> } >> - if (test_and_clear_bit(In_sync, &rdev->flags)) >> + if (test_and_clear_bit(In_sync, &rdev->flags)) { >> mddev->degraded++; >> + update_lastdev(conf); >> + } >> set_bit(MD_RECOVERY_INTR, &mddev->recovery); >> set_bit(Blocked, &rdev->flags); >> @@ -2102,6 +2131,7 @@ static int raid10_spare_active(struct mddev *mddev) >> } >> spin_lock_irqsave(&conf->device_lock, flags); >> mddev->degraded -= count; >> + update_lastdev(conf); >> spin_unlock_irqrestore(&conf->device_lock, flags); >> print_conf(conf); >> @@ -4159,6 +4189,7 @@ static int raid10_run(struct mddev *mddev) >> md_set_array_sectors(mddev, size); >> mddev->resync_max_sectors = size; >> set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); >> + update_lastdev(conf); >> if (md_integrity_register(mddev)) >> goto out_free_conf; >> @@ -4567,6 +4598,7 @@ static int raid10_start_reshape(struct mddev *mddev) >> */ >> spin_lock_irq(&conf->device_lock); >> mddev->degraded = calc_degraded(conf); >> + update_lastdev(conf); >> spin_unlock_irq(&conf->device_lock); >> mddev->raid_disks = conf->geo.raid_disks; >> mddev->reshape_position = conf->reshape_progress; >> > >
© 2016 - 2025 Red Hat, Inc.