[v4] Don't set MD_BROKEN on failfast bio failure

[PATCH v4 1/9] md/raid1,raid10: Set the LastDev flag when the configuration changes

Posted by Kenta Akagi 4 months, 3 weeks ago

Currently, the LastDev flag is set on an rdev that failed a failfast
metadata write and called md_error, but did not become Faulty. It is
cleared when the metadata write retry succeeds. This has problems for
the following reasons:

* Despite its name, the flag is only set during a metadata write window.
* Unlike when LastDev and Failfast was introduced, md_error on the last
  rdev of a RAID1/10 array now sets MD_BROKEN. Thus when LastDev is set,
  the array is already unwritable.

A following commit will prevent failfast bios from breaking the array,
which requires knowing from outside the personality whether an rdev is
the last one. For that purpose, LastDev should be set on rdevs that must
not be lost.

This commit ensures that LastDev is set on the indispensable rdev in a
degraded RAID1/10 array.

Signed-off-by: Kenta Akagi <k@mgml.me>
---
 drivers/md/md.c     |  4 +---
 drivers/md/md.h     |  6 +++---
 drivers/md/raid1.c  | 34 +++++++++++++++++++++++++++++++++-
 drivers/md/raid10.c | 34 +++++++++++++++++++++++++++++++++-
 4 files changed, 70 insertions(+), 8 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4e033c26fdd4..268410b66b83 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1007,10 +1007,8 @@ static void super_written(struct bio *bio)
 		if (!test_bit(Faulty, &rdev->flags)
 		    && (bio->bi_opf & MD_FAILFAST)) {
 			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
-			set_bit(LastDev, &rdev->flags);
 		}
-	} else
-		clear_bit(LastDev, &rdev->flags);
+	}
 
 	bio_put(bio);
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 51af29a03079..ec598f9a8381 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -281,9 +281,9 @@ enum flag_bits {
 				 * It is expects that no bad block log
 				 * is present.
 				 */
-	LastDev,		/* Seems to be the last working dev as
-				 * it didn't fail, so don't use FailFast
-				 * any more for metadata
+	LastDev,		/* This is the last working rdev.
+				 * so don't use FailFast any more for
+				 * metadata.
 				 */
 	CollisionCheck,		/*
 				 * check if there is collision between raid1
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index bf44878ec640..32ad6b102ff7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1733,6 +1733,33 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
 	seq_printf(seq, "]");
 }
 
+/**
+ * update_lastdev - Set or clear LastDev flag for all rdevs in array
+ * @conf: pointer to r1conf
+ *
+ * Sets LastDev if the device is In_sync and cannot be lost for the array.
+ * Otherwise, clear it.
+ *
+ * Caller must hold ->device_lock.
+ */
+static void update_lastdev(struct r1conf *conf)
+{
+	int i;
+	int alive_disks = conf->raid_disks - conf->mddev->degraded;
+
+	for (i = 0; i < conf->raid_disks; i++) {
+		struct md_rdev *rdev = conf->mirrors[i].rdev;
+
+		if (rdev) {
+			if (test_bit(In_sync, &rdev->flags) &&
+			    alive_disks == 1)
+				set_bit(LastDev, &rdev->flags);
+			else
+				clear_bit(LastDev, &rdev->flags);
+		}
+	}
+}
+
 /**
  * raid1_error() - RAID1 error handler.
  * @mddev: affected md device.
@@ -1767,8 +1794,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
 		}
 	}
 	set_bit(Blocked, &rdev->flags);
-	if (test_and_clear_bit(In_sync, &rdev->flags))
+	if (test_and_clear_bit(In_sync, &rdev->flags)) {
 		mddev->degraded++;
+		update_lastdev(conf);
+	}
 	set_bit(Faulty, &rdev->flags);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 	/*
@@ -1864,6 +1893,7 @@ static int raid1_spare_active(struct mddev *mddev)
 		}
 	}
 	mddev->degraded -= count;
+	update_lastdev(conf);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	print_conf(conf);
@@ -3290,6 +3320,7 @@ static int raid1_run(struct mddev *mddev)
 	rcu_assign_pointer(conf->thread, NULL);
 	mddev->private = conf;
 	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
+	update_lastdev(conf);
 
 	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
 
@@ -3427,6 +3458,7 @@ static int raid1_reshape(struct mddev *mddev)
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	mddev->degraded += (raid_disks - conf->raid_disks);
+	update_lastdev(conf);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 	mddev->delta_disks = 0;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b60c30bfb6c7..dc4edd4689f8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1983,6 +1983,33 @@ static int enough(struct r10conf *conf, int ignore)
 		_enough(conf, 1, ignore);
 }
 
+/**
+ * update_lastdev - Set or clear LastDev flag for all rdevs in array
+ * @conf: pointer to r10conf
+ *
+ * Sets LastDev if the device is In_sync and cannot be lost for the array.
+ * Otherwise, clear it.
+ *
+ * Caller must hold ->reconfig_mutex or ->device_lock.
+ */
+static void update_lastdev(struct r10conf *conf)
+{
+	int i;
+	int raid_disks = max(conf->geo.raid_disks, conf->prev.raid_disks);
+
+	for (i = 0; i < raid_disks; i++) {
+		struct md_rdev *rdev = conf->mirrors[i].rdev;
+
+		if (rdev) {
+			if (test_bit(In_sync, &rdev->flags) &&
+			    !enough(conf, i))
+				set_bit(LastDev, &rdev->flags);
+			else
+				clear_bit(LastDev, &rdev->flags);
+		}
+	}
+}
+
 /**
  * raid10_error() - RAID10 error handler.
  * @mddev: affected md device.
@@ -2013,8 +2040,10 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
 			return;
 		}
 	}
-	if (test_and_clear_bit(In_sync, &rdev->flags))
+	if (test_and_clear_bit(In_sync, &rdev->flags)) {
 		mddev->degraded++;
+		update_lastdev(conf);
+	}
 
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(Blocked, &rdev->flags);
@@ -2102,6 +2131,7 @@ static int raid10_spare_active(struct mddev *mddev)
 	}
 	spin_lock_irqsave(&conf->device_lock, flags);
 	mddev->degraded -= count;
+	update_lastdev(conf);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	print_conf(conf);
@@ -4159,6 +4189,7 @@ static int raid10_run(struct mddev *mddev)
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
 	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
+	update_lastdev(conf);
 
 	if (md_integrity_register(mddev))
 		goto out_free_conf;
@@ -4567,6 +4598,7 @@ static int raid10_start_reshape(struct mddev *mddev)
 	 */
 	spin_lock_irq(&conf->device_lock);
 	mddev->degraded = calc_degraded(conf);
+	update_lastdev(conf);
 	spin_unlock_irq(&conf->device_lock);
 	mddev->raid_disks = conf->geo.raid_disks;
 	mddev->reshape_position = conf->reshape_progress;
-- 
2.50.1

Re: [PATCH v4 1/9] md/raid1,raid10: Set the LastDev flag when the configuration changes

Posted by Xiao Ni 4 months, 3 weeks ago

Hi Kenta

On Mon, Sep 15, 2025 at 11:44 AM Kenta Akagi <k@mgml.me> wrote:
>
> Currently, the LastDev flag is set on an rdev that failed a failfast
> metadata write and called md_error, but did not become Faulty. It is
> cleared when the metadata write retry succeeds. This has problems for
> the following reasons:
>
> * Despite its name, the flag is only set during a metadata write window.
> * Unlike when LastDev and Failfast was introduced, md_error on the last
>   rdev of a RAID1/10 array now sets MD_BROKEN. Thus when LastDev is set,
>   the array is already unwritable.
>
> A following commit will prevent failfast bios from breaking the array,
> which requires knowing from outside the personality whether an rdev is
> the last one. For that purpose, LastDev should be set on rdevs that must
> not be lost.
>
> This commit ensures that LastDev is set on the indispensable rdev in a
> degraded RAID1/10 array.
>
> Signed-off-by: Kenta Akagi <k@mgml.me>
> ---
>  drivers/md/md.c     |  4 +---
>  drivers/md/md.h     |  6 +++---
>  drivers/md/raid1.c  | 34 +++++++++++++++++++++++++++++++++-
>  drivers/md/raid10.c | 34 +++++++++++++++++++++++++++++++++-
>  4 files changed, 70 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 4e033c26fdd4..268410b66b83 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -1007,10 +1007,8 @@ static void super_written(struct bio *bio)
>                 if (!test_bit(Faulty, &rdev->flags)
>                     && (bio->bi_opf & MD_FAILFAST)) {
>                         set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
> -                       set_bit(LastDev, &rdev->flags);
>                 }
> -       } else
> -               clear_bit(LastDev, &rdev->flags);
> +       }
>
>         bio_put(bio);
>
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 51af29a03079..ec598f9a8381 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -281,9 +281,9 @@ enum flag_bits {
>                                  * It is expects that no bad block log
>                                  * is present.
>                                  */
> -       LastDev,                /* Seems to be the last working dev as
> -                                * it didn't fail, so don't use FailFast
> -                                * any more for metadata
> +       LastDev,                /* This is the last working rdev.
> +                                * so don't use FailFast any more for
> +                                * metadata.
>                                  */
>         CollisionCheck,         /*
>                                  * check if there is collision between raid1
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index bf44878ec640..32ad6b102ff7 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1733,6 +1733,33 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
>         seq_printf(seq, "]");
>  }
>
> +/**
> + * update_lastdev - Set or clear LastDev flag for all rdevs in array
> + * @conf: pointer to r1conf
> + *
> + * Sets LastDev if the device is In_sync and cannot be lost for the array.
> + * Otherwise, clear it.
> + *
> + * Caller must hold ->device_lock.
> + */
> +static void update_lastdev(struct r1conf *conf)
> +{
> +       int i;
> +       int alive_disks = conf->raid_disks - conf->mddev->degraded;
> +
> +       for (i = 0; i < conf->raid_disks; i++) {
> +               struct md_rdev *rdev = conf->mirrors[i].rdev;
> +
> +               if (rdev) {
> +                       if (test_bit(In_sync, &rdev->flags) &&
> +                           alive_disks == 1)
> +                               set_bit(LastDev, &rdev->flags);
> +                       else
> +                               clear_bit(LastDev, &rdev->flags);
> +               }
> +       }
> +}
> +
>  /**
>   * raid1_error() - RAID1 error handler.
>   * @mddev: affected md device.
> @@ -1767,8 +1794,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
>                 }
>         }
>         set_bit(Blocked, &rdev->flags);
> -       if (test_and_clear_bit(In_sync, &rdev->flags))
> +       if (test_and_clear_bit(In_sync, &rdev->flags)) {
>                 mddev->degraded++;
> +               update_lastdev(conf);
> +       }
>         set_bit(Faulty, &rdev->flags);
>         spin_unlock_irqrestore(&conf->device_lock, flags);
>         /*
> @@ -1864,6 +1893,7 @@ static int raid1_spare_active(struct mddev *mddev)
>                 }
>         }
>         mddev->degraded -= count;
> +       update_lastdev(conf);

update_lastdev is called in raid1_spare_active, raid1_run and
raid1_reshape. Could you explain the reason why it needs to call this
function? Is it the reason you want to clear LastDev flag? If so, is
it a right place to do it? As your commit message says, it will be
cleared after retry metadata successfully. In raid1, is it the right
place that fixes read/write successfully?

Best Regards
Xiao

>         spin_unlock_irqrestore(&conf->device_lock, flags);
>
>         print_conf(conf);
> @@ -3290,6 +3320,7 @@ static int raid1_run(struct mddev *mddev)
>         rcu_assign_pointer(conf->thread, NULL);
>         mddev->private = conf;
>         set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
> +       update_lastdev(conf);
>
>         md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
>
> @@ -3427,6 +3458,7 @@ static int raid1_reshape(struct mddev *mddev)
>
>         spin_lock_irqsave(&conf->device_lock, flags);
>         mddev->degraded += (raid_disks - conf->raid_disks);
> +       update_lastdev(conf);
>         spin_unlock_irqrestore(&conf->device_lock, flags);
>         conf->raid_disks = mddev->raid_disks = raid_disks;
>         mddev->delta_disks = 0;
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index b60c30bfb6c7..dc4edd4689f8 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1983,6 +1983,33 @@ static int enough(struct r10conf *conf, int ignore)
>                 _enough(conf, 1, ignore);
>  }
>
> +/**
> + * update_lastdev - Set or clear LastDev flag for all rdevs in array
> + * @conf: pointer to r10conf
> + *
> + * Sets LastDev if the device is In_sync and cannot be lost for the array.
> + * Otherwise, clear it.
> + *
> + * Caller must hold ->reconfig_mutex or ->device_lock.
> + */
> +static void update_lastdev(struct r10conf *conf)
> +{
> +       int i;
> +       int raid_disks = max(conf->geo.raid_disks, conf->prev.raid_disks);
> +
> +       for (i = 0; i < raid_disks; i++) {
> +               struct md_rdev *rdev = conf->mirrors[i].rdev;
> +
> +               if (rdev) {
> +                       if (test_bit(In_sync, &rdev->flags) &&
> +                           !enough(conf, i))
> +                               set_bit(LastDev, &rdev->flags);
> +                       else
> +                               clear_bit(LastDev, &rdev->flags);
> +               }
> +       }
> +}
> +
>  /**
>   * raid10_error() - RAID10 error handler.
>   * @mddev: affected md device.
> @@ -2013,8 +2040,10 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
>                         return;
>                 }
>         }
> -       if (test_and_clear_bit(In_sync, &rdev->flags))
> +       if (test_and_clear_bit(In_sync, &rdev->flags)) {
>                 mddev->degraded++;
> +               update_lastdev(conf);
> +       }
>
>         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
>         set_bit(Blocked, &rdev->flags);
> @@ -2102,6 +2131,7 @@ static int raid10_spare_active(struct mddev *mddev)
>         }
>         spin_lock_irqsave(&conf->device_lock, flags);
>         mddev->degraded -= count;
> +       update_lastdev(conf);
>         spin_unlock_irqrestore(&conf->device_lock, flags);
>
>         print_conf(conf);
> @@ -4159,6 +4189,7 @@ static int raid10_run(struct mddev *mddev)
>         md_set_array_sectors(mddev, size);
>         mddev->resync_max_sectors = size;
>         set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
> +       update_lastdev(conf);
>
>         if (md_integrity_register(mddev))
>                 goto out_free_conf;
> @@ -4567,6 +4598,7 @@ static int raid10_start_reshape(struct mddev *mddev)
>          */
>         spin_lock_irq(&conf->device_lock);
>         mddev->degraded = calc_degraded(conf);
> +       update_lastdev(conf);
>         spin_unlock_irq(&conf->device_lock);
>         mddev->raid_disks = conf->geo.raid_disks;
>         mddev->reshape_position = conf->reshape_progress;
> --
> 2.50.1
>
>

Re: [PATCH v4 1/9] md/raid1,raid10: Set the LastDev flag when the configuration changes

Posted by Kenta Akagi 4 months, 2 weeks ago

Hi,
Thank you for review.

On 2025/09/21 16:54, Xiao Ni wrote:
> Hi Kenta
> 
> On Mon, Sep 15, 2025 at 11:44 AM Kenta Akagi <k@mgml.me> wrote:
>>
>> Currently, the LastDev flag is set on an rdev that failed a failfast
>> metadata write and called md_error, but did not become Faulty. It is
>> cleared when the metadata write retry succeeds. This has problems for
>> the following reasons:
>>
>> * Despite its name, the flag is only set during a metadata write window.
>> * Unlike when LastDev and Failfast was introduced, md_error on the last
>>   rdev of a RAID1/10 array now sets MD_BROKEN. Thus when LastDev is set,
>>   the array is already unwritable.
>>
>> A following commit will prevent failfast bios from breaking the array,
>> which requires knowing from outside the personality whether an rdev is
>> the last one. For that purpose, LastDev should be set on rdevs that must
>> not be lost.
>>
>> This commit ensures that LastDev is set on the indispensable rdev in a
>> degraded RAID1/10 array.
>>
>> Signed-off-by: Kenta Akagi <k@mgml.me>
>> ---
>>  drivers/md/md.c     |  4 +---
>>  drivers/md/md.h     |  6 +++---
>>  drivers/md/raid1.c  | 34 +++++++++++++++++++++++++++++++++-
>>  drivers/md/raid10.c | 34 +++++++++++++++++++++++++++++++++-
>>  4 files changed, 70 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 4e033c26fdd4..268410b66b83 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -1007,10 +1007,8 @@ static void super_written(struct bio *bio)
>>                 if (!test_bit(Faulty, &rdev->flags)
>>                     && (bio->bi_opf & MD_FAILFAST)) {
>>                         set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
>> -                       set_bit(LastDev, &rdev->flags);
>>                 }
>> -       } else
>> -               clear_bit(LastDev, &rdev->flags);
>> +       }
>>
>>         bio_put(bio);
>>
>> diff --git a/drivers/md/md.h b/drivers/md/md.h
>> index 51af29a03079..ec598f9a8381 100644
>> --- a/drivers/md/md.h
>> +++ b/drivers/md/md.h
>> @@ -281,9 +281,9 @@ enum flag_bits {
>>                                  * It is expects that no bad block log
>>                                  * is present.
>>                                  */
>> -       LastDev,                /* Seems to be the last working dev as
>> -                                * it didn't fail, so don't use FailFast
>> -                                * any more for metadata
>> +       LastDev,                /* This is the last working rdev.
>> +                                * so don't use FailFast any more for
>> +                                * metadata.
>>                                  */
>>         CollisionCheck,         /*
>>                                  * check if there is collision between raid1
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index bf44878ec640..32ad6b102ff7 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -1733,6 +1733,33 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
>>         seq_printf(seq, "]");
>>  }
>>
>> +/**
>> + * update_lastdev - Set or clear LastDev flag for all rdevs in array
>> + * @conf: pointer to r1conf
>> + *
>> + * Sets LastDev if the device is In_sync and cannot be lost for the array.
>> + * Otherwise, clear it.
>> + *
>> + * Caller must hold ->device_lock.
>> + */
>> +static void update_lastdev(struct r1conf *conf)
>> +{
>> +       int i;
>> +       int alive_disks = conf->raid_disks - conf->mddev->degraded;
>> +
>> +       for (i = 0; i < conf->raid_disks; i++) {
>> +               struct md_rdev *rdev = conf->mirrors[i].rdev;
>> +
>> +               if (rdev) {
>> +                       if (test_bit(In_sync, &rdev->flags) &&
>> +                           alive_disks == 1)
>> +                               set_bit(LastDev, &rdev->flags);
>> +                       else
>> +                               clear_bit(LastDev, &rdev->flags);
>> +               }
>> +       }
>> +}
>> +
>>  /**
>>   * raid1_error() - RAID1 error handler.
>>   * @mddev: affected md device.
>> @@ -1767,8 +1794,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
>>                 }
>>         }
>>         set_bit(Blocked, &rdev->flags);
>> -       if (test_and_clear_bit(In_sync, &rdev->flags))
>> +       if (test_and_clear_bit(In_sync, &rdev->flags)) {
>>                 mddev->degraded++;
>> +               update_lastdev(conf);
>> +       }
>>         set_bit(Faulty, &rdev->flags);
>>         spin_unlock_irqrestore(&conf->device_lock, flags);
>>         /*
>> @@ -1864,6 +1893,7 @@ static int raid1_spare_active(struct mddev *mddev)
>>                 }
>>         }
>>         mddev->degraded -= count;
>> +       update_lastdev(conf);
> 
> update_lastdev is called in raid1_spare_active, raid1_run and
> raid1_reshape. Could you explain the reason why it needs to call this
> function? Is it the reason you want to clear LastDev flag? If so, is
> it a right place to do it? As your commit message says, it will be
> cleared after retry metadata successfully. In raid1, is it the right
> place that fixes read/write successfully?

The intention was to set LastDev only when the rdev is the last device in the array.

As suggested in review, I will check whether it is the last device when
md_error is called, instead of using a flag.
As a result, update_lastdev() will be removed, and the purpose and behavior
of LastDev will remain unchanged in v5.

Thanks,
Akagi

> 
> Best Regards
> Xiao
> 
>>         spin_unlock_irqrestore(&conf->device_lock, flags);
>>
>>         print_conf(conf);
>> @@ -3290,6 +3320,7 @@ static int raid1_run(struct mddev *mddev)
>>         rcu_assign_pointer(conf->thread, NULL);
>>         mddev->private = conf;
>>         set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
>> +       update_lastdev(conf);
>>
>>         md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
>>
>> @@ -3427,6 +3458,7 @@ static int raid1_reshape(struct mddev *mddev)
>>
>>         spin_lock_irqsave(&conf->device_lock, flags);
>>         mddev->degraded += (raid_disks - conf->raid_disks);
>> +       update_lastdev(conf);
>>         spin_unlock_irqrestore(&conf->device_lock, flags);
>>         conf->raid_disks = mddev->raid_disks = raid_disks;
>>         mddev->delta_disks = 0;
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index b60c30bfb6c7..dc4edd4689f8 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -1983,6 +1983,33 @@ static int enough(struct r10conf *conf, int ignore)
>>                 _enough(conf, 1, ignore);
>>  }
>>
>> +/**
>> + * update_lastdev - Set or clear LastDev flag for all rdevs in array
>> + * @conf: pointer to r10conf
>> + *
>> + * Sets LastDev if the device is In_sync and cannot be lost for the array.
>> + * Otherwise, clear it.
>> + *
>> + * Caller must hold ->reconfig_mutex or ->device_lock.
>> + */
>> +static void update_lastdev(struct r10conf *conf)
>> +{
>> +       int i;
>> +       int raid_disks = max(conf->geo.raid_disks, conf->prev.raid_disks);
>> +
>> +       for (i = 0; i < raid_disks; i++) {
>> +               struct md_rdev *rdev = conf->mirrors[i].rdev;
>> +
>> +               if (rdev) {
>> +                       if (test_bit(In_sync, &rdev->flags) &&
>> +                           !enough(conf, i))
>> +                               set_bit(LastDev, &rdev->flags);
>> +                       else
>> +                               clear_bit(LastDev, &rdev->flags);
>> +               }
>> +       }
>> +}
>> +
>>  /**
>>   * raid10_error() - RAID10 error handler.
>>   * @mddev: affected md device.
>> @@ -2013,8 +2040,10 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
>>                         return;
>>                 }
>>         }
>> -       if (test_and_clear_bit(In_sync, &rdev->flags))
>> +       if (test_and_clear_bit(In_sync, &rdev->flags)) {
>>                 mddev->degraded++;
>> +               update_lastdev(conf);
>> +       }
>>
>>         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
>>         set_bit(Blocked, &rdev->flags);
>> @@ -2102,6 +2131,7 @@ static int raid10_spare_active(struct mddev *mddev)
>>         }
>>         spin_lock_irqsave(&conf->device_lock, flags);
>>         mddev->degraded -= count;
>> +       update_lastdev(conf);
>>         spin_unlock_irqrestore(&conf->device_lock, flags);
>>
>>         print_conf(conf);
>> @@ -4159,6 +4189,7 @@ static int raid10_run(struct mddev *mddev)
>>         md_set_array_sectors(mddev, size);
>>         mddev->resync_max_sectors = size;
>>         set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
>> +       update_lastdev(conf);
>>
>>         if (md_integrity_register(mddev))
>>                 goto out_free_conf;
>> @@ -4567,6 +4598,7 @@ static int raid10_start_reshape(struct mddev *mddev)
>>          */
>>         spin_lock_irq(&conf->device_lock);
>>         mddev->degraded = calc_degraded(conf);
>> +       update_lastdev(conf);
>>         spin_unlock_irq(&conf->device_lock);
>>         mddev->raid_disks = conf->geo.raid_disks;
>>         mddev->reshape_position = conf->reshape_progress;
>> --
>> 2.50.1
>>
>>
> 
>

Re: [PATCH v4 1/9] md/raid1,raid10: Set the LastDev flag when the configuration changes

Posted by Yu Kuai 4 months, 3 weeks ago

Hi,

在 2025/09/15 11:42, Kenta Akagi 写道:
> Currently, the LastDev flag is set on an rdev that failed a failfast
> metadata write and called md_error, but did not become Faulty. It is
> cleared when the metadata write retry succeeds. This has problems for
> the following reasons:
> 
> * Despite its name, the flag is only set during a metadata write window.
> * Unlike when LastDev and Failfast was introduced, md_error on the last
>    rdev of a RAID1/10 array now sets MD_BROKEN. Thus when LastDev is set,
>    the array is already unwritable.
> 
> A following commit will prevent failfast bios from breaking the array,
> which requires knowing from outside the personality whether an rdev is
> the last one. For that purpose, LastDev should be set on rdevs that must
> not be lost.
> 
> This commit ensures that LastDev is set on the indispensable rdev in a
> degraded RAID1/10 array.
> 
> Signed-off-by: Kenta Akagi <k@mgml.me>
> ---
>   drivers/md/md.c     |  4 +---
>   drivers/md/md.h     |  6 +++---
>   drivers/md/raid1.c  | 34 +++++++++++++++++++++++++++++++++-
>   drivers/md/raid10.c | 34 +++++++++++++++++++++++++++++++++-
>   4 files changed, 70 insertions(+), 8 deletions(-)
> 
After md_error() is serialized, why not check if rdev is the last rdev
from md_bio_failure_error() in patch 3 directly? I think it is cleaner
and code changes should be much less to add a new method like
pers->lastdev.

Thanks,
Kuai

> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 4e033c26fdd4..268410b66b83 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -1007,10 +1007,8 @@ static void super_written(struct bio *bio)
>   		if (!test_bit(Faulty, &rdev->flags)
>   		    && (bio->bi_opf & MD_FAILFAST)) {
>   			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
> -			set_bit(LastDev, &rdev->flags);
>   		}
> -	} else
> -		clear_bit(LastDev, &rdev->flags);
> +	}
>   
>   	bio_put(bio);
>   
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 51af29a03079..ec598f9a8381 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -281,9 +281,9 @@ enum flag_bits {
>   				 * It is expects that no bad block log
>   				 * is present.
>   				 */
> -	LastDev,		/* Seems to be the last working dev as
> -				 * it didn't fail, so don't use FailFast
> -				 * any more for metadata
> +	LastDev,		/* This is the last working rdev.
> +				 * so don't use FailFast any more for
> +				 * metadata.
>   				 */
>   	CollisionCheck,		/*
>   				 * check if there is collision between raid1
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index bf44878ec640..32ad6b102ff7 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1733,6 +1733,33 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
>   	seq_printf(seq, "]");
>   }
>   
> +/**
> + * update_lastdev - Set or clear LastDev flag for all rdevs in array
> + * @conf: pointer to r1conf
> + *
> + * Sets LastDev if the device is In_sync and cannot be lost for the array.
> + * Otherwise, clear it.
> + *
> + * Caller must hold ->device_lock.
> + */
> +static void update_lastdev(struct r1conf *conf)
> +{
> +	int i;
> +	int alive_disks = conf->raid_disks - conf->mddev->degraded;
> +
> +	for (i = 0; i < conf->raid_disks; i++) {
> +		struct md_rdev *rdev = conf->mirrors[i].rdev;
> +
> +		if (rdev) {
> +			if (test_bit(In_sync, &rdev->flags) &&
> +			    alive_disks == 1)
> +				set_bit(LastDev, &rdev->flags);
> +			else
> +				clear_bit(LastDev, &rdev->flags);
> +		}
> +	}
> +}
> +
>   /**
>    * raid1_error() - RAID1 error handler.
>    * @mddev: affected md device.
> @@ -1767,8 +1794,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
>   		}
>   	}
>   	set_bit(Blocked, &rdev->flags);
> -	if (test_and_clear_bit(In_sync, &rdev->flags))
> +	if (test_and_clear_bit(In_sync, &rdev->flags)) {
>   		mddev->degraded++;
> +		update_lastdev(conf);
> +	}
>   	set_bit(Faulty, &rdev->flags);
>   	spin_unlock_irqrestore(&conf->device_lock, flags);
>   	/*
> @@ -1864,6 +1893,7 @@ static int raid1_spare_active(struct mddev *mddev)
>   		}
>   	}
>   	mddev->degraded -= count;
> +	update_lastdev(conf);
>   	spin_unlock_irqrestore(&conf->device_lock, flags);
>   
>   	print_conf(conf);
> @@ -3290,6 +3320,7 @@ static int raid1_run(struct mddev *mddev)
>   	rcu_assign_pointer(conf->thread, NULL);
>   	mddev->private = conf;
>   	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
> +	update_lastdev(conf);
>   
>   	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
>   
> @@ -3427,6 +3458,7 @@ static int raid1_reshape(struct mddev *mddev)
>   
>   	spin_lock_irqsave(&conf->device_lock, flags);
>   	mddev->degraded += (raid_disks - conf->raid_disks);
> +	update_lastdev(conf);
>   	spin_unlock_irqrestore(&conf->device_lock, flags);
>   	conf->raid_disks = mddev->raid_disks = raid_disks;
>   	mddev->delta_disks = 0;
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index b60c30bfb6c7..dc4edd4689f8 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1983,6 +1983,33 @@ static int enough(struct r10conf *conf, int ignore)
>   		_enough(conf, 1, ignore);
>   }
>   
> +/**
> + * update_lastdev - Set or clear LastDev flag for all rdevs in array
> + * @conf: pointer to r10conf
> + *
> + * Sets LastDev if the device is In_sync and cannot be lost for the array.
> + * Otherwise, clear it.
> + *
> + * Caller must hold ->reconfig_mutex or ->device_lock.
> + */
> +static void update_lastdev(struct r10conf *conf)
> +{
> +	int i;
> +	int raid_disks = max(conf->geo.raid_disks, conf->prev.raid_disks);
> +
> +	for (i = 0; i < raid_disks; i++) {
> +		struct md_rdev *rdev = conf->mirrors[i].rdev;
> +
> +		if (rdev) {
> +			if (test_bit(In_sync, &rdev->flags) &&
> +			    !enough(conf, i))
> +				set_bit(LastDev, &rdev->flags);
> +			else
> +				clear_bit(LastDev, &rdev->flags);
> +		}
> +	}
> +}
> +
>   /**
>    * raid10_error() - RAID10 error handler.
>    * @mddev: affected md device.
> @@ -2013,8 +2040,10 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
>   			return;
>   		}
>   	}
> -	if (test_and_clear_bit(In_sync, &rdev->flags))
> +	if (test_and_clear_bit(In_sync, &rdev->flags)) {
>   		mddev->degraded++;
> +		update_lastdev(conf);
> +	}
>   
>   	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
>   	set_bit(Blocked, &rdev->flags);
> @@ -2102,6 +2131,7 @@ static int raid10_spare_active(struct mddev *mddev)
>   	}
>   	spin_lock_irqsave(&conf->device_lock, flags);
>   	mddev->degraded -= count;
> +	update_lastdev(conf);
>   	spin_unlock_irqrestore(&conf->device_lock, flags);
>   
>   	print_conf(conf);
> @@ -4159,6 +4189,7 @@ static int raid10_run(struct mddev *mddev)
>   	md_set_array_sectors(mddev, size);
>   	mddev->resync_max_sectors = size;
>   	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
> +	update_lastdev(conf);
>   
>   	if (md_integrity_register(mddev))
>   		goto out_free_conf;
> @@ -4567,6 +4598,7 @@ static int raid10_start_reshape(struct mddev *mddev)
>   	 */
>   	spin_lock_irq(&conf->device_lock);
>   	mddev->degraded = calc_degraded(conf);
> +	update_lastdev(conf);
>   	spin_unlock_irq(&conf->device_lock);
>   	mddev->raid_disks = conf->geo.raid_disks;
>   	mddev->reshape_position = conf->reshape_progress;
>

Re: [PATCH v4 1/9] md/raid1,raid10: Set the LastDev flag when the configuration changes

Posted by Kenta Akagi 4 months, 3 weeks ago

Hi,
Thank you for reviewing.

On 2025/09/18 10:00, Yu Kuai wrote:
> Hi,
> 
> 在 2025/09/15 11:42, Kenta Akagi 写道:
>> Currently, the LastDev flag is set on an rdev that failed a failfast
>> metadata write and called md_error, but did not become Faulty. It is
>> cleared when the metadata write retry succeeds. This has problems for
>> the following reasons:
>>
>> * Despite its name, the flag is only set during a metadata write window.
>> * Unlike when LastDev and Failfast was introduced, md_error on the last
>>    rdev of a RAID1/10 array now sets MD_BROKEN. Thus when LastDev is set,
>>    the array is already unwritable.
>>
>> A following commit will prevent failfast bios from breaking the array,
>> which requires knowing from outside the personality whether an rdev is
>> the last one. For that purpose, LastDev should be set on rdevs that must
>> not be lost.
>>
>> This commit ensures that LastDev is set on the indispensable rdev in a
>> degraded RAID1/10 array.
>>
>> Signed-off-by: Kenta Akagi <k@mgml.me>
>> ---
>>   drivers/md/md.c     |  4 +---
>>   drivers/md/md.h     |  6 +++---
>>   drivers/md/raid1.c  | 34 +++++++++++++++++++++++++++++++++-
>>   drivers/md/raid10.c | 34 +++++++++++++++++++++++++++++++++-
>>   4 files changed, 70 insertions(+), 8 deletions(-)
>>
> After md_error() is serialized, why not check if rdev is the last rdev
> from md_bio_failure_error() in patch 3 directly? I think it is cleaner
> and code changes should be much less to add a new method like
> pers->lastdev.

That makes sense. I'll proceed that way.

Thanks,
Akagi

> 
> Thanks,
> Kuai
> 
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 4e033c26fdd4..268410b66b83 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -1007,10 +1007,8 @@ static void super_written(struct bio *bio)
>>           if (!test_bit(Faulty, &rdev->flags)
>>               && (bio->bi_opf & MD_FAILFAST)) {
>>               set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
>> -            set_bit(LastDev, &rdev->flags);
>>           }
>> -    } else
>> -        clear_bit(LastDev, &rdev->flags);
>> +    }
>>         bio_put(bio);
>>   diff --git a/drivers/md/md.h b/drivers/md/md.h
>> index 51af29a03079..ec598f9a8381 100644
>> --- a/drivers/md/md.h
>> +++ b/drivers/md/md.h
>> @@ -281,9 +281,9 @@ enum flag_bits {
>>                    * It is expects that no bad block log
>>                    * is present.
>>                    */
>> -    LastDev,        /* Seems to be the last working dev as
>> -                 * it didn't fail, so don't use FailFast
>> -                 * any more for metadata
>> +    LastDev,        /* This is the last working rdev.
>> +                 * so don't use FailFast any more for
>> +                 * metadata.
>>                    */
>>       CollisionCheck,        /*
>>                    * check if there is collision between raid1
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index bf44878ec640..32ad6b102ff7 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -1733,6 +1733,33 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
>>       seq_printf(seq, "]");
>>   }
>>   +/**
>> + * update_lastdev - Set or clear LastDev flag for all rdevs in array
>> + * @conf: pointer to r1conf
>> + *
>> + * Sets LastDev if the device is In_sync and cannot be lost for the array.
>> + * Otherwise, clear it.
>> + *
>> + * Caller must hold ->device_lock.
>> + */
>> +static void update_lastdev(struct r1conf *conf)
>> +{
>> +    int i;
>> +    int alive_disks = conf->raid_disks - conf->mddev->degraded;
>> +
>> +    for (i = 0; i < conf->raid_disks; i++) {
>> +        struct md_rdev *rdev = conf->mirrors[i].rdev;
>> +
>> +        if (rdev) {
>> +            if (test_bit(In_sync, &rdev->flags) &&
>> +                alive_disks == 1)
>> +                set_bit(LastDev, &rdev->flags);
>> +            else
>> +                clear_bit(LastDev, &rdev->flags);
>> +        }
>> +    }
>> +}
>> +
>>   /**
>>    * raid1_error() - RAID1 error handler.
>>    * @mddev: affected md device.
>> @@ -1767,8 +1794,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
>>           }
>>       }
>>       set_bit(Blocked, &rdev->flags);
>> -    if (test_and_clear_bit(In_sync, &rdev->flags))
>> +    if (test_and_clear_bit(In_sync, &rdev->flags)) {
>>           mddev->degraded++;
>> +        update_lastdev(conf);
>> +    }
>>       set_bit(Faulty, &rdev->flags);
>>       spin_unlock_irqrestore(&conf->device_lock, flags);
>>       /*
>> @@ -1864,6 +1893,7 @@ static int raid1_spare_active(struct mddev *mddev)
>>           }
>>       }
>>       mddev->degraded -= count;
>> +    update_lastdev(conf);
>>       spin_unlock_irqrestore(&conf->device_lock, flags);
>>         print_conf(conf);
>> @@ -3290,6 +3320,7 @@ static int raid1_run(struct mddev *mddev)
>>       rcu_assign_pointer(conf->thread, NULL);
>>       mddev->private = conf;
>>       set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
>> +    update_lastdev(conf);
>>         md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
>>   @@ -3427,6 +3458,7 @@ static int raid1_reshape(struct mddev *mddev)
>>         spin_lock_irqsave(&conf->device_lock, flags);
>>       mddev->degraded += (raid_disks - conf->raid_disks);
>> +    update_lastdev(conf);
>>       spin_unlock_irqrestore(&conf->device_lock, flags);
>>       conf->raid_disks = mddev->raid_disks = raid_disks;
>>       mddev->delta_disks = 0;
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index b60c30bfb6c7..dc4edd4689f8 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -1983,6 +1983,33 @@ static int enough(struct r10conf *conf, int ignore)
>>           _enough(conf, 1, ignore);
>>   }
>>   +/**
>> + * update_lastdev - Set or clear LastDev flag for all rdevs in array
>> + * @conf: pointer to r10conf
>> + *
>> + * Sets LastDev if the device is In_sync and cannot be lost for the array.
>> + * Otherwise, clear it.
>> + *
>> + * Caller must hold ->reconfig_mutex or ->device_lock.
>> + */
>> +static void update_lastdev(struct r10conf *conf)
>> +{
>> +    int i;
>> +    int raid_disks = max(conf->geo.raid_disks, conf->prev.raid_disks);
>> +
>> +    for (i = 0; i < raid_disks; i++) {
>> +        struct md_rdev *rdev = conf->mirrors[i].rdev;
>> +
>> +        if (rdev) {
>> +            if (test_bit(In_sync, &rdev->flags) &&
>> +                !enough(conf, i))
>> +                set_bit(LastDev, &rdev->flags);
>> +            else
>> +                clear_bit(LastDev, &rdev->flags);
>> +        }
>> +    }
>> +}
>> +
>>   /**
>>    * raid10_error() - RAID10 error handler.
>>    * @mddev: affected md device.
>> @@ -2013,8 +2040,10 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
>>               return;
>>           }
>>       }
>> -    if (test_and_clear_bit(In_sync, &rdev->flags))
>> +    if (test_and_clear_bit(In_sync, &rdev->flags)) {
>>           mddev->degraded++;
>> +        update_lastdev(conf);
>> +    }
>>         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
>>       set_bit(Blocked, &rdev->flags);
>> @@ -2102,6 +2131,7 @@ static int raid10_spare_active(struct mddev *mddev)
>>       }
>>       spin_lock_irqsave(&conf->device_lock, flags);
>>       mddev->degraded -= count;
>> +    update_lastdev(conf);
>>       spin_unlock_irqrestore(&conf->device_lock, flags);
>>         print_conf(conf);
>> @@ -4159,6 +4189,7 @@ static int raid10_run(struct mddev *mddev)
>>       md_set_array_sectors(mddev, size);
>>       mddev->resync_max_sectors = size;
>>       set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
>> +    update_lastdev(conf);
>>         if (md_integrity_register(mddev))
>>           goto out_free_conf;
>> @@ -4567,6 +4598,7 @@ static int raid10_start_reshape(struct mddev *mddev)
>>        */
>>       spin_lock_irq(&conf->device_lock);
>>       mddev->degraded = calc_degraded(conf);
>> +    update_lastdev(conf);
>>       spin_unlock_irq(&conf->device_lock);
>>       mddev->raid_disks = conf->geo.raid_disks;
>>       mddev->reshape_position = conf->reshape_progress;
>>
> 
>

[PATCH v4 1/9] md/raid1,raid10: Set the LastDev flag when the configuration changes
[PATCH v4 2/9] md: serialize md_error()
[PATCH v4 3/9] md: introduce md_bio_failure_error()
[PATCH v4 4/9] md/raid1,raid10: Don't set MD_BROKEN on failfast bio failure
[PATCH v4 5/9] md/raid1,raid10: Set R{1,10}BIO_Uptodate when successful retry of a failed bio
[PATCH v4 6/9] md/raid1,raid10: Fix missing retries Failfast write bios on no-bbl rdevs
[PATCH v4 7/9] md/raid10: fix failfast read error not rescheduled
[PATCH v4 8/9] md/raid1,raid10: Add error message when setting MD_BROKEN
[PATCH v4 9/9] md/raid1,raid10: Fix: Operation continuing on 0 devices.