From: Zheng Qixing <zhengqixing@huawei.com>
During raid resync, if a disk becomes faulty, the operation is
briefly interrupted. The MD_RECOVERY_RECOVER flag triggered by
the disk failure causes sync_action to incorrectly show "recover"
instead of "resync". The same issue affects reshape operations.
Reproduction steps:
mdadm -Cv /dev/md1 -l1 -n4 -e1.2 /dev/sd{a..d} // -> resync happened
mdadm -f /dev/md1 /dev/sda // -> resync interrupted
cat sync_action
-> recover
Add progress checks in md_sync_action() for resync/recover/reshape
to ensure the interface correctly reports the actual operation type.
Fixes: 4b10a3bc67c1 ("md: ensure resync is prioritized over recovery")
Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
---
drivers/md/md.c | 38 ++++++++++++++++++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4ea956a80343..798428d0870b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4845,9 +4845,34 @@ static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors)
return false;
}
+static enum sync_action md_get_active_sync_action(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ bool is_recover = false;
+
+ if (mddev->resync_offset < MaxSector)
+ return ACTION_RESYNC;
+
+ if (mddev->reshape_position != MaxSector)
+ return ACTION_RESHAPE;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev->raid_disk >= 0 &&
+ rdev_needs_recovery(rdev, MaxSector)) {
+ is_recover = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return is_recover ? ACTION_RECOVER : ACTION_IDLE;
+}
+
enum sync_action md_sync_action(struct mddev *mddev)
{
unsigned long recovery = mddev->recovery;
+ enum sync_action active_action;
/*
* frozen has the highest priority, means running sync_thread will be
@@ -4871,8 +4896,17 @@ enum sync_action md_sync_action(struct mddev *mddev)
!test_bit(MD_RECOVERY_NEEDED, &recovery))
return ACTION_IDLE;
- if (test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
- mddev->reshape_position != MaxSector)
+ /*
+ * Check if any sync operation (resync/recover/reshape) is
+ * currently active. This ensures that only one sync operation
+ * can run at a time. Returns the type of active operation, or
+ * ACTION_IDLE if none are active.
+ */
+ active_action = md_get_active_sync_action(mddev);
+ if (active_action != ACTION_IDLE)
+ return active_action;
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
return ACTION_RESHAPE;
if (test_bit(MD_RECOVERY_RECOVER, &recovery))
--
2.39.2
Dear Zheng,
Thank you for your patch.
Am 12.08.25 um 04:17 schrieb Zheng Qixing:
> From: Zheng Qixing <zhengqixing@huawei.com>
>
> During raid resync, if a disk becomes faulty, the operation is
> briefly interrupted. The MD_RECOVERY_RECOVER flag triggered by
> the disk failure causes sync_action to incorrectly show "recover"
> instead of "resync". The same issue affects reshape operations.
>
> Reproduction steps:
> mdadm -Cv /dev/md1 -l1 -n4 -e1.2 /dev/sd{a..d} // -> resync happened
> mdadm -f /dev/md1 /dev/sda // -> resync interrupted
> cat sync_action
> -> recover
>
> Add progress checks in md_sync_action() for resync/recover/reshape
> to ensure the interface correctly reports the actual operation type.
>
> Fixes: 4b10a3bc67c1 ("md: ensure resync is prioritized over recovery")
> Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
> ---
> drivers/md/md.c | 38 ++++++++++++++++++++++++++++++++++++--
> 1 file changed, 36 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 4ea956a80343..798428d0870b 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -4845,9 +4845,34 @@ static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors)
> return false;
> }
>
> +static enum sync_action md_get_active_sync_action(struct mddev *mddev)
> +{
> + struct md_rdev *rdev;
> + bool is_recover = false;
`is_recover` sounds strange to me, but I am not an expert with the code.
Maybe `needs_recovery`?
> +
> + if (mddev->resync_offset < MaxSector)
> + return ACTION_RESYNC;
> +
> + if (mddev->reshape_position != MaxSector)
> + return ACTION_RESHAPE;
> +
> + rcu_read_lock();
> + rdev_for_each_rcu(rdev, mddev) {
> + if (rdev->raid_disk >= 0 &&
> + rdev_needs_recovery(rdev, MaxSector)) {
> + is_recover = true;
> + break;
> + }
> + }
> + rcu_read_unlock();
> +
> + return is_recover ? ACTION_RECOVER : ACTION_IDLE;
> +}
> +
> enum sync_action md_sync_action(struct mddev *mddev)
> {
> unsigned long recovery = mddev->recovery;
> + enum sync_action active_action;
>
> /*
> * frozen has the highest priority, means running sync_thread will be
> @@ -4871,8 +4896,17 @@ enum sync_action md_sync_action(struct mddev *mddev)
> !test_bit(MD_RECOVERY_NEEDED, &recovery))
> return ACTION_IDLE;
>
> - if (test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
> - mddev->reshape_position != MaxSector)
> + /*
> + * Check if any sync operation (resync/recover/reshape) is
> + * currently active. This ensures that only one sync operation
> + * can run at a time. Returns the type of active operation, or
> + * ACTION_IDLE if none are active.
> + */
> + active_action = md_get_active_sync_action(mddev);
> + if (active_action != ACTION_IDLE)
> + return active_action;
> +
> + if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
> return ACTION_RESHAPE;
>
> if (test_bit(MD_RECOVERY_RECOVER, &recovery))
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Kind regards,
Paul
Hi,
在 2025/8/12 17:22, Paul Menzel 写道:
> Dear Zheng,
>
>
> Thank you for your patch.
>
> Am 12.08.25 um 04:17 schrieb Zheng Qixing:
>> From: Zheng Qixing <zhengqixing@huawei.com>
>>
>> During raid resync, if a disk becomes faulty, the operation is
>> briefly interrupted. The MD_RECOVERY_RECOVER flag triggered by
>> the disk failure causes sync_action to incorrectly show "recover"
>> instead of "resync". The same issue affects reshape operations.
>>
>> Reproduction steps:
>> mdadm -Cv /dev/md1 -l1 -n4 -e1.2 /dev/sd{a..d} // -> resync happened
>> mdadm -f /dev/md1 /dev/sda // -> resync interrupted
>> cat sync_action
>> -> recover
>>
>> Add progress checks in md_sync_action() for resync/recover/reshape
>> to ensure the interface correctly reports the actual operation type.
>>
>> Fixes: 4b10a3bc67c1 ("md: ensure resync is prioritized over recovery")
>> Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
>> ---
>> drivers/md/md.c | 38 ++++++++++++++++++++++++++++++++++++--
>> 1 file changed, 36 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 4ea956a80343..798428d0870b 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -4845,9 +4845,34 @@ static bool rdev_needs_recovery(struct md_rdev
>> *rdev, sector_t sectors)
>> return false;
>> }
>> +static enum sync_action md_get_active_sync_action(struct mddev
>> *mddev)
>> +{
>> + struct md_rdev *rdev;
>> + bool is_recover = false;
>
> `is_recover` sounds strange to me, but I am not an expert with the
> code. Maybe `needs_recovery`?
is_recover is used here to distinguish whether the current sync_action
is a recover, rather than a resync or reshape.
But it's not a big deal, no need to focus on it :)
>
>> +
>> + if (mddev->resync_offset < MaxSector)
>> + return ACTION_RESYNC;
>> +
>> + if (mddev->reshape_position != MaxSector)
>> + return ACTION_RESHAPE;
>> +
>> + rcu_read_lock();
>> + rdev_for_each_rcu(rdev, mddev) {
>> + if (rdev->raid_disk >= 0 &&
>> + rdev_needs_recovery(rdev, MaxSector)) {
>> + is_recover = true;
>> + break;
>> + }
>> + }
>> + rcu_read_unlock();
>> +
>> + return is_recover ? ACTION_RECOVER : ACTION_IDLE;
>> +}
>> +
>> enum sync_action md_sync_action(struct mddev *mddev)
>> {
>> unsigned long recovery = mddev->recovery;
>> + enum sync_action active_action;
>> /*
>> * frozen has the highest priority, means running sync_thread
>> will be
>> @@ -4871,8 +4896,17 @@ enum sync_action md_sync_action(struct mddev
>> *mddev)
>> !test_bit(MD_RECOVERY_NEEDED, &recovery))
>> return ACTION_IDLE;
>> - if (test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
>> - mddev->reshape_position != MaxSector)
>> + /*
>> + * Check if any sync operation (resync/recover/reshape) is
>> + * currently active. This ensures that only one sync operation
>> + * can run at a time. Returns the type of active operation, or
>> + * ACTION_IDLE if none are active.
>> + */
>> + active_action = md_get_active_sync_action(mddev);
>> + if (active_action != ACTION_IDLE)
>> + return active_action;
>> +
>> + if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
>> return ACTION_RESHAPE;
>> if (test_bit(MD_RECOVERY_RECOVER, &recovery))
>
> Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
>
>
> Kind regards,
>
> Paul
Thanks,
Qixing
© 2016 - 2026 Red Hat, Inc.