From: Zheng Qixing <zhengqixing@huawei.com>
During raid resync, if a disk becomes faulty, the operation is
briefly interrupted. The MD_RECOVERY_RECOVER flag triggered by
the disk failure causes sync_action to incorrectly show "recover"
instead of "resync". The same issue affects reshape operations.
Reproduction steps:
mdadm -Cv /dev/md1 -l1 -n4 -e1.2 /dev/sd{a..d} // -> resync happened
mdadm -f /dev/md1 /dev/sda // -> resync interrupted
cat sync_action
-> recover
Add progress checks in md_sync_action() for resync/recover/reshape
to ensure the interface correctly reports the actual operation type.
Fixes: 4b10a3bc67c1 ("md: ensure resync is prioritized over recovery")
Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
---
drivers/md/md.c | 38 ++++++++++++++++++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4ea956a80343..798428d0870b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4845,9 +4845,34 @@ static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors)
return false;
}
+static enum sync_action md_get_active_sync_action(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ bool is_recover = false;
+
+ if (mddev->resync_offset < MaxSector)
+ return ACTION_RESYNC;
+
+ if (mddev->reshape_position != MaxSector)
+ return ACTION_RESHAPE;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev->raid_disk >= 0 &&
+ rdev_needs_recovery(rdev, MaxSector)) {
+ is_recover = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return is_recover ? ACTION_RECOVER : ACTION_IDLE;
+}
+
enum sync_action md_sync_action(struct mddev *mddev)
{
unsigned long recovery = mddev->recovery;
+ enum sync_action active_action;
/*
* frozen has the highest priority, means running sync_thread will be
@@ -4871,8 +4896,17 @@ enum sync_action md_sync_action(struct mddev *mddev)
!test_bit(MD_RECOVERY_NEEDED, &recovery))
return ACTION_IDLE;
- if (test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
- mddev->reshape_position != MaxSector)
+ /*
+ * Check if any sync operation (resync/recover/reshape) is
+ * currently active. This ensures that only one sync operation
+ * can run at a time. Returns the type of active operation, or
+ * ACTION_IDLE if none are active.
+ */
+ active_action = md_get_active_sync_action(mddev);
+ if (active_action != ACTION_IDLE)
+ return active_action;
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
return ACTION_RESHAPE;
if (test_bit(MD_RECOVERY_RECOVER, &recovery))
--
2.39.2
Dear Zheng, Thank you for your patch. Am 12.08.25 um 04:17 schrieb Zheng Qixing: > From: Zheng Qixing <zhengqixing@huawei.com> > > During raid resync, if a disk becomes faulty, the operation is > briefly interrupted. The MD_RECOVERY_RECOVER flag triggered by > the disk failure causes sync_action to incorrectly show "recover" > instead of "resync". The same issue affects reshape operations. > > Reproduction steps: > mdadm -Cv /dev/md1 -l1 -n4 -e1.2 /dev/sd{a..d} // -> resync happended > mdadm -f /dev/md1 /dev/sda // -> resync interrupted > cat sync_action > -> recover > > Add progress checks in md_sync_action() for resync/recover/reshape > to ensure the interface correctly reports the actual operation type. > > Fixes: 4b10a3bc67c1 ("md: ensure resync is prioritized over recovery") > Signed-off-by: Zheng Qixing <zhengqixing@huawei.com> > --- > drivers/md/md.c | 38 ++++++++++++++++++++++++++++++++++++-- > 1 file changed, 36 insertions(+), 2 deletions(-) > > diff --git a/drivers/md/md.c b/drivers/md/md.c > index 4ea956a80343..798428d0870b 100644 > --- a/drivers/md/md.c > +++ b/drivers/md/md.c > @@ -4845,9 +4845,34 @@ static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors) > return false; > } > > +static enum sync_action md_get_active_sync_action(struct mddev *mddev) > +{ > + struct md_rdev *rdev; > + bool is_recover = false; `is_recover` sounds strange to me, but I am not an expert with the code. Maybe `needs_recovery`? > + > + if (mddev->resync_offset < MaxSector) > + return ACTION_RESYNC; > + > + if (mddev->reshape_position != MaxSector) > + return ACTION_RESHAPE; > + > + rcu_read_lock(); > + rdev_for_each_rcu(rdev, mddev) { > + if (rdev->raid_disk >= 0 && > + rdev_needs_recovery(rdev, MaxSector)) { > + is_recover = true; > + break; > + } > + } > + rcu_read_unlock(); > + > + return is_recover ? 
ACTION_RECOVER : ACTION_IDLE; > +} > + > enum sync_action md_sync_action(struct mddev *mddev) > { > unsigned long recovery = mddev->recovery; > + enum sync_action active_action; > > /* > * frozen has the highest priority, means running sync_thread will be > @@ -4871,8 +4896,17 @@ enum sync_action md_sync_action(struct mddev *mddev) > !test_bit(MD_RECOVERY_NEEDED, &recovery)) > return ACTION_IDLE; > > - if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || > - mddev->reshape_position != MaxSector) > + /* > + * Check if any sync operation (resync/recover/reshape) is > + * currently active. This ensures that only one sync operation > + * can run at a time. Returns the type of active operation, or > + * ACTION_IDLE if none are active. > + */ > + active_action = md_get_active_sync_action(mddev); > + if (active_action != ACTION_IDLE) > + return active_action; > + > + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) > return ACTION_RESHAPE; > > if (test_bit(MD_RECOVERY_RECOVER, &recovery)) Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de> Kind regards, Paul
Hi, 在 2025/8/12 17:22, Paul Menzel 写道: > Dear Zheng, > > > Thank you for your patch. > > Am 12.08.25 um 04:17 schrieb Zheng Qixing: >> From: Zheng Qixing <zhengqixing@huawei.com> >> >> During raid resync, if a disk becomes faulty, the operation is >> briefly interrupted. The MD_RECOVERY_RECOVER flag triggered by >> the disk failure causes sync_action to incorrectly show "recover" >> instead of "resync". The same issue affects reshape operations. >> >> Reproduction steps: >> mdadm -Cv /dev/md1 -l1 -n4 -e1.2 /dev/sd{a..d} // -> resync happended >> mdadm -f /dev/md1 /dev/sda // -> resync >> interrupted >> cat sync_action >> -> recover >> >> Add progress checks in md_sync_action() for resync/recover/reshape >> to ensure the interface correctly reports the actual operation type. >> >> Fixes: 4b10a3bc67c1 ("md: ensure resync is prioritized over recovery") >> Signed-off-by: Zheng Qixing <zhengqixing@huawei.com> >> --- >> drivers/md/md.c | 38 ++++++++++++++++++++++++++++++++++++-- >> 1 file changed, 36 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/md/md.c b/drivers/md/md.c >> index 4ea956a80343..798428d0870b 100644 >> --- a/drivers/md/md.c >> +++ b/drivers/md/md.c >> @@ -4845,9 +4845,34 @@ static bool rdev_needs_recovery(struct md_rdev >> *rdev, sector_t sectors) >> return false; >> } >> +static enum sync_action md_get_active_sync_action(struct mddev >> *mddev) >> +{ >> + struct md_rdev *rdev; >> + bool is_recover = false; > > `is_recover` sounds strange to me, but I am not an expert with the > code. Maybe `needs_recovery`? is_recover is used here to distinguish whether the current sync_action is a recover, rather than a resync or reshape. 
But it's not a big deal, no need to focus on it :) > >> + >> + if (mddev->resync_offset < MaxSector) >> + return ACTION_RESYNC; >> + >> + if (mddev->reshape_position != MaxSector) >> + return ACTION_RESHAPE; >> + >> + rcu_read_lock(); >> + rdev_for_each_rcu(rdev, mddev) { >> + if (rdev->raid_disk >= 0 && >> + rdev_needs_recovery(rdev, MaxSector)) { >> + is_recover = true; >> + break; >> + } >> + } >> + rcu_read_unlock(); >> + >> + return is_recover ? ACTION_RECOVER : ACTION_IDLE; >> +} >> + >> enum sync_action md_sync_action(struct mddev *mddev) >> { >> unsigned long recovery = mddev->recovery; >> + enum sync_action active_action; >> /* >> * frozen has the highest priority, means running sync_thread >> will be >> @@ -4871,8 +4896,17 @@ enum sync_action md_sync_action(struct mddev >> *mddev) >> !test_bit(MD_RECOVERY_NEEDED, &recovery)) >> return ACTION_IDLE; >> - if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || >> - mddev->reshape_position != MaxSector) >> + /* >> + * Check if any sync operation (resync/recover/reshape) is >> + * currently active. This ensures that only one sync operation >> + * can run at a time. Returns the type of active operation, or >> + * ACTION_IDLE if none are active. >> + */ >> + active_action = md_get_active_sync_action(mddev); >> + if (active_action != ACTION_IDLE) >> + return active_action; >> + >> + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) >> return ACTION_RESHAPE; >> if (test_bit(MD_RECOVERY_RECOVER, &recovery)) > > Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de> > > > Kind regards, > > Paul Thanks, Qixing
© 2016 - 2025 Red Hat, Inc.