drivers/md/md.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-)
From: Li Nan <linan122@huawei.com>
The 'mddev->recovery' flags can change during md_do_sync(), leading to
inconsistencies. For example, starting with MD_RECOVERY_RECOVER and
ending with MD_RECOVERY_SYNC can cause incorrect offset updates.
To avoid this, use the 'action' determined at the beginning of the
function instead of repeatedly checking 'mddev->recovery'.
Signed-off-by: Li Nan <linan122@huawei.com>
---
drivers/md/md.c | 21 +++++++++------------
1 file changed, 9 insertions(+), 12 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6828a569e819..67cda9b64c87 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9516,7 +9516,7 @@ void md_do_sync(struct md_thread *thread)
skipped = 0;
- if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ if (action != ACTION_RESHAPE &&
((mddev->curr_resync > mddev->curr_resync_completed &&
(mddev->curr_resync - mddev->curr_resync_completed)
> (max_sectors >> 4)) ||
@@ -9529,8 +9529,7 @@ void md_do_sync(struct md_thread *thread)
wait_event(mddev->recovery_wait,
atomic_read(&mddev->recovery_active) == 0);
mddev->curr_resync_completed = j;
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
- j > mddev->resync_offset)
+ if (action == ACTION_RESYNC && j > mddev->resync_offset)
mddev->resync_offset = j;
update_time = jiffies;
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
@@ -9646,7 +9645,7 @@ void md_do_sync(struct md_thread *thread)
blk_finish_plug(&plug);
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
- if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ if (action != ACTION_RESHAPE &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev->curr_resync >= MD_RESYNC_ACTIVE) {
mddev->curr_resync_completed = mddev->curr_resync;
@@ -9654,9 +9653,8 @@ void md_do_sync(struct md_thread *thread)
}
mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
- if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
- mddev->curr_resync > MD_RESYNC_ACTIVE) {
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ if (action != ACTION_CHECK && mddev->curr_resync > MD_RESYNC_ACTIVE) {
+ if (action == ACTION_RESYNC) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
if (mddev->curr_resync >= mddev->resync_offset) {
pr_debug("md: checkpointing %s of %s.\n",
@@ -9674,8 +9672,7 @@ void md_do_sync(struct md_thread *thread)
} else {
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector;
- if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
+ if (action == ACTION_RECOVER) {
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (mddev->delta_disks >= 0 &&
@@ -9692,7 +9689,7 @@ void md_do_sync(struct md_thread *thread)
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
- if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ if (action == ACTION_RESHAPE &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev->delta_disks > 0 &&
mddev->pers->finish_reshape &&
@@ -9709,10 +9706,10 @@ void md_do_sync(struct md_thread *thread)
spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
- if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ if (action == ACTION_REPAIR)
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
- } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ } else if (action == ACTION_REPAIR)
mddev->resync_min = mddev->curr_resync_completed;
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
mddev->curr_resync = MD_RESYNC_NONE;
--
2.39.2
Dear Nan, Thank you for your patch. Am 30.08.25 um 11:05 schrieb linan666@huaweicloud.com: > From: Li Nan <linan122@huawei.com> > > The 'mddev->recovery' flags can change during md_do_sync(), leading to > inconsistencies. For example, starting with MD_RECOVERY_RECOVER and > ending with MD_RECOVERY_SYNC can cause incorrect offset updates. Can you give a concrete example? > To avoid this, use the 'action' determined at the beginning of the > function instead of repeatedly checking 'mddev->recovery'. Do you have a reproducer? Add a Fixes: tag? > Signed-off-by: Li Nan <linan122@huawei.com> > --- > drivers/md/md.c | 21 +++++++++------------ > 1 file changed, 9 insertions(+), 12 deletions(-) > > diff --git a/drivers/md/md.c b/drivers/md/md.c > index 6828a569e819..67cda9b64c87 100644 > --- a/drivers/md/md.c > +++ b/drivers/md/md.c > @@ -9516,7 +9516,7 @@ void md_do_sync(struct md_thread *thread) > > skipped = 0; > > - if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && > + if (action != ACTION_RESHAPE && > ((mddev->curr_resync > mddev->curr_resync_completed && > (mddev->curr_resync - mddev->curr_resync_completed) > > (max_sectors >> 4)) || > @@ -9529,8 +9529,7 @@ void md_do_sync(struct md_thread *thread) > wait_event(mddev->recovery_wait, > atomic_read(&mddev->recovery_active) == 0); > mddev->curr_resync_completed = j; > - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && > - j > mddev->resync_offset) > + if (action == ACTION_RESYNC && j > mddev->resync_offset) > mddev->resync_offset = j; > update_time = jiffies; > set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); > @@ -9646,7 +9645,7 @@ void md_do_sync(struct md_thread *thread) > blk_finish_plug(&plug); > wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); > > - if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && > + if (action != ACTION_RESHAPE && > !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && > mddev->curr_resync >= MD_RESYNC_ACTIVE) { > mddev->curr_resync_completed = mddev->curr_resync; > @@ -9654,9 +9653,8 @@ void md_do_sync(struct md_thread *thread) > } > mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); > > - if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && > - mddev->curr_resync > MD_RESYNC_ACTIVE) { > - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { > + if (action != ACTION_CHECK && mddev->curr_resync > MD_RESYNC_ACTIVE) { > + if (action == ACTION_RESYNC) { > if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { > if (mddev->curr_resync >= mddev->resync_offset) { > pr_debug("md: checkpointing %s of %s.\n", > @@ -9674,8 +9672,7 @@ void md_do_sync(struct md_thread *thread) > } else { > if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) > mddev->curr_resync = MaxSector; > - if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && > - test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { > + if (action == ACTION_RECOVER) { What about `MD_RECOVERY_RESHAPE`? > rcu_read_lock(); > rdev_for_each_rcu(rdev, mddev) > if (mddev->delta_disks >= 0 && > @@ -9692,7 +9689,7 @@ void md_do_sync(struct md_thread *thread) > set_mask_bits(&mddev->sb_flags, 0, > BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); > > - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && > + if (action == ACTION_RESHAPE && > !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && > mddev->delta_disks > 0 && > mddev->pers->finish_reshape && > @@ -9709,10 +9706,10 @@ void md_do_sync(struct md_thread *thread) > spin_lock(&mddev->lock); > if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { > /* We completed so min/max setting can be forgotten if used. */ > - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) > + if (action == ACTION_REPAIR) > mddev->resync_min = 0; > mddev->resync_max = MaxSector; > - } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) > + } else if (action == ACTION_REPAIR) > mddev->resync_min = mddev->curr_resync_completed; > set_bit(MD_RECOVERY_DONE, &mddev->recovery); > mddev->curr_resync = MD_RESYNC_NONE; I have not fully grogged yet, what the consequence of a mismatch between `action`, set at the beginning, and changed flags in `&mddev->recovery` are. Kind regards, Paul
在 2025/8/30 17:51, Paul Menzel 写道: > Dear Nan, > > > Thank you for your patch. > > Am 30.08.25 um 11:05 schrieb linan666@huaweicloud.com: >> From: Li Nan <linan122@huawei.com> >> >> The 'mddev->recovery' flags can change during md_do_sync(), leading to >> inconsistencies. For example, starting with MD_RECOVERY_RECOVER and >> ending with MD_RECOVERY_SYNC can cause incorrect offset updates. > > Can you give a concrete example? > T1 T2 md_do_sync action = ACTION_RECOVER (write sysfs) action_store set MD_RECOVERY_SYNC [ do recovery ] update resync_offset The corresponding code is: ``` if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { ->SYNC is set, But what we do is recovery if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->resync_offset) { pr_debug("md: checkpointing %s of %s.\n", desc, mdname(mddev)); if (test_bit(MD_RECOVERY_ERROR, &mddev->recovery)) mddev->resync_offset = mddev->curr_resync_completed; else mddev->resync_offset = mddev->curr_resync; } ``` >> To avoid this, use the 'action' determined at the beginning of the >> function instead of repeatedly checking 'mddev->recovery'. > > Do you have a reproducer? > I don't have a reproducer because reproducing it requires modifying the kernel. The approximate steps are: - Modify the kernel to add a delay before the above check. - Trigger recovery by removing and adding disks. - After recovery completes, write to the sysfs interface at the delay point to set the sync flag. > Add a Fixes: tag? > I will add it in v2. >> Signed-off-by: Li Nan <linan122@huawei.com> >> --- >> drivers/md/md.c | 21 +++++++++------------ >> 1 file changed, 9 insertions(+), 12 deletions(-) >> >> diff --git a/drivers/md/md.c b/drivers/md/md.c >> index 6828a569e819..67cda9b64c87 100644 >> --- a/drivers/md/md.c >> +++ b/drivers/md/md.c >> @@ -9516,7 +9516,7 @@ void md_do_sync(struct md_thread *thread) >> skipped = 0; >> - if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && >> + if (action != ACTION_RESHAPE && >> ((mddev->curr_resync > mddev->curr_resync_completed && >> (mddev->curr_resync - mddev->curr_resync_completed) >> > (max_sectors >> 4)) || >> @@ -9529,8 +9529,7 @@ void md_do_sync(struct md_thread *thread) >> wait_event(mddev->recovery_wait, >> atomic_read(&mddev->recovery_active) == 0); >> mddev->curr_resync_completed = j; >> - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && >> - j > mddev->resync_offset) >> + if (action == ACTION_RESYNC && j > mddev->resync_offset) >> mddev->resync_offset = j; >> update_time = jiffies; >> set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); >> @@ -9646,7 +9645,7 @@ void md_do_sync(struct md_thread *thread) >> blk_finish_plug(&plug); >> wait_event(mddev->recovery_wait, >> !atomic_read(&mddev->recovery_active)); >> - if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && >> + if (action != ACTION_RESHAPE && >> !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && >> mddev->curr_resync >= MD_RESYNC_ACTIVE) { >> mddev->curr_resync_completed = mddev->curr_resync; >> @@ -9654,9 +9653,8 @@ void md_do_sync(struct md_thread *thread) >> } >> mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); >> - if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && >> - mddev->curr_resync > MD_RESYNC_ACTIVE) { >> - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { >> + if (action != ACTION_CHECK && mddev->curr_resync > MD_RESYNC_ACTIVE) { >> + if (action == ACTION_RESYNC) { >> if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { >> if (mddev->curr_resync >= mddev->resync_offset) { >> pr_debug("md: checkpointing %s of %s.\n", >> @@ -9674,8 +9672,7 @@ void md_do_sync(struct md_thread *thread) >> } else { >> if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) >> mddev->curr_resync = MaxSector; >> - if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && >> - test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { >> + if (action == ACTION_RECOVER) { > > What about `MD_RECOVERY_RESHAPE`? > >> rcu_read_lock(); >> rdev_for_each_rcu(rdev, mddev) >> if (mddev->delta_disks >= 0 && >> @@ -9692,7 +9689,7 @@ void md_do_sync(struct md_thread *thread) >> set_mask_bits(&mddev->sb_flags, 0, >> BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); >> - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && >> + if (action == ACTION_RESHAPE && >> !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && >> mddev->delta_disks > 0 && >> mddev->pers->finish_reshape && >> @@ -9709,10 +9706,10 @@ void md_do_sync(struct md_thread *thread) >> spin_lock(&mddev->lock); >> if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { >> /* We completed so min/max setting can be forgotten if used. */ >> - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) >> + if (action == ACTION_REPAIR) >> mddev->resync_min = 0; >> mddev->resync_max = MaxSector; >> - } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) >> + } else if (action == ACTION_REPAIR) >> mddev->resync_min = mddev->curr_resync_completed; >> set_bit(MD_RECOVERY_DONE, &mddev->recovery); >> mddev->curr_resync = MD_RESYNC_NONE; > > I have not fully grogged yet, what the consequence of a mismatch between > `action`, set at the beginning, and changed flags in `&mddev->recovery` are. > > > Kind regards, > > Paul > > . -- Thanks, Nan
在 2025/9/1 10:16, Li Nan 写道: > > > 在 2025/8/30 17:51, Paul Menzel 写道: >> Dear Nan, >> >> >> Thank you for your patch. >> >> Am 30.08.25 um 11:05 schrieb linan666@huaweicloud.com: >>> From: Li Nan <linan122@huawei.com> >>> >>> The 'mddev->recovery' flags can change during md_do_sync(), leading to >>> inconsistencies. For example, starting with MD_RECOVERY_RECOVER and >>> ending with MD_RECOVERY_SYNC can cause incorrect offset updates. >> >> Can you give a concrete example? >> > > T1 T2 > md_do_sync > action = ACTION_RECOVER > (write sysfs) > action_store > set MD_RECOVERY_SYNC > [ do recovery ] > update resync_offset > > The corresponding code is: > ``` > if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && > mddev->curr_resync > MD_RESYNC_ACTIVE) { > if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { > ->SYNC is set, But what we do is recovery > if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { > if (mddev->curr_resync >= > mddev->resync_offset) { > pr_debug("md: checkpointing %s of > %s.\n", > desc, mdname(mddev)); > if (test_bit(MD_RECOVERY_ERROR, > &mddev->recovery)) > mddev->resync_offset = > > mddev->curr_resync_completed; > else > mddev->resync_offset = > mddev->curr_resync; > } > ``` > >>> To avoid this, use the 'action' determined at the beginning of the >>> function instead of repeatedly checking 'mddev->recovery'. >> >> Do you have a reproducer? >> > > I don't have a reproducer because reproducing it requires modifying the > kernel. The approximate steps are: > > - Modify the kernel to add a delay before the above check. > - Trigger recovery by removing and adding disks. > - After recovery completes, write to the sysfs interface at the delay point > to set the sync flag. > Please ignore my previous reply — it was wrong. When MD_RECOVERY_RUNNING is set, the recovery state should not be changed, so this is just a cleanup. I will further improve the code about sync finish in v2. -- Thanks, Nan
© 2016 - 2025 Red Hat, Inc.