md: ensure consistent action state in md_do_sync

[PATCH] md: ensure consistent action state in md_do_sync

Posted by linan666@huaweicloud.com 1 month ago

From: Li Nan <linan122@huawei.com>

The 'mddev->recovery' flags can change during md_do_sync(), leading to
inconsistencies. For example, starting with MD_RECOVERY_RECOVER and
ending with MD_RECOVERY_SYNC can cause incorrect offset updates.

To avoid this, use the 'action' determined at the beginning of the
function instead of repeatedly checking 'mddev->recovery'.

Signed-off-by: Li Nan <linan122@huawei.com>
---
 drivers/md/md.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6828a569e819..67cda9b64c87 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9516,7 +9516,7 @@ void md_do_sync(struct md_thread *thread)
 
 		skipped = 0;
 
-		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+		if (action != ACTION_RESHAPE &&
 		    ((mddev->curr_resync > mddev->curr_resync_completed &&
 		      (mddev->curr_resync - mddev->curr_resync_completed)
 		      > (max_sectors >> 4)) ||
@@ -9529,8 +9529,7 @@ void md_do_sync(struct md_thread *thread)
 			wait_event(mddev->recovery_wait,
 				   atomic_read(&mddev->recovery_active) == 0);
 			mddev->curr_resync_completed = j;
-			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
-			    j > mddev->resync_offset)
+			if (action == ACTION_RESYNC && j > mddev->resync_offset)
 				mddev->resync_offset = j;
 			update_time = jiffies;
 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
@@ -9646,7 +9645,7 @@ void md_do_sync(struct md_thread *thread)
 	blk_finish_plug(&plug);
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
-	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	if (action != ACTION_RESHAPE &&
 	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 	    mddev->curr_resync >= MD_RESYNC_ACTIVE) {
 		mddev->curr_resync_completed = mddev->curr_resync;
@@ -9654,9 +9653,8 @@ void md_do_sync(struct md_thread *thread)
 	}
 	mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
 
-	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
-	    mddev->curr_resync > MD_RESYNC_ACTIVE) {
-		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+	if (action != ACTION_CHECK && mddev->curr_resync > MD_RESYNC_ACTIVE) {
+		if (action == ACTION_RESYNC) {
 			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 				if (mddev->curr_resync >= mddev->resync_offset) {
 					pr_debug("md: checkpointing %s of %s.\n",
@@ -9674,8 +9672,7 @@ void md_do_sync(struct md_thread *thread)
 		} else {
 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 				mddev->curr_resync = MaxSector;
-			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
+			if (action == ACTION_RECOVER) {
 				rcu_read_lock();
 				rdev_for_each_rcu(rdev, mddev)
 					if (mddev->delta_disks >= 0 &&
@@ -9692,7 +9689,7 @@ void md_do_sync(struct md_thread *thread)
 	set_mask_bits(&mddev->sb_flags, 0,
 		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
 
-	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	if (action == ACTION_RESHAPE &&
 			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 			mddev->delta_disks > 0 &&
 			mddev->pers->finish_reshape &&
@@ -9709,10 +9706,10 @@ void md_do_sync(struct md_thread *thread)
 	spin_lock(&mddev->lock);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 		/* We completed so min/max setting can be forgotten if used. */
-		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		if (action == ACTION_REPAIR)
 			mddev->resync_min = 0;
 		mddev->resync_max = MaxSector;
-	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+	} else if (action == ACTION_REPAIR)
 		mddev->resync_min = mddev->curr_resync_completed;
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	mddev->curr_resync = MD_RESYNC_NONE;
-- 
2.39.2

Re: [PATCH] md: ensure consistent action state in md_do_sync

Posted by Paul Menzel 1 month ago

Dear Nan,


Thank you for your patch.

Am 30.08.25 um 11:05 schrieb linan666@huaweicloud.com:
> From: Li Nan <linan122@huawei.com>
> 
> The 'mddev->recovery' flags can change during md_do_sync(), leading to
> inconsistencies. For example, starting with MD_RECOVERY_RECOVER and
> ending with MD_RECOVERY_SYNC can cause incorrect offset updates.

Can you give a concrete example?

> To avoid this, use the 'action' determined at the beginning of the
> function instead of repeatedly checking 'mddev->recovery'.

Do you have a reproducer?

Add a Fixes: tag?

> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
>   drivers/md/md.c | 21 +++++++++------------
>   1 file changed, 9 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 6828a569e819..67cda9b64c87 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -9516,7 +9516,7 @@ void md_do_sync(struct md_thread *thread)
>   
>   		skipped = 0;
>   
> -		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
> +		if (action != ACTION_RESHAPE &&
>   		    ((mddev->curr_resync > mddev->curr_resync_completed &&
>   		      (mddev->curr_resync - mddev->curr_resync_completed)
>   		      > (max_sectors >> 4)) ||
> @@ -9529,8 +9529,7 @@ void md_do_sync(struct md_thread *thread)
>   			wait_event(mddev->recovery_wait,
>   				   atomic_read(&mddev->recovery_active) == 0);
>   			mddev->curr_resync_completed = j;
> -			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
> -			    j > mddev->resync_offset)
> +			if (action == ACTION_RESYNC && j > mddev->resync_offset)
>   				mddev->resync_offset = j;
>   			update_time = jiffies;
>   			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
> @@ -9646,7 +9645,7 @@ void md_do_sync(struct md_thread *thread)
>   	blk_finish_plug(&plug);
>   	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
>   
> -	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
> +	if (action != ACTION_RESHAPE &&
>   	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
>   	    mddev->curr_resync >= MD_RESYNC_ACTIVE) {
>   		mddev->curr_resync_completed = mddev->curr_resync;
> @@ -9654,9 +9653,8 @@ void md_do_sync(struct md_thread *thread)
>   	}
>   	mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
>   
> -	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
> -	    mddev->curr_resync > MD_RESYNC_ACTIVE) {
> -		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
> +	if (action != ACTION_CHECK && mddev->curr_resync > MD_RESYNC_ACTIVE) {
> +		if (action == ACTION_RESYNC) {
>   			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
>   				if (mddev->curr_resync >= mddev->resync_offset) {
>   					pr_debug("md: checkpointing %s of %s.\n",
> @@ -9674,8 +9672,7 @@ void md_do_sync(struct md_thread *thread)
>   		} else {
>   			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
>   				mddev->curr_resync = MaxSector;
> -			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
> -			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
> +			if (action == ACTION_RECOVER) {

What about `MD_RECOVERY_RESHAPE`?

>   				rcu_read_lock();
>   				rdev_for_each_rcu(rdev, mddev)
>   					if (mddev->delta_disks >= 0 &&
> @@ -9692,7 +9689,7 @@ void md_do_sync(struct md_thread *thread)
>   	set_mask_bits(&mddev->sb_flags, 0,
>   		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
>   
> -	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
> +	if (action == ACTION_RESHAPE &&
>   			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
>   			mddev->delta_disks > 0 &&
>   			mddev->pers->finish_reshape &&
> @@ -9709,10 +9706,10 @@ void md_do_sync(struct md_thread *thread)
>   	spin_lock(&mddev->lock);
>   	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
>   		/* We completed so min/max setting can be forgotten if used. */
> -		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
> +		if (action == ACTION_REPAIR)
>   			mddev->resync_min = 0;
>   		mddev->resync_max = MaxSector;
> -	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
> +	} else if (action == ACTION_REPAIR)
>   		mddev->resync_min = mddev->curr_resync_completed;
>   	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
>   	mddev->curr_resync = MD_RESYNC_NONE;

I have not fully grogged yet, what the consequence of a mismatch between 
`action`, set at the beginning, and changed flags in `&mddev->recovery` are.


Kind regards,

Paul

Re: [PATCH] md: ensure consistent action state in md_do_sync

Posted by Li Nan 1 month ago


在 2025/8/30 17:51, Paul Menzel 写道:
> Dear Nan,
> 
> 
> Thank you for your patch.
> 
> Am 30.08.25 um 11:05 schrieb linan666@huaweicloud.com:
>> From: Li Nan <linan122@huawei.com>
>>
>> The 'mddev->recovery' flags can change during md_do_sync(), leading to
>> inconsistencies. For example, starting with MD_RECOVERY_RECOVER and
>> ending with MD_RECOVERY_SYNC can cause incorrect offset updates.
> 
> Can you give a concrete example?
> 

T1					T2
md_do_sync
  action = ACTION_RECOVER
					(write sysfs)
					action_store
					 set MD_RECOVERY_SYNC
  [ do recovery ]
  update resync_offset

The corresponding code is:
```
         if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
             mddev->curr_resync > MD_RESYNC_ACTIVE) {
                 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {	->SYNC 
is set, But what we do is recovery
                         if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
                                 if (mddev->curr_resync >= 
mddev->resync_offset) {
                                         pr_debug("md: checkpointing %s of 
%s.\n",
                                                  desc, mdname(mddev));
                                         if (test_bit(MD_RECOVERY_ERROR,
                                                 &mddev->recovery))
                                                 mddev->resync_offset =
 
mddev->curr_resync_completed;
                                         else
                                                 mddev->resync_offset =
                                                         mddev->curr_resync;
                                 }
```

>> To avoid this, use the 'action' determined at the beginning of the
>> function instead of repeatedly checking 'mddev->recovery'.
> 
> Do you have a reproducer?
> 

I don't have a reproducer because reproducing it requires modifying the
kernel. The approximate steps are:

- Modify the kernel to add a delay before the above check.
- Trigger recovery by removing and adding disks.
- After recovery completes, write to the sysfs interface at the delay point
to set the sync flag.

> Add a Fixes: tag?
> 

I will add it in v2.

>> Signed-off-by: Li Nan <linan122@huawei.com>
>> ---
>>   drivers/md/md.c | 21 +++++++++------------
>>   1 file changed, 9 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 6828a569e819..67cda9b64c87 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -9516,7 +9516,7 @@ void md_do_sync(struct md_thread *thread)
>>           skipped = 0;
>> -        if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
>> +        if (action != ACTION_RESHAPE &&
>>               ((mddev->curr_resync > mddev->curr_resync_completed &&
>>                 (mddev->curr_resync - mddev->curr_resync_completed)
>>                 > (max_sectors >> 4)) ||
>> @@ -9529,8 +9529,7 @@ void md_do_sync(struct md_thread *thread)
>>               wait_event(mddev->recovery_wait,
>>                      atomic_read(&mddev->recovery_active) == 0);
>>               mddev->curr_resync_completed = j;
>> -            if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
>> -                j > mddev->resync_offset)
>> +            if (action == ACTION_RESYNC && j > mddev->resync_offset)
>>                   mddev->resync_offset = j;
>>               update_time = jiffies;
>>               set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
>> @@ -9646,7 +9645,7 @@ void md_do_sync(struct md_thread *thread)
>>       blk_finish_plug(&plug);
>>       wait_event(mddev->recovery_wait, 
>> !atomic_read(&mddev->recovery_active));
>> -    if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
>> +    if (action != ACTION_RESHAPE &&
>>           !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
>>           mddev->curr_resync >= MD_RESYNC_ACTIVE) {
>>           mddev->curr_resync_completed = mddev->curr_resync;
>> @@ -9654,9 +9653,8 @@ void md_do_sync(struct md_thread *thread)
>>       }
>>       mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
>> -    if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
>> -        mddev->curr_resync > MD_RESYNC_ACTIVE) {
>> -        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
>> +    if (action != ACTION_CHECK && mddev->curr_resync > MD_RESYNC_ACTIVE) {
>> +        if (action == ACTION_RESYNC) {
>>               if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
>>                   if (mddev->curr_resync >= mddev->resync_offset) {
>>                       pr_debug("md: checkpointing %s of %s.\n",
>> @@ -9674,8 +9672,7 @@ void md_do_sync(struct md_thread *thread)
>>           } else {
>>               if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
>>                   mddev->curr_resync = MaxSector;
>> -            if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
>> -                test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
>> +            if (action == ACTION_RECOVER) {
> 
> What about `MD_RECOVERY_RESHAPE`?
> 
>>                   rcu_read_lock();
>>                   rdev_for_each_rcu(rdev, mddev)
>>                       if (mddev->delta_disks >= 0 &&
>> @@ -9692,7 +9689,7 @@ void md_do_sync(struct md_thread *thread)
>>       set_mask_bits(&mddev->sb_flags, 0,
>>                 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
>> -    if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
>> +    if (action == ACTION_RESHAPE &&
>>               !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
>>               mddev->delta_disks > 0 &&
>>               mddev->pers->finish_reshape &&
>> @@ -9709,10 +9706,10 @@ void md_do_sync(struct md_thread *thread)
>>       spin_lock(&mddev->lock);
>>       if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
>>           /* We completed so min/max setting can be forgotten if used. */
>> -        if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
>> +        if (action == ACTION_REPAIR)
>>               mddev->resync_min = 0;
>>           mddev->resync_max = MaxSector;
>> -    } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
>> +    } else if (action == ACTION_REPAIR)
>>           mddev->resync_min = mddev->curr_resync_completed;
>>       set_bit(MD_RECOVERY_DONE, &mddev->recovery);
>>       mddev->curr_resync = MD_RESYNC_NONE;
> 
> I have not fully grogged yet, what the consequence of a mismatch between 
> `action`, set at the beginning, and changed flags in `&mddev->recovery` are.
> 
> 
> Kind regards,
> 
> Paul
> 
> .

-- 
Thanks,
Nan

Re: [PATCH] md: ensure consistent action state in md_do_sync

Posted by Li Nan 1 month ago


在 2025/9/1 10:16, Li Nan 写道:
> 
> 
> 在 2025/8/30 17:51, Paul Menzel 写道:
>> Dear Nan,
>>
>>
>> Thank you for your patch.
>>
>> Am 30.08.25 um 11:05 schrieb linan666@huaweicloud.com:
>>> From: Li Nan <linan122@huawei.com>
>>>
>>> The 'mddev->recovery' flags can change during md_do_sync(), leading to
>>> inconsistencies. For example, starting with MD_RECOVERY_RECOVER and
>>> ending with MD_RECOVERY_SYNC can cause incorrect offset updates.
>>
>> Can you give a concrete example?
>>
> 
> T1                    T2
> md_do_sync
>   action = ACTION_RECOVER
>                      (write sysfs)
>                      action_store
>                       set MD_RECOVERY_SYNC
>   [ do recovery ]
>   update resync_offset
> 
> The corresponding code is:
> ```
>          if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
>              mddev->curr_resync > MD_RESYNC_ACTIVE) {
>                  if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {    
> ->SYNC is set, But what we do is recovery
>                          if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
>                                  if (mddev->curr_resync >= 
> mddev->resync_offset) {
>                                          pr_debug("md: checkpointing %s of 
> %s.\n",
>                                                   desc, mdname(mddev));
>                                          if (test_bit(MD_RECOVERY_ERROR,
>                                                  &mddev->recovery))
>                                                  mddev->resync_offset =
> 
> mddev->curr_resync_completed;
>                                          else
>                                                  mddev->resync_offset =
>                                                          mddev->curr_resync;
>                                  }
> ```
> 
>>> To avoid this, use the 'action' determined at the beginning of the
>>> function instead of repeatedly checking 'mddev->recovery'.
>>
>> Do you have a reproducer?
>>
> 
> I don't have a reproducer because reproducing it requires modifying the
> kernel. The approximate steps are:
> 
> - Modify the kernel to add a delay before the above check.
> - Trigger recovery by removing and adding disks.
> - After recovery completes, write to the sysfs interface at the delay point
> to set the sync flag.
> 

Please ignore my previous reply — it was wrong. When MD_RECOVERY_RUNNING
is set, the recovery state should not be changed, so this is just a
cleanup. I will further improve the code about sync finish in v2.

-- 
Thanks,
Nan