From: Yu Kuai <yukuai3@huawei.com>
Prepare to handle 'idle' and 'frozen' differently to fix a deadlock. There
are no functional changes except that MD_RECOVERY_RUNNING is checked
again after 'reconfig_mutex' is held.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
drivers/md/md.c | 61 ++++++++++++++++++++++++++++++++++++-------------
1 file changed, 45 insertions(+), 16 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9b97731e1fe4..23e8e7eae062 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4755,6 +4755,46 @@ action_show(struct mddev *mddev, char *page)
return sprintf(page, "%s\n", type);
}
+static void stop_sync_thread(struct mddev *mddev)
+{
+ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return;
+
+ if (mddev_lock(mddev))
+ return;
+
+ /*
+ * Check again in case MD_RECOVERY_RUNNING is cleared before lock is
+ * held.
+ */
+ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+ mddev_unlock(mddev);
+ return;
+ }
+
+ if (work_pending(&mddev->del_work))
+ flush_workqueue(md_misc_wq);
+
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_reap_sync_thread(mddev);
+ }
+
+ mddev_unlock(mddev);
+}
+
+static void idle_sync_thread(struct mddev *mddev)
+{
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ stop_sync_thread(mddev);
+}
+
+static void frozen_sync_thread(struct mddev *mddev)
+{
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ stop_sync_thread(mddev);
+}
+
static ssize_t
action_store(struct mddev *mddev, const char *page, size_t len)
{
@@ -4762,22 +4802,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
return -EINVAL;
- if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
- if (cmd_match(page, "frozen"))
- set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- else
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
- mddev_lock(mddev) == 0) {
- if (work_pending(&mddev->del_work))
- flush_workqueue(md_misc_wq);
- if (mddev->sync_thread) {
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_reap_sync_thread(mddev);
- }
- mddev_unlock(mddev);
- }
- } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ if (cmd_match(page, "idle"))
+ idle_sync_thread(mddev);
+ else if (cmd_match(page, "frozen"))
+ frozen_sync_thread(mddev);
+ else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
else if (cmd_match(page, "resync"))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
--
2.39.2
在 2023/5/29 下午9:20, Yu Kuai 写道:
> From: Yu Kuai <yukuai3@huawei.com>
>
> Prepare to handle 'idle' and 'frozen' differently to fix a deadlock, there
> are no functional changes except that MD_RECOVERY_RUNNING is checked
> again after 'reconfig_mutex' is held.
Can you explain more about why it needs to check MD_RECOVERY_RUNNING
again here?
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
> drivers/md/md.c | 61 ++++++++++++++++++++++++++++++++++++-------------
> 1 file changed, 45 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 9b97731e1fe4..23e8e7eae062 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -4755,6 +4755,46 @@ action_show(struct mddev *mddev, char *page)
> return sprintf(page, "%s\n", type);
> }
>
> +static void stop_sync_thread(struct mddev *mddev)
> +{
> + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
> + return;
> +
> + if (mddev_lock(mddev))
> + return;
> +
> + /*
> + * Check again in case MD_RECOVERY_RUNNING is cleared before lock is
> + * held.
> + */
> + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
> + mddev_unlock(mddev);
> + return;
> + }
> +
> + if (work_pending(&mddev->del_work))
> + flush_workqueue(md_misc_wq);
> +
> + if (mddev->sync_thread) {
> + set_bit(MD_RECOVERY_INTR, &mddev->recovery);
> + md_reap_sync_thread(mddev);
> + }
> +
> + mddev_unlock(mddev);
> +}
> +
> +static void idle_sync_thread(struct mddev *mddev)
> +{
> + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
> + stop_sync_thread(mddev);
> +}
> +
> +static void frozen_sync_thread(struct mddev *mddev)
> +{
> + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
> + stop_sync_thread(mddev);
> +}
> +
> static ssize_t
> action_store(struct mddev *mddev, const char *page, size_t len)
> {
> @@ -4762,22 +4802,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
> return -EINVAL;
>
>
> - if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
> - if (cmd_match(page, "frozen"))
> - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
> - else
> - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
> - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
> - mddev_lock(mddev) == 0) {
> - if (work_pending(&mddev->del_work))
> - flush_workqueue(md_misc_wq);
> - if (mddev->sync_thread) {
> - set_bit(MD_RECOVERY_INTR, &mddev->recovery);
> - md_reap_sync_thread(mddev);
> - }
> - mddev_unlock(mddev);
> - }
> - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
> + if (cmd_match(page, "idle"))
> + idle_sync_thread(mddev);
> + else if (cmd_match(page, "frozen"))
> + frozen_sync_thread(mddev);
> + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
> return -EBUSY;
> else if (cmd_match(page, "resync"))
> clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
Hi,
在 2023/06/13 16:02, Xiao Ni 写道:
>
> 在 2023/5/29 下午9:20, Yu Kuai 写道:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Prepare to handle 'idle' and 'frozen' differently to fix a deadlock,
>> there
>> are no functional changes except that MD_RECOVERY_RUNNING is checked
>> again after 'reconfig_mutex' is held.
>
>
> Can you explain more about why it needs to check MD_RECOVERY_RUNNING
> again here?
As I explained in the following comment:
>> + /*
>> + * Check again in case MD_RECOVERY_RUNNING is cleared before lock is
>> + * held.
>> + */
>> + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
>> + mddev_unlock(mddev);
>> + return;
>> + }
Thanks,
Kuai
On Tue, Jun 13, 2023 at 8:00 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> 在 2023/06/13 16:02, Xiao Ni 写道:
> >
> > 在 2023/5/29 下午9:20, Yu Kuai 写道:
> >> From: Yu Kuai <yukuai3@huawei.com>
> >>
> >> Prepare to handle 'idle' and 'frozen' differently to fix a deadlock,
> >> there
> >> are no functional changes except that MD_RECOVERY_RUNNING is checked
> >> again after 'reconfig_mutex' is held.
> >
> >
> > Can you explain more about why it needs to check MD_RECOVERY_RUNNING
> > again here?
>
> As I explain in the following comment:
Hi
Who can clear the flag before the lock is held?
Regards
Xiao
> >> + /*
> >> + * Check again in case MD_RECOVERY_RUNNING is cleared before lock is
> >> + * held.
> >> + */
> >> + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
> >> + mddev_unlock(mddev);
> >> + return;
> >> + }
>
> Thanks,
> Kuai
>
Hi,
在 2023/06/13 20:25, Xiao Ni 写道:
> On Tue, Jun 13, 2023 at 8:00 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>
>> Hi,
>>
>> 在 2023/06/13 16:02, Xiao Ni 写道:
>>>
>>> 在 2023/5/29 下午9:20, Yu Kuai 写道:
>>>> From: Yu Kuai <yukuai3@huawei.com>
>>>>
>>>> Prepare to handle 'idle' and 'frozen' differently to fix a deadlock,
>>>> there
>>>> are no functional changes except that MD_RECOVERY_RUNNING is checked
>>>> again after 'reconfig_mutex' is held.
>>>
>>>
>>> Can you explain more about why it needs to check MD_RECOVERY_RUNNING
>>> again here?
>>
>> As I explain in the following comment:
>
> Hi
>
> Who can clear the flag before the lock is held?
Basically, everywhere that can clear the flag...
// This context          // Other context
                         mutex_lock
                         ...
test_bit -> pass
                         clear_bit
                         mutex_unlock
mutex_lock
test_bit -> check again
Thanks,
Kuai
>
> Regards
> Xiao
>>>> + /*
>>>> + * Check again in case MD_RECOVERY_RUNNING is cleared before lock is
>>>> + * held.
>>>> + */
>>>> + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
>>>> + mddev_unlock(mddev);
>>>> + return;
>>>> + }
>>
>> Thanks,
>> Kuai
>>
>
> .
>
在 2023/6/13 下午8:44, Yu Kuai 写道:
> Hi,
>
> 在 2023/06/13 20:25, Xiao Ni 写道:
>> On Tue, Jun 13, 2023 at 8:00 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>>
>>> Hi,
>>>
>>> 在 2023/06/13 16:02, Xiao Ni 写道:
>>>>
>>>> 在 2023/5/29 下午9:20, Yu Kuai 写道:
>>>>> From: Yu Kuai <yukuai3@huawei.com>
>>>>>
>>>>> Prepare to handle 'idle' and 'frozen' differently to fix a deadlock,
>>>>> there
>>>>> are no functional changes except that MD_RECOVERY_RUNNING is checked
>>>>> again after 'reconfig_mutex' is held.
>>>>
>>>>
>>>> Can you explain more about why it needs to check MD_RECOVERY_RUNNING
>>>> again here?
>>>
>>> As I explain in the following comment:
>>
>> Hi
>>
>> Who can clear the flag before the lock is held?
>
> Basically every where that can clear the flag...
>
> // This context          // Other context
>                           mutex_lock
>                           ...
> test_bit -> pass
>                           clear_bit
>                           mutex_unlock
> mutex_lock
> test_bit -> check again
>
> Thanks,
> Kuai
At first, I wanted to find a specific case. Now I have the answer:
maybe two users want to stop the sync action at the same time, so one
of them can find the flag already cleared once it gets the lock. That
is the case the second check in the code handles.
Regards
Xiao
>>
>> Regards
>> Xiao
>>>>> + /*
>>>>> + * Check again in case MD_RECOVERY_RUNNING is cleared before
>>>>> lock is
>>>>> + * held.
>>>>> + */
>>>>> + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
>>>>> + mddev_unlock(mddev);
>>>>> + return;
>>>>> + }
>>>
>>> Thanks,
>>> Kuai
>>>
>>
>> .
>>
>
© 2016 - 2026 Red Hat, Inc.