From: Yu Kuai <yukuai3@huawei.com>
Currently, wait_barrier() will hold 'resync_lock' to read 'conf->barrier',
and io can't be dispatched until 'barrier' is dropped.
Since holding the 'barrier' is not common, convert 'resync_lock' to use
seqlock so that holding lock can be avoided in fast path.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
drivers/md/raid10.c | 62 ++++++++++++++++++++++++++++++---------------
drivers/md/raid10.h | 2 +-
2 files changed, 43 insertions(+), 21 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b70c207f7932..086216b051f5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -930,38 +930,60 @@ static void flush_pending_writes(struct r10conf *conf)
static void raise_barrier(struct r10conf *conf, int force)
{
- spin_lock_irq(&conf->resync_lock);
+ write_seqlock_irq(&conf->resync_lock);
BUG_ON(force && !conf->barrier);
/* Wait until no block IO is waiting (unless 'force') */
wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
- conf->resync_lock);
+ conf->resync_lock.lock);
/* block any new IO from starting */
- conf->barrier++;
+ WRITE_ONCE(conf->barrier, conf->barrier + 1);
/* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
- conf->resync_lock);
+ conf->resync_lock.lock);
- spin_unlock_irq(&conf->resync_lock);
+ write_sequnlock_irq(&conf->resync_lock);
}
static void lower_barrier(struct r10conf *conf)
{
unsigned long flags;
- spin_lock_irqsave(&conf->resync_lock, flags);
- conf->barrier--;
- spin_unlock_irqrestore(&conf->resync_lock, flags);
+
+ write_seqlock_irqsave(&conf->resync_lock, flags);
+ WRITE_ONCE(conf->barrier, conf->barrier - 1);
+ write_sequnlock_irqrestore(&conf->resync_lock, flags);
wake_up(&conf->wait_barrier);
}
+static bool wait_barrier_nolock(struct r10conf *conf)
+{
+ unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
+
+ if (seq & 1)
+ return false;
+
+ if (READ_ONCE(conf->barrier))
+ return false;
+
+ atomic_inc(&conf->nr_pending);
+ if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))
+ return true;
+
+ atomic_dec(&conf->nr_pending);
+ return false;
+}
+
static bool wait_barrier(struct r10conf *conf, bool nowait)
{
bool ret = true;
- spin_lock_irq(&conf->resync_lock);
+ if (wait_barrier_nolock(conf))
+ return true;
+
+ write_seqlock_irq(&conf->resync_lock);
if (conf->barrier) {
struct bio_list *bio_list = current->bio_list;
conf->nr_waiting++;
@@ -992,7 +1014,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
test_bit(MD_RECOVERY_RUNNING,
&conf->mddev->recovery) &&
conf->nr_queued > 0),
- conf->resync_lock);
+ conf->resync_lock.lock);
}
conf->nr_waiting--;
if (!conf->nr_waiting)
@@ -1001,7 +1023,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
/* Only increment nr_pending when we wait */
if (ret)
atomic_inc(&conf->nr_pending);
- spin_unlock_irq(&conf->resync_lock);
+ write_sequnlock_irq(&conf->resync_lock);
return ret;
}
@@ -1026,27 +1048,27 @@ static void freeze_array(struct r10conf *conf, int extra)
* must match the number of pending IOs (nr_pending) before
* we continue.
*/
- spin_lock_irq(&conf->resync_lock);
+ write_seqlock_irq(&conf->resync_lock);
conf->array_freeze_pending++;
- conf->barrier++;
+ WRITE_ONCE(conf->barrier, conf->barrier + 1);
conf->nr_waiting++;
wait_event_lock_irq_cmd(conf->wait_barrier,
atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
- conf->resync_lock,
+ conf->resync_lock.lock,
flush_pending_writes(conf));
conf->array_freeze_pending--;
- spin_unlock_irq(&conf->resync_lock);
+ write_sequnlock_irq(&conf->resync_lock);
}
static void unfreeze_array(struct r10conf *conf)
{
/* reverse the effect of the freeze */
- spin_lock_irq(&conf->resync_lock);
- conf->barrier--;
+ write_seqlock_irq(&conf->resync_lock);
+ WRITE_ONCE(conf->barrier, conf->barrier - 1);
conf->nr_waiting--;
wake_up(&conf->wait_barrier);
- spin_unlock_irq(&conf->resync_lock);
+ write_sequnlock_irq(&conf->resync_lock);
}
static sector_t choose_data_offset(struct r10bio *r10_bio,
@@ -4033,7 +4055,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list);
- spin_lock_init(&conf->resync_lock);
+ seqlock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
atomic_set(&conf->nr_pending, 0);
@@ -4352,7 +4374,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
rdev->new_raid_disk = rdev->raid_disk * 2;
rdev->sectors = size;
}
- conf->barrier = 1;
+ WRITE_ONCE(conf->barrier, 1);
}
return conf;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 5c0804d8bb1f..8c072ce0bc54 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -76,7 +76,7 @@ struct r10conf {
/* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list;
- spinlock_t resync_lock;
+ seqlock_t resync_lock;
atomic_t nr_pending;
int nr_waiting;
int nr_queued;
--
2.31.1
Hi,
On 8/29/22 9:15 PM, Yu Kuai wrote:
> +static bool wait_barrier_nolock(struct r10conf *conf)
> +{
> + unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
> +
> + if (seq & 1)
> + return false;
> +
> + if (READ_ONCE(conf->barrier))
> + return false;
> +
> + atomic_inc(&conf->nr_pending);
> + if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))
I think 'seq' is usually get from read_seqcount_begin.
> + return true;
> +
> + atomic_dec(&conf->nr_pending);
> + return false;
> +
Thanks,
Guoqing
Hi,
在 2022/09/02 17:42, Guoqing Jiang 写道:
> Hi,
>
> On 8/29/22 9:15 PM, Yu Kuai wrote:
>> +static bool wait_barrier_nolock(struct r10conf *conf)
>> +{
>> + unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
>> +
>> + if (seq & 1)
>> + return false;
>> +
>> + if (READ_ONCE(conf->barrier))
>> + return false;
>> +
>> + atomic_inc(&conf->nr_pending);
>> + if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))
>
> I think 'seq' is usually get from read_seqcount_begin.
read_seqcount_begin will loop until "seq & 1" is false; I'm afraid this
will cause high cpu usage in some cases.
What I try to do here is just try once, and fall back to hold lock and
wait if failed.
What do you think?
Thanks,
Kuai
>
>> + return true;
>> +
>> + atomic_dec(&conf->nr_pending);
>> + return false;
>> +
>
> Thanks,
> Guoqing
> .
>
On 9/2/22 6:02 PM, Yu Kuai wrote:
> Hi,
>
> 在 2022/09/02 17:42, Guoqing Jiang 写道:
>> Hi,
>>
>> On 8/29/22 9:15 PM, Yu Kuai wrote:
>>> +static bool wait_barrier_nolock(struct r10conf *conf)
>>> +{
>>> + unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
>>> +
>>> + if (seq & 1)
>>> + return false;
>>> +
>>> + if (READ_ONCE(conf->barrier))
>>> + return false;
>>> +
>>> + atomic_inc(&conf->nr_pending);
>>> + if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))
>>
>> I think 'seq' is usually get from read_seqcount_begin.
>
> read_seqcount_begin will loop until "seq & 1" is false; I'm afraid this
> will cause high cpu usage in some cases.
>
> What I try to do here is just try once, and fall back to hold lock and
> wait if failed.
Thanks for the explanation.
I'd suggest to try with read_seqcount_begin/read_seqcount_retry pattern
because it is a common usage in kernel I think, then check whether the
performance drops or not. Maybe it is related to lockdep issue, but I am
not sure.
Thanks,
Guoqing
Hi,
在 2022/09/02 18:16, Guoqing Jiang 写道:
>
>
> On 9/2/22 6:02 PM, Yu Kuai wrote:
>> Hi,
>>
>> 在 2022/09/02 17:42, Guoqing Jiang 写道:
>>> Hi,
>>>
>>> On 8/29/22 9:15 PM, Yu Kuai wrote:
>>>> +static bool wait_barrier_nolock(struct r10conf *conf)
>>>> +{
>>>> + unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
>>>> +
>>>> + if (seq & 1)
>>>> + return false;
>>>> +
>>>> + if (READ_ONCE(conf->barrier))
>>>> + return false;
>>>> +
>>>> + atomic_inc(&conf->nr_pending);
>>>> + if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))
>>>
>>> I think 'seq' is usually get from read_seqcount_begin.
>>
>> read_seqcount_begin will loop until "seq & 1" is false; I'm afraid this
>> will cause high cpu usage in some cases.
>>
>> What I try to do here is just try once, and fall back to hold lock and
>> wait if failed.
>
> Thanks for the explanation.
>
> I'd suggest to try with read_seqcount_begin/read_seqcount_retry pattern
> because it is a common usage in kernel I think, then check whether the
> performance drops or not. Maybe it is related to lockdep issue, but I am
> not sure.
I can try read_seqcount_begin/read_seqcount_retry.
Please take a look at the other thread; the lockdep issue is due to
inconsistent usage of the lock and the seqcount inside the seqlock:
wait_event() only releases the lock, while the seqcount is not released.
Thanks,
Kuai
>
> Thanks,
> Guoqing
> .
>
Hi,
On 2022-08-29 07:15, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
>
> Currently, wait_barrier() will hold 'resync_lock' to read 'conf->barrier',
> and io can't be dispatched until 'barrier' is dropped.
>
> Since holding the 'barrier' is not common, convert 'resync_lock' to use
> seqlock so that holding lock can be avoided in fast path.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
I've found some lockdep issues starting with this patch in md-next while
running mdadm tests (specifically 00raid10 when run about 10 times in a
row).
I've seen a couple different lock dep errors. The first seems to be
reproducible on this patch, then it possibly changes to the second on
subsequent patches. Not sure exactly.
I haven't dug into it too deeply, but hopefully it can be fixed easily.
Logan
--
================================
WARNING: inconsistent lock state
6.0.0-rc2-eid-vmlocalyes-dbg-00023-gfd68041d2fd2 #2604 Not tainted
--------------------------------
inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage.
fsck.ext3/1695 [HC0[0]:SC0[0]:HE0:SE1] takes:
ffff8881049b0120 (&____s->seqcount#10){+.?.}-{0:0}, at:
raid10_read_request+0x21f/0x760
(raid10.c:1134)
{IN-SOFTIRQ-W} state was registered at:
lock_acquire+0x183/0x440
lower_barrier+0x5e/0xd0
end_sync_request+0x178/0x180
end_sync_write+0x193/0x380
bio_endio+0x346/0x3a0
blk_update_request+0x1eb/0x7c0
blk_mq_end_request+0x30/0x50
lo_complete_rq+0xb7/0x100
blk_complete_reqs+0x77/0x90
blk_done_softirq+0x38/0x40
__do_softirq+0x10c/0x650
run_ksoftirqd+0x48/0x80
smpboot_thread_fn+0x302/0x400
kthread+0x18c/0x1c0
ret_from_fork+0x1f/0x30
irq event stamp: 8930
hardirqs last enabled at (8929): [<ffffffff96df8351>]
_raw_spin_unlock_irqrestore+0x31/0x60
hardirqs last disabled at (8930): [<ffffffff96df7fc5>]
_raw_spin_lock_irq+0x75/0x90
softirqs last enabled at (6768): [<ffffffff9554970e>]
__irq_exit_rcu+0xfe/0x150
softirqs last disabled at (6757): [<ffffffff9554970e>]
__irq_exit_rcu+0xfe/0x150
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(&____s->seqcount#10);
<Interrupt>
lock(&____s->seqcount#10);
*** DEADLOCK ***
2 locks held by fsck.ext3/1695:
#0: ffff8881007d0930 (mapping.invalidate_lock#2){++++}-{3:3}, at:
page_cache_ra_unbounded+0xaf/0x250
#1: ffff8881049b0120 (&____s->seqcount#10){+.?.}-{0:0}, at:
raid10_read_request+0x21f/0x760
stack backtrace:
CPU: 0 PID: 1695 Comm: fsck.ext3 Not tainted
6.0.0-rc2-eid-vmlocalyes-dbg-00023-gfd68041d2fd2 #2604
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2
04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x5a/0x74
dump_stack+0x10/0x12
print_usage_bug.part.0+0x233/0x246
mark_lock.part.0.cold+0x73/0x14f
mark_held_locks+0x71/0xa0
lockdep_hardirqs_on_prepare+0x158/0x230
trace_hardirqs_on+0x34/0x100
_raw_spin_unlock_irq+0x28/0x60
wait_barrier+0x4a6/0x720
raid10.c:1004
raid10_read_request+0x21f/0x760
raid10_make_request+0x2d6/0x2160
md_handle_request+0x3f3/0x5b0
md_submit_bio+0xd9/0x120
__submit_bio+0x9d/0x100
submit_bio_noacct_nocheck+0x1fd/0x470
submit_bio_noacct+0x4c2/0xbb0
submit_bio+0x3f/0xf0
mpage_readahead+0x323/0x3b0
blkdev_readahead+0x15/0x20
read_pages+0x136/0x7a0
page_cache_ra_unbounded+0x18d/0x250
page_cache_ra_order+0x2c9/0x400
ondemand_readahead+0x320/0x730
page_cache_sync_ra+0xa6/0xb0
filemap_get_pages+0x1eb/0xc00
filemap_read+0x1f1/0x770
blkdev_read_iter+0x164/0x310
vfs_read+0x467/0x5a0
__x64_sys_pread64+0x122/0x160
do_syscall_64+0x35/0x80
entry_SYSCALL_64_after_hwframe+0x46/0xb0
--
======================================================
WARNING: possible circular locking dependency detected
6.0.0-rc2-eid-vmlocalyes-dbg-00027-gcd6aa5181bbb #2600 Not tainted
------------------------------------------------------
systemd-udevd/292 is trying to acquire lock:
ffff88817b644170 (&(&conf->resync_lock)->lock){....}-{2:2}, at:
wait_barrier+0x4fe/0x770
but task is already holding lock:
ffff88817b644120 (&____s->seqcount#11){+.+.}-{0:0}, at:
raid10_read_request+0x21f/0x760
raid10.c:1140 wait_barrier()
raid10.c:1204 regular_request_wait()
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&____s->seqcount#11){+.+.}-{0:0}:
raise_barrier+0xe0/0x300
raid10.c:940 write_seqlock_irq()
raid10_sync_request+0x629/0x4750
raid10.c:3689 raise_barrier()
md_do_sync.cold+0x8ec/0x1491
md_thread+0x19d/0x2d0
kthread+0x18c/0x1c0
ret_from_fork+0x1f/0x30
-> #0 (&(&conf->resync_lock)->lock){....}-{2:2}:
__lock_acquire+0x1cb4/0x3170
lock_acquire+0x183/0x440
_raw_spin_lock_irq+0x4d/0x90
wait_barrier+0x4fe/0x770
raid10_read_request+0x21f/0x760
raid10.c:1140 wait_barrier()
raid10.c:1204 regular_request_wait()
raid10_make_request+0x2d6/0x2190
md_handle_request+0x3f3/0x5b0
md_submit_bio+0xd9/0x120
__submit_bio+0x9d/0x100
submit_bio_noacct_nocheck+0x1fd/0x470
submit_bio_noacct+0x4c2/0xbb0
submit_bio+0x3f/0xf0
submit_bh_wbc+0x270/0x2a0
block_read_full_folio+0x37c/0x580
blkdev_read_folio+0x18/0x20
filemap_read_folio+0x3f/0x110
do_read_cache_folio+0x13b/0x2c0
read_cache_folio+0x42/0x50
read_part_sector+0x74/0x1c0
read_lba+0x176/0x2a0
efi_partition+0x1ce/0xdd0
bdev_disk_changed+0x2e7/0x6a0
blkdev_get_whole+0xd2/0x140
blkdev_get_by_dev.part.0+0x37f/0x570
blkdev_get_by_dev+0x51/0x60
disk_scan_partitions+0xad/0xf0
blkdev_common_ioctl+0x3f3/0xdf0
blkdev_ioctl+0x1e1/0x450
__x64_sys_ioctl+0xc0/0x100
do_syscall_64+0x35/0x80
entry_SYSCALL_64_after_hwframe+0x46/0xb0
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&____s->seqcount#11);
lock(&(&conf->resync_lock)->lock);
lock(&____s->seqcount#11);
lock(&(&conf->resync_lock)->lock);
*** DEADLOCK ***
2 locks held by systemd-udevd/292:
#0: ffff88817a532528 (&disk->open_mutex){+.+.}-{3:3}, at:
blkdev_get_by_dev.part.0+0x180/0x570
#1: ffff88817b644120 (&____s->seqcount#11){+.+.}-{0:0}, at:
raid10_read_request+0x21f/0x760
stack backtrace:
CPU: 3 PID: 292 Comm: systemd-udevd Not tainted
6.0.0-rc2-eid-vmlocalyes-dbg-00027-gcd6aa5181bbb #2600
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2
04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x5a/0x74
dump_stack+0x10/0x12
print_circular_bug.cold+0x146/0x14b
check_noncircular+0x1ff/0x250
__lock_acquire+0x1cb4/0x3170
lock_acquire+0x183/0x440
_raw_spin_lock_irq+0x4d/0x90
wait_barrier+0x4fe/0x770
raid10_read_request+0x21f/0x760
raid10_make_request+0x2d6/0x2190
md_handle_request+0x3f3/0x5b0
md_submit_bio+0xd9/0x120
__submit_bio+0x9d/0x100
submit_bio_noacct_nocheck+0x1fd/0x470
submit_bio_noacct+0x4c2/0xbb0
submit_bio+0x3f/0xf0
submit_bh_wbc+0x270/0x2a0
block_read_full_folio+0x37c/0x580
blkdev_read_folio+0x18/0x20
filemap_read_folio+0x3f/0x110
do_read_cache_folio+0x13b/0x2c0
read_cache_folio+0x42/0x50
read_part_sector+0x74/0x1c0
read_lba+0x176/0x2a0
efi_partition+0x1ce/0xdd0
bdev_disk_changed+0x2e7/0x6a0
blkdev_get_whole+0xd2/0x140
blkdev_get_by_dev.part.0+0x37f/0x570
blkdev_get_by_dev+0x51/0x60
disk_scan_partitions+0xad/0xf0
blkdev_common_ioctl+0x3f3/0xdf0
blkdev_ioctl+0x1e1/0x450
__x64_sys_ioctl+0xc0/0x100
do_syscall_64+0x35/0x80
Hi,
在 2022/09/02 2:41, Logan Gunthorpe 写道:
> Hi,
>
> On 2022-08-29 07:15, Yu Kuai wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Currently, wait_barrier() will hold 'resync_lock' to read 'conf->barrier',
>> and io can't be dispatched until 'barrier' is dropped.
>>
>> Since holding the 'barrier' is not common, convert 'resync_lock' to use
>> seqlock so that holding lock can be avoided in fast path.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>
> I've found some lockdep issues starting with this patch in md-next while
> running mdadm tests (specifically 00raid10 when run about 10 times in a
> row).
>
> I've seen a couple different lock dep errors. The first seems to be
> reproducible on this patch, then it possibly changes to the second on
> subsequent patches. Not sure exactly.
>
Thanks for the test,
I think this is false positive because of the special usage here,
for example, in raise_barrier():
write_seqlock_irq
spin_lock_irq();
lock_acquire
do_write_seqcount_begin
seqcount_acquire
wait_event_lock_irq_cmd
spin_unlock_irq -> lock is released while seqcount is still held;
if another context acquires the lock again, lockdep
will trigger a warning.
...
spin_lock_irq
write_sequnlock_irq
Functionality should be ok, I'll try to find a way to prevent such
warning.
Thanks,
Kuai
> I haven't dug into it too deeply, but hopefully it can be fixed easily.
>
> Logan
>
> --
>
>
> ================================
> WARNING: inconsistent lock state
> 6.0.0-rc2-eid-vmlocalyes-dbg-00023-gfd68041d2fd2 #2604 Not tainted
> --------------------------------
> inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage.
> fsck.ext3/1695 [HC0[0]:SC0[0]:HE0:SE1] takes:
> ffff8881049b0120 (&____s->seqcount#10){+.?.}-{0:0}, at:
> raid10_read_request+0x21f/0x760
> (raid10.c:1134)
>
> {IN-SOFTIRQ-W} state was registered at:
> lock_acquire+0x183/0x440
> lower_barrier+0x5e/0xd0
> end_sync_request+0x178/0x180
> end_sync_write+0x193/0x380
> bio_endio+0x346/0x3a0
> blk_update_request+0x1eb/0x7c0
> blk_mq_end_request+0x30/0x50
> lo_complete_rq+0xb7/0x100
> blk_complete_reqs+0x77/0x90
> blk_done_softirq+0x38/0x40
> __do_softirq+0x10c/0x650
> run_ksoftirqd+0x48/0x80
> smpboot_thread_fn+0x302/0x400
> kthread+0x18c/0x1c0
> ret_from_fork+0x1f/0x30
>
> irq event stamp: 8930
> hardirqs last enabled at (8929): [<ffffffff96df8351>]
> _raw_spin_unlock_irqrestore+0x31/0x60
> hardirqs last disabled at (8930): [<ffffffff96df7fc5>]
> _raw_spin_lock_irq+0x75/0x90
> softirqs last enabled at (6768): [<ffffffff9554970e>]
> __irq_exit_rcu+0xfe/0x150
> softirqs last disabled at (6757): [<ffffffff9554970e>]
> __irq_exit_rcu+0xfe/0x150
>
> other info that might help us debug this:
> Possible unsafe locking scenario:
>
> CPU0
> ----
> lock(&____s->seqcount#10);
> <Interrupt>
> lock(&____s->seqcount#10);
>
> *** DEADLOCK ***
>
> 2 locks held by fsck.ext3/1695:
> #0: ffff8881007d0930 (mapping.invalidate_lock#2){++++}-{3:3}, at:
> page_cache_ra_unbounded+0xaf/0x250
> #1: ffff8881049b0120 (&____s->seqcount#10){+.?.}-{0:0}, at:
> raid10_read_request+0x21f/0x760
>
> stack backtrace:
> CPU: 0 PID: 1695 Comm: fsck.ext3 Not tainted
> 6.0.0-rc2-eid-vmlocalyes-dbg-00023-gfd68041d2fd2 #2604
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2
> 04/01/2014
> Call Trace:
> <TASK>
> dump_stack_lvl+0x5a/0x74
> dump_stack+0x10/0x12
> print_usage_bug.part.0+0x233/0x246
> mark_lock.part.0.cold+0x73/0x14f
> mark_held_locks+0x71/0xa0
> lockdep_hardirqs_on_prepare+0x158/0x230
> trace_hardirqs_on+0x34/0x100
> _raw_spin_unlock_irq+0x28/0x60
> wait_barrier+0x4a6/0x720
> raid10.c:1004
> raid10_read_request+0x21f/0x760
> raid10_make_request+0x2d6/0x2160
> md_handle_request+0x3f3/0x5b0
> md_submit_bio+0xd9/0x120
> __submit_bio+0x9d/0x100
> submit_bio_noacct_nocheck+0x1fd/0x470
> submit_bio_noacct+0x4c2/0xbb0
> submit_bio+0x3f/0xf0
> mpage_readahead+0x323/0x3b0
> blkdev_readahead+0x15/0x20
> read_pages+0x136/0x7a0
> page_cache_ra_unbounded+0x18d/0x250
> page_cache_ra_order+0x2c9/0x400
> ondemand_readahead+0x320/0x730
> page_cache_sync_ra+0xa6/0xb0
> filemap_get_pages+0x1eb/0xc00
> filemap_read+0x1f1/0x770
> blkdev_read_iter+0x164/0x310
> vfs_read+0x467/0x5a0
> __x64_sys_pread64+0x122/0x160
> do_syscall_64+0x35/0x80
> entry_SYSCALL_64_after_hwframe+0x46/0xb0
>
> --
>
> ======================================================
> WARNING: possible circular locking dependency detected
> 6.0.0-rc2-eid-vmlocalyes-dbg-00027-gcd6aa5181bbb #2600 Not tainted
> ------------------------------------------------------
> systemd-udevd/292 is trying to acquire lock:
> ffff88817b644170 (&(&conf->resync_lock)->lock){....}-{2:2}, at:
> wait_barrier+0x4fe/0x770
>
> but task is already holding lock:
> ffff88817b644120 (&____s->seqcount#11){+.+.}-{0:0}, at:
> raid10_read_request+0x21f/0x760
> raid10.c:1140 wait_barrier()
> raid10.c:1204 regular_request_wait()
>
>
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #1 (&____s->seqcount#11){+.+.}-{0:0}:
> raise_barrier+0xe0/0x300
> raid10.c:940 write_seqlock_irq()
> raid10_sync_request+0x629/0x4750
> raid10.c:3689 raise_barrier()
> md_do_sync.cold+0x8ec/0x1491
> md_thread+0x19d/0x2d0
> kthread+0x18c/0x1c0
> ret_from_fork+0x1f/0x30
>
> -> #0 (&(&conf->resync_lock)->lock){....}-{2:2}:
> __lock_acquire+0x1cb4/0x3170
> lock_acquire+0x183/0x440
> _raw_spin_lock_irq+0x4d/0x90
> wait_barrier+0x4fe/0x770
> raid10_read_request+0x21f/0x760
> raid10.c:1140 wait_barrier()
> raid10.c:1204 regular_request_wait()
> raid10_make_request+0x2d6/0x2190
> md_handle_request+0x3f3/0x5b0
> md_submit_bio+0xd9/0x120
> __submit_bio+0x9d/0x100
> submit_bio_noacct_nocheck+0x1fd/0x470
> submit_bio_noacct+0x4c2/0xbb0
> submit_bio+0x3f/0xf0
> submit_bh_wbc+0x270/0x2a0
> block_read_full_folio+0x37c/0x580
> blkdev_read_folio+0x18/0x20
> filemap_read_folio+0x3f/0x110
> do_read_cache_folio+0x13b/0x2c0
> read_cache_folio+0x42/0x50
> read_part_sector+0x74/0x1c0
> read_lba+0x176/0x2a0
> efi_partition+0x1ce/0xdd0
> bdev_disk_changed+0x2e7/0x6a0
> blkdev_get_whole+0xd2/0x140
> blkdev_get_by_dev.part.0+0x37f/0x570
> blkdev_get_by_dev+0x51/0x60
> disk_scan_partitions+0xad/0xf0
> blkdev_common_ioctl+0x3f3/0xdf0
> blkdev_ioctl+0x1e1/0x450
> __x64_sys_ioctl+0xc0/0x100
> do_syscall_64+0x35/0x80
> entry_SYSCALL_64_after_hwframe+0x46/0xb0
>
> other info that might help us debug this:
>
> Possible unsafe locking scenario:
>
> CPU0 CPU1
> ---- ----
> lock(&____s->seqcount#11);
> lock(&(&conf->resync_lock)->lock);
> lock(&____s->seqcount#11);
> lock(&(&conf->resync_lock)->lock);
>
> *** DEADLOCK ***
>
> 2 locks held by systemd-udevd/292:
> #0: ffff88817a532528 (&disk->open_mutex){+.+.}-{3:3}, at:
> blkdev_get_by_dev.part.0+0x180/0x570
> #1: ffff88817b644120 (&____s->seqcount#11){+.+.}-{0:0}, at:
> raid10_read_request+0x21f/0x760
>
> stack backtrace:
> CPU: 3 PID: 292 Comm: systemd-udevd Not tainted
> 6.0.0-rc2-eid-vmlocalyes-dbg-00027-gcd6aa5181bbb #2600
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2
> 04/01/2014
> Call Trace:
> <TASK>
> dump_stack_lvl+0x5a/0x74
> dump_stack+0x10/0x12
> print_circular_bug.cold+0x146/0x14b
> check_noncircular+0x1ff/0x250
> __lock_acquire+0x1cb4/0x3170
> lock_acquire+0x183/0x440
> _raw_spin_lock_irq+0x4d/0x90
> wait_barrier+0x4fe/0x770
> raid10_read_request+0x21f/0x760
> raid10_make_request+0x2d6/0x2190
> md_handle_request+0x3f3/0x5b0
> md_submit_bio+0xd9/0x120
> __submit_bio+0x9d/0x100
> submit_bio_noacct_nocheck+0x1fd/0x470
> submit_bio_noacct+0x4c2/0xbb0
> submit_bio+0x3f/0xf0
> submit_bh_wbc+0x270/0x2a0
> block_read_full_folio+0x37c/0x580
> blkdev_read_folio+0x18/0x20
> filemap_read_folio+0x3f/0x110
> do_read_cache_folio+0x13b/0x2c0
> read_cache_folio+0x42/0x50
> read_part_sector+0x74/0x1c0
> read_lba+0x176/0x2a0
> efi_partition+0x1ce/0xdd0
> bdev_disk_changed+0x2e7/0x6a0
> blkdev_get_whole+0xd2/0x140
> blkdev_get_by_dev.part.0+0x37f/0x570
> blkdev_get_by_dev+0x51/0x60
> disk_scan_partitions+0xad/0xf0
> blkdev_common_ioctl+0x3f3/0xdf0
> blkdev_ioctl+0x1e1/0x450
> __x64_sys_ioctl+0xc0/0x100
> do_syscall_64+0x35/0x80
> .
>
Hi, Logan
在 2022/09/02 9:21, Yu Kuai 写道:
> Hi,
>
> 在 2022/09/02 2:41, Logan Gunthorpe 写道:
>> Hi,
>>
>> On 2022-08-29 07:15, Yu Kuai wrote:
>>> From: Yu Kuai <yukuai3@huawei.com>
>>>
>>> Currently, wait_barrier() will hold 'resync_lock' to read
>>> 'conf->barrier',
>>> and io can't be dispatched until 'barrier' is dropped.
>>>
>>> Since holding the 'barrier' is not common, convert 'resync_lock' to use
>>> seqlock so that holding lock can be avoided in fast path.
>>>
>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>>
>> I've found some lockdep issues starting with this patch in md-next while
>> running mdadm tests (specifically 00raid10 when run about 10 times in a
>> row).
>>
>> I've seen a couple different lock dep errors. The first seems to be
>> reproducible on this patch, then it possibly changes to the second on
>> subsequent patches. Not sure exactly.
>>
>
> Thanks for the test,
>
> I think this is false positive because of the special usage here,
>
> for example, in raise_barrier():
>
> write_seqlock_irq
> spin_lock_irq();
> lock_acquire
> do_write_seqcount_begin
> seqcount_acquire
>
> wait_event_lock_irq_cmd
> spin_unlock_irq -> lock is released while seqcount is still held;
> if another context acquires the lock again, lockdep
> will trigger a warning.
> ...
> spin_lock_irq
>
> write_sequnlock_irq
>
> Functionality should be ok, I'll try to find a way to prevent such
> warning.
Can you try the following patch? I'm running mdadm tests myself and I
didn't see any problems yet.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2f7c8bef6dc2..317bd862f40a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -940,16 +940,16 @@ static void raise_barrier(struct r10conf *conf,
int force)
BUG_ON(force && !conf->barrier);
/* Wait until no block IO is waiting (unless 'force') */
- wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
- conf->resync_lock.lock);
+ wait_event_seqlock_irq(conf->wait_barrier, force ||
!conf->nr_waiting,
+ conf->resync_lock);
/* block any new IO from starting */
WRITE_ONCE(conf->barrier, conf->barrier + 1);
/* Now wait for all pending IO to complete */
- wait_event_lock_irq(conf->wait_barrier,
- !atomic_read(&conf->nr_pending) &&
conf->barrier < RESYNC_DEPTH,
- conf->resync_lock.lock);
+ wait_event_seqlock_irq(conf->wait_barrier,
+ !atomic_read(&conf->nr_pending) &&
+ conf->barrier < RESYNC_DEPTH,
conf->resync_lock);
write_sequnlock_irq(&conf->resync_lock);
}
@@ -1007,7 +1007,7 @@ static bool wait_barrier(struct r10conf *conf,
bool nowait)
ret = false;
} else {
raid10_log(conf->mddev, "wait barrier");
- wait_event_lock_irq(conf->wait_barrier,
+ wait_event_seqlock_irq(conf->wait_barrier,
!conf->barrier ||
(atomic_read(&conf->nr_pending) &&
bio_list &&
@@ -1020,7 +1020,7 @@ static bool wait_barrier(struct r10conf *conf,
bool nowait)
test_bit(MD_RECOVERY_RUNNING,
&conf->mddev->recovery) &&
conf->nr_queued > 0),
- conf->resync_lock.lock);
+ conf->resync_lock);
}
conf->nr_waiting--;
if (!conf->nr_waiting)
@@ -1058,10 +1058,9 @@ static void freeze_array(struct r10conf *conf,
int extra)
conf->array_freeze_pending++;
WRITE_ONCE(conf->barrier, conf->barrier + 1);
conf->nr_waiting++;
- wait_event_lock_irq_cmd(conf->wait_barrier,
+ wait_event_seqlock_irq_cmd(conf->wait_barrier,
atomic_read(&conf->nr_pending) ==
conf->nr_queued+extra,
- conf->resync_lock.lock,
- flush_pending_writes(conf));
+ conf->resync_lock,
flush_pending_writes(conf));
conf->array_freeze_pending--;
write_sequnlock_irq(&conf->resync_lock);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 58cfbf81447c..97d6b378e40c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -977,6 +977,13 @@ extern int do_wait_intr_irq(wait_queue_head_t *,
wait_queue_entry_t *);
schedule();
\
spin_lock_irq(&lock))
+#define __wait_event_seqlock_irq(wq_head, condition, lock, cmd)
\
+ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0,
0, \
+ write_sequnlock_irq(&lock);
\
+ cmd;
\
+ schedule();
\
+ write_seqlock_irq(&lock))
+
/**
* wait_event_lock_irq_cmd - sleep until a condition gets true. The
* condition is checked under the lock. This
@@ -1007,6 +1014,13 @@ do {
\
__wait_event_lock_irq(wq_head, condition, lock, cmd);
\
} while (0)
+#define wait_event_seqlock_irq_cmd(wq_head, condition, lock, cmd)
\
+do {
\
+ if (condition)
\
+ break;
\
+ __wait_event_seqlock_irq(wq_head, condition, lock, cmd);
\
+} while (0)
+
/**
* wait_event_lock_irq - sleep until a condition gets true. The
* condition is checked under the lock. This
@@ -1034,6 +1048,12 @@ do {
\
__wait_event_lock_irq(wq_head, condition, lock, );
\
} while (0)
+#define wait_event_seqlock_irq(wq_head, condition, lock)
\
+do {
\
+ if (condition)
\
+ break;
\
+ __wait_event_seqlock_irq(wq_head, condition, lock, );
\
+} while (0)
#define __wait_event_interruptible_lock_irq(wq_head, condition, lock,
cmd) \
___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,
\
>
> Thanks,
> Kuai
>> I haven't dug into it too deeply, but hopefully it can be fixed easily.
>>
>> Logan
>>
>> --
>>
>>
>> ================================
>> WARNING: inconsistent lock state
>> 6.0.0-rc2-eid-vmlocalyes-dbg-00023-gfd68041d2fd2 #2604 Not tainted
>> --------------------------------
>> inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage.
>> fsck.ext3/1695 [HC0[0]:SC0[0]:HE0:SE1] takes:
>> ffff8881049b0120 (&____s->seqcount#10){+.?.}-{0:0}, at:
>> raid10_read_request+0x21f/0x760
>> (raid10.c:1134)
>>
>> {IN-SOFTIRQ-W} state was registered at:
>> lock_acquire+0x183/0x440
>> lower_barrier+0x5e/0xd0
>> end_sync_request+0x178/0x180
>> end_sync_write+0x193/0x380
>> bio_endio+0x346/0x3a0
>> blk_update_request+0x1eb/0x7c0
>> blk_mq_end_request+0x30/0x50
>> lo_complete_rq+0xb7/0x100
>> blk_complete_reqs+0x77/0x90
>> blk_done_softirq+0x38/0x40
>> __do_softirq+0x10c/0x650
>> run_ksoftirqd+0x48/0x80
>> smpboot_thread_fn+0x302/0x400
>> kthread+0x18c/0x1c0
>> ret_from_fork+0x1f/0x30
>>
>> irq event stamp: 8930
>> hardirqs last enabled at (8929): [<ffffffff96df8351>]
>> _raw_spin_unlock_irqrestore+0x31/0x60
>> hardirqs last disabled at (8930): [<ffffffff96df7fc5>]
>> _raw_spin_lock_irq+0x75/0x90
>> softirqs last enabled at (6768): [<ffffffff9554970e>]
>> __irq_exit_rcu+0xfe/0x150
>> softirqs last disabled at (6757): [<ffffffff9554970e>]
>> __irq_exit_rcu+0xfe/0x150
>>
>> other info that might help us debug this:
>> Possible unsafe locking scenario:
>>
>> CPU0
>> ----
>> lock(&____s->seqcount#10);
>> <Interrupt>
>> lock(&____s->seqcount#10);
>>
>> *** DEADLOCK ***
>>
>> 2 locks held by fsck.ext3/1695:
>> #0: ffff8881007d0930 (mapping.invalidate_lock#2){++++}-{3:3}, at:
>> page_cache_ra_unbounded+0xaf/0x250
>> #1: ffff8881049b0120 (&____s->seqcount#10){+.?.}-{0:0}, at:
>> raid10_read_request+0x21f/0x760
>>
>> stack backtrace:
>> CPU: 0 PID: 1695 Comm: fsck.ext3 Not tainted
>> 6.0.0-rc2-eid-vmlocalyes-dbg-00023-gfd68041d2fd2 #2604
>> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2
>> 04/01/2014
>> Call Trace:
>> <TASK>
>> dump_stack_lvl+0x5a/0x74
>> dump_stack+0x10/0x12
>> print_usage_bug.part.0+0x233/0x246
>> mark_lock.part.0.cold+0x73/0x14f
>> mark_held_locks+0x71/0xa0
>> lockdep_hardirqs_on_prepare+0x158/0x230
>> trace_hardirqs_on+0x34/0x100
>> _raw_spin_unlock_irq+0x28/0x60
>> wait_barrier+0x4a6/0x720
>> raid10.c:1004
>> raid10_read_request+0x21f/0x760
>> raid10_make_request+0x2d6/0x2160
>> md_handle_request+0x3f3/0x5b0
>> md_submit_bio+0xd9/0x120
>> __submit_bio+0x9d/0x100
>> submit_bio_noacct_nocheck+0x1fd/0x470
>> submit_bio_noacct+0x4c2/0xbb0
>> submit_bio+0x3f/0xf0
>> mpage_readahead+0x323/0x3b0
>> blkdev_readahead+0x15/0x20
>> read_pages+0x136/0x7a0
>> page_cache_ra_unbounded+0x18d/0x250
>> page_cache_ra_order+0x2c9/0x400
>> ondemand_readahead+0x320/0x730
>> page_cache_sync_ra+0xa6/0xb0
>> filemap_get_pages+0x1eb/0xc00
>> filemap_read+0x1f1/0x770
>> blkdev_read_iter+0x164/0x310
>> vfs_read+0x467/0x5a0
>> __x64_sys_pread64+0x122/0x160
>> do_syscall_64+0x35/0x80
>> entry_SYSCALL_64_after_hwframe+0x46/0xb0
>>
>> --
>>
>> ======================================================
>> WARNING: possible circular locking dependency detected
>> 6.0.0-rc2-eid-vmlocalyes-dbg-00027-gcd6aa5181bbb #2600 Not tainted
>> ------------------------------------------------------
>> systemd-udevd/292 is trying to acquire lock:
>> ffff88817b644170 (&(&conf->resync_lock)->lock){....}-{2:2}, at:
>> wait_barrier+0x4fe/0x770
>>
>> but task is already holding lock:
>> ffff88817b644120 (&____s->seqcount#11){+.+.}-{0:0}, at:
>> raid10_read_request+0x21f/0x760
>> raid10.c:1140 wait_barrier()
>> raid10.c:1204 regular_request_wait()
>>
>>
>>
>> which lock already depends on the new lock.
>>
>>
>> the existing dependency chain (in reverse order) is:
>>
>> -> #1 (&____s->seqcount#11){+.+.}-{0:0}:
>> raise_barrier+0xe0/0x300
>> raid10.c:940 write_seqlock_irq()
>> raid10_sync_request+0x629/0x4750
>> raid10.c:3689 raise_barrier()
>> md_do_sync.cold+0x8ec/0x1491
>> md_thread+0x19d/0x2d0
>> kthread+0x18c/0x1c0
>> ret_from_fork+0x1f/0x30
>>
>> -> #0 (&(&conf->resync_lock)->lock){....}-{2:2}:
>> __lock_acquire+0x1cb4/0x3170
>> lock_acquire+0x183/0x440
>> _raw_spin_lock_irq+0x4d/0x90
>> wait_barrier+0x4fe/0x770
>> raid10_read_request+0x21f/0x760
>> raid10.c:1140 wait_barrier()
>> raid10.c:1204 regular_request_wait()
>> raid10_make_request+0x2d6/0x2190
>> md_handle_request+0x3f3/0x5b0
>> md_submit_bio+0xd9/0x120
>> __submit_bio+0x9d/0x100
>> submit_bio_noacct_nocheck+0x1fd/0x470
>> submit_bio_noacct+0x4c2/0xbb0
>> submit_bio+0x3f/0xf0
>> submit_bh_wbc+0x270/0x2a0
>> block_read_full_folio+0x37c/0x580
>> blkdev_read_folio+0x18/0x20
>> filemap_read_folio+0x3f/0x110
>> do_read_cache_folio+0x13b/0x2c0
>> read_cache_folio+0x42/0x50
>> read_part_sector+0x74/0x1c0
>> read_lba+0x176/0x2a0
>> efi_partition+0x1ce/0xdd0
>> bdev_disk_changed+0x2e7/0x6a0
>> blkdev_get_whole+0xd2/0x140
>> blkdev_get_by_dev.part.0+0x37f/0x570
>> blkdev_get_by_dev+0x51/0x60
>> disk_scan_partitions+0xad/0xf0
>> blkdev_common_ioctl+0x3f3/0xdf0
>> blkdev_ioctl+0x1e1/0x450
>> __x64_sys_ioctl+0xc0/0x100
>> do_syscall_64+0x35/0x80
>> entry_SYSCALL_64_after_hwframe+0x46/0xb0
>>
>> other info that might help us debug this:
>>
>> Possible unsafe locking scenario:
>>
>> CPU0 CPU1
>> ---- ----
>> lock(&____s->seqcount#11);
>> lock(&(&conf->resync_lock)->lock);
>> lock(&____s->seqcount#11);
>> lock(&(&conf->resync_lock)->lock);
>>
>> *** DEADLOCK ***
>>
>> 2 locks held by systemd-udevd/292:
>> #0: ffff88817a532528 (&disk->open_mutex){+.+.}-{3:3}, at:
>> blkdev_get_by_dev.part.0+0x180/0x570
>> #1: ffff88817b644120 (&____s->seqcount#11){+.+.}-{0:0}, at:
>> raid10_read_request+0x21f/0x760
>>
>> stack backtrace:
>> CPU: 3 PID: 292 Comm: systemd-udevd Not tainted
>> 6.0.0-rc2-eid-vmlocalyes-dbg-00027-gcd6aa5181bbb #2600
>> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2
>> 04/01/2014
>> Call Trace:
>> <TASK>
>> dump_stack_lvl+0x5a/0x74
>> dump_stack+0x10/0x12
>> print_circular_bug.cold+0x146/0x14b
>> check_noncircular+0x1ff/0x250
>> __lock_acquire+0x1cb4/0x3170
>> lock_acquire+0x183/0x440
>> _raw_spin_lock_irq+0x4d/0x90
>> wait_barrier+0x4fe/0x770
>> raid10_read_request+0x21f/0x760
>> raid10_make_request+0x2d6/0x2190
>> md_handle_request+0x3f3/0x5b0
>> md_submit_bio+0xd9/0x120
>> __submit_bio+0x9d/0x100
>> submit_bio_noacct_nocheck+0x1fd/0x470
>> submit_bio_noacct+0x4c2/0xbb0
>> submit_bio+0x3f/0xf0
>> submit_bh_wbc+0x270/0x2a0
>> block_read_full_folio+0x37c/0x580
>> blkdev_read_folio+0x18/0x20
>> filemap_read_folio+0x3f/0x110
>> do_read_cache_folio+0x13b/0x2c0
>> read_cache_folio+0x42/0x50
>> read_part_sector+0x74/0x1c0
>> read_lba+0x176/0x2a0
>> efi_partition+0x1ce/0xdd0
>> bdev_disk_changed+0x2e7/0x6a0
>> blkdev_get_whole+0xd2/0x140
>> blkdev_get_by_dev.part.0+0x37f/0x570
>> blkdev_get_by_dev+0x51/0x60
>> disk_scan_partitions+0xad/0xf0
>> blkdev_common_ioctl+0x3f3/0xdf0
>> blkdev_ioctl+0x1e1/0x450
>> __x64_sys_ioctl+0xc0/0x100
>> do_syscall_64+0x35/0x80
>> .
>>
>
> .
>
On 2022-09-02 02:14, Yu Kuai wrote:
> Can you try the following patch? I'm running mdadm tests myself and I
> didn't see any problems yet.
Yes, that patch seems to fix the issue.
However, may I suggest we do this without trying to introduce new
helpers into wait.h? I suspect that could result in a fair amount of
bike shedding and delay. wait_event_cmd() is often used in situations
where a specific lock type doesn't have a helper.
My stab at it is in a diff below which also fixes the bug.
I'd also recommend somebody clean up that nasty condition in
wait_barrier(). Put it into an appropriately named function
with some comments. As is, it is pretty much unreadable.
Logan
--
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0e3229ee1ebc..ae297bc870bd 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -934,22 +934,26 @@ static void flush_pending_writes(struct r10conf *conf)
* lower_barrier when the particular background IO completes.
*/
+#define wait_event_barrier_cmd(conf, cond, cmd) \
+ wait_event_cmd((conf)->wait_barrier, cond, \
+ write_sequnlock_irq(&(conf)->resync_lock); cmd, \
+ write_seqlock_irq(&(conf)->resync_lock))
+#define wait_event_barrier(conf, cond) wait_event_barrier_cmd(conf, cond, )
+
static void raise_barrier(struct r10conf *conf, int force)
{
write_seqlock_irq(&conf->resync_lock);
BUG_ON(force && !conf->barrier);
/* Wait until no block IO is waiting (unless 'force') */
- wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
- conf->resync_lock.lock);
+ wait_event_barrier(conf, force || !conf->nr_waiting);
/* block any new IO from starting */
WRITE_ONCE(conf->barrier, conf->barrier + 1);
/* Now wait for all pending IO to complete */
- wait_event_lock_irq(conf->wait_barrier,
- !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
- conf->resync_lock.lock);
+ wait_event_barrier(conf, !atomic_read(&conf->nr_pending) &&
+ conf->barrier < RESYNC_DEPTH);
write_sequnlock_irq(&conf->resync_lock);
}
@@ -1007,20 +1011,19 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
ret = false;
} else {
raid10_log(conf->mddev, "wait barrier");
- wait_event_lock_irq(conf->wait_barrier,
- !conf->barrier ||
- (atomic_read(&conf->nr_pending) &&
- bio_list &&
- (!bio_list_empty(&bio_list[0]) ||
- !bio_list_empty(&bio_list[1]))) ||
+ wait_event_barrier(conf,
+ !conf->barrier ||
+ (atomic_read(&conf->nr_pending) &&
+ bio_list &&
+ (!bio_list_empty(&bio_list[0]) ||
+ !bio_list_empty(&bio_list[1]))) ||
/* move on if recovery thread is
* blocked by us
*/
- (conf->mddev->thread->tsk == current &&
- test_bit(MD_RECOVERY_RUNNING,
- &conf->mddev->recovery) &&
- conf->nr_queued > 0),
- conf->resync_lock.lock);
+ (conf->mddev->thread->tsk == current &&
+ test_bit(MD_RECOVERY_RUNNING,
+ &conf->mddev->recovery) &&
+ conf->nr_queued > 0));
}
conf->nr_waiting--;
if (!conf->nr_waiting)
@@ -1058,10 +1061,9 @@ static void freeze_array(struct r10conf *conf, int extra)
conf->array_freeze_pending++;
WRITE_ONCE(conf->barrier, conf->barrier + 1);
conf->nr_waiting++;
- wait_event_lock_irq_cmd(conf->wait_barrier,
- atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
- conf->resync_lock.lock,
- flush_pending_writes(conf));
+ wait_event_barrier_cmd(conf,
+ atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
+ flush_pending_writes(conf));
conf->array_freeze_pending--;
write_sequnlock_irq(&conf->resync_lock);
Hi,
在 2022/09/03 1:03, Logan Gunthorpe 写道:
>
>
>
> On 2022-09-02 02:14, Yu Kuai wrote:
>> Can you try the following patch? I'm running mdadm tests myself and I
>> didn't see any problems yet.
>
> Yes, that patch seems to fix the issue.
>
> However, may I suggest we do this without trying to introduce new
> helpers into wait.h? I suspect that could result in a fair amount of
> bike shedding and delay. wait_event_cmd() is often used in situations
> where a specific lock type doesn't have a helper.
Yes, that sounds good.
>
> My stab at it is in a diff below which also fixes the bug.
>
> I'd also recommend somebody clean up that nasty condition in
> wait_barrier(). Put it into an appropriately named function
> with some comments. As is, it is pretty much unreadable.
Now we're at it, I can take a look.
Thanks,
Kuai
>
> Logan
>
> --
>
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 0e3229ee1ebc..ae297bc870bd 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -934,22 +934,26 @@ static void flush_pending_writes(struct r10conf *conf)
> * lower_barrier when the particular background IO completes.
> */
>
> +#define wait_event_barrier_cmd(conf, cond, cmd) \
> + wait_event_cmd((conf)->wait_barrier, cond, \
> + write_sequnlock_irq(&(conf)->resync_lock); cmd, \
> + write_seqlock_irq(&(conf)->resync_lock))
> +#define wait_event_barrier(conf, cond) wait_event_barrier_cmd(conf, cond, )
> +
> static void raise_barrier(struct r10conf *conf, int force)
> {
> write_seqlock_irq(&conf->resync_lock);
> BUG_ON(force && !conf->barrier);
>
> /* Wait until no block IO is waiting (unless 'force') */
> - wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
> - conf->resync_lock.lock);
> + wait_event_barrier(conf, force || !conf->nr_waiting);
>
> /* block any new IO from starting */
> WRITE_ONCE(conf->barrier, conf->barrier + 1);
>
> /* Now wait for all pending IO to complete */
> - wait_event_lock_irq(conf->wait_barrier,
> - !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
> - conf->resync_lock.lock);
> + wait_event_barrier(conf, !atomic_read(&conf->nr_pending) &&
> + conf->barrier < RESYNC_DEPTH);
>
> write_sequnlock_irq(&conf->resync_lock);
> }
> @@ -1007,20 +1011,19 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
> ret = false;
> } else {
> raid10_log(conf->mddev, "wait barrier");
> - wait_event_lock_irq(conf->wait_barrier,
> - !conf->barrier ||
> - (atomic_read(&conf->nr_pending) &&
> - bio_list &&
> - (!bio_list_empty(&bio_list[0]) ||
> - !bio_list_empty(&bio_list[1]))) ||
> + wait_event_barrier(conf,
> + !conf->barrier ||
> + (atomic_read(&conf->nr_pending) &&
> + bio_list &&
> + (!bio_list_empty(&bio_list[0]) ||
> + !bio_list_empty(&bio_list[1]))) ||
> /* move on if recovery thread is
> * blocked by us
> */
> - (conf->mddev->thread->tsk == current &&
> - test_bit(MD_RECOVERY_RUNNING,
> - &conf->mddev->recovery) &&
> - conf->nr_queued > 0),
> - conf->resync_lock.lock);
> + (conf->mddev->thread->tsk == current &&
> + test_bit(MD_RECOVERY_RUNNING,
> + &conf->mddev->recovery) &&
> + conf->nr_queued > 0));
> }
> conf->nr_waiting--;
> if (!conf->nr_waiting)
> @@ -1058,10 +1061,9 @@ static void freeze_array(struct r10conf *conf, int extra)
> conf->array_freeze_pending++;
> WRITE_ONCE(conf->barrier, conf->barrier + 1);
> conf->nr_waiting++;
> - wait_event_lock_irq_cmd(conf->wait_barrier,
> - atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
> - conf->resync_lock.lock,
> - flush_pending_writes(conf));
> + wait_event_barrier_cmd(conf,
> + atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
> + flush_pending_writes(conf));
>
> conf->array_freeze_pending--;
> write_sequnlock_irq(&conf->resync_lock);
> .
>
On 9/2/22 2:41 AM, Logan Gunthorpe wrote: > Hi, > > On 2022-08-29 07:15, Yu Kuai wrote: >> From: Yu Kuai <yukuai3@huawei.com> >> >> Currently, wait_barrier() will hold 'resync_lock' to read 'conf->barrier', >> and io can't be dispatched until 'barrier' is dropped. >> >> Since holding the 'barrier' is not common, convert 'resync_lock' to use >> seqlock so that holding lock can be avoided in fast path. >> >> Signed-off-by: Yu Kuai <yukuai3@huawei.com> > I've found some lockdep issues starting with this patch in md-next while > running mdadm tests (specifically 00raid10 when run about 10 times in a > row). > > I've seen a couple different lock dep errors. The first seems to be > reproducible on this patch, then it possibly changes to the second on > subsequent patches. Not sure exactly. That's why I said "try mdadm test suites too to avoid regression." ... Guoqing
On 2022-09-01 18:49, Guoqing Jiang wrote: > > > On 9/2/22 2:41 AM, Logan Gunthorpe wrote: >> Hi, >> >> On 2022-08-29 07:15, Yu Kuai wrote: >>> From: Yu Kuai <yukuai3@huawei.com> >>> >>> Currently, wait_barrier() will hold 'resync_lock' to read >>> 'conf->barrier', >>> and io can't be dispatched until 'barrier' is dropped. >>> >>> Since holding the 'barrier' is not common, convert 'resync_lock' to use >>> seqlock so that holding lock can be avoided in fast path. >>> >>> Signed-off-by: Yu Kuai <yukuai3@huawei.com> >> I've found some lockdep issues starting with this patch in md-next while >> running mdadm tests (specifically 00raid10 when run about 10 times in a >> row). >> >> I've seen a couple different lock dep errors. The first seems to be >> reproducible on this patch, then it possibly changes to the second on >> subsequent patches. Not sure exactly. > > That's why I said "try mdadm test suites too to avoid regression." ... You may have to run it multiple times, a single run tends not to catch all errors. I had to loop the noted test 10 times to be sure I hit this every time when I did the simple bisect. And ensure that all the debug options are on when you run it (take a look at the Kernel Hacking section in menuconfig). You won't hit this bug without at least CONFIG_PROVE_LOCKING=y. Logan
On 9/2/22 8:56 AM, Logan Gunthorpe wrote: > > On 2022-09-01 18:49, Guoqing Jiang wrote: >> >> On 9/2/22 2:41 AM, Logan Gunthorpe wrote: >>> Hi, >>> >>> On 2022-08-29 07:15, Yu Kuai wrote: >>>> From: Yu Kuai <yukuai3@huawei.com> >>>> >>>> Currently, wait_barrier() will hold 'resync_lock' to read >>>> 'conf->barrier', >>>> and io can't be dispatched until 'barrier' is dropped. >>>> >>>> Since holding the 'barrier' is not common, convert 'resync_lock' to use >>>> seqlock so that holding lock can be avoided in fast path. >>>> >>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com> >>> I've found some lockdep issues starting with this patch in md-next while >>> running mdadm tests (specifically 00raid10 when run about 10 times in a >>> row). >>> >>> I've seen a couple different lock dep errors. The first seems to be >>> reproducible on this patch, then it possibly changes to the second on >>> subsequent patches. Not sure exactly. >> That's why I said "try mdadm test suites too to avoid regression." ... > You may have to run it multiple times, a single run tends not to catch > all errors. I had to loop the noted test 10 times to be sure I hit this > every time when I did the simple bisect. > > And ensure that all the debug options are on when you run it (take a > look at the Kernel Hacking section in menuconfig). You won't hit this > bug without at least CONFIG_PROVE_LOCKING=y. Yes, we definitely need to enable the option to test change for locking stuffs. Thanks, Guoqing
© 2016 - 2026 Red Hat, Inc.