block/io.c | 10 +++++++--- include/block/block.h | 21 +++++++++++++-------- 2 files changed, 20 insertions(+), 11 deletions(-)
During block job completion, nothing is preventing
block_job_defer_to_main_loop_bh from being called in a nested
aio_poll(), which is a trouble, such as in this code path:
qmp_block_commit
commit_active_start
bdrv_reopen
bdrv_reopen_multiple
bdrv_reopen_prepare
bdrv_flush
aio_poll
aio_bh_poll
aio_bh_call
block_job_defer_to_main_loop_bh
stream_complete
bdrv_reopen
block_job_defer_to_main_loop_bh is the last step of the stream job,
which should have been "paused" by the bdrv_drained_begin/end in
bdrv_reopen_multiple, but it is not done because it's in the form of a
main loop BH.
Similar to why block jobs should be paused between drained_begin and
drained_end, BHs they schedule must be excluded as well. To achieve
this, this patch forces draining the BH in BDRV_POLL_WHILE.
Also because the BH in question can do bdrv_unref and child replacing,
protect @bs carefully to avoid use-after-free.
As a side effect this fixes a hang in block_job_detach_aio_context
during system_reset when a block job is ready:
#0 0x0000555555aa79f3 in bdrv_drain_recurse
#1 0x0000555555aa825d in bdrv_drained_begin
#2 0x0000555555aa8449 in bdrv_drain
#3 0x0000555555a9c356 in blk_drain
#4 0x0000555555aa3cfd in mirror_drain
#5 0x0000555555a66e11 in block_job_detach_aio_context
#6 0x0000555555a62f4d in bdrv_detach_aio_context
#7 0x0000555555a63116 in bdrv_set_aio_context
#8 0x0000555555a9d326 in blk_set_aio_context
#9 0x00005555557e38da in virtio_blk_data_plane_stop
#10 0x00005555559f9d5f in virtio_bus_stop_ioeventfd
#11 0x00005555559fa49b in virtio_bus_stop_ioeventfd
#12 0x00005555559f6a18 in virtio_pci_stop_ioeventfd
#13 0x00005555559f6a18 in virtio_pci_reset
#14 0x00005555559139a9 in qdev_reset_one
#15 0x0000555555916738 in qbus_walk_children
#16 0x0000555555913318 in qdev_walk_children
#17 0x0000555555916738 in qbus_walk_children
#18 0x00005555559168ca in qemu_devices_reset
#19 0x000055555581fcbb in pc_machine_reset
#20 0x00005555558a4d96 in qemu_system_reset
#21 0x000055555577157a in main_loop_should_exit
#22 0x000055555577157a in main_loop
#23 0x000055555577157a in main
The rationale is that the loop in block_job_detach_aio_context cannot
make any progress in pausing/completing the job, because bs->in_flight
is 0, so bdrv_drain doesn't process the block_job_defer_to_main_loop
BH. With this patch, it does.
Reported-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
---
v2: Do the BH poll in BDRV_POLL_WHILE to cover bdrv_drain_all_begin as
well. [Kevin]
---
block/io.c | 10 +++++++---
include/block/block.h | 21 +++++++++++++--------
2 files changed, 20 insertions(+), 11 deletions(-)
diff --git a/block/io.c b/block/io.c
index 8706bfa..a472157 100644
--- a/block/io.c
+++ b/block/io.c
@@ -158,7 +158,7 @@ bool bdrv_requests_pending(BlockDriverState *bs)
static bool bdrv_drain_recurse(BlockDriverState *bs)
{
- BdrvChild *child;
+ BdrvChild *child, *tmp;
bool waited;
waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
@@ -167,8 +167,12 @@ static bool bdrv_drain_recurse(BlockDriverState *bs)
bs->drv->bdrv_drain(bs);
}
- QLIST_FOREACH(child, &bs->children, next) {
- waited |= bdrv_drain_recurse(child->bs);
+ QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
+ BlockDriverState *bs = child->bs;
+ assert(bs->refcnt > 0);
+ bdrv_ref(bs);
+ waited |= bdrv_drain_recurse(bs);
+ bdrv_unref(bs);
}
return waited;
diff --git a/include/block/block.h b/include/block/block.h
index 97d4330..3f05e71 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -381,12 +381,13 @@ void bdrv_drain_all(void);
#define BDRV_POLL_WHILE(bs, cond) ({ \
bool waited_ = false; \
+ bool busy_ = true; \
BlockDriverState *bs_ = (bs); \
AioContext *ctx_ = bdrv_get_aio_context(bs_); \
if (aio_context_in_iothread(ctx_)) { \
- while ((cond)) { \
- aio_poll(ctx_, true); \
- waited_ = true; \
+ while ((cond) || busy_) { \
+ busy_ = aio_poll(ctx_, (cond)); \
+ waited_ |= !!(cond); \
} \
} else { \
assert(qemu_get_current_aio_context() == \
@@ -398,11 +399,15 @@ void bdrv_drain_all(void);
*/ \
assert(!bs_->wakeup); \
bs_->wakeup = true; \
- while ((cond)) { \
- aio_context_release(ctx_); \
- aio_poll(qemu_get_aio_context(), true); \
- aio_context_acquire(ctx_); \
- waited_ = true; \
+ while (busy_) { \
+ if ((cond)) { \
+ waited_ = busy_ = true; \
+ aio_context_release(ctx_); \
+ aio_poll(qemu_get_aio_context(), true); \
+ aio_context_acquire(ctx_); \
+ } else { \
+ busy_ = aio_poll(ctx_, false); \
+ } \
} \
bs_->wakeup = false; \
} \
--
2.9.3
On Fri, 04/14 16:02, Fam Zheng wrote: > @@ -167,8 +167,12 @@ static bool bdrv_drain_recurse(BlockDriverState *bs) > bs->drv->bdrv_drain(bs); > } > > - QLIST_FOREACH(child, &bs->children, next) { > - waited |= bdrv_drain_recurse(child->bs); > + QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) { > + BlockDriverState *bs = child->bs; Poor name, @bs is the function parameter already, let's use something like @bs1. Fam
On Fri, Apr 14, 2017 at 9:02 AM, Fam Zheng <famz@redhat.com> wrote: > During block job completion, nothing is preventing > block_job_defer_to_main_loop_bh from being called in a nested > aio_poll(), which is a trouble, such as in this code path: > > qmp_block_commit > commit_active_start > bdrv_reopen > bdrv_reopen_multiple > bdrv_reopen_prepare > bdrv_flush > aio_poll > aio_bh_poll > aio_bh_call > block_job_defer_to_main_loop_bh > stream_complete > bdrv_reopen > > block_job_defer_to_main_loop_bh is the last step of the stream job, > which should have been "paused" by the bdrv_drained_begin/end in > bdrv_reopen_multiple, but it is not done because it's in the form of a > main loop BH. > > Similar to why block jobs should be paused between drained_begin and > drained_end, BHs they schedule must be excluded as well. To achieve > this, this patch forces draining the BH in BDRV_POLL_WHILE. > > Also because the BH in question can do bdrv_unref and child replacing, > protect @bs carefully to avoid use-after-free. > > As a side effect this fixes a hang in block_job_detach_aio_context > during system_reset when a block job is ready: > > #0 0x0000555555aa79f3 in bdrv_drain_recurse > #1 0x0000555555aa825d in bdrv_drained_begin > #2 0x0000555555aa8449 in bdrv_drain > #3 0x0000555555a9c356 in blk_drain > #4 0x0000555555aa3cfd in mirror_drain > #5 0x0000555555a66e11 in block_job_detach_aio_context > #6 0x0000555555a62f4d in bdrv_detach_aio_context > #7 0x0000555555a63116 in bdrv_set_aio_context > #8 0x0000555555a9d326 in blk_set_aio_context > #9 0x00005555557e38da in virtio_blk_data_plane_stop > #10 0x00005555559f9d5f in virtio_bus_stop_ioeventfd > #11 0x00005555559fa49b in virtio_bus_stop_ioeventfd > #12 0x00005555559f6a18 in virtio_pci_stop_ioeventfd > #13 0x00005555559f6a18 in virtio_pci_reset > #14 0x00005555559139a9 in qdev_reset_one > #15 0x0000555555916738 in qbus_walk_children > #16 0x0000555555913318 in qdev_walk_children > #17 0x0000555555916738 in qbus_walk_children > #18 0x00005555559168ca in qemu_devices_reset > #19 0x000055555581fcbb in pc_machine_reset > #20 0x00005555558a4d96 in qemu_system_reset > #21 0x000055555577157a in main_loop_should_exit > #22 0x000055555577157a in main_loop > #23 0x000055555577157a in main > > The rationale is that the loop in block_job_detach_aio_context cannot > make any progress in pausing/completing the job, because bs->in_flight > is 0, so bdrv_drain doesn't process the block_job_defer_to_main_loop > BH. With this patch, it does. > > Reported-by: Jeff Cody <jcody@redhat.com> > Signed-off-by: Fam Zheng <famz@redhat.com> > > --- > > v2: Do the BH poll in BDRV_POLL_WHILE to cover bdrv_drain_all_begin as > well. [Kevin] > --- > block/io.c | 10 +++++++--- > include/block/block.h | 21 +++++++++++++-------- > 2 files changed, 20 insertions(+), 11 deletions(-) Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> > > diff --git a/block/io.c b/block/io.c > index 8706bfa..a472157 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -158,7 +158,7 @@ bool bdrv_requests_pending(BlockDriverState *bs) > > static bool bdrv_drain_recurse(BlockDriverState *bs) > { > - BdrvChild *child; > + BdrvChild *child, *tmp; > bool waited; > > waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0); > @@ -167,8 +167,12 @@ static bool bdrv_drain_recurse(BlockDriverState *bs) > bs->drv->bdrv_drain(bs); > } > > - QLIST_FOREACH(child, &bs->children, next) { > - waited |= bdrv_drain_recurse(child->bs); > + QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) { > + BlockDriverState *bs = child->bs; > + assert(bs->refcnt > 0); > + bdrv_ref(bs); > + waited |= bdrv_drain_recurse(bs); > + bdrv_unref(bs); > } There are other loops over bs->children in the block layer code. If a they indirectly call aio_poll() then the child bs could disappear. They should probably also take areference. That said, I wonder if this really solves the problem of graph changes during bs->children traversal. We now have a reference but are iterating over stale data. This is a separate issue and doesn't need to hold up this patch. > > return waited; > diff --git a/include/block/block.h b/include/block/block.h > index 97d4330..3f05e71 100644 > --- a/include/block/block.h > +++ b/include/block/block.h > @@ -381,12 +381,13 @@ void bdrv_drain_all(void); > > #define BDRV_POLL_WHILE(bs, cond) ({ \ > bool waited_ = false; \ > + bool busy_ = true; \ > BlockDriverState *bs_ = (bs); \ > AioContext *ctx_ = bdrv_get_aio_context(bs_); \ > if (aio_context_in_iothread(ctx_)) { \ > - while ((cond)) { \ > - aio_poll(ctx_, true); \ > - waited_ = true; \ > + while ((cond) || busy_) { \ > + busy_ = aio_poll(ctx_, (cond)); \ > + waited_ |= !!(cond); \ > } \ > } else { \ > assert(qemu_get_current_aio_context() == \ > @@ -398,11 +399,15 @@ void bdrv_drain_all(void); > */ \ > assert(!bs_->wakeup); \ > bs_->wakeup = true; \ > - while ((cond)) { \ > - aio_context_release(ctx_); \ > - aio_poll(qemu_get_aio_context(), true); \ > - aio_context_acquire(ctx_); \ > - waited_ = true; \ > + while (busy_) { \ > + if ((cond)) { \ > + waited_ = busy_ = true; \ > + aio_context_release(ctx_); \ > + aio_poll(qemu_get_aio_context(), true); \ > + aio_context_acquire(ctx_); \ > + } else { \ > + busy_ = aio_poll(ctx_, false); \ > + } \ > } \ > bs_->wakeup = false; \ > } \ > -- > 2.9.3 > >
On Fri, Apr 14, 2017 at 9:02 AM, Fam Zheng <famz@redhat.com> wrote: > @@ -398,11 +399,15 @@ void bdrv_drain_all(void); > */ \ > assert(!bs_->wakeup); \ > bs_->wakeup = true; \ > - while ((cond)) { \ > - aio_context_release(ctx_); \ > - aio_poll(qemu_get_aio_context(), true); \ > - aio_context_acquire(ctx_); \ > - waited_ = true; \ > + while (busy_) { \ > + if ((cond)) { \ > + waited_ = busy_ = true; \ > + aio_context_release(ctx_); \ > + aio_poll(qemu_get_aio_context(), true); \ > + aio_context_acquire(ctx_); \ > + } else { \ > + busy_ = aio_poll(ctx_, false); \ > + } \ Wait, I'm confused. The current thread is not in the BDS AioContext. We're not allowed to call aio_poll(ctx_, false). Did you mean aio_poll(qemu_get_aio_context(), false) in order to process defer to main loop BHs? Stefan
On 14/04/2017 16:51, Stefan Hajnoczi wrote: > On Fri, Apr 14, 2017 at 9:02 AM, Fam Zheng <famz@redhat.com> wrote: >> @@ -398,11 +399,15 @@ void bdrv_drain_all(void); >> */ \ >> assert(!bs_->wakeup); \ >> bs_->wakeup = true; \ >> - while ((cond)) { \ >> - aio_context_release(ctx_); \ >> - aio_poll(qemu_get_aio_context(), true); \ >> - aio_context_acquire(ctx_); \ >> - waited_ = true; \ >> + while (busy_) { \ >> + if ((cond)) { \ >> + waited_ = busy_ = true; \ >> + aio_context_release(ctx_); \ >> + aio_poll(qemu_get_aio_context(), true); \ >> + aio_context_acquire(ctx_); \ >> + } else { \ >> + busy_ = aio_poll(ctx_, false); \ >> + } \ > > Wait, I'm confused. The current thread is not in the BDS AioContext. > We're not allowed to call aio_poll(ctx_, false). It's pretty ugly indeed. Strictly from a thread-safety point of view, everything that aio_poll calls will acquire the AioContext, so that is safe and in fact the release/acquire pair can beeven hoisted outside the "if". If we did that for blocking=true in both I/O and main thread, then that would be racy. This is the scenario mentioned in the commit message for c9d1a56, "block: only call aio_poll on the current thread's AioContext", 2016-10-28). If only one thread has blocking=true, it's subject to races too. In this case, the I/O thread may fail to be woken by iothread_stop's aio_notify. However, by the time iothread_stop is called there should be no BlockDriverStates (and thus no BDRV_POLL_WHILE running the above code) for the I/O thread's AioContext. The root cause of the bug is simple: due to the main thread having the I/O thread's AioContext, the main thread is not letting the I/O thread run. This is what RFifoLock and the contention callbacks were designed to fix, though they had other issues related to recursive locking (if you acquired an AioContext twice, the contention callback failed to release it). The right way to fix it is just not to hold _any_ lock across BDRV_POLL_WHILE. This means getting rid of aio_context_acquire/release, and being somewhat careful about bdrv_drained_begin, but overall it's not hard. But Fam's patch seems ugly but safe, making it the right thing to do for the 2.9 branch---possibly with a FIXME comment explaining the above. bdrv_coroutine_enter can be removed too once BDS can do fine-grained locking. So, the ramifications of the current partial conversion are pretty complex (though many older QEMU releases had other similar dataplane+blockjob bugs due to the above recursive locking issue), but it looks like it is more or less manageable to fix them either now or in a quick 2.9.1 release a few weeks after 2.9.0. (Fam and I also tried another way to fix it today, but it led to deadlocks not related at all to the partial conversion, so it seemed like a dead end). Regarding the other code path mentioned in the cover letter: >> >> qmp_block_commit >> commit_active_start >> bdrv_reopen >> bdrv_reopen_multiple >> bdrv_reopen_prepare >> bdrv_flush >> aio_poll >> aio_bh_poll >> aio_bh_call >> block_job_defer_to_main_loop_bh >> stream_complete >> bdrv_reopen it's possible that it's just a missing bdrv_ref/unref pair, respectively in bdrv_reopen_queue_child and bdrv_reopen_multiple, so not related to this patch. We didn't test adding the ref/unref though. Thanks, Paolo > Did you mean aio_poll(qemu_get_aio_context(), false) in order to > process defer to main loop BHs?
On Sat, Apr 15, 2017 at 01:10:02AM +0800, Paolo Bonzini wrote: > > > On 14/04/2017 16:51, Stefan Hajnoczi wrote: > > On Fri, Apr 14, 2017 at 9:02 AM, Fam Zheng <famz@redhat.com> wrote: > >> @@ -398,11 +399,15 @@ void bdrv_drain_all(void); > >> */ \ > >> assert(!bs_->wakeup); \ > >> bs_->wakeup = true; \ > >> - while ((cond)) { \ > >> - aio_context_release(ctx_); \ > >> - aio_poll(qemu_get_aio_context(), true); \ > >> - aio_context_acquire(ctx_); \ > >> - waited_ = true; \ > >> + while (busy_) { \ > >> + if ((cond)) { \ > >> + waited_ = busy_ = true; \ > >> + aio_context_release(ctx_); \ > >> + aio_poll(qemu_get_aio_context(), true); \ > >> + aio_context_acquire(ctx_); \ > >> + } else { \ > >> + busy_ = aio_poll(ctx_, false); \ > >> + } \ > > > > Wait, I'm confused. The current thread is not in the BDS AioContext. > > We're not allowed to call aio_poll(ctx_, false). > > It's pretty ugly indeed. Strictly from a thread-safety point of view, > everything that aio_poll calls will acquire the AioContext, so that is > safe and in fact the release/acquire pair can beeven hoisted outside > the "if". > > If we did that for blocking=true in both I/O and main thread, then that > would be racy. This is the scenario mentioned in the commit message for > c9d1a56, "block: only call aio_poll on the current thread's AioContext", > 2016-10-28). > > If only one thread has blocking=true, it's subject to races too. In > this case, the I/O thread may fail to be woken by iothread_stop's > aio_notify. However, by the time iothread_stop is called there should > be no BlockDriverStates (and thus no BDRV_POLL_WHILE running the above > code) for the I/O thread's AioContext. The scenario I have in mind is: BDRV_POLL_WHILE() is called from the IOThread and the main loop also invokes BDRV_POLL_WHILE(). The IOThread is blocked in aio_poll(ctx, true). The main loop calls aio_poll(ctx, false) and can therefore steal the event/completion condition. I *think* ppoll() will return in both threads and they will race to invoke handlers, but I'm not 100% sure that two BDRV_POLL_WHILE() calls in parallel are safe. What do you think? Stefan
On Sun, 04/16 10:37, Stefan Hajnoczi wrote: > On Sat, Apr 15, 2017 at 01:10:02AM +0800, Paolo Bonzini wrote: > > > > > > On 14/04/2017 16:51, Stefan Hajnoczi wrote: > > > On Fri, Apr 14, 2017 at 9:02 AM, Fam Zheng <famz@redhat.com> wrote: > > >> @@ -398,11 +399,15 @@ void bdrv_drain_all(void); > > >> */ \ > > >> assert(!bs_->wakeup); \ > > >> bs_->wakeup = true; \ > > >> - while ((cond)) { \ > > >> - aio_context_release(ctx_); \ > > >> - aio_poll(qemu_get_aio_context(), true); \ > > >> - aio_context_acquire(ctx_); \ > > >> - waited_ = true; \ > > >> + while (busy_) { \ > > >> + if ((cond)) { \ > > >> + waited_ = busy_ = true; \ > > >> + aio_context_release(ctx_); \ > > >> + aio_poll(qemu_get_aio_context(), true); \ > > >> + aio_context_acquire(ctx_); \ > > >> + } else { \ > > >> + busy_ = aio_poll(ctx_, false); \ > > >> + } \ > > > > > > Wait, I'm confused. The current thread is not in the BDS AioContext. > > > We're not allowed to call aio_poll(ctx_, false). > > > > It's pretty ugly indeed. Strictly from a thread-safety point of view, > > everything that aio_poll calls will acquire the AioContext, so that is > > safe and in fact the release/acquire pair can beeven hoisted outside > > the "if". > > > > If we did that for blocking=true in both I/O and main thread, then that > > would be racy. This is the scenario mentioned in the commit message for > > c9d1a56, "block: only call aio_poll on the current thread's AioContext", > > 2016-10-28). > > > > If only one thread has blocking=true, it's subject to races too. In > > this case, the I/O thread may fail to be woken by iothread_stop's > > aio_notify. However, by the time iothread_stop is called there should > > be no BlockDriverStates (and thus no BDRV_POLL_WHILE running the above > > code) for the I/O thread's AioContext. > > The scenario I have in mind is: > > BDRV_POLL_WHILE() is called from the IOThread and the main loop also > invokes BDRV_POLL_WHILE(). > > The IOThread is blocked in aio_poll(ctx, true). > > The main loop calls aio_poll(ctx, false) and can therefore steal the > event/completion condition. > > I *think* ppoll() will return in both threads and they will race to > invoke handlers, but I'm not 100% sure that two BDRV_POLL_WHILE() calls > in parallel are safe. > > What do you think? BDRV_POLL_WHILE in both IOThread and main loop has aio_context_acquire(ctx) around it; in the branch where main loop calls aio_poll(ctx, false), there is also no aio_context_release(ctx). So I thinki it is protected by the AioContext lock, and is safe. I tested your suggested "aio_poll(qemu_get_aio_context(), false)", but it doesn't work. The new hunk is: diff --git a/include/block/block.h b/include/block/block.h index 97d4330..0e77522 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -381,12 +381,13 @@ void bdrv_drain_all(void); #define BDRV_POLL_WHILE(bs, cond) ({ \ bool waited_ = false; \ + bool busy_ = true; \ BlockDriverState *bs_ = (bs); \ AioContext *ctx_ = bdrv_get_aio_context(bs_); \ if (aio_context_in_iothread(ctx_)) { \ - while ((cond)) { \ - aio_poll(ctx_, true); \ - waited_ = true; \ + while ((cond) || busy_) { \ + busy_ = aio_poll(ctx_, (cond)); \ + waited_ |= !!(cond); \ } \ } else { \ assert(qemu_get_current_aio_context() == \ @@ -398,11 +399,12 @@ void bdrv_drain_all(void); */ \ assert(!bs_->wakeup); \ bs_->wakeup = true; \ - while ((cond)) { \ - aio_context_release(ctx_); \ - aio_poll(qemu_get_aio_context(), true); \ - aio_context_acquire(ctx_); \ - waited_ = true; \ + while ((cond) || busy_) { \ + aio_context_release(ctx_); \ + busy_ = aio_poll(qemu_get_aio_context(), \ + (cond)); \ + aio_context_acquire(ctx_); \ + waited_ |= !!(cond); \ } \ bs_->wakeup = false; \ } \ I get a coroutine re-enter crash (with the same drive-mirror -> ready -> reset reproducer), which I don't fully understand yet: #0 0x00007f09fdec091f in raise () at /lib64/libc.so.6 #1 0x00007f09fdec251a in abort () at /lib64/libc.so.6 #2 0x0000563e2017db9b in qemu_aio_coroutine_enter (ctx=0x563e21f52020, co=0x563e22cf1840) at /stor/work/qemu/util/qemu-coroutine.c:114 #3 0x0000563e2015ee82 in aio_co_enter (ctx=0x563e21f52020, co=0x563e22cf1840) at /stor/work/qemu/util/async.c:472 #4 0x0000563e2007241a in bdrv_coroutine_enter (bs=0x563e21fde470, co=0x563e22cf1840) at /stor/work/qemu/block.c:4333 #5 0x0000563e200745e6 in block_job_enter (job=0x563e22e76000) at /stor/work/qemu/blockjob.c:535 #6 0x0000563e20074554 in block_job_resume (job=0x563e22e76000) at /stor/work/qemu/blockjob.c:521 #7 0x0000563e20073372 in block_job_drained_end (opaque=0x563e22e76000) at /stor/work/qemu/blockjob.c:80 #8 0x0000563e200c0578 in blk_root_drained_end (child=0x563e21f9c8f0) at /stor/work/qemu/block/block-backend.c:1945 #9 0x0000563e200ce7af in bdrv_parent_drained_end (bs=0x563e21fde470) at /stor/work/qemu/block/io.c:64 #10 0x0000563e200cefe3 in bdrv_drained_end (bs=0x563e21fde470) at /stor/work/qemu/block/io.c:245 #11 0x0000563e200cf06f in bdrv_drain (bs=0x563e21fde470) at /stor/work/qemu/block/io.c:270 #12 0x0000563e200bf485 in blk_drain (blk=0x563e21f8eec0) at /stor/work/qemu/block/block-backend.c:1383 #13 0x0000563e1fd069aa in virtio_blk_reset (vdev=0x563e23afd950) at /stor/work/qemu/hw/block/virtio-blk.c:708 #14 0x0000563e1fd4c2fd in virtio_reset (opaque=0x563e23afd950) at /stor/work/qemu/hw/virtio/virtio.c:1200 #15 0x0000563e1ffdea74 in virtio_bus_reset (bus=0x563e23afd8d8) at /stor/work/qemu/hw/virtio/virtio-bus.c:96 #16 0x0000563e1ffdc609 in virtio_pci_reset (qdev=0x563e23af5440) at /stor/work/qemu/hw/virtio/virtio-pci.c:1873 #17 0x0000563e1fe994e9 in device_reset (dev=0x563e23af5440) at /stor/work/qemu/hw/core/qdev.c:1169 #18 0x0000563e1fe9735b in qdev_reset_one (dev=0x563e23af5440, opaque=0x0) at /stor/work/qemu/hw/core/qdev.c:310 #19 0x0000563e1fe97f5f in qdev_walk_children (dev=0x563e23af5440, pre_devfn=0x0, pre_busfn=0x0, post_devfn=0x563e1fe9733f <qdev_reset_one>, post_busfn=0x563e1fe97362 <qbus_reset_one>, opaque=0x0) at /stor/work/qemu/hw/core/qdev.c:625 #20 0x0000563e1fe9c125 in qbus_walk_children (bus=0x563e2216e1f0, pre_devfn=0x0, pre_busfn=0x0, post_devfn=0x563e1fe9733f <qdev_reset_one>, post_busfn=0x563e1fe97362 <qbus_reset_one>, opaque=0x0) at /stor/work/qemu/hw/core/bus.c:59 #21 0x0000563e1fe97f23 in qdev_walk_children (dev=0x563e2216c790, pre_devfn=0x0, pre_busfn=0x0, post_devfn=0x563e1fe9733f <qdev_reset_one>, post_busfn=0x563e1fe97362 <qbus_reset_one>, opaque=0x0) at /stor/work/qemu/hw/core/qdev.c:617 #22 0x0000563e1fe9c125 in qbus_walk_children (bus=0x563e21fe9ae0, pre_devfn=0x0, pre_busfn=0x0, post_devfn=0x563e1fe9733f <qdev_reset_one>, post_busfn=0x563e1fe97362 <qbus_reset_one>, opaque=0x0) at /stor/work/qemu/hw/core/bus.c:59 #23 0x0000563e1fe97475 in qbus_reset_all (bus=0x563e21fe9ae0) at /stor/work/qemu/hw/core/qdev.c:336 #24 0x0000563e1fe97498 in qbus_reset_all_fn (opaque=0x563e21fe9ae0) at /stor/work/qemu/hw/core/qdev.c:342 #25 0x0000563e1fe9ca6b in qemu_devices_reset () at /stor/work/qemu/hw/core/reset.c:69 #26 0x0000563e1fd6377d in pc_machine_reset () at /stor/work/qemu/hw/i386/pc.c:2234 #27 0x0000563e1fe0f4d4 in qemu_system_reset (report=true) at /stor/work/qemu/vl.c:1697 #28 0x0000563e1fe0f8fe in main_loop_should_exit () at /stor/work/qemu/vl.c:1865 #29 0x0000563e1fe0f9cb in main_loop () at /stor/work/qemu/vl.c:1902 #30 0x0000563e1fe17716 in main (argc=20, argv=0x7ffde0a17168, envp=0x7ffde0a17210) at /stor/work/qemu/vl.c:4709 (gdb) info thread Id Target Id Frame * 1 Thread 0x7f0a04396c00 (LWP 6192) "qemu-system-x86" 0x0000563e2017db9b in qemu_aio_coroutine_enter (ctx=0x563e21f52020, co=0x563e22cf1840) at /stor/work/qemu/util/qemu-coroutine.c:114 2 Thread 0x7f09f535e700 (LWP 6193) "qemu-system-x86" 0x00007f0a022d2c7d in nanosleep () from /lib64/libpthread.so.0 3 Thread 0x7f09f4b5d700 (LWP 6194) "qemu-system-x86" 0x0000563e2015e98c in aio_notify (ctx=0x563e21f52020) at /stor/work/qemu/util/async.c:341 4 Thread 0x7f09ef117700 (LWP 6196) "qemu-system-x86" 0x00007f0a022cf460 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 (gdb) t 3 [Switching to thread 3 (Thread 0x7f09f4b5d700 (LWP 6194))] #0 0x0000563e2015e98c in aio_notify (ctx=0x563e21f52020) at /stor/work/qemu/util/async.c:341 341 if (ctx->notify_me) { (gdb) bt #0 0x0000563e2015e98c in aio_notify (ctx=0x563e21f52020) at /stor/work/qemu/util/async.c:341 #1 0x0000563e2015ea10 in aio_timerlist_notify (opaque=0x563e21f52020, type=QEMU_CLOCK_REALTIME) at /stor/work/qemu/util/async.c:356 #2 0x0000563e2016032f in timerlist_notify (timer_list=0x563e21f521e0) at /stor/work/qemu/util/qemu-timer.c:281 #3 0x0000563e20160656 in timerlist_rearm (timer_list=0x563e21f521e0) at /stor/work/qemu/util/qemu-timer.c:404 #4 0x0000563e20160729 in timer_mod_ns (ts=0x7f09f00014c0, expire_time=176119278967) at /stor/work/qemu/util/qemu-timer.c:432 #5 0x0000563e20160807 in timer_mod (ts=0x7f09f00014c0, expire_time=176119278967) at /stor/work/qemu/util/qemu-timer.c:462 #6 0x0000563e2017f00a in co_aio_sleep_ns (ctx=0x563e21f52020, type=QEMU_CLOCK_REALTIME, ns=100000000) at /stor/work/qemu/util/qemu-coroutine-sleep.c:38 #7 0x0000563e20074941 in block_job_sleep_ns (job=0x563e22e76000, type=QEMU_CLOCK_REALTIME, ns=100000000) at /stor/work/qemu/blockjob.c:636 #8 0x0000563e200cbba1 in mirror_run (opaque=0x563e22e76000) at /stor/work/qemu/block/mirror.c:881 #9 0x0000563e20073b1e in block_job_co_entry (opaque=0x563e22e76000) at /stor/work/qemu/blockjob.c:282 #10 0x0000563e2017f0bb in coroutine_trampoline (i0=583997504, i1=22078) at /stor/work/qemu/util/coroutine-ucontext.c:79 #11 0x00007f09fded5bf0 in __start_context () at /lib64/libc.so.6 #12 0x00007ffde0a14cd0 in () #13 0x0000000000000000 in ()
On 17/04/2017 05:33, Fam Zheng wrote: > BDRV_POLL_WHILE in both IOThread and main loop has aio_context_acquire(ctx) > around it; in the branch where main loop calls aio_poll(ctx, false), > there is also no aio_context_release(ctx). So I thinki it is protected by the > AioContext lock, and is safe. > > I tested your suggested "aio_poll(qemu_get_aio_context(), false)", but it > doesn't work. Right, that's because "aio_poll(qemu_get_aio_context(), false)" relies on bdrv_wakeup on the I/O thread side (which in turn is triggered by bdrv_inc_in_flight/bdrv_dec_in_flight), but the bottom half does not have bdrv_inc_in_flight/bdrv_dec_in_flight around it. Paolo
On Fri, 04/14 09:51, Stefan Hajnoczi wrote: > On Fri, Apr 14, 2017 at 9:02 AM, Fam Zheng <famz@redhat.com> wrote: > > @@ -398,11 +399,15 @@ void bdrv_drain_all(void); > > */ \ > > assert(!bs_->wakeup); \ > > bs_->wakeup = true; \ > > - while ((cond)) { \ > > - aio_context_release(ctx_); \ > > - aio_poll(qemu_get_aio_context(), true); \ > > - aio_context_acquire(ctx_); \ > > - waited_ = true; \ > > + while (busy_) { \ > > + if ((cond)) { \ > > + waited_ = busy_ = true; \ > > + aio_context_release(ctx_); \ > > + aio_poll(qemu_get_aio_context(), true); \ > > + aio_context_acquire(ctx_); \ > > + } else { \ > > + busy_ = aio_poll(ctx_, false); \ > > + } \ > > Wait, I'm confused. The current thread is not in the BDS AioContext. > We're not allowed to call aio_poll(ctx_, false). > > Did you mean aio_poll(qemu_get_aio_context(), false) in order to > process defer to main loop BHs? At this point it's even unclear to me what should be the plan for 2.9. v1 IMO was the least intrusive, but didn't cover bdrv_drain_all_begin. v2 has this controversial "aio_poll(ctx_, false)", however its alternative, "aio_poll(qemu_get_aio_context(), false)", "introduces" another crash that is not seen otherwise. What should we do now? Fam
On Mon, Apr 17, 2017 at 04:27:19PM +0800, Fam Zheng wrote: > On Fri, 04/14 09:51, Stefan Hajnoczi wrote: > > On Fri, Apr 14, 2017 at 9:02 AM, Fam Zheng <famz@redhat.com> wrote: > > > @@ -398,11 +399,15 @@ void bdrv_drain_all(void); > > > */ \ > > > assert(!bs_->wakeup); \ > > > bs_->wakeup = true; \ > > > - while ((cond)) { \ > > > - aio_context_release(ctx_); \ > > > - aio_poll(qemu_get_aio_context(), true); \ > > > - aio_context_acquire(ctx_); \ > > > - waited_ = true; \ > > > + while (busy_) { \ > > > + if ((cond)) { \ > > > + waited_ = busy_ = true; \ > > > + aio_context_release(ctx_); \ > > > + aio_poll(qemu_get_aio_context(), true); \ > > > + aio_context_acquire(ctx_); \ > > > + } else { \ > > > + busy_ = aio_poll(ctx_, false); \ > > > + } \ > > > > Wait, I'm confused. The current thread is not in the BDS AioContext. > > We're not allowed to call aio_poll(ctx_, false). > > > > Did you mean aio_poll(qemu_get_aio_context(), false) in order to > > process defer to main loop BHs? > > At this point it's even unclear to me what should be the plan for 2.9. v1 IMO > was the least intrusive, but didn't cover bdrv_drain_all_begin. v2 has this > controversial "aio_poll(ctx_, false)", however its alternative, > "aio_poll(qemu_get_aio_context(), false)", "introduces" another crash that is > not seen otherwise. > > What should we do now? > > Fam > I think the priority is fixing the regression, which v1 does. Is there a regression lurking in the bdrv_drain_all() path? I've not been able to find one. If there is no regressive path through bdrv_drain_all(), my vote would be the simplest, least intrusive patch that fixes the current regression, and I think that is v1. Then we can fix everything correctly, and more comprehensively, for 2.10. -Jeff
On 17/04/2017 10:27, Fam Zheng wrote: > At this point it's even unclear to me what should be the plan for 2.9. v1 IMO > was the least intrusive, but didn't cover bdrv_drain_all_begin. v2 has this > controversial "aio_poll(ctx_, false)", v1 has it too: - bdrv_drain_recurse(bs); + while (true) { + if (!bdrv_drain_recurse(bs) && + !aio_poll(bdrv_get_aio_context(bs), false)) { + break; + } + } I don't have any particular preference. Both patches are self contained and easy to revert when the underlying root cause is fixed. Thanks, Paolo > however its alternative, > "aio_poll(qemu_get_aio_context(), false)", "introduces" another crash that is > not seen otherwise. > > What should we do now?
On Tue, 04/18 10:18, Paolo Bonzini wrote: > > > On 17/04/2017 10:27, Fam Zheng wrote: > > At this point it's even unclear to me what should be the plan for 2.9. v1 IMO > > was the least intrusive, but didn't cover bdrv_drain_all_begin. v2 has this > > controversial "aio_poll(ctx_, false)", > > v1 has it too: > > - bdrv_drain_recurse(bs); > + while (true) { > + if (!bdrv_drain_recurse(bs) && > + !aio_poll(bdrv_get_aio_context(bs), false)) { > + break; > + } > + } Yes you are right. On the other hand, the fact that in v2 I had to add bdrv_ref/bdrv_unref around the recursive bdrv_drain_recurse() call makes me worry a little - I assume the same problem exists in v1 and is just latent. So maybe merging v2 is better. > > I don't have any particular preference. Both patches are self contained > and easy to revert when the underlying root cause is fixed. > Fam
© 2016 - 2024 Red Hat, Inc.