This reverts commit 0f142cbd919fcb6cea7aa176f7e4939925806dd9.
Lukáš Doktor reported a simple single-threaded nvme test case hanging
and bisected it to this commit. While we are still investigating, it is
best to revert the commit for now.
(This breaks multiqueue for nvme, but better to have single-queue
working than neither.)
Cc: qemu-stable@nongnu.org
Reported-by: Lukáš Doktor <ldoktor@redhat.com>
Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
---
block/nvme.c | 56 +++++++++++++++++++++++++---------------------------
1 file changed, 27 insertions(+), 29 deletions(-)
diff --git a/block/nvme.c b/block/nvme.c
index 919e14cef9..c3d3b99d1f 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -1200,36 +1200,26 @@ fail:
typedef struct {
Coroutine *co;
- bool skip_yield;
int ret;
+ AioContext *ctx;
} NVMeCoData;
+static void nvme_rw_cb_bh(void *opaque)
+{
+ NVMeCoData *data = opaque;
+ qemu_coroutine_enter(data->co);
+}
+
/* Put into NVMeRequest.cb, so runs in the BDS's main AioContext */
static void nvme_rw_cb(void *opaque, int ret)
{
NVMeCoData *data = opaque;
-
data->ret = ret;
-
- if (data->co == qemu_coroutine_self()) {
- /*
- * Fast path: We are inside of the request coroutine (through
- * nvme_submit_command, nvme_deferred_fn, nvme_process_completion).
- * We can set data->skip_yield here to keep the coroutine from
- * yielding, and then we don't need to schedule a BH to wake it.
- */
- data->skip_yield = true;
- } else {
- /*
- * Safe to call: The case where we run in the request coroutine is
- * handled above, so we must be independent of it; and without
- * skip_yield set, the coroutine will yield.
- * No need to release NVMeQueuePair.lock (we are called without it
- * held). (Note: If we enter the coroutine here, @data will
- * probably be dangling once aio_co_wake() returns.)
- */
- aio_co_wake(data->co);
+ if (!data->co) {
+ /* The rw coroutine hasn't yielded, don't try to enter. */
+ return;
}
+ replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
}
static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
@@ -1253,7 +1243,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
.cdw12 = cpu_to_le32(cdw12),
};
NVMeCoData data = {
- .co = qemu_coroutine_self(),
+ .ctx = bdrv_get_aio_context(bs),
.ret = -EINPROGRESS,
};
@@ -1270,7 +1260,9 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
return r;
}
nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
- if (!data.skip_yield) {
+
+ data.co = qemu_coroutine_self();
+ while (data.ret == -EINPROGRESS) {
qemu_coroutine_yield();
}
@@ -1366,7 +1358,7 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
.nsid = cpu_to_le32(s->nsid),
};
NVMeCoData data = {
- .co = qemu_coroutine_self(),
+ .ctx = bdrv_get_aio_context(bs),
.ret = -EINPROGRESS,
};
@@ -1374,7 +1366,9 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
req = nvme_get_free_req(ioq);
assert(req);
nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
- if (!data.skip_yield) {
+
+ data.co = qemu_coroutine_self();
+ if (data.ret == -EINPROGRESS) {
qemu_coroutine_yield();
}
@@ -1415,7 +1409,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
};
NVMeCoData data = {
- .co = qemu_coroutine_self(),
+ .ctx = bdrv_get_aio_context(bs),
.ret = -EINPROGRESS,
};
@@ -1435,7 +1429,9 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
assert(req);
nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
- if (!data.skip_yield) {
+
+ data.co = qemu_coroutine_self();
+ while (data.ret == -EINPROGRESS) {
qemu_coroutine_yield();
}
@@ -1463,7 +1459,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
};
NVMeCoData data = {
- .co = qemu_coroutine_self(),
+ .ctx = bdrv_get_aio_context(bs),
.ret = -EINPROGRESS,
};
@@ -1508,7 +1504,9 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
trace_nvme_dsm(s, offset, bytes);
nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
- if (!data.skip_yield) {
+
+ data.co = qemu_coroutine_self();
+ while (data.ret == -EINPROGRESS) {
qemu_coroutine_yield();
}
--
2.52.0
Thanks, I tested this on the CI system, and with this patch it seems to work well.

Regards,
Lukáš

Tested-by: Lukáš Doktor <ldoktor@redhat.com>

On 12. 12. 25 at 11:25, Hanna Czenczek wrote:
> This reverts commit 0f142cbd919fcb6cea7aa176f7e4939925806dd9.
>
> Lukáš Doktor reported a simple single-threaded nvme test case hanging
> and bisected it to this commit. While we are still investigating, it is
> best to revert the commit for now.
>
> (This breaks multiqueue for nvme, but better to have single-queue
> working than neither.)
>
> Cc: qemu-stable@nongnu.org
> Reported-by: Lukáš Doktor <ldoktor@redhat.com>
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> ---
> [...]
On 12.12.25 11:25, Hanna Czenczek wrote:
> This reverts commit 0f142cbd919fcb6cea7aa176f7e4939925806dd9.
>
> Lukáš Doktor reported a simple single-threaded nvme test case hanging
> and bisected it to this commit. While we are still investigating, it is
> best to revert the commit for now.
>
> (This breaks multiqueue for nvme, but better to have single-queue
> working than neither.)
>
> Cc: qemu-stable@nongnu.org
> Reported-by: Lukáš Doktor <ldoktor@redhat.com>
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> ---
> block/nvme.c | 56 +++++++++++++++++++++++++---------------------------
> 1 file changed, 27 insertions(+), 29 deletions(-)
>
> diff --git a/block/nvme.c b/block/nvme.c
> index 919e14cef9..c3d3b99d1f 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
[...]
> /* Put into NVMeRequest.cb, so runs in the BDS's main AioContext */
> static void nvme_rw_cb(void *opaque, int ret)
> {
[...]
> - aio_co_wake(data->co);
[...]
> + replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
> }
From testing, this bit seems to be the important one: the hang seems to
be caused by entering the coroutine directly instead of always going
through a BH. Why that is, I haven’t yet found out, only that
s/aio_co_wake()/aio_co_schedule()/ seems to make it work.

I’ll spend more time trying to find out why.

(The only thing I know so far is that iscsi similarly should not use
aio_co_wake(), and for that we do have a documented reason:
https://gitlab.com/qemu-project/qemu/-/commit/8b9dfe9098 – in light of
that, it probably makes sense not to use aio_co_wake() for NFS either,
which was the third case in the original series where I replaced a
one-shot BH schedule with aio_co_wake().)
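
For reference, this is the completion path the revert restores,
condensed from the hunks above (it depends on the rest of block/nvme.c,
so it is only a sketch, not standalone code): the callback never enters
the request coroutine directly, it only schedules a BH, and the
coroutine polls data.ret:

typedef struct {
    Coroutine *co;
    int ret;
    AioContext *ctx;
} NVMeCoData;

/* BH in the BDS's AioContext; re-enters the request coroutine */
static void nvme_rw_cb_bh(void *opaque)
{
    NVMeCoData *data = opaque;
    qemu_coroutine_enter(data->co);
}

/* Completion callback: never enters the coroutine directly */
static void nvme_rw_cb(void *opaque, int ret)
{
    NVMeCoData *data = opaque;
    data->ret = ret;
    if (!data->co) {
        /* The rw coroutine hasn't yielded, don't try to enter. */
        return;
    }
    replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
}

/* ...and in the request coroutine, after nvme_submit_command(): */
    data.co = qemu_coroutine_self();
    while (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }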
Hanna
On 12.12.25 11:25, Hanna Czenczek wrote:
> This reverts commit 0f142cbd919fcb6cea7aa176f7e4939925806dd9.
>
> Lukáš Doktor reported a simple single-threaded nvme test case hanging
> and bisected it to this commit. While we are still investigating, it is
> best to revert the commit for now.
>
> (This breaks multiqueue for nvme, but better to have single-queue
> working than neither.)

I’ve tested this on Lukáš’s system, and there is no hang with this
revert patch applied.

(For what it’s worth, the hang only seems to appear when writing, which
is why I didn’t see it when testing myself. I can’t really overwrite
the only “testing” NVMe SSD I have, so I only tested reads.)

Hanna