[PATCH] hw/block/nvme: drain namespaces on sq deletion
Posted by Klaus Jensen 3 years, 3 months ago
From: Klaus Jensen <k.jensen@samsung.com>

For most commands, the BlockAIOCB returned when an AIO is issued is
stored in the NvmeRequest aiocb pointer. The purpose of storing this is
to allow the AIO to be cancelled when deleting submission queues (it is
currently not used for Abort).

Since the addition of the Dataset Management command and Zoned
Namespaces, NvmeRequests may involve more than one AIO and the AIOs are
issued without saving a reference to the BlockAIOCB. This is a problem
since nvme_del_sq will attempt to cancel outstanding AIOs, potentially
with an invalid BlockAIOCB.
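
To illustrate the pattern (a simplified sketch only; the callback, the
field names and the lba-to-byte conversion here are stand-ins for
illustration, not the exact code):

    static void nvme_dsm_example_cb(void *opaque, int ret)
    {
        /* per-range completion; error handling elided in this sketch */
    }

    static void nvme_dsm_example(NvmeNamespace *ns, NvmeRequest *req,
                                 NvmeDsmRange *range, unsigned int nr)
    {
        uint32_t lbasz = ns->blkconf.logical_block_size;

        for (unsigned int i = 0; i < nr; i++) {
            int64_t offset = le64_to_cpu(range[i].slba) * lbasz;
            int64_t bytes = (int64_t)le32_to_cpu(range[i].nlb) * lbasz;

            /*
             * The returned BlockAIOCB is dropped on the floor;
             * req->aiocb is never assigned, so nvme_del_sq() may end
             * up calling blk_aio_cancel() on a stale (or NULL) pointer.
             */
            blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
                             nvme_dsm_example_cb, req);
        }
    }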

Fix this by draining the namespace blockdevs, allowing the AIOs to
complete normally, instead of explicitly cancelling the requests.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 316858fd8adf..91f6fb6da1e2 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -403,6 +403,7 @@ static void nvme_req_clear(NvmeRequest *req)
 {
     req->ns = NULL;
     req->opaque = NULL;
+    req->aiocb = NULL;
     memset(&req->cqe, 0x0, sizeof(req->cqe));
     req->status = NVME_SUCCESS;
 }
@@ -2396,6 +2397,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
     NvmeSQueue *sq;
     NvmeCQueue *cq;
     uint16_t qid = le16_to_cpu(c->qid);
+    int i;
 
     if (unlikely(!qid || nvme_check_sqid(n, qid))) {
         trace_pci_nvme_err_invalid_del_sq(qid);
@@ -2404,12 +2406,18 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
 
     trace_pci_nvme_del_sq(qid);
 
-    sq = n->sq[qid];
-    while (!QTAILQ_EMPTY(&sq->out_req_list)) {
-        r = QTAILQ_FIRST(&sq->out_req_list);
-        assert(r->aiocb);
-        blk_aio_cancel(r->aiocb);
+    for (i = 1; i <= n->num_namespaces; i++) {
+        NvmeNamespace *ns = nvme_ns(n, i);
+        if (!ns) {
+            continue;
+        }
+
+        nvme_ns_drain(ns);
     }
+
+    sq = n->sq[qid];
+    assert(QTAILQ_EMPTY(&sq->out_req_list));
+
     if (!nvme_check_cqid(n, sq->cqid)) {
         cq = n->cq[sq->cqid];
         QTAILQ_REMOVE(&cq->sq_list, sq, entry);
-- 
2.30.0


Re: [PATCH] hw/block/nvme: drain namespaces on sq deletion
Posted by Minwoo Im 3 years, 2 months ago
On 21-01-27 14:15:05, Klaus Jensen wrote:
> From: Klaus Jensen <k.jensen@samsung.com>
> 
> [...]
> 
> Since the addition of the Dataset Management command and Zoned
> Namespaces, NvmeRequests may involve more than one AIO and the AIOs are
> issued without saving a reference to the BlockAIOCB. This is a problem
> since nvme_del_sq will attempt to cancel outstanding AIOs, potentially
> with an invalid BlockAIOCB.
> 
> [...]
> 
> @@ -2404,12 +2406,18 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
>  
>      trace_pci_nvme_del_sq(qid);
>  
> -    sq = n->sq[qid];
> -    while (!QTAILQ_EMPTY(&sq->out_req_list)) {
> -        r = QTAILQ_FIRST(&sq->out_req_list);
> -        assert(r->aiocb);
> -        blk_aio_cancel(r->aiocb);
> +    for (i = 1; i <= n->num_namespaces; i++) {
> +        NvmeNamespace *ns = nvme_ns(n, i);
> +        if (!ns) {
> +            continue;
> +        }
> +
> +        nvme_ns_drain(ns);

If we just drain all namespaces here, commands that have nothing to do
with the SQ being deleted will be drained as well. And this might be a
burden for a single SQ deletion.

By the way, I agree with the multiple-AIO reference problem for the
newly added commands. But shouldn't we manage the in-flight AIO
references for those commands in some other way and kill them all
explicitly, as before? Maybe some kind of list for the AIOCBs?
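
Something like the following, maybe (just a rough sketch; these names
are made up, and it glosses over the fact that blk_aio_cancel()
completes the AIO, invoking its callback, before returning):

    typedef struct NvmeAIOCBEntry {
        BlockAIOCB *aiocb;
        QTAILQ_ENTRY(NvmeAIOCBEntry) entry;
    } NvmeAIOCBEntry;

    /* hypothetical addition to NvmeRequest:
     *     QTAILQ_HEAD(, NvmeAIOCBEntry) aiocb_list;
     */

    static void nvme_req_add_aiocb(NvmeRequest *req, BlockAIOCB *aiocb)
    {
        NvmeAIOCBEntry *e = g_new0(NvmeAIOCBEntry, 1);

        e->aiocb = aiocb;
        QTAILQ_INSERT_TAIL(&req->aiocb_list, e, entry);
    }

    static void nvme_req_cancel_all(NvmeRequest *req)
    {
        NvmeAIOCBEntry *e, *next;

        QTAILQ_FOREACH_SAFE(e, &req->aiocb_list, entry, next) {
            QTAILQ_REMOVE(&req->aiocb_list, e, entry);
            blk_aio_cancel(e->aiocb);
            g_free(e);
        }
    }

Then nvme_del_sq() could walk out_req_list and call
nvme_req_cancel_all() on each request, as it did before.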

Re: [PATCH] hw/block/nvme: drain namespaces on sq deletion
Posted by Klaus Jensen 3 years, 2 months ago
On Feb 11 11:49, Minwoo Im wrote:
> On 21-01-27 14:15:05, Klaus Jensen wrote:
> > [...]
> 
> If we just drain all namespaces here, commands that have nothing to do
> with the SQ being deleted will be drained as well. And this might be a
> burden for a single SQ deletion.
> 

That is true. But how often would you dynamically delete and create I/O
submission queues in the fast path?

> By the way, I agree with the multiple-AIO reference problem for the
> newly added commands. But shouldn't we manage the in-flight AIO
> references for those commands in some other way and kill them all
> explicitly, as before? Maybe some kind of list for the AIOCBs?

I was hesitant to add more stuff to NvmeRequest (like a QTAILQ to track
this). Getting a steady-state with draining was an easy fix.
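
For what it's worth, the drain itself is trivial; if memory serves,
nvme_ns_drain() is essentially just a thin wrapper (take this as a
sketch, not a quote of the actual code):

    static void nvme_ns_drain(NvmeNamespace *ns)
    {
        /* block until all in-flight AIO on the namespace has completed */
        blk_drain(ns->blkconf.blk);
    }

so the cost of a queue deletion is bounded by the I/O already in flight.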
Re: [PATCH] hw/block/nvme: drain namespaces on sq deletion
Posted by Minwoo Im 3 years, 2 months ago
On 21-02-11 13:07:08, Klaus Jensen wrote:
> On Feb 11 11:49, Minwoo Im wrote:
> > On 21-01-27 14:15:05, Klaus Jensen wrote:
> > > [...]
> > 
> > If we just drain all namespaces here, commands that have nothing to do
> > with the SQ being deleted will be drained as well. And this might be a
> > burden for a single SQ deletion.
> > 
> 
> That is true. But how often would you dynamically delete and create I/O
> submission queues in the fast path?

Deleting I/O queues is not that common in a working NVMe controller, but
it might be a good case for exception testing from the host side, e.g.,
I/O queue deletion during I/O workloads. If an I/O queue deletion
returns quickly by aborting only its own requests, I think that would be
a good case to test against. Handling requests gracefully sometimes
means corner cases never show up from the host point of view. But QEMU
is not only for host testing, so I am not sure whether the QEMU NVMe
device should handle things gracefully or try to do exactly what the
real hardware does (but we don't know all the hardware behavior ;)).

(But, right, if I'm only talking about the kernel, the kernel does not
delete queues during fast-path hot workloads. But it's sometimes great
to be able to test something against one's own driver or application.)

> > By the way, I agree with the multiple-AIO reference problem for the
> > newly added commands. But shouldn't we manage the in-flight AIO
> > references for those commands in some other way and kill them all
> > explicitly, as before? Maybe some kind of list for the AIOCBs?
> 
> I was hesitant to add more stuff to NvmeRequest (like a QTAILQ to track
> this). Getting a steady-state with draining was an easy fix.

Graceful handling is the easy way to go. I am no expert on the overall
purpose of the QEMU NVMe device model, but I'm curious which one we
should put first: `easy to go` vs. `what the device should do`.

Re: [PATCH] hw/block/nvme: drain namespaces on sq deletion
Posted by Klaus Jensen 3 years, 2 months ago
On Feb 11 22:49, Minwoo Im wrote:
> [...]
> 
> Graceful handling is the easy way to go. I am no expert on the overall
> purpose of the QEMU NVMe device model, but I'm curious which one we
> should put first: `easy to go` vs. `what the device should do`.
> 

Alright, point taken :)

I'll post an RFC patch that tracks this properly instead of halfass'ing
it.
Re: [PATCH] hw/block/nvme: drain namespaces on sq deletion
Posted by Klaus Jensen 3 years, 2 months ago
On Jan 27 14:15, Klaus Jensen wrote:
> From: Klaus Jensen <k.jensen@samsung.com>
> 
> [...]

Ping on this.