Add AioContext polling handlers for NVMe SQ and CQ. By employing polling,
the latency of NVMe IO emulation is greatly reduced. The SQ polling
handler checks for updates on the SQ tail shadow doorbell buffer. The CQ
polling handler is an empty function because we proactively poll the CQ
head shadow doorbell buffer when we want to post a cqe. Updates on the
SQ eventidx buffer are stopped during polling to avoid the host doing
unnecessary doorbell buffer writes.
Comparison (KIOPS):
QD 1 4 16 64
QEMU 53 155 245 309
polling 123 165 189 191
Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
---
hw/nvme/ctrl.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++----
hw/nvme/nvme.h | 1 +
2 files changed, 70 insertions(+), 5 deletions(-)
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 869565d77b..a7f8a4220e 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -298,6 +298,8 @@ static const uint32_t nvme_cse_iocs_zoned[256] = {
static void nvme_process_sq(void *opaque);
static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq);
+static void nvme_update_sq_tail(NvmeSQueue *sq);
static uint16_t nvme_sqid(NvmeRequest *req)
{
@@ -4447,6 +4449,21 @@ static void nvme_cq_notifier(EventNotifier *e)
nvme_post_cqes(cq);
}
+static bool nvme_cq_notifier_aio_poll(void *opaque)
+{
+ /*
+ * We already "poll" the CQ tail shadow doorbell value in nvme_post_cqes(),
+ * so we do not need to check the value here. However, QEMU's AioContext
+ * polling requires us to provide io_poll and io_poll_ready handlers, so
+ * use dummy functions for CQ.
+ */
+ return false;
+}
+
+static void nvme_cq_notifier_aio_poll_ready(EventNotifier *n)
+{
+}
+
static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
{
NvmeCtrl *n = cq->ctrl;
@@ -4459,8 +4476,10 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
}
if (cq->cqid) {
- aio_set_event_notifier(n->ctx, &cq->notifier, true, nvme_cq_notifier,
- NULL, NULL);
+ aio_set_event_notifier(n->ctx, &cq->notifier, true,
+ nvme_cq_notifier,
+ nvme_cq_notifier_aio_poll,
+ nvme_cq_notifier_aio_poll_ready);
} else {
event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
}
@@ -4482,6 +4501,44 @@ static void nvme_sq_notifier(EventNotifier *e)
nvme_process_sq(sq);
}
+static void nvme_sq_notifier_aio_poll_begin(EventNotifier *n)
+{
+ NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
+
+ nvme_update_sq_eventidx(sq);
+
+ /* Stop host doorbell writes by stop updating eventidx */
+ sq->suppress_db = true;
+}
+
+static bool nvme_sq_notifier_aio_poll(void *opaque)
+{
+ EventNotifier *n = opaque;
+ NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
+ uint32_t old_tail = sq->tail;
+
+ nvme_update_sq_tail(sq);
+
+ return sq->tail != old_tail;
+}
+
+static void nvme_sq_notifier_aio_poll_ready(EventNotifier *n)
+{
+ NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
+
+ nvme_process_sq(sq);
+}
+
+static void nvme_sq_notifier_aio_poll_end(EventNotifier *n)
+{
+ NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
+
+ nvme_update_sq_eventidx(sq);
+
+ /* Resume host doorbell writes */
+ sq->suppress_db = false;
+}
+
static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
{
NvmeCtrl *n = sq->ctrl;
@@ -4494,8 +4551,13 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
}
if (sq->sqid) {
- aio_set_event_notifier(n->ctx, &sq->notifier, true, nvme_sq_notifier,
- NULL, NULL);
+ aio_set_event_notifier(n->ctx, &sq->notifier, true,
+ nvme_sq_notifier,
+ nvme_sq_notifier_aio_poll,
+ nvme_sq_notifier_aio_poll_ready);
+ aio_set_event_notifier_poll(n->ctx, &sq->notifier,
+ nvme_sq_notifier_aio_poll_begin,
+ nvme_sq_notifier_aio_poll_end);
} else {
event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
}
@@ -6530,7 +6592,9 @@ static void nvme_process_sq(void *opaque)
}
if (n->dbbuf_enabled) {
- nvme_update_sq_eventidx(sq);
+ if (!sq->suppress_db) {
+ nvme_update_sq_eventidx(sq);
+ }
nvme_update_sq_tail(sq);
}
}
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 224b73e6c4..bd486a8e15 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -380,6 +380,7 @@ typedef struct NvmeSQueue {
QEMUTimer *timer;
EventNotifier notifier;
bool ioeventfd_enabled;
+ bool suppress_db;
NvmeRequest *io_req;
QTAILQ_HEAD(, NvmeRequest) req_list;
QTAILQ_HEAD(, NvmeRequest) out_req_list;
--
2.25.1
On Aug 27 17:12, Jinhao Fan wrote:
> Add AioContext polling handlers for NVMe SQ and CQ. By employing polling,
> the latency of NVMe IO emulation is greatly reduced. The SQ polling
> handler checks for updates on the SQ tail shadow doorbell buffer. The CQ
> polling handler is an empty function because we procatively polls the CQ
> head shadow doorbell buffer when we want to post a cqe. Updates on the
> SQ eventidx buffer is stopped during polling to avoid the host doing
> unnecessary doorbell buffer writes.
>
> Comparison (KIOPS):
>
> QD 1 4 16 64
> QEMU 53 155 245 309
> polling 123 165 189 191
>
> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
> ---
> hw/nvme/ctrl.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++----
> hw/nvme/nvme.h | 1 +
> 2 files changed, 70 insertions(+), 5 deletions(-)
>
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index 869565d77b..a7f8a4220e 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -298,6 +298,8 @@ static const uint32_t nvme_cse_iocs_zoned[256] = {
>
> static void nvme_process_sq(void *opaque);
> static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
> +static void nvme_update_sq_eventidx(const NvmeSQueue *sq);
> +static void nvme_update_sq_tail(NvmeSQueue *sq);
>
> static uint16_t nvme_sqid(NvmeRequest *req)
> {
> @@ -4447,6 +4449,21 @@ static void nvme_cq_notifier(EventNotifier *e)
> nvme_post_cqes(cq);
> }
>
> +static bool nvme_cq_notifier_aio_poll(void *opaque)
> +{
> + /*
> + * We already "poll" the CQ tail shadow doorbell value in nvme_post_cqes(),
> + * so we do not need to check the value here. However, QEMU's AioContext
> + * polling requires us to provide io_poll and io_poll_ready handlers, so
> + * use dummy functions for CQ.
> + */
> + return false;
> +}
> +
> +static void nvme_cq_notifier_aio_poll_ready(EventNotifier *n)
> +{
> +}
> +
> static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
> {
> NvmeCtrl *n = cq->ctrl;
> @@ -4459,8 +4476,10 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
> }
>
> if (cq->cqid) {
> - aio_set_event_notifier(n->ctx, &cq->notifier, true, nvme_cq_notifier,
> - NULL, NULL);
> + aio_set_event_notifier(n->ctx, &cq->notifier, true,
> + nvme_cq_notifier,
> + nvme_cq_notifier_aio_poll,
> + nvme_cq_notifier_aio_poll_ready);
There is no reason to set up these polling handlers (since they don't do
anything).
> } else {
> event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> }
> @@ -4482,6 +4501,44 @@ static void nvme_sq_notifier(EventNotifier *e)
> nvme_process_sq(sq);
> }
>
> +static void nvme_sq_notifier_aio_poll_begin(EventNotifier *n)
> +{
> + NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
> +
> + nvme_update_sq_eventidx(sq);
> +
> + /* Stop host doorbell writes by stop updating eventidx */
> + sq->suppress_db = true;
This doesn't do what you expect it to. By not updating the eventidx it
will fall behind the actual head, causing the host to think that the
device is not processing events (but it is!), resulting in doorbell
ringing.
> +}
> +
> +static bool nvme_sq_notifier_aio_poll(void *opaque)
> +{
> + EventNotifier *n = opaque;
> + NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
> + uint32_t old_tail = sq->tail;
> +
> + nvme_update_sq_tail(sq);
> +
> + return sq->tail != old_tail;
> +}
> +
> +static void nvme_sq_notifier_aio_poll_ready(EventNotifier *n)
> +{
> + NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
> +
> + nvme_process_sq(sq);
> +}
> +
> +static void nvme_sq_notifier_aio_poll_end(EventNotifier *n)
> +{
> + NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
> +
> + nvme_update_sq_eventidx(sq);
> +
> + /* Resume host doorbell writes */
> + sq->suppress_db = false;
> +}
> +
> static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
> {
> NvmeCtrl *n = sq->ctrl;
> @@ -4494,8 +4551,13 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
> }
>
> if (sq->sqid) {
> - aio_set_event_notifier(n->ctx, &sq->notifier, true, nvme_sq_notifier,
> - NULL, NULL);
> + aio_set_event_notifier(n->ctx, &sq->notifier, true,
> + nvme_sq_notifier,
> + nvme_sq_notifier_aio_poll,
> + nvme_sq_notifier_aio_poll_ready);
> + aio_set_event_notifier_poll(n->ctx, &sq->notifier,
> + nvme_sq_notifier_aio_poll_begin,
> + nvme_sq_notifier_aio_poll_end);
You can remove the call to aio_set_event_notifier_poll() since the
suppress_db "trick" shouldn't be used anyway.
> } else {
> event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> }
> @@ -6530,7 +6592,9 @@ static void nvme_process_sq(void *opaque)
> }
>
> if (n->dbbuf_enabled) {
> - nvme_update_sq_eventidx(sq);
> + if (!sq->suppress_db) {
> + nvme_update_sq_eventidx(sq);
> + }
Remove this change.
> nvme_update_sq_tail(sq);
> }
> }
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 224b73e6c4..bd486a8e15 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -380,6 +380,7 @@ typedef struct NvmeSQueue {
> QEMUTimer *timer;
> EventNotifier notifier;
> bool ioeventfd_enabled;
> + bool suppress_db;
Remove this.
> NvmeRequest *io_req;
> QTAILQ_HEAD(, NvmeRequest) req_list;
> QTAILQ_HEAD(, NvmeRequest) out_req_list;
> --
> 2.25.1
>
I tested with the patch modified for the above and I am not seeing any
difference in performance. But without the suppress_db, this seems more
"correct".
--
One of us - No more doubt, silence or taboo about mental illness.
at 7:10 PM, Klaus Jensen <its@irrelevant.dk> wrote: > This doesn't do what you expect it to. By not updaring the eventidx it > will fall behind the actual head, causing the host to think that the > device is not processing events (but it is!), resulting in doorbell > ringing. I’m not sure I understand this correctly. In 7.13.1 in NVMe Spec 1.4c it says "If updating an entry in the Shadow Doorbell buffer **changes** the value from being less than or equal to the value of the corresponding EventIdx buffer entry to being greater than that value, then the host shall also update the controller's corresponding doorbell register to match the value of that entry in the Shadow Doorbell buffer.” So my understanding is that once the eventidx falls behind the actual head, the host will only ring the doorbell once but *not* for future submissions. Is this not what real hosts are doing?
On Nov 3 10:18, Jinhao Fan wrote: > at 7:10 PM, Klaus Jensen <its@irrelevant.dk> wrote: > > > This doesn't do what you expect it to. By not updaring the eventidx it > > will fall behind the actual head, causing the host to think that the > > device is not processing events (but it is!), resulting in doorbell > > ringing. > > I’m not sure I understand this correctly. > > In 7.13.1 in NVMe Spec 1.4c it says "If updating an entry in the Shadow > Doorbell buffer **changes** the value from being less than or equal to the > value of the corresponding EventIdx buffer entry to being greater than that > value, then the host shall also update the controller's corresponding > doorbell register to match the value of that entry in the Shadow Doorbell > buffer.” > > So my understanding is that once the eventidx falls behind the actual head, > the host will only ring the doorbell once but *not* for future submissions. > > Is this not what real hosts are doing? I agree that the spec is a little unclear on this point. In any case, in Linux, when the driver has decided that the sq tail must be updated, it will use this check: (new_idx - event_idx - 1) < (new_idx - old) So it doesn't account for if or not eventidx was already behind.
On 11/3/2022 8:10 PM, Klaus Jensen wrote: > I agree that the spec is a little unclear on this point. In any case, in > Linux, when the driver has decided that the sq tail must be updated, > it will use this check: > > (new_idx - event_idx - 1) < (new_idx - old) When eventidx is already behind, it's like: 0 1 <- event_idx 2 <- old 3 <- new_idx 4 . . . In this case, (new_idx - event_idx - 1) = 3-1-1 = 1 >= (new_idx - old) = 3-2=1, so the host won't update sq tail. Where am I wrong in this example? > > So it doesn't account for if or not eventidx was already behind.
On Nov 3 21:19, Jinhao Fan wrote: > On 11/3/2022 8:10 PM, Klaus Jensen wrote: > > I agree that the spec is a little unclear on this point. In any case, in > > Linux, when the driver has decided that the sq tail must be updated, > > it will use this check: > > > > (new_idx - event_idx - 1) < (new_idx - old) > > When eventidx is already behind, it's like: > > 0 > 1 <- event_idx > 2 <- old > 3 <- new_idx > 4 > . > . > . > > In this case, (new_idx - event_idx - 1) = 3-1-1 = 1 >= (new_idx - old) = > 3-2=1, so the host won't update sq tail. Where am I wrong in this example? > That becomes 1 >= 1, i.e. "true". So this will result in the driver doing an mmio doorbell write.
On Fri, Nov 04, 2022 at 07:32:12AM +0100, Klaus Jensen wrote:
> On Nov 3 21:19, Jinhao Fan wrote:
> > On 11/3/2022 8:10 PM, Klaus Jensen wrote:
> > > I agree that the spec is a little unclear on this point. In any case, in
> > > Linux, when the driver has decided that the sq tail must be updated,
> > > it will use this check:
> > >
> > > (new_idx - event_idx - 1) < (new_idx - old)
> >
> > When eventidx is already behind, it's like:
> >
> > 0
> > 1 <- event_idx
> > 2 <- old
> > 3 <- new_idx
> > 4
> > .
> > .
> > .
> >
> > In this case, (new_idx - event_idx - 1) = 3-1-1 = 1 >= (new_idx - old) =
> > 3-2=1, so the host won't update sq tail. Where am I wrong in this example?
>
> That becomes 1 >= 1, i.e. "true". So this will result in the driver
> doing an mmio doorbell write.
The code is:
static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
}
which per the above is "return 1 < 1;", or false. So the above case does *not*
do an mmio write. No?
regards
john
On Nov 8 12:39, John Levon wrote:
> On Fri, Nov 04, 2022 at 07:32:12AM +0100, Klaus Jensen wrote:
>
> > On Nov 3 21:19, Jinhao Fan wrote:
> > > On 11/3/2022 8:10 PM, Klaus Jensen wrote:
> > > > I agree that the spec is a little unclear on this point. In any case, in
> > > > Linux, when the driver has decided that the sq tail must be updated,
> > > > it will use this check:
> > > >
> > > > (new_idx - event_idx - 1) < (new_idx - old)
> > >
> > > When eventidx is already behind, it's like:
> > >
> > > 0
> > > 1 <- event_idx
> > > 2 <- old
> > > 3 <- new_idx
> > > 4
> > > .
> > > .
> > > .
> > >
> > > In this case, (new_idx - event_idx - 1) = 3-1-1 = 1 >= (new_idx - old) =
> > > 3-2=1, so the host won't update sq tail. Where am I wrong in this example?
> >
> > That becomes 1 >= 1, i.e. "true". So this will result in the driver
> > doing an mmio doorbell write.
>
> The code is:
>
> static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
> {
> return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
> }
>
> which per the above is "return 1 < 1;", or false. So the above case does *not*
> do an mmio write. No?
>
Whelp.
Looks like I'm in the wrong here, apologies!
at 10:11 PM, Klaus Jensen <its@irrelevant.dk> wrote:
> On Nov 8 12:39, John Levon wrote:
>> On Fri, Nov 04, 2022 at 07:32:12AM +0100, Klaus Jensen wrote:
>>
>>> On Nov 3 21:19, Jinhao Fan wrote:
>>>> On 11/3/2022 8:10 PM, Klaus Jensen wrote:
>>>>> I agree that the spec is a little unclear on this point. In any case, in
>>>>> Linux, when the driver has decided that the sq tail must be updated,
>>>>> it will use this check:
>>>>>
>>>>> (new_idx - event_idx - 1) < (new_idx - old)
>>>>
>>>> When eventidx is already behind, it's like:
>>>>
>>>> 0
>>>> 1 <- event_idx
>>>> 2 <- old
>>>> 3 <- new_idx
>>>> 4
>>>> .
>>>> .
>>>> .
>>>>
>>>> In this case, (new_idx - event_idx - 1) = 3-1-1 = 1 >= (new_idx - old) =
>>>> 3-2=1, so the host won't update sq tail. Where am I wrong in this example?
>>>
>>> That becomes 1 >= 1, i.e. "true". So this will result in the driver
>>> doing an mmio doorbell write.
>>
>> The code is:
>>
>> static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
>> {
>> return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
>> }
>>
>> which per the above is "return 1 < 1;", or false. So the above case does *not*
>> do an mmio write. No?
>
> Whelp.
>
> Looks like I'm in the wrong here, apologies!
So disabling eventidx updates during polling has the potential of reducing
doorbell writes. But as Klaus observed removing this function does not cause
performance difference. So I guess only one command is processed during each
polling iteration.
© 2016 - 2026 Red Hat, Inc.