Add AioContext polling handlers for NVMe SQ and CQ. By employing polling,
the latency of NVMe IO emulation is greatly reduced. The SQ polling
handler checks for updates on the SQ tail shadow doorbell buffer. The CQ
polling handler is an empty function because we proactively poll the CQ
head shadow doorbell buffer when we want to post a cqe. Updates on the
SQ eventidx buffer are stopped during polling to avoid the host doing
unnecessary doorbell buffer writes.
Comparison (KIOPS):
QD 1 4 16 64
QEMU 53 155 245 309
polling 123 165 189 191
Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
---
hw/nvme/ctrl.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++----
hw/nvme/nvme.h | 1 +
2 files changed, 70 insertions(+), 5 deletions(-)
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 869565d77b..a7f8a4220e 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -298,6 +298,8 @@ static const uint32_t nvme_cse_iocs_zoned[256] = {
static void nvme_process_sq(void *opaque);
static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq);
+static void nvme_update_sq_tail(NvmeSQueue *sq);
static uint16_t nvme_sqid(NvmeRequest *req)
{
@@ -4447,6 +4449,21 @@ static void nvme_cq_notifier(EventNotifier *e)
nvme_post_cqes(cq);
}
+static bool nvme_cq_notifier_aio_poll(void *opaque)
+{
+ /*
+ * We already "poll" the CQ tail shadow doorbell value in nvme_post_cqes(),
+ * so we do not need to check the value here. However, QEMU's AioContext
+ * polling requires us to provide io_poll and io_poll_ready handlers, so
+ * use dummy functions for CQ.
+ */
+ return false;
+}
+
+static void nvme_cq_notifier_aio_poll_ready(EventNotifier *n)
+{
+}
+
static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
{
NvmeCtrl *n = cq->ctrl;
@@ -4459,8 +4476,10 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
}
if (cq->cqid) {
- aio_set_event_notifier(n->ctx, &cq->notifier, true, nvme_cq_notifier,
- NULL, NULL);
+ aio_set_event_notifier(n->ctx, &cq->notifier, true,
+ nvme_cq_notifier,
+ nvme_cq_notifier_aio_poll,
+ nvme_cq_notifier_aio_poll_ready);
} else {
event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
}
@@ -4482,6 +4501,44 @@ static void nvme_sq_notifier(EventNotifier *e)
nvme_process_sq(sq);
}
+static void nvme_sq_notifier_aio_poll_begin(EventNotifier *n)
+{
+ NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
+
+ nvme_update_sq_eventidx(sq);
+
+ /* Stop host doorbell writes by stop updating eventidx */
+ sq->suppress_db = true;
+}
+
+static bool nvme_sq_notifier_aio_poll(void *opaque)
+{
+ EventNotifier *n = opaque;
+ NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
+ uint32_t old_tail = sq->tail;
+
+ nvme_update_sq_tail(sq);
+
+ return sq->tail != old_tail;
+}
+
+static void nvme_sq_notifier_aio_poll_ready(EventNotifier *n)
+{
+ NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
+
+ nvme_process_sq(sq);
+}
+
+static void nvme_sq_notifier_aio_poll_end(EventNotifier *n)
+{
+ NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
+
+ nvme_update_sq_eventidx(sq);
+
+ /* Resume host doorbell writes */
+ sq->suppress_db = false;
+}
+
static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
{
NvmeCtrl *n = sq->ctrl;
@@ -4494,8 +4551,13 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
}
if (sq->sqid) {
- aio_set_event_notifier(n->ctx, &sq->notifier, true, nvme_sq_notifier,
- NULL, NULL);
+ aio_set_event_notifier(n->ctx, &sq->notifier, true,
+ nvme_sq_notifier,
+ nvme_sq_notifier_aio_poll,
+ nvme_sq_notifier_aio_poll_ready);
+ aio_set_event_notifier_poll(n->ctx, &sq->notifier,
+ nvme_sq_notifier_aio_poll_begin,
+ nvme_sq_notifier_aio_poll_end);
} else {
event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
}
@@ -6530,7 +6592,9 @@ static void nvme_process_sq(void *opaque)
}
if (n->dbbuf_enabled) {
- nvme_update_sq_eventidx(sq);
+ if (!sq->suppress_db) {
+ nvme_update_sq_eventidx(sq);
+ }
nvme_update_sq_tail(sq);
}
}
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 224b73e6c4..bd486a8e15 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -380,6 +380,7 @@ typedef struct NvmeSQueue {
QEMUTimer *timer;
EventNotifier notifier;
bool ioeventfd_enabled;
+ bool suppress_db;
NvmeRequest *io_req;
QTAILQ_HEAD(, NvmeRequest) req_list;
QTAILQ_HEAD(, NvmeRequest) out_req_list;
--
2.25.1
On Aug 27 17:12, Jinhao Fan wrote:
> Add AioContext polling handlers for NVMe SQ and CQ. By employing polling,
> the latency of NVMe IO emulation is greatly reduced. The SQ polling
> handler checks for updates on the SQ tail shadow doorbell buffer. The CQ
> polling handler is an empty function because we procatively polls the CQ
> head shadow doorbell buffer when we want to post a cqe. Updates on the
> SQ eventidx buffer is stopped during polling to avoid the host doing
> unnecessary doorbell buffer writes.
>
> Comparison (KIOPS):
>
> QD 1 4 16 64
> QEMU 53 155 245 309
> polling 123 165 189 191
>
> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
> ---
> hw/nvme/ctrl.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++----
> hw/nvme/nvme.h | 1 +
> 2 files changed, 70 insertions(+), 5 deletions(-)
>
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index 869565d77b..a7f8a4220e 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -298,6 +298,8 @@ static const uint32_t nvme_cse_iocs_zoned[256] = {
>
> static void nvme_process_sq(void *opaque);
> static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
> +static void nvme_update_sq_eventidx(const NvmeSQueue *sq);
> +static void nvme_update_sq_tail(NvmeSQueue *sq);
>
> static uint16_t nvme_sqid(NvmeRequest *req)
> {
> @@ -4447,6 +4449,21 @@ static void nvme_cq_notifier(EventNotifier *e)
> nvme_post_cqes(cq);
> }
>
> +static bool nvme_cq_notifier_aio_poll(void *opaque)
> +{
> + /*
> + * We already "poll" the CQ tail shadow doorbell value in nvme_post_cqes(),
> + * so we do not need to check the value here. However, QEMU's AioContext
> + * polling requires us to provide io_poll and io_poll_ready handlers, so
> + * use dummy functions for CQ.
> + */
> + return false;
> +}
> +
> +static void nvme_cq_notifier_aio_poll_ready(EventNotifier *n)
> +{
> +}
> +
> static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
> {
> NvmeCtrl *n = cq->ctrl;
> @@ -4459,8 +4476,10 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
> }
>
> if (cq->cqid) {
> - aio_set_event_notifier(n->ctx, &cq->notifier, true, nvme_cq_notifier,
> - NULL, NULL);
> + aio_set_event_notifier(n->ctx, &cq->notifier, true,
> + nvme_cq_notifier,
> + nvme_cq_notifier_aio_poll,
> + nvme_cq_notifier_aio_poll_ready);
There is no reason to set up these polling handlers (since they don't do
anything).
> } else {
> event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> }
> @@ -4482,6 +4501,44 @@ static void nvme_sq_notifier(EventNotifier *e)
> nvme_process_sq(sq);
> }
>
> +static void nvme_sq_notifier_aio_poll_begin(EventNotifier *n)
> +{
> + NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
> +
> + nvme_update_sq_eventidx(sq);
> +
> + /* Stop host doorbell writes by stop updating eventidx */
> + sq->suppress_db = true;
This doesn't do what you expect it to. By not updating the eventidx it
will fall behind the actual head, causing the host to think that the
device is not processing events (but it is!), resulting in doorbell
ringing.
> +}
> +
> +static bool nvme_sq_notifier_aio_poll(void *opaque)
> +{
> + EventNotifier *n = opaque;
> + NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
> + uint32_t old_tail = sq->tail;
> +
> + nvme_update_sq_tail(sq);
> +
> + return sq->tail != old_tail;
> +}
> +
> +static void nvme_sq_notifier_aio_poll_ready(EventNotifier *n)
> +{
> + NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
> +
> + nvme_process_sq(sq);
> +}
> +
> +static void nvme_sq_notifier_aio_poll_end(EventNotifier *n)
> +{
> + NvmeSQueue *sq = container_of(n, NvmeSQueue, notifier);
> +
> + nvme_update_sq_eventidx(sq);
> +
> + /* Resume host doorbell writes */
> + sq->suppress_db = false;
> +}
> +
> static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
> {
> NvmeCtrl *n = sq->ctrl;
> @@ -4494,8 +4551,13 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
> }
>
> if (sq->sqid) {
> - aio_set_event_notifier(n->ctx, &sq->notifier, true, nvme_sq_notifier,
> - NULL, NULL);
> + aio_set_event_notifier(n->ctx, &sq->notifier, true,
> + nvme_sq_notifier,
> + nvme_sq_notifier_aio_poll,
> + nvme_sq_notifier_aio_poll_ready);
> + aio_set_event_notifier_poll(n->ctx, &sq->notifier,
> + nvme_sq_notifier_aio_poll_begin,
> + nvme_sq_notifier_aio_poll_end);
You can remove the call to aio_set_event_notifier_poll() since the
suppress_db "trick" shouldn't be used anyway.
> } else {
> event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> }
> @@ -6530,7 +6592,9 @@ static void nvme_process_sq(void *opaque)
> }
>
> if (n->dbbuf_enabled) {
> - nvme_update_sq_eventidx(sq);
> + if (!sq->suppress_db) {
> + nvme_update_sq_eventidx(sq);
> + }
Remove this change.
> nvme_update_sq_tail(sq);
> }
> }
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 224b73e6c4..bd486a8e15 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -380,6 +380,7 @@ typedef struct NvmeSQueue {
> QEMUTimer *timer;
> EventNotifier notifier;
> bool ioeventfd_enabled;
> + bool suppress_db;
Remove this.
> NvmeRequest *io_req;
> QTAILQ_HEAD(, NvmeRequest) req_list;
> QTAILQ_HEAD(, NvmeRequest) out_req_list;
> --
> 2.25.1
>
I tested with the patch modified for the above and I am not seeing any
difference in performance. But without the suppress_db, this seems more
"correct".
--
One of us - No more doubt, silence or taboo about mental illness.
at 7:10 PM, Klaus Jensen <its@irrelevant.dk> wrote: > This doesn't do what you expect it to. By not updaring the eventidx it > will fall behind the actual head, causing the host to think that the > device is not processing events (but it is!), resulting in doorbell > ringing. I’m not sure I understand this correctly. In 7.13.1 in NVMe Spec 1.4c it says "If updating an entry in the Shadow Doorbell buffer **changes** the value from being less than or equal to the value of the corresponding EventIdx buffer entry to being greater than that value, then the host shall also update the controller's corresponding doorbell register to match the value of that entry in the Shadow Doorbell buffer.” So my understanding is that once the eventidx falls behind the actual head, the host will only ring the doorbell once but *not* for future submissions. Is this not what real hosts are doing?
On Nov 3 10:18, Jinhao Fan wrote: > at 7:10 PM, Klaus Jensen <its@irrelevant.dk> wrote: > > > This doesn't do what you expect it to. By not updaring the eventidx it > > will fall behind the actual head, causing the host to think that the > > device is not processing events (but it is!), resulting in doorbell > > ringing. > > I’m not sure I understand this correctly. > > In 7.13.1 in NVMe Spec 1.4c it says "If updating an entry in the Shadow > Doorbell buffer **changes** the value from being less than or equal to the > value of the corresponding EventIdx buffer entry to being greater than that > value, then the host shall also update the controller's corresponding > doorbell register to match the value of that entry in the Shadow Doorbell > buffer.” > > So my understanding is that once the eventidx falls behind the actual head, > the host will only ring the doorbell once but *not* for future submissions. > > Is this not what real hosts are doing? I agree that the spec is a little unclear on this point. In any case, in Linux, when the driver has decided that the sq tail must be updated, it will use this check: (new_idx - event_idx - 1) < (new_idx - old) So it doesn't account for if or not eventidx was already behind.
On 11/3/2022 8:10 PM, Klaus Jensen wrote: > I agree that the spec is a little unclear on this point. In any case, in > Linux, when the driver has decided that the sq tail must be updated, > it will use this check: > > (new_idx - event_idx - 1) < (new_idx - old) When eventidx is already behind, it's like: 0 1 <- event_idx 2 <- old 3 <- new_idx 4 . . . In this case, (new_idx - event_idx - 1) = 3-1-1 = 1 >= (new_idx - old) = 3-2=1, so the host won't update sq tail. Where am I wrong in this example? > > So it doesn't account for if or not eventidx was already behind.
On Nov 3 21:19, Jinhao Fan wrote: > On 11/3/2022 8:10 PM, Klaus Jensen wrote: > > I agree that the spec is a little unclear on this point. In any case, in > > Linux, when the driver has decided that the sq tail must be updated, > > it will use this check: > > > > (new_idx - event_idx - 1) < (new_idx - old) > > When eventidx is already behind, it's like: > > 0 > 1 <- event_idx > 2 <- old > 3 <- new_idx > 4 > . > . > . > > In this case, (new_idx - event_idx - 1) = 3-1-1 = 1 >= (new_idx - old) = > 3-2=1, so the host won't update sq tail. Where am I wrong in this example? > That becomes 1 >= 1, i.e. "true". So this will result in the driver doing an mmio doorbell write.
On Fri, Nov 04, 2022 at 07:32:12AM +0100, Klaus Jensen wrote:
> On Nov 3 21:19, Jinhao Fan wrote:
> > On 11/3/2022 8:10 PM, Klaus Jensen wrote:
> > > I agree that the spec is a little unclear on this point. In any case, in
> > > Linux, when the driver has decided that the sq tail must be updated,
> > > it will use this check:
> > >
> > > (new_idx - event_idx - 1) < (new_idx - old)
> >
> > When eventidx is already behind, it's like:
> >
> > 0
> > 1 <- event_idx
> > 2 <- old
> > 3 <- new_idx
> > 4
> > .
> > .
> > .
> >
> > In this case, (new_idx - event_idx - 1) = 3-1-1 = 1 >= (new_idx - old) =
> > 3-2=1, so the host won't update sq tail. Where am I wrong in this example?
>
> That becomes 1 >= 1, i.e. "true". So this will result in the driver
> doing an mmio doorbell write.
The code is:
static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
}
which per the above is "return 1 < 1;", or false. So the above case does *not*
do an mmio write. No?
regards
john
On Nov 8 12:39, John Levon wrote:
> On Fri, Nov 04, 2022 at 07:32:12AM +0100, Klaus Jensen wrote:
>
> > On Nov 3 21:19, Jinhao Fan wrote:
> > > On 11/3/2022 8:10 PM, Klaus Jensen wrote:
> > > > I agree that the spec is a little unclear on this point. In any case, in
> > > > Linux, when the driver has decided that the sq tail must be updated,
> > > > it will use this check:
> > > >
> > > > (new_idx - event_idx - 1) < (new_idx - old)
> > >
> > > When eventidx is already behind, it's like:
> > >
> > > 0
> > > 1 <- event_idx
> > > 2 <- old
> > > 3 <- new_idx
> > > 4
> > > .
> > > .
> > > .
> > >
> > > In this case, (new_idx - event_idx - 1) = 3-1-1 = 1 >= (new_idx - old) =
> > > 3-2=1, so the host won't update sq tail. Where am I wrong in this example?
> >
> > That becomes 1 >= 1, i.e. "true". So this will result in the driver
> > doing an mmio doorbell write.
>
> The code is:
>
> static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
> {
> return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
> }
>
> which per the above is "return 1 < 1;", or false. So the above case does *not*
> do an mmio write. No?
>
Whelp.
Looks like I'm in the wrong here, apologies!
at 10:11 PM, Klaus Jensen <its@irrelevant.dk> wrote:
> On Nov 8 12:39, John Levon wrote:
>> On Fri, Nov 04, 2022 at 07:32:12AM +0100, Klaus Jensen wrote:
>>
>>> On Nov 3 21:19, Jinhao Fan wrote:
>>>> On 11/3/2022 8:10 PM, Klaus Jensen wrote:
>>>>> I agree that the spec is a little unclear on this point. In any case, in
>>>>> Linux, when the driver has decided that the sq tail must be updated,
>>>>> it will use this check:
>>>>>
>>>>> (new_idx - event_idx - 1) < (new_idx - old)
>>>>
>>>> When eventidx is already behind, it's like:
>>>>
>>>> 0
>>>> 1 <- event_idx
>>>> 2 <- old
>>>> 3 <- new_idx
>>>> 4
>>>> .
>>>> .
>>>> .
>>>>
>>>> In this case, (new_idx - event_idx - 1) = 3-1-1 = 1 >= (new_idx - old) =
>>>> 3-2=1, so the host won't update sq tail. Where am I wrong in this example?
>>>
>>> That becomes 1 >= 1, i.e. "true". So this will result in the driver
>>> doing an mmio doorbell write.
>>
>> The code is:
>>
>> static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
>> {
>> return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
>> }
>>
>> which per the above is "return 1 < 1;", or false. So the above case does *not*
>> do an mmio write. No?
>
> Whelp.
>
> Looks like I'm in the wrong here, apologies!
So disabling eventidx updates during polling has the potential of reducing
doorbell writes. But as Klaus observed removing this function does not cause
performance difference. So I guess only one command is processed during each
polling iteration.
© 2016 - 2026 Red Hat, Inc.