[PATCH] hw/nvme: Add iothread support

Jinhao Fan posted 1 patch 1 year, 9 months ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20220720090053.309229-1-fanjinhao21s@ict.ac.cn
Maintainers: Keith Busch <kbusch@kernel.org>, Klaus Jensen <its@irrelevant.dk>
hw/nvme/ctrl.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++----
hw/nvme/ns.c   | 19 +++++++++---
hw/nvme/nvme.h |  6 +++-
3 files changed, 95 insertions(+), 10 deletions(-)
[PATCH] hw/nvme: Add iothread support
Posted by Jinhao Fan 1 year, 9 months ago
Add an option "iothread=x" to do emulation in a separate iothread.
This improves the performance because QEMU's main loop is responsible
for a lot of other work while iothread is dedicated to NVMe emulation.
Moreover, emulating in iothread brings the potential of polling on
SQ/CQ doorbells, which I will bring up in a following patch.

Iothread can be enabled by:
 -object iothread,id=nvme0 \
 -device nvme,iothread=nvme0 \

Performance comparisons (KIOPS):

QD         1   4  16  64
QEMU      41 136 242 338
iothread  53 155 245 309

Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
---
 hw/nvme/ctrl.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++----
 hw/nvme/ns.c   | 19 +++++++++---
 hw/nvme/nvme.h |  6 +++-
 3 files changed, 95 insertions(+), 10 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 533ad14e7a..a4efc97b33 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4257,6 +4257,7 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
 {
     NvmeCtrl *n = cq->ctrl;
     uint16_t offset = (cq->cqid << 3) + (1 << 2);
+    bool in_iothread = !qemu_in_main_thread();
     int ret;
 
     ret = event_notifier_init(&cq->notifier, 0);
@@ -4264,10 +4265,19 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
         return ret;
     }
 
-    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+    if (in_iothread) {
+        qemu_mutex_lock_iothread();
+    }
+
+    aio_set_event_notifier(n->ctx, &cq->notifier, true, nvme_cq_notifier,
+                           NULL, NULL);
     memory_region_add_eventfd(&n->iomem,
                               0x1000 + offset, 4, false, 0, &cq->notifier);
 
+    if (in_iothread) {
+        qemu_mutex_unlock_iothread();
+    }
+    
     return 0;
 }
 
@@ -4284,6 +4294,7 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
 {
     NvmeCtrl *n = sq->ctrl;
     uint16_t offset = sq->sqid << 3;
+    bool in_iothread = !qemu_in_main_thread();
     int ret;
 
     ret = event_notifier_init(&sq->notifier, 0);
@@ -4291,9 +4302,16 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
         return ret;
     }
 
-    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+    if (in_iothread) {
+        qemu_mutex_lock_iothread();
+    }
+    aio_set_event_notifier(n->ctx, &sq->notifier, true, nvme_sq_notifier,
+                           NULL, NULL);
     memory_region_add_eventfd(&n->iomem,
                               0x1000 + offset, 4, false, 0, &sq->notifier);
+    if (in_iothread) {
+        qemu_mutex_unlock_iothread();
+    }
 
     return 0;
 }
@@ -4301,13 +4319,22 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
 {
     uint16_t offset = sq->sqid << 3;
+    bool in_iothread = !qemu_in_main_thread();
 
     n->sq[sq->sqid] = NULL;
     timer_free(sq->timer);
     if (sq->ioeventfd_enabled) {
+        if (in_iothread) {
+            qemu_mutex_lock_iothread();
+        }
+
         memory_region_del_eventfd(&n->iomem,
                                   0x1000 + offset, 4, false, 0, &sq->notifier);
         event_notifier_cleanup(&sq->notifier);
+
+        if (in_iothread) {
+            qemu_mutex_unlock_iothread();
+        }
     }
     g_free(sq->io_req);
     if (sq->sqid) {
@@ -4376,8 +4403,9 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
         sq->io_req[i].sq = sq;
         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
     }
-    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
 
+    sq->timer = aio_timer_new(n->ctx, QEMU_CLOCK_VIRTUAL, SCALE_NS,
+                              nvme_process_sq, sq);
     if (n->dbbuf_enabled) {
         sq->db_addr = n->dbbuf_dbs + (sqid << 3);
         sq->ei_addr = n->dbbuf_eis + (sqid << 3);
@@ -4691,13 +4719,22 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
 {
     uint16_t offset = (cq->cqid << 3) + (1 << 2);
+    bool in_iothread = !qemu_in_main_thread();
 
     n->cq[cq->cqid] = NULL;
     timer_free(cq->timer);
     if (cq->ioeventfd_enabled) {
+        if (in_iothread) {
+            qemu_mutex_lock_iothread();
+        }
+        
         memory_region_del_eventfd(&n->iomem,
                                   0x1000 + offset, 4, false, 0, &cq->notifier);
         event_notifier_cleanup(&cq->notifier);
+        
+        if (in_iothread) {
+            qemu_mutex_unlock_iothread();
+        }
     }
     if (msix_enabled(&n->parent_obj)) {
         msix_vector_unuse(&n->parent_obj, cq->vector);
@@ -4765,7 +4802,8 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
         }
     }
     n->cq[cqid] = cq;
-    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
+    cq->timer = aio_timer_new(n->ctx, QEMU_CLOCK_VIRTUAL, SCALE_NS,
+                              nvme_post_cqes, cq);
 }
 
 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
@@ -6557,6 +6595,7 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
     uint32_t intms = ldl_le_p(&n->bar.intms);
     uint32_t csts = ldl_le_p(&n->bar.csts);
     uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
+    bool in_iothread = !qemu_in_main_thread();
 
     if (unlikely(offset & (sizeof(uint32_t) - 1))) {
         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
@@ -6726,10 +6765,22 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
 
         stl_le_p(&n->bar.pmrctl, data);
         if (NVME_PMRCTL_EN(data)) {
+            if (in_iothread) {
+                qemu_mutex_lock_iothread();
+            }
             memory_region_set_enabled(&n->pmr.dev->mr, true);
+            if (in_iothread) {
+                qemu_mutex_unlock_iothread();
+            }
             pmrsts = 0;
         } else {
+            if (in_iothread) {
+                qemu_mutex_lock_iothread();
+            }
             memory_region_set_enabled(&n->pmr.dev->mr, false);
+            if (in_iothread) {
+                qemu_mutex_unlock_iothread();
+            }
             NVME_PMRSTS_SET_NRDY(pmrsts, 1);
             n->pmr.cmse = false;
         }
@@ -7528,6 +7579,14 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     if (pci_is_vf(&n->parent_obj) && !sctrl->scs) {
         stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
     }
+
+    if (n->params.iothread) {
+        n->iothread = n->params.iothread;
+        object_ref(OBJECT(n->iothread));
+        n->ctx = iothread_get_aio_context(n->iothread);
+    } else {
+        n->ctx = qemu_get_aio_context();
+    }
 }
 
 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
@@ -7600,7 +7659,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
         ns = &n->namespace;
         ns->params.nsid = 1;
 
-        if (nvme_ns_setup(ns, errp)) {
+        if (nvme_ns_setup(n, ns, errp)) {
             return;
         }
 
@@ -7631,6 +7690,15 @@ static void nvme_exit(PCIDevice *pci_dev)
     g_free(n->sq);
     g_free(n->aer_reqs);
 
+    aio_context_acquire(n->ctx);
+    blk_set_aio_context(n->namespace.blkconf.blk, qemu_get_aio_context(), NULL);
+    aio_context_release(n->ctx);
+
+    if (n->iothread) {
+        object_unref(OBJECT(n->iothread));
+        n->iothread = NULL;
+    }
+
     if (n->params.cmb_size_mb) {
         g_free(n->cmb.buf);
     }
@@ -7665,6 +7733,8 @@ static Property nvme_props[] = {
     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
     DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
     DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
+    DEFINE_PROP_LINK("iothread", NvmeCtrl, params.iothread, TYPE_IOTHREAD,
+                     IOThread *),
     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                      params.auto_transition_zones, true),
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 62a1f97be0..683d0d7a3b 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -146,8 +146,10 @@ lbaf_found:
     return 0;
 }
 
-static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
+static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
 {
+    AioContext *old_context;
+    int ret;
     bool read_only;
 
     if (!blkconf_blocksizes(&ns->blkconf, errp)) {
@@ -170,6 +172,15 @@ static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
+    old_context = blk_get_aio_context(ns->blkconf.blk);
+    aio_context_acquire(old_context);
+    ret = blk_set_aio_context(ns->blkconf.blk, n->ctx, errp);
+    aio_context_release(old_context);
+
+    if (ret) {
+        error_setg(errp, "Set AioContext on BlockBackend failed");
+    }
+
     return 0;
 }
 
@@ -482,13 +493,13 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
     return 0;
 }
 
-int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
+int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
 {
     if (nvme_ns_check_constraints(ns, errp)) {
         return -1;
     }
 
-    if (nvme_ns_init_blk(ns, errp)) {
+    if (nvme_ns_init_blk(n, ns, errp)) {
         return -1;
     }
 
@@ -563,7 +574,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
         }
     }
 
-    if (nvme_ns_setup(ns, errp)) {
+    if (nvme_ns_setup(n, ns, errp)) {
         return;
     }
 
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 79f5c281c2..85e2aa7b29 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -21,6 +21,7 @@
 #include "qemu/uuid.h"
 #include "hw/pci/pci.h"
 #include "hw/block/block.h"
+#include "sysemu/iothread.h"
 
 #include "block/nvme.h"
 
@@ -275,7 +276,7 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns)
 }
 
 void nvme_ns_init_format(NvmeNamespace *ns);
-int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
+int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
 void nvme_ns_cleanup(NvmeNamespace *ns);
@@ -427,6 +428,7 @@ typedef struct NvmeParams {
     uint16_t sriov_vi_flexible;
     uint8_t  sriov_max_vq_per_vf;
     uint8_t  sriov_max_vi_per_vf;
+    IOThread *iothread;
 } NvmeParams;
 
 typedef struct NvmeCtrl {
@@ -458,6 +460,8 @@ typedef struct NvmeCtrl {
     uint64_t    dbbuf_dbs;
     uint64_t    dbbuf_eis;
     bool        dbbuf_enabled;
+    IOThread    *iothread;
+    AioContext  *ctx;
 
     struct {
         MemoryRegion mem;
-- 
2.25.1
Re: [PATCH] hw/nvme: Add iothread support
Posted by Klaus Jensen 1 year, 7 months ago
On Jul 20 17:00, Jinhao Fan wrote:
> Add an option "iothread=x" to do emulation in a seperate iothread.
> This improves the performance because QEMU's main loop is responsible
> for a lot of other work while iothread is dedicated to NVMe emulation.
> Moreover, emulating in iothread brings the potential of polling on
> SQ/CQ doorbells, which I will bring up in a following patch.
> 
> Iothread can be enabled by:
>  -object iothread,id=nvme0 \
>  -device nvme,iothread=nvme0 \
> 
> Performance comparisons (KIOPS):
> 
> QD         1   4  16  64
> QEMU      41 136 242 338
> iothread  53 155 245 309
> 
> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
> ---
>  hw/nvme/ctrl.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++----
>  hw/nvme/ns.c   | 19 +++++++++---
>  hw/nvme/nvme.h |  6 +++-
>  3 files changed, 95 insertions(+), 10 deletions(-)
> 

Jinhao,

Are you gonna respin this based on the irqfd patches? I suggest you just
add this work on top and post a series that is irqfd+iothread. Then, if
we find the irqfd ready for merge, we can pick that up for the next
release cycle early and continue on iothread work.
Re: [PATCH] hw/nvme: Add iothread support
Posted by Jinhao Fan 1 year, 7 months ago
Sure. I’ve already reworked this iothread patch upon the new irqfd patch. I think I can post a v2 patch today. Do you mean I include irqfd v3 in the new iothread patch series?

发自我的iPhone

> 在 2022年8月26日,15:12,Klaus Jensen <its@irrelevant.dk> 写道:
> 
> On Jul 20 17:00, Jinhao Fan wrote:
>> Add an option "iothread=x" to do emulation in a seperate iothread.
>> This improves the performance because QEMU's main loop is responsible
>> for a lot of other work while iothread is dedicated to NVMe emulation.
>> Moreover, emulating in iothread brings the potential of polling on
>> SQ/CQ doorbells, which I will bring up in a following patch.
>> 
>> Iothread can be enabled by:
>> -object iothread,id=nvme0 \
>> -device nvme,iothread=nvme0 \
>> 
>> Performance comparisons (KIOPS):
>> 
>> QD         1   4  16  64
>> QEMU      41 136 242 338
>> iothread  53 155 245 309
>> 
>> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
>> ---
>> hw/nvme/ctrl.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++----
>> hw/nvme/ns.c   | 19 +++++++++---
>> hw/nvme/nvme.h |  6 +++-
>> 3 files changed, 95 insertions(+), 10 deletions(-)
>> 
> 
> Jinhao,
> 
> Are you gonna respin this based on the irqfd patches? I suggest you just
> add this work on top and post a series that is irqfd+iothread. Then, if
> we find the irqfd ready for merge, we can pick that up for the next
> release cycle early and continue on iothread work.
-----BEGIN PGP SIGNATURE-----

iQEzBAEBCAAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmMIchkACgkQTeGvMW1P
DemFzQgAq7T7IoD01Sd+UZ5eD3TkAusWJ9R5q06UGat2fUMtlsClowml+QEbzGOI
R6L/9RP5ejxyhHCP83gCl0tAvUh2grbe+7W6QxeIZ1EKS8rOrdRyM5lysVyFkotH
PRswnOCBUMGMcqy9dvStGFjglCMAZdyyxXzWZHEf7sVUadrVoP63H7jUqvfOlMKN
PVaOX7wHvmDzB4KwFEBZXTHjQGlvvTfpAmtTAqnvOfNVxC61QPesWmMC//MuOLKt
k6oA+oDP5AMe3GpGaFZSSdsDKobBq2ZFGCJlQUBhlSWQAoCOkr8bTJpOMEx7R3Sf
42feTthla0bRaHGkfB4nwcmgXjrnSg==
=g/n7
-----END PGP SIGNATURE-----
Re: [PATCH] hw/nvme: Add iothread support
Posted by Klaus Jensen 1 year, 7 months ago
On Aug 26 16:15, Jinhao Fan wrote:
> Sure. I’ve already reworked this iothread patch upon the new irqfd
> patch. I think I can post a v2 patch today. Do you mean I include
> irqfd v3 in the new iothread patch series?
> 

Yes, please include irqfd-v3 in the series.
Re: [PATCH] hw/nvme: Add iothread support
Posted by Jinhao Fan 1 year, 8 months ago
at 5:00 PM, Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:

> Add an option "iothread=x" to do emulation in a seperate iothread.
> This improves the performance because QEMU's main loop is responsible
> for a lot of other work while iothread is dedicated to NVMe emulation.
> Moreover, emulating in iothread brings the potential of polling on
> SQ/CQ doorbells, which I will bring up in a following patch.
> 
> Iothread can be enabled by:
> -object iothread,id=nvme0 \
> -device nvme,iothread=nvme0 \
> 
> Performance comparisons (KIOPS):
> 
> QD         1   4  16  64
> QEMU      41 136 242 338
> iothread  53 155 245 309
> 
> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
> ---
> hw/nvme/ctrl.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++----
> hw/nvme/ns.c   | 19 +++++++++---
> hw/nvme/nvme.h |  6 +++-
> 3 files changed, 95 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index 533ad14e7a..a4efc97b33 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -4257,6 +4257,7 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
> {
>     NvmeCtrl *n = cq->ctrl;
>     uint16_t offset = (cq->cqid << 3) + (1 << 2);
> +    bool in_iothread = !qemu_in_main_thread();
>     int ret;
> 
>     ret = event_notifier_init(&cq->notifier, 0);
> @@ -4264,10 +4265,19 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
>         return ret;
>     }
> 
> -    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> +    if (in_iothread) {
> +        qemu_mutex_lock_iothread();
> +    }
> +
> +    aio_set_event_notifier(n->ctx, &cq->notifier, true, nvme_cq_notifier,
> +                           NULL, NULL);
>     memory_region_add_eventfd(&n->iomem,
>                               0x1000 + offset, 4, false, 0, &cq->notifier);
> 
> +    if (in_iothread) {
> +        qemu_mutex_unlock_iothread();
> +    }
> +    
>     return 0;
> }
> 
> @@ -4284,6 +4294,7 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
> {
>     NvmeCtrl *n = sq->ctrl;
>     uint16_t offset = sq->sqid << 3;
> +    bool in_iothread = !qemu_in_main_thread();
>     int ret;
> 
>     ret = event_notifier_init(&sq->notifier, 0);
> @@ -4291,9 +4302,16 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
>         return ret;
>     }
> 
> -    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> +    if (in_iothread) {
> +        qemu_mutex_lock_iothread();
> +    }
> +    aio_set_event_notifier(n->ctx, &sq->notifier, true, nvme_sq_notifier,
> +                           NULL, NULL);
>     memory_region_add_eventfd(&n->iomem,
>                               0x1000 + offset, 4, false, 0, &sq->notifier);
> +    if (in_iothread) {
> +        qemu_mutex_unlock_iothread();
> +    }
> 
>     return 0;
> }
> @@ -4301,13 +4319,22 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
> static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
> {
>     uint16_t offset = sq->sqid << 3;
> +    bool in_iothread = !qemu_in_main_thread();
> 
>     n->sq[sq->sqid] = NULL;
>     timer_free(sq->timer);
>     if (sq->ioeventfd_enabled) {
> +        if (in_iothread) {
> +            qemu_mutex_lock_iothread();
> +        }
> +
>         memory_region_del_eventfd(&n->iomem,
>                                   0x1000 + offset, 4, false, 0, &sq->notifier);
>         event_notifier_cleanup(&sq->notifier);
> +
> +        if (in_iothread) {
> +            qemu_mutex_unlock_iothread();
> +        }
>     }
>     g_free(sq->io_req);
>     if (sq->sqid) {
> @@ -4376,8 +4403,9 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>         sq->io_req[i].sq = sq;
>         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
>     }
> -    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
> 
> +    sq->timer = aio_timer_new(n->ctx, QEMU_CLOCK_VIRTUAL, SCALE_NS,
> +                              nvme_process_sq, sq);
>     if (n->dbbuf_enabled) {
>         sq->db_addr = n->dbbuf_dbs + (sqid << 3);
>         sq->ei_addr = n->dbbuf_eis + (sqid << 3);
> @@ -4691,13 +4719,22 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
> static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
> {
>     uint16_t offset = (cq->cqid << 3) + (1 << 2);
> +    bool in_iothread = !qemu_in_main_thread();
> 
>     n->cq[cq->cqid] = NULL;
>     timer_free(cq->timer);
>     if (cq->ioeventfd_enabled) {
> +        if (in_iothread) {
> +            qemu_mutex_lock_iothread();
> +        }
> +        
>         memory_region_del_eventfd(&n->iomem,
>                                   0x1000 + offset, 4, false, 0, &cq->notifier);
>         event_notifier_cleanup(&cq->notifier);
> +        
> +        if (in_iothread) {
> +            qemu_mutex_unlock_iothread();
> +        }
>     }
>     if (msix_enabled(&n->parent_obj)) {
>         msix_vector_unuse(&n->parent_obj, cq->vector);
> @@ -4765,7 +4802,8 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>         }
>     }
>     n->cq[cqid] = cq;
> -    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
> +    cq->timer = aio_timer_new(n->ctx, QEMU_CLOCK_VIRTUAL, SCALE_NS,
> +                              nvme_post_cqes, cq);
> }
> 
> static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
> @@ -6557,6 +6595,7 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
>     uint32_t intms = ldl_le_p(&n->bar.intms);
>     uint32_t csts = ldl_le_p(&n->bar.csts);
>     uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
> +    bool in_iothread = !qemu_in_main_thread();
> 
>     if (unlikely(offset & (sizeof(uint32_t) - 1))) {
>         NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
> @@ -6726,10 +6765,22 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
> 
>         stl_le_p(&n->bar.pmrctl, data);
>         if (NVME_PMRCTL_EN(data)) {
> +            if (in_iothread) {
> +                qemu_mutex_lock_iothread();
> +            }
>             memory_region_set_enabled(&n->pmr.dev->mr, true);
> +            if (in_iothread) {
> +                qemu_mutex_unlock_iothread();
> +            }
>             pmrsts = 0;
>         } else {
> +            if (in_iothread) {
> +                qemu_mutex_lock_iothread();
> +            }
>             memory_region_set_enabled(&n->pmr.dev->mr, false);
> +            if (in_iothread) {
> +                qemu_mutex_unlock_iothread();
> +            }
>             NVME_PMRSTS_SET_NRDY(pmrsts, 1);
>             n->pmr.cmse = false;
>         }
> @@ -7528,6 +7579,14 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
>     if (pci_is_vf(&n->parent_obj) && !sctrl->scs) {
>         stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
>     }
> +
> +    if (n->params.iothread) {
> +        n->iothread = n->params.iothread;
> +        object_ref(OBJECT(n->iothread));
> +        n->ctx = iothread_get_aio_context(n->iothread);
> +    } else {
> +        n->ctx = qemu_get_aio_context();
> +    }
> }
> 
> static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
> @@ -7600,7 +7659,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
>         ns = &n->namespace;
>         ns->params.nsid = 1;
> 
> -        if (nvme_ns_setup(ns, errp)) {
> +        if (nvme_ns_setup(n, ns, errp)) {
>             return;
>         }
> 
> @@ -7631,6 +7690,15 @@ static void nvme_exit(PCIDevice *pci_dev)
>     g_free(n->sq);
>     g_free(n->aer_reqs);
> 
> +    aio_context_acquire(n->ctx);
> +    blk_set_aio_context(n->namespace.blkconf.blk, qemu_get_aio_context(), NULL);
> +    aio_context_release(n->ctx);
> +
> +    if (n->iothread) {
> +        object_unref(OBJECT(n->iothread));
> +        n->iothread = NULL;
> +    }
> +
>     if (n->params.cmb_size_mb) {
>         g_free(n->cmb.buf);
>     }
> @@ -7665,6 +7733,8 @@ static Property nvme_props[] = {
>     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
>     DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
>     DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
> +    DEFINE_PROP_LINK("iothread", NvmeCtrl, params.iothread, TYPE_IOTHREAD,
> +                     IOThread *),
>     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
>     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
>                      params.auto_transition_zones, true),
> diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
> index 62a1f97be0..683d0d7a3b 100644
> --- a/hw/nvme/ns.c
> +++ b/hw/nvme/ns.c
> @@ -146,8 +146,10 @@ lbaf_found:
>     return 0;
> }
> 
> -static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
> +static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> {
> +    AioContext *old_context;
> +    int ret;
>     bool read_only;
> 
>     if (!blkconf_blocksizes(&ns->blkconf, errp)) {
> @@ -170,6 +172,15 @@ static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
>         return -1;
>     }
> 
> +    old_context = blk_get_aio_context(ns->blkconf.blk);
> +    aio_context_acquire(old_context);
> +    ret = blk_set_aio_context(ns->blkconf.blk, n->ctx, errp);
> +    aio_context_release(old_context);
> +
> +    if (ret) {
> +        error_setg(errp, "Set AioContext on BlockBackend failed");
> +    }
> +
>     return 0;
> }
> 
> @@ -482,13 +493,13 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
>     return 0;
> }
> 
> -int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
> +int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> {
>     if (nvme_ns_check_constraints(ns, errp)) {
>         return -1;
>     }
> 
> -    if (nvme_ns_init_blk(ns, errp)) {
> +    if (nvme_ns_init_blk(n, ns, errp)) {
>         return -1;
>     }
> 
> @@ -563,7 +574,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
>         }
>     }
> 
> -    if (nvme_ns_setup(ns, errp)) {
> +    if (nvme_ns_setup(n, ns, errp)) {
>         return;
>     }
> 
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 79f5c281c2..85e2aa7b29 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -21,6 +21,7 @@
> #include "qemu/uuid.h"
> #include "hw/pci/pci.h"
> #include "hw/block/block.h"
> +#include "sysemu/iothread.h"
> 
> #include "block/nvme.h"
> 
> @@ -275,7 +276,7 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns)
> }
> 
> void nvme_ns_init_format(NvmeNamespace *ns);
> -int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
> +int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
> void nvme_ns_drain(NvmeNamespace *ns);
> void nvme_ns_shutdown(NvmeNamespace *ns);
> void nvme_ns_cleanup(NvmeNamespace *ns);
> @@ -427,6 +428,7 @@ typedef struct NvmeParams {
>     uint16_t sriov_vi_flexible;
>     uint8_t  sriov_max_vq_per_vf;
>     uint8_t  sriov_max_vi_per_vf;
> +    IOThread *iothread;
> } NvmeParams;
> 
> typedef struct NvmeCtrl {
> @@ -458,6 +460,8 @@ typedef struct NvmeCtrl {
>     uint64_t    dbbuf_dbs;
>     uint64_t    dbbuf_eis;
>     bool        dbbuf_enabled;
> +    IOThread    *iothread;
> +    AioContext  *ctx;
> 
>     struct {
>         MemoryRegion mem;
> -- 
> 2.25.1

Hi Klaus and Keith,

I just added support for interrupt masking. How can I test interrupt
masking?

Thanks,
Jinhao Fan
Re: [PATCH] hw/nvme: Add iothread support
Posted by Klaus Jensen 1 year, 8 months ago
On Jul 26 16:55, Jinhao Fan wrote:
> 
> Hi Klaus and Keith,
> 
> I just added support for interrupt masking. How can I test interrupt
> masking?
> 

I wondered if this might be possible to test this with a user-space
VFIO-based driver, but VFIO does not export masking/unmasking on
MSI/MSI-X through the SET_IRQ interface and disallows reading and
writing in the MSI-X table area :(
Re: [PATCH] hw/nvme: Add iothread support
Posted by Keith Busch 1 year, 8 months ago
On Tue, Jul 26, 2022 at 04:55:54PM +0800, Jinhao Fan wrote:
> 
> Hi Klaus and Keith,
> 
> I just added support for interrupt masking. How can I test interrupt
> masking?

Are you asking about MSI masking? I don't think any drivers are using the
feature, so you'd need to modify one to test it. I can give you some pointers
if you are asking about MSI.
Re: [PATCH] hw/nvme: Add iothread support
Posted by Jinhao Fan 1 year, 8 months ago
at 10:45 PM, Keith Busch <kbusch@kernel.org> wrote:

> On Tue, Jul 26, 2022 at 04:55:54PM +0800, Jinhao Fan wrote:
>> Hi Klaus and Keith,
>> 
>> I just added support for interrupt masking. How can I test interrupt
>> masking?
> 
> Are you asking about MSI masking? I don't think any drivers are using the
> feature, so you'd need to modify one to test it. I can give you some pointers
> if you are asking about MSI.

Thanks Keith,

Do I understand correctly: qemu-nvme only supports MSI-X and pin-based
interrupts. So MSI masking here is equivalent to MSI-X masking.

If the above is correct, I think I am asking about MSI masking.

BTW, a double check on ctrl.c seems to show that we only support disabling
interrupt at CQ creation time, which is recorded in the cq->irq_enabled.
This is different from my prior understanding that interrupts can be
disabled at runtime by a call like Linux irq_save(). Therefore I doubt
whether qemu-nvme supported "interrupt masking" before. How do you
understand qemu-nvme’s interrupt masking support?

Jinhao
Re: [PATCH] hw/nvme: Add iothread support
Posted by Keith Busch 1 year, 8 months ago
On Tue, Jul 26, 2022 at 11:32:57PM +0800, Jinhao Fan wrote:
> at 10:45 PM, Keith Busch <kbusch@kernel.org> wrote:
> 
> > On Tue, Jul 26, 2022 at 04:55:54PM +0800, Jinhao Fan wrote:
> >> Hi Klaus and Keith,
> >> 
> >> I just added support for interrupt masking. How can I test interrupt
> >> masking?
> > 
> > Are you asking about MSI masking? I don't think any drivers are using the
> > feature, so you'd need to modify one to test it. I can give you some pointers
> > if you are asking about MSI.
> 
> Thanks Keith,
> 
> Do I understand correctly: qemu-nvme only supports MSI-X and pin-based
> interrupts. So MSI masking here is equivalent with MSI-X masking.

It looks like qemu's nvme only supports MSI-x. I could have sworn it used to
support MSI, but I must be thinking of an unofficial fork.

I was mainly asking about MSI masking as it relates to nvme controller's
IVMS/IVMC registers. I don't think any driver is making use of these, and those
don't apply to MSI-x; just MSI and legacy.

At the PCIe level, masking MSI vectors is in Config space. Writing to Config
space is too slow to do per-interrupt, so NVMe created the IVMS/IVMC registers
to get around that.

> If the above is correct, I think I am asking about MSI masking.
> 
> BTW, a double check on ctrl.c seems to show that we only support disabling
> interrupt at CQ creation time, which is recorded in the cq->irq_enabled.
> This is different from my prior understanding that interrupts can be
> disabled at runtime by a call like Linux irq_save(). Therefore I doubt
> whether qemu-nvme supported "interrupt masking" before. How do you
> understand qemu-nvme’s interrupt masking support?

MSI-x uses MMIO for masking, so there's no need for an NVMe specific way to
mask these interrupts. You can just use the generic PCIe methods to clear the
vector's enable bit. But no NVMe driver that I know of is making use of these
either, though it should be possible to make linux start doing that.
 
The CQ irq_enabled field is there so the user can create a pure polling queue.
That's a fixed property of the queue that can't be re-enabled without
destroying and recreating.

The linux irq_save only disables local CPU interrupts from most sources. All
pci devices are unaware of this and can still send their interrupt messages to
the CPU, but the CPU won't context switch to the irq handler until after
irqrestore is called.

Re: [PATCH] hw/nvme: Add iothread support
Posted by Jinhao Fan 1 year, 8 months ago
at 2:07 AM, Keith Busch <kbusch@kernel.org> wrote:

> MSI-x uses MMIO for masking, so there's no need for an NVMe specific way to
> mask these interrupts. You can just use the generic PCIe methods to clear the
> vector's enable bit. But no NVMe driver that I know of is making use of these
> either, though it should be possible to make linux start doing that.

I believe we need to handle MSI-x masking in the NVMe controller after we
switch to irqfd. Before that QEMU’s MSI-x emulation logic helps us handle
masked interrupts. But with irqfd, we bypass QEMU’s MSI-x and let the kernel
send the interrupt directly. Therefore qemu-nvme needs to do some
bookkeeping about which interrupt vectors are masked.
msix_set_vector_notifiers helps us do that.
Re: [PATCH] hw/nvme: Add iothread support
Posted by Jinhao Fan 1 year, 8 months ago
at 10:56 AM, Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote:

> at 2:07 AM, Keith Busch <kbusch@kernel.org> wrote:
> 
>> MSI-x uses MMIO for masking, so there's no need for an NVMe specific way to
>> mask these interrupts. You can just use the generic PCIe methods to clear the
>> vector's enable bit. But no NVMe driver that I know of is making use of these
>> either, though it should be possible to make linux start doing that.
> 
> I believe we need to handle MSI-x masking in the NVMe controller after we
> switch to irqfd. Before that QEMU’s MSI-x emulation logic helps us handle
> masked interrupts. But with irqfd, we bypass QEMU’s MSI-x and let the kernel
> send the interrupt directly. Therefore qemu-nvme needs to do some
> bookkeeping about which interrupt vectors are masked.
> msix_set_vector_notifiers helps us do that.

But as no NVMe driver makes use of MSI-x interrupt masking, is it OK if we
just leave MSI-x masking support broken when irqfd is enabled?