hw/nvme/ctrl.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++---- hw/nvme/ns.c | 19 +++++++++--- hw/nvme/nvme.h | 6 +++- 3 files changed, 95 insertions(+), 10 deletions(-)
Add an option "iothread=x" to do emulation in a separate iothread.
This improves the performance because QEMU's main loop is responsible
for a lot of other work while iothread is dedicated to NVMe emulation.
Moreover, emulating in an iothread brings the potential of polling on
SQ/CQ doorbells, which I will bring up in a follow-up patch.
Iothread can be enabled by:
-object iothread,id=nvme0 \
-device nvme,iothread=nvme0 \
Performance comparisons (KIOPS):
QD 1 4 16 64
QEMU 41 136 242 338
iothread 53 155 245 309
Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
---
hw/nvme/ctrl.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++----
hw/nvme/ns.c | 19 +++++++++---
hw/nvme/nvme.h | 6 +++-
3 files changed, 95 insertions(+), 10 deletions(-)
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 533ad14e7a..a4efc97b33 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4257,6 +4257,7 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
{
NvmeCtrl *n = cq->ctrl;
uint16_t offset = (cq->cqid << 3) + (1 << 2);
+ bool in_iothread = !qemu_in_main_thread();
int ret;
ret = event_notifier_init(&cq->notifier, 0);
@@ -4264,10 +4265,19 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
return ret;
}
- event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+ if (in_iothread) {
+ qemu_mutex_lock_iothread();
+ }
+
+ aio_set_event_notifier(n->ctx, &cq->notifier, true, nvme_cq_notifier,
+ NULL, NULL);
memory_region_add_eventfd(&n->iomem,
0x1000 + offset, 4, false, 0, &cq->notifier);
+ if (in_iothread) {
+ qemu_mutex_unlock_iothread();
+ }
+
return 0;
}
@@ -4284,6 +4294,7 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
{
NvmeCtrl *n = sq->ctrl;
uint16_t offset = sq->sqid << 3;
+ bool in_iothread = !qemu_in_main_thread();
int ret;
ret = event_notifier_init(&sq->notifier, 0);
@@ -4291,9 +4302,16 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
return ret;
}
- event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+ if (in_iothread) {
+ qemu_mutex_lock_iothread();
+ }
+ aio_set_event_notifier(n->ctx, &sq->notifier, true, nvme_sq_notifier,
+ NULL, NULL);
memory_region_add_eventfd(&n->iomem,
0x1000 + offset, 4, false, 0, &sq->notifier);
+ if (in_iothread) {
+ qemu_mutex_unlock_iothread();
+ }
return 0;
}
@@ -4301,13 +4319,22 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
{
uint16_t offset = sq->sqid << 3;
+ bool in_iothread = !qemu_in_main_thread();
n->sq[sq->sqid] = NULL;
timer_free(sq->timer);
if (sq->ioeventfd_enabled) {
+ if (in_iothread) {
+ qemu_mutex_lock_iothread();
+ }
+
memory_region_del_eventfd(&n->iomem,
0x1000 + offset, 4, false, 0, &sq->notifier);
event_notifier_cleanup(&sq->notifier);
+
+ if (in_iothread) {
+ qemu_mutex_unlock_iothread();
+ }
}
g_free(sq->io_req);
if (sq->sqid) {
@@ -4376,8 +4403,9 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
sq->io_req[i].sq = sq;
QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
}
- sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
+ sq->timer = aio_timer_new(n->ctx, QEMU_CLOCK_VIRTUAL, SCALE_NS,
+ nvme_process_sq, sq);
if (n->dbbuf_enabled) {
sq->db_addr = n->dbbuf_dbs + (sqid << 3);
sq->ei_addr = n->dbbuf_eis + (sqid << 3);
@@ -4691,13 +4719,22 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
uint16_t offset = (cq->cqid << 3) + (1 << 2);
+ bool in_iothread = !qemu_in_main_thread();
n->cq[cq->cqid] = NULL;
timer_free(cq->timer);
if (cq->ioeventfd_enabled) {
+ if (in_iothread) {
+ qemu_mutex_lock_iothread();
+ }
+
memory_region_del_eventfd(&n->iomem,
0x1000 + offset, 4, false, 0, &cq->notifier);
event_notifier_cleanup(&cq->notifier);
+
+ if (in_iothread) {
+ qemu_mutex_unlock_iothread();
+ }
}
if (msix_enabled(&n->parent_obj)) {
msix_vector_unuse(&n->parent_obj, cq->vector);
@@ -4765,7 +4802,8 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
}
}
n->cq[cqid] = cq;
- cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
+ cq->timer = aio_timer_new(n->ctx, QEMU_CLOCK_VIRTUAL, SCALE_NS,
+ nvme_post_cqes, cq);
}
static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
@@ -6557,6 +6595,7 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
uint32_t intms = ldl_le_p(&n->bar.intms);
uint32_t csts = ldl_le_p(&n->bar.csts);
uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
+ bool in_iothread = !qemu_in_main_thread();
if (unlikely(offset & (sizeof(uint32_t) - 1))) {
NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
@@ -6726,10 +6765,22 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
stl_le_p(&n->bar.pmrctl, data);
if (NVME_PMRCTL_EN(data)) {
+ if (in_iothread) {
+ qemu_mutex_lock_iothread();
+ }
memory_region_set_enabled(&n->pmr.dev->mr, true);
+ if (in_iothread) {
+ qemu_mutex_unlock_iothread();
+ }
pmrsts = 0;
} else {
+ if (in_iothread) {
+ qemu_mutex_lock_iothread();
+ }
memory_region_set_enabled(&n->pmr.dev->mr, false);
+ if (in_iothread) {
+ qemu_mutex_unlock_iothread();
+ }
NVME_PMRSTS_SET_NRDY(pmrsts, 1);
n->pmr.cmse = false;
}
@@ -7528,6 +7579,14 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
if (pci_is_vf(&n->parent_obj) && !sctrl->scs) {
stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
}
+
+ if (n->params.iothread) {
+ n->iothread = n->params.iothread;
+ object_ref(OBJECT(n->iothread));
+ n->ctx = iothread_get_aio_context(n->iothread);
+ } else {
+ n->ctx = qemu_get_aio_context();
+ }
}
static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
@@ -7600,7 +7659,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
ns = &n->namespace;
ns->params.nsid = 1;
- if (nvme_ns_setup(ns, errp)) {
+ if (nvme_ns_setup(n, ns, errp)) {
return;
}
@@ -7631,6 +7690,15 @@ static void nvme_exit(PCIDevice *pci_dev)
g_free(n->sq);
g_free(n->aer_reqs);
+ aio_context_acquire(n->ctx);
+ blk_set_aio_context(n->namespace.blkconf.blk, qemu_get_aio_context(), NULL);
+ aio_context_release(n->ctx);
+
+ if (n->iothread) {
+ object_unref(OBJECT(n->iothread));
+ n->iothread = NULL;
+ }
+
if (n->params.cmb_size_mb) {
g_free(n->cmb.buf);
}
@@ -7665,6 +7733,8 @@ static Property nvme_props[] = {
DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
+ DEFINE_PROP_LINK("iothread", NvmeCtrl, params.iothread, TYPE_IOTHREAD,
+ IOThread *),
DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
params.auto_transition_zones, true),
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 62a1f97be0..683d0d7a3b 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -146,8 +146,10 @@ lbaf_found:
return 0;
}
-static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
+static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
{
+ AioContext *old_context;
+ int ret;
bool read_only;
if (!blkconf_blocksizes(&ns->blkconf, errp)) {
@@ -170,6 +172,15 @@ static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
return -1;
}
+ old_context = blk_get_aio_context(ns->blkconf.blk);
+ aio_context_acquire(old_context);
+ ret = blk_set_aio_context(ns->blkconf.blk, n->ctx, errp);
+ aio_context_release(old_context);
+
+ if (ret) {
+ error_setg(errp, "Set AioContext on BlockBackend failed");
+ }
+
return 0;
}
@@ -482,13 +493,13 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
return 0;
}
-int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
+int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
{
if (nvme_ns_check_constraints(ns, errp)) {
return -1;
}
- if (nvme_ns_init_blk(ns, errp)) {
+ if (nvme_ns_init_blk(n, ns, errp)) {
return -1;
}
@@ -563,7 +574,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
}
}
- if (nvme_ns_setup(ns, errp)) {
+ if (nvme_ns_setup(n, ns, errp)) {
return;
}
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 79f5c281c2..85e2aa7b29 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -21,6 +21,7 @@
#include "qemu/uuid.h"
#include "hw/pci/pci.h"
#include "hw/block/block.h"
+#include "sysemu/iothread.h"
#include "block/nvme.h"
@@ -275,7 +276,7 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns)
}
void nvme_ns_init_format(NvmeNamespace *ns);
-int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
+int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
void nvme_ns_drain(NvmeNamespace *ns);
void nvme_ns_shutdown(NvmeNamespace *ns);
void nvme_ns_cleanup(NvmeNamespace *ns);
@@ -427,6 +428,7 @@ typedef struct NvmeParams {
uint16_t sriov_vi_flexible;
uint8_t sriov_max_vq_per_vf;
uint8_t sriov_max_vi_per_vf;
+ IOThread *iothread;
} NvmeParams;
typedef struct NvmeCtrl {
@@ -458,6 +460,8 @@ typedef struct NvmeCtrl {
uint64_t dbbuf_dbs;
uint64_t dbbuf_eis;
bool dbbuf_enabled;
+ IOThread *iothread;
+ AioContext *ctx;
struct {
MemoryRegion mem;
--
2.25.1
On Jul 20 17:00, Jinhao Fan wrote: > Add an option "iothread=x" to do emulation in a seperate iothread. > This improves the performance because QEMU's main loop is responsible > for a lot of other work while iothread is dedicated to NVMe emulation. > Moreover, emulating in iothread brings the potential of polling on > SQ/CQ doorbells, which I will bring up in a following patch. > > Iothread can be enabled by: > -object iothread,id=nvme0 \ > -device nvme,iothread=nvme0 \ > > Performance comparisons (KIOPS): > > QD 1 4 16 64 > QEMU 41 136 242 338 > iothread 53 155 245 309 > > Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn> > --- > hw/nvme/ctrl.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++---- > hw/nvme/ns.c | 19 +++++++++--- > hw/nvme/nvme.h | 6 +++- > 3 files changed, 95 insertions(+), 10 deletions(-) > Jinhao, Are you gonna respin this based on the irqfd patches? I suggest you just add this work on top and post a series that is irqfd+iothread. Then, if we find the irqfd ready for merge, we can pick that up for the next release cycle early and continue on iothread work.
Sure. I’ve already reworked this iothread patch upon the new irqfd patch. I think I can post a v2 patch today. Do you mean I include irqfd v3 in the new iothread patch series? 发自我的iPhone > 在 2022年8月26日,15:12,Klaus Jensen <its@irrelevant.dk> 写道: > > On Jul 20 17:00, Jinhao Fan wrote: >> Add an option "iothread=x" to do emulation in a seperate iothread. >> This improves the performance because QEMU's main loop is responsible >> for a lot of other work while iothread is dedicated to NVMe emulation. >> Moreover, emulating in iothread brings the potential of polling on >> SQ/CQ doorbells, which I will bring up in a following patch. >> >> Iothread can be enabled by: >> -object iothread,id=nvme0 \ >> -device nvme,iothread=nvme0 \ >> >> Performance comparisons (KIOPS): >> >> QD 1 4 16 64 >> QEMU 41 136 242 338 >> iothread 53 155 245 309 >> >> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn> >> --- >> hw/nvme/ctrl.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++---- >> hw/nvme/ns.c | 19 +++++++++--- >> hw/nvme/nvme.h | 6 +++- >> 3 files changed, 95 insertions(+), 10 deletions(-) >> > > Jinhao, > > Are you gonna respin this based on the irqfd patches? I suggest you just > add this work on top and post a series that is irqfd+iothread. Then, if > we find the irqfd ready for merge, we can pick that up for the next > release cycle early and continue on iothread work. -----BEGIN PGP SIGNATURE----- iQEzBAEBCAAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmMIchkACgkQTeGvMW1P DemFzQgAq7T7IoD01Sd+UZ5eD3TkAusWJ9R5q06UGat2fUMtlsClowml+QEbzGOI R6L/9RP5ejxyhHCP83gCl0tAvUh2grbe+7W6QxeIZ1EKS8rOrdRyM5lysVyFkotH PRswnOCBUMGMcqy9dvStGFjglCMAZdyyxXzWZHEf7sVUadrVoP63H7jUqvfOlMKN PVaOX7wHvmDzB4KwFEBZXTHjQGlvvTfpAmtTAqnvOfNVxC61QPesWmMC//MuOLKt k6oA+oDP5AMe3GpGaFZSSdsDKobBq2ZFGCJlQUBhlSWQAoCOkr8bTJpOMEx7R3Sf 42feTthla0bRaHGkfB4nwcmgXjrnSg== =g/n7 -----END PGP SIGNATURE-----
On Aug 26 16:15, Jinhao Fan wrote: > Sure. I’ve already reworked this iothread patch upon the new irqfd > patch. I think I can post a v2 patch today. Do you mean I include > irqfd v3 in the new iothread patch series? > Yes, please include irqfd-v3 in the series.
at 5:00 PM, Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote: > Add an option "iothread=x" to do emulation in a seperate iothread. > This improves the performance because QEMU's main loop is responsible > for a lot of other work while iothread is dedicated to NVMe emulation. > Moreover, emulating in iothread brings the potential of polling on > SQ/CQ doorbells, which I will bring up in a following patch. > > Iothread can be enabled by: > -object iothread,id=nvme0 \ > -device nvme,iothread=nvme0 \ > > Performance comparisons (KIOPS): > > QD 1 4 16 64 > QEMU 41 136 242 338 > iothread 53 155 245 309 > > Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn> > --- > hw/nvme/ctrl.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++---- > hw/nvme/ns.c | 19 +++++++++--- > hw/nvme/nvme.h | 6 +++- > 3 files changed, 95 insertions(+), 10 deletions(-) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index 533ad14e7a..a4efc97b33 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -4257,6 +4257,7 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq) > { > NvmeCtrl *n = cq->ctrl; > uint16_t offset = (cq->cqid << 3) + (1 << 2); > + bool in_iothread = !qemu_in_main_thread(); > int ret; > > ret = event_notifier_init(&cq->notifier, 0); > @@ -4264,10 +4265,19 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq) > return ret; > } > > - event_notifier_set_handler(&cq->notifier, nvme_cq_notifier); > + if (in_iothread) { > + qemu_mutex_lock_iothread(); > + } > + > + aio_set_event_notifier(n->ctx, &cq->notifier, true, nvme_cq_notifier, > + NULL, NULL); > memory_region_add_eventfd(&n->iomem, > 0x1000 + offset, 4, false, 0, &cq->notifier); > > + if (in_iothread) { > + qemu_mutex_unlock_iothread(); > + } > + > return 0; > } > > @@ -4284,6 +4294,7 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq) > { > NvmeCtrl *n = sq->ctrl; > uint16_t offset = sq->sqid << 3; > + bool in_iothread = !qemu_in_main_thread(); > int ret; > > ret = event_notifier_init(&sq->notifier, 0); > @@ -4291,9 +4302,16 @@ 
static int nvme_init_sq_ioeventfd(NvmeSQueue *sq) > return ret; > } > > - event_notifier_set_handler(&sq->notifier, nvme_sq_notifier); > + if (in_iothread) { > + qemu_mutex_lock_iothread(); > + } > + aio_set_event_notifier(n->ctx, &sq->notifier, true, nvme_sq_notifier, > + NULL, NULL); > memory_region_add_eventfd(&n->iomem, > 0x1000 + offset, 4, false, 0, &sq->notifier); > + if (in_iothread) { > + qemu_mutex_unlock_iothread(); > + } > > return 0; > } > @@ -4301,13 +4319,22 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq) > static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) > { > uint16_t offset = sq->sqid << 3; > + bool in_iothread = !qemu_in_main_thread(); > > n->sq[sq->sqid] = NULL; > timer_free(sq->timer); > if (sq->ioeventfd_enabled) { > + if (in_iothread) { > + qemu_mutex_lock_iothread(); > + } > + > memory_region_del_eventfd(&n->iomem, > 0x1000 + offset, 4, false, 0, &sq->notifier); > event_notifier_cleanup(&sq->notifier); > + > + if (in_iothread) { > + qemu_mutex_unlock_iothread(); > + } > } > g_free(sq->io_req); > if (sq->sqid) { > @@ -4376,8 +4403,9 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr, > sq->io_req[i].sq = sq; > QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry); > } > - sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq); > > + sq->timer = aio_timer_new(n->ctx, QEMU_CLOCK_VIRTUAL, SCALE_NS, > + nvme_process_sq, sq); > if (n->dbbuf_enabled) { > sq->db_addr = n->dbbuf_dbs + (sqid << 3); > sq->ei_addr = n->dbbuf_eis + (sqid << 3); > @@ -4691,13 +4719,22 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) > static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) > { > uint16_t offset = (cq->cqid << 3) + (1 << 2); > + bool in_iothread = !qemu_in_main_thread(); > > n->cq[cq->cqid] = NULL; > timer_free(cq->timer); > if (cq->ioeventfd_enabled) { > + if (in_iothread) { > + qemu_mutex_lock_iothread(); > + } > + > memory_region_del_eventfd(&n->iomem, > 0x1000 + offset, 4, false, 0, 
&cq->notifier); > event_notifier_cleanup(&cq->notifier); > + > + if (in_iothread) { > + qemu_mutex_unlock_iothread(); > + } > } > if (msix_enabled(&n->parent_obj)) { > msix_vector_unuse(&n->parent_obj, cq->vector); > @@ -4765,7 +4802,8 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, > } > } > n->cq[cqid] = cq; > - cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq); > + cq->timer = aio_timer_new(n->ctx, QEMU_CLOCK_VIRTUAL, SCALE_NS, > + nvme_post_cqes, cq); > } > > static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) > @@ -6557,6 +6595,7 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, > uint32_t intms = ldl_le_p(&n->bar.intms); > uint32_t csts = ldl_le_p(&n->bar.csts); > uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts); > + bool in_iothread = !qemu_in_main_thread(); > > if (unlikely(offset & (sizeof(uint32_t) - 1))) { > NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32, > @@ -6726,10 +6765,22 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, > > stl_le_p(&n->bar.pmrctl, data); > if (NVME_PMRCTL_EN(data)) { > + if (in_iothread) { > + qemu_mutex_lock_iothread(); > + } > memory_region_set_enabled(&n->pmr.dev->mr, true); > + if (in_iothread) { > + qemu_mutex_unlock_iothread(); > + } > pmrsts = 0; > } else { > + if (in_iothread) { > + qemu_mutex_lock_iothread(); > + } > memory_region_set_enabled(&n->pmr.dev->mr, false); > + if (in_iothread) { > + qemu_mutex_unlock_iothread(); > + } > NVME_PMRSTS_SET_NRDY(pmrsts, 1); > n->pmr.cmse = false; > } > @@ -7528,6 +7579,14 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) > if (pci_is_vf(&n->parent_obj) && !sctrl->scs) { > stl_le_p(&n->bar.csts, NVME_CSTS_FAILED); > } > + > + if (n->params.iothread) { > + n->iothread = n->params.iothread; > + object_ref(OBJECT(n->iothread)); > + n->ctx = iothread_get_aio_context(n->iothread); > + } else { > + n->ctx = qemu_get_aio_context(); > + } > } > > static int nvme_init_subsys(NvmeCtrl *n, 
Error **errp) > @@ -7600,7 +7659,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) > ns = &n->namespace; > ns->params.nsid = 1; > > - if (nvme_ns_setup(ns, errp)) { > + if (nvme_ns_setup(n, ns, errp)) { > return; > } > > @@ -7631,6 +7690,15 @@ static void nvme_exit(PCIDevice *pci_dev) > g_free(n->sq); > g_free(n->aer_reqs); > > + aio_context_acquire(n->ctx); > + blk_set_aio_context(n->namespace.blkconf.blk, qemu_get_aio_context(), NULL); > + aio_context_release(n->ctx); > + > + if (n->iothread) { > + object_unref(OBJECT(n->iothread)); > + n->iothread = NULL; > + } > + > if (n->params.cmb_size_mb) { > g_free(n->cmb.buf); > } > @@ -7665,6 +7733,8 @@ static Property nvme_props[] = { > DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false), > DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false), > DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true), > + DEFINE_PROP_LINK("iothread", NvmeCtrl, params.iothread, TYPE_IOTHREAD, > + IOThread *), > DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0), > DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl, > params.auto_transition_zones, true), > diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c > index 62a1f97be0..683d0d7a3b 100644 > --- a/hw/nvme/ns.c > +++ b/hw/nvme/ns.c > @@ -146,8 +146,10 @@ lbaf_found: > return 0; > } > > -static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp) > +static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) > { > + AioContext *old_context; > + int ret; > bool read_only; > > if (!blkconf_blocksizes(&ns->blkconf, errp)) { > @@ -170,6 +172,15 @@ static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp) > return -1; > } > > + old_context = blk_get_aio_context(ns->blkconf.blk); > + aio_context_acquire(old_context); > + ret = blk_set_aio_context(ns->blkconf.blk, n->ctx, errp); > + aio_context_release(old_context); > + > + if (ret) { > + error_setg(errp, "Set AioContext on BlockBackend failed"); > + } > + > return 0; > } > 
> @@ -482,13 +493,13 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp) > return 0; > } > > -int nvme_ns_setup(NvmeNamespace *ns, Error **errp) > +int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) > { > if (nvme_ns_check_constraints(ns, errp)) { > return -1; > } > > - if (nvme_ns_init_blk(ns, errp)) { > + if (nvme_ns_init_blk(n, ns, errp)) { > return -1; > } > > @@ -563,7 +574,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) > } > } > > - if (nvme_ns_setup(ns, errp)) { > + if (nvme_ns_setup(n, ns, errp)) { > return; > } > > diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h > index 79f5c281c2..85e2aa7b29 100644 > --- a/hw/nvme/nvme.h > +++ b/hw/nvme/nvme.h > @@ -21,6 +21,7 @@ > #include "qemu/uuid.h" > #include "hw/pci/pci.h" > #include "hw/block/block.h" > +#include "sysemu/iothread.h" > > #include "block/nvme.h" > > @@ -275,7 +276,7 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns) > } > > void nvme_ns_init_format(NvmeNamespace *ns); > -int nvme_ns_setup(NvmeNamespace *ns, Error **errp); > +int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp); > void nvme_ns_drain(NvmeNamespace *ns); > void nvme_ns_shutdown(NvmeNamespace *ns); > void nvme_ns_cleanup(NvmeNamespace *ns); > @@ -427,6 +428,7 @@ typedef struct NvmeParams { > uint16_t sriov_vi_flexible; > uint8_t sriov_max_vq_per_vf; > uint8_t sriov_max_vi_per_vf; > + IOThread *iothread; > } NvmeParams; > > typedef struct NvmeCtrl { > @@ -458,6 +460,8 @@ typedef struct NvmeCtrl { > uint64_t dbbuf_dbs; > uint64_t dbbuf_eis; > bool dbbuf_enabled; > + IOThread *iothread; > + AioContext *ctx; > > struct { > MemoryRegion mem; > -- > 2.25.1 Hi Klaus and Keith, I just added support for interrupt masking. How can I test interrupt masking? Thanks, Jinhao Fan
On Jul 26 16:55, Jinhao Fan wrote: > > Hi Klaus and Keith, > > I just added support for interrupt masking. How can I test interrupt > masking? > I wondered if this might be possible to test this with a user-space VFIO-based driver, but VFIO does not export masking/unmasking on MSI/MSI-X through the SET_IRQ interface and disallows reading and writing in the MSI-X table area :(
On Tue, Jul 26, 2022 at 04:55:54PM +0800, Jinhao Fan wrote: > > Hi Klaus and Keith, > > I just added support for interrupt masking. How can I test interrupt > masking? Are you asking about MSI masking? I don't think any drivers are using the feature, so you'd need to modify one to test it. I can give you some pointers if you are asking about MSI.
at 10:45 PM, Keith Busch <kbusch@kernel.org> wrote: > On Tue, Jul 26, 2022 at 04:55:54PM +0800, Jinhao Fan wrote: >> Hi Klaus and Keith, >> >> I just added support for interrupt masking. How can I test interrupt >> masking? > > Are you asking about MSI masking? I don't think any drivers are using the > feature, so you'd need to modify one to test it. I can give you some pointers > if you are asking about MSI. Thanks Keith, Do I understand correctly: qemu-nvme only supports MSI-X and pin-based interrupts. So MSI masking here is equivalent with MSI-X masking. If the above is correct, I think I am asking about MSI masking. BTW, a double check on ctrl.c seems to show that we only support disabling interrupt at CQ creation time, which is recorded in the cq->irq_enabled. This is different from my prior understanding that interrupts can be disabled at runtime by a call like Linux irq_save(). Therefore I doubt whether qemu-nvme supported "interrupt masking" before. How do you understand qemu-nvme’s interrupt masking support? Jinhao
On Tue, Jul 26, 2022 at 11:32:57PM +0800, Jinhao Fan wrote: > at 10:45 PM, Keith Busch <kbusch@kernel.org> wrote: > > > On Tue, Jul 26, 2022 at 04:55:54PM +0800, Jinhao Fan wrote: > >> Hi Klaus and Keith, > >> > >> I just added support for interrupt masking. How can I test interrupt > >> masking? > > > > Are you asking about MSI masking? I don't think any drivers are using the > > feature, so you'd need to modify one to test it. I can give you some pointers > > if you are asking about MSI. > > Thanks Keith, > > Do I understand correctly: qemu-nvme only supports MSI-X and pin-based > interrupts. So MSI masking here is equivalent with MSI-X masking. It looks like qemu's nvme only supports MSI-x. I could have sworn it used to support MSI, but I must be thinking of an unofficial fork. I was mainly asking about MSI masking as it relates to nvme controller's IVMS/IVMC registers. I don't think any driver is making use of these, and those don't apply to MSI-x; just MSI and legacy. At the PCIe level, masking MSI vectors is in Config space. Writing to Config space is too slow to do per-interrupt, so NVMe created the IVMS/IVMC registers to get around that. > If the above is correct, I think I am asking about MSI masking. > > BTW, a double check on ctrl.c seems to show that we only support disabling > interrupt at CQ creation time, which is recorded in the cq->irq_enabled. > This is different from my prior understanding that interrupts can be > disabled at runtime by a call like Linux irq_save(). Therefore I doubt > whether qemu-nvme supported "interrupt masking" before. How do you > understand qemu-nvme’s interrupt masking support? MSI-x uses MMIO for masking, so there's no need for an NVMe specific way to mask these interrupts. You can just use the generic PCIe methods to clear the vector's enable bit. But no NVMe driver that I know of is making use of these either, though it should be possible to make linux start doing that. 
The CQ irq_enabled field is there so the user can create a pure polling queue. That's a fixed property of the queue that can't be re-enabled without destroying and recreating. The linux irq_save only disables local CPU interrupts from most sources. All pci devices are unaware of this and can still send their interrupt messages to the CPU, but the CPU won't context switch to the irq handler until after irqrestore is called.
at 2:07 AM, Keith Busch <kbusch@kernel.org> wrote: > MSI-x uses MMIO for masking, so there's no need for an NVMe specific way to > mask these interrupts. You can just use the generic PCIe methods to clear the > vector's enable bit. But no NVMe driver that I know of is making use of these > either, though it should be possible to make linux start doing that. I believe we need to handle MSI-x masking in the NVMe controller after we switch to irqfd. Before that QEMU’s MSI-x emulation logic helps us handle masked interrupts. But with irqfd, we bypass QEMU’s MSI-x and let the kernel send the interrupt directly. Therefore qemu-nvme needs to do some bookkeeping about which interrupt vectors are masked. msix_set_vector_notifiers helps us do that.
at 10:56 AM, Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote: > at 2:07 AM, Keith Busch <kbusch@kernel.org> wrote: > >> MSI-x uses MMIO for masking, so there's no need for an NVMe specific way to >> mask these interrupts. You can just use the generic PCIe methods to clear the >> vector's enable bit. But no NVMe driver that I know of is making use of these >> either, though it should be possible to make linux start doing that. > > I believe we need to handle MSI-x masking in the NVMe controller after we > switch to irqfd. Before that QEMU’s MSI-x emulation logic helps us handle > masked interrupts. But with irqfd, we bypass QEMU’s MSI-x and let the kernel > send the interrupt directly. Therefore qemu-nvme needs to do some > bookkeeping about which interrupt vectors are masked. > msix_set_vector_notifiers helps us do that. But as no NVMe driver makes use of MSI-x interrupt masking, is it OK if we just leave MSI-x masking support broken when irqfd is enabled?
© 2016 - 2024 Red Hat, Inc.