hw/nvme/ctrl.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++- hw/nvme/nvme.h | 3 +++ 2 files changed, 68 insertions(+), 1 deletions(-)
Use irqfd to directly notify KVM to inject interrupts. This is done by
registering a virtual IRQ (virq) in KVM and associating the virq with an
irqfd, so that KVM can directly inject the interrupt when it receives
notification from the irqfd. This approach is supposed to improve
performance because it bypasses QEMU's MSI interrupt emulation logic.
However, I did not see an obvious improvement in the emulated KIOPS:
QD 1 4 16 64
QEMU 38 123 210 329
irqfd 40 129 219 328
I found this problem quite hard to diagnose since irqfd's workflow
involves both QEMU and the in-kernel KVM.
Could you help me figure out the following questions:
1. How much performance improvement can I expect from using irqfd?
2. How can I debug this kind of cross QEMU-KVM problem?
Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
---
hw/nvme/ctrl.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-
hw/nvme/nvme.h | 3 +++
2 files changed, 68 insertions(+), 1 deletions(-)
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 4b75c5f549..59768c4586 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -159,6 +159,7 @@
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "sysemu/hostmem.h"
+#include "sysemu/kvm.h"
#include "hw/pci/msix.h"
#include "migration/vmstate.h"
@@ -484,12 +485,70 @@ static void nvme_irq_check(NvmeCtrl *n)
}
}
+static int nvme_kvm_msix_vector_use(NvmeCtrl *n,
+ NvmeCQueue *cq,
+ int vector)
+{
+ int ret;
+
+ KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
+ ret = kvm_irqchip_add_msi_route(&c, vector, &n->parent_obj);
+ if (ret < 0) {
+ return ret;
+ }
+ kvm_irqchip_commit_route_changes(&c);
+ cq->virq = ret;
+ return 0;
+}
+
+static int nvme_init_cq_irqfd(NvmeCQueue *cq)
+{
+ int ret;
+
+ ret = nvme_kvm_msix_vector_use(cq->ctrl, cq, (int)cq->vector);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = event_notifier_init(&cq->irq_notifier, 0);
+ if (ret < 0) {
+ goto fail_notifier;
+ }
+
+ ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier,
+ NULL, cq->virq);
+ if (ret < 0) {
+ goto fail_kvm;
+ }
+
+ return 0;
+
+fail_kvm:
+ event_notifier_cleanup(&cq->irq_notifier);
+fail_notifier:
+ kvm_irqchip_release_virq(kvm_state, cq->virq);
+fail:
+ return ret;
+}
+
static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
{
if (cq->irq_enabled) {
if (msix_enabled(&(n->parent_obj))) {
+ /* Initialize CQ irqfd */
+ if (!cq->irqfd_enabled && n->params.ioeventfd && cq->cqid != 0) {
+ int ret = nvme_init_cq_irqfd(cq);
+ if (ret == 0) {
+ cq->irqfd_enabled = true;
+ }
+ }
+
trace_pci_nvme_irq_msix(cq->vector);
- msix_notify(&(n->parent_obj), cq->vector);
+ if (cq->irqfd_enabled) {
+ event_notifier_set(&cq->irq_notifier);
+ } else {
+ msix_notify(&(n->parent_obj), cq->vector);
+ }
} else {
trace_pci_nvme_irq_pin();
assert(cq->vector < 32);
@@ -4670,6 +4729,12 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
event_notifier_cleanup(&cq->notifier);
}
if (msix_enabled(&n->parent_obj)) {
+ if (cq->irqfd_enabled) {
+ kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier,
+ cq->virq);
+ kvm_irqchip_release_virq(kvm_state, cq->virq);
+ event_notifier_cleanup(&cq->irq_notifier);
+ }
msix_vector_unuse(&n->parent_obj, cq->vector);
}
if (cq->cqid) {
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 2a9beea0c8..84e5b00fe3 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -391,7 +391,10 @@ typedef struct NvmeCQueue {
uint64_t ei_addr;
QEMUTimer *timer;
EventNotifier notifier;
+ EventNotifier irq_notifier;
+ int virq;
bool ioeventfd_enabled;
+ bool irqfd_enabled;
QTAILQ_HEAD(, NvmeSQueue) sq_list;
QTAILQ_HEAD(, NvmeRequest) req_list;
} NvmeCQueue;
--
2.25.1
at 12:35 PM, Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote: > static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq) > { > if (cq->irq_enabled) { > if (msix_enabled(&(n->parent_obj))) { > + /* Initialize CQ irqfd */ > + if (!cq->irqfd_enabled && n->params.ioeventfd && cq->cqid != 0) { > + int ret = nvme_init_cq_irqfd(cq); > + if (ret == 0) { > + cq->irqfd_enabled = true; > + } > + } > + Another question: In this version I left irqfd initialization to the first assertion of an irq. But I think it is better to initialize irqfd at cq creation time so we won’t bother checking it at each irq assertion. However if I put these code in nvme_init_cq(), irqfd does not work properly. After adding some tracepoints I found the MSI messages in MSI-X table changed after nvme_init_cq(). Specifically, the `data` field does not seem correct at the time when nvme_init_cq() is called. Keith, you must be familiar with how the nvme driver initializes CQs. Could you give some information on when I can safely use the contents in the MSI-X table?
On Mon, Aug 08, 2022 at 10:23:03AM +0800, Jinhao Fan wrote: > at 12:35 PM, Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote: > > > static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq) > > { > > if (cq->irq_enabled) { > > if (msix_enabled(&(n->parent_obj))) { > > + /* Initialize CQ irqfd */ > > + if (!cq->irqfd_enabled && n->params.ioeventfd && cq->cqid != 0) { > > + int ret = nvme_init_cq_irqfd(cq); > > + if (ret == 0) { > > + cq->irqfd_enabled = true; > > + } > > + } > > + > > Another question: > > In this version I left irqfd initialization to the first assertion of an > irq. But I think it is better to initialize irqfd at cq creation time so we > won’t bother checking it at each irq assertion. However if I put these code > in nvme_init_cq(), irqfd does not work properly. After adding some > tracepoints I found the MSI messages in MSI-X table changed after > nvme_init_cq(). Specifically, the `data` field does not seem correct at the > time when nvme_init_cq() is called. > > Keith, you must be familiar with how the nvme driver initializes CQs. Could > you give some information on when I can safely use the contents in the MSI-X > table? The driver will create the cq with an allocated vector, but it's not activated until after the driver wires it up to a handler. I think that's what you're observing with the incomplete MSIx table entry on creation.
> The driver will create the cq with an allocated vector, but it's not activated > until after the driver wires it up to a handler. I think that's what you're > observing with the incomplete MSIx table entry on creation. Also, I'm wondering if this is inconsistent with the NVMe spec. In Section 7.6.1 of the 1.4 spec, it says "After determining the number of I/O Queues, the MSI and/or MSI-X registers should be configured;" in Step 8, and CQ creation happens in Step 9. Now the driver changes MSI-X registers after CQ creation, is it a violation of the spec?
On Wed, Aug 10, 2022 at 12:48:53AM +0800, 樊金昊 wrote: >> The driver will create the cq with an allocated vector, but it's not activated >> until after the driver wires it up to a handler. I think that's what you're >> observing with the incomplete MSIx table entry on creation. > > Also, I'm wondering if this is inconsistent with the NVMe spec. In Section 7.6.1 > of the 1.4 spec, it says "After determining the number of I/O Queues, the MSI > and/or MSI-X registers should be configured;" in Step 8, and CQ creation happens > in Step 9. Now the driver changes MSI-X registers after CQ creation, is it a > violation of the spec? I don't think it's a problem. This is really a more "informative" section of the spec and doesn't specify any hard requirements. You should be able to rely on the entry's data being stable after the first queue doorbell ring, though.
> The driver will create the cq with an allocated vector, but it's not activated > until after the driver wires it up to a handler. I think that's what you're > observing with the incomplete MSIx table entry on creation. Agreed. I digged through pci_request_irq()'s call chain and found pci_write_msi_msg() was called in the end. Now to implement irqfd support, we need to register the (complete) MSI message in KVM so that KVM can directly send the interrupt when we signal the irqfd. My prior implementation delayed each CQ's MSI message registration to its first nvme_post_cqes(). I'm not sure whether this is a good choice. What do you think about this approach? BTW, since we skip QEMU's MSI-x emulation with irqfd, we need to record the mask status of each interrupt vector. QEMU provides msix_set_vector_notifiers() to help us call handlers on each mask and unmask event. But this function works on a per-device basis. I guess it is best to call msix_set_vector_notifiers() after all CQs are created. But I think qemu-nvme can't tell when the host has finished CQ creation. Where do you think is the best place we register the mask/unmask callbacks? Is it OK to put it at, say, the first nvme_post_cqes() of the whole device? Thanks, Jinhao Fan
> In this version I left irqfd initialization to the first assertion of an > irq. But I think it is better to initialize irqfd at cq creation time so we > won’t bother checking it at each irq assertion. However if I put these code > in nvme_init_cq(), irqfd does not work properly. After adding some > tracepoints I found the MSI messages in MSI-X table changed after > nvme_init_cq(). Specifically, the `data` field does not seem correct at the > time when nvme_init_cq() is called. I found that in Linux NVMe driver, in nvme_create_cq() (drivers/nvme/host/pci.c), queue_request_irq() is called after nvme_init_queue(). Does this possibly cause the incorrect MSI messages at CQ creation time?
On Sat, Jul 9, 2022, 00:36 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote: > Use irqfd to directly notify KVM to inject interrupts. This is done by > registering a virtual IRQ(virq) in KVM and associate the virq with an > irqfd, so that KVM can directly inject the interrupt when it receives > notification from the irqfd. This approach is supposed to improve > performance because it bypasses QEMU's MSI interrupt emulation logic. > > However, I did not see an obvious improvement of the emulation KIOPS: > > QD 1 4 16 64 > QEMU 38 123 210 329 > irqfd 40 129 219 328 > > I found this problem quite hard to diagnose since irqfd's workflow > involves both QEMU and the in-kernel KVM. > > Could you help me figure out the following questions: > > 1. How much performance improvement can I expect from using irqfd? > Hi Jinhao, Thanks for working on this! irqfd is not necessarily faster than KVM ioctl interrupt injection. There are at least two non performance reasons for irqfd: 1. It avoids QEMU emulation code, which historically was not thread safe and needed the Big QEMU Lock. IOThreads don't hold the BQL and therefore cannot safely call the regular interrupt emulation code in QEMU. I think this is still true today although parts of the code may now be less reliant on the BQL. 2. The eventfd interface decouples interrupt injection from the KVM ioctl interface. Vhost kernel and vhost-user device emulation code has no dependency on KVM thanks to irqfd. They work with any eventfd, including irqfd. 2. How can I debug this kind of cross QEMU-KVM problems? > perf(1) is good at observing both kernel and userspace activity together. What is it that you want to debug. 
> Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn> > --- > hw/nvme/ctrl.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++- > hw/nvme/nvme.h | 3 +++ > 2 files changed, 68 insertions(+), 1 deletions(-) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index 4b75c5f549..59768c4586 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -159,6 +159,7 @@ > #include "sysemu/sysemu.h" > #include "sysemu/block-backend.h" > #include "sysemu/hostmem.h" > +#include "sysemu/kvm.h" > #include "hw/pci/msix.h" > #include "migration/vmstate.h" > > @@ -484,12 +485,70 @@ static void nvme_irq_check(NvmeCtrl *n) > } > } > > +static int nvme_kvm_msix_vector_use(NvmeCtrl *n, > + NvmeCQueue *cq, > + int vector) > +{ > + int ret; > + > + KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); > + ret = kvm_irqchip_add_msi_route(&c, vector, &n->parent_obj); > + if (ret < 0) { > + return ret; > + } > + kvm_irqchip_commit_route_changes(&c); > + cq->virq = ret; > + return 0; > +} > + > +static int nvme_init_cq_irqfd(NvmeCQueue *cq) > +{ > + int ret; > + > + ret = nvme_kvm_msix_vector_use(cq->ctrl, cq, (int)cq->vector); > + if (ret < 0) { > + goto fail; > + } > + > + ret = event_notifier_init(&cq->irq_notifier, 0); > + if (ret < 0) { > + goto fail_notifier; > + } > + > + ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier, > + NULL, cq->virq); > + if (ret < 0) { > + goto fail_kvm; > + } > + > + return 0; > + > +fail_kvm: > + event_notifier_cleanup(&cq->irq_notifier); > +fail_notifier: > + kvm_irqchip_release_virq(kvm_state, cq->virq); > +fail: > + return ret; > +} > + > static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq) > { > if (cq->irq_enabled) { > if (msix_enabled(&(n->parent_obj))) { > + /* Initialize CQ irqfd */ > + if (!cq->irqfd_enabled && n->params.ioeventfd && cq->cqid != > 0) { > + int ret = nvme_init_cq_irqfd(cq); > + if (ret == 0) { > + cq->irqfd_enabled = true; > + } > + } > + > trace_pci_nvme_irq_msix(cq->vector); > - 
msix_notify(&(n->parent_obj), cq->vector); > + if (cq->irqfd_enabled) { > + event_notifier_set(&cq->irq_notifier); > What happens when the MSI-X vector is masked? I remember the VIRTIO code having masking support. I'm on my phone and can't check now, but I think it registers a temporary eventfd and buffers irqs while the vector is masked. This makes me wonder if the VIRTIO and NVMe IOThread irqfd code can be unified. Maybe IOThread support can be built into the core device emulation code (e.g. irq APIs) so that it's not necessary to duplicate it. + } else { > + msix_notify(&(n->parent_obj), cq->vector); > + } > } else { > trace_pci_nvme_irq_pin(); > assert(cq->vector < 32); > @@ -4670,6 +4729,12 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl > *n) > event_notifier_cleanup(&cq->notifier); > } > if (msix_enabled(&n->parent_obj)) { > + if (cq->irqfd_enabled) { > + kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, > &cq->irq_notifier, > + cq->virq); > + kvm_irqchip_release_virq(kvm_state, cq->virq); > + event_notifier_cleanup(&cq->irq_notifier); > + } > msix_vector_unuse(&n->parent_obj, cq->vector); > } > if (cq->cqid) { > diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h > index 2a9beea0c8..84e5b00fe3 100644 > --- a/hw/nvme/nvme.h > +++ b/hw/nvme/nvme.h > @@ -391,7 +391,10 @@ typedef struct NvmeCQueue { > uint64_t ei_addr; > QEMUTimer *timer; > EventNotifier notifier; > + EventNotifier irq_notifier; > + int virq; > bool ioeventfd_enabled; > + bool irqfd_enabled; > QTAILQ_HEAD(, NvmeSQueue) sq_list; > QTAILQ_HEAD(, NvmeRequest) req_list; > } NvmeCQueue; > -- > 2.25.1 > > >
at 6:21 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > What happens when the MSI-X vector is masked? > > I remember the VIRTIO code having masking support. I'm on my phone and can't check now, but I think it registers a temporary eventfd and buffers irqs while the vector is masked. Hi Stefan, While implementing interrupt masking support, I found it hard to test this feature on the host. Keith told me that no NVMe drivers are currently using this feature. Do you remember how you tested interrupt masking? Regards, Jinhao Fan
On Aug 2 12:03, Jinhao Fan wrote: > at 6:21 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > > > What happens when the MSI-X vector is masked? > > > > I remember the VIRTIO code having masking support. I'm on my phone and can't check now, but I think it registers a temporary eventfd and buffers irqs while the vector is masked. > > Hi Stefan, > > While implementing interrupt masking support, I found it hard to test this > feature on the host. Keith told me that no NVMe drivers are currently using > this feature. Do you remember how you tested interrupt masking? > You can probably do this with qtest. I don't see a helper for masking the vectors, but qpci_msix_masked() should be usable as a base for just changing that readl to a writel and mask it out. This should allow you to do a relatively simple test case where you 1. bootstrap the device as simple as possible (forget about I/O queues) - I *think* you just need to use guest_alloc for the admin queue memory, use qpci_msix_enable() etc. 2. setup a simple admin command in the queue 3. mask the interrupt 4. ring the doorbell (a writel) 5. check that the vector remains in pending state (qpci_msix_pending()). This *could* be a potential way to do this.
Hi Stefan, Thanks for the detailed explanation! at 6:21 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > Hi Jinhao, > Thanks for working on this! > > irqfd is not necessarily faster than KVM ioctl interrupt injection. > > There are at least two non performance reasons for irqfd: > 1. It avoids QEMU emulation code, which historically was not thread safe and needed the Big QEMU Lock. IOThreads don't hold the BQL and therefore cannot safely call the regular interrupt emulation code in QEMU. I think this is still true today although parts of the code may now be less reliant on the BQL. This probably means we need to move to irqfd when iothread support is added in qemu-nvme. > 2. The eventfd interface decouples interrupt injection from the KVM ioctl interface. Vhost kernel and vhost-user device emulation code has no dependency on KVM thanks to irqfd. They work with any eventfd, including irqfd. This is contrary to our original belief. Klaus once pointed out that irqfd is KVM specific. I agreed with him since I found irqfd implementation is in virt/kvm/eventfd.c. But irqfd indeed avoids the KVM ioctl call. Could you elaborate on what “no dependency on KVM” means? > 2. How can I debug this kind of cross QEMU-KVM problems? > > perf(1) is good at observing both kernel and userspace activity together. What is it that you want to debug. > I’ll look into perf(1). I think what I was trying to do is like a breakdown analysis on which part caused the latency. For example, what is the root cause of the performance improvements or regressions when irqfd is turned on. > What happens when the MSI-X vector is masked? > > I remember the VIRTIO code having masking support. I'm on my phone and can't check now, but I think it registers a temporary eventfd and buffers irqs while the vector is masked. Yes, this RFC ignored interrupt masking support. > > This makes me wonder if the VIRTIO and NVMe IOThread irqfd code can be unified. 
Maybe IOThread support can be built into the core device emulation code (e.g. irq APIs) so that it's not necessary to duplicate it. > Agreed. Recently when working on ioeventfd, iothread and polling support, my typical workflow is to look at how virtio does that and adjust that code into nvme. I think unifying their IOThread code can be beneficial since VIRTIO has incorporated many optimizations over the years that can not be directly enjoyed by nvme. But I fear that subtle differences in the two protocols may cause challenges for the unification. Again, thanks for your help :)
On Wed, Jul 20, 2022, 22:36 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote: > Hi Stefan, > > Thanks for the detailed explanation! > > at 6:21 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > > > Hi Jinhao, > > Thanks for working on this! > > > > irqfd is not necessarily faster than KVM ioctl interrupt injection. > > > > There are at least two non performance reasons for irqfd: > > 1. It avoids QEMU emulation code, which historically was not thread safe > and needed the Big QEMU Lock. IOThreads don't hold the BQL and therefore > cannot safely call the regular interrupt emulation code in QEMU. I think > this is still true today although parts of the code may now be less reliant > on the BQL. > > This probably means we need to move to irqfd when iothread support is added > in qemu-nvme. > Yes. You can audit the interrupt code but I'm pretty sure there is shared state that needs to be protected by the BQL. So the NVMe emulation code probably needs to use irqfd to avoid the interrupt emulation code. > > 2. The eventfd interface decouples interrupt injection from the KVM > ioctl interface. Vhost kernel and vhost-user device emulation code has no > dependency on KVM thanks to irqfd. They work with any eventfd, including > irqfd. > > This is contrary to our original belief. Klaus once pointed out that irqfd > is KVM specific. I agreed with him since I found irqfd implementation is in > virt/kvm/eventfd.c. But irqfd indeed avoids the KVM ioctl call. Could you > elaborate on what “no dependency on KVM” means? > "They work with any eventfd, including irqfd" If you look at the vhost kernel or vhost-user code, you'll see they just signal the eventfd. It doesn't have to be an irqfd. An irqfd is a specific type of eventfd that the KVM kernel module implements to inject interrupts when the eventfd is signaled. By the way, this not only decouples vhost from the KVM kernel module, but also allows QEMU to emulate MSI-X masking via buffering the interrupt in userspace. > > 2. 
How can I debug this kind of cross QEMU-KVM problems? > > > > perf(1) is good at observing both kernel and userspace activity > together. What is it that you want to debug. > > > > I’ll look into perf(1). I think what I was trying to do is like a breakdown > analysis on which part caused the latency. For example, what is the root > cause of the performance improvements or regressions when irqfd is turned > on. > Nice, perf(1) is good for that. You can enable trace events and add kprobes/uprobes to record timestamps when specific functions are entered. > > > What happens when the MSI-X vector is masked? > > > > I remember the VIRTIO code having masking support. I'm on my phone and > can't check now, but I think it registers a temporary eventfd and buffers > irqs while the vector is masked. > > Yes, this RFC ignored interrupt masking support. > > > > > This makes me wonder if the VIRTIO and NVMe IOThread irqfd code can be > unified. Maybe IOThread support can be built into the core device emulation > code (e.g. irq APIs) so that it's not necessary to duplicate it. > > > > Agreed. Recently when working on ioeventfd, iothread and polling support, > my > typical workflow is to look at how virtio does that and adjust that code > into nvme. I think unifying their IOThread code can be beneficial since > VIRTIO has incorporated many optimizations over the years that can not be > directly enjoyed by nvme. But I fear that subtle differences in the two > protocols may cause challenges for the unification. > > Again, thanks for your help :) >
On Jul 21 09:29, Stefan Hajnoczi wrote: > On Wed, Jul 20, 2022, 22:36 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote: > > > Hi Stefan, > > > > Thanks for the detailed explanation! > > > > at 6:21 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > > > > > Hi Jinhao, > > > Thanks for working on this! > > > > > > irqfd is not necessarily faster than KVM ioctl interrupt injection. > > > > > > There are at least two non performance reasons for irqfd: > > > 1. It avoids QEMU emulation code, which historically was not thread safe > > and needed the Big QEMU Lock. IOThreads don't hold the BQL and therefore > > cannot safely call the regular interrupt emulation code in QEMU. I think > > this is still true today although parts of the code may now be less reliant > > on the BQL. > > > > This probably means we need to move to irqfd when iothread support is added > > in qemu-nvme. > > > > Yes. You can audit the interrupt code but I'm pretty sure there is shared > state that needs to be protected by the BQL. So the NVMe emulation code > probably needs to use irqfd to avoid the interrupt emulation code. > > > > > 2. The eventfd interface decouples interrupt injection from the KVM > > ioctl interface. Vhost kernel and vhost-user device emulation code has no > > dependency on KVM thanks to irqfd. They work with any eventfd, including > > irqfd. > > > > This is contrary to our original belief. Klaus once pointed out that irqfd > > is KVM specific. I agreed with him since I found irqfd implementation is in > > virt/kvm/eventfd.c. But irqfd indeed avoids the KVM ioctl call. Could you > > elaborate on what “no dependency on KVM” means? > > > > "They work with any eventfd, including irqfd" > > If you look at the vhost kernel or vhost-user code, you'll see they just > signal the eventfd. It doesn't have to be an irqfd. > > An irqfd is a specific type of eventfd that the KVM kernel module > implements to inject interrupts when the eventfd is signaled. 
> > By the way, this not only decouples vhost from the KVM kernel module, but > also allows QEMU to emulate MSI-X masking via buffering the interrupt in > userspace. > > The virtio dataplane (iothread support) only works with kvm if I am not mistaken, so I guess this is similar to what we want to do here. If we dont have KVM, we wont use iothread and we wont use the kvm irqchip/irqfd. Am I understanding this correctly?
On Wed, 27 Jul 2022 at 03:18, Klaus Jensen <its@irrelevant.dk> wrote: > > On Jul 21 09:29, Stefan Hajnoczi wrote: > > On Wed, Jul 20, 2022, 22:36 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote: > > > > > Hi Stefan, > > > > > > Thanks for the detailed explanation! > > > > > > at 6:21 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > > > > > > > Hi Jinhao, > > > > Thanks for working on this! > > > > > > > > irqfd is not necessarily faster than KVM ioctl interrupt injection. > > > > > > > > There are at least two non performance reasons for irqfd: > > > > 1. It avoids QEMU emulation code, which historically was not thread safe > > > and needed the Big QEMU Lock. IOThreads don't hold the BQL and therefore > > > cannot safely call the regular interrupt emulation code in QEMU. I think > > > this is still true today although parts of the code may now be less reliant > > > on the BQL. > > > > > > This probably means we need to move to irqfd when iothread support is added > > > in qemu-nvme. > > > > > > > Yes. You can audit the interrupt code but I'm pretty sure there is shared > > state that needs to be protected by the BQL. So the NVMe emulation code > > probably needs to use irqfd to avoid the interrupt emulation code. > > > > > > > > 2. The eventfd interface decouples interrupt injection from the KVM > > > ioctl interface. Vhost kernel and vhost-user device emulation code has no > > > dependency on KVM thanks to irqfd. They work with any eventfd, including > > > irqfd. > > > > > > This is contrary to our original belief. Klaus once pointed out that irqfd > > > is KVM specific. I agreed with him since I found irqfd implementation is in > > > virt/kvm/eventfd.c. But irqfd indeed avoids the KVM ioctl call. Could you > > > elaborate on what “no dependency on KVM” means? > > > > > > > "They work with any eventfd, including irqfd" > > > > If you look at the vhost kernel or vhost-user code, you'll see they just > > signal the eventfd. It doesn't have to be an irqfd. 
> > > > An irqfd is a specific type of eventfd that the KVM kernel module > > implements to inject interrupts when the eventfd is signaled. > > > > By the way, this not only decouples vhost from the KVM kernel module, but > > also allows QEMU to emulate MSI-X masking via buffering the interrupt in > > userspace. > > > > > > The virtio dataplane (iothread support) only works with kvm if I am not > mistaken, so I guess this is similar to what we want to do here. If we > dont have KVM, we wont use iothread and we wont use the kvm > irqchip/irqfd. > > Am I understanding this correctly? I think that is incorrect. QEMU has guest notifier emulation for the non-KVM (and non-MSI-X PCI) cases. When there is no irqfd support available, QEMU sets up a regular eventfd and calls virtio_queue_guest_notifier_read() when it becomes readable. Stefan
at 11:18 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > I think that is incorrect. QEMU has guest notifier emulation for the > non-KVM (and non-MSI-X PCI) cases. When there is no irqfd support > available, QEMU sets up a regular eventfd and calls > virtio_queue_guest_notifier_read() when it becomes readable. Thanks Stefan. I finally understand why there is a `with_irqfd` parameter for virtio_queue_set_guest_notifier_fd_handler. But if `with_irqfd` is false, it seems OK to directly call virtio_irq(). Why still bother using an eventfd? Is it for interrupt batching?
On Thu, Jul 28, 2022, 11:34 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote: > at 11:18 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > > > I think that is incorrect. QEMU has guest notifier emulation for the > > non-KVM (and non-MSI-X PCI) cases. When there is no irqfd support > > available, QEMU sets up a regular eventfd and calls > > virtio_queue_guest_notifier_read() when it becomes readable. > > Thanks Stefan. I finally understand why there is a `with_irqfd` parameter > for virtio_queue_set_guest_notifier_fd_handler. > > But if `with_irqfd` is false, it seems OK to directly call virtio_irq(). > Why > still bother using an eventfd? Is it for interrupt batching? > virtio_irq() is not thread safe so it cannot be called directly from the IOThread. Bouncing through the eventfd ensures that the virtio_irq() call happens in the QEMU main loop thread with the BQL held. It may be cheaper to use a BH instead of an eventfd when irqfd is not available, but this is a slow path anyway. We might as well reuse the eventfd code that's already there. Stefan >
at 9:29 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > > Nice, perf(1) is good for that. You can enable trace events and add > kprobes/uprobes to record timestamps when specific functions are entered. > Thanks Stefan, One last question: Currently we can achieve hundreds of KIOPS. That means perf can easily capture millions of trace events per second. I found perf has quite high overhead at such a rate of trace events. Do you have any advices on tracing high IOPS tasks?
On Sun, Jul 24, 2022, 11:21 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote: > at 9:29 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > > > > > Nice, perf(1) is good for that. You can enable trace events and add > > kprobes/uprobes to record timestamps when specific functions are entered. > > > > Thanks Stefan, > > One last question: Currently we can achieve hundreds of KIOPS. That means > perf can easily capture millions of trace events per second. I found perf > has quite high overhead at such a rate of trace events. Do you have any > advices on tracing high IOPS tasks? I don't. BTW uprobes are expensive but kprobes are cheaper. Stefan >
at 3:36 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > > > On Sun, Jul 24, 2022, 11:21 Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote: > at 9:29 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > > > > > Nice, perf(1) is good for that. You can enable trace events and add > > kprobes/uprobes to record timestamps when specific functions are entered. > > > > Thanks Stefan, > > One last question: Currently we can achieve hundreds of KIOPS. That means > perf can easily capture millions of trace events per second. I found perf > has quite high overhead at such a rate of trace events. Do you have any > advices on tracing high IOPS tasks? > > I don't. BTW uprobes are expensive but kprobes are cheaper. > > Stefan Gotcha. Thanks! Jinhao Fan
at 12:35 PM, Jinhao Fan <fanjinhao21s@ict.ac.cn> wrote: > Use irqfd to directly notify KVM to inject interrupts. This is done by > registering a virtual IRQ(virq) in KVM and associate the virq with an > irqfd, so that KVM can directly inject the interrupt when it receives > notification from the irqfd. This approach is supposed to improve > performance because it bypasses QEMU's MSI interrupt emulation logic. > > However, I did not see an obvious improvement of the emulation KIOPS: > > QD 1 4 16 64 > QEMU 38 123 210 329 > irqfd 40 129 219 328 > > I found this problem quite hard to diagnose since irqfd's workflow > involves both QEMU and the in-kernel KVM. > > Could you help me figure out the following questions: > > 1. How much performance improvement can I expect from using irqfd? > 2. How can I debug this kind of cross QEMU-KVM problems? > > Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn> > --- > hw/nvme/ctrl.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++- > hw/nvme/nvme.h | 3 +++ > 2 files changed, 68 insertions(+), 1 deletions(-) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index 4b75c5f549..59768c4586 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -159,6 +159,7 @@ > #include "sysemu/sysemu.h" > #include "sysemu/block-backend.h" > #include "sysemu/hostmem.h" > +#include "sysemu/kvm.h" > #include "hw/pci/msix.h" > #include "migration/vmstate.h" > > @@ -484,12 +485,70 @@ static void nvme_irq_check(NvmeCtrl *n) > } > } > > +static int nvme_kvm_msix_vector_use(NvmeCtrl *n, > + NvmeCQueue *cq, > + int vector) > +{ > + int ret; > + > + KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); > + ret = kvm_irqchip_add_msi_route(&c, vector, &n->parent_obj); > + if (ret < 0) { > + return ret; > + } > + kvm_irqchip_commit_route_changes(&c); > + cq->virq = ret; > + return 0; > +} > + > +static int nvme_init_cq_irqfd(NvmeCQueue *cq) > +{ > + int ret; > + > + ret = nvme_kvm_msix_vector_use(cq->ctrl, cq, (int)cq->vector); > + if (ret < 0) { 
> + goto fail; > + } > + > + ret = event_notifier_init(&cq->irq_notifier, 0); > + if (ret < 0) { > + goto fail_notifier; > + } > + > + ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier, > + NULL, cq->virq); > + if (ret < 0) { > + goto fail_kvm; > + } > + > + return 0; > + > +fail_kvm: > + event_notifier_cleanup(&cq->irq_notifier); > +fail_notifier: > + kvm_irqchip_release_virq(kvm_state, cq->virq); > +fail: > + return ret; > +} > + > static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq) > { > if (cq->irq_enabled) { > if (msix_enabled(&(n->parent_obj))) { > + /* Initialize CQ irqfd */ > + if (!cq->irqfd_enabled && n->params.ioeventfd && cq->cqid != 0) { > + int ret = nvme_init_cq_irqfd(cq); > + if (ret == 0) { > + cq->irqfd_enabled = true; > + } > + } > + > trace_pci_nvme_irq_msix(cq->vector); > - msix_notify(&(n->parent_obj), cq->vector); > + if (cq->irqfd_enabled) { > + event_notifier_set(&cq->irq_notifier); > + } else { > + msix_notify(&(n->parent_obj), cq->vector); > + } > } else { > trace_pci_nvme_irq_pin(); > assert(cq->vector < 32); > @@ -4670,6 +4729,12 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) > event_notifier_cleanup(&cq->notifier); > } > if (msix_enabled(&n->parent_obj)) { > + if (cq->irqfd_enabled) { > + kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &cq->irq_notifier, > + cq->virq); > + kvm_irqchip_release_virq(kvm_state, cq->virq); > + event_notifier_cleanup(&cq->irq_notifier); > + } > msix_vector_unuse(&n->parent_obj, cq->vector); > } > if (cq->cqid) { > diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h > index 2a9beea0c8..84e5b00fe3 100644 > --- a/hw/nvme/nvme.h > +++ b/hw/nvme/nvme.h > @@ -391,7 +391,10 @@ typedef struct NvmeCQueue { > uint64_t ei_addr; > QEMUTimer *timer; > EventNotifier notifier; > + EventNotifier irq_notifier; > + int virq; > bool ioeventfd_enabled; > + bool irqfd_enabled; > QTAILQ_HEAD(, NvmeSQueue) sq_list; > QTAILQ_HEAD(, NvmeRequest) req_list; > } NvmeCQueue; > -- > 2.25.1 Hi Stefan, It 
seems you originally introduced irqfd to virtio-blk to solve thread safety problems [1]. Could you help explain the benefits of irqfd? Thanks! Jinhao Fan
On Jul 9 12:35, Jinhao Fan wrote: > Use irqfd to directly notify KVM to inject interrupts. This is done by > registering a virtual IRQ(virq) in KVM and associate the virq with an > irqfd, so that KVM can directly inject the interrupt when it receives > notification from the irqfd. This approach is supposed to improve > performance because it bypasses QEMU's MSI interrupt emulation logic. > > However, I did not see an obvious improvement of the emulation KIOPS: > > QD 1 4 16 64 > QEMU 38 123 210 329 > irqfd 40 129 219 328 > > I found this problem quite hard to diagnose since irqfd's workflow > involves both QEMU and the in-kernel KVM. > > Could you help me figure out the following questions: > > 1. How much performance improvement can I expect from using irqfd? This is a level of QEMU/KVM that I am by no means an expert on and I would have to let the broader QEMU community comment on this. > 2. How can I debug this kind of cross QEMU-KVM problems? Not sure how to directly "debug" it, but there is `perf kvm` to get information about what is happening in the kvm subsystem.
On Jul 12 14:26, Klaus Jensen wrote: > On Jul 9 12:35, Jinhao Fan wrote: > > Use irqfd to directly notify KVM to inject interrupts. This is done by > > registering a virtual IRQ(virq) in KVM and associate the virq with an > > irqfd, so that KVM can directly inject the interrupt when it receives > > notification from the irqfd. This approach is supposed to improve > > performance because it bypasses QEMU's MSI interrupt emulation logic. > > > > However, I did not see an obvious improvement of the emulation KIOPS: > > > > QD 1 4 16 64 > > QEMU 38 123 210 329 > > irqfd 40 129 219 328 > > > > I found this problem quite hard to diagnose since irqfd's workflow > > involves both QEMU and the in-kernel KVM. > > > > Could you help me figure out the following questions: > > > > 1. How much performance improvement can I expect from using irqfd? > > This is a level of QEMU/KVM that I am by no means an expert on and I > would have to let the broader QEMU community comment on this. > In any case, I'm wary about adding this level of kvm-dependence in the device. This won't work on non-kvm platforms any more. I think you should put irqfd on hold and focus on iothreads :)
at 12:18 PM, Klaus Jensen <its@irrelevant.dk> wrote: > On Jul 12 14:26, Klaus Jensen wrote: >> On Jul 9 12:35, Jinhao Fan wrote: >>> Use irqfd to directly notify KVM to inject interrupts. This is done by >>> registering a virtual IRQ(virq) in KVM and associate the virq with an >>> irqfd, so that KVM can directly inject the interrupt when it receives >>> notification from the irqfd. This approach is supposed to improve >>> performance because it bypasses QEMU's MSI interrupt emulation logic. >>> >>> However, I did not see an obvious improvement of the emulation KIOPS: >>> >>> QD 1 4 16 64 >>> QEMU 38 123 210 329 >>> irqfd 40 129 219 328 >>> >>> I found this problem quite hard to diagnose since irqfd's workflow >>> involves both QEMU and the in-kernel KVM. >>> >>> Could you help me figure out the following questions: >>> >>> 1. How much performance improvement can I expect from using irqfd? >> >> This is a level of QEMU/KVM that I am by no means an expert on and I >> would have to let the broader QEMU community comment on this. > > In any case, I'm wary about adding this level of kvm-dependence in the > device. This wont work on non-kvm platforms any more. Yes, irqfd seems only useful on KVM-based systems. Maybe it is more suitable for vhost or VFIO based solutions which need irqfd to deliver interrupts. > I think you should put irqfd on hold and focus on iothreads :) I’m working on iothread currently. But I also observed a performance regression with iothread enabled. I found that ftrace, which is supported by both QEMU and KVM, seems good for analyzing performance issues. I’m currently exploring with ftrace.
© 2016 - 2024 Red Hat, Inc.