This is mainly relevant to x86, because that is the only architecture
that supports split irqchip for now.
When the irqchip is split, we face a dilemma: KVM irqfd will be
enabled, but the slow irqchip is still running in userspace.
This means that the resamplefd in the kernel irqfds won't take
effect, so INTx interrupts will fail to be acked on EOI.
One example is split irqchip with VFIO INTx, which will break if we
use the VFIO INTx fast path.
This patch can potentially support the VFIO fast path again for INTx:
the IRQ delivery will still use the fast path, and we don't
need to trap MMIOs in QEMU for the device to emulate the EOIs (see the
callers of the vfio_eoi() hook). However, the EOI of the INTx will still
need to be done from userspace, by caching all the resamplefds in
QEMU and kicking them properly on IOAPIC EOI broadcast.
This is tricky because in this case the userspace ioapic irr &
remote-irr will be bypassed. However, such a change will greatly boost
performance for assigned devices using INTx irqs (TCP_RR improves by 46%
with this patch applied).
When userspace is responsible for kicking the resamplefd, don't
register it on the kvm_irqfd anymore, because on newer kernels (after
commit 654f1f13ea56, 5.2+) KVM_IRQFD will fail if used with both split
irqchip and resamplefd. This makes sure that the fast path will
work for all supported kernels.
https://patchwork.kernel.org/patch/10738541/#22609933
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
accel/kvm/kvm-all.c | 85 +++++++++++++++++++++++++++++++++++++++++-
accel/kvm/trace-events | 1 +
hw/intc/ioapic.c | 23 +++++++++++-
include/sysemu/kvm.h | 7 ++++
4 files changed, 112 insertions(+), 4 deletions(-)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index d49b74512a..89771ea114 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -159,9 +159,65 @@ static const KVMCapabilityInfo kvm_required_capabilites[] = {
static NotifierList kvm_irqchip_change_notifiers =
NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
+struct KVMResampleFd {
+ int gsi;
+ EventNotifier *resample_event;
+ QLIST_ENTRY(KVMResampleFd) node;
+};
+typedef struct KVMResampleFd KVMResampleFd;
+
+/*
+ * Only used with split irqchip where we need to do the resample fd
+ * kick for the kernel from userspace.
+ */
+static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
+ QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
+
#define kvm_slots_lock(kml) qemu_mutex_lock(&(kml)->slots_lock)
#define kvm_slots_unlock(kml) qemu_mutex_unlock(&(kml)->slots_lock)
+static inline void kvm_resample_fd_remove(int gsi)
+{
+ KVMResampleFd *rfd;
+
+ QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
+ if (rfd->gsi == gsi) {
+ QLIST_REMOVE(rfd, node);
+ g_free(rfd);
+ break;
+ }
+ }
+}
+
+static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
+{
+ KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
+
+ rfd->gsi = gsi;
+ rfd->resample_event = event;
+
+ QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
+}
+
+bool kvm_resample_fd_notify(int gsi)
+{
+ KVMResampleFd *rfd;
+
+ if (!kvm_irqchip_is_split()) {
+ return false;
+ }
+
+ QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
+ if (rfd->gsi == gsi) {
+ event_notifier_set(rfd->resample_event);
+ trace_kvm_resample_fd_notify(gsi);
+ return true;
+ }
+ }
+
+ return false;
+}
+
int kvm_get_max_memslots(void)
{
KVMState *s = KVM_STATE(current_accel());
@@ -1642,8 +1698,33 @@ static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
};
if (rfd != -1) {
- irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
- irqfd.resamplefd = rfd;
+ assert(assign);
+ if (kvm_irqchip_is_split()) {
+ /*
+ * When the slow irqchip (e.g. IOAPIC) is in the
+ * userspace, KVM kernel resamplefd will not work because
+ * the EOI of the interrupt will be delivered to userspace
+ * instead, so the KVM kernel resamplefd kick will be
+ * skipped. The userspace here mimics what the kernel
+ * provides with resamplefd, remember the resamplefd and
+ * kick it when we receive EOI of this IRQ.
+ *
+ * This is hackery because IOAPIC is mostly bypassed
+ * (except EOI broadcasts) when irqfd is used. However
+ * this can bring much performance back for split irqchip
+ * with INTx IRQs (for VFIO, this gives 93% perf of the
+ * full fast path, which is 46% perf boost comparing to
+ * the INTx slow path).
+ */
+ kvm_resample_fd_insert(virq, resample);
+ } else {
+ irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
+ irqfd.resamplefd = rfd;
+ }
+ } else if (!assign) {
+ if (kvm_irqchip_is_split()) {
+ kvm_resample_fd_remove(virq);
+ }
}
if (!kvm_irqfds_enabled()) {
diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
index 4fb6e59d19..a68eb66534 100644
--- a/accel/kvm/trace-events
+++ b/accel/kvm/trace-events
@@ -16,4 +16,5 @@ kvm_set_ioeventfd_mmio(int fd, uint64_t addr, uint32_t val, bool assign, uint32_
kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint32_t val, bool assign, uint32_t size, bool datamatch) "fd: %d @0x%x val=0x%x assign: %d size: %d match: %d"
kvm_set_user_memory(uint32_t slot, uint32_t flags, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, int ret) "Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " ret=%d"
kvm_clear_dirty_log(uint32_t slot, uint64_t start, uint32_t size) "slot#%"PRId32" start 0x%"PRIx64" size 0x%"PRIx32
+kvm_resample_fd_notify(int gsi) "gsi %d"
diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index 15747fe2c2..13921b333d 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -236,8 +236,27 @@ void ioapic_eoi_broadcast(int vector)
for (n = 0; n < IOAPIC_NUM_PINS; n++) {
entry = s->ioredtbl[n];
- if ((entry & IOAPIC_VECTOR_MASK) != vector ||
- ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) {
+ if ((entry & IOAPIC_VECTOR_MASK) != vector) {
+ continue;
+ }
+
+ /*
+ * When IOAPIC is in the userspace while APIC is still in
+ * the kernel (i.e., split irqchip), we have a trick to
+ * kick the resamplefd logic for registered irqfds from
+ * userspace to deactivate the IRQ. When that happens, it
+ * means the irq bypassed userspace IOAPIC (so the irr and
+ * remote-irr of the table entry should be bypassed too
+ * even if interrupt come), then we don't need to clear
+ * the remote-IRR and check irr again because they'll
+ * always be zeros.
+ */
+ if (kvm_resample_fd_notify(n)) {
+ continue;
+ }
+
+ if (((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) !=
+ IOAPIC_TRIGGER_LEVEL) {
continue;
}
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 141342de98..3f0830cc4f 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -555,4 +555,11 @@ int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source);
int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target);
struct ppc_radix_page_info *kvm_get_radix_page_info(void);
int kvm_get_max_memslots(void);
+
+/*
+ * Notify resamplefd for EOI of specific interrupts. Returns true
+ * when one resamplefd is notified, false if no such IRQ found.
+ */
+bool kvm_resample_fd_notify(int gsi);
+
#endif
--
2.24.1
On Fri, 28 Feb 2020 11:15:02 -0500
Peter Xu <peterx@redhat.com> wrote:
> This is majorly only for X86 because that's the only one that supports
> split irqchip for now.
>
> When the irqchip is split, we face a dilemma that KVM irqfd will be
> enabled, however the slow irqchip is still running in the userspace.
> It means that the resamplefd in the kernel irqfds won't take any
> effect and it will miss to ack INTx interrupts on EOIs.
>
> One example is split irqchip with VFIO INTx, which will break if we
> use the VFIO INTx fast path.
>
> This patch can potentially supports the VFIO fast path again for INTx,
> that the IRQ delivery will still use the fast path, while we don't
> need to trap MMIOs in QEMU for the device to emulate the EIOs (see the
> callers of vfio_eoi() hook). However the EOI of the INTx will still
> need to be done from the userspace by caching all the resamplefds in
> QEMU and kick properly for IOAPIC EOI broadcast.
>
> This is tricky because in this case the userspace ioapic irr &
> remote-irr will be bypassed. However such a change will greatly boost
> performance for assigned devices using INTx irqs (TCP_RR boosts 46%
> after this patch applied).
>
> When the userspace is responsible for the resamplefd kickup, don't
> register it on the kvm_irqfd anymore, because on newer kernels (after
> commit 654f1f13ea56, 5.2+) the KVM_IRQFD will fail if with both split
> irqchip and resamplefd. This will make sure that the fast path will
> work for all supported kernels.
>
> https://patchwork.kernel.org/patch/10738541/#22609933
>
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
> accel/kvm/kvm-all.c | 85 +++++++++++++++++++++++++++++++++++++++++-
> accel/kvm/trace-events | 1 +
> hw/intc/ioapic.c | 23 +++++++++++-
> include/sysemu/kvm.h | 7 ++++
> 4 files changed, 112 insertions(+), 4 deletions(-)
>
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index d49b74512a..89771ea114 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -159,9 +159,65 @@ static const KVMCapabilityInfo kvm_required_capabilites[] = {
> static NotifierList kvm_irqchip_change_notifiers =
> NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
>
> +struct KVMResampleFd {
> + int gsi;
> + EventNotifier *resample_event;
> + QLIST_ENTRY(KVMResampleFd) node;
> +};
> +typedef struct KVMResampleFd KVMResampleFd;
> +
> +/*
> + * Only used with split irqchip where we need to do the resample fd
> + * kick for the kernel from userspace.
> + */
> +static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
> + QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
> +
> #define kvm_slots_lock(kml) qemu_mutex_lock(&(kml)->slots_lock)
> #define kvm_slots_unlock(kml) qemu_mutex_unlock(&(kml)->slots_lock)
>
> +static inline void kvm_resample_fd_remove(int gsi)
> +{
> + KVMResampleFd *rfd;
> +
> + QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
> + if (rfd->gsi == gsi) {
> + QLIST_REMOVE(rfd, node);
> + g_free(rfd);
> + break;
> + }
> + }
> +}
> +
> +static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
> +{
> + KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
> +
> + rfd->gsi = gsi;
> + rfd->resample_event = event;
> +
> + QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
> +}
> +
> +bool kvm_resample_fd_notify(int gsi)
> +{
> + KVMResampleFd *rfd;
> +
> + if (!kvm_irqchip_is_split()) {
> + return false;
> + }
> +
> + QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
> + if (rfd->gsi == gsi) {
> + event_notifier_set(rfd->resample_event);
> + trace_kvm_resample_fd_notify(gsi);
> + return true;
> + }
> + }
> +
> + return false;
> +}
> +
> int kvm_get_max_memslots(void)
> {
> KVMState *s = KVM_STATE(current_accel());
[snip]
> diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
> index 15747fe2c2..13921b333d 100644
> --- a/hw/intc/ioapic.c
> +++ b/hw/intc/ioapic.c
> @@ -236,8 +236,27 @@ void ioapic_eoi_broadcast(int vector)
> for (n = 0; n < IOAPIC_NUM_PINS; n++) {
> entry = s->ioredtbl[n];
>
> - if ((entry & IOAPIC_VECTOR_MASK) != vector ||
> - ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) {
> + if ((entry & IOAPIC_VECTOR_MASK) != vector) {
> + continue;
> + }
> +
> + /*
> + * When IOAPIC is in the userspace while APIC is still in
> + * the kernel (i.e., split irqchip), we have a trick to
> + * kick the resamplefd logic for registered irqfds from
> + * userspace to deactivate the IRQ. When that happens, it
> + * means the irq bypassed userspace IOAPIC (so the irr and
> + * remote-irr of the table entry should be bypassed too
> + * even if interrupt come), then we don't need to clear
> + * the remote-IRR and check irr again because they'll
> + * always be zeros.
> + */
> + if (kvm_resample_fd_notify(n)) {
> + continue;
> + }
It seems the problem I reported is here. In my configuration virtio-blk
and an assigned e1000e share an interrupt. virtio-blk is initializing
and apparently triggers an interrupt. The vfio-pci device is
configured for INTx though not active yet, but kvm_resample_fd_notify()
kicks the fd here, so we continue. If I remove the continue here both
devices seem to work, but I don't claim to understand the condition
we're trying to continue for here yet. This series needs more testing
with shared interrupts. Thanks,
Alex
On Mon, 9 Mar 2020 16:10:59 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:
> On Fri, 28 Feb 2020 11:15:02 -0500
> Peter Xu <peterx@redhat.com> wrote:
>
> > This is majorly only for X86 because that's the only one that supports
> > split irqchip for now.
> >
> > When the irqchip is split, we face a dilemma that KVM irqfd will be
> > enabled, however the slow irqchip is still running in the userspace.
> > It means that the resamplefd in the kernel irqfds won't take any
> > effect and it will miss to ack INTx interrupts on EOIs.
> >
> > One example is split irqchip with VFIO INTx, which will break if we
> > use the VFIO INTx fast path.
> >
> > This patch can potentially supports the VFIO fast path again for INTx,
> > that the IRQ delivery will still use the fast path, while we don't
> > need to trap MMIOs in QEMU for the device to emulate the EIOs (see the
> > callers of vfio_eoi() hook). However the EOI of the INTx will still
> > need to be done from the userspace by caching all the resamplefds in
> > QEMU and kick properly for IOAPIC EOI broadcast.
> >
> > This is tricky because in this case the userspace ioapic irr &
> > remote-irr will be bypassed. However such a change will greatly boost
> > performance for assigned devices using INTx irqs (TCP_RR boosts 46%
> > after this patch applied).
> >
> > When the userspace is responsible for the resamplefd kickup, don't
> > register it on the kvm_irqfd anymore, because on newer kernels (after
> > commit 654f1f13ea56, 5.2+) the KVM_IRQFD will fail if with both split
> > irqchip and resamplefd. This will make sure that the fast path will
> > work for all supported kernels.
> >
> > https://patchwork.kernel.org/patch/10738541/#22609933
> >
> > Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> > accel/kvm/kvm-all.c | 85 +++++++++++++++++++++++++++++++++++++++++-
> > accel/kvm/trace-events | 1 +
> > hw/intc/ioapic.c | 23 +++++++++++-
> > include/sysemu/kvm.h | 7 ++++
> > 4 files changed, 112 insertions(+), 4 deletions(-)
> >
> > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> > index d49b74512a..89771ea114 100644
> > --- a/accel/kvm/kvm-all.c
> > +++ b/accel/kvm/kvm-all.c
> > @@ -159,9 +159,65 @@ static const KVMCapabilityInfo kvm_required_capabilites[] = {
> > static NotifierList kvm_irqchip_change_notifiers =
> > NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
> >
> > +struct KVMResampleFd {
> > + int gsi;
> > + EventNotifier *resample_event;
> > + QLIST_ENTRY(KVMResampleFd) node;
> > +};
> > +typedef struct KVMResampleFd KVMResampleFd;
> > +
> > +/*
> > + * Only used with split irqchip where we need to do the resample fd
> > + * kick for the kernel from userspace.
> > + */
> > +static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
> > + QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
> > +
> > #define kvm_slots_lock(kml) qemu_mutex_lock(&(kml)->slots_lock)
> > #define kvm_slots_unlock(kml) qemu_mutex_unlock(&(kml)->slots_lock)
> >
> > +static inline void kvm_resample_fd_remove(int gsi)
> > +{
> > + KVMResampleFd *rfd;
> > +
> > + QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
> > + if (rfd->gsi == gsi) {
> > + QLIST_REMOVE(rfd, node);
> > + g_free(rfd);
> > + break;
> > + }
> > + }
> > +}
> > +
> > +static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
> > +{
> > + KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
> > +
> > + rfd->gsi = gsi;
> > + rfd->resample_event = event;
> > +
> > + QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
> > +}
> > +
> > +bool kvm_resample_fd_notify(int gsi)
> > +{
> > + KVMResampleFd *rfd;
> > +
> > + if (!kvm_irqchip_is_split()) {
> > + return false;
> > + }
> > +
> > + QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
> > + if (rfd->gsi == gsi) {
> > + event_notifier_set(rfd->resample_event);
> > + trace_kvm_resample_fd_notify(gsi);
> > + return true;
> > + }
> > + }
> > +
> > + return false;
> > +}
> > +
> > int kvm_get_max_memslots(void)
> > {
> > KVMState *s = KVM_STATE(current_accel());
> [snip]
> > diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
> > index 15747fe2c2..13921b333d 100644
> > --- a/hw/intc/ioapic.c
> > +++ b/hw/intc/ioapic.c
> > @@ -236,8 +236,27 @@ void ioapic_eoi_broadcast(int vector)
> > for (n = 0; n < IOAPIC_NUM_PINS; n++) {
> > entry = s->ioredtbl[n];
> >
> > - if ((entry & IOAPIC_VECTOR_MASK) != vector ||
> > - ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) {
> > + if ((entry & IOAPIC_VECTOR_MASK) != vector) {
> > + continue;
> > + }
> > +
> > + /*
> > + * When IOAPIC is in the userspace while APIC is still in
> > + * the kernel (i.e., split irqchip), we have a trick to
> > + * kick the resamplefd logic for registered irqfds from
> > + * userspace to deactivate the IRQ. When that happens, it
> > + * means the irq bypassed userspace IOAPIC (so the irr and
> > + * remote-irr of the table entry should be bypassed too
> > + * even if interrupt come), then we don't need to clear
> > + * the remote-IRR and check irr again because they'll
> > + * always be zeros.
> > + */
> > + if (kvm_resample_fd_notify(n)) {
> > + continue;
> > + }
>
> It seems the problem I reported is here. In my configuration virtio-blk
> and an assigned e1000e share an interrupt. virtio-blk is initializing
> and apparently triggers an interrupt. The vfio-pci device is
> configured for INTx though not active yet, but kvm_resample_fd_notify()
> kicks the fd here, so we continue. If I remove the continue here both
> devices seem to work, but I don't claim to understand the condition
> we're trying to continue for here yet. This series needs more testing
> with shared interrupts. Thanks,
I'm also curious how this ended up between testing whether the vector
is masked and testing that it's level triggered. We shouldn't have any
edge triggered resamplers. I find however that if I move the resampler
notify to after the remote IRR test, my NIC gets starved of interrupts.
So empirically, it seems kvm_resample_fd_notify() should be a void
function called unconditionally between the original mask+level check
removed above and the IRR check below. Thanks,
Alex
On Mon, Mar 09, 2020 at 04:33:59PM -0600, Alex Williamson wrote:
[...]
> > > diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
> > > index 15747fe2c2..13921b333d 100644
> > > --- a/hw/intc/ioapic.c
> > > +++ b/hw/intc/ioapic.c
> > > @@ -236,8 +236,27 @@ void ioapic_eoi_broadcast(int vector)
> > > for (n = 0; n < IOAPIC_NUM_PINS; n++) {
> > > entry = s->ioredtbl[n];
> > >
> > > - if ((entry & IOAPIC_VECTOR_MASK) != vector ||
> > > - ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) {
> > > + if ((entry & IOAPIC_VECTOR_MASK) != vector) {
> > > + continue;
> > > + }
> > > +
> > > + /*
> > > + * When IOAPIC is in the userspace while APIC is still in
> > > + * the kernel (i.e., split irqchip), we have a trick to
> > > + * kick the resamplefd logic for registered irqfds from
> > > + * userspace to deactivate the IRQ. When that happens, it
> > > + * means the irq bypassed userspace IOAPIC (so the irr and
> > > + * remote-irr of the table entry should be bypassed too
> > > + * even if interrupt come), then we don't need to clear
> > > + * the remote-IRR and check irr again because they'll
> > > + * always be zeros.
> > > + */
> > > + if (kvm_resample_fd_notify(n)) {
> > > + continue;
> > > + }
> >
> > It seems the problem I reported is here. In my configuration virtio-blk
> > and an assigned e1000e share an interrupt. virtio-blk is initializing
> > and apparently triggers an interrupt. The vfio-pci device is
> > configured for INTx though not active yet, but kvm_resample_fd_notify()
> > kicks the fd here, so we continue. If I remove the continue here both
> > devices seem to work, but I don't claim to understand the condition
> > we're trying to continue for here yet. This series needs more testing
> > with shared interrupts. Thanks,
>
> I'm also curious how this ended up between testing whether the vector
> is masked and testing that it's level triggered. We shouldn't have any
> edge triggered resamplers.
We had a similar discussion in V1 (with Paolo):
https://patchwork.kernel.org/patch/11407441/#23190891
So my understanding is that VFIO will unmask the intx IRQ when it
comes, and register the resamplefd too, no matter whether it's level
triggered (at least from what the code does). Am I right?
> I find however that if I move the resampler
> notify to after the remote IRR test, my NIC gets starved of interrupts.
> So empirically, it seems kvm_resample_fd_notify() should be a void
> function called unconditionally between the original mask+level check
> removed above and the IRR check below. Thanks,
Yes IMHO we can't move that notify() to be after the remote IRR check
because the IRR and remote IRR will be completely bypassed for the
assigned device. In other words, even if the interrupt has arrived
for the assigned device, both IRR and remote IRR should always be zero
(assuming the virtio-blk device doesn't generate any IRQ). So we
probably still need to do the notify even if remote-irr is not set.
Thanks,
--
Peter Xu
On Mon, 9 Mar 2020 20:38:08 -0400
Peter Xu <peterx@redhat.com> wrote:
> On Mon, Mar 09, 2020 at 04:33:59PM -0600, Alex Williamson wrote:
>
> [...]
>
> > > > diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
> > > > index 15747fe2c2..13921b333d 100644
> > > > --- a/hw/intc/ioapic.c
> > > > +++ b/hw/intc/ioapic.c
> > > > @@ -236,8 +236,27 @@ void ioapic_eoi_broadcast(int vector)
> > > > for (n = 0; n < IOAPIC_NUM_PINS; n++) {
> > > > entry = s->ioredtbl[n];
> > > >
> > > > - if ((entry & IOAPIC_VECTOR_MASK) != vector ||
> > > > - ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) {
> > > > + if ((entry & IOAPIC_VECTOR_MASK) != vector) {
> > > > + continue;
> > > > + }
> > > > +
> > > > + /*
> > > > + * When IOAPIC is in the userspace while APIC is still in
> > > > + * the kernel (i.e., split irqchip), we have a trick to
> > > > + * kick the resamplefd logic for registered irqfds from
> > > > + * userspace to deactivate the IRQ. When that happens, it
> > > > + * means the irq bypassed userspace IOAPIC (so the irr and
> > > > + * remote-irr of the table entry should be bypassed too
> > > > + * even if interrupt come), then we don't need to clear
> > > > + * the remote-IRR and check irr again because they'll
> > > > + * always be zeros.
> > > > + */
> > > > + if (kvm_resample_fd_notify(n)) {
> > > > + continue;
> > > > + }
> > >
> > > It seems the problem I reported is here. In my configuration virtio-blk
> > > and an assigned e1000e share an interrupt. virtio-blk is initializing
> > > and apparently triggers an interrupt. The vfio-pci device is
> > > configured for INTx though not active yet, but kvm_resample_fd_notify()
> > > kicks the fd here, so we continue. If I remove the continue here both
> > > devices seem to work, but I don't claim to understand the condition
> > > we're trying to continue for here yet. This series needs more testing
> > > with shared interrupts. Thanks,
> >
> > I'm also curious how this ended up between testing whether the vector
> > is masked and testing that it's level triggered. We shouldn't have any
> > edge triggered resamplers.
>
> We had a similar discussion in V1 (with Paolo):
>
> https://patchwork.kernel.org/patch/11407441/#23190891
>
> So my understanding is that VFIO will unmask the intx IRQ when it
> comes, and register the resamplefd too, no matter whether it's level
> triggered (at least from what the code does). Am I right?
As Paolo replied in your previous discussion, INTx is always level
triggered.
> > I find however that if I move the resampler
> > notify to after the remote IRR test, my NIC gets starved of interrupts.
> > So empirically, it seems kvm_resample_fd_notify() should be a void
> > function called unconditionally between the original mask+level check
> > removed above and the IRR check below. Thanks,
>
> Yes IMHO we can't move that notify() to be after the remote IRR check
> because the IRR and remote IRR will be completely bypassed for the
> assigned device. In other words, even if the interrupt has arrived
> for the assigned device, both IRR and remote IRR should always be zero
> (assuming the virtio-blk device doesn't generate any IRQ). So we
> probably still need to do the notify even if remote-irr is not set.
Yep. Thanks,
Alex
On Mon, Mar 09, 2020 at 04:10:59PM -0600, Alex Williamson wrote:
> On Fri, 28 Feb 2020 11:15:02 -0500
> Peter Xu <peterx@redhat.com> wrote:
>
> > This is majorly only for X86 because that's the only one that supports
> > split irqchip for now.
> >
> > When the irqchip is split, we face a dilemma that KVM irqfd will be
> > enabled, however the slow irqchip is still running in the userspace.
> > It means that the resamplefd in the kernel irqfds won't take any
> > effect and it will miss to ack INTx interrupts on EOIs.
> >
> > One example is split irqchip with VFIO INTx, which will break if we
> > use the VFIO INTx fast path.
> >
> > This patch can potentially supports the VFIO fast path again for INTx,
> > that the IRQ delivery will still use the fast path, while we don't
> > need to trap MMIOs in QEMU for the device to emulate the EIOs (see the
> > callers of vfio_eoi() hook). However the EOI of the INTx will still
> > need to be done from the userspace by caching all the resamplefds in
> > QEMU and kick properly for IOAPIC EOI broadcast.
> >
> > This is tricky because in this case the userspace ioapic irr &
> > remote-irr will be bypassed. However such a change will greatly boost
> > performance for assigned devices using INTx irqs (TCP_RR boosts 46%
> > after this patch applied).
> >
> > When the userspace is responsible for the resamplefd kickup, don't
> > register it on the kvm_irqfd anymore, because on newer kernels (after
> > commit 654f1f13ea56, 5.2+) the KVM_IRQFD will fail if with both split
> > irqchip and resamplefd. This will make sure that the fast path will
> > work for all supported kernels.
> >
> > https://patchwork.kernel.org/patch/10738541/#22609933
> >
> > Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> > accel/kvm/kvm-all.c | 85 +++++++++++++++++++++++++++++++++++++++++-
> > accel/kvm/trace-events | 1 +
> > hw/intc/ioapic.c | 23 +++++++++++-
> > include/sysemu/kvm.h | 7 ++++
> > 4 files changed, 112 insertions(+), 4 deletions(-)
> >
> > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> > index d49b74512a..89771ea114 100644
> > --- a/accel/kvm/kvm-all.c
> > +++ b/accel/kvm/kvm-all.c
> > @@ -159,9 +159,65 @@ static const KVMCapabilityInfo kvm_required_capabilites[] = {
> > static NotifierList kvm_irqchip_change_notifiers =
> > NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
> >
> > +struct KVMResampleFd {
> > + int gsi;
> > + EventNotifier *resample_event;
> > + QLIST_ENTRY(KVMResampleFd) node;
> > +};
> > +typedef struct KVMResampleFd KVMResampleFd;
> > +
> > +/*
> > + * Only used with split irqchip where we need to do the resample fd
> > + * kick for the kernel from userspace.
> > + */
> > +static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
> > + QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
> > +
> > #define kvm_slots_lock(kml) qemu_mutex_lock(&(kml)->slots_lock)
> > #define kvm_slots_unlock(kml) qemu_mutex_unlock(&(kml)->slots_lock)
> >
> > +static inline void kvm_resample_fd_remove(int gsi)
> > +{
> > + KVMResampleFd *rfd;
> > +
> > + QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
> > + if (rfd->gsi == gsi) {
> > + QLIST_REMOVE(rfd, node);
> > + g_free(rfd);
> > + break;
> > + }
> > + }
> > +}
> > +
> > +static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
> > +{
> > + KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
> > +
> > + rfd->gsi = gsi;
> > + rfd->resample_event = event;
> > +
> > + QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
> > +}
> > +
> > +bool kvm_resample_fd_notify(int gsi)
> > +{
> > + KVMResampleFd *rfd;
> > +
> > + if (!kvm_irqchip_is_split()) {
> > + return false;
> > + }
> > +
> > + QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
> > + if (rfd->gsi == gsi) {
> > + event_notifier_set(rfd->resample_event);
> > + trace_kvm_resample_fd_notify(gsi);
> > + return true;
> > + }
> > + }
> > +
> > + return false;
> > +}
> > +
> > int kvm_get_max_memslots(void)
> > {
> > KVMState *s = KVM_STATE(current_accel());
> [snip]
> > diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
> > index 15747fe2c2..13921b333d 100644
> > --- a/hw/intc/ioapic.c
> > +++ b/hw/intc/ioapic.c
> > @@ -236,8 +236,27 @@ void ioapic_eoi_broadcast(int vector)
> > for (n = 0; n < IOAPIC_NUM_PINS; n++) {
> > entry = s->ioredtbl[n];
> >
> > - if ((entry & IOAPIC_VECTOR_MASK) != vector ||
> > - ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) {
> > + if ((entry & IOAPIC_VECTOR_MASK) != vector) {
> > + continue;
> > + }
> > +
> > + /*
> > + * When IOAPIC is in the userspace while APIC is still in
> > + * the kernel (i.e., split irqchip), we have a trick to
> > + * kick the resamplefd logic for registered irqfds from
> > + * userspace to deactivate the IRQ. When that happens, it
> > + * means the irq bypassed userspace IOAPIC (so the irr and
> > + * remote-irr of the table entry should be bypassed too
> > + * even if interrupt come), then we don't need to clear
> > + * the remote-IRR and check irr again because they'll
> > + * always be zeros.
> > + */
> > + if (kvm_resample_fd_notify(n)) {
> > + continue;
> > + }
>
> It seems the problem I reported is here. In my configuration virtio-blk
> and an assigned e1000e share an interrupt. virtio-blk is initializing
> and apparently triggers an interrupt. The vfio-pci device is
> configured for INTx though not active yet, but kvm_resample_fd_notify()
> kicks the fd here, so we continue. If I remove the continue here both
> devices seem to work, but I don't claim to understand the condition
> we're trying to continue for here yet. This series needs more testing
> with shared interrupts. Thanks,
Hi, Alex,
The "continue" was there only because I wanted to skip updating remote
IRR and so on, considering that it won't be useful after all if the
userspace irqchip is bypassed here. However, I totally overlooked the
fact that this exact IRQ can be shared with another device whose
interrupts are not bypassed by the kernel.
Here's my guess on what should have happened...
- The first IRQ of the virtio-blk device will work: it will set the
IRR bit, and eventually it'll also set remote-irr of the IRQ, showing
that it's in service.
- When the virtio-blk IRQ is EOIed, the EOI path notices that the IRQ
is registered with a resamplefd (which is actually the NIC's rather
than the virtio-blk device's), so it skips the update to
remote-irr, assuming that it'll always be zero (but actually it has
just been set).
- When virtio-blk wants to send further IRQs (starting from the
2nd one), it will try to set irr again, but this time, since
remote-irr is still set, the interrupt is ignored because the IOAPIC
thinks the IRQ is still in service (while it's not). This
corresponds to where "coalesce==true" in ioapic_service():
if (coalesce) {
/* We are level triggered interrupts, and the
* guest should be still working on previous one,
* so skip it. */
continue;
}
With that, I think your proposed solution should be the right fix:
we should still run the whole IOAPIC EOI path even if the IRQ is
registered with a resamplefd, because the IRQ can be used by another device.
I'll also test this scenario tomorrow.
Thanks!
--
Peter Xu
On Fri, 28 Feb 2020 11:15:02 -0500
Peter Xu <peterx@redhat.com> wrote:
> This is majorly only for X86 because that's the only one that supports
> split irqchip for now.
>
> When the irqchip is split, we face a dilemma that KVM irqfd will be
> enabled, however the slow irqchip is still running in the userspace.
> It means that the resamplefd in the kernel irqfds won't take any
> effect and it will miss to ack INTx interrupts on EOIs.
>
> One example is split irqchip with VFIO INTx, which will break if we
> use the VFIO INTx fast path.
>
> This patch can potentially support the VFIO fast path again for INTx,
> that the IRQ delivery will still use the fast path, while we don't
> need to trap MMIOs in QEMU for the device to emulate the EOIs (see the
> callers of vfio_eoi() hook). However the EOI of the INTx will still
> need to be done from the userspace by caching all the resamplefds in
> QEMU and kick properly for IOAPIC EOI broadcast.
>
> This is tricky because in this case the userspace ioapic irr &
> remote-irr will be bypassed. However such a change will greatly boost
> performance for assigned devices using INTx irqs (TCP_RR boosts 46%
> after this patch applied).
>
> When the userspace is responsible for the resamplefd kickup, don't
> register it on the kvm_irqfd anymore, because on newer kernels (after
> commit 654f1f13ea56, 5.2+) the KVM_IRQFD will fail if used with both split
> irqchip and resamplefd. This will make sure that the fast path will
> work for all supported kernels.
>
> https://patchwork.kernel.org/patch/10738541/#22609933
>
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
> accel/kvm/kvm-all.c | 85 +++++++++++++++++++++++++++++++++++++++++-
> accel/kvm/trace-events | 1 +
> hw/intc/ioapic.c | 23 +++++++++++-
> include/sysemu/kvm.h | 7 ++++
> 4 files changed, 112 insertions(+), 4 deletions(-)
>
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index d49b74512a..89771ea114 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -159,9 +159,65 @@ static const KVMCapabilityInfo kvm_required_capabilites[] = {
> static NotifierList kvm_irqchip_change_notifiers =
> NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
>
> +struct KVMResampleFd {
> + int gsi;
> + EventNotifier *resample_event;
> + QLIST_ENTRY(KVMResampleFd) node;
> +};
> +typedef struct KVMResampleFd KVMResampleFd;
> +
> +/*
> + * Only used with split irqchip where we need to do the resample fd
> + * kick for the kernel from userspace.
> + */
> +static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
> + QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
> +
> #define kvm_slots_lock(kml) qemu_mutex_lock(&(kml)->slots_lock)
> #define kvm_slots_unlock(kml) qemu_mutex_unlock(&(kml)->slots_lock)
>
> +static inline void kvm_resample_fd_remove(int gsi)
> +{
> + KVMResampleFd *rfd;
> +
> + QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
> + if (rfd->gsi == gsi) {
> + QLIST_REMOVE(rfd, node);
> + g_free(rfd);
> + break;
> + }
> + }
> +}
> +
> +static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
> +{
> + KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
> +
> + rfd->gsi = gsi;
> + rfd->resample_event = event;
> +
> + QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
> +}
> +
> +bool kvm_resample_fd_notify(int gsi)
> +{
> + KVMResampleFd *rfd;
> +
> + if (!kvm_irqchip_is_split()) {
> + return false;
> + }
Nit, checking split irqchip here seems unnecessary. We're only adding
and removing list entries based on split irqchip below, so the list
would be empty anyway, unless another user comes along that might have
a reason for this functionality that isn't as tied to split irqchip.
Overall the series looks like a big improvement versus falling back to
our crappy generic EOI hackery with split irqchip. Thanks,
Alex
> +
> + QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
> + if (rfd->gsi == gsi) {
> + event_notifier_set(rfd->resample_event);
> + trace_kvm_resample_fd_notify(gsi);
> + return true;
> + }
> + }
> +
> + return false;
> +}
> +
> int kvm_get_max_memslots(void)
> {
> KVMState *s = KVM_STATE(current_accel());
> @@ -1642,8 +1698,33 @@ static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
> };
>
> if (rfd != -1) {
> - irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
> - irqfd.resamplefd = rfd;
> + assert(assign);
> + if (kvm_irqchip_is_split()) {
> + /*
> + * When the slow irqchip (e.g. IOAPIC) is in the
> + * userspace, KVM kernel resamplefd will not work because
> + * the EOI of the interrupt will be delivered to userspace
> + * instead, so the KVM kernel resamplefd kick will be
> + * skipped. The userspace here mimics what the kernel
> + * provides with resamplefd, remember the resamplefd and
> + * kick it when we receive EOI of this IRQ.
> + *
> + * This is hackery because IOAPIC is mostly bypassed
> + * (except EOI broadcasts) when irqfd is used. However
> + * this can bring much performance back for split irqchip
> + * with INTx IRQs (for VFIO, this gives 93% perf of the
> + * full fast path, which is 46% perf boost comparing to
> + * the INTx slow path).
> + */
> + kvm_resample_fd_insert(virq, resample);
> + } else {
> + irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
> + irqfd.resamplefd = rfd;
> + }
> + } else if (!assign) {
> + if (kvm_irqchip_is_split()) {
> + kvm_resample_fd_remove(virq);
> + }
> }
>
> if (!kvm_irqfds_enabled()) {
> diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
> index 4fb6e59d19..a68eb66534 100644
> --- a/accel/kvm/trace-events
> +++ b/accel/kvm/trace-events
> @@ -16,4 +16,5 @@ kvm_set_ioeventfd_mmio(int fd, uint64_t addr, uint32_t val, bool assign, uint32_
> kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint32_t val, bool assign, uint32_t size, bool datamatch) "fd: %d @0x%x val=0x%x assign: %d size: %d match: %d"
> kvm_set_user_memory(uint32_t slot, uint32_t flags, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, int ret) "Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " ret=%d"
> kvm_clear_dirty_log(uint32_t slot, uint64_t start, uint32_t size) "slot#%"PRId32" start 0x%"PRIx64" size 0x%"PRIx32
> +kvm_resample_fd_notify(int gsi) "gsi %d"
>
> diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
> index 15747fe2c2..13921b333d 100644
> --- a/hw/intc/ioapic.c
> +++ b/hw/intc/ioapic.c
> @@ -236,8 +236,27 @@ void ioapic_eoi_broadcast(int vector)
> for (n = 0; n < IOAPIC_NUM_PINS; n++) {
> entry = s->ioredtbl[n];
>
> - if ((entry & IOAPIC_VECTOR_MASK) != vector ||
> - ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) {
> + if ((entry & IOAPIC_VECTOR_MASK) != vector) {
> + continue;
> + }
> +
> + /*
> + * When IOAPIC is in the userspace while APIC is still in
> + * the kernel (i.e., split irqchip), we have a trick to
> + * kick the resamplefd logic for registered irqfds from
> + * userspace to deactivate the IRQ. When that happens, it
> + * means the irq bypassed userspace IOAPIC (so the irr and
> + * remote-irr of the table entry should be bypassed too
> + * even if interrupt come), then we don't need to clear
> + * the remote-IRR and check irr again because they'll
> + * always be zeros.
> + */
> + if (kvm_resample_fd_notify(n)) {
> + continue;
> + }
> +
> + if (((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) !=
> + IOAPIC_TRIGGER_LEVEL) {
> continue;
> }
>
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 141342de98..3f0830cc4f 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -555,4 +555,11 @@ int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source);
> int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target);
> struct ppc_radix_page_info *kvm_get_radix_page_info(void);
> int kvm_get_max_memslots(void);
> +
> +/*
> + * Notify resamplefd for EOI of specific interrupts. Returns true
> + * when one resamplefd is notified, false if no such IRQ found.
> + */
> +bool kvm_resample_fd_notify(int gsi);
> +
> #endif
On Thu, Mar 05, 2020 at 04:58:57PM -0700, Alex Williamson wrote:
Hi, Alex,
[...]
> > +bool kvm_resample_fd_notify(int gsi)
> > +{
> > + KVMResampleFd *rfd;
> > +
> > + if (!kvm_irqchip_is_split()) {
> > + return false;
> > + }
>
> Nit, checking split irqchip here seems unnecessary. We're only adding
> and removing list entries based on split irqchip below, so the list
> would be empty anyway, unless another user comes along that might have
> a reason for this functionality that isn't as tied to split irqchip.
Right, now it's more or less a hint to readers, and we can remove it.
I'll see whether I'll repost a new version, and I'll drop it if so.
>
> Overall the series looks like a big improvement versus falling back to
> our crappy generic EOI hackery with split irqchip. Thanks,
Yes I was pretty happy to see the numbers too when I first tested the
series, after all I was still uncertain about how much overhead the
userspace EOI would take on the irq return path. It turns out that
the injection seems to be more important.
In all cases, major credits go to Paolo for the idea. :)
Thanks,
--
Peter Xu
On Thu, 5 Mar 2020 19:43:24 -0500
Peter Xu <peterx@redhat.com> wrote:
> On Thu, Mar 05, 2020 at 04:58:57PM -0700, Alex Williamson wrote:
>
> Hi, Alex,
>
> [...]
>
> > > +bool kvm_resample_fd_notify(int gsi)
> > > +{
> > > + KVMResampleFd *rfd;
> > > +
> > > + if (!kvm_irqchip_is_split()) {
> > > + return false;
> > > + }
> >
> > Nit, checking split irqchip here seems unnecessary. We're only adding
> > and removing list entries based on split irqchip below, so the list
> > would be empty anyway, unless another user comes along that might have
> > a reason for this functionality that isn't as tied to split irqchip.
>
> Right, now it's more or less a hint to readers, and we can remove it.
> I'll see whether I'll repost a new version, and I'll drop it if so.
>
> >
> > Overall the series looks like a big improvement versus falling back to
> > our crappy generic EOI hackery with split irqchip. Thanks,
>
> Yes I was pretty happy to see the numbers too when I first tested the
> series, after all I was still uncertain about how much overhead the
> userspace EOI would take on the irq return path. It turns out that
> the injection seems to be more important.
>
> In all cases, major credits go to Paolo for the idea. :)
Hey Peter, I'm trying to test this myself and my VM just hangs as soon
as I enable split irqchip. It boots up to discovering the virtio
disks, then nothing more. My host kernel is 5.3.7-301.fc31.x86_64,
QEMU is 373c7068dd61 + this patch series. VM script is:
/usr/local/bin/qemu-system-x86_64 \
-S \
-machine pc-q35-5.0,accel=kvm,usb=off,vmport=off,dump-guest-core=off,kernel-irqchip=split \
-cpu host \
-m 2048 \
-smp 2,sockets=2,cores=1,threads=1 \
-no-user-config \
-nodefaults \
-monitor stdio \
-serial none \
-parallel none \
-no-hpet \
-device pcie-root-port,port=0x10,chassis=1,id=pci.1,bus=pcie.0,multifunction=on,addr=0x2 \
-device pcie-root-port,port=0x11,chassis=2,id=pci.2,bus=pcie.0,addr=0x2.0x1 \
-device pcie-root-port,port=0x12,chassis=3,id=pci.3,bus=pcie.0,addr=0x2.0x2 \
-device pcie-root-port,port=0x13,chassis=4,id=pci.4,bus=pcie.0,addr=0x2.0x3 \
-device pcie-root-port,port=0x14,chassis=5,id=pci.5,bus=pcie.0,addr=0x2.0x4 \
-device pcie-root-port,port=0x15,chassis=6,id=pci.6,bus=pcie.0,addr=0x2.0x5 \
-drive file=/var/lib/libvirt/images/fedora31-1.qcow2,format=qcow2,if=none,id=drive-virtio-disk0 \
-device virtio-blk-pci,scsi=off,bus=pci.3,addr=0x0,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 \
-vnc :0 \
-device VGA,id=video0,vgamem_mb=16,bus=pcie.0,addr=0x1 \
-device vfio-pci,host=02:00.0,id=e1000e,bus=pci.2,addr=0x0
Guest has pci=nomsi on the kernel command line. It boots with
irqchip=on, also boots with x-no-kvm-intx=on as an arg to the vfio-pci
device. I'm afraid there's a regression here unless I'm failing to add
something necessary for split irqchip. Thanks,
Alex
Hi Peter,
On 2/28/20 5:15 PM, Peter Xu wrote:
> This is majorly only for X86 because that's the only one that supports
> split irqchip for now.
>
> When the irqchip is split, we face a dilemma that KVM irqfd will be
> enabled, however the slow irqchip is still running in the userspace.
> It means that the resamplefd in the kernel irqfds won't take any
> effect and it will miss to ack INTx interrupts on EOIs.
>
> One example is split irqchip with VFIO INTx, which will break if we
> use the VFIO INTx fast path.
>
> This patch can potentially support the VFIO fast path again for INTx,
> that the IRQ delivery will still use the fast path, while we don't
> need to trap MMIOs in QEMU for the device to emulate the EOIs (see the
> callers of vfio_eoi() hook). However the EOI of the INTx will still
> need to be done from the userspace by caching all the resamplefds in
> QEMU and kick properly for IOAPIC EOI broadcast.
>
> This is tricky because in this case the userspace ioapic irr &
> remote-irr will be bypassed. However such a change will greatly boost
> performance for assigned devices using INTx irqs (TCP_RR boosts 46%
> after this patch applied).
>
> When the userspace is responsible for the resamplefd kickup, don't
> register it on the kvm_irqfd anymore, because on newer kernels (after
> commit 654f1f13ea56, 5.2+) the KVM_IRQFD will fail if used with both split
> irqchip and resamplefd. This will make sure that the fast path will
> work for all supported kernels.
>
> https://patchwork.kernel.org/patch/10738541/#22609933
>
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Thanks
Eric
> ---
> accel/kvm/kvm-all.c | 85 +++++++++++++++++++++++++++++++++++++++++-
> accel/kvm/trace-events | 1 +
> hw/intc/ioapic.c | 23 +++++++++++-
> include/sysemu/kvm.h | 7 ++++
> 4 files changed, 112 insertions(+), 4 deletions(-)
>
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index d49b74512a..89771ea114 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -159,9 +159,65 @@ static const KVMCapabilityInfo kvm_required_capabilites[] = {
> static NotifierList kvm_irqchip_change_notifiers =
> NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
>
> +struct KVMResampleFd {
> + int gsi;
> + EventNotifier *resample_event;
> + QLIST_ENTRY(KVMResampleFd) node;
> +};
> +typedef struct KVMResampleFd KVMResampleFd;
> +
> +/*
> + * Only used with split irqchip where we need to do the resample fd
> + * kick for the kernel from userspace.
> + */
> +static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
> + QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
> +
> #define kvm_slots_lock(kml) qemu_mutex_lock(&(kml)->slots_lock)
> #define kvm_slots_unlock(kml) qemu_mutex_unlock(&(kml)->slots_lock)
>
> +static inline void kvm_resample_fd_remove(int gsi)
> +{
> + KVMResampleFd *rfd;
> +
> + QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
> + if (rfd->gsi == gsi) {
> + QLIST_REMOVE(rfd, node);
> + g_free(rfd);
> + break;
> + }
> + }
> +}
> +
> +static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
> +{
> + KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
> +
> + rfd->gsi = gsi;
> + rfd->resample_event = event;
> +
> + QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
> +}
> +
> +bool kvm_resample_fd_notify(int gsi)
> +{
> + KVMResampleFd *rfd;
> +
> + if (!kvm_irqchip_is_split()) {
> + return false;
> + }
> +
> + QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
> + if (rfd->gsi == gsi) {
> + event_notifier_set(rfd->resample_event);
> + trace_kvm_resample_fd_notify(gsi);
> + return true;
> + }
> + }
> +
> + return false;
> +}
> +
> int kvm_get_max_memslots(void)
> {
> KVMState *s = KVM_STATE(current_accel());
> @@ -1642,8 +1698,33 @@ static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
> };
>
> if (rfd != -1) {
> - irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
> - irqfd.resamplefd = rfd;
> + assert(assign);
> + if (kvm_irqchip_is_split()) {
> + /*
> + * When the slow irqchip (e.g. IOAPIC) is in the
> + * userspace, KVM kernel resamplefd will not work because
> + * the EOI of the interrupt will be delivered to userspace
> + * instead, so the KVM kernel resamplefd kick will be
> + * skipped. The userspace here mimics what the kernel
> + * provides with resamplefd, remember the resamplefd and
> + * kick it when we receive EOI of this IRQ.
> + *
> + * This is hackery because IOAPIC is mostly bypassed
> + * (except EOI broadcasts) when irqfd is used. However
> + * this can bring much performance back for split irqchip
> + * with INTx IRQs (for VFIO, this gives 93% perf of the
> + * full fast path, which is 46% perf boost comparing to
> + * the INTx slow path).
> + */
> + kvm_resample_fd_insert(virq, resample);
> + } else {
> + irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
> + irqfd.resamplefd = rfd;
> + }
> + } else if (!assign) {
> + if (kvm_irqchip_is_split()) {
> + kvm_resample_fd_remove(virq);
> + }
> }
>
> if (!kvm_irqfds_enabled()) {
> diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
> index 4fb6e59d19..a68eb66534 100644
> --- a/accel/kvm/trace-events
> +++ b/accel/kvm/trace-events
> @@ -16,4 +16,5 @@ kvm_set_ioeventfd_mmio(int fd, uint64_t addr, uint32_t val, bool assign, uint32_
> kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint32_t val, bool assign, uint32_t size, bool datamatch) "fd: %d @0x%x val=0x%x assign: %d size: %d match: %d"
> kvm_set_user_memory(uint32_t slot, uint32_t flags, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, int ret) "Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " ret=%d"
> kvm_clear_dirty_log(uint32_t slot, uint64_t start, uint32_t size) "slot#%"PRId32" start 0x%"PRIx64" size 0x%"PRIx32
> +kvm_resample_fd_notify(int gsi) "gsi %d"
>
> diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
> index 15747fe2c2..13921b333d 100644
> --- a/hw/intc/ioapic.c
> +++ b/hw/intc/ioapic.c
> @@ -236,8 +236,27 @@ void ioapic_eoi_broadcast(int vector)
> for (n = 0; n < IOAPIC_NUM_PINS; n++) {
> entry = s->ioredtbl[n];
>
> - if ((entry & IOAPIC_VECTOR_MASK) != vector ||
> - ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) {
> + if ((entry & IOAPIC_VECTOR_MASK) != vector) {
> + continue;
> + }
> +
> + /*
> + * When IOAPIC is in the userspace while APIC is still in
> + * the kernel (i.e., split irqchip), we have a trick to
> + * kick the resamplefd logic for registered irqfds from
> + * userspace to deactivate the IRQ. When that happens, it
> + * means the irq bypassed userspace IOAPIC (so the irr and
> + * remote-irr of the table entry should be bypassed too
> + * even if interrupt come), then we don't need to clear
> + * the remote-IRR and check irr again because they'll
> + * always be zeros.
> + */
> + if (kvm_resample_fd_notify(n)) {
> + continue;
> + }
> +
> + if (((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) !=
> + IOAPIC_TRIGGER_LEVEL) {
> continue;
> }
>
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 141342de98..3f0830cc4f 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -555,4 +555,11 @@ int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source);
> int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target);
> struct ppc_radix_page_info *kvm_get_radix_page_info(void);
> int kvm_get_max_memslots(void);
> +
> +/*
> + * Notify resamplefd for EOI of specific interrupts. Returns true
> + * when one resamplefd is notified, false if no such IRQ found.
> + */
> +bool kvm_resample_fd_notify(int gsi);
> +
> #endif
>
© 2016 - 2025 Red Hat, Inc.