hw/intc/ioapic.c | 57 ++++++++++++++++++++++++++++--- hw/intc/trace-events | 1 + include/hw/i386/ioapic_internal.h | 3 ++ 3 files changed, 56 insertions(+), 5 deletions(-)
It was found that Hyper-V 2016 on KVM in some configurations (q35 machine +
piix4-usb-uhci) hangs on boot. Root-cause was that one of Hyper-V
level-triggered interrupt handler performs EOI before fixing the cause of
the interrupt. This results in IOAPIC keep re-raising the level-triggered
interrupt after EOI because irq-line remains asserted.
Gory details: https://www.spinics.net/lists/kvm/msg184484.html
(the whole thread).
Turns out we were dealing with similar issues before; in-kernel IOAPIC
implementation has commit 184564efae4d ("kvm: ioapic: conditionally delay
irq delivery duringeoi broadcast") which describes a very similar issue.
Steal the idea from the above mentioned commit for IOAPIC implementation in
QEMU. SUCCESSIVE_IRQ_MAX_COUNT, delay and the comment are borrowed as well.
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
Changes since v1:
- timer_mod() -> timer_mod_anticipate() [Paolo Bonzini]
- Massaged changelog [Liran Alon]
- Make implementation look like in-kernel one [Liran Alon]
---
hw/intc/ioapic.c | 57 ++++++++++++++++++++++++++++---
hw/intc/trace-events | 1 +
include/hw/i386/ioapic_internal.h | 3 ++
3 files changed, 56 insertions(+), 5 deletions(-)
diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index 9d75f84d3b..9fb8dd3450 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -139,6 +139,15 @@ static void ioapic_service(IOAPICCommonState *s)
}
}
+#define SUCCESSIVE_IRQ_MAX_COUNT 10000
+
+static void ioapic_timer(void *opaque)
+{
+ IOAPICCommonState *s = opaque;
+
+ ioapic_service(s);
+}
+
static void ioapic_set_irq(void *opaque, int vector, int level)
{
IOAPICCommonState *s = opaque;
@@ -222,13 +231,40 @@ void ioapic_eoi_broadcast(int vector)
}
for (n = 0; n < IOAPIC_NUM_PINS; n++) {
entry = s->ioredtbl[n];
- if ((entry & IOAPIC_LVT_REMOTE_IRR)
- && (entry & IOAPIC_VECTOR_MASK) == vector) {
- trace_ioapic_clear_remote_irr(n, vector);
- s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR;
- if (!(entry & IOAPIC_LVT_MASKED) && (s->irr & (1 << n))) {
+
+ if (((entry & IOAPIC_VECTOR_MASK) != vector) ||
+ !(entry & IOAPIC_LVT_REMOTE_IRR)) {
+ continue;
+ }
+
+ trace_ioapic_clear_remote_irr(n, vector);
+ s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR;
+
+ if (((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) !=
+ IOAPIC_TRIGGER_LEVEL) {
+ continue;
+ }
+
+ if (!(entry & IOAPIC_LVT_MASKED) && (s->irr & (1 << n))) {
+ ++s->irq_eoi[vector];
+ if (s->irq_eoi[vector] >= SUCCESSIVE_IRQ_MAX_COUNT) {
+ /*
+ * Real hardware does not deliver the interrupt immediately
+ * during eoi broadcast, and this lets a buggy guest make
+ * slow progress even if it does not correctly handle a
+ * level-triggered interrupt. Emulate this behavior if we
+ * detect an interrupt storm.
+ */
+ s->irq_eoi[vector] = 0;
+ timer_mod_anticipate(s->timer,
+ qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
+ NANOSECONDS_PER_SECOND / 100);
+ trace_ioapic_eoi_delayed_reassert(vector);
+ } else {
ioapic_service(s);
}
+ } else {
+ s->irq_eoi[vector] = 0;
}
}
}
@@ -401,6 +437,8 @@ static void ioapic_realize(DeviceState *dev, Error **errp)
memory_region_init_io(&s->io_memory, OBJECT(s), &ioapic_io_ops, s,
"ioapic", 0x1000);
+ s->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, ioapic_timer, s);
+
qdev_init_gpio_in(dev, ioapic_set_irq, IOAPIC_NUM_PINS);
ioapics[ioapic_no] = s;
@@ -408,6 +446,14 @@ static void ioapic_realize(DeviceState *dev, Error **errp)
qemu_add_machine_init_done_notifier(&s->machine_done);
}
+static void ioapic_unrealize(DeviceState *dev, Error **errp)
+{
+ IOAPICCommonState *s = IOAPIC_COMMON(dev);
+
+ timer_del(s->timer);
+ timer_free(s->timer);
+}
+
static Property ioapic_properties[] = {
DEFINE_PROP_UINT8("version", IOAPICCommonState, version, IOAPIC_VER_DEF),
DEFINE_PROP_END_OF_LIST(),
@@ -419,6 +465,7 @@ static void ioapic_class_init(ObjectClass *klass, void *data)
DeviceClass *dc = DEVICE_CLASS(klass);
k->realize = ioapic_realize;
+ k->unrealize = ioapic_unrealize;
/*
* If APIC is in kernel, we need to update the kernel cache after
* migration, otherwise first 24 gsi routes will be invalid.
diff --git a/hw/intc/trace-events b/hw/intc/trace-events
index a28bdce925..90c9d07c1a 100644
--- a/hw/intc/trace-events
+++ b/hw/intc/trace-events
@@ -25,6 +25,7 @@ apic_mem_writel(uint64_t addr, uint32_t val) "0x%"PRIx64" = 0x%08x"
ioapic_set_remote_irr(int n) "set remote irr for pin %d"
ioapic_clear_remote_irr(int n, int vector) "clear remote irr for pin %d vector %d"
ioapic_eoi_broadcast(int vector) "EOI broadcast for vector %d"
+ioapic_eoi_delayed_reassert(int vector) "delayed reassert on EOI broadcast for vector %d"
ioapic_mem_read(uint8_t addr, uint8_t regsel, uint8_t size, uint32_t val) "ioapic mem read addr 0x%"PRIx8" regsel: 0x%"PRIx8" size 0x%"PRIx8" retval 0x%"PRIx32
ioapic_mem_write(uint8_t addr, uint8_t regsel, uint8_t size, uint32_t val) "ioapic mem write addr 0x%"PRIx8" regsel: 0x%"PRIx8" size 0x%"PRIx8" val 0x%"PRIx32
ioapic_set_irq(int vector, int level) "vector: %d level: %d"
diff --git a/include/hw/i386/ioapic_internal.h b/include/hw/i386/ioapic_internal.h
index 9848f391bb..70f9fc750a 100644
--- a/include/hw/i386/ioapic_internal.h
+++ b/include/hw/i386/ioapic_internal.h
@@ -96,6 +96,7 @@ typedef struct IOAPICCommonClass {
SysBusDeviceClass parent_class;
DeviceRealize realize;
+ DeviceUnrealize unrealize;
void (*pre_save)(IOAPICCommonState *s);
void (*post_load)(IOAPICCommonState *s);
} IOAPICCommonClass;
@@ -111,6 +112,8 @@ struct IOAPICCommonState {
uint8_t version;
uint64_t irq_count[IOAPIC_NUM_PINS];
int irq_level[IOAPIC_NUM_PINS];
+ int irq_eoi[IOAPIC_NUM_PINS];
+ QEMUTimer *timer;
};
void ioapic_reset_common(DeviceState *dev);
--
2.20.1
> On 2 Apr 2019, at 11:02, Vitaly Kuznetsov <vkuznets@redhat.com> wrote: > > It was found that Hyper-V 2016 on KVM in some configurations (q35 machine + > piix4-usb-uhci) hangs on boot. Root-cause was that one of Hyper-V > level-triggered interrupt handler performs EOI before fixing the cause of > the interrupt. This results in IOAPIC keep re-raising the level-triggered > interrupt after EOI because irq-line remains asserted. > > Gory details: https://urldefense.proofpoint.com/v2/url?u=https-3A__www.spinics.net_lists_kvm_msg184484.html&d=DwIDAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0&m=GChegdKds--tabGxr5GuYEYOixhXJSy1NdM1z1HllzY&s=1eaLCEy8KZxs9hcxJGI2MMfmsL0jIRhem8QAeeBgehM&e= > (the whole thread). > > Turns out we were dealing with similar issues before; in-kernel IOAPIC > implementation has commit 184564efae4d ("kvm: ioapic: conditionally delay > irq delivery duringeoi broadcast") which describes a very similar issue. > > Steal the idea from the above mentioned commit for IOAPIC implementation in > QEMU. SUCCESSIVE_IRQ_MAX_COUNT, delay and the comment are borrowed as well. > > Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> I have only small minor refactoring comments inline. Reviewed-by: Liran Alon <liran.alon@oracle.com> -Liran > --- > Changes since v1: > - timer_mod() -> timer_mod_anticipate() [Paolo Bonzini] > - Massaged changelog [Liran Alon] > - Make implementation look like in-kernel one [Liran Alon] > --- > hw/intc/ioapic.c | 57 ++++++++++++++++++++++++++++--- > hw/intc/trace-events | 1 + > include/hw/i386/ioapic_internal.h | 3 ++ > 3 files changed, 56 insertions(+), 5 deletions(-) > > diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c > index 9d75f84d3b..9fb8dd3450 100644 > --- a/hw/intc/ioapic.c > +++ b/hw/intc/ioapic.c > @@ -139,6 +139,15 @@ static void ioapic_service(IOAPICCommonState *s) > } > } > > +#define SUCCESSIVE_IRQ_MAX_COUNT 10000 > + > +static void ioapic_timer(void *opaque) I suggest rename method to something such as “delayed_ioapic_service_timer_handler()”. > +{ > + IOAPICCommonState *s = opaque; > + > + ioapic_service(s); > +} > + > static void ioapic_set_irq(void *opaque, int vector, int level) > { > IOAPICCommonState *s = opaque; > @@ -222,13 +231,40 @@ void ioapic_eoi_broadcast(int vector) > } > for (n = 0; n < IOAPIC_NUM_PINS; n++) { > entry = s->ioredtbl[n]; > - if ((entry & IOAPIC_LVT_REMOTE_IRR) > - && (entry & IOAPIC_VECTOR_MASK) == vector) { > - trace_ioapic_clear_remote_irr(n, vector); > - s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR; > - if (!(entry & IOAPIC_LVT_MASKED) && (s->irr & (1 << n))) { > + > + if (((entry & IOAPIC_VECTOR_MASK) != vector) || > + !(entry & IOAPIC_LVT_REMOTE_IRR)) { > + continue; > + } > + > + trace_ioapic_clear_remote_irr(n, vector); > + s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR; remote-irr is only set for level-triggered interrupt. Thus, I think in the “if” above you should “continue;” in case trigger-mode != IOAPIC_TRIGGER_LEVEL. i.e. Replace condition "!(entry & IOAPIC_LVT_REMOTE_IRR)” with “trigger-mode != IOAPIC_TRIGGER_LEVEL”. Then you can also remove the “if” below. > + > + if (((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != > + IOAPIC_TRIGGER_LEVEL) { > + continue; > + } > + > + if (!(entry & IOAPIC_LVT_MASKED) && (s->irr & (1 << n))) { > + ++s->irq_eoi[vector]; > + if (s->irq_eoi[vector] >= SUCCESSIVE_IRQ_MAX_COUNT) { This can just be “==“ as you should never reach beyond SUCCESSIVE_IRQ_MAX_COUNT. But you can remain with “>=“ if you want to be on the safe side? > + /* > + * Real hardware does not deliver the interrupt immediately > + * during eoi broadcast, and this lets a buggy guest make > + * slow progress even if it does not correctly handle a > + * level-triggered interrupt. Emulate this behavior if we > + * detect an interrupt storm. > + */ > + s->irq_eoi[vector] = 0; > + timer_mod_anticipate(s->timer, > + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + > + NANOSECONDS_PER_SECOND / 100); > + trace_ioapic_eoi_delayed_reassert(vector); > + } else { > ioapic_service(s); > } > + } else { > + s->irq_eoi[vector] = 0; > } > } > } > @@ -401,6 +437,8 @@ static void ioapic_realize(DeviceState *dev, Error **errp) > memory_region_init_io(&s->io_memory, OBJECT(s), &ioapic_io_ops, s, > "ioapic", 0x1000); > > + s->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, ioapic_timer, s); > + > qdev_init_gpio_in(dev, ioapic_set_irq, IOAPIC_NUM_PINS); > > ioapics[ioapic_no] = s; > @@ -408,6 +446,14 @@ static void ioapic_realize(DeviceState *dev, Error **errp) > qemu_add_machine_init_done_notifier(&s->machine_done); > } > > +static void ioapic_unrealize(DeviceState *dev, Error **errp) > +{ > + IOAPICCommonState *s = IOAPIC_COMMON(dev); > + > + timer_del(s->timer); > + timer_free(s->timer); > +} > + > static Property ioapic_properties[] = { > DEFINE_PROP_UINT8("version", IOAPICCommonState, version, IOAPIC_VER_DEF), > DEFINE_PROP_END_OF_LIST(), > @@ -419,6 +465,7 @@ static void ioapic_class_init(ObjectClass *klass, void *data) > DeviceClass *dc = DEVICE_CLASS(klass); > > k->realize = ioapic_realize; > + k->unrealize = ioapic_unrealize; > /* > * If APIC is in kernel, we need to update the kernel cache after > * migration, otherwise first 24 gsi routes will be invalid. > diff --git a/hw/intc/trace-events b/hw/intc/trace-events > index a28bdce925..90c9d07c1a 100644 > --- a/hw/intc/trace-events > +++ b/hw/intc/trace-events > @@ -25,6 +25,7 @@ apic_mem_writel(uint64_t addr, uint32_t val) "0x%"PRIx64" = 0x%08x" > ioapic_set_remote_irr(int n) "set remote irr for pin %d" > ioapic_clear_remote_irr(int n, int vector) "clear remote irr for pin %d vector %d" > ioapic_eoi_broadcast(int vector) "EOI broadcast for vector %d" > +ioapic_eoi_delayed_reassert(int vector) "delayed reassert on EOI broadcast for vector %d" > ioapic_mem_read(uint8_t addr, uint8_t regsel, uint8_t size, uint32_t val) "ioapic mem read addr 0x%"PRIx8" regsel: 0x%"PRIx8" size 0x%"PRIx8" retval 0x%"PRIx32 > ioapic_mem_write(uint8_t addr, uint8_t regsel, uint8_t size, uint32_t val) "ioapic mem write addr 0x%"PRIx8" regsel: 0x%"PRIx8" size 0x%"PRIx8" val 0x%"PRIx32 > ioapic_set_irq(int vector, int level) "vector: %d level: %d" > diff --git a/include/hw/i386/ioapic_internal.h b/include/hw/i386/ioapic_internal.h > index 9848f391bb..70f9fc750a 100644 > --- a/include/hw/i386/ioapic_internal.h > +++ b/include/hw/i386/ioapic_internal.h > @@ -96,6 +96,7 @@ typedef struct IOAPICCommonClass { > SysBusDeviceClass parent_class; > > DeviceRealize realize; > + DeviceUnrealize unrealize; > void (*pre_save)(IOAPICCommonState *s); > void (*post_load)(IOAPICCommonState *s); > } IOAPICCommonClass; > @@ -111,6 +112,8 @@ struct IOAPICCommonState { > uint8_t version; > uint64_t irq_count[IOAPIC_NUM_PINS]; > int irq_level[IOAPIC_NUM_PINS]; > + int irq_eoi[IOAPIC_NUM_PINS]; > + QEMUTimer *timer; I suggest rename “timer” to something more indicative as one can confuse it with a generic IOAPIC timer. Something such as “delayed_ioapic_service_timer”. > }; > > void ioapic_reset_common(DeviceState *dev); > -- > 2.20.1 >
On 02/04/19 10:23, Liran Alon wrote: >> +#define SUCCESSIVE_IRQ_MAX_COUNT 10000 >> + >> +static void ioapic_timer(void *opaque) > I suggest rename method to something such as “delayed_ioapic_service_timer_handler()”. > I renamed them to delayed_ioapic_service_{timer,cb} and queued the patch. >> >> + if (((entry & IOAPIC_VECTOR_MASK) != vector) || >> + !(entry & IOAPIC_LVT_REMOTE_IRR)) { >> + continue; >> + } >> + >> + trace_ioapic_clear_remote_irr(n, vector); >> + s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR; > > remote-irr is only set for level-triggered interrupt. > Thus, I think in the “if” above you should “continue;” in case trigger-mode != IOAPIC_TRIGGER_LEVEL. > i.e. Replace condition "!(entry & IOAPIC_LVT_REMOTE_IRR)” with “trigger-mode != IOAPIC_TRIGGER_LEVEL”. > Then you can also remove the “if” below. Like this? @@ -232,19 +232,18 @@ void ioapic_eoi_broadcast(int vector) for (n = 0; n < IOAPIC_NUM_PINS; n++) { entry = s->ioredtbl[n]; - if (((entry & IOAPIC_VECTOR_MASK) != vector) || - !(entry & IOAPIC_LVT_REMOTE_IRR)) { + if ((entry & IOAPIC_VECTOR_MASK) != vector || + ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) { continue; } - trace_ioapic_clear_remote_irr(n, vector); - s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR; - - if (((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != - IOAPIC_TRIGGER_LEVEL) { + if (!(entry & IOAPIC_LVT_REMOTE_IRR)) { continue; } + trace_ioapic_clear_remote_irr(n, vector); + s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR; + if (!(entry & IOAPIC_LVT_MASKED) && (s->irr & (1 << n))) { ++s->irq_eoi[vector]; if (s->irq_eoi[vector] >= SUCCESSIVE_IRQ_MAX_COUNT) { Paolo
> On 2 Apr 2019, at 12:06, Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 02/04/19 10:23, Liran Alon wrote: >>> +#define SUCCESSIVE_IRQ_MAX_COUNT 10000 >>> + >>> +static void ioapic_timer(void *opaque) >> I suggest rename method to something such as “delayed_ioapic_service_timer_handler()”. >> > > I renamed them to delayed_ioapic_service_{timer,cb} and queued the patch. Thanks. > >>> >>> + if (((entry & IOAPIC_VECTOR_MASK) != vector) || >>> + !(entry & IOAPIC_LVT_REMOTE_IRR)) { >>> + continue; >>> + } >>> + >>> + trace_ioapic_clear_remote_irr(n, vector); >>> + s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR; >> >> remote-irr is only set for level-triggered interrupt. >> Thus, I think in the “if” above you should “continue;” in case trigger-mode != IOAPIC_TRIGGER_LEVEL. >> i.e. Replace condition "!(entry & IOAPIC_LVT_REMOTE_IRR)” with “trigger-mode != IOAPIC_TRIGGER_LEVEL”. >> Then you can also remove the “if” below. > > Like this? > > @@ -232,19 +232,18 @@ void ioapic_eoi_broadcast(int vector) > for (n = 0; n < IOAPIC_NUM_PINS; n++) { > entry = s->ioredtbl[n]; > > - if (((entry & IOAPIC_VECTOR_MASK) != vector) || > - !(entry & IOAPIC_LVT_REMOTE_IRR)) { > + if ((entry & IOAPIC_VECTOR_MASK) != vector || > + ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != IOAPIC_TRIGGER_LEVEL) { > continue; > } > > - trace_ioapic_clear_remote_irr(n, vector); > - s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR; > - > - if (((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != > - IOAPIC_TRIGGER_LEVEL) { > + if (!(entry & IOAPIC_LVT_REMOTE_IRR)) { > continue; > } I think above “if” of checking remote-irr should just be removed. But the rest seems good :) > > + trace_ioapic_clear_remote_irr(n, vector); > + s->ioredtbl[n] = entry & ~IOAPIC_LVT_REMOTE_IRR; > + > if (!(entry & IOAPIC_LVT_MASKED) && (s->irr & (1 << n))) { > ++s->irq_eoi[vector]; > if (s->irq_eoi[vector] >= SUCCESSIVE_IRQ_MAX_COUNT) { > > > Paolo
On 02/04/19 11:08, Liran Alon wrote: >> - >> - if (((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != >> - IOAPIC_TRIGGER_LEVEL) { >> + if (!(entry & IOAPIC_LVT_REMOTE_IRR)) { >> continue; >> } > I think above “if” of checking remote-irr should just be removed. > But the rest seems good :) > It seems more logical, as the condition is now the opposite of ioapic_set_irq: ioapic_set_irq services the interrupt if remote-irr = 0, EOI does it if remote-irr = 1. Paolo
> On 2 Apr 2019, at 13:20, Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 02/04/19 11:08, Liran Alon wrote: >>> - >>> - if (((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1) != >>> - IOAPIC_TRIGGER_LEVEL) { >>> + if (!(entry & IOAPIC_LVT_REMOTE_IRR)) { >>> continue; >>> } >> I think above “if” of checking remote-irr should just be removed. >> But the rest seems good :) >> > > It seems more logical, as the condition is now the opposite of > ioapic_set_irq: ioapic_set_irq services the interrupt if remote-irr = 0, > EOI does it if remote-irr = 1. > > Paolo At this point at ioapic_eoi_broadcast(), you already know you got an EOI for a level-triggered interrupt. Therefore, the remote-irr must be already set to 1. Otherwise, this is a bug. You can assert on this if you wish. (Note that remote-irr is a read-only property that cannot be overwritten by guest writing to IOAPIC redirection-table) -Liran
© 2016 - 2024 Red Hat, Inc.