IRQ window inhibits can be requested by multiple vCPUs at the same time
for injecting interrupts meant for different vCPUs. However, AVIC
inhibition is VM-wide and hence it is possible for the inhibition to be
cleared prematurely by the first vCPU that obtains the IRQ window even
though a second vCPU is still waiting for its IRQ window. This is likely
not a functional issue since the other vCPU will again see that
interrupts are pending to be injected (due to KVM_REQ_EVENT), and will
again request an IRQ window inhibition. However, this can cause AVIC
to be rapidly toggled, resulting in high contention on
apicv_update_lock and degrading performance of the guest.
Address this by maintaining a VM-wide count of the number of vCPUs that
have requested an IRQ window. Set/clear the inhibit reason when the
count transitions between 0 and 1. This ensures that the inhibit reason
is not cleared as long as there are some vCPUs still waiting for an IRQ
window.
Co-developed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Co-developed-by: Naveen N Rao (AMD) <naveen@kernel.org>
Signed-off-by: Naveen N Rao (AMD) <naveen@kernel.org>
Tested-by: Naveen N Rao (AMD) <naveen@kernel.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/kvm_host.h | 19 ++++++++++++++++-
arch/x86/kvm/svm/svm.c | 36 +++++++++++++++++++++++----------
arch/x86/kvm/svm/svm.h | 1 +
arch/x86/kvm/x86.c | 19 +++++++++++++++++
4 files changed, 63 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e441f270f354..b08baeff98b2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1427,6 +1427,7 @@ struct kvm_arch {
struct kvm_pit *vpit;
#endif
atomic_t vapics_in_nmi_mode;
+
struct mutex apic_map_lock;
struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
@@ -1434,9 +1435,13 @@ struct kvm_arch {
bool apic_access_memslot_enabled;
bool apic_access_memslot_inhibited;
- /* Protects apicv_inhibit_reasons */
+ /*
+ * Protects apicv_inhibit_reasons and apicv_nr_irq_window_req (with an
+ * asterisk, see kvm_inc_or_dec_irq_window_inhibit() for details).
+ */
struct rw_semaphore apicv_update_lock;
unsigned long apicv_inhibit_reasons;
+ atomic_t apicv_nr_irq_window_req;
gpa_t wall_clock;
@@ -2309,6 +2314,18 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
}
+void kvm_inc_or_dec_irq_window_inhibit(struct kvm *kvm, bool inc);
+
+static inline void kvm_inc_apicv_irq_window_req(struct kvm *kvm)
+{
+ kvm_inc_or_dec_irq_window_inhibit(kvm, true);
+}
+
+static inline void kvm_dec_apicv_irq_window_req(struct kvm *kvm)
+{
+ kvm_inc_or_dec_irq_window_inhibit(kvm, false);
+}
+
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len);
void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 24b9c2275821..559e8fa76b7e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3729,8 +3729,11 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
* the case in which the interrupt window was requested while L1 was
* active (the vCPU was not running nested).
*/
- if (!kvm_cpu_has_injectable_intr(vcpu) || is_guest_mode(vcpu))
- kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+ if (svm->avic_irq_window &&
+ (!kvm_cpu_has_injectable_intr(vcpu) || is_guest_mode(vcpu))) {
+ svm->avic_irq_window = false;
+ kvm_dec_apicv_irq_window_req(svm->vcpu.kvm);
+ }
trace_kvm_inj_virq(intr->nr, intr->soft, reinjected);
++vcpu->stat.irq_injections;
@@ -3932,17 +3935,28 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
*/
if (vgif || gif_set(svm)) {
/*
- * IRQ window is not needed when AVIC is enabled,
- * unless we have pending ExtINT since it cannot be injected
- * via AVIC. In such case, KVM needs to temporarily disable AVIC,
- * and fallback to injecting IRQ via V_IRQ.
+ * KVM only enables IRQ windows when AVIC is enabled if there's
+ * pending ExtINT since it cannot be injected via AVIC (ExtINT
+ * bypasses the local APIC). V_IRQ is ignored by hardware when
+ * AVIC is enabled, and so KVM needs to temporarily disable
+ * AVIC in order to detect when it's ok to inject the ExtINT.
*
- * If running nested, AVIC is already locally inhibited
- * on this vCPU, therefore there is no need to request
- * the VM wide AVIC inhibition.
+ * If running nested, AVIC is already locally inhibited on this
+ * vCPU (L2 vCPUs use a different MMU that never maps the AVIC
+ * backing page), therefore there is no need to increment the
+ * VM-wide AVIC inhibit. KVM will re-evaluate events when the
+ * vCPU exits to L1 and enable an IRQ window if the ExtINT is
+ * still pending.
+ *
+ * Note, the IRQ window inhibit needs to be updated even if
+ * AVIC is inhibited for a different reason, as KVM needs to
+ * keep AVIC inhibited if the other reason is cleared and there
+ * is still an injectable interrupt pending.
*/
- if (!is_guest_mode(vcpu))
- kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+ if (enable_apicv && !svm->avic_irq_window && !is_guest_mode(vcpu)) {
+ svm->avic_irq_window = true;
+ kvm_inc_apicv_irq_window_req(vcpu->kvm);
+ }
svm_set_vintr(svm);
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index ebd7b36b1ceb..68675b25ef8e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -333,6 +333,7 @@ struct vcpu_svm {
bool guest_state_loaded;
+ bool avic_irq_window;
bool x2avic_msrs_intercepted;
bool lbr_msrs_intercepted;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8acfdfc583a1..2528dfffb42b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10994,6 +10994,25 @@ void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_or_clear_apicv_inhibit);
+void kvm_inc_or_dec_irq_window_inhibit(struct kvm *kvm, bool inc)
+{
+ int add = inc ? 1 : -1;
+
+ if (!enable_apicv)
+ return;
+
+ /*
+ * Strictly speaking, the lock is only needed if going 0->1 or 1->0,
+ * a la atomic_dec_and_mutex_lock(). However, ExtINTs are rare and
+ * only target a single CPU, so 0<->1 transitions are the common
+ * case; do not bother eliding the down_write()/up_write() pair.
+ */
+ guard(rwsem_write)(&kvm->arch.apicv_update_lock);
+ if (atomic_add_return(add, &kvm->arch.apicv_nr_irq_window_req) == inc)
+ __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_IRQWIN, inc);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inc_or_dec_irq_window_inhibit);
+
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
if (!kvm_apic_present(vcpu))
--
2.52.0.457.g6b5491de43-goog
© 2016 - 2026 Red Hat, Inc.