Add a Kconfig to allowing building KVM without support for emulating an
I/O APIC, PIC, and PIT, which is desirable for deployments that effectively
don't support a fully in-kernel IRQ chip, i.e. never expect any VMM to
create an in-kernel I/O APIC. E.g. compiling out support eliminates a few
thousand lines of guest-facing code and gives security folks warm fuzzies.
As a bonus, wrapping relevant paths with CONFIG_KVM_IOAPIC #ifdefs makes
it much easier for readers to understand which bits and pieces exist
specific for fully in-kernel IRQ chips.
Opportunistically convert all two in-kernel uses of __KVM_HAVE_IOAPIC to
CONFIG_KVM_IOAPIC, e.g. rather than add a second #ifdef to generate a stub
for kvm_arch_post_irq_routing_update().
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/include/asm/kvm_host.h | 2 ++
arch/x86/kvm/Kconfig | 10 ++++++++++
arch/x86/kvm/Makefile | 5 +++--
arch/x86/kvm/irq.c | 6 ++++++
arch/x86/kvm/irq_comm.c | 2 ++
arch/x86/kvm/lapic.c | 7 ++++++-
arch/x86/kvm/trace.h | 2 ++
arch/x86/kvm/x86.c | 24 ++++++++++++++++++++----
include/linux/kvm_host.h | 2 +-
include/trace/events/kvm.h | 4 ++--
10 files changed, 54 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ebda93979179..f5ff5174674c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1374,9 +1374,11 @@ struct kvm_arch {
atomic_t noncoherent_dma_count;
#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
atomic_t assigned_device_count;
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
+#endif
atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
struct kvm_apic_map __rcu *apic_map;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2eeffcec5382..2c86673155c9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -166,6 +166,16 @@ config KVM_AMD_SEV
Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
Secure Nested Paging (SEV-SNP) technologies on AMD processors.
+config KVM_IOAPIC
+ bool "I/O APIC, PIC, and PIT emulation"
+ default y
+ depends on KVM
+ help
+ Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
+ for full in-kernel APIC emulation.
+
+ If unsure, say Y.
+
config KVM_SMM
bool "System Management Mode emulation"
default y
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index a5d362c7b504..92c737257789 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,13 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror
include $(srctree)/virt/kvm/Makefile.kvm
-kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
- i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
+kvm-y += x86.o emulate.o irq.o lapic.o \
+ irq_comm.o cpuid.o pmu.o mtrr.o \
debugfs.o mmu/mmu.o mmu/page_track.o \
mmu/spte.o
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o
kvm-$(CONFIG_KVM_HYPERV) += hyperv.o
kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-$(CONFIG_KVM_SMM) += smm.o
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index b9b9df00ab77..a416ccddde5f 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -75,8 +75,10 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
if (!kvm_apic_accept_pic_intr(v))
return 0;
+#ifdef CONFIG_KVM_IOAPIC
if (pic_in_kernel(v->kvm))
return v->kvm->arch.vpic->output;
+#endif
WARN_ON_ONCE(!irqchip_split(v->kvm));
return pending_userspace_extint(v);
@@ -135,8 +137,10 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v)
return v->kvm->arch.xen.upcall_vector;
#endif
+#ifdef CONFIG_KVM_IOAPIC
if (pic_in_kernel(v->kvm))
return kvm_pic_read_irq(v->kvm); /* PIC */
+#endif
WARN_ON_ONCE(!irqchip_split(v->kvm));
return get_userspace_extint(v);
@@ -170,7 +174,9 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
+#ifdef CONFIG_KVM_IOAPIC
__kvm_migrate_pit_timer(vcpu);
+#endif
kvm_x86_call(migrate_timers)(vcpu);
}
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index adef53dc4fef..a4ef150fdd1c 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -208,6 +208,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
* check kvm_arch_can_set_irq_routing() before calling this function.
*/
switch (ue->type) {
+#ifdef CONFIG_KVM_IOAPIC
case KVM_IRQ_ROUTING_IRQCHIP:
if (irqchip_split(kvm))
return -EINVAL;
@@ -231,6 +232,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
}
e->irqchip.irqchip = ue->u.irqchip.irqchip;
break;
+#endif
case KVM_IRQ_ROUTING_MSI:
e->set = kvm_set_msi;
e->msi.address_lo = ue->u.msi.address_lo;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73418dc0ebb2..4cf8c1f753d3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1455,7 +1455,7 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
- int trigger_mode;
+ int __maybe_unused trigger_mode;
/* Eoi the ioapic only if the ioapic doesn't own the vector. */
if (!kvm_ioapic_handles_vector(apic, vector))
@@ -1476,12 +1476,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
return;
}
+#ifdef CONFIG_KVM_IOAPIC
if (apic_test_vector(vector, apic->regs + APIC_TMR))
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+#endif
}
static int apic_set_eoi(struct kvm_lapic *apic)
@@ -3146,8 +3148,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+#ifdef CONFIG_KVM_IOAPIC
if (ioapic_in_kernel(vcpu->kvm))
kvm_rtc_eoi_tracking_restore_one(vcpu);
+#endif
vcpu->arch.apic_arb_prio = 0;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4ef17990574d..ababdba2c186 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -270,6 +270,7 @@ TRACE_EVENT(kvm_cpuid,
{0x6, "SIPI"}, \
{0x7, "ExtINT"}
+#ifdef CONFIG_KVM_IOAPIC
TRACE_EVENT(kvm_ioapic_set_irq,
TP_PROTO(__u64 e, int pin, bool coalesced),
TP_ARGS(e, pin, coalesced),
@@ -314,6 +315,7 @@ TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
(__entry->e & (1<<15)) ? "level" : "edge",
(__entry->e & (1<<16)) ? "|masked" : "")
);
+#endif
TRACE_EVENT(kvm_msi_set_irq,
TP_PROTO(__u64 address, __u64 data),
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9e2c249d45ca..52eff4919d95 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4630,17 +4630,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXT_CPUID:
case KVM_CAP_EXT_EMUL_CPUID:
case KVM_CAP_CLOCKSOURCE:
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CAP_PIT:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_REINJECT_CONTROL:
+#endif
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_USER_NMI:
- case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_IOEVENTFD_NO_LENGTH:
- case KVM_CAP_PIT2:
- case KVM_CAP_PIT_STATE2:
+
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
case KVM_CAP_VCPU_EVENTS:
#ifdef CONFIG_KVM_HYPERV
@@ -6393,6 +6396,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
+#ifdef CONFIG_KVM_IOAPIC
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
struct kvm_pic *pic = kvm->arch.vpic;
@@ -6521,6 +6525,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
return 0;
}
+#endif /* CONFIG_KVM_IOAPIC */
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
@@ -7064,9 +7069,11 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -ENOTTY;
+
+#ifdef CONFIG_KVM_IOAPIC
/*
* This union makes it completely explicit to gcc-3.x
- * that these two variables' stack usage should be
+ * that these three variables' stack usage should be
* combined, not added together.
*/
union {
@@ -7074,6 +7081,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm_pit_state2 ps2;
struct kvm_pit_config pit_config;
} u;
+#endif
switch (ioctl) {
case KVM_SET_TSS_ADDR:
@@ -7097,6 +7105,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
break;
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CREATE_IRQCHIP: {
mutex_lock(&kvm->lock);
@@ -7257,6 +7266,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
+#endif
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
@@ -10716,8 +10726,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+#ifdef CONFIG_KVM_IOAPIC
else if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+#endif
if (is_guest_mode(vcpu))
vcpu->arch.load_eoi_exitmap_pending = true;
@@ -12920,7 +12932,9 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
+#ifdef CONFIG_KVM_IOAPIC
kvm_free_pit(kvm);
+#endif
kvm_mmu_pre_destroy_vm(kvm);
static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
@@ -12944,8 +12958,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
}
kvm_destroy_vcpus(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+#ifdef CONFIG_KVM_IOAPIC
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
+#endif
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 44b439c5fcf4..0e151db44ecd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1019,7 +1019,7 @@ void kvm_destroy_vcpus(struct kvm *kvm);
void vcpu_load(struct kvm_vcpu *vcpu);
void vcpu_put(struct kvm_vcpu *vcpu);
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_KVM_IOAPIC
void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm);
#else
static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 96e581900c8e..1065a81ca57f 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -84,14 +84,14 @@ TRACE_EVENT(kvm_set_irq,
);
#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
-#if defined(__KVM_HAVE_IOAPIC)
+#ifdef CONFIG_KVM_IOAPIC
#define kvm_irqchips \
{KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \
{KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \
{KVM_IRQCHIP_IOAPIC, "IOAPIC"}
-#endif /* defined(__KVM_HAVE_IOAPIC) */
+#endif /* CONFIG_KVM_IOAPIC */
#if defined(CONFIG_HAVE_KVM_IRQCHIP)
--
2.49.0.1101.gccaa498523-goog
On Mon, 2025-05-19 at 16:28 -0700, Sean Christopherson wrote: > Add a Kconfig to allowing building KVM without support for emulating an ^ allow > I/O APIC, PIC, and PIT, which is desirable for deployments that effectively > don't support a fully in-kernel IRQ chip, i.e. never expect any VMM to > create an in-kernel I/O APIC. > Do you happen to know what developments don't support a full in-kernel IRQ chip? Do they only support userspace IRQ chip, or not support any IRQ chip at all?
On Thu, 2025-05-29 at 23:55 +1200, Kai Huang wrote: > On Mon, 2025-05-19 at 16:28 -0700, Sean Christopherson wrote: > > Add a Kconfig to allowing building KVM without support for emulating an > ^ > allow > > > I/O APIC, PIC, and PIT, which is desirable for deployments that effectively > > don't support a fully in-kernel IRQ chip, i.e. never expect any VMM to > > create an in-kernel I/O APIC. > > > > Do you happen to know what developments don't support a full in-kernel IRQ chip? > > Do they only support userspace IRQ chip, or not support any IRQ chip at all? Forgot to ask: Since this new Kconfig option is not only for IOAPIC but also includes PIC and PIT, is CONFIG_KVM_IRQCHIP a better name?
On Thu, May 29, 2025, Kai Huang wrote: > On Thu, 2025-05-29 at 23:55 +1200, Kai Huang wrote: > > On Mon, 2025-05-19 at 16:28 -0700, Sean Christopherson wrote: > > > Add a Kconfig to allowing building KVM without support for emulating an > > ^ > > allow > > > > > I/O APIC, PIC, and PIT, which is desirable for deployments that effectively > > > don't support a fully in-kernel IRQ chip, i.e. never expect any VMM to > > > create an in-kernel I/O APIC. > > > > Do you happen to know what developments don't support a full in-kernel IRQ chip? Google Cloud, for one. I suspect/assume many/most CSPs don't utilize an in-kernel I/O APIC. > > Do they only support userspace IRQ chip, or not support any IRQ chip at all? The former, only userspace I/O APIC (and associated devices), though some VM shapes, e.g. TDX, don't provide an I/O APIC or PIC. > Forgot to ask: > > Since this new Kconfig option is not only for IOAPIC but also includes PIC and > PIT, is CONFIG_KVM_IRQCHIP a better name? I much prefer IOAPIC, because IRQCHIP is far too ambiguous and confusing, e.g. just look at KVM's internal APIs, where these: irqchip_in_kernel() irqchip_kernel() are not equivalent. In practice, no modern guest kernel is going to utilize the PIC, and the PIT isn't an IRQ chip, i.e. isn't strictly covered by IRQCHIP either. So I think/hope the vast majority of users/readers will be able to intuit that CONFIG_KVM_IOAPIC also covers the PIC and PIT.
On Thu, 2025-05-29 at 07:31 -0700, Sean Christopherson wrote: > On Thu, May 29, 2025, Kai Huang wrote: > > On Thu, 2025-05-29 at 23:55 +1200, Kai Huang wrote: > > > On Mon, 2025-05-19 at 16:28 -0700, Sean Christopherson wrote: > > > > Add a Kconfig to allowing building KVM without support for emulating an > > > ^ > > > allow > > > > > > > I/O APIC, PIC, and PIT, which is desirable for deployments that effectively > > > > don't support a fully in-kernel IRQ chip, i.e. never expect any VMM to > > > > create an in-kernel I/O APIC. > > > > > > Do you happen to know what developments don't support a full in-kernel IRQ chip? > > Google Cloud, for one. I suspect/assume many/most CSPs don't utilize an in-kernel > I/O APIC. > > > > Do they only support userspace IRQ chip, or not support any IRQ chip at all? > > The former, only userspace I/O APIC (and associated devices), though some VM > shapes, e.g. TDX, don't provide an I/O APIC or PIC. Thanks for the info. Just wondering what's the benefit of using userspace IRQCHIP instead of emulating in the kernel? I thought one should either use in-kernel IRQCHIP or doesn't use any. > > > Forgot to ask: > > > > Since this new Kconfig option is not only for IOAPIC but also includes PIC and > > PIT, is CONFIG_KVM_IRQCHIP a better name? > > I much prefer IOAPIC, because IRQCHIP is far too ambiguous and confusing, e.g. > just look at KVM's internal APIs, where these: > > irqchip_in_kernel() > irqchip_kernel() > > are not equivalent. In practice, no modern guest kernel is going to utilize the > PIC, and the PIT isn't an IRQ chip, i.e. isn't strictly covered by IRQCHIP either. Right. Maybe it is worth to further have dedicated Kconfig for PIC, PIT and IOAPIC? But hmm, I am not sure whether emulating IOAPIC has more value than PIC. For modern guests all emulated/assigned devices should just use MSI/MSI-X? > So I think/hope the vast majority of users/readers will be able to intuit that > CONFIG_KVM_IOAPIC also covers the PIC and PIT. Sure. Btw, I also find irqchip_in_kernel() and irqchip_kernel() confusing. I am not sure the value of having irqchip_in_kernel() in fact. The guest should always have an in-kernel APIC for modern guests. I am wondering whether we can get rid of it completely (the logic will be it is always be true), or we can have a Kconfig to only build it when user truly wants it.
On Thu, May 29, 2025, Kai Huang wrote: > On Thu, 2025-05-29 at 07:31 -0700, Sean Christopherson wrote: > > On Thu, May 29, 2025, Kai Huang wrote: > > > On Thu, 2025-05-29 at 23:55 +1200, Kai Huang wrote: > > > > On Mon, 2025-05-19 at 16:28 -0700, Sean Christopherson wrote: > > > > > Add a Kconfig to allowing building KVM without support for emulating an > > > > ^ > > > > allow > > > > > > > > > I/O APIC, PIC, and PIT, which is desirable for deployments that effectively > > > > > don't support a fully in-kernel IRQ chip, i.e. never expect any VMM to > > > > > create an in-kernel I/O APIC. > > > > > > > > Do you happen to know what developments don't support a full in-kernel IRQ chip? > > > > Google Cloud, for one. I suspect/assume many/most CSPs don't utilize an in-kernel > > I/O APIC. > > > > > > Do they only support userspace IRQ chip, or not support any IRQ chip at all? > > > > The former, only userspace I/O APIC (and associated devices), though some VM > > shapes, e.g. TDX, don't provide an I/O APIC or PIC. > > Thanks for the info. > > Just wondering what's the benefit of using userspace IRQCHIP instead of > emulating in the kernel? Reduced kernel attack surface (this was especially true years ago, before KVM's I/O APIC emulation was well-tested) and more flexibility (e.g. shipping userspace changes is typically easier than shipping new kernels. I'm pretty sure there's one more big one that I'm blanking on at the moment. > I thought one should either use in-kernel IRQCHIP or doesn't use any. > > > > > > Forgot to ask: > > > > > > Since this new Kconfig option is not only for IOAPIC but also includes PIC and > > > PIT, is CONFIG_KVM_IRQCHIP a better name? > > > > I much prefer IOAPIC, because IRQCHIP is far too ambiguous and confusing, e.g. > > just look at KVM's internal APIs, where these: > > > > irqchip_in_kernel() > > irqchip_kernel() > > > > are not equivalent. In practice, no modern guest kernel is going to utilize the > > PIC, and the PIT isn't an IRQ chip, i.e. isn't strictly covered by IRQCHIP either. > > Right. > > Maybe it is worth to further have dedicated Kconfig for PIC, PIT and IOAPIC? Nah. PIC and I/O APIC can't be split (without new uAPI and non-trivial complexity), and I highly doubt there is any use case that would want an in-kernel I/O APIC with a userspace PIT. I.e. in practice, the threealmost always come as a group; either a setup wants all, or a setup wants none. > But hmm, I am not sure whether emulating IOAPIC has more value than PIC. AIUI, it's not really an either or, since most software expects both an I/O APIC and PIC. Any remotely modern kernel will definitely prefer the I/O APIC, but I don't think it's something that can be guaranteed. > For modern guests all emulated/assigned devices should just use MSI/MSI-X? Not all emulated devices, since some legacy hang off the I/O APIC, i.e. aren't capable of generating MISs. > > So I think/hope the vast majority of users/readers will be able to intuit that > > CONFIG_KVM_IOAPIC also covers the PIC and PIT. > > Sure. > > Btw, I also find irqchip_in_kernel() and irqchip_kernel() confusing. I am not > sure the value of having irqchip_in_kernel() in fact. The guest should always > have an in-kernel APIC for modern guests. I am wondering whether we can get rid > of it completely (the logic will be it is always be true), or we can have a > Kconfig to only build it when user truly wants it. For better or worse, an in-kernel local APIC is still optional. I do hope/want to make it mandatory, but that's not a small ABI change.
On 5/30/25 01:08, Sean Christopherson wrote: > On Thu, May 29, 2025, Kai Huang wrote: >> On Thu, 2025-05-29 at 07:31 -0700, Sean Christopherson wrote: >>> On Thu, May 29, 2025, Kai Huang wrote: >>>> On Thu, 2025-05-29 at 23:55 +1200, Kai Huang wrote: >>>>> Do they only support userspace IRQ chip, or not support any IRQ chip at all? >>> >>> The former, only userspace I/O APIC (and associated devices), though some VM >>> shapes, e.g. TDX, don't provide an I/O APIC or PIC. >> >> Thanks for the info. >> >> Just wondering what's the benefit of using userspace IRQCHIP instead of >> emulating in the kernel? > > Reduced kernel attack surface (this was especially true years ago, before KVM's > I/O APIC emulation was well-tested) and more flexibility (e.g. shipping userspace > changes is typically easier than shipping new kernels. I'm pretty sure there's > one more big one that I'm blanking on at the moment. Feature-wise, the big one is support for IRQ remapping which is not implemented in the in-kernel IOAPIC. >>>> Forgot to ask: >>>> >>>> Since this new Kconfig option is not only for IOAPIC but also includes PIC and >>>> PIT, is CONFIG_KVM_IRQCHIP a better name? >>> >>> I much prefer IOAPIC, because IRQCHIP is far too ambiguous and confusing, e.g. >>> just look at KVM's internal APIs, where these: >>> >>> irqchip_in_kernel() >>> irqchip_kernel() >>> >>> are not equivalent. In practice, no modern guest kernel is going to utilize the >>> PIC, and the PIT isn't an IRQ chip, i.e. isn't strictly covered by IRQCHIP either. >> >> Right. >> >> Maybe it is worth to further have dedicated Kconfig for PIC, PIT and IOAPIC? > > Nah. PIC and I/O APIC can't be split (without new uAPI and non-trivial complexity), > and I highly doubt there is any use case that would want an in-kernel I/O APIC > with a userspace PIT. I.e. in practice, the three almost always come as a group; > either a setup wants all, or a setup wants none. Without "almost", even. I think it's okay to make it CONFIG_KVM_IOAPIC, it's not super accurate but there's no single word that convey "IOAPIC, PIC and PIT". >> Btw, I also find irqchip_in_kernel() and irqchip_kernel() confusing. I am not >> sure the value of having irqchip_in_kernel() in fact. The guest should always >> have an in-kernel APIC for modern guests. I am wondering whether we can get rid >> of it completely (the logic will be it is always be true), or we can have a >> Kconfig to only build it when user truly wants it. irqchip_kernel() can be renamed to irqchip_full(). > For better or worse, an in-kernel local APIC is still optional. I do hope/want > to make it mandatory, but that's not a small ABI change. I am pretty sure that some users (was it DOSBox? or maybe even gVisor?) would break. Paolo
[Sorry for bumping an old thread] On Wed, Jun 04, 2025 at 06:54:44PM +0200, Paolo Bonzini wrote: > On 5/30/25 01:08, Sean Christopherson wrote: > > On Thu, May 29, 2025, Kai Huang wrote: > > > On Thu, 2025-05-29 at 07:31 -0700, Sean Christopherson wrote: > > > > On Thu, May 29, 2025, Kai Huang wrote: > > > > > On Thu, 2025-05-29 at 23:55 +1200, Kai Huang wrote: > > > > > > Do they only support userspace IRQ chip, or not support any IRQ chip at all? > > > > > > > > The former, only userspace I/O APIC (and associated devices), though some VM > > > > shapes, e.g. TDX, don't provide an I/O APIC or PIC. > > > > > > Thanks for the info. > > > > > > Just wondering what's the benefit of using userspace IRQCHIP instead of > > > emulating in the kernel? > > > > Reduced kernel attack surface (this was especially true years ago, before KVM's > > I/O APIC emulation was well-tested) and more flexibility (e.g. shipping userspace > > changes is typically easier than shipping new kernels. I'm pretty sure there's > > one more big one that I'm blanking on at the moment. > > Feature-wise, the big one is support for IRQ remapping which is not > implemented in the in-kernel IOAPIC. Is there a reason to prefer the in-kernel IOAPIC today, seeing as it is still the default with Qemu? Thanks, Naveen
On Thu, 2025-05-29 at 16:08 -0700, Sean Christopherson wrote: > On Thu, May 29, 2025, Kai Huang wrote: > > On Thu, 2025-05-29 at 07:31 -0700, Sean Christopherson wrote: > > > On Thu, May 29, 2025, Kai Huang wrote: > > > > On Thu, 2025-05-29 at 23:55 +1200, Kai Huang wrote: > > > > > On Mon, 2025-05-19 at 16:28 -0700, Sean Christopherson wrote: > > > > > > Add a Kconfig to allowing building KVM without support for emulating an > > > > > ^ > > > > > allow > > > > > > > > > > > I/O APIC, PIC, and PIT, which is desirable for deployments that effectively > > > > > > don't support a fully in-kernel IRQ chip, i.e. never expect any VMM to > > > > > > create an in-kernel I/O APIC. > > > > > > > > > > Do you happen to know what developments don't support a full in-kernel IRQ chip? > > > > > > Google Cloud, for one. I suspect/assume many/most CSPs don't utilize an in-kernel > > > I/O APIC. > > > > > > > > Do they only support userspace IRQ chip, or not support any IRQ chip at all? > > > > > > The former, only userspace I/O APIC (and associated devices), though some VM > > > shapes, e.g. TDX, don't provide an I/O APIC or PIC. > > > > Thanks for the info. > > > > Just wondering what's the benefit of using userspace IRQCHIP instead of > > emulating in the kernel? > > Reduced kernel attack surface (this was especially true years ago, before KVM's > I/O APIC emulation was well-tested) and more flexibility (e.g. shipping userspace > changes is typically easier than shipping new kernels. I'm pretty sure there's > one more big one that I'm blanking on at the moment. Yeah those make sense. I thought it was from functionality/performance's perspective but I was at wrong direction. > > > I thought one should either use in-kernel IRQCHIP or doesn't use any. > > > > > > > > > Forgot to ask: > > > > > > > > Since this new Kconfig option is not only for IOAPIC but also includes PIC and > > > > PIT, is CONFIG_KVM_IRQCHIP a better name? > > > > > > I much prefer IOAPIC, because IRQCHIP is far too ambiguous and confusing, e.g. > > > just look at KVM's internal APIs, where these: > > > > > > irqchip_in_kernel() > > > irqchip_kernel() > > > > > > are not equivalent. In practice, no modern guest kernel is going to utilize the > > > PIC, and the PIT isn't an IRQ chip, i.e. isn't strictly covered by IRQCHIP either. > > > > Right. > > > > Maybe it is worth to further have dedicated Kconfig for PIC, PIT and IOAPIC? > > Nah. PIC and I/O APIC can't be split (without new uAPI and non-trivial complexity), Right. I forgot this. > and I highly doubt there is any use case that would want an in-kernel I/O APIC > with a userspace PIT. I.e. in practice, the threealmost always come as a group; > either a setup wants all, or a setup wants none. OK. > > > But hmm, I am not sure whether emulating IOAPIC has more value than PIC. > > AIUI, it's not really an either or, since most software expects both an I/O APIC > and PIC. Any remotely modern kernel will definitely prefer the I/O APIC, but I > don't think it's something that can be guaranteed. OK :-) > > > For modern guests all emulated/assigned devices should just use MSI/MSI-X? > > Not all emulated devices, since some legacy hang off the I/O APIC, i.e. aren't > capable of generating MISs. Yeah. I thought in those deployments the guests should not be configured to have those devices. > > > > So I think/hope the vast majority of users/readers will be able to intuit that > > > CONFIG_KVM_IOAPIC also covers the PIC and PIT. > > > > Sure. > > > > Btw, I also find irqchip_in_kernel() and irqchip_kernel() confusing. I am not > > sure the value of having irqchip_in_kernel() in fact. The guest should always > > have an in-kernel APIC for modern guests. I am wondering whether we can get rid > > of it completely (the logic will be it is always be true), or we can have a > > Kconfig to only build it when user truly wants it. > > For better or worse, an in-kernel local APIC is still optional. I do hope/want > to make it mandatory, but that's not a small ABI change. Right. The ABI change is concern. Thanks for all the explanation!
© 2016 - 2025 Red Hat, Inc.