With posted MSIs, end of interrupt is handled by the notification
handler. Each MSI handler does not go through local APIC IRR, ISR
processing. There's no need to do apic_eoi() in those handlers.
Add a new acpi_ack_irq_no_eoi() for the posted MSI IR chip. At runtime
the call trace looks like:
__sysvec_posted_msi_notification() {
irq_chip_ack_parent() {
apic_ack_irq_no_eoi();
}
handle_irq_event() {
handle_irq_event_percpu() {
driver_handler()
}
}
IO-APIC IR is excluded the from posted MSI, we need to make sure it
still performs EOI.
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
arch/x86/include/asm/apic.h | 1 +
arch/x86/kernel/apic/io_apic.c | 2 +-
arch/x86/kernel/apic/vector.c | 5 ++++
drivers/iommu/intel/irq_remapping.c | 38 ++++++++++++++++++++++++++++-
4 files changed, 44 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 5af4ec1a0f71..a88015d5638b 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -485,6 +485,7 @@ static inline void apic_setup_apic_calls(void) { }
#endif /* CONFIG_X86_LOCAL_APIC */
extern void apic_ack_irq(struct irq_data *data);
+extern void apic_ack_irq_no_eoi(struct irq_data *data);
static inline bool lapic_vector_set_in_irr(unsigned int vector)
{
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00da6cf6b07d..ca398ee9075b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1993,7 +1993,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = {
.irq_startup = startup_ioapic_irq,
.irq_mask = mask_ioapic_irq,
.irq_unmask = unmask_ioapic_irq,
- .irq_ack = irq_chip_ack_parent,
+ .irq_ack = apic_ack_irq,
.irq_eoi = ioapic_ir_ack_level,
.irq_set_affinity = ioapic_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 14fc33cfdb37..01223ac4f57a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -911,6 +911,11 @@ void apic_ack_irq(struct irq_data *irqd)
apic_eoi();
}
+void apic_ack_irq_no_eoi(struct irq_data *irqd)
+{
+ irq_move_irq(irqd);
+}
+
void apic_ack_edge(struct irq_data *irqd)
{
irq_complete_move(irqd_cfg(irqd));
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index 29b9e55dcf26..f2870d3c8313 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1233,6 +1233,42 @@ static struct irq_chip intel_ir_chip = {
.irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
};
+/*
+ * With posted MSIs, all vectors are multiplexed into a single notification
+ * vector. Devices MSIs are then dispatched in a demux loop where
+ * EOIs can be coalesced as well.
+ *
+ * IR chip "INTEL-IR-POST" does not do EOI on ACK instead letting posted
+ * interrupt notification handler to perform EOI.
+ *
+ * For the example below, 3 MSIs are coalesced in one CPU notification. Only
+ * one apic_eoi() is needed.
+ *
+ * __sysvec_posted_msi_notification() {
+ * irq_enter()
+ * handle_edge_irq()
+ * irq_chip_ack_parent()
+ * apic_ack_irq_no_eoi()
+ * handle_irq()
+ * handle_edge_irq()
+ * irq_chip_ack_parent()
+ * apic_ack_irq_no_eoi()
+ * handle_irq()
+ * handle_edge_irq()
+ * irq_chip_ack_parent()
+ * apic_ack_irq_no_eoi()
+ * handle_irq()
+ * apic_eoi()
+ * irq_exit()
+ */
+static struct irq_chip intel_ir_chip_post_msi = {
+ .name = "INTEL-IR-POST",
+ .irq_ack = apic_ack_irq_no_eoi,
+ .irq_set_affinity = intel_ir_set_affinity,
+ .irq_compose_msi_msg = intel_ir_compose_msi_msg,
+ .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
+};
+
static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle)
{
memset(msg, 0, sizeof(*msg));
@@ -1361,7 +1397,7 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain,
irq_data->hwirq = (index << 16) + i;
irq_data->chip_data = ird;
- irq_data->chip = &intel_ir_chip;
+ irq_data->chip = posted_msi_supported() ? &intel_ir_chip_post_msi : &intel_ir_chip;
intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i);
irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
}
--
2.25.1
On Sat, Nov 11 2023 at 20:16, Jacob Pan wrote:
> static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle)
> {
> memset(msg, 0, sizeof(*msg));
> @@ -1361,7 +1397,7 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain,
>
> irq_data->hwirq = (index << 16) + i;
> irq_data->chip_data = ird;
> - irq_data->chip = &intel_ir_chip;
> + irq_data->chip = posted_msi_supported() ? &intel_ir_chip_post_msi : &intel_ir_chip;
This is just wrong because you change the chip to posted for _ALL_
domains unconditionally.
The only domains which want this chip are the PCI/MSI domains. And those
are distinct from the domains which serve IO/APIC, HPET, no?
So you can set that chip only for PCI/MSI and just let IO/APIC, HPET
domains keep the original chip, which spares any modification of the
IO/APIC domain.
Hi Thomas,
On Wed, 06 Dec 2023 21:44:02 +0100, Thomas Gleixner <tglx@linutronix.de>
wrote:
> On Sat, Nov 11 2023 at 20:16, Jacob Pan wrote:
> > static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle)
> > {
> > memset(msg, 0, sizeof(*msg));
> > @@ -1361,7 +1397,7 @@ static int intel_irq_remapping_alloc(struct
> > irq_domain *domain,
> > irq_data->hwirq = (index << 16) + i;
> > irq_data->chip_data = ird;
> > - irq_data->chip = &intel_ir_chip;
> > + irq_data->chip = posted_msi_supported() ?
> > &intel_ir_chip_post_msi : &intel_ir_chip;
>
> This is just wrong because you change the chip to posted for _ALL_
> domains unconditionally.
>
> The only domains which want this chip are the PCI/MSI domains. And those
> are distinct from the domains which serve IO/APIC, HPET, no?
>
> So you can set that chip only for PCI/MSI and just let IO/APIC, HPET
> domains keep the original chip, which spares any modification of the
> IO/APIC domain.
>
>
make sense.
- irq_data->chip = posted_msi_supported() ? &intel_ir_chip_post_msi : &intel_ir_chip;
+ if ((info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) && posted_msi_supported())
+ irq_data->chip = &intel_ir_chip_post_msi;
+ else
+ irq_data->chip = &intel_ir_chip;
Now in IRQ debugfs, I can see the correct IR chips for IOAPIC IRQs and
MSIs.
e.g
domain: IO-APIC-8
hwirq: 0x9
chip: IR-IO-APIC
flags: 0x410
IRQCHIP_SKIP_SET_WAKE
parent:
domain: INTEL-IR-9-13
hwirq: 0x80000
chip: INTEL-IR
flags: 0x0
parent:
domain: VECTOR
domain: IR-PCI-MSI-0000:3d:00.4-11
hwirq: 0x0
chip: IR-PCI-MSI-0000:3d:00.4
flags: 0x430
IRQCHIP_SKIP_SET_WAKE
IRQCHIP_ONESHOT_SAFE
parent:
domain: INTEL-IR-4-13
hwirq: 0x0
chip: INTEL-IR-POST
flags: 0x0
parent:
domain: VECTOR
Thanks,
Jacob
On Sat, Nov 11 2023 at 20:16, Jacob Pan wrote:
> With posted MSIs, end of interrupt is handled by the notification
> handler. Each MSI handler does not go through local APIC IRR, ISR
> processing. There's no need to do apic_eoi() in those handlers.
>
> Add a new acpi_ack_irq_no_eoi() for the posted MSI IR chip. At runtime
> the call trace looks like:
>
> __sysvec_posted_msi_notification() {
> irq_chip_ack_parent() {
> apic_ack_irq_no_eoi();
> }
Huch? There is something missing here to make sense.
> handle_irq_event() {
> handle_irq_event_percpu() {
> driver_handler()
> }
> }
>
> IO-APIC IR is excluded the from posted MSI, we need to make sure it
> still performs EOI.
We need to make the code correct and write changelogs which make
sense. This sentence makes no sense whatsoever.
What has the IO-APIC to do with posted MSIs?
It's a different interrupt chip hierarchy, no?
> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index 00da6cf6b07d..ca398ee9075b 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -1993,7 +1993,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = {
> .irq_startup = startup_ioapic_irq,
> .irq_mask = mask_ioapic_irq,
> .irq_unmask = unmask_ioapic_irq,
> - .irq_ack = irq_chip_ack_parent,
> + .irq_ack = apic_ack_irq,
Why?
> .irq_eoi = ioapic_ir_ack_level,
> .irq_set_affinity = ioapic_set_affinity,
> .irq_retrigger = irq_chip_retrigger_hierarchy,
> diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
> index 14fc33cfdb37..01223ac4f57a 100644
> --- a/arch/x86/kernel/apic/vector.c
> +++ b/arch/x86/kernel/apic/vector.c
> @@ -911,6 +911,11 @@ void apic_ack_irq(struct irq_data *irqd)
> apic_eoi();
> }
>
> +void apic_ack_irq_no_eoi(struct irq_data *irqd)
> +{
> + irq_move_irq(irqd);
> +}
> +
The exact purpose of that function is to invoke irq_move_irq() which is
a completely pointless exercise for interrupts which are remapped.
Hi Thomas,
On Wed, 06 Dec 2023 21:15:24 +0100, Thomas Gleixner <tglx@linutronix.de>
wrote:
> On Sat, Nov 11 2023 at 20:16, Jacob Pan wrote:
> > With posted MSIs, end of interrupt is handled by the notification
> > handler. Each MSI handler does not go through local APIC IRR, ISR
> > processing. There's no need to do apic_eoi() in those handlers.
> >
> > Add a new acpi_ack_irq_no_eoi() for the posted MSI IR chip. At runtime
> > the call trace looks like:
> >
> > __sysvec_posted_msi_notification() {
> > irq_chip_ack_parent() {
> > apic_ack_irq_no_eoi();
> > }
>
> Huch? There is something missing here to make sense.
Good point, I was too focused on eoi. The trace should be like
* __sysvec_posted_msi_notification()
* irq_enter();
* handle_edge_irq()
* irq_chip_ack_parent()
* dummy(); // No EOI
* handle_irq_event()
* driver_handler()
* irq_enter();
* handle_edge_irq()
* irq_chip_ack_parent()
* dummy(); // No EOI
* handle_irq_event()
* driver_handler()
* irq_enter();
* handle_edge_irq()
* irq_chip_ack_parent()
* dummy(); // No EOI
* handle_irq_event()
* driver_handler()
* apic_eoi()
* irq_exit()
> > handle_irq_event() {
> > handle_irq_event_percpu() {
> > driver_handler()
> > }
> > }
> >
> > IO-APIC IR is excluded the from posted MSI, we need to make sure it
> > still performs EOI.
>
> We need to make the code correct and write changelogs which make
> sense. This sentence makes no sense whatsoever.
>
> What has the IO-APIC to do with posted MSIs?
>
> It's a different interrupt chip hierarchy, no?
Right, I should not modify IOAPIC chip. Just assign posted IR chip to
device MSI/x.
> > diff --git a/arch/x86/kernel/apic/io_apic.c
> > b/arch/x86/kernel/apic/io_apic.c index 00da6cf6b07d..ca398ee9075b 100644
> > --- a/arch/x86/kernel/apic/io_apic.c
> > +++ b/arch/x86/kernel/apic/io_apic.c
> > @@ -1993,7 +1993,7 @@ static struct irq_chip ioapic_ir_chip
> > __read_mostly = { .irq_startup = startup_ioapic_irq,
> > .irq_mask = mask_ioapic_irq,
> > .irq_unmask = unmask_ioapic_irq,
> > - .irq_ack = irq_chip_ack_parent,
> > + .irq_ack = apic_ack_irq,
>
> Why?
ditto.
>
> > .irq_eoi = ioapic_ir_ack_level,
> > .irq_set_affinity = ioapic_set_affinity,
> > .irq_retrigger = irq_chip_retrigger_hierarchy,
> > diff --git a/arch/x86/kernel/apic/vector.c
> > b/arch/x86/kernel/apic/vector.c index 14fc33cfdb37..01223ac4f57a 100644
> > --- a/arch/x86/kernel/apic/vector.c
> > +++ b/arch/x86/kernel/apic/vector.c
> > @@ -911,6 +911,11 @@ void apic_ack_irq(struct irq_data *irqd)
> > apic_eoi();
> > }
> >
> > +void apic_ack_irq_no_eoi(struct irq_data *irqd)
> > +{
> > + irq_move_irq(irqd);
> > +}
> > +
>
> The exact purpose of that function is to invoke irq_move_irq() which is
> a completely pointless exercise for interrupts which are remapped.
OK, I will replace this with a dummy .irq_ack() function.
Device MSIs do not have IRQD_SETAFFINITY_PENDING set.
Thanks,
Jacob
© 2016 - 2026 Red Hat, Inc.