Do not allow to write to RTE registers using io_apic_write and instead
require changes to RTE to be performed using ioapic_write_entry.
This is in preparation for passing the full contents of the RTE to the
IOMMU interrupt remapping handlers, so remapping entries for IO-APIC
RTEs can be updated atomically when possible.
While immediately this commit might expand the number of MMIO accesses
in order to update an IO-APIC RTE, further changes will benefit from
getting the full RTE value passed to the IOMMU handlers, as the logic
is greatly simplified when the IOMMU handlers can get the complete RTE
value in one go.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Changes since v1:
- Reinstate io_apic_modify().
- Expand commit message.
---
xen/arch/x86/include/asm/io_apic.h | 8 ++---
xen/arch/x86/io_apic.c | 37 ++++++++++++------------
xen/drivers/passthrough/amd/iommu_intr.c | 6 ----
3 files changed, 22 insertions(+), 29 deletions(-)
diff --git a/xen/arch/x86/include/asm/io_apic.h b/xen/arch/x86/include/asm/io_apic.h
index 4c4777b68a51..9165da2281ae 100644
--- a/xen/arch/x86/include/asm/io_apic.h
+++ b/xen/arch/x86/include/asm/io_apic.h
@@ -162,8 +162,8 @@ static inline void __io_apic_write(unsigned int apic, unsigned int reg, unsigned
static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
{
- if ( ioapic_reg_remapped(reg) )
- return iommu_update_ire_from_apic(apic, reg, value);
+ /* RTE writes must use ioapic_write_entry. */
+ BUG_ON(reg >= 0x10);
__io_apic_write(apic, reg, value);
}
@@ -173,8 +173,8 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
*/
static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
{
- if ( ioapic_reg_remapped(reg) )
- return iommu_update_ire_from_apic(apic, reg, value);
+ /* RTE writes must use ioapic_write_entry. */
+ BUG_ON(reg >= 0x10);
*(IO_APIC_BASE(apic) + 4) = value;
}
diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
index aada2ef96c62..85b4b4c6bc98 100644
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -269,15 +269,15 @@ void __ioapic_write_entry(
{
union entry_union eu = { .entry = e };
- if ( raw )
+ if ( raw || !iommu_intremap )
{
__io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
__io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
}
else
{
- io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
- io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
+ iommu_update_ire_from_apic(apic, 0x11 + 2 * pin, eu.w2);
+ iommu_update_ire_from_apic(apic, 0x10 + 2 * pin, eu.w1);
}
}
@@ -433,16 +433,17 @@ static void modify_IO_APIC_irq(unsigned int irq, unsigned int enable,
unsigned int disable)
{
struct irq_pin_list *entry = irq_2_pin + irq;
- unsigned int pin, reg;
for (;;) {
- pin = entry->pin;
+ unsigned int pin = entry->pin;
+ struct IO_APIC_route_entry rte;
+
if (pin == -1)
break;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
- reg &= ~disable;
- reg |= enable;
- io_apic_modify(entry->apic, 0x10 + pin*2, reg);
+ rte = __ioapic_read_entry(entry->apic, pin, false);
+ rte.raw &= ~(uint64_t)disable;
+ rte.raw |= enable;
+ __ioapic_write_entry(entry->apic, pin, false, rte);
if (!entry->next)
break;
entry = irq_2_pin + entry->next;
@@ -584,16 +585,16 @@ set_ioapic_affinity_irq(struct irq_desc *desc, const cpumask_t *mask)
dest = SET_APIC_LOGICAL_ID(dest);
entry = irq_2_pin + irq;
for (;;) {
- unsigned int data;
+ struct IO_APIC_route_entry rte;
+
pin = entry->pin;
if (pin == -1)
break;
- io_apic_write(entry->apic, 0x10 + 1 + pin*2, dest);
- data = io_apic_read(entry->apic, 0x10 + pin*2);
- data &= ~IO_APIC_REDIR_VECTOR_MASK;
- data |= MASK_INSR(desc->arch.vector, IO_APIC_REDIR_VECTOR_MASK);
- io_apic_modify(entry->apic, 0x10 + pin*2, data);
+ rte = __ioapic_read_entry(entry->apic, pin, false);
+ rte.dest.dest32 = dest;
+ rte.vector = desc->arch.vector;
+ __ioapic_write_entry(entry->apic, pin, false, rte);
if (!entry->next)
break;
@@ -2127,10 +2128,8 @@ void ioapic_resume(void)
reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
__io_apic_write(apic, 0, reg_00.raw);
}
- for (i = 0; i < nr_ioapic_entries[apic]; i++, entry++) {
- __io_apic_write(apic, 0x11+2*i, *(((int *)entry)+1));
- __io_apic_write(apic, 0x10+2*i, *(((int *)entry)+0));
- }
+ for (i = 0; i < nr_ioapic_entries[apic]; i++, entry++)
+ __ioapic_write_entry(apic, i, true, *entry);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
}
diff --git a/xen/drivers/passthrough/amd/iommu_intr.c b/xen/drivers/passthrough/amd/iommu_intr.c
index 9e6be3be3515..f32c418a7e49 100644
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -361,12 +361,6 @@ void cf_check amd_iommu_ioapic_update_ire(
struct amd_iommu *iommu;
unsigned int idx;
- if ( !iommu_intremap )
- {
- __io_apic_write(apic, reg, value);
- return;
- }
-
idx = ioapic_id_to_index(IO_APIC_ID(apic));
if ( idx == MAX_IO_APICS )
return;
--
2.41.0
On 18.07.2023 14:43, Roger Pau Monne wrote:
> --- a/xen/arch/x86/io_apic.c
> +++ b/xen/arch/x86/io_apic.c
> @@ -269,15 +269,15 @@ void __ioapic_write_entry(
> {
> union entry_union eu = { .entry = e };
>
> - if ( raw )
> + if ( raw || !iommu_intremap )
> {
> __io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
> __io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
> }
> else
> {
> - io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
> - io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
> + iommu_update_ire_from_apic(apic, 0x11 + 2 * pin, eu.w2);
> + iommu_update_ire_from_apic(apic, 0x10 + 2 * pin, eu.w1);
> }
> }
I think __ioapic_read_entry() wants updating similarly, so that both
remain consistent.
> @@ -433,16 +433,17 @@ static void modify_IO_APIC_irq(unsigned int irq, unsigned int enable,
> unsigned int disable)
> {
> struct irq_pin_list *entry = irq_2_pin + irq;
> - unsigned int pin, reg;
>
> for (;;) {
> - pin = entry->pin;
> + unsigned int pin = entry->pin;
> + struct IO_APIC_route_entry rte;
> +
> if (pin == -1)
> break;
> - reg = io_apic_read(entry->apic, 0x10 + pin*2);
> - reg &= ~disable;
> - reg |= enable;
> - io_apic_modify(entry->apic, 0x10 + pin*2, reg);
> + rte = __ioapic_read_entry(entry->apic, pin, false);
> + rte.raw &= ~(uint64_t)disable;
> + rte.raw |= enable;
> + __ioapic_write_entry(entry->apic, pin, false, rte);
While this isn't going to happen overly often, ...
> @@ -584,16 +585,16 @@ set_ioapic_affinity_irq(struct irq_desc *desc, const cpumask_t *mask)
> dest = SET_APIC_LOGICAL_ID(dest);
> entry = irq_2_pin + irq;
> for (;;) {
> - unsigned int data;
> + struct IO_APIC_route_entry rte;
> +
> pin = entry->pin;
> if (pin == -1)
> break;
>
> - io_apic_write(entry->apic, 0x10 + 1 + pin*2, dest);
> - data = io_apic_read(entry->apic, 0x10 + pin*2);
> - data &= ~IO_APIC_REDIR_VECTOR_MASK;
> - data |= MASK_INSR(desc->arch.vector, IO_APIC_REDIR_VECTOR_MASK);
> - io_apic_modify(entry->apic, 0x10 + pin*2, data);
> + rte = __ioapic_read_entry(entry->apic, pin, false);
> + rte.dest.dest32 = dest;
> + rte.vector = desc->arch.vector;
> + __ioapic_write_entry(entry->apic, pin, false, rte);
... this makes me wonder whether there shouldn't be an
__ioapic_modify_entry() capable of suppressing one of the two
writes (but still being handed the full RTE).
Jan
On Tue, Jul 18, 2023 at 05:40:18PM +0200, Jan Beulich wrote:
> On 18.07.2023 14:43, Roger Pau Monne wrote:
> > --- a/xen/arch/x86/io_apic.c
> > +++ b/xen/arch/x86/io_apic.c
> > @@ -584,16 +585,16 @@ set_ioapic_affinity_irq(struct irq_desc *desc, const cpumask_t *mask)
> > dest = SET_APIC_LOGICAL_ID(dest);
> > entry = irq_2_pin + irq;
> > for (;;) {
> > - unsigned int data;
> > + struct IO_APIC_route_entry rte;
> > +
> > pin = entry->pin;
> > if (pin == -1)
> > break;
> >
> > - io_apic_write(entry->apic, 0x10 + 1 + pin*2, dest);
> > - data = io_apic_read(entry->apic, 0x10 + pin*2);
> > - data &= ~IO_APIC_REDIR_VECTOR_MASK;
> > - data |= MASK_INSR(desc->arch.vector, IO_APIC_REDIR_VECTOR_MASK);
> > - io_apic_modify(entry->apic, 0x10 + pin*2, data);
> > + rte = __ioapic_read_entry(entry->apic, pin, false);
> > + rte.dest.dest32 = dest;
> > + rte.vector = desc->arch.vector;
> > + __ioapic_write_entry(entry->apic, pin, false, rte);
>
> ... this makes me wonder whether there shouldn't be an
> __ioapic_modify_entry() capable of suppressing one of the two
> writes (but still being handed the full RTE).
I've wondered about this, and I'm not sure how often can one of the
two writes be suppressed here, as we modify both the low (for the
vector) and the high part of the RTE (for the destination). It's
unlikely that the same vector could be used on both destinations, and
IMO such case doesn't warrant the introduction of the extra logic
required in order to suppress one of the writes.
Am I overlooking something?
Thanks, Roger.
On 25.07.2023 15:30, Roger Pau Monné wrote:
> On Tue, Jul 18, 2023 at 05:40:18PM +0200, Jan Beulich wrote:
>> On 18.07.2023 14:43, Roger Pau Monne wrote:
>>> --- a/xen/arch/x86/io_apic.c
>>> +++ b/xen/arch/x86/io_apic.c
>>> @@ -584,16 +585,16 @@ set_ioapic_affinity_irq(struct irq_desc *desc, const cpumask_t *mask)
>>> dest = SET_APIC_LOGICAL_ID(dest);
>>> entry = irq_2_pin + irq;
>>> for (;;) {
>>> - unsigned int data;
>>> + struct IO_APIC_route_entry rte;
>>> +
>>> pin = entry->pin;
>>> if (pin == -1)
>>> break;
>>>
>>> - io_apic_write(entry->apic, 0x10 + 1 + pin*2, dest);
>>> - data = io_apic_read(entry->apic, 0x10 + pin*2);
>>> - data &= ~IO_APIC_REDIR_VECTOR_MASK;
>>> - data |= MASK_INSR(desc->arch.vector, IO_APIC_REDIR_VECTOR_MASK);
>>> - io_apic_modify(entry->apic, 0x10 + pin*2, data);
>>> + rte = __ioapic_read_entry(entry->apic, pin, false);
>>> + rte.dest.dest32 = dest;
>>> + rte.vector = desc->arch.vector;
>>> + __ioapic_write_entry(entry->apic, pin, false, rte);
>>
>> ... this makes me wonder whether there shouldn't be an
>> __ioapic_modify_entry() capable of suppressing one of the two
>> writes (but still being handed the full RTE).
>
> I've wondered about this, and I'm not sure how often can one of the
> two writes be suppressed here, as we modify both the low (for the
> vector) and the high part of the RTE (for the destination). It's
> unlikely that the same vector could be used on both destinations, and
> IMO such case doesn't warrant the introduction of the extra logic
> required in order to suppress one of the writes.
>
> Am I overlooking something?
Oh, no, that was me seeing the io_apic_modify() without paying attention
to the earlier io_apic_write() (both in the code you replace).
Jan
On Tue, Jul 18, 2023 at 05:40:18PM +0200, Jan Beulich wrote:
> On 18.07.2023 14:43, Roger Pau Monne wrote:
> > --- a/xen/arch/x86/io_apic.c
> > +++ b/xen/arch/x86/io_apic.c
> > @@ -269,15 +269,15 @@ void __ioapic_write_entry(
> > {
> > union entry_union eu = { .entry = e };
> >
> > - if ( raw )
> > + if ( raw || !iommu_intremap )
> > {
> > __io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
> > __io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
> > }
> > else
> > {
> > - io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
> > - io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
> > + iommu_update_ire_from_apic(apic, 0x11 + 2 * pin, eu.w2);
> > + iommu_update_ire_from_apic(apic, 0x10 + 2 * pin, eu.w1);
> > }
> > }
>
> I think __ioapic_read_entry() wants updating similarly, so that both
> remain consistent.
My plan was to deal with __ioapic_read_entry() separately, as I would
also like to convert iommu_read_apic_from_ire() to get passed a pin
instead of a register, but I could make your requested adjustment here
for consistency with __ioapic_write_entry().
> > @@ -433,16 +433,17 @@ static void modify_IO_APIC_irq(unsigned int irq, unsigned int enable,
> > unsigned int disable)
> > {
> > struct irq_pin_list *entry = irq_2_pin + irq;
> > - unsigned int pin, reg;
> >
> > for (;;) {
> > - pin = entry->pin;
> > + unsigned int pin = entry->pin;
> > + struct IO_APIC_route_entry rte;
> > +
> > if (pin == -1)
> > break;
> > - reg = io_apic_read(entry->apic, 0x10 + pin*2);
> > - reg &= ~disable;
> > - reg |= enable;
> > - io_apic_modify(entry->apic, 0x10 + pin*2, reg);
> > + rte = __ioapic_read_entry(entry->apic, pin, false);
> > + rte.raw &= ~(uint64_t)disable;
> > + rte.raw |= enable;
> > + __ioapic_write_entry(entry->apic, pin, false, rte);
>
> While this isn't going to happen overly often, ...
>
> > @@ -584,16 +585,16 @@ set_ioapic_affinity_irq(struct irq_desc *desc, const cpumask_t *mask)
> > dest = SET_APIC_LOGICAL_ID(dest);
> > entry = irq_2_pin + irq;
> > for (;;) {
> > - unsigned int data;
> > + struct IO_APIC_route_entry rte;
> > +
> > pin = entry->pin;
> > if (pin == -1)
> > break;
> >
> > - io_apic_write(entry->apic, 0x10 + 1 + pin*2, dest);
> > - data = io_apic_read(entry->apic, 0x10 + pin*2);
> > - data &= ~IO_APIC_REDIR_VECTOR_MASK;
> > - data |= MASK_INSR(desc->arch.vector, IO_APIC_REDIR_VECTOR_MASK);
> > - io_apic_modify(entry->apic, 0x10 + pin*2, data);
> > + rte = __ioapic_read_entry(entry->apic, pin, false);
> > + rte.dest.dest32 = dest;
> > + rte.vector = desc->arch.vector;
> > + __ioapic_write_entry(entry->apic, pin, false, rte);
>
> ... this makes me wonder whether there shouldn't be an
> __ioapic_modify_entry() capable of suppressing one of the two
> writes (but still being handed the full RTE).
We would then need the original RTE to be passed to
__ioapic_modify_entry() in order for it to decide whether one of the
accesses can be eliminated (as I don't think we want to read the RTE
to check for differences, as we would then perform even more
accesses).
This would only be relevant for systems without IOMMU IRTEs, as
otherwise the IO-APIC RTE gets modified by the IOMMU handlers.
Thanks, Roger.
On 19.07.2023 17:20, Roger Pau Monné wrote:
> On Tue, Jul 18, 2023 at 05:40:18PM +0200, Jan Beulich wrote:
>> On 18.07.2023 14:43, Roger Pau Monne wrote:
>>> --- a/xen/arch/x86/io_apic.c
>>> +++ b/xen/arch/x86/io_apic.c
>>> @@ -269,15 +269,15 @@ void __ioapic_write_entry(
>>> {
>>> union entry_union eu = { .entry = e };
>>>
>>> - if ( raw )
>>> + if ( raw || !iommu_intremap )
>>> {
>>> __io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
>>> __io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
>>> }
>>> else
>>> {
>>> - io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
>>> - io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
>>> + iommu_update_ire_from_apic(apic, 0x11 + 2 * pin, eu.w2);
>>> + iommu_update_ire_from_apic(apic, 0x10 + 2 * pin, eu.w1);
>>> }
>>> }
>>
>> I think __ioapic_read_entry() wants updating similarly, so that both
>> remain consistent.
>
> My plan was to deal with __ioapic_read_entry() separately, as I would
> also like to convert iommu_read_apic_from_ire() to get passed a pin
> instead of a register, but I could make your requested adjustment here
> for consistency with __ioapic_write_entry().
I would indeed prefer if you did, to avoid the functions going out of
sync.
>>> @@ -433,16 +433,17 @@ static void modify_IO_APIC_irq(unsigned int irq, unsigned int enable,
>>> unsigned int disable)
>>> {
>>> struct irq_pin_list *entry = irq_2_pin + irq;
>>> - unsigned int pin, reg;
>>>
>>> for (;;) {
>>> - pin = entry->pin;
>>> + unsigned int pin = entry->pin;
>>> + struct IO_APIC_route_entry rte;
>>> +
>>> if (pin == -1)
>>> break;
>>> - reg = io_apic_read(entry->apic, 0x10 + pin*2);
>>> - reg &= ~disable;
>>> - reg |= enable;
>>> - io_apic_modify(entry->apic, 0x10 + pin*2, reg);
>>> + rte = __ioapic_read_entry(entry->apic, pin, false);
>>> + rte.raw &= ~(uint64_t)disable;
>>> + rte.raw |= enable;
>>> + __ioapic_write_entry(entry->apic, pin, false, rte);
>>
>> While this isn't going to happen overly often, ...
>>
>>> @@ -584,16 +585,16 @@ set_ioapic_affinity_irq(struct irq_desc *desc, const cpumask_t *mask)
>>> dest = SET_APIC_LOGICAL_ID(dest);
>>> entry = irq_2_pin + irq;
>>> for (;;) {
>>> - unsigned int data;
>>> + struct IO_APIC_route_entry rte;
>>> +
>>> pin = entry->pin;
>>> if (pin == -1)
>>> break;
>>>
>>> - io_apic_write(entry->apic, 0x10 + 1 + pin*2, dest);
>>> - data = io_apic_read(entry->apic, 0x10 + pin*2);
>>> - data &= ~IO_APIC_REDIR_VECTOR_MASK;
>>> - data |= MASK_INSR(desc->arch.vector, IO_APIC_REDIR_VECTOR_MASK);
>>> - io_apic_modify(entry->apic, 0x10 + pin*2, data);
>>> + rte = __ioapic_read_entry(entry->apic, pin, false);
>>> + rte.dest.dest32 = dest;
>>> + rte.vector = desc->arch.vector;
>>> + __ioapic_write_entry(entry->apic, pin, false, rte);
>>
>> ... this makes me wonder whether there shouldn't be an
>> __ioapic_modify_entry() capable of suppressing one of the two
>> writes (but still being handed the full RTE).
>
> We would then need the original RTE to be passed to
> __ioapic_modify_entry() in order for it to decide whether one of the
> accesses can be eliminated (as I don't think we want to read the RTE
> to check for differences, as we would then perform even more
> accesses).
I was actually thinking of a 2-bit hint that the caller would pass:
Low half unmodified and high half unmodified.
> This would only be relevant for systems without IOMMU IRTEs, as
> otherwise the IO-APIC RTE gets modified by the IOMMU handlers.
Initially yes. I think there's room there to avoid one half of the
write as well.
Jan
© 2016 - 2026 Red Hat, Inc.