[PATCH 0/4] amd_iommu: Bugfix and Enhancement for IO Page Fault

Dongli Zhang posted 4 patches an hour ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20260609204403.323149-1-dongli.zhang@oracle.com
Maintainers: "Michael S. Tsirkin" <mst@redhat.com>, "Dr. David Alan Gilbert" <dave@treblig.org>
hw/i386/amd_iommu.c | 74 +++++++++++++++++++++++++++++++++++++-----------
hw/i386/amd_iommu.h |  4 +++
2 files changed, 62 insertions(+), 16 deletions(-)
[PATCH 0/4] amd_iommu: Bugfix and Enhancement for IO Page Fault
Posted by Dongli Zhang an hour ago
Recently, I tried using QEMU IOMMU emulation to study PCI device IOMMU page
faults.

It works well with intel-iommu.

[   28.819456] DMAR: DRHD: handling fault status reg 2
[   28.820403] DMAR: [DMA Write NO_PASID] Request device [01:00.0] fault addr 0xc0000000 [fault reason 0x05] PTE Write access is not set
[   28.822325] DMAR: Dump dmar0 table entries for IOVA 0xc0000000
[   28.823289] DMAR: root entry: 0x00000001014ef001
[   28.823291] DMAR: context entry: hi 0x0000000000000602, low 0x00000001014ee001
[   28.825266] DMAR: pte level: 4, pte value: 0x000000010208f403
[   28.826211] DMAR: pte level: 3, pte value: 0x00000001020c8403
[   28.827158] DMAR: pte level: 2, pte value: 0x0000000000000000
[   28.828109] DMAR: page table not present at level 1

However, there are some issues with amd-iommu. This patchset aims to address
these issues through bugfixes and enhancements.

- Fix duplicate MSI capabilities for the IOMMU PCI device.
- Fix an invalid PCI address reported in I/O page fault events.
- Fix excessive GA log messages causing endless guest dmesg output.
- Enable I/O page fault delivery when xtsup=on.


The QEMU test patch I used to inject IOMMU IO page faults has been appended to
the end of this email. To execute "info version" HMP command will be able to
trigger an IO page fault.

The following is the QEMU command line. I have tested these changes with both
7.0 and v6.4 guest kernels. I have also tested with both xtsup=off and
xtsup=on.

qemu-system-x86_64 \
-machine q35,accel=kvm,kernel-irqchip=split \
-smp 4 -m 8G \
-cpu host \
-hda ol94.qcow2 \
-device pcie-root-port,io-reserve=0,pref64-reserve=32M,bus=pcie.0,chassis=40,id=pcie-root-port.1,addr=05.00,multifunction=on \
-device virtio-net-pci,netdev=tapnet,bus=pcie-root-port.1,mq=true,vectors=9,mac=52:54:11:22:14:47,disable-legacy=on,iommu_platform=true,packed=true \
-netdev tap,id=tapnet,script=qemu-ifup,downscript=no,queues=4,vhost=off \
-monitor stdio -vnc :18 \
-device amd-iommu,dma-remap=true,xtsup=off \
-kernel mainline-linux/arch/x86_64/boot/bzImage \
-append "root=/dev/sda1 init=/sbin/init text loglevel=7 console=ttyS0 clocksource=tsc iommu=force iommu.passthrough=0 amd_iommu=force_isolation iommu.strict=1"


Dongli Zhang(4):
  amd_iommu: Do not create duplicate MSI capability
  amd_iommu: Use full BDF when reporting page faults
  amd_iommu: Do not latch unsupported GA log status bits
  amd_iommu: Use INTCAPXT for IOMMU event interrupts

 hw/i386/amd_iommu.c | 74 +++++++++++++++++++++++++++++++++++++-----------
 hw/i386/amd_iommu.h |  4 +++
 2 files changed, 62 insertions(+), 16 deletions(-)


Here is the QEMU test patch to inject IO page fault.

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 63e2faee99..9de0f302d7 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1877,6 +1877,8 @@ err_undo_map:
     goto done;
 }

+bool testsys_debug = false;
+
 static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz)
 {
     unsigned int i, max;
@@ -1951,6 +1953,13 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz)
     do {
         bool map_ok;

+        if (testsys_debug && (desc.addr & 0xff000000) == 0xff000000) {
+            testsys_debug = false;
+            printf("fault: virtqueue_packed_pop() desc.addr=0x%lx\n", desc.addr);
+            desc.addr = 0xc0000000;
+            desc.len = 1;
+        }
+
         if (desc.flags & VRING_DESC_F_WRITE) {
             map_ok = virtqueue_map_desc(vdev, &in_num, addr + out_num,
                                         iov + out_num,
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 443b8c785d..c42004d39b 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -114,6 +114,8 @@ void hmp_info_name(Monitor *mon, const QDict *qdict)
     qapi_free_NameInfo(info);
 }

+extern bool testsys_debug;
+
 void hmp_info_version(Monitor *mon, const QDict *qdict)
 {
     VersionInfo *info;
@@ -125,6 +127,9 @@ void hmp_info_version(Monitor *mon, const QDict *qdict)
                    info->package);

     qapi_free_VersionInfo(info);
+
+    printf("fault: enable one-time debug\n");
+    testsys_debug = true;
 }

 void hmp_quit(Monitor *mon, const QDict *qdict)


Thank you very much!

Dongli Zhang
[PATCH 1/4] amd_iommu: Do not create duplicate MSI capability
Posted by Dongli Zhang an hour ago
amdvi_pci_realize() manually adds an MSI capability before calling
msi_init(). msi_init() creates the MSI capability itself, so the device
can expose duplicate MSI capabilities to the guest.

Drop the manual MSI capability and let msi_init() create the single MSI
capability used by QEMU.

Otherwise, there are duplicate MSI capabilities in guest VM lspci.

[root@vm ~]# lspci -vvv -s 00:02.0
00:02.0 IOMMU: Advanced Micro Devices, Inc. [AMD] Family 15h (Models 10h-1fh) I/O Memory Management Unit
	Subsystem: Red Hat, Inc. Device 1100
	Control: I/O+ Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx+
	Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
	Interrupt: pin ? routed to IRQ 24
	Capabilities: [60] MSI: Enable+ Count=1/1 Maskable- 64bit+
		Address: 00000000fee02000  Data: 0021
	Capabilities: [5c] HyperTransport: Slave or Primary Interface
		Command: BaseUnitID=0 UnitCnt=0 MastHost- DefDir-
		Link Control 0: CFlE- CST+ CFE- <LkFail- Init- EOC- TXO- <CRCErr=c
		Link Config 0: MLWI=16bit MLWO=8bit LWI=8bit LWO=8bit
		Link Control 1: CFlE- CST- CFE- <LkFail- Init- EOC- TXO- <CRCErr=0
		Link Config 1: MLWI=8bit MLWO=[6] LWI=[6] LWO=N/C
		Revision ID: 0.00
	Capabilities: [58] MSI: Enable- Count=1/1 Maskable- 64bit-
		Address: 00005808  Data: 5c05
	Capabilities: [40] Secure device <?>

Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
---
 hw/i386/amd_iommu.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 789e09d6f2..997d02eab5 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -2422,11 +2422,6 @@ static void amdvi_pci_realize(PCIDevice *pdev, Error **errp)
     }
     s->capab_offset = ret;
 
-    ret = pci_add_capability(pdev, PCI_CAP_ID_MSI, 0,
-                             AMDVI_CAPAB_REG_SIZE, errp);
-    if (ret < 0) {
-        return;
-    }
     ret = pci_add_capability(pdev, PCI_CAP_ID_HT, 0,
                              AMDVI_CAPAB_REG_SIZE, errp);
     if (ret < 0) {
-- 
2.43.5
[PATCH 2/4] amd_iommu: Use full BDF when reporting page faults
Posted by Dongli Zhang an hour ago
AMDVIAddressSpace->devfn only contains the PCI device/function number.
AMD-Vi event log entries require the full DeviceID, including the PCI
bus number.

When reporting IO page faults for devices behind a pcie-root-port, QEMU
logs the fault against the wrong requester. For example, a fault from
01:00.0 is reported as 00:00.0 because only devfn is passed to
amdvi_page_fault().

[root@vm ~]# lspci
... ...
01:00.0 Ethernet controller: Red Hat, Inc. Virtio 1.0 network device (rev 01)

[   58.154050] pci 0000:00:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0xc0000000 flags=0x000a]
[   58.156968] pci 0000:00:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0xc0000000 flags=0x000a]

Use the full BDF consistently for AMD-Vi address spaces and pass it to
page fault and page-table access error reporting.

Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
---
 hw/i386/amd_iommu.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 997d02eab5..ca9d79bb51 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -87,6 +87,11 @@ typedef struct AMDVIIOTLBEntry {
     uint64_t page_mask;         /* physical page size  */
 } AMDVIIOTLBEntry;
 
+static uint16_t amdvi_as_devid(AMDVIAddressSpace *as)
+{
+    return PCI_BUILD_BDF(pci_bus_num(as->bus), as->devfn);
+}
+
 /*
  * These 'fault' reasons have an overloaded meaning since they are not only
  * intended for describing reasons that generate an IO_PAGE_FAULT as per the AMD
@@ -613,7 +618,7 @@ static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
 
 static int amdvi_as_to_dte(AMDVIAddressSpace *as, uint64_t *dte)
 {
-    uint16_t devid = PCI_BUILD_BDF(pci_bus_num(as->bus), as->devfn);
+    uint16_t devid = amdvi_as_devid(as);
     AMDVIState *s = as->iommu_state;
 
     if (!amdvi_get_dte(s, devid, dte)) {
@@ -664,6 +669,7 @@ static uint64_t fetch_pte(AMDVIAddressSpace *as, hwaddr address, uint64_t dte,
 {
     IOMMUAccessFlags perms = amdvi_get_perms(dte);
 
+    uint16_t devid = amdvi_as_devid(as);
     uint8_t level, mode;
     uint64_t pte_addr;
 
@@ -719,7 +725,7 @@ static uint64_t fetch_pte(AMDVIAddressSpace *as, hwaddr address, uint64_t dte,
          * and walk down to the lower level.
          */
         pte_addr = NEXT_PTE_ADDR(*pte, level, address);
-        *pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn);
+        *pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, devid);
 
         if (*pte == (uint64_t)-1) {
             /*
@@ -1763,6 +1769,7 @@ static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
     uint8_t mode;
     uint64_t pte;
     int fetch_ret;
+    uint16_t devid = amdvi_as_devid(as);
 
     /* make sure the DTE has TV = 1 */
     if (!(dte[0] & AMDVI_DEV_TRANSLATION_VALID)) {
@@ -1796,7 +1803,7 @@ static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
     if (fetch_ret < 0 || !IOMMU_PTE_PRESENT(pte) ||
         perms != (perms & amdvi_get_perms(pte))) {
 
-        amdvi_page_fault(as->iommu_state, as->devfn, addr, perms);
+        amdvi_page_fault(as->iommu_state, devid, addr, perms);
         trace_amdvi_page_fault(addr);
         return;
     }
@@ -1823,7 +1830,7 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr,
                                bool is_write, IOMMUTLBEntry *ret)
 {
     AMDVIState *s = as->iommu_state;
-    uint16_t devid = PCI_BUILD_BDF(pci_bus_num(as->bus), as->devfn);
+    uint16_t devid = amdvi_as_devid(as);
     AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid);
     uint64_t entry[4];
     int dte_ret;
-- 
2.43.5
[PATCH 3/4] amd_iommu: Do not latch unsupported GA log status bits
Posted by Dongli Zhang an hour ago
When QEMU records an AMD-Vi event-log entry for an IO page fault, it
sets the event interrupt status bit and injects the event interrupt.

Linux kernel prior to commit 2379f3485239 ("iommu/amd: Refactor IOMMU
interrupt handling logic for Event, PPR, and GA logs") clears the status
register with a combined mask AMD_IOMMU_INT_MASK when handling any IOMMU
interrupt. That mask includes the
GA log interrupt (MMIO_STATUS_GALOG_INT_MASK) and
overflow (MMIO_STATUS_GALOG_OVERFLOW_MASK) bits, even when the interrupt
being handled is only an event-log interrupt.

QEMU does not produce GA log entries or set GA log status. However, the
emulated status register still accepts guest writes to the GA log status
bits as normal writes. As a result, a guest status clear write that
includes those bits can latch a spurious GA log overflow. The guest then
observes GA log status that QEMU did not generate and repeatedly
restarts the GA log.

The below guest dmesg logs are from v6.4 Linux kernel.

[   57.818224] virtio-pci 0000:01:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0xc0000000 flags=0x0009]
[   57.821130] virtio-pci 0000:01:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0xc0000000 flags=0x0009]
[   57.824003] AMD-Vi: IOMMU GA Log overflow
[   57.825099] AMD-Vi: IOMMU GA Log restarting
[   57.826338] AMD-Vi: IOMMU GA Log overflow
[   57.827425] AMD-Vi: IOMMU GA Log restarting
[   57.828657] AMD-Vi: IOMMU GA Log overflow
[   57.829758] AMD-Vi: IOMMU GA Log restarting
[   57.830986] AMD-Vi: IOMMU GA Log overflow
[   57.832063] AMD-Vi: IOMMU GA Log restarting

Mainline Linux commit 2379f3485239 ("iommu/amd: Refactor IOMMU interrupt
handling logic for Event, PPR, and GA logs") avoids triggering this by
chance by clearing only the status bits for the log being handled. However,
older guests still use the combined clear mask.

Add the GA log interrupt and overflow bits to the AMDVI_MMIO_STATUS
w1cmask so guest writes cannot create unsupported GA log status. This
does not implement GA log production. It only prevents guest status clear
writes from creating GA log status that QEMU did not generate to
emulated IO page fault for guest Linux kernel.

Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
---
 hw/i386/amd_iommu.c | 4 +++-
 hw/i386/amd_iommu.h | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index ca9d79bb51..996c154a3f 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -2414,7 +2414,9 @@ static void amdvi_init(AMDVIState *s)
     amdvi_set_quad(s, AMDVI_MMIO_EXT_FEATURES,
                    amdvi_extended_feature_register(s),
                    0xffffffffffffffef, 0);
-    amdvi_set_quad(s, AMDVI_MMIO_STATUS, 0, 0x98, 0x67);
+    amdvi_set_quad(s, AMDVI_MMIO_STATUS, 0, 0x98,
+                   0x67 | AMDVI_MMIO_STATUS_GALOG_OVF |
+                   AMDVI_MMIO_STATUS_GALOG_INT);
 }
 
 static void amdvi_pci_realize(PCIDevice *pdev, Error **errp)
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index 302ccca512..0b2e8d207b 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -113,6 +113,8 @@
 #define AMDVI_MMIO_STATUS_COMP_INT    (1 << 2)
 #define AMDVI_MMIO_STATUS_EVENT_INT   (1 << 1)
 #define AMDVI_MMIO_STATUS_EVT_OVF     (1 << 0)
+#define AMDVI_MMIO_STATUS_GALOG_OVF   (1 << 9)
+#define AMDVI_MMIO_STATUS_GALOG_INT   (1 << 10)
 
 #define AMDVI_CMDBUF_ID_BYTE              0x07
 #define AMDVI_CMDBUF_ID_RSHIFT            4
-- 
2.43.5
[PATCH 4/4] amd_iommu: Use INTCAPXT for IOMMU event interrupts
Posted by Dongli Zhang an hour ago
When xtsup=on is exposed, Linux switches the AMD IOMMU event interrupt
setup to x2APIC mode and programs the INTCAPXT event register instead
of relying on the PCI MSI capability.

QEMU currently neither records writes to the INTCAPXT event register nor
uses the programmed event vector when raising an AMD IOMMU event interrupt.
As a result, DMA faults can be logged into the event log without
delivering the AMD-Vi event interrupt to the guest when xtsup=on.

Add the INTCAPXT event MMIO offset and accept guest writes to it. When
IntCapXT is enabled, build the AMD-Vi event interrupt MSI message from
the programmed INTCAPXT event register; otherwise keep the existing PCI
MSI fallback.

This patch intentionally only supports the INTCAPXT event interrupt path.
It does not add PPR or GA-log interrupt support.

Keep command completion wait interrupts on the existing PCI MSI path by
using a separate event interrupt helper for INTCAPXT delivery.

Send INTCAPXT event interrupts through the APIC MSI delivery hook so
KVM can handle x2APIC destination IDs above 255.

Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
---
 hw/i386/amd_iommu.c | 50 +++++++++++++++++++++++++++++++++++++++------
 hw/i386/amd_iommu.h |  2 ++
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 996c154a3f..17485aa27f 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -220,18 +220,53 @@ static void amdvi_assign_andq(AMDVIState *s, hwaddr addr, uint64_t val)
    amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) & val);
 }
 
-static void amdvi_generate_msi_interrupt(AMDVIState *s)
+static void amdvi_send_msi_message(AMDVIState *s, MSIMessage msg)
 {
-    MSIMessage msg = {};
     MemTxAttrs attrs = {
         .requester_id = pci_requester_id(&s->pci->dev)
     };
 
+    address_space_stl_le(&address_space_memory, msg.address, msg.data,
+                         attrs, NULL);
+}
+
+static void amdvi_generate_msi_interrupt(AMDVIState *s)
+{
+    MSIMessage msg;
+
     if (msi_enabled(&s->pci->dev)) {
         msg = msi_get_message(&s->pci->dev, 0);
-        address_space_stl_le(&address_space_memory, msg.address, msg.data,
-                             attrs, NULL);
+        amdvi_send_msi_message(s, msg);
+    }
+}
+
+static void amdvi_generate_event_interrupt(AMDVIState *s)
+{
+    uint64_t intcapxt;
+    uint64_t control;
+    MSIMessage msg;
+
+    control = amdvi_readq(s, AMDVI_MMIO_CONTROL);
+    if (!s->xtsup || !(control & AMDVI_MMIO_CONTROL_INTCAPXTEN)) {
+        amdvi_generate_msi_interrupt(s);
+        return;
+    }
+
+    intcapxt = amdvi_readq(s, AMDVI_MMIO_INTCAPXT_EVT);
+    if (!intcapxt) {
+        return;
     }
+
+    X86IOMMUIrq irq = {
+        .delivery_mode = AMDVI_IOAPIC_INT_TYPE_FIXED,
+        .vector = extract64(intcapxt, 32, 8),
+        .dest = extract64(intcapxt, 8, 24) |
+                (extract64(intcapxt, 56, 8) << 24),
+        .dest_mode = extract64(intcapxt, 2, 1),
+    };
+
+    x86_iommu_irq_to_msi_message(&irq, &msg);
+    apic_get_class(NULL)->send_msi(&msg);
 }
 
 static uint32_t get_next_eventlog_entry(AMDVIState *s)
@@ -257,7 +292,7 @@ static void amdvi_log_event(AMDVIState *s, uint64_t *evt)
         /* generate overflow interrupt */
         if (s->evtlog_intr) {
             amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF);
-            amdvi_generate_msi_interrupt(s);
+            amdvi_generate_event_interrupt(s);
         }
         return;
     }
@@ -272,7 +307,7 @@ static void amdvi_log_event(AMDVIState *s, uint64_t *evt)
 
     if (s->evtlog_intr) {
         amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVENT_INT);
-        amdvi_generate_msi_interrupt(s);
+        amdvi_generate_event_interrupt(s);
     }
 }
 
@@ -1740,6 +1775,9 @@ static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val,
         amdvi_mmio_reg_write(s, size, val, addr);
         amdvi_handle_excllim_write(s);
         break;
+    case AMDVI_MMIO_INTCAPXT_EVT:
+        amdvi_mmio_reg_write(s, size, val, addr);
+        break;
         /* PPR log base - unused for now */
     case AMDVI_MMIO_PPR_BASE:
         amdvi_mmio_reg_write(s, size, val, addr);
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index 0b2e8d207b..7e829ccaa2 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -57,6 +57,7 @@
 #define AMDVI_MMIO_EXCL_BASE          0x0020
 #define AMDVI_MMIO_EXCL_LIMIT         0x0028
 #define AMDVI_MMIO_EXT_FEATURES       0x0030
+#define AMDVI_MMIO_INTCAPXT_EVT       0x0170
 #define AMDVI_MMIO_COMMAND_HEAD       0x2000
 #define AMDVI_MMIO_COMMAND_TAIL       0x2008
 #define AMDVI_MMIO_EVENT_HEAD         0x2010
@@ -106,6 +107,7 @@
 #define AMDVI_MMIO_CONTROL_COMWAITINTEN   (1ULL << 4)
 #define AMDVI_MMIO_CONTROL_CMDBUFLEN      (1ULL << 12)
 #define AMDVI_MMIO_CONTROL_GAEN           (1ULL << 17)
+#define AMDVI_MMIO_CONTROL_INTCAPXTEN     (1ULL << 51)
 
 /* MMIO status register bits */
 #define AMDVI_MMIO_STATUS_CMDBUF_RUN  (1 << 4)
-- 
2.43.5