On ARM, when a device is behind an IOMMU, its MSI doorbell address is
subject to translation by the IOMMU. This behavior affects vfio-pci
passthrough devices assigned to guests using an accelerated SMMUv3.
In this setup, we configure the host SMMUv3 in nested mode, where
VFIO sets up the Stage-2 (S2) mappings for guest RAM, while the guest
controls Stage-1 (S1). To allow VFIO to correctly configure S2 mappings,
we currently return the system address space via the get_address_space()
callback for vfio-pci devices.
However, QEMU/KVM also uses this same callback path when resolving the
address space for MSI doorbells:
kvm_irqchip_add_msi_route()
kvm_arch_fixup_msi_route()
pci_device_iommu_address_space()
This leads to problems when MSI doorbells need to be translated.
To fix this, introduce an optional get_msi_address_space() callback.
In the SMMUv3 accelerated case, this callback returns the IOMMU address
space if the guest has set up S1 translations for the vfio-pci device.
Otherwise, it returns the system address space.
Suggested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
hw/arm/smmuv3-accel.c | 25 +++++++++++++++++++++++++
hw/pci/pci.c | 19 +++++++++++++++++++
include/hw/pci/pci.h | 16 ++++++++++++++++
target/arm/kvm.c | 2 +-
4 files changed, 61 insertions(+), 1 deletion(-)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index f1584dd775..04c665ccf5 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -346,6 +346,30 @@ static void smmuv3_accel_unset_iommu_device(PCIBus *bus, void *opaque,
}
}
+static AddressSpace *smmuv3_accel_find_msi_as(PCIBus *bus, void *opaque,
+ int devfn)
+{
+ SMMUState *bs = opaque;
+ SMMUPciBus *sbus;
+ SMMUv3AccelDevice *accel_dev;
+ SMMUDevice *sdev;
+
+ sbus = smmu_get_sbus(bs, bus);
+ accel_dev = smmuv3_accel_get_dev(bs, sbus, bus, devfn);
+ sdev = &accel_dev->sdev;
+
+ /*
+ * If the assigned vfio-pci dev has S1 translation enabled by
+ * Guest, return IOMMU address space for MSI translation.
+ * Otherwise, return system address space.
+ */
+ if (accel_dev->s1_hwpt) {
+ return &sdev->as;
+ } else {
+ return &accel_dev->as_sysmem;
+ }
+}
+
static bool smmuv3_accel_pdev_allowed(PCIDevice *pdev, bool *vfio_pci)
{
@@ -407,6 +431,7 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
.get_viommu_cap = smmuv3_accel_get_viommu_cap,
.set_iommu_device = smmuv3_accel_set_iommu_device,
.unset_iommu_device = smmuv3_accel_unset_iommu_device,
+ .get_msi_address_space = smmuv3_accel_find_msi_as,
};
void smmuv3_accel_init(SMMUv3State *s)
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 13de0e2809..404aeb643d 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2957,6 +2957,25 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
return &address_space_memory;
}
+AddressSpace *pci_device_iommu_msi_address_space(PCIDevice *dev)
+{
+ PCIBus *bus;
+ PCIBus *iommu_bus;
+ int devfn;
+
+ pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
+ if (iommu_bus) {
+ if (iommu_bus->iommu_ops->get_msi_address_space) {
+ return iommu_bus->iommu_ops->get_msi_address_space(bus,
+ iommu_bus->iommu_opaque, devfn);
+ } else {
+ return iommu_bus->iommu_ops->get_address_space(bus,
+ iommu_bus->iommu_opaque, devfn);
+ }
+ }
+ return &address_space_memory;
+}
+
int pci_iommu_init_iotlb_notifier(PCIDevice *dev, IOMMUNotifier *n,
IOMMUNotify fn, void *opaque)
{
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index d1d43e9fb9..55138c406e 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -639,12 +639,28 @@ typedef struct PCIIOMMUOps {
uint32_t pasid, bool priv_req, bool exec_req,
hwaddr addr, bool lpig, uint16_t prgi, bool is_read,
bool is_write);
+ /**
+ * @get_msi_address_space: get the address space for MSI doorbell address
+ * for devices
+ *
+ * Optional callback which returns a pointer to an #AddressSpace. This
+ * is required if MSI doorbell also gets translated through IOMMU(eg: ARM)
+ *
+ * @bus: the #PCIBus being accessed.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number
+ */
+ AddressSpace * (*get_msi_address_space)(PCIBus *bus, void *opaque,
+ int devfn);
} PCIIOMMUOps;
AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
Error **errp);
void pci_device_unset_iommu_device(PCIDevice *dev);
+AddressSpace *pci_device_iommu_msi_address_space(PCIDevice *dev);
/**
* pci_device_get_viommu_cap: get vIOMMU capabilities.
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index 6672344855..c78d0d59bb 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -1535,7 +1535,7 @@ int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level)
int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
uint64_t address, uint32_t data, PCIDevice *dev)
{
- AddressSpace *as = pci_device_iommu_address_space(dev);
+ AddressSpace *as = pci_device_iommu_msi_address_space(dev);
hwaddr xlat, len, doorbell_gpa;
MemoryRegionSection mrs;
MemoryRegion *mr;
--
2.34.1
On 7/14/25 5:59 PM, Shameer Kolothum wrote:
> On ARM, when a device is behind an IOMMU, its MSI doorbell address is
> subject to translation by the IOMMU. This behavior affects vfio-pci
> passthrough devices assigned to guests using an accelerated SMMUv3.
>
> In this setup, we configure the host SMMUv3 in nested mode, where
> VFIO sets up the Stage-2 (S2) mappings for guest RAM, while the guest
> controls Stage-1 (S1). To allow VFIO to correctly configure S2 mappings,
> we currently return the system address space via the get_address_space() callback for vfio-pci devices.
>
> However, QEMU/KVM also uses this same callback path when resolving the
> address space for MSI doorbells:
>
> kvm_irqchip_add_msi_route()
> kvm_arch_fixup_msi_route()
> pci_device_iommu_address_space()
>
> This leads to problems when MSI doorbells need to be translated.
Worth to explain the exact "problem" ;-)
Eric
>
> To fix this, introduce an optional get_msi_address_space() callback.
> In the SMMUv3 accelerated case, this callback returns the IOMMU address
> space if the guest has set up S1 translations for the vfio-pci device.
> Otherwise, it returns the system address space.
>
> Suggested-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
> hw/arm/smmuv3-accel.c | 25 +++++++++++++++++++++++++
> hw/pci/pci.c | 19 +++++++++++++++++++
> include/hw/pci/pci.h | 16 ++++++++++++++++
> target/arm/kvm.c | 2 +-
> 4 files changed, 61 insertions(+), 1 deletion(-)
>
> diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
> i ndex f1584dd775..04c665ccf5 100644
> --- a/hw/arm/smmuv3-accel.c
> +++ b/hw/arm/smmuv3-accel.c
> @@ -346,6 +346,30 @@ static void smmuv3_accel_unset_iommu_device(PCIBus *bus, void *opaque,
> }
> }
>
> +static AddressSpace *smmuv3_accel_find_msi_as(PCIBus *bus, void *opaque,
> + int devfn)
> +{
> + SMMUState *bs = opaque;
> + SMMUPciBus *sbus;
> + SMMUv3AccelDevice *accel_dev;
> + SMMUDevice *sdev;
> +
> + sbus = smmu_get_sbus(bs, bus);
> + accel_dev = smmuv3_accel_get_dev(bs, sbus, bus, devfn);
> + sdev = &accel_dev->sdev;
> +
> + /*
> + * If the assigned vfio-pci dev has S1 translation enabled by
> + * Guest, return IOMMU address space for MSI translation.
> + * Otherwise, return system address space.
> + */
> + if (accel_dev->s1_hwpt) {
> + return &sdev->as;
> + } else {
> + return &accel_dev->as_sysmem;
> + }
> +}
> +
> static bool smmuv3_accel_pdev_allowed(PCIDevice *pdev, bool *vfio_pci)
> {
>
> @@ -407,6 +431,7 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
> .get_viommu_cap = smmuv3_accel_get_viommu_cap,
> .set_iommu_device = smmuv3_accel_set_iommu_device,
> .unset_iommu_device = smmuv3_accel_unset_iommu_device,
> + .get_msi_address_space = smmuv3_accel_find_msi_as,
> };
>
> void smmuv3_accel_init(SMMUv3State *s)
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 13de0e2809..404aeb643d 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -2957,6 +2957,25 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
> return &address_space_memory;
> }
>
> +AddressSpace *pci_device_iommu_msi_address_space(PCIDevice *dev)
> +{
> + PCIBus *bus;
> + PCIBus *iommu_bus;
> + int devfn;
> +
> + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
> + if (iommu_bus) {
> + if (iommu_bus->iommu_ops->get_msi_address_space) {
> + return iommu_bus->iommu_ops->get_msi_address_space(bus,
> + iommu_bus->iommu_opaque, devfn);
> + } else {
> + return iommu_bus->iommu_ops->get_address_space(bus,
> + iommu_bus->iommu_opaque, devfn);
> + }
> + }
> + return &address_space_memory;
> +}
> +
> int pci_iommu_init_iotlb_notifier(PCIDevice *dev, IOMMUNotifier *n,
> IOMMUNotify fn, void *opaque)
> {
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index d1d43e9fb9..55138c406e 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -639,12 +639,28 @@ typedef struct PCIIOMMUOps {
> uint32_t pasid, bool priv_req, bool exec_req,
> hwaddr addr, bool lpig, uint16_t prgi, bool is_read,
> bool is_write);
> + /**
> + * @get_msi_address_space: get the address space for MSI doorbell address
> + * for devices
> + *
> + * Optional callback which returns a pointer to an #AddressSpace. This
> + * is required if MSI doorbell also gets translated through IOMMU(eg: ARM)
> + *
> + * @bus: the #PCIBus being accessed.
> + *
> + * @opaque: the data passed to pci_setup_iommu().
> + *
> + * @devfn: device and function number
> + */
> + AddressSpace * (*get_msi_address_space)(PCIBus *bus, void *opaque,
> + int devfn);
> } PCIIOMMUOps;
>
> AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
> bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
> Error **errp);
> void pci_device_unset_iommu_device(PCIDevice *dev);
> +AddressSpace *pci_device_iommu_msi_address_space(PCIDevice *dev);
>
> /**
> * pci_device_get_viommu_cap: get vIOMMU capabilities.
> diff --git a/target/arm/kvm.c b/target/arm/kvm.c
> index 6672344855..c78d0d59bb 100644
> --- a/target/arm/kvm.c
> +++ b/target/arm/kvm.c
> @@ -1535,7 +1535,7 @@ int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level)
> int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
> uint64_t address, uint32_t data, PCIDevice *dev)
> {
> - AddressSpace *as = pci_device_iommu_address_space(dev);
> + AddressSpace *as = pci_device_iommu_msi_address_space(dev);
> hwaddr xlat, len, doorbell_gpa;
> MemoryRegionSection mrs;
> MemoryRegion *mr;
On Mon, Jul 14, 2025 at 04:59:37PM +0100, Shameer Kolothum wrote: > On ARM, when a device is behind an IOMMU, its MSI doorbell address is > subject to translation by the IOMMU. This behavior affects vfio-pci > passthrough devices assigned to guests using an accelerated SMMUv3. > > In this setup, we configure the host SMMUv3 in nested mode, where > VFIO sets up the Stage-2 (S2) mappings for guest RAM, while the guest > controls Stage-1 (S1). To allow VFIO to correctly configure S2 mappings, > we currently return the system address space via the get_address_space() > callback for vfio-pci devices. > > However, QEMU/KVM also uses this same callback path when resolving the > address space for MSI doorbells: > > kvm_irqchip_add_msi_route() > kvm_arch_fixup_msi_route() > pci_device_iommu_address_space() > > This leads to problems when MSI doorbells need to be translated. > > To fix this, introduce an optional get_msi_address_space() callback. > In the SMMUv3 accelerated case, this callback returns the IOMMU address > space if the guest has set up S1 translations for the vfio-pci device. > Otherwise, it returns the system address space. > > Suggested-by: Nicolin Chen <nicolinc@nvidia.com> > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> > --- > hw/arm/smmuv3-accel.c | 25 +++++++++++++++++++++++++ > hw/pci/pci.c | 19 +++++++++++++++++++ > include/hw/pci/pci.h | 16 ++++++++++++++++ > target/arm/kvm.c | 2 +- I think we need to separate core changes and smmu changes, like how pci_device_set/unset_iommu_device were introduced. Thanks Nicolin
On 7/14/25 9:50 PM, Nicolin Chen wrote: > On Mon, Jul 14, 2025 at 04:59:37PM +0100, Shameer Kolothum wrote: >> On ARM, when a device is behind an IOMMU, its MSI doorbell address is >> subject to translation by the IOMMU. This behavior affects vfio-pci >> passthrough devices assigned to guests using an accelerated SMMUv3. >> >> In this setup, we configure the host SMMUv3 in nested mode, where >> VFIO sets up the Stage-2 (S2) mappings for guest RAM, while the guest >> controls Stage-1 (S1). To allow VFIO to correctly configure S2 mappings, >> we currently return the system address space via the get_address_space() >> callback for vfio-pci devices. >> >> However, QEMU/KVM also uses this same callback path when resolving the >> address space for MSI doorbells: >> >> kvm_irqchip_add_msi_route() >> kvm_arch_fixup_msi_route() >> pci_device_iommu_address_space() >> >> This leads to problems when MSI doorbells need to be translated. >> >> To fix this, introduce an optional get_msi_address_space() callback. >> In the SMMUv3 accelerated case, this callback returns the IOMMU address >> space if the guest has set up S1 translations for the vfio-pci device. >> Otherwise, it returns the system address space. >> >> Suggested-by: Nicolin Chen <nicolinc@nvidia.com> >> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> >> --- >> hw/arm/smmuv3-accel.c | 25 +++++++++++++++++++++++++ >> hw/pci/pci.c | 19 +++++++++++++++++++ >> include/hw/pci/pci.h | 16 ++++++++++++++++ >> target/arm/kvm.c | 2 +- > I think we need to separate core changes and smmu changes, like how > pci_device_set/unset_iommu_device were introduced. I agree with Nicolin. Eric > > Thanks > Nicolin >
© 2016 - 2025 Red Hat, Inc.