On ARM, when a device is behind an IOMMU, its MSI doorbell address is
subject to translation by the IOMMU. This behavior affects vfio-pci
passthrough devices assigned to guests using an accelerated SMMUv3.
In this setup, we configure the host SMMUv3 in nested mode, where
VFIO sets up the Stage-2 (S2) mappings for guest RAM, while the guest
controls Stage-1 (S1). To allow VFIO to correctly configure S2 mappings,
we currently return the system address space via the get_address_space()
callback for vfio-pci devices.
However, QEMU/KVM also uses this same callback path when resolving the
address space for MSI doorbells:
kvm_irqchip_add_msi_route()
kvm_arch_fixup_msi_route()
pci_device_iommu_address_space()
get_address_space()
This will cause the device to be configured with wrong MSI doorbell
address if it return the system address space.
Introduce an optional get_msi_address_space() callback and use that in
the above path if available. This will enable IOMMU implementations to
make use of this if required.
Suggested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/pci/pci.c | 19 +++++++++++++++++++
include/hw/pci/pci.h | 16 ++++++++++++++++
target/arm/kvm.c | 2 +-
3 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 1315ef13ea..6f9e1616dd 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2964,6 +2964,25 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
return &address_space_memory;
}
+AddressSpace *pci_device_iommu_msi_address_space(PCIDevice *dev)
+{
+ PCIBus *bus;
+ PCIBus *iommu_bus;
+ int devfn;
+
+ pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
+ if (iommu_bus) {
+ if (iommu_bus->iommu_ops->get_msi_address_space) {
+ return iommu_bus->iommu_ops->get_msi_address_space(bus,
+ iommu_bus->iommu_opaque, devfn);
+ } else {
+ return iommu_bus->iommu_ops->get_address_space(bus,
+ iommu_bus->iommu_opaque, devfn);
+ }
+ }
+ return &address_space_memory;
+}
+
int pci_iommu_init_iotlb_notifier(PCIDevice *dev, IOMMUNotifier *n,
IOMMUNotify fn, void *opaque)
{
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index c54f2b53ae..0d3b351903 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -652,6 +652,21 @@ typedef struct PCIIOMMUOps {
uint32_t pasid, bool priv_req, bool exec_req,
hwaddr addr, bool lpig, uint16_t prgi, bool is_read,
bool is_write);
+ /**
+ * @get_msi_address_space: get the address space for MSI doorbell address
+ * for devices
+ *
+ * Optional callback which returns a pointer to an #AddressSpace. This
+ * is required if MSI doorbell also gets translated through IOMMU(eg: ARM)
+ *
+ * @bus: the #PCIBus being accessed.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number
+ */
+ AddressSpace * (*get_msi_address_space)(PCIBus *bus, void *opaque,
+ int devfn);
} PCIIOMMUOps;
bool pci_device_get_iommu_bus_devfn(PCIDevice *dev, PCIBus **piommu_bus,
@@ -660,6 +675,7 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
Error **errp);
void pci_device_unset_iommu_device(PCIDevice *dev);
+AddressSpace *pci_device_iommu_msi_address_space(PCIDevice *dev);
/**
* pci_device_get_viommu_flags: get vIOMMU flags.
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index b8a1c071f5..10eb8655c6 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -1611,7 +1611,7 @@ int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level)
int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
uint64_t address, uint32_t data, PCIDevice *dev)
{
- AddressSpace *as = pci_device_iommu_address_space(dev);
+ AddressSpace *as = pci_device_iommu_msi_address_space(dev);
hwaddr xlat, len, doorbell_gpa;
MemoryRegionSection mrs;
MemoryRegion *mr;
--
2.43.0
Hi Shameer
On 9/29/25 3:36 PM, Shameer Kolothum wrote:
> On ARM, when a device is behind an IOMMU, its MSI doorbell address is
> subject to translation by the IOMMU. This behavior affects vfio-pci
> passthrough devices assigned to guests using an accelerated SMMUv3.
>
> In this setup, we configure the host SMMUv3 in nested mode, where
> VFIO sets up the Stage-2 (S2) mappings for guest RAM, while the guest
> controls Stage-1 (S1). To allow VFIO to correctly configure S2 mappings,
> we currently return the system address space via the get_address_space()
> callback for vfio-pci devices.
>
> However, QEMU/KVM also uses this same callback path when resolving the
> address space for MSI doorbells:
>
> kvm_irqchip_add_msi_route()
> kvm_arch_fixup_msi_route()
> pci_device_iommu_address_space()
> get_address_space()
>
> This will cause the device to be configured with wrong MSI doorbell
> address if it return the system address space.
returns
> Introduce an optional get_msi_address_space() callback and use that in
> the above path if available. This will enable IOMMU implementations to
> make use of this if required.
if required
> Suggested-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> hw/pci/pci.c | 19 +++++++++++++++++++
> include/hw/pci/pci.h | 16 ++++++++++++++++
> target/arm/kvm.c | 2 +-
> 3 files changed, 36 insertions(+), 1 deletion(-)
>
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 1315ef13ea..6f9e1616dd 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -2964,6 +2964,25 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
> return &address_space_memory;
> }
>
> +AddressSpace *pci_device_iommu_msi_address_space(PCIDevice *dev)
> +{
> + PCIBus *bus;
> + PCIBus *iommu_bus;
> + int devfn;
> +
> + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
> + if (iommu_bus) {
> + if (iommu_bus->iommu_ops->get_msi_address_space) {
> + return iommu_bus->iommu_ops->get_msi_address_space(bus,
> + iommu_bus->iommu_opaque, devfn);
See my reply to Nicolin's comment. From a high-level point of view, the
semantics of get_msi_address_space() versus get_address_space() do not
look very clear. I have the impression that, for the HW nested
implementation, you were forced to return the system address space
through get_address_space() although there is a protecting IOMMU, and
that you need another callback to return a proper IOMMU address space
for MSIs. This is still unclear and looks hacky to me at this point. I
think we need to make the semantics of get_msi_address_space() vs
get_address_space() more solid, and you need to explain why
get_address_space() is mandated to return the system address space in
our case.
Maybe you explained that earlier in some thread, but I failed to find
that information again in the commit messages/comments, and I think this
is important.
> + } else {
> + return iommu_bus->iommu_ops->get_address_space(bus,
> + iommu_bus->iommu_opaque, devfn);
> + }
> + }
> + return &address_space_memory;
> +}
> +
> int pci_iommu_init_iotlb_notifier(PCIDevice *dev, IOMMUNotifier *n,
> IOMMUNotify fn, void *opaque)
> {
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index c54f2b53ae..0d3b351903 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -652,6 +652,21 @@ typedef struct PCIIOMMUOps {
> uint32_t pasid, bool priv_req, bool exec_req,
> hwaddr addr, bool lpig, uint16_t prgi, bool is_read,
> bool is_write);
> + /**
> + * @get_msi_address_space: get the address space for MSI doorbell address
> + * for devices
> + *
> + * Optional callback which returns a pointer to an #AddressSpace. This
> + * is required if MSI doorbell also gets translated through IOMMU(eg: ARM)
IOMMU (
> + *
> + * @bus: the #PCIBus being accessed.
> + *
> + * @opaque: the data passed to pci_setup_iommu().
> + *
> + * @devfn: device and function number
> + */
> + AddressSpace * (*get_msi_address_space)(PCIBus *bus, void *opaque,
> + int devfn);
> } PCIIOMMUOps;
>
> bool pci_device_get_iommu_bus_devfn(PCIDevice *dev, PCIBus **piommu_bus,
> @@ -660,6 +675,7 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
> bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
> Error **errp);
> void pci_device_unset_iommu_device(PCIDevice *dev);
> +AddressSpace *pci_device_iommu_msi_address_space(PCIDevice *dev);
>
> /**
> * pci_device_get_viommu_flags: get vIOMMU flags.
> diff --git a/target/arm/kvm.c b/target/arm/kvm.c
> index b8a1c071f5..10eb8655c6 100644
> --- a/target/arm/kvm.c
> +++ b/target/arm/kvm.c
> @@ -1611,7 +1611,7 @@ int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level)
> int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
> uint64_t address, uint32_t data, PCIDevice *dev)
> {
> - AddressSpace *as = pci_device_iommu_address_space(dev);
> + AddressSpace *as = pci_device_iommu_msi_address_space(dev);
> hwaddr xlat, len, doorbell_gpa;
> MemoryRegionSection mrs;
> MemoryRegion *mr;
Thanks
Eric
On Mon, Sep 29, 2025 at 02:36:27PM +0100, Shameer Kolothum wrote:
> On ARM, when a device is behind an IOMMU, its MSI doorbell address is
> subject to translation by the IOMMU. This behavior affects vfio-pci
> passthrough devices assigned to guests using an accelerated SMMUv3.
>
> In this setup, we configure the host SMMUv3 in nested mode, where
> VFIO sets up the Stage-2 (S2) mappings for guest RAM, while the guest
> controls Stage-1 (S1). To allow VFIO to correctly configure S2 mappings,
> we currently return the system address space via the get_address_space()
> callback for vfio-pci devices.
>
> However, QEMU/KVM also uses this same callback path when resolving the
> address space for MSI doorbells:
>
> kvm_irqchip_add_msi_route()
> kvm_arch_fixup_msi_route()
> pci_device_iommu_address_space()
> get_address_space()
>
> This will cause the device to be configured with wrong MSI doorbell
> address if it return the system address space.
I think it'd be nicer to elaborate why a wrong address will be returned:
--------------------------------------------------------------------------
On ARM, a device behind an IOMMU requires translation for its MSI doorbell
address. When HW nested translation is enabled, the translation will also
happen in two stages: gIOVA => gPA => ITS page.
In the accelerated SMMUv3 mode, both stages are translated by the HW. So,
get_address_space() returns the system address space for stage-2 mappings,
as the smmuv3-accel model doesn't involve in either stage.
On the other hand, this callback is also invoked by QEMU/KVM:
kvm_irqchip_add_msi_route()
kvm_arch_fixup_msi_route()
pci_device_iommu_address_space()
get_address_space()
What KVM wants is to translate an MSI doorbell gIOVA to a vITS page (gPA),
so as to inject IRQs to the guest VM. And it expected get_address_space()
to return the address space for stage-1 mappings instead. Apparently, this
is broken.
Introduce an optional get_msi_address_space() callback and use that in the
above path.
--------------------------------------------------------------------------
> @@ -652,6 +652,21 @@ typedef struct PCIIOMMUOps {
> uint32_t pasid, bool priv_req, bool exec_req,
> hwaddr addr, bool lpig, uint16_t prgi, bool is_read,
> bool is_write);
> + /**
> + * @get_msi_address_space: get the address space for MSI doorbell address
> + * for devices
+ * @get_msi_address_space: get the address space to translate MSI doorbell
+ * address for a device
> + *
> + * Optional callback which returns a pointer to an #AddressSpace. This
> + * is required if MSI doorbell also gets translated through IOMMU(eg: ARM)
through vIOMMU (e.g. ARM).
With these,
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Hi Nicolin, Shameer,
On 10/17/25 12:30 AM, Nicolin Chen wrote:
> On Mon, Sep 29, 2025 at 02:36:27PM +0100, Shameer Kolothum wrote:
>> On ARM, when a device is behind an IOMMU, its MSI doorbell address is
>> subject to translation by the IOMMU. This behavior affects vfio-pci
>> passthrough devices assigned to guests using an accelerated SMMUv3.
>>
>> In this setup, we configure the host SMMUv3 in nested mode, where
>> VFIO sets up the Stage-2 (S2) mappings for guest RAM, while the guest
>> controls Stage-1 (S1). To allow VFIO to correctly configure S2 mappings,
>> we currently return the system address space via the get_address_space()
>> callback for vfio-pci devices.
>>
>> However, QEMU/KVM also uses this same callback path when resolving the
>> address space for MSI doorbells:
>>
>> kvm_irqchip_add_msi_route()
>> kvm_arch_fixup_msi_route()
>> pci_device_iommu_address_space()
>> get_address_space()
>>
>> This will cause the device to be configured with wrong MSI doorbell
>> address if it return the system address space.
> I think it'd be nicer to elaborate why a wrong address will be returned:
>
> --------------------------------------------------------------------------
> On ARM, a device behind an IOMMU requires translation for its MSI doorbell
> address. When HW nested translation is enabled, the translation will also
> happen in two stages: gIOVA => gPA => ITS page.
>
> In the accelerated SMMUv3 mode, both stages are translated by the HW. So,
> get_address_space() returns the system address space for stage-2 mappings,
> as the smmuv3-accel model doesn't involve in either stage.
I don't understand "doesn't involve in either stage". It is still not
obvious to me why, for a HW accelerated nested IOMMU, get_address_space()
shall return the system address space. I think this deserves to be
explained and maybe documented along with the callback.
>
> On the other hand, this callback is also invoked by QEMU/KVM:
>
> kvm_irqchip_add_msi_route()
> kvm_arch_fixup_msi_route()
> pci_device_iommu_address_space()
> get_address_space()
>
> What KVM wants is to translate an MSI doorbell gIOVA to a vITS page (gPA),
> so as to inject IRQs to the guest VM. And it expected get_address_space()
> to return the address space for stage-1 mappings instead. Apparently, this
> is broken.
"Apparently this is broken". Please clarify what is broken. Definitively if
pci_device_iommu_address_space(dev) returns &address_space_memory, no
translation is attempted.
kvm_arch_fixup_msi_route() was introduced by
https://lore.kernel.org/all/1523518688-26674-12-git-send-email-eric.auger@redhat.com/
This relies on the vIOMMU translate callback, which is supposed to be bypassed in general with VFIO devices. Isn't it needed only for emulated devices?
Maybe you and Shameer discussed that in a previous thread. It might be worth adding a link to that discussion.
Thanks
Eric
>
> Introduce an optional get_msi_address_space() callback and use that in the
> above path.
> --------------------------------------------------------------------------
>
>> @@ -652,6 +652,21 @@ typedef struct PCIIOMMUOps {
>> uint32_t pasid, bool priv_req, bool exec_req,
>> hwaddr addr, bool lpig, uint16_t prgi, bool is_read,
>> bool is_write);
>> + /**
>> + * @get_msi_address_space: get the address space for MSI doorbell address
>> + * for devices
> + * @get_msi_address_space: get the address space to translate MSI doorbell
> + * address for a device
>
>> + *
>> + * Optional callback which returns a pointer to an #AddressSpace. This
>> + * is required if MSI doorbell also gets translated through IOMMU(eg: ARM)
> through vIOMMU (e.g. ARM).
>
> With these,
>
> Reviewed-by Nicolin Chen <nicolinc@nvidia.com>
>
On Mon, Oct 20, 2025 at 06:14:33PM +0200, Eric Auger wrote:
> >> This will cause the device to be configured with wrong MSI doorbell
> >> address if it return the system address space.
> >
> > I think it'd be nicer to elaborate why a wrong address will be returned:
> >
> > --------------------------------------------------------------------------
> > On ARM, a device behind an IOMMU requires translation for its MSI doorbell
> > address. When HW nested translation is enabled, the translation will also
> > happen in two stages: gIOVA => gPA => ITS page.
> >
> > In the accelerated SMMUv3 mode, both stages are translated by the HW. So,
> > get_address_space() returns the system address space for stage-2 mappings,
> > as the smmuv3-accel model doesn't involve in either stage.
> I don't understand "doesn't involve in either stage". This is still not
> obious to me that for an HW accelerated nested IOMMU get_address_space()
> shall return the system address space. I think this deserves to be
> explained and maybe documented along with the callback.

get_address_space() is used by pci_device_iommu_address_space(),
which is for attach or translation.

In QEMU, we have an "iommu" type of memory region, to represent
the address space providing the stage-1 translation.

In accel case excluding MSI, there is no need of "emulated iommu
translation" since HW/host SMMU takes care of both stages. Thus,
the system address is returned for get_address_space(), to avoid
stage-1 translation and to also allow VFIO devices to attach to
the system address space that the VFIO core will monitor to take
care of stage-2 mappings.

> > On the other hand, this callback is also invoked by QEMU/KVM:
> >
> > kvm_irqchip_add_msi_route()
> > kvm_arch_fixup_msi_route()
> > pci_device_iommu_address_space()
> > get_address_space()
> >
> > What KVM wants is to translate an MSI doorbell gIOVA to a vITS page (gPA),
> > so as to inject IRQs to the guest VM. And it expected get_address_space()
> > to return the address space for stage-1 mappings instead. Apparently, this
> > is broken.
> "Apparently this is broken". Please clarify what is broken. Definitively if
>
> pci_device_iommu_address_space(dev) retruns @adress_system_memory no
> translation is attempted.

Hmm, I thought my writing was clear:
 - pci_device_iommu_address_space() returns the system address
   space that can't do a stage-1 translation.
 - KVM/MSI pathway requires an address space that can do a stage-1
   translation.

> kvm_arch_fixup_msi_route() was introduced by
> https://lore.kernel.org/all/1523518688-26674-12-git-send-email-eric.auger@redhat.com/
>
> This relies on the vIOMMU translate callback which is supposed to be bypassed in general with VFIO devices. Isn't needed only for emulated devices?

Not only for emulated devices.

This KVM function needs the translation for the IRQ injection for
VFIO devices as well.

Although we use RMR for underlying HW to bypass the stage-1, the
translation for gIOVA=>vITS page (VIRT_GIC_ITS) still exists in
the guest level. FWIW, it just doesn't have the stage-2 mapping
because HW never uses the "gIOVA" but a hard-coded SW_MSI address.

In the meantime, a VFIO device in the guest is programmed with a
gIOVA for MSI doorbell. This gIOVA can't be used for KVM code to
inject IRQs. It needs the gPA (i.e. VIRT_GIC_ITS). So, it needs a
translation address space to do that.

Hope this is clear now.

Thanks
Nicolin
Hi Nicolin,

On 10/20/25 8:00 PM, Nicolin Chen wrote:
> On Mon, Oct 20, 2025 at 06:14:33PM +0200, Eric Auger wrote:
>>>> This will cause the device to be configured with wrong MSI doorbell
>>>> address if it return the system address space.
>>> I think it'd be nicer to elaborate why a wrong address will be returned:
>>>
>>> --------------------------------------------------------------------------
>>> On ARM, a device behind an IOMMU requires translation for its MSI doorbell
>>> address. When HW nested translation is enabled, the translation will also
>>> happen in two stages: gIOVA => gPA => ITS page.
>>>
>>> In the accelerated SMMUv3 mode, both stages are translated by the HW. So,
>>> get_address_space() returns the system address space for stage-2 mappings,
>>> as the smmuv3-accel model doesn't involve in either stage.
>> I don't understand "doesn't involve in either stage". This is still not
>> obious to me that for an HW accelerated nested IOMMU get_address_space()
>> shall return the system address space. I think this deserves to be
>> explained and maybe documented along with the callback.
> get_address_space() is used by pci_device_iommu_address_space(),
> which is for attach or translation.
>
> In QEMU, we have an "iommu" type of memory region, to represent
> the address space providing the stage-1 translation.
>
> In accel case excluding MSI, there is no need of "emulated iommu
> translation" since HW/host SMMU takes care of both stages. Thus,
> the system address is returned for get_address_space(), to avoid
> stage-1 translation and to also allow VFIO devices to attach to
> the system address space that the VFIO core will monitor to take
> care of stage-2 mappings.

But in general, if you set the system_address_memory as the output 'as',
it rather means you have no translation in place. This is what I am not
convinced about.

You say it aims at:
- avoiding stage-1 translation
- allowing VFIO devices to attach to the system address space that the
  VFIO core will monitor to take care of stage-2 mappings.
Can you achieve the same goals with a proper address space?

>
>>> On the other hand, this callback is also invoked by QEMU/KVM:
>>>
>>> kvm_irqchip_add_msi_route()
>>> kvm_arch_fixup_msi_route()
>>> pci_device_iommu_address_space()
>>> get_address_space()
>>>
>>> What KVM wants is to translate an MSI doorbell gIOVA to a vITS page (gPA),
>>> so as to inject IRQs to the guest VM. And it expected get_address_space()
>>> to return the address space for stage-1 mappings instead. Apparently, this
>>> is broken.
>> "Apparently this is broken". Please clarify what is broken. Definitively if
>>
>> pci_device_iommu_address_space(dev) retruns @adress_system_memory no
>> translation is attempted.
> Hmm, I thought my writing was clear:
> - pci_device_iommu_address_space() returns the system address
> space that can't do a stage-1 translation.
> - KVM/MSI pathway requires an adress space that can do a stage-1
> translation.

Understood, although I am not sure using the system address space is the
best choice. But I may not be the best person to decide about this.

>
>> kvm_arch_fixup_msi_route() was introduced by
>> https://lore.kernel.org/all/1523518688-26674-12-git-send-email-eric.auger@redhat.com/
>>
>> This relies on the vIOMMU translate callback which is supposed to be bypassed in general with VFIO devices. Isn't needed only for emulated devices?
> Not only for emulated devices.
>
> This KVM function needs the translation for the IRQ injection for
> VFIO devices as well.

Understood.

>
> Although we use RMR for underlying HW to bypass the stage-1, the
> translation for gIOVA=>vITS page (VIRT_GIC_ITS) still exists in
> the guest level. FWIW, it's just doesn't have the stage-2 mapping
> because HW never uses the "gIOVA" but a hard-coded SW_MSI address.
>
> In the meantime, a VFIO device in the guest is programmed with a
> gIOVA for MSI doorbell. This gIOVA can't be used for KVM code to
> inject IRQs. It needs the gPA (i.e. VIRT_GIC_ITS). So, it needs a
> translation address space to do that.
>
> Hope this is clear now.

OK. I understand the need, but I am unsure that using the system address
space is the right choice.

Eric

>
> Thanks
> Nicolin
>
On Tue, Oct 21, 2025 at 06:26:39PM +0200, Eric Auger wrote:
> Hi Nicolin,
>
> On 10/20/25 8:00 PM, Nicolin Chen wrote:
> > On Mon, Oct 20, 2025 at 06:14:33PM +0200, Eric Auger wrote:
> >>>> This will cause the device to be configured with wrong MSI doorbell
> >>>> address if it return the system address space.
> >>> I think it'd be nicer to elaborate why a wrong address will be returned:
> >>>
> >>> --------------------------------------------------------------------------
> >>> On ARM, a device behind an IOMMU requires translation for its MSI doorbell
> >>> address. When HW nested translation is enabled, the translation will also
> >>> happen in two stages: gIOVA => gPA => ITS page.
> >>>
> >>> In the accelerated SMMUv3 mode, both stages are translated by the HW. So,
> >>> get_address_space() returns the system address space for stage-2 mappings,
> >>> as the smmuv3-accel model doesn't involve in either stage.
> >> I don't understand "doesn't involve in either stage". This is still not
> >> obious to me that for an HW accelerated nested IOMMU get_address_space()
> >> shall return the system address space. I think this deserves to be
> >> explained and maybe documented along with the callback.
> > get_address_space() is used by pci_device_iommu_address_space(),
> > which is for attach or translation.
> >
> > In QEMU, we have an "iommu" type of memory region, to represent
> > the address space providing the stage-1 translation.
> >
> > In accel case excluding MSI, there is no need of "emulated iommu
> > translation" since HW/host SMMU takes care of both stages. Thus,
> > the system address is returned for get_address_space(), to avoid
> > stage-1 translation and to also allow VFIO devices to attach to
> > the system address space that the VFIO core will monitor to take
> > care of stage-2 mappings.
> but in general if you set as output 'as' the system_address_memory it
> rather means you have no translation in place. This is what I am not
> convinced about.
You mean you are not convinced about "no translation"?
> you say it aims at
> - avoiding stage-1 translation - allow VFIO devices to attach to the
> system address space that the VFIO core will monitor to take care of
> stage-2 mappings. Can you achieve the same goals with a proper address
> space?
Would you please define "proper"?
The disagreement is seemingly about using system address space or
even address_space_memory, IIUIC.
To our purpose here, so long as the vfio core can setup a proper
listener to monitor the guest physical address space, we are fine
with any alternative.
The system address space just seems to be the simplest one. FWIW,
kvm_arch_fixup_msi_route() also checks in the beginning:
if (as == &address_space_memory)
So, returning @address_space_memory seems to be straightforward?
I think I also need some education to understand why do we need
an indirect address space that eventually will be routed back to
address_space_memory?
Thanks
Nicolin
Hi Nicolin,
On 10/21/25 8:56 PM, Nicolin Chen wrote:
> On Tue, Oct 21, 2025 at 06:26:39PM +0200, Eric Auger wrote:
>> Hi Nicolin,
>>
>> On 10/20/25 8:00 PM, Nicolin Chen wrote:
>>> On Mon, Oct 20, 2025 at 06:14:33PM +0200, Eric Auger wrote:
>>>>>> This will cause the device to be configured with wrong MSI doorbell
>>>>>> address if it return the system address space.
>>>>> I think it'd be nicer to elaborate why a wrong address will be returned:
>>>>>
>>>>> --------------------------------------------------------------------------
>>>>> On ARM, a device behind an IOMMU requires translation for its MSI doorbell
>>>>> address. When HW nested translation is enabled, the translation will also
>>>>> happen in two stages: gIOVA => gPA => ITS page.
>>>>>
>>>>> In the accelerated SMMUv3 mode, both stages are translated by the HW. So,
>>>>> get_address_space() returns the system address space for stage-2 mappings,
>>>>> as the smmuv3-accel model doesn't involve in either stage.
>>>> I don't understand "doesn't involve in either stage". This is still not
>>>> obious to me that for an HW accelerated nested IOMMU get_address_space()
>>>> shall return the system address space. I think this deserves to be
>>>> explained and maybe documented along with the callback.
>>> get_address_space() is used by pci_device_iommu_address_space(),
>>> which is for attach or translation.
>>>
>>> In QEMU, we have an "iommu" type of memory region, to represent
>>> the address space providing the stage-1 translation.
>>>
>>> In accel case excluding MSI, there is no need of "emulated iommu
>>> translation" since HW/host SMMU takes care of both stages. Thus,
>>> the system address is returned for get_address_space(), to avoid
>>> stage-1 translation and to also allow VFIO devices to attach to
>>> the system address space that the VFIO core will monitor to take
>>> care of stage-2 mappings.
>> but in general if you set as output 'as' the system_address_memory it
>> rather means you have no translation in place. This is what I am not
>> convinced about.
> You mean you are not convinced about "no translation"?
I am not convinced about the choice of using address_space_memory.
>
>> you say it aims at
>> - avoiding stage-1 translation - allow VFIO devices to attach to the
>> system address space that the VFIO core will monitor to take care of
>> stage-2 mappings. Can you achieve the same goals with a proper address
>> space?
> Would you please define "proper"?
an address space different from address_space_memory
>
> The disagreement is seemingly about using system address space or
> even address_space_memory, IIUIC.
Yes my doubt is about:
smmuv3_accel_find_add_as()
     * We are using the global &address_space_memory here, as this will ensure
     * same system address space pointer for all devices behind the accelerated
     * SMMUv3s in a VM. That way VFIO/iommufd can reuse a single IOAS ID in
     * iommufd_cdev_attach(), allowing the Stage-2 page tables to be shared
     * within the VM instead of duplicating them for every SMMUv3 instance.
     */
    if (vfio_pci) {
        return &address_space_memory;
I think it would be cleaner to have an AddressSpace allocated on
purpose to support the VFIO accel use case, if possible.
To me, returning address_space_memory pretends we are not doing any
translation. I understand it is "easy" to reuse that one, but I wonder
whether it is in the spirit of the get_address_space callback.
I would rather allocate a dedicated (shared) AddressSpace to support the
VFIO accel case. That's my suggestion.
>
> To our purpose here, so long as the vfio core can setup a proper
> listener to monitor the guest physical address space, we are fine
> with any alternative.
>
> The system address space just seems to be the simplest one. FWIW,
> kvm_arch_fixup_msi_route() also checks in the beginning:
> if (as == &address_space_memory)
>
> So, returning @address_space_memory seems to be straightforward?
>
> I think I also need some education to understand why do we need
> an indirect address space that eventually will be routed back to
> address_space_memory?
Well, I am not an expert on AddressSpaces either. Reading hw/pci/pci.h
and the get_address_space() callback API doc comment, I understand this is
the output address space for the PCI device. If you return
address_space_memory, to me this means there is no translation in place.
By the way, this was the interpretation of kvm_arch_fixup_msi_route() on ARM:
    AddressSpace *as = pci_device_iommu_address_space(dev);
    if (as == &address_space_memory) {
        return 0;
    }
    /* MSI doorbell address is translated by an IOMMU */
Note: I am currently out of the office, so I am not able to reply as fast as you may wish.
Thanks
Eric
>
> Thanks
> Nicolin
>
> -----Original Message-----
> From: Eric Auger <eric.auger@redhat.com>
> Sent: 22 October 2025 17:25
> To: Nicolin Chen <nicolinc@nvidia.com>
> Cc: Shameer Kolothum <skolothumtho@nvidia.com>; qemu-
> arm@nongnu.org; qemu-devel@nongnu.org; peter.maydell@linaro.org;
> Jason Gunthorpe <jgg@nvidia.com>; ddutile@redhat.com;
> berrange@redhat.com; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
> <mochs@nvidia.com>; smostafa@google.com; wangzhou1@hisilicon.com;
> jiangkunkun@huawei.com; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com;
> shameerkolothum@gmail.com
> Subject: Re: [PATCH v4 11/27] hw/pci/pci: Introduce optional
> get_msi_address_space() callback
>
> External email: Use caution opening links or attachments
>
>
> Hi Nicolin,
>
> On 10/21/25 8:56 PM, Nicolin Chen wrote:
> > On Tue, Oct 21, 2025 at 06:26:39PM +0200, Eric Auger wrote:
> >> Hi Nicolin,
> >>
> >> On 10/20/25 8:00 PM, Nicolin Chen wrote:
> >>> On Mon, Oct 20, 2025 at 06:14:33PM +0200, Eric Auger wrote:
> >>>>>> This will cause the device to be configured with wrong MSI doorbell
> >>>>>> address if it return the system address space.
> >>>>> I think it'd be nicer to elaborate why a wrong address will be returned:
> >>>>>
> >>>>> --------------------------------------------------------------------------
> >>>>> On ARM, a device behind an IOMMU requires translation for its MSI
> doorbell
> >>>>> address. When HW nested translation is enabled, the translation will
> also
> >>>>> happen in two stages: gIOVA => gPA => ITS page.
> >>>>>
> >>>>> In the accelerated SMMUv3 mode, both stages are translated by the
> HW. So,
> >>>>> get_address_space() returns the system address space for stage-2
> mappings,
> >>>>> as the smmuv3-accel model doesn't involve in either stage.
> >>>> I don't understand "doesn't involve in either stage". This is still not
> >>>> obvious to me that for a HW accelerated nested IOMMU
> get_address_space()
> >>>> shall return the system address space. I think this deserves to be
> >>>> explained and maybe documented along with the callback.
> >>> get_address_space() is used by pci_device_iommu_address_space(),
> >>> which is for attach or translation.
> >>>
> >>> In QEMU, we have an "iommu" type of memory region, to represent
> >>> the address space providing the stage-1 translation.
> >>>
> >>> In accel case excluding MSI, there is no need of "emulated iommu
> >>> translation" since HW/host SMMU takes care of both stages. Thus,
> >>> the system address is returned for get_address_space(), to avoid
> >>> stage-1 translation and to also allow VFIO devices to attach to
> >>> the system address space that the VFIO core will monitor to take
> >>> care of stage-2 mappings.
> >> but in general if you set as output 'as' the system_address_memory it
> >> rather means you have no translation in place. This is what I am not
> >> convinced about.
> > You mean you are not convinced about "no translation"?
> I am not convinced about the choice of using address_space_memory.
> >
> >> you say it aims at
> >> - avoiding stage-1 translation - allow VFIO devices to attach to the
> >> system address space that the VFIO core will monitor to take care of
> >> stage-2 mappings. Can you achieve the same goals with a proper address
> >> space?
> > Would you please define "proper"?
> an address space different from address_space_memory
> >
> > The disagreement is seemingly about using system address space or
> > even address_space_memory, IIUIC.
> Yes my doubt is about:
>
> smmuv3_accel_find_add_as()
> * We are using the global &address_space_memory here, as this will
> ensure
> * same system address space pointer for all devices behind the
> accelerated
> * SMMUv3s in a VM. That way VFIO/iommufd can reuse a single IOAS ID in
> * iommufd_cdev_attach(), allowing the Stage-2 page tables to be shared
> * within the VM instead of duplicating them for every SMMUv3 instance.
> */
> if (vfio_pci) {
> return &address_space_memory;
>
> I think it would be cleaner to have an AddressSpace allocated on
> purpose to support the VFIO accel use case, if possible.
> To me returning address_space_memory pretends we are not doing any
> translation. I understand it is "easy" to reuse that one but I wonder whether
> it is in the spirit of the get_address_space callback.
>
> I would rather allocate a dedicated (shared) AddressSpace to support the
> VFIO accel case. That's my suggestion.
Ok. I will give it a go with the "global variable in smmu-accel.c" route for a
separate shared address space that you suggested earlier in patch #6 thread.
Thanks,
Shameer
On Mon, 29 Sep 2025 14:36:27 +0100
Shameer Kolothum <skolothumtho@nvidia.com> wrote:
> On ARM, when a device is behind an IOMMU, its MSI doorbell address is
> subject to translation by the IOMMU. This behavior affects vfio-pci
> passthrough devices assigned to guests using an accelerated SMMUv3.
>
> In this setup, we configure the host SMMUv3 in nested mode, where
> VFIO sets up the Stage-2 (S2) mappings for guest RAM, while the guest
> controls Stage-1 (S1). To allow VFIO to correctly configure S2 mappings,
> we currently return the system address space via the get_address_space()
> callback for vfio-pci devices.
>
> However, QEMU/KVM also uses this same callback path when resolving the
> address space for MSI doorbells:
>
> kvm_irqchip_add_msi_route()
> kvm_arch_fixup_msi_route()
> pci_device_iommu_address_space()
> get_address_space()
>
> This will cause the device to be configured with wrong MSI doorbell
> address if it returns the system address space.
>
> Introduce an optional get_msi_address_space() callback and use that in
> the above path if available. This will enable IOMMU implementations to
> make use of this if required.
Extra space before required.
>
> Suggested-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
one comment inline. Either way
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
> ---
> hw/pci/pci.c | 19 +++++++++++++++++++
> include/hw/pci/pci.h | 16 ++++++++++++++++
> target/arm/kvm.c | 2 +-
> 3 files changed, 36 insertions(+), 1 deletion(-)
>
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 1315ef13ea..6f9e1616dd 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -2964,6 +2964,25 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
> return &address_space_memory;
> }
>
> +AddressSpace *pci_device_iommu_msi_address_space(PCIDevice *dev)
> +{
> + PCIBus *bus;
> + PCIBus *iommu_bus;
> + int devfn;
> +
> + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
> + if (iommu_bus) {
> + if (iommu_bus->iommu_ops->get_msi_address_space) {
> + return iommu_bus->iommu_ops->get_msi_address_space(bus,
> + iommu_bus->iommu_opaque, devfn);
> + } else {
Not important so up to you.
I see the 'else' as unnecessary here both because you returned above and
because it's kind of the natural default - i.e. what we did before the
new callback.
> + return iommu_bus->iommu_ops->get_address_space(bus,
> + iommu_bus->iommu_opaque, devfn);
> + }
> + }
> + return &address_space_memory;
> +}
> +
> int pci_iommu_init_iotlb_notifier(PCIDevice *dev, IOMMUNotifier *n,
> IOMMUNotify fn, void *opaque)
> {
© 2016 - 2025 Red Hat, Inc.