pci_device_get_viommu_cap() calls pci_device_get_iommu_bus_devfn()
to get iommu_bus->iommu_ops and invokes the get_viommu_cap() callback
to get a bitmap in which each bit represents a vIOMMU-exposed capability.
Suggested-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
MAINTAINERS | 1 +
hw/pci/pci.c | 11 +++++++++++
include/hw/iommu.h | 16 ++++++++++++++++
include/hw/pci/pci.h | 23 +++++++++++++++++++++++
4 files changed, 51 insertions(+)
create mode 100644 include/hw/iommu.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 1842c3dd83..d9fc977b81 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2302,6 +2302,7 @@ F: include/system/iommufd.h
F: backends/host_iommu_device.c
F: include/system/host_iommu_device.h
F: include/qemu/chardev_open.h
+F: include/hw/iommu.h
F: util/chardev_open.c
F: docs/devel/vfio-iommufd.rst
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index c70b5ceeba..df1fb615a8 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2992,6 +2992,17 @@ void pci_device_unset_iommu_device(PCIDevice *dev)
}
}
+uint64_t pci_device_get_viommu_cap(PCIDevice *dev)
+{
+ PCIBus *iommu_bus;
+
+ pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL);
+ if (iommu_bus && iommu_bus->iommu_ops->get_viommu_cap) {
+ return iommu_bus->iommu_ops->get_viommu_cap(iommu_bus->iommu_opaque);
+ }
+ return 0;
+}
+
int pci_pri_request_page(PCIDevice *dev, uint32_t pasid, bool priv_req,
bool exec_req, hwaddr addr, bool lpig,
uint16_t prgi, bool is_read, bool is_write)
diff --git a/include/hw/iommu.h b/include/hw/iommu.h
new file mode 100644
index 0000000000..e80aaf4431
--- /dev/null
+++ b/include/hw/iommu.h
@@ -0,0 +1,16 @@
+/*
+ * General vIOMMU capabilities, flags, etc
+ *
+ * Copyright (C) 2025 Intel Corporation.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_IOMMU_H
+#define HW_IOMMU_H
+
+enum {
+ VIOMMU_CAP_STAGE1 = BIT_ULL(0), /* stage1 page table supported */
+};
+
+#endif /* HW_IOMMU_H */
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index df3cc7b875..a11ab14bdc 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -453,6 +453,19 @@ typedef struct PCIIOMMUOps {
* @devfn: device and function number of the PCI device.
*/
void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn);
+ /**
+ * @get_viommu_cap: get vIOMMU capabilities
+ *
+ * Optional callback; if not implemented, the vIOMMU doesn't
+ * support exposing capabilities to other subsystems, e.g., VFIO.
+ * The vIOMMU can choose which capabilities to expose.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * Returns: a 64-bit bitmap in which each bit represents a capability
+ * defined by VIOMMU_CAP_* in include/hw/iommu.h
+ */
+ uint64_t (*get_viommu_cap)(void *opaque);
/**
* @get_iotlb_info: get properties required to initialize a device IOTLB.
*
@@ -633,6 +646,16 @@ bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
Error **errp);
void pci_device_unset_iommu_device(PCIDevice *dev);
+/**
+ * pci_device_get_viommu_cap: get vIOMMU capabilities.
+ *
+ * Returns a 64-bit bitmap in which each bit represents a vIOMMU-exposed
+ * capability, or 0 if the vIOMMU doesn't support exposing capabilities.
+ *
+ * @dev: PCI device pointer.
+ */
+uint64_t pci_device_get_viommu_cap(PCIDevice *dev);
+
/**
* pci_iommu_get_iotlb_info: get properties required to initialize a
* device IOTLB.
--
2.47.1
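For reference, a minimal sketch of how the new hook is meant to be wired and consumed; the names on both sides (my_viommu_get_cap, my_viommu_ops, my_device_wants_stage1) are illustrative placeholders, not part of the patch:

    /* vIOMMU side: advertise capabilities through PCIIOMMUOps */
    static uint64_t my_viommu_get_cap(void *opaque)
    {
        /* return a bitmap of VIOMMU_CAP_* bits; 0 exposes nothing */
        return VIOMMU_CAP_STAGE1;
    }

    static const PCIIOMMUOps my_viommu_ops = {
        .get_viommu_cap = my_viommu_get_cap,
    };

    /* consumer side, e.g. in VFIO realize code: */
    static bool my_device_wants_stage1(PCIDevice *pdev)
    {
        return pci_device_get_viommu_cap(pdev) & VIOMMU_CAP_STAGE1;
    }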
Hi Zhenzhong,
On 7/8/25 1:05 PM, Zhenzhong Duan wrote:
> pci_device_get_viommu_cap() calls pci_device_get_iommu_bus_devfn()
> to get iommu_bus->iommu_ops and invokes the get_viommu_cap() callback
> to get a bitmap in which each bit represents a vIOMMU-exposed capability.
Suggesting:
Introduce a new optional PCIIOMMUOps callback, get_viommu_cap(), which
allows retrieving the capabilities exposed by a vIOMMU. The first planned
capability is VIOMMU_CAP_HW_NESTED, which advertises support for the HW
nested-stage translation scheme. pci_device_get_viommu_cap() is a wrapper
that can be called on a PCI device potentially protected by a vIOMMU.
>
> [...]
>
> +enum {
> +    VIOMMU_CAP_STAGE1 = BIT_ULL(0), /* stage1 page table supported */
> +};
with the enum name change,
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Thanks
Eric
On Tue, Jul 08, 2025 at 07:05:43AM -0400, Zhenzhong Duan wrote:
> [...]
> +enum {
> + VIOMMU_CAP_STAGE1 = BIT_ULL(0), /* stage1 page table supported */
> +};
Thanks for this work. I am happy to see that we can share the
common code that allocates a NESTING_PARENT in the core using
this flag.
Yet on ARM, a STAGE1 page table isn't always a nested S1, the
hardware accelerated one. More often, it can be just a regular
1-stage translation table via emulated translation code and an
emulated iotlb.
I think this flag should indicate that the vIOMMU supports a
HW-accelerated nested S1 HWPT allocation/invalidation.
So, perhaps:
/* hardware-accelerated nested stage-1 page table support */
VIOMMU_CAP_NESTED_S1 = BIT_ULL(0),
?
Nicolin
On 7/8/25 8:39 PM, Nicolin Chen wrote:
> On Tue, Jul 08, 2025 at 07:05:43AM -0400, Zhenzhong Duan wrote:
>> [...]
>> +enum {
>> + VIOMMU_CAP_STAGE1 = BIT_ULL(0), /* stage1 page table supported */
>> +};
>
> Thanks for this work. I am happy to see that we can share the
> common code that allocates a NESTING_PARENT in the core using
> this flag.
>
> Yet on ARM, a STAGE1 page table isn't always a nested S1, the
> hardware accelerated one. More often, it can be just a regular
> 1-stage translation table via emulated translation code and an
> emulated iotlb.
>
Because the user-created smmuv3 started as 'accelerated smmuv3',
and had been 'de-accelerated' to simply 'user created smmuv3',
I'm looking for some clarification in the above statement/request.
Is the above supposed to reflect that a nested IOMMU has some hw-acceleration
in its Stage1 implementation?
If so, then call it that: STAGE1_ACCEL.
If it's supposed to represent that an IOMMU has nested/2-stage support,
then the above is a valid cap; -but-, having a nested/2-stage-support IOMMU
doesn't necessarily mean it's accelerated.
So, let's ensure terminology, esp. bit-macro names(pace), reflects
the (exact) meaning, and not any indirect meaning.
A recent kvm-arm email thread had such indirect naming cleaned up
when referring to pfn-map/device-map/struct-page-mapped pages, and
I'd rather not start down a similar mis-naming/indirect-naming path here.
Look forward to the clarification.
- Don
> I think this flag should indicate that the vIOMMU supports a
> HW-accelerated nested S1 HWPT allocation/invalidation.
>
> So, perhaps:
> /* hardware-accelerated nested stage-1 page table support */
> VIOMMU_CAP_NESTED_S1 = BIT_ULL(0),
> ?
>
> Nicolin
>
On Wed, Jul 09, 2025 at 01:55:46PM -0400, Donald Dutile wrote:
> > [...]
> > Yet on ARM, a STAGE1 page table isn't always a nested S1, the
> > hardware accelerated one. More often, it can be just a regular
> > 1-stage translation table via emulated translation code and an
> > emulated iotlb.
> >
> Because the user-created smmuv3 started as 'accelerated smmuv3',
> and had been 'de-accelerated' to simply 'user created smmuv3',
> I'm looking for some clarification in the above statement/request.
>
> Is the above supposed to reflect that a nested IOMMU has some hw-acceleration
> in its Stage1 implementation?
> If so, then call it that: STAGE1_ACCEL.
> If it's supposed to represent that an IOMMU has nested/2-stage support,
> then the above is a valid cap; -but-, having a nested/2-stage-support IOMMU
> doesn't necessarily mean it's accelerated.
Well, there are an emulated "nested" mode and a hw-accelerated
"nested" mode in the smmuv3 code, so we had to choose something
like "accel" over "nested".
Here, on the other hand, I think the core using this CAP would
unlikely care about an emulated "nested" mode in the individual
vIOMMU..
So I suggested:
/* hardware-accelerated nested stage-1 page table support */
VIOMMU_CAP_NESTED_S1 = BIT_ULL(0),
which should be clear IMHO.
If not, maybe go a bit further like "VIOMMU_CAP_HW_NESTED_S1"?
Thanks
Nicolin
> -----Original Message-----
> From: Nicolin Chen <nicolinc@nvidia.com>
> Subject: Re: [PATCH v3 02/20] hw/pci: Introduce pci_device_get_viommu_cap()
>
> On Wed, Jul 09, 2025 at 01:55:46PM -0400, Donald Dutile wrote:
> [...]
>
> So I suggested:
> /* hardware-accelerated nested stage-1 page table support */
> VIOMMU_CAP_NESTED_S1 = BIT_ULL(0),
>
> which should be clear IMHO.
>
> If not, maybe go a bit further like "VIOMMU_CAP_HW_NESTED_S1"?
I am not sure the _S1 part makes much sense in the ARM case. It doesn't matter
whether the Guest SMMUv3 is configured in s1/s2 or nested for this CAP.
With the new SMMUv3 dev support, the user can pretty much specify,
-device arm-smmuv3,primary-bus=pcie.0,id=smmuv3.1,accel=on,stage={stage1|stage2|nested}
And I think it will work with a host SMMUv3 nested configuration in all the
above cases. Unless I am missing something and we need to restrict its
use with stage=stage1 only.
Thanks,
Shameer
On Thu, Jul 10, 2025 at 08:11:28AM +0000, Shameerali Kolothum Thodi wrote:
> > So I suggested:
> > /* hardware-accelerated nested stage-1 page table support */
> > VIOMMU_CAP_NESTED_S1 = BIT_ULL(0),
> >
> > which should be clear IMHO.
> >
> > If not, maybe go a bit further like "VIOMMU_CAP_HW_NESTED_S1"?
>
> I am not sure the _S1 part makes much sense in the ARM case. It doesn't matter
> whether the Guest SMMUv3 is configured in s1/s2 or nested for this CAP.
> With the new SMMUv3 dev support, the user can pretty much specify,
>
> -device arm-smmuv3,primary-bus=pcie.0,id=smmuv3.1,accel=on,stage={stage1|stage2|nested}
>
> And I think it will work with a host SMMUv3 nested configuration in all the
> above cases. Unless I am missing something and
[...]
> we need to restrict its
> use with stage=stage1 only.
I think we do..
The HW nesting works when we forward the s1ctxptr and make sure
that "stage-1" is the last stage, in other word, the only stage.
Otherwise how can we support stage2/nested in a HW nesting case?
Thanks
Nicolin
> -----Original Message-----
> From: Nicolin Chen <nicolinc@nvidia.com>
> Subject: Re: [PATCH v3 02/20] hw/pci: Introduce pci_device_get_viommu_cap()
>
> On Thu, Jul 10, 2025 at 08:11:28AM +0000, Shameerali Kolothum Thodi
> wrote:
> > [...]
> > Unless I am missing something and we need to restrict its
> > use with stage=stage1 only.
>
> I think we do..
>
> The HW nesting works when we forward the s1ctxptr and make sure
> that "stage-1" is the last stage, in other word, the only stage.
>
> Otherwise how can we support stage2/nested in a HW nesting case?
Yep. That's right. Stage 2 is a no. But nesting may work if the Guest only
uses S1. But it's better to restrict it to S1.
Thanks,
Shameer
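Sketching what such a restriction could look like in an SMMU-side get_viommu_cap() callback; the accel/stage fields and the SMMU_STAGE_1 enumerator are assumptions mirroring the -device options above, not actual code:

    static uint64_t smmu_get_viommu_cap(void *opaque)
    {
        SMMUState *s = opaque;

        /* only advertise HW nesting when accel=on and the guest SMMU
         * is configured with stage=stage1 (hypothetical fields) */
        if (!s->accel || s->stage != SMMU_STAGE_1) {
            return 0;
        }
        return VIOMMU_CAP_NESTED_S1;
    }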
On 7/10/25 4:11 AM, Shameerali Kolothum Thodi wrote:
> [...]
> I am not sure the _S1 part makes much sense in the ARM case. It doesn't matter
> whether the Guest SMMUv3 is configured in s1/s2 or nested for this CAP.
> With the new SMMUv3 dev support, the user can pretty much specify,
>
> -device arm-smmuv3,primary-bus=pcie.0,id=smmuv3.1,accel=on,stage={stage1|stage2|nested}
>
> And I think it will work with a host SMMUv3 nested configuration in all the
> above cases. Unless I am missing something and we need to restrict its
> use with stage=stage1 only.
Shameer,
I didn't think of these config options this way... so is this CAP always 0 for smmuv3's associated VIOMMU?
or ... should intel-iommu follow this same/smmuv3 config option(s) and avoid this VIOMMU CAP req. ?
- Don
On Thu, Jul 10, 2025 at 01:07:11PM -0400, Donald Dutile wrote:
> [...]
> Shameer,
> I didn't think of these config options this way... so is this CAP always 0 for smmuv3's associated VIOMMU?
There should be a "bool hw_nesting" flag (or some other name) in
the SMMUState or SMMUDevice structure, that's set earlier by the
support of nested HWPT allocation/invalidation and other factors.
E.g. if "accel=on" is missing, set hw_nesting to false.
Then SMMU's get_viommu_cap() callback function will simply check
that flag.
> or ... should intel-iommu follow this same/smmuv3 config option(s) and avoid this VIOMMU CAP req. ?
We need this CAP for the vfio/pci core to allocate an S2 nesting
parent HWPT.
Thanks
Nicolin
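Folded into code, the flag-based shape Nicolin describes might look like this; hw_nesting is the hypothetical realize-time flag, and the cap name follows his earlier suggestion:

    static uint64_t smmu_get_viommu_cap(void *opaque)
    {
        SMMUState *s = opaque;

        /* hw_nesting: hypothetical flag set at realize time, false
         * when accel=on is missing or the host lacks nested HWPT
         * allocation/invalidation support */
        return s->hw_nesting ? VIOMMU_CAP_NESTED_S1 : 0;
    }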
On 7/9/25 3:20 PM, Nicolin Chen wrote:
> On Wed, Jul 09, 2025 at 01:55:46PM -0400, Donald Dutile wrote:
> [...]
>
> Well, there are an emulated "nested" mode and a hw-accelerated
> "nested" mode in the smmuv3 code, so we had to choose something
> like "accel" over "nested".
>
> Here, on the other hand, I think the core using this CAP would
> unlikely care about an emulated "nested" mode in the individual
> vIOMMU..
>
> So I suggested:
> /* hardware-accelerated nested stage-1 page table support */
> VIOMMU_CAP_NESTED_S1 = BIT_ULL(0),
>
> which should be clear IMHO.
>
> If not, maybe go a bit further like "VIOMMU_CAP_HW_NESTED_S1"?
>
> Thanks
> Nicolin
>
If the distinction is hw-based s1 vs emulated-based s1, then
I'd prefer the use of VIOMMU_CAP_HW_NESTED_S1, and avoid the use
of 'accel'/'ACCEL' unless it is an explicitly stated 'acceleration'
feature/option in the SMMU spec.
Thanks,
- Don
Hi,
On 7/10/25 3:22 AM, Donald Dutile wrote:
> [...]
> If the distinction is hw-based s1 vs emulated-based s1, then
> I'd prefer the use of VIOMMU_CAP_HW_NESTED_S1, and avoid the use
VIOMMU_CAP_HW_NESTED_S1 or even VIOMMU_CAP_HW_NESTED looks good to me too
Thanks
Eric
> of 'accel'/'ACCEL' unless it is an explicitly stated 'acceleration'
> feature/option in the SMMU spec.
>
> Thanks,
> - Don
>
On 7/15/25 11:28 AM, Eric Auger wrote:
> [...]
> VIOMMU_CAP_HW_NESTED_S1 or even VIOMMU_CAP_HW_NESTED looks good to me too
>
> Thanks
>
> Eric
+1 to VIOMMU_CAP_HW_NESTED; the S1 is unnecessary if nested IOMMU support is the info being conveyed.
- Don
>-----Original Message-----
>From: Nicolin Chen <nicolinc@nvidia.com>
>Subject: Re: [PATCH v3 02/20] hw/pci: Introduce
>pci_device_get_viommu_cap()
>
>On Tue, Jul 08, 2025 at 07:05:43AM -0400, Zhenzhong Duan wrote:
>> [...]
>> +enum {
>> +    VIOMMU_CAP_STAGE1 = BIT_ULL(0), /* stage1 page table supported */
>> +};
>
>Thanks for this work. I am happy to see that we can share the
>common code that allocates a NESTING_PARENT in the core using
>this flag.
>
>Yet on ARM, a STAGE1 page table isn't always a nested S1, the
>hardware accelerated one. More often, it can be just a regular
>1-stage translation table via emulated translation code and an
>emulated iotlb.
What to return for an emulated device, VIOMMU_CAP_NESTED_S1 or 0?
>
>I think this flag should indicate that the vIOMMU supports a
>HW-accelerated nested S1 HWPT allocation/invalidation.
I'm OK to use the VIOMMU_CAP_NESTED_S1 name. But it's still unclear which checks we should have in the .get_viommu_cap() callback to determine whether to return VIOMMU_CAP_NESTED_S1 or not,
as we know nesting support is determined by the host IOMMU, not the vIOMMU.
Thanks
Zhenzhong
>
>So, perhaps:
> /* hardware-accelerated nested stage-1 page table support */
> VIOMMU_CAP_NESTED_S1 = BIT_ULL(0),
>?
>
>Nicolin
On Wed, Jul 09, 2025 at 03:38:49AM +0000, Duan, Zhenzhong wrote:
> [...]
>
> What to return for an emulated device, VIOMMU_CAP_NESTED_S1 or 0?
Ideally it would be 0, since get_viommu_cap is a per VFIO/PCI
device op. But I see the caller of pci_device_get_viommu_cap() in
this series has already done the identification between "emulated"
and "passthrough" devices?
That being said, it doesn't hurt to run it again in the callback
function.
> >I think this flag should indicate that the vIOMMU supports a
> >HW-accelerated nested S1 HWPT allocation/invalidation.
>
> > I'm OK to use the VIOMMU_CAP_NESTED_S1 name. But it's still unclear which
> > checks we should have in the .get_viommu_cap() callback to determine
> > whether to return VIOMMU_CAP_NESTED_S1 or not,
> > as we know nesting support is determined by the host IOMMU, not the vIOMMU.
I would say it's determined by both the host and vIOMMU.
This is a common API that may return other caps too: eventually
there can be a case where vIOMMU has its get_viommu_cap callback
function but doesn't implement HW nesting via iommufd, even if
the host IOMMU driver supports nesting.
Thanks
Nicolin
>-----Original Message-----
>From: Nicolin Chen <nicolinc@nvidia.com>
>Subject: Re: [PATCH v3 02/20] hw/pci: Introduce
>pci_device_get_viommu_cap()
>
>On Wed, Jul 09, 2025 at 03:38:49AM +0000, Duan, Zhenzhong wrote:
>> [...]
>>
>> What to return for an emulated device, VIOMMU_CAP_NESTED_S1 or 0?
>
>Ideally it would be 0, since get_viommu_cap is a per VFIO/PCI
>device op. But I see the caller of pci_device_get_viommu_cap() in
>this series has already done the identification between "emulated"
>and "passthrough" devices?
We do allow an emulated PCI device to call pci_device_get_viommu_cap().
It's just that there is no such request for now.
>
>That being said, it doesn't hurt to run it again in the callback
>function.
>
>> >I think this flag should indicate that the vIOMMU supports a
>> >HW-accelerated nested S1 HWPT allocation/invalidation.
>>
>> I'm OK to use the VIOMMU_CAP_NESTED_S1 name. But it's still unclear which
>> checks we should have in the .get_viommu_cap() callback to determine
>> whether to return VIOMMU_CAP_NESTED_S1 or not,
>> as we know nesting support is determined by the host IOMMU, not the vIOMMU.
>
>I would say it's determined by both the host and vIOMMU.
>
>This is a common API that may return other caps too: eventually
>there can be a case where vIOMMU has its get_viommu_cap callback
>function but doesn't implement HW nesting via iommufd, even if
>the host IOMMU driver supports nesting.
OK, will use VIOMMU_CAP_NESTED_S1.
But for VT-d, we check nesting compatibility in the .set_iommu_device callback;
in .get_viommu_cap, we will return VIOMMU_CAP_NESTED_S1 if the vIOMMU supports it, without checking the host IOMMU, to keep the code simpler.
Thanks
Zhenzhong
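As a sketch of that split (the flts field standing in for VT-d's stage-1 support knob is an assumption, not actual code):

    static uint64_t vtd_get_viommu_cap(void *opaque)
    {
        IntelIOMMUState *s = opaque;

        /* report only what the vIOMMU itself supports; host IOMMU
         * compatibility is checked later in .set_iommu_device */
        return s->flts ? VIOMMU_CAP_NESTED_S1 : 0;
    }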