Introduce a new PCIIOMMUOps optional callback, get_viommu_cap() which
allows to retrieve capabilities exposed by a vIOMMU. The first planned
vIOMMU device capability is VIOMMU_CAP_HW_NESTED that advertises the
support of HW nested stage translation scheme. pci_device_get_viommu_cap
is a wrapper that can be called on a PCI device potentially protected by
a vIOMMU.
get_viommu_cap() is designed to return a 64-bit bitmap of purely emulated
capabilities which are determined only by the user's configuration; no host
capabilities are involved. Reasons are:
1. host may have heterogeneous IOMMUs, each with different capabilities
2. this is migration friendly, return value is consistent between source
and target.
3. host IOMMU capabilities are passed to vIOMMU through the set_iommu_device()
interface, which has to be called after attach_device(); when get_viommu_cap()
is called in attach_device(), there is no way for the vIOMMU to get host
IOMMU capabilities yet, so only emulated capabilities can be returned.
See below sequence:
vfio_device_attach():
iommufd_cdev_attach():
pci_device_get_viommu_cap() for HW nesting cap
create a nesting parent hwpt
attach device to the hwpt
vfio_device_hiod_create_and_realize() creating hiod
...
pci_device_set_iommu_device(hiod)
Suggested-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
MAINTAINERS | 1 +
include/hw/iommu.h | 19 +++++++++++++++++++
include/hw/pci/pci.h | 25 +++++++++++++++++++++++++
hw/pci/pci.c | 11 +++++++++++
4 files changed, 56 insertions(+)
create mode 100644 include/hw/iommu.h
diff --git a/MAINTAINERS b/MAINTAINERS
index a07086ed76..54fb878128 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2305,6 +2305,7 @@ F: include/system/iommufd.h
F: backends/host_iommu_device.c
F: include/system/host_iommu_device.h
F: include/qemu/chardev_open.h
+F: include/hw/iommu.h
F: util/chardev_open.c
F: docs/devel/vfio-iommufd.rst
diff --git a/include/hw/iommu.h b/include/hw/iommu.h
new file mode 100644
index 0000000000..7dd0c11b16
--- /dev/null
+++ b/include/hw/iommu.h
@@ -0,0 +1,19 @@
+/*
+ * General vIOMMU capabilities, flags, etc
+ *
+ * Copyright (C) 2025 Intel Corporation.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_IOMMU_H
+#define HW_IOMMU_H
+
+#include "qemu/bitops.h"
+
+enum {
+ /* hardware nested stage-1 page table support */
+ VIOMMU_CAP_HW_NESTED = BIT_ULL(0),
+};
+
+#endif /* HW_IOMMU_H */
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 6b7d3ac8a3..cde7a54a69 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -462,6 +462,21 @@ typedef struct PCIIOMMUOps {
* @devfn: device and function number of the PCI device.
*/
void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn);
+ /**
+ * @get_viommu_cap: get vIOMMU capabilities
+ *
+ * Optional callback, if not implemented, then vIOMMU doesn't
+ * support exposing capabilities to other subsystem, e.g., VFIO.
+ * vIOMMU can choose which capabilities to expose.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+     * Returns: a 64-bit bitmap with each bit representing a capability
+     * defined by VIOMMU_CAP_* in include/hw/iommu.h. These capabilities are
+     * theoretical, determined only by vIOMMU device properties and
+     * independent of the actual host capabilities they may depend on.
+ */
+ uint64_t (*get_viommu_cap)(void *opaque);
/**
* @get_iotlb_info: get properties required to initialize a device IOTLB.
*
@@ -642,6 +657,16 @@ bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
Error **errp);
void pci_device_unset_iommu_device(PCIDevice *dev);
+/**
+ * pci_device_get_viommu_cap: get vIOMMU capabilities.
+ *
+ * Returns a 64-bit bitmap with each bit representing a vIOMMU exposed
+ * capability, or 0 if the vIOMMU doesn't support exposing capabilities.
+ *
+ * @dev: PCI device pointer.
+ */
+uint64_t pci_device_get_viommu_cap(PCIDevice *dev);
+
/**
* pci_iommu_get_iotlb_info: get properties required to initialize a
* device IOTLB.
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index c70b5ceeba..df1fb615a8 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2992,6 +2992,17 @@ void pci_device_unset_iommu_device(PCIDevice *dev)
}
}
+uint64_t pci_device_get_viommu_cap(PCIDevice *dev)
+{
+ PCIBus *iommu_bus;
+
+ pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL);
+ if (iommu_bus && iommu_bus->iommu_ops->get_viommu_cap) {
+ return iommu_bus->iommu_ops->get_viommu_cap(iommu_bus->iommu_opaque);
+ }
+ return 0;
+}
+
int pci_pri_request_page(PCIDevice *dev, uint32_t pasid, bool priv_req,
bool exec_req, hwaddr addr, bool lpig,
uint16_t prgi, bool is_read, bool is_write)
--
2.47.1
On 2025/8/22 14:40, Zhenzhong Duan wrote:
> Introduce a new PCIIOMMUOps optional callback, get_viommu_cap() which
> allows to retrieve capabilities exposed by a vIOMMU. The first planned
> vIOMMU device capability is VIOMMU_CAP_HW_NESTED that advertises the
> support of HW nested stage translation scheme. pci_device_get_viommu_cap
> is a wrapper that can be called on a PCI device potentially protected by
> a vIOMMU.
>
> get_viommu_cap() is designed to return 64bit bitmap of purely emulated
> capabilities which are only determined by user's configuration, no host
> capabilities involved. Reasons are:
>
> 1. host may has heterogeneous IOMMUs, each with different capabilities
> 2. this is migration friendly, return value is consistent between source
> and target.
> 3. host IOMMU capabilities are passed to vIOMMU through set_iommu_device()
> interface which have to be after attach_device(), when get_viommu_cap()
> is called in attach_device(), there is no way for vIOMMU to get host
> IOMMU capabilities yet, so only emulated capabilities can be returned.
> See below sequence:
>
> vfio_device_attach():
> iommufd_cdev_attach():
> pci_device_get_viommu_cap() for HW nesting cap
> create a nesting parent hwpt
> attach device to the hwpt
> vfio_device_hiod_create_and_realize() creating hiod
> ...
> pci_device_set_iommu_device(hiod)
> Suggested-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
> MAINTAINERS | 1 +
> include/hw/iommu.h | 19 +++++++++++++++++++
> include/hw/pci/pci.h | 25 +++++++++++++++++++++++++
> hw/pci/pci.c | 11 +++++++++++
> 4 files changed, 56 insertions(+)
> create mode 100644 include/hw/iommu.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index a07086ed76..54fb878128 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2305,6 +2305,7 @@ F: include/system/iommufd.h
> F: backends/host_iommu_device.c
> F: include/system/host_iommu_device.h
> F: include/qemu/chardev_open.h
> +F: include/hw/iommu.h
> F: util/chardev_open.c
> F: docs/devel/vfio-iommufd.rst
>
> diff --git a/include/hw/iommu.h b/include/hw/iommu.h
> new file mode 100644
> index 0000000000..7dd0c11b16
> --- /dev/null
> +++ b/include/hw/iommu.h
> @@ -0,0 +1,19 @@
> +/*
> + * General vIOMMU capabilities, flags, etc
> + *
> + * Copyright (C) 2025 Intel Corporation.
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +
> +#ifndef HW_IOMMU_H
> +#define HW_IOMMU_H
> +
> +#include "qemu/bitops.h"
> +
> +enum {
> + /* hardware nested stage-1 page table support */
> + VIOMMU_CAP_HW_NESTED = BIT_ULL(0),
This naming is a bit confusing. get_viommu_cap() indicates it will return
the vIOMMU's capabilities, while this name is HW_NESTED. It conflicts
with the commit message, which claims only emulated capabilities will be
returned.
TBH. I'm hesitating to name it as get_viommu_cap. The scope is a little
larger than what we want so far. So I'm wondering if it can be done in a
more straightforward way. e.g. just a bool op named
iommu_nested_wanted(). Just an example, maybe better naming. We can
extend the op to be returning a u64 value in the future when we see
another request on VFIO from vIOMMU.
Regards,
Yi Liu
Hi
On 8/27/25 1:13 PM, Yi Liu wrote:
> On 2025/8/22 14:40, Zhenzhong Duan wrote:
>> Introduce a new PCIIOMMUOps optional callback, get_viommu_cap() which
>> allows to retrieve capabilities exposed by a vIOMMU. The first planned
>> vIOMMU device capability is VIOMMU_CAP_HW_NESTED that advertises the
>> support of HW nested stage translation scheme. pci_device_get_viommu_cap
>> is a wrapper that can be called on a PCI device potentially protected by
>> a vIOMMU.
>>
>> get_viommu_cap() is designed to return 64bit bitmap of purely emulated
>> capabilities which are only determined by user's configuration, no host
>> capabilities involved. Reasons are:
>>
>> 1. host may has heterogeneous IOMMUs, each with different capabilities
>> 2. this is migration friendly, return value is consistent between source
>> and target.
>> 3. host IOMMU capabilities are passed to vIOMMU through
>> set_iommu_device()
>> interface which have to be after attach_device(), when
>> get_viommu_cap()
>> is called in attach_device(), there is no way for vIOMMU to get host
>> IOMMU capabilities yet, so only emulated capabilities can be
>> returned.
>> See below sequence:
>>
>> vfio_device_attach():
>> iommufd_cdev_attach():
>> pci_device_get_viommu_cap() for HW nesting cap
>> create a nesting parent hwpt
>> attach device to the hwpt
>> vfio_device_hiod_create_and_realize() creating hiod
>> ...
>> pci_device_set_iommu_device(hiod)
>
>> Suggested-by: Yi Liu <yi.l.liu@intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>> MAINTAINERS | 1 +
>> include/hw/iommu.h | 19 +++++++++++++++++++
>> include/hw/pci/pci.h | 25 +++++++++++++++++++++++++
>> hw/pci/pci.c | 11 +++++++++++
>> 4 files changed, 56 insertions(+)
>> create mode 100644 include/hw/iommu.h
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index a07086ed76..54fb878128 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -2305,6 +2305,7 @@ F: include/system/iommufd.h
>> F: backends/host_iommu_device.c
>> F: include/system/host_iommu_device.h
>> F: include/qemu/chardev_open.h
>> +F: include/hw/iommu.h
>> F: util/chardev_open.c
>> F: docs/devel/vfio-iommufd.rst
>> diff --git a/include/hw/iommu.h b/include/hw/iommu.h
>> new file mode 100644
>> index 0000000000..7dd0c11b16
>> --- /dev/null
>> +++ b/include/hw/iommu.h
>> @@ -0,0 +1,19 @@
>> +/*
>> + * General vIOMMU capabilities, flags, etc
>> + *
>> + * Copyright (C) 2025 Intel Corporation.
>> + *
>> + * SPDX-License-Identifier: GPL-2.0-or-later
>> + */
>> +
>> +#ifndef HW_IOMMU_H
>> +#define HW_IOMMU_H
>> +
>> +#include "qemu/bitops.h"
>> +
>> +enum {
>> + /* hardware nested stage-1 page table support */
>> + VIOMMU_CAP_HW_NESTED = BIT_ULL(0),
>
> This naming is a bit confusing. get_viommu_cap indicates it will return
> the viommu's capability while this naming is HW_NESTED. It's conflict
> with the commit message which claims only emulated capability will be
> returned.
it actually means the vIOMMU has the code to handle the HW nested case,
independently of the actual HW support.
Maybe remove the "emulation" wording.
Otherwise we may also use the virtio has_feature naming?
>
> TBH. I'm hesitating to name it as get_viommu_cap. The scope is a little
> larger than what we want so far. So I'm wondering if it can be done in a
> more straightforward way. e.g. just a bool op named
> iommu_nested_wanted(). Just an example, maybe better naming. We can
> extend the op to be returning a u64 value in the future when we see
> another request on VFIO from vIOMMU.
personally I am fine with the bitmask, which looks more future proof.
besides,
Reviewed-by: Eric Auger <eric.auger@redhat.com>
>
> Regards,
> Yi Liu
>
Hi Eric,
On 2025/8/27 19:22, Eric Auger wrote:
> Hi
>
> On 8/27/25 1:13 PM, Yi Liu wrote:
>> On 2025/8/22 14:40, Zhenzhong Duan wrote:
>>> Introduce a new PCIIOMMUOps optional callback, get_viommu_cap() which
>>> allows to retrieve capabilities exposed by a vIOMMU. The first planned
>>> vIOMMU device capability is VIOMMU_CAP_HW_NESTED that advertises the
>>> support of HW nested stage translation scheme. pci_device_get_viommu_cap
>>> is a wrapper that can be called on a PCI device potentially protected by
>>> a vIOMMU.
>>>
>>> get_viommu_cap() is designed to return 64bit bitmap of purely emulated
>>> capabilities which are only determined by user's configuration, no host
>>> capabilities involved. Reasons are:
>>>
>>> 1. host may has heterogeneous IOMMUs, each with different capabilities
>>> 2. this is migration friendly, return value is consistent between source
>>> and target.
>>> 3. host IOMMU capabilities are passed to vIOMMU through
>>> set_iommu_device()
>>> interface which have to be after attach_device(), when
>>> get_viommu_cap()
>>> is called in attach_device(), there is no way for vIOMMU to get host
>>> IOMMU capabilities yet, so only emulated capabilities can be
>>> returned.
>>> See below sequence:
>>>
>>> vfio_device_attach():
>>> iommufd_cdev_attach():
>>> pci_device_get_viommu_cap() for HW nesting cap
>>> create a nesting parent hwpt
>>> attach device to the hwpt
>>> vfio_device_hiod_create_and_realize() creating hiod
>>> ...
>>> pci_device_set_iommu_device(hiod)
>>
>>> Suggested-by: Yi Liu <yi.l.liu@intel.com>
>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>>> ---
>>> MAINTAINERS | 1 +
>>> include/hw/iommu.h | 19 +++++++++++++++++++
>>> include/hw/pci/pci.h | 25 +++++++++++++++++++++++++
>>> hw/pci/pci.c | 11 +++++++++++
>>> 4 files changed, 56 insertions(+)
>>> create mode 100644 include/hw/iommu.h
>>>
>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>> index a07086ed76..54fb878128 100644
>>> --- a/MAINTAINERS
>>> +++ b/MAINTAINERS
>>> @@ -2305,6 +2305,7 @@ F: include/system/iommufd.h
>>> F: backends/host_iommu_device.c
>>> F: include/system/host_iommu_device.h
>>> F: include/qemu/chardev_open.h
>>> +F: include/hw/iommu.h
>>> F: util/chardev_open.c
>>> F: docs/devel/vfio-iommufd.rst
>>> diff --git a/include/hw/iommu.h b/include/hw/iommu.h
>>> new file mode 100644
>>> index 0000000000..7dd0c11b16
>>> --- /dev/null
>>> +++ b/include/hw/iommu.h
>>> @@ -0,0 +1,19 @@
>>> +/*
>>> + * General vIOMMU capabilities, flags, etc
>>> + *
>>> + * Copyright (C) 2025 Intel Corporation.
>>> + *
>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>> + */
>>> +
>>> +#ifndef HW_IOMMU_H
>>> +#define HW_IOMMU_H
>>> +
>>> +#include "qemu/bitops.h"
>>> +
>>> +enum {
>>> + /* hardware nested stage-1 page table support */
>>> + VIOMMU_CAP_HW_NESTED = BIT_ULL(0),
>>
>> This naming is a bit confusing. get_viommu_cap indicates it will return
>> the viommu's capability while this naming is HW_NESTED. It's conflict
>> with the commit message which claims only emulated capability will be
>> returned.
>
> it actually means the viommu has the code to handle HW nested case,
> independently on the actual HW support.
> maybe remove the "emulation" wording.
yeah, I know the meaning and the purpose here. Just not quite satisfied
with the naming.
>
> Otherwise we may also use the virtio has_feature naming?
has_feature seems better. Looks to ask if vIOMMU has something and then
do something.
>
>>
>> TBH. I'm hesitating to name it as get_viommu_cap. The scope is a little
>> larger than what we want so far. So I'm wondering if it can be done in a
>> more straightforward way. e.g. just a bool op named
>> iommu_nested_wanted(). Just an example, maybe better naming. We can
>> extend the op to be returning a u64 value in the future when we see
>> another request on VFIO from vIOMMU.
> personnally I am fine with the bitmask which looks more future proof.
not quite sure if there is another info that needs to be checked in
this "VFIO asks vIOMMU" manner. Have you seen one beside this
nested hwpt requirement by vIOMMU?
Regards,
Yi Liu
Hi Yi,
On 8/27/25 2:30 PM, Yi Liu wrote:
> Hi Eric,
>
> On 2025/8/27 19:22, Eric Auger wrote:
>> Hi
>>
>> On 8/27/25 1:13 PM, Yi Liu wrote:
>>> On 2025/8/22 14:40, Zhenzhong Duan wrote:
>>>> Introduce a new PCIIOMMUOps optional callback, get_viommu_cap() which
>>>> allows to retrieve capabilities exposed by a vIOMMU. The first planned
>>>> vIOMMU device capability is VIOMMU_CAP_HW_NESTED that advertises the
>>>> support of HW nested stage translation scheme.
>>>> pci_device_get_viommu_cap
>>>> is a wrapper that can be called on a PCI device potentially
>>>> protected by
>>>> a vIOMMU.
>>>>
>>>> get_viommu_cap() is designed to return 64bit bitmap of purely emulated
>>>> capabilities which are only determined by user's configuration, no
>>>> host
>>>> capabilities involved. Reasons are:
>>>>
>>>> 1. host may has heterogeneous IOMMUs, each with different capabilities
>>>> 2. this is migration friendly, return value is consistent between
>>>> source
>>>> and target.
>>>> 3. host IOMMU capabilities are passed to vIOMMU through
>>>> set_iommu_device()
>>>> interface which have to be after attach_device(), when
>>>> get_viommu_cap()
>>>> is called in attach_device(), there is no way for vIOMMU to
>>>> get host
>>>> IOMMU capabilities yet, so only emulated capabilities can be
>>>> returned.
>>>> See below sequence:
>>>>
>>>> vfio_device_attach():
>>>> iommufd_cdev_attach():
>>>> pci_device_get_viommu_cap() for HW nesting cap
>>>> create a nesting parent hwpt
>>>> attach device to the hwpt
>>>> vfio_device_hiod_create_and_realize() creating hiod
>>>> ...
>>>> pci_device_set_iommu_device(hiod)
>>>
>>>> Suggested-by: Yi Liu <yi.l.liu@intel.com>
>>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>>>> ---
>>>> MAINTAINERS | 1 +
>>>> include/hw/iommu.h | 19 +++++++++++++++++++
>>>> include/hw/pci/pci.h | 25 +++++++++++++++++++++++++
>>>> hw/pci/pci.c | 11 +++++++++++
>>>> 4 files changed, 56 insertions(+)
>>>> create mode 100644 include/hw/iommu.h
>>>>
>>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>>> index a07086ed76..54fb878128 100644
>>>> --- a/MAINTAINERS
>>>> +++ b/MAINTAINERS
>>>> @@ -2305,6 +2305,7 @@ F: include/system/iommufd.h
>>>> F: backends/host_iommu_device.c
>>>> F: include/system/host_iommu_device.h
>>>> F: include/qemu/chardev_open.h
>>>> +F: include/hw/iommu.h
>>>> F: util/chardev_open.c
>>>> F: docs/devel/vfio-iommufd.rst
>>>> diff --git a/include/hw/iommu.h b/include/hw/iommu.h
>>>> new file mode 100644
>>>> index 0000000000..7dd0c11b16
>>>> --- /dev/null
>>>> +++ b/include/hw/iommu.h
>>>> @@ -0,0 +1,19 @@
>>>> +/*
>>>> + * General vIOMMU capabilities, flags, etc
>>>> + *
>>>> + * Copyright (C) 2025 Intel Corporation.
>>>> + *
>>>> + * SPDX-License-Identifier: GPL-2.0-or-later
>>>> + */
>>>> +
>>>> +#ifndef HW_IOMMU_H
>>>> +#define HW_IOMMU_H
>>>> +
>>>> +#include "qemu/bitops.h"
>>>> +
>>>> +enum {
>>>> + /* hardware nested stage-1 page table support */
>>>> + VIOMMU_CAP_HW_NESTED = BIT_ULL(0),
>>>
>>> This naming is a bit confusing. get_viommu_cap indicates it will return
>>> the viommu's capability while this naming is HW_NESTED. It's conflict
>>> with the commit message which claims only emulated capability will be
>>> returned.
>>
>> it actually means the viommu has the code to handle HW nested case,
>> independently on the actual HW support.
>> maybe remove the "emulation" wording.
>
> yeah, I know the meaning and the purpose here. Just not quite satisfied
> with the naming.
>
>>
>> Otherwise we may also use the virtio has_feature naming?
>
> has_feature seems better. Looks to ask if vIOMMU has something and then
> do something.
>
>>
>>>
>>> TBH. I'm hesitating to name it as get_viommu_cap. The scope is a little
>>> larger than what we want so far. So I'm wondering if it can be done
>>> in a
>>> more straightforward way. e.g. just a bool op named
>>> iommu_nested_wanted(). Just an example, maybe better naming. We can
>>> extend the op to be returning a u64 value in the future when we see
>>> another request on VFIO from vIOMMU.
>> personnally I am fine with the bitmask which looks more future proof.
>
> not quite sure if there is another info that needs to be checked in
> this "VFIO asks vIOMMU" manner. Have you seen one beside this
> nested hwpt requirement by vIOMMU?
I don't remember any at this point. But I guess with ARM CCA device
passthrough we might have other needs
Eric
>
> Regards,
> Yi Liu
>
On Wed, Aug 27, 2025 at 02:32:42PM +0200, Eric Auger wrote: > On 8/27/25 2:30 PM, Yi Liu wrote: > > On 2025/8/27 19:22, Eric Auger wrote: > >>> TBH. I'm hesitating to name it as get_viommu_cap. The scope is a little > >>> larger than what we want so far. So I'm wondering if it can be done > >>> in a > >>> more straightforward way. e.g. just a bool op named > >>> iommu_nested_wanted(). Just an example, maybe better naming. We can > >>> extend the op to be returning a u64 value in the future when we see > >>> another request on VFIO from vIOMMU. > >> personnally I am fine with the bitmask which looks more future proof. > > > > not quite sure if there is another info that needs to be checked in > > this "VFIO asks vIOMMU" manner. Have you seen one beside this > > nested hwpt requirement by vIOMMU? > > I don't remember any at this point. But I guess with ARM CCA device > passthrough we might have other needs Yea. A Realm vSMMU instance won't allocate IOAS/HWPT. So it will ask the core to bypass those allocations, via the same op. I don't know: does "get_viommu_flags" sound more fitting to have a clear meaning of "want"? VIOMMU_FLAG_WANT_NESTING_PARENT VIOMMU_FLAG_WANT_NO_IOAS At least, the 2nd one being a "cap" wouldn't sound nice to me.. Nicolin
On 2025/8/27 23:30, Nicolin Chen wrote: > On Wed, Aug 27, 2025 at 02:32:42PM +0200, Eric Auger wrote: >> On 8/27/25 2:30 PM, Yi Liu wrote: >>> On 2025/8/27 19:22, Eric Auger wrote: >>>>> TBH. I'm hesitating to name it as get_viommu_cap. The scope is a little >>>>> larger than what we want so far. So I'm wondering if it can be done >>>>> in a >>>>> more straightforward way. e.g. just a bool op named >>>>> iommu_nested_wanted(). Just an example, maybe better naming. We can >>>>> extend the op to be returning a u64 value in the future when we see >>>>> another request on VFIO from vIOMMU. >>>> personnally I am fine with the bitmask which looks more future proof. >>> >>> not quite sure if there is another info that needs to be checked in >>> this "VFIO asks vIOMMU" manner. Have you seen one beside this >>> nested hwpt requirement by vIOMMU? >> >> I don't remember any at this point. But I guess with ARM CCA device >> passthrough we might have other needs > > Yea. A Realm vSMMU instance won't allocate IOAS/HWPT. So it will > ask the core to bypass those allocations, via the same op. > > I don't know: does "get_viommu_flags" sound more fitting to have > a clear meaning of "want"? > > VIOMMU_FLAG_WANT_NESTING_PARENT > VIOMMU_FLAG_WANT_NO_IOAS > > At least, the 2nd one being a "cap" wouldn't sound nice to me.. this looks good to me. Regards, Yi Liu
>-----Original Message----- >From: Liu, Yi L <yi.l.liu@intel.com> >Subject: Re: [PATCH v5 02/21] hw/pci: Introduce >pci_device_get_viommu_cap() > >On 2025/8/27 23:30, Nicolin Chen wrote: >> On Wed, Aug 27, 2025 at 02:32:42PM +0200, Eric Auger wrote: >>> On 8/27/25 2:30 PM, Yi Liu wrote: >>>> On 2025/8/27 19:22, Eric Auger wrote: >>>>>> TBH. I'm hesitating to name it as get_viommu_cap. The scope is a little >>>>>> larger than what we want so far. So I'm wondering if it can be done >>>>>> in a >>>>>> more straightforward way. e.g. just a bool op named >>>>>> iommu_nested_wanted(). Just an example, maybe better naming. We >can >>>>>> extend the op to be returning a u64 value in the future when we see >>>>>> another request on VFIO from vIOMMU. >>>>> personnally I am fine with the bitmask which looks more future proof. >>>> >>>> not quite sure if there is another info that needs to be checked in >>>> this "VFIO asks vIOMMU" manner. Have you seen one beside this >>>> nested hwpt requirement by vIOMMU? >>> >>> I don't remember any at this point. But I guess with ARM CCA device >>> passthrough we might have other needs >> >> Yea. A Realm vSMMU instance won't allocate IOAS/HWPT. So it will >> ask the core to bypass those allocations, via the same op. >> >> I don't know: does "get_viommu_flags" sound more fitting to have >> a clear meaning of "want"? >> >> VIOMMU_FLAG_WANT_NESTING_PARENT >> VIOMMU_FLAG_WANT_NO_IOAS >> >> At least, the 2nd one being a "cap" wouldn't sound nice to me.. > >this looks good to me. OK, will do s/get_viommu_cap/get_viommu_flags and s/VIOMMU_CAP_HW_NESTED/ VIOMMU_FLAG_WANT_NESTING_PARENT if no more suggestions. Thanks Zhenzhong
Hi Eric, >-----Original Message----- >From: Duan, Zhenzhong >Sent: Thursday, August 28, 2025 5:07 PM >Subject: RE: [PATCH v5 02/21] hw/pci: Introduce >pci_device_get_viommu_cap() > > > >>-----Original Message----- >>From: Liu, Yi L <yi.l.liu@intel.com> >>Subject: Re: [PATCH v5 02/21] hw/pci: Introduce >>pci_device_get_viommu_cap() >> >>On 2025/8/27 23:30, Nicolin Chen wrote: >>> On Wed, Aug 27, 2025 at 02:32:42PM +0200, Eric Auger wrote: >>>> On 8/27/25 2:30 PM, Yi Liu wrote: >>>>> On 2025/8/27 19:22, Eric Auger wrote: >>>>>>> TBH. I'm hesitating to name it as get_viommu_cap. The scope is a >little >>>>>>> larger than what we want so far. So I'm wondering if it can be done >>>>>>> in a >>>>>>> more straightforward way. e.g. just a bool op named >>>>>>> iommu_nested_wanted(). Just an example, maybe better naming. We >>can >>>>>>> extend the op to be returning a u64 value in the future when we see >>>>>>> another request on VFIO from vIOMMU. >>>>>> personnally I am fine with the bitmask which looks more future proof. >>>>> >>>>> not quite sure if there is another info that needs to be checked in >>>>> this "VFIO asks vIOMMU" manner. Have you seen one beside this >>>>> nested hwpt requirement by vIOMMU? >>>> >>>> I don't remember any at this point. But I guess with ARM CCA device >>>> passthrough we might have other needs >>> >>> Yea. A Realm vSMMU instance won't allocate IOAS/HWPT. So it will >>> ask the core to bypass those allocations, via the same op. >>> >>> I don't know: does "get_viommu_flags" sound more fitting to have >>> a clear meaning of "want"? >>> >>> VIOMMU_FLAG_WANT_NESTING_PARENT >>> VIOMMU_FLAG_WANT_NO_IOAS >>> >>> At least, the 2nd one being a "cap" wouldn't sound nice to me.. >> >>this looks good to me. > >OK, will do s/get_viommu_cap/get_viommu_flags and >s/VIOMMU_CAP_HW_NESTED/ VIOMMU_FLAG_WANT_NESTING_PARENT if >no more suggestions. I just noticed this change will conflict with your suggestion of using HW_NESTED terminology. 
Let me know if you agree with this change or not? Thanks Zhenzhong
On Fri, Aug 29, 2025 at 01:54:50AM +0000, Duan, Zhenzhong wrote: > >>On 2025/8/27 23:30, Nicolin Chen wrote: > >>> On Wed, Aug 27, 2025 at 02:32:42PM +0200, Eric Auger wrote: > >>>> On 8/27/25 2:30 PM, Yi Liu wrote: > >>>>> On 2025/8/27 19:22, Eric Auger wrote: > >>>>>>> TBH. I'm hesitating to name it as get_viommu_cap. The scope is a > >little > >>>>>>> larger than what we want so far. So I'm wondering if it can be done > >>>>>>> in a > >>>>>>> more straightforward way. e.g. just a bool op named > >>>>>>> iommu_nested_wanted(). Just an example, maybe better naming. We > >>can > >>>>>>> extend the op to be returning a u64 value in the future when we see > >>>>>>> another request on VFIO from vIOMMU. > >>>>>> personnally I am fine with the bitmask which looks more future proof. > >>>>> > >>>>> not quite sure if there is another info that needs to be checked in > >>>>> this "VFIO asks vIOMMU" manner. Have you seen one beside this > >>>>> nested hwpt requirement by vIOMMU? > >>>> > >>>> I don't remember any at this point. But I guess with ARM CCA device > >>>> passthrough we might have other needs > >>> > >>> Yea. A Realm vSMMU instance won't allocate IOAS/HWPT. So it will > >>> ask the core to bypass those allocations, via the same op. > >>> > >>> I don't know: does "get_viommu_flags" sound more fitting to have > >>> a clear meaning of "want"? > >>> > >>> VIOMMU_FLAG_WANT_NESTING_PARENT > >>> VIOMMU_FLAG_WANT_NO_IOAS > >>> > >>> At least, the 2nd one being a "cap" wouldn't sound nice to me.. > >> > >>this looks good to me. > > > >OK, will do s/get_viommu_cap/get_viommu_flags and > >s/VIOMMU_CAP_HW_NESTED/ VIOMMU_FLAG_WANT_NESTING_PARENT if > >no more suggestions. > > I just noticed this change will conflict with your suggestion of using HW_NESTED terminology. > Let me know if you agree with this change or not? It wouldn't necessarily conflict. VIOMMU_FLAG_WANT_NESTING_PARENT is a request, interchangeable with VIOMMU_FLAG_SUPPORT_HW_NESTED, i.e. a cap. 
At the end of the day, they are fundamentally the same thing that is to tell the core to allocate a nesting parent HWPT. The former one is just more straightforward, avoiding confusing terms such as "stage-1" and "nested". IMHO, you wouldn't even need the comments in the other thread, as the flag explains clearly what it wants and what the core is doing. Also, once you use the "want" one, the "HW_NESTED" terminology will not exist in the code. Nicolin
>-----Original Message----- >From: Nicolin Chen <nicolinc@nvidia.com> >Subject: Re: [PATCH v5 02/21] hw/pci: Introduce >pci_device_get_viommu_cap() > >On Fri, Aug 29, 2025 at 01:54:50AM +0000, Duan, Zhenzhong wrote: >> >>On 2025/8/27 23:30, Nicolin Chen wrote: >> >>> On Wed, Aug 27, 2025 at 02:32:42PM +0200, Eric Auger wrote: >> >>>> On 8/27/25 2:30 PM, Yi Liu wrote: >> >>>>> On 2025/8/27 19:22, Eric Auger wrote: >> >>>>>>> TBH. I'm hesitating to name it as get_viommu_cap. The scope is a >> >little >> >>>>>>> larger than what we want so far. So I'm wondering if it can be >done >> >>>>>>> in a >> >>>>>>> more straightforward way. e.g. just a bool op named >> >>>>>>> iommu_nested_wanted(). Just an example, maybe better naming. >We >> >>can >> >>>>>>> extend the op to be returning a u64 value in the future when we >see >> >>>>>>> another request on VFIO from vIOMMU. >> >>>>>> personnally I am fine with the bitmask which looks more future >proof. >> >>>>> >> >>>>> not quite sure if there is another info that needs to be checked in >> >>>>> this "VFIO asks vIOMMU" manner. Have you seen one beside this >> >>>>> nested hwpt requirement by vIOMMU? >> >>>> >> >>>> I don't remember any at this point. But I guess with ARM CCA device >> >>>> passthrough we might have other needs >> >>> >> >>> Yea. A Realm vSMMU instance won't allocate IOAS/HWPT. So it will >> >>> ask the core to bypass those allocations, via the same op. >> >>> >> >>> I don't know: does "get_viommu_flags" sound more fitting to have >> >>> a clear meaning of "want"? >> >>> >> >>> VIOMMU_FLAG_WANT_NESTING_PARENT >> >>> VIOMMU_FLAG_WANT_NO_IOAS >> >>> >> >>> At least, the 2nd one being a "cap" wouldn't sound nice to me.. >> >> >> >>this looks good to me. >> > >> >OK, will do s/get_viommu_cap/get_viommu_flags and >> >s/VIOMMU_CAP_HW_NESTED/ >VIOMMU_FLAG_WANT_NESTING_PARENT if >> >no more suggestions. >> >> I just noticed this change will conflict with your suggestion of using >HW_NESTED terminology. 
>> Let me know if you agree with this change or not? > >It wouldn't necessarily conflict. VIOMMU_FLAG_WANT_NESTING_PARENT >is a request, interchangeable with VIOMMU_FLAG_SUPPORT_HW_NESTED, >i.e. a cap. > >At the end of the day, they are fundamentally the same thing that >is to tell the core to allocate a nesting parent HWPT. The former >one is just more straightforward, avoiding confusing terms such as >"stage-1" and "nested". > >IMHO, you wouldn't even need the comments in the other thread, as >the flag explains clearly what it wants and what the core is doing. > >Also, once you use the "want" one, the "HW_NESTED" terminology will >not exist in the code. OK, will use the *_flags and _WANT_* style, do you have suggestions for the name of vfio_device_viommu_get_nested() since "HW_NESTED" terminology will not exist, what about vfio_device_get_viommu_flags_W_N_P()? Thanks Zhenzhong
On Mon, Sep 01, 2025 at 02:35:29AM +0000, Duan, Zhenzhong wrote:
> >> I just noticed this change will conflict with your suggestion of using
> >HW_NESTED terminology.
> >> Let me know if you agree with this change or not?
> >
> >It wouldn't necessarily conflict. VIOMMU_FLAG_WANT_NESTING_PARENT
> >is a request, interchangeable with VIOMMU_FLAG_SUPPORT_HW_NESTED,
> >i.e. a cap.
> >
> >At the end of the day, they are fundamentally the same thing that
> >is to tell the core to allocate a nesting parent HWPT. The former
> >one is just more straightforward, avoiding confusing terms such as
> >"stage-1" and "nested".
> >
> >IMHO, you wouldn't even need the comments in the other thread, as
> >the flag explains clearly what it wants and what the core is doing.
> >
> >Also, once you use the "want" one, the "HW_NESTED" terminology will
> >not exist in the code.
>
> OK, will use the *_flags and _WANT_* style, do you have suggestions
> for the name of vfio_device_viommu_get_nested() since "HW_NESTED"
> terminology will not exist, what about vfio_device_get_viommu_flags_W_N_P()?
I don't see it very necessary to have a specific API per flag. So,
it could be just:
uint64_t viommu_flags = vfio_device_get_viommu_flags(vbasedev);
if (viommu_flags & VIOMMU_FLAG_WANT_NEST_PARENT) {
flags |= IOMMU_HWPT_ALLOC_NEST_PARENT;
}
?
Thanks
Nic
>-----Original Message-----
>From: Nicolin Chen <nicolinc@nvidia.com>
>Subject: Re: [PATCH v5 02/21] hw/pci: Introduce
>pci_device_get_viommu_cap()
>
>On Mon, Sep 01, 2025 at 02:35:29AM +0000, Duan, Zhenzhong wrote:
>> >> I just noticed this change will conflict with your suggestion of using
>> >HW_NESTED terminology.
>> >> Let me know if you agree with this change or not?
>> >
>> >It wouldn't necessarily conflict. VIOMMU_FLAG_WANT_NESTING_PARENT
>> >is a request, interchangeable with
>VIOMMU_FLAG_SUPPORT_HW_NESTED,
>> >i.e. a cap.
>> >
>> >At the end of the day, they are fundamentally the same thing that
>> >is to tell the core to allocate a nesting parent HWPT. The former
>> >one is just more straightforward, avoiding confusing terms such as
>> >"stage-1" and "nested".
>> >
>> >IMHO, you wouldn't even need the comments in the other thread, as
>> >the flag explains clearly what it wants and what the core is doing.
>> >
>> >Also, once you use the "want" one, the "HW_NESTED" terminology will
>> >not exist in the code.
>>
>> OK, will use the *_flags and _WANT_* style, do you have suggestions
>> for the name of vfio_device_viommu_get_nested() since "HW_NESTED"
>> terminology will not exist, what about
>vfio_device_get_viommu_flags_W_N_P()?
>
>I don't see it very necessary to have a specific API per flag. So,
>it could be just:
>
> uint64_t viommu_flags = vfio_device_get_viommu_flags(vbasedev);
>
> if (viommu_flags & VIOMMU_FLAG_WANT_NEST_PARENT) {
> flags |= IOMMU_HWPT_ALLOC_NEST_PARENT;
> }
>?
OK, makes sense.
Thanks
Zhenzhong
On Fri, Aug 22, 2025 at 02:40:40AM -0400, Zhenzhong Duan wrote:
> Introduce a new PCIIOMMUOps optional callback, get_viommu_cap() which
> allows to retrieve capabilities exposed by a vIOMMU. The first planned
> vIOMMU device capability is VIOMMU_CAP_HW_NESTED that advertises the
> support of HW nested stage translation scheme. pci_device_get_viommu_cap
> is a wrapper that can be called on a PCI device potentially protected by
> a vIOMMU.
>
> get_viommu_cap() is designed to return 64bit bitmap of purely emulated
> capabilities which are only determined by user's configuration, no host
> capabilities involved. Reasons are:
>
> 1. host may has heterogeneous IOMMUs, each with different capabilities
> 2. this is migration friendly, return value is consistent between source
>    and target.
> 3. host IOMMU capabilities are passed to vIOMMU through set_iommu_device()
>    interface which have to be after attach_device(), when get_viommu_cap()
>    is called in attach_device(), there is no way for vIOMMU to get host
>    IOMMU capabilities yet, so only emulated capabilities can be returned.
>    See below sequence:
>
> vfio_device_attach():
>   iommufd_cdev_attach():
>     pci_device_get_viommu_cap() for HW nesting cap
>     create a nesting parent hwpt
>     attach device to the hwpt
>   vfio_device_hiod_create_and_realize() creating hiod
>   ...
>   pci_device_set_iommu_device(hiod)
>
> Suggested-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>

Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
© 2016 - 2025 Red Hat, Inc.