From: Yi Liu <yi.l.liu@intel.com>
If user wants to expose PASID capability in vIOMMU, then VFIO would also
report the PASID cap for this device if the underlying hardware supports
it as well.
As a start, this chooses to put the vPASID cap in the last 8 bytes of the
vconfig space. This is a choice in the good hope of no conflict with any
existing cap or hidden registers. For the devices that has hidden registers,
user should figure out a proper offset for the vPASID cap. This may require
an option for user to config it. Here we leave it as a future extension.
There are more discussions on the mechanism of finding the proper offset.
https://lore.kernel.org/kvm/BN9PR11MB5276318969A212AD0649C7BE8CBE2@BN9PR11MB5276.namprd11.prod.outlook.com/
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/vfio/pci.c | 31 +++++++++++++++++++++++++++++++
include/hw/iommu.h | 1 +
2 files changed, 32 insertions(+)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 5b022da19e..f54ebd0111 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -24,6 +24,7 @@
#include <sys/ioctl.h>
#include "hw/hw.h"
+#include "hw/iommu.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci_bridge.h"
@@ -2500,7 +2501,12 @@ static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
{
+ HostIOMMUDevice *hiod = vdev->vbasedev.hiod;
+ HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
PCIDevice *pdev = PCI_DEVICE(vdev);
+ uint8_t max_pasid_log2 = 0;
+ bool pasid_cap_added = false;
+ uint64_t hw_caps;
uint32_t header;
uint16_t cap_id, next, size;
uint8_t cap_ver;
@@ -2578,12 +2584,37 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
pcie_add_capability(pdev, cap_id, cap_ver, next, size);
}
break;
+ case PCI_EXT_CAP_ID_PASID:
+ pasid_cap_added = true;
+ /* fallthrough */
default:
pcie_add_capability(pdev, cap_id, cap_ver, next, size);
}
}
+ /*
+ * If PCI_EXT_CAP_ID_PASID not present, try to get information from the host
+ */
+ if (!pasid_cap_added && hiodc->get_pasid) {
+ max_pasid_log2 = hiodc->get_pasid(hiod, &hw_caps);
+ }
+
+ /*
+ * If supported, adds the PASID capability in the end of the PCIE config
+ * space. TODO: Add option for enabling pasid at a safe offset.
+ */
+ if (max_pasid_log2 && (pci_device_get_viommu_flags(pdev) &
+ VIOMMU_FLAG_PASID_SUPPORTED)) {
+ bool exec_perm = (hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC) ? true : false;
+ bool priv_mod = (hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV) ? true : false;
+
+ pcie_pasid_init(pdev, PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF,
+ max_pasid_log2, exec_perm, priv_mod);
+ /* PASID capability is fully emulated by QEMU */
+ memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, 8);
+ }
+
/* Cleanup chain head ID if necessary */
if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
diff --git a/include/hw/iommu.h b/include/hw/iommu.h
index 65d652950a..52e7f0cd96 100644
--- a/include/hw/iommu.h
+++ b/include/hw/iommu.h
@@ -14,6 +14,7 @@
enum {
/* Nesting parent HWPT will be reused by vIOMMU to create nested HWPT */
VIOMMU_FLAG_WANT_NESTING_PARENT = BIT_ULL(0),
+ VIOMMU_FLAG_PASID_SUPPORTED = BIT_ULL(1),
};
#endif /* HW_IOMMU_H */
--
2.43.0
On Mon, 29 Sep 2025 14:36:42 +0100
Shameer Kolothum <skolothumtho@nvidia.com> wrote:
> From: Yi Liu <yi.l.liu@intel.com>
>
> If user wants to expose PASID capability in vIOMMU, then VFIO would also
> report the PASID cap for this device if the underlying hardware supports
> it as well.
>
> As a start, this chooses to put the vPASID cap in the last 8 bytes of the
> vconfig space. This is a choice in the good hope of no conflict with any
> existing cap or hidden registers. For the devices that has hidden registers,
> user should figure out a proper offset for the vPASID cap. This may require
> an option for user to config it. Here we leave it as a future extension.
> There are more discussions on the mechanism of finding the proper offset.
>
> https://lore.kernel.org/kvm/BN9PR11MB5276318969A212AD0649C7BE8CBE2@BN9PR11MB5276.namprd11.prod.outlook.com/
>
> Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> hw/vfio/pci.c | 31 +++++++++++++++++++++++++++++++
> include/hw/iommu.h | 1 +
> 2 files changed, 32 insertions(+)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 5b022da19e..f54ebd0111 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -24,6 +24,7 @@
> #include <sys/ioctl.h>
>
> #include "hw/hw.h"
> +#include "hw/iommu.h"
> #include "hw/pci/msi.h"
> #include "hw/pci/msix.h"
> #include "hw/pci/pci_bridge.h"
> @@ -2500,7 +2501,12 @@ static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
>
> static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
> {
> + HostIOMMUDevice *hiod = vdev->vbasedev.hiod;
> + HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
> PCIDevice *pdev = PCI_DEVICE(vdev);
> + uint8_t max_pasid_log2 = 0;
> + bool pasid_cap_added = false;
> + uint64_t hw_caps;
> uint32_t header;
> uint16_t cap_id, next, size;
> uint8_t cap_ver;
> @@ -2578,12 +2584,37 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
> pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> }
> break;
> + case PCI_EXT_CAP_ID_PASID:
> + pasid_cap_added = true;
> + /* fallthrough */
> default:
> pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> }
>
> }
>
> + /*
> + * If PCI_EXT_CAP_ID_PASID not present, try to get information from the host
Say why it might or might not be present...
> + */
> + if (!pasid_cap_added && hiodc->get_pasid) {
> + max_pasid_log2 = hiodc->get_pasid(hiod, &hw_caps);
> + }
> +
> + /*
> + * If supported, adds the PASID capability in the end of the PCIE config
> + * space. TODO: Add option for enabling pasid at a safe offset.
What are you thinking needs doing to make it safe? If it's at the end and there
is space isn't that enough?
> + */
> + if (max_pasid_log2 && (pci_device_get_viommu_flags(pdev) &
> + VIOMMU_FLAG_PASID_SUPPORTED)) {
> + bool exec_perm = (hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC) ? true : false;
> + bool priv_mod = (hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV) ? true : false;
> +
> + pcie_pasid_init(pdev, PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF,
> + max_pasid_log2, exec_perm, priv_mod);
> + /* PASID capability is fully emulated by QEMU */
> + memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, 8);
> + }
> +
> /* Cleanup chain head ID if necessary */
> if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
> pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
> -----Original Message-----
> From: Jonathan Cameron <jonathan.cameron@huawei.com>
> Sent: 01 October 2025 14:58
> To: Shameer Kolothum <skolothumtho@nvidia.com>
> Cc: qemu-arm@nongnu.org; qemu-devel@nongnu.org;
> eric.auger@redhat.com; peter.maydell@linaro.org; Jason Gunthorpe
> <jgg@nvidia.com>; Nicolin Chen <nicolinc@nvidia.com>; ddutile@redhat.com;
> berrange@redhat.com; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
> <mochs@nvidia.com>; smostafa@google.com; wangzhou1@hisilicon.com;
> jiangkunkun@huawei.com; zhangfei.gao@linaro.org;
> zhenzhong.duan@intel.com; yi.l.liu@intel.com;
> shameerkolothum@gmail.com
> Subject: Re: [PATCH v4 26/27] vfio: Synthesize vPASID capability to VM
>
> External email: Use caution opening links or attachments
>
>
> On Mon, 29 Sep 2025 14:36:42 +0100
> Shameer Kolothum <skolothumtho@nvidia.com> wrote:
>
> > From: Yi Liu <yi.l.liu@intel.com>
> >
> > If user wants to expose PASID capability in vIOMMU, then VFIO would also
> > report the PASID cap for this device if the underlying hardware supports
> > it as well.
> >
> > As a start, this chooses to put the vPASID cap in the last 8 bytes of the
> > vconfig space. This is a choice in the good hope of no conflict with any
> > existing cap or hidden registers. For the devices that has hidden registers,
> > user should figure out a proper offset for the vPASID cap. This may require
> > an option for user to config it. Here we leave it as a future extension.
> > There are more discussions on the mechanism of finding the proper offset.
> >
> >
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.
> kernel.org%2Fkvm%2FBN9PR11MB5276318969A212AD0649C7BE8CBE2%4
> 0BN9PR11MB5276.namprd11.prod.outlook.com%2F&data=05%7C02%7Csk
> olothumtho%40nvidia.com%7Cfc027ec76e294ee3db4808de00f29861%7C4
> 3083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C63894923913220611
> 4%7CUnknown%7CTWFpbGZsb3d8eyJFbXB0eU1hcGkiOnRydWUsIlYiOiIwLjA
> uMDAwMCIsIlAiOiJXaW4zMiIsIkFOIjoiTWFpbCIsIldUIjoyfQ%3D%3D%7C0%7
> C%7C%7C&sdata=Qjerx3fDhLranvJPSoif4z0ue%2FcVgFYvFjroPgCjOQQ%3D&
> reserved=0
> >
> > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> > ---
> > hw/vfio/pci.c | 31 +++++++++++++++++++++++++++++++
> > include/hw/iommu.h | 1 +
> > 2 files changed, 32 insertions(+)
> >
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index 5b022da19e..f54ebd0111 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -24,6 +24,7 @@
> > #include <sys/ioctl.h>
> >
> > #include "hw/hw.h"
> > +#include "hw/iommu.h"
> > #include "hw/pci/msi.h"
> > #include "hw/pci/msix.h"
> > #include "hw/pci/pci_bridge.h"
> > @@ -2500,7 +2501,12 @@ static int vfio_setup_rebar_ecap(VFIOPCIDevice
> *vdev, uint16_t pos)
> >
> > static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
> > {
> > + HostIOMMUDevice *hiod = vdev->vbasedev.hiod;
> > + HostIOMMUDeviceClass *hiodc =
> HOST_IOMMU_DEVICE_GET_CLASS(hiod);
> > PCIDevice *pdev = PCI_DEVICE(vdev);
> > + uint8_t max_pasid_log2 = 0;
> > + bool pasid_cap_added = false;
> > + uint64_t hw_caps;
> > uint32_t header;
> > uint16_t cap_id, next, size;
> > uint8_t cap_ver;
> > @@ -2578,12 +2584,37 @@ static void vfio_add_ext_cap(VFIOPCIDevice
> *vdev)
> > pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> > }
> > break;
> > + case PCI_EXT_CAP_ID_PASID:
> > + pasid_cap_added = true;
> > + /* fallthrough */
> > default:
> > pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> > }
> >
> > }
> >
> > + /*
> > + * If PCI_EXT_CAP_ID_PASID not present, try to get information from the
> host
>
> Say why it might or might not be present...
>
> > + */
> > + if (!pasid_cap_added && hiodc->get_pasid) {
> > + max_pasid_log2 = hiodc->get_pasid(hiod, &hw_caps);
> > + }
> > +
> > + /*
> > + * If supported, adds the PASID capability in the end of the PCIE config
> > + * space. TODO: Add option for enabling pasid at a safe offset.
>
> What are you thinking needs doing to make it safe? If it's at the end and there
> is space isn't that enough?
That is based on this discussion thread (mentioned in commit log as well)
https://lore.kernel.org/kvm/BN9PR11MB5276318969A212AD0649C7BE8CBE2@BN9PR11MB5276.namprd11.prod.outlook.com/
" - Some devices are known to place registers in configuration space,
outside of the capability chains, which historically makes it
difficult to place a purely virtual capability without potentially
masking such hidden registers."
However, in this series we're trying to limit the impact by only placing the PASID
capability for devices that are behind the vIOMMU and where the user has explicitly
enabled PASID support for vIOMMU.
Thanks,
Shameer
>
> > + */
> > + if (max_pasid_log2 && (pci_device_get_viommu_flags(pdev) &
> > + VIOMMU_FLAG_PASID_SUPPORTED)) {
> > + bool exec_perm = (hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC) ?
> true : false;
> > + bool priv_mod = (hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV) ?
> true : false;
> > +
> > + pcie_pasid_init(pdev, PCIE_CONFIG_SPACE_SIZE -
> PCI_EXT_CAP_PASID_SIZEOF,
> > + max_pasid_log2, exec_perm, priv_mod);
> > + /* PASID capability is fully emulated by QEMU */
> > + memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, 8);
> > + }
> > +
> > /* Cleanup chain head ID if necessary */
> > if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
> > pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
On Thu, 2 Oct 2025 08:03:09 +0000
Shameer Kolothum <skolothumtho@nvidia.com> wrote:
> > -----Original Message-----
> > From: Jonathan Cameron <jonathan.cameron@huawei.com>
> > Sent: 01 October 2025 14:58
> > To: Shameer Kolothum <skolothumtho@nvidia.com>
> > Cc: qemu-arm@nongnu.org; qemu-devel@nongnu.org;
> > eric.auger@redhat.com; peter.maydell@linaro.org; Jason Gunthorpe
> > <jgg@nvidia.com>; Nicolin Chen <nicolinc@nvidia.com>; ddutile@redhat.com;
> > berrange@redhat.com; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
> > <mochs@nvidia.com>; smostafa@google.com; wangzhou1@hisilicon.com;
> > jiangkunkun@huawei.com; zhangfei.gao@linaro.org;
> > zhenzhong.duan@intel.com; yi.l.liu@intel.com;
> > shameerkolothum@gmail.com
> > Subject: Re: [PATCH v4 26/27] vfio: Synthesize vPASID capability to VM
> >
> > External email: Use caution opening links or attachments
> >
> >
> > On Mon, 29 Sep 2025 14:36:42 +0100
> > Shameer Kolothum <skolothumtho@nvidia.com> wrote:
> >
> > > From: Yi Liu <yi.l.liu@intel.com>
> > >
> > > If user wants to expose PASID capability in vIOMMU, then VFIO would also
> > > report the PASID cap for this device if the underlying hardware supports
> > > it as well.
> > >
> > > As a start, this chooses to put the vPASID cap in the last 8 bytes of the
> > > vconfig space. This is a choice in the good hope of no conflict with any
> > > existing cap or hidden registers. For the devices that has hidden registers,
> > > user should figure out a proper offset for the vPASID cap. This may require
> > > an option for user to config it. Here we leave it as a future extension.
> > > There are more discussions on the mechanism of finding the proper offset.
> > >
> > >
> > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.
> > kernel.org%2Fkvm%2FBN9PR11MB5276318969A212AD0649C7BE8CBE2%4
> > 0BN9PR11MB5276.namprd11.prod.outlook.com%2F&data=05%7C02%7Csk
> > olothumtho%40nvidia.com%7Cfc027ec76e294ee3db4808de00f29861%7C4
> > 3083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C63894923913220611
> > 4%7CUnknown%7CTWFpbGZsb3d8eyJFbXB0eU1hcGkiOnRydWUsIlYiOiIwLjA
> > uMDAwMCIsIlAiOiJXaW4zMiIsIkFOIjoiTWFpbCIsIldUIjoyfQ%3D%3D%7C0%7
> > C%7C%7C&sdata=Qjerx3fDhLranvJPSoif4z0ue%2FcVgFYvFjroPgCjOQQ%3D&
> > reserved=0
> > >
> > > Signed-off-by: Yi Liu <yi.l.liu@intel.com>
> > > Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> > > ---
> > > hw/vfio/pci.c | 31 +++++++++++++++++++++++++++++++
> > > include/hw/iommu.h | 1 +
> > > 2 files changed, 32 insertions(+)
> > >
> > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > index 5b022da19e..f54ebd0111 100644
> > > --- a/hw/vfio/pci.c
> > > +++ b/hw/vfio/pci.c
> > > @@ -24,6 +24,7 @@
> > > #include <sys/ioctl.h>
> > >
> > > #include "hw/hw.h"
> > > +#include "hw/iommu.h"
> > > #include "hw/pci/msi.h"
> > > #include "hw/pci/msix.h"
> > > #include "hw/pci/pci_bridge.h"
> > > @@ -2500,7 +2501,12 @@ static int vfio_setup_rebar_ecap(VFIOPCIDevice
> > *vdev, uint16_t pos)
> > >
> > > static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
> > > {
> > > + HostIOMMUDevice *hiod = vdev->vbasedev.hiod;
> > > + HostIOMMUDeviceClass *hiodc =
> > HOST_IOMMU_DEVICE_GET_CLASS(hiod);
> > > PCIDevice *pdev = PCI_DEVICE(vdev);
> > > + uint8_t max_pasid_log2 = 0;
> > > + bool pasid_cap_added = false;
> > > + uint64_t hw_caps;
> > > uint32_t header;
> > > uint16_t cap_id, next, size;
> > > uint8_t cap_ver;
> > > @@ -2578,12 +2584,37 @@ static void vfio_add_ext_cap(VFIOPCIDevice
> > *vdev)
> > > pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> > > }
> > > break;
> > > + case PCI_EXT_CAP_ID_PASID:
> > > + pasid_cap_added = true;
> > > + /* fallthrough */
> > > default:
> > > pcie_add_capability(pdev, cap_id, cap_ver, next, size);
> > > }
> > >
> > > }
> > >
> > > + /*
> > > + * If PCI_EXT_CAP_ID_PASID not present, try to get information from the
> > host
> >
> > Say why it might or might not be present...
> >
> > > + */
> > > + if (!pasid_cap_added && hiodc->get_pasid) {
> > > + max_pasid_log2 = hiodc->get_pasid(hiod, &hw_caps);
> > > + }
> > > +
> > > + /*
> > > + * If supported, adds the PASID capability in the end of the PCIE config
> > > + * space. TODO: Add option for enabling pasid at a safe offset.
> >
> > What are you thinking needs doing to make it safe? If it's at the end and there
> > is space isn't that enough?
>
> That is based on this discussion thread (mentioned in commit log as well)
> https://lore.kernel.org/kvm/BN9PR11MB5276318969A212AD0649C7BE8CBE2@BN9PR11MB5276.namprd11.prod.outlook.com/
>
>
> " - Some devices are known to place registers in configuration space,
> outside of the capability chains, which historically makes it
> difficult to place a purely virtual capability without potentially
> masking such hidden registers."
Yuk. I know this is sometimes done to chicken bit certain capabilities. Some of the
CXL ones only surface in root ports (on one platform anyway) if the link is trained up
as CXL. The registers are there anyway, just the chain pointers that are edited.
Anything truely hidden that is need for operation and in my view they are on
their own!
>
> However, in this series we're trying to limit the impact by only placing the PASID
> capability for devices that are behind the vIOMMU and where the user has explicitly
> enabled PASID support for vIOMMU.
Make sense. So this TODO is more of a do it if anyone ever needs it.
Jonathan
>
> Thanks,
> Shameer
>
>
> >
> > > + */
> > > + if (max_pasid_log2 && (pci_device_get_viommu_flags(pdev) &
> > > + VIOMMU_FLAG_PASID_SUPPORTED)) {
> > > + bool exec_perm = (hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC) ?
> > true : false;
> > > + bool priv_mod = (hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV) ?
> > true : false;
> > > +
> > > + pcie_pasid_init(pdev, PCIE_CONFIG_SPACE_SIZE -
> > PCI_EXT_CAP_PASID_SIZEOF,
> > > + max_pasid_log2, exec_perm, priv_mod);
> > > + /* PASID capability is fully emulated by QEMU */
> > > + memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, 8);
> > > + }
> > > +
> > > /* Cleanup chain head ID if necessary */
> > > if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
> > > pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
>
>
© 2016 - 2025 Red Hat, Inc.