On 11/17/25 10:37 AM, Zhenzhong Duan wrote:
> On a system affected by ERRATA_772415, IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17
> is reported by IOMMU_DEVICE_GET_HW_INFO. Due to this erratum, even a read-only
> range mapped in the second-stage page table can still be written.
>
> Reference from 4th Gen Intel Xeon Processor Scalable Family Specification
> Update, Errata Details, SPR17.
> https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/eagle-stream/sapphire-rapids-specification-update/
>
> Also copied the SPR17 details from above link:
> "Problem: When remapping hardware is configured by system software in
> scalable mode as Nested (PGTT=011b) and with PWSNP field Set in the
> PASID-table-entry, it may Set Accessed bit and Dirty bit (and Extended
> Access bit if enabled) in first-stage page-table entries even when
> second-stage mappings indicate that corresponding first-stage page-table
> is Read-Only.
>
> Implication: Due to this erratum, pages mapped as Read-only in second-stage
> page-tables may be modified by remapping hardware Access/Dirty bit updates.
>
> Workaround: None identified. System software enabling nested translations
> for a VM should ensure that there are no read-only pages in the
> corresponding second-stage mappings."
>
> Introduce a helper vfio_device_get_host_iommu_quirk_bypass_ro to check if
> readonly mappings should be bypassed.
>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Since this patch will be moved to a different series, I will skip the review for now.
Thanks
Eric
> ---
> include/hw/vfio/vfio-container.h | 1 +
> include/hw/vfio/vfio-device.h | 3 +++
> hw/vfio/device.c | 14 ++++++++++++++
> hw/vfio/iommufd.c | 9 ++++++++-
> hw/vfio/listener.c | 6 ++++--
> 5 files changed, 30 insertions(+), 3 deletions(-)
>
> diff --git a/include/hw/vfio/vfio-container.h b/include/hw/vfio/vfio-container.h
> index 9f6e8cedfc..a7d5c5ed67 100644
> --- a/include/hw/vfio/vfio-container.h
> +++ b/include/hw/vfio/vfio-container.h
> @@ -52,6 +52,7 @@ struct VFIOContainer {
> QLIST_HEAD(, VFIODevice) device_list;
> GList *iova_ranges;
> NotifierWithReturn cpr_reboot_notifier;
> + bool bypass_ro;
> };
>
> #define TYPE_VFIO_IOMMU "vfio-iommu"
> diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
> index 48d00c7bc4..f6f3d0e378 100644
> --- a/include/hw/vfio/vfio-device.h
> +++ b/include/hw/vfio/vfio-device.h
> @@ -268,6 +268,9 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainer *bcontainer,
> void vfio_device_unprepare(VFIODevice *vbasedev);
>
> bool vfio_device_get_viommu_flags_want_nesting(VFIODevice *vbasedev);
> +bool vfio_device_get_host_iommu_quirk_bypass_ro(VFIODevice *vbasedev,
> + uint32_t type, void *caps,
> + uint32_t size);
>
> int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
> struct vfio_region_info **info);
> diff --git a/hw/vfio/device.c b/hw/vfio/device.c
> index 71eb069eb6..290011e154 100644
> --- a/hw/vfio/device.c
> +++ b/hw/vfio/device.c
> @@ -533,6 +533,20 @@ bool vfio_device_get_viommu_flags_want_nesting(VFIODevice *vbasedev)
> return false;
> }
>
> +bool vfio_device_get_host_iommu_quirk_bypass_ro(VFIODevice *vbasedev,
> + uint32_t type, void *caps,
> + uint32_t size)
> +{
> + VFIOPCIDevice *vdev = vfio_pci_from_vfio_device(vbasedev);
> +
> + if (vdev) {
> + return !!(pci_device_get_host_iommu_quirks(PCI_DEVICE(vdev), type,
> + caps, size) &
> + HOST_IOMMU_QUIRK_NESTING_PARENT_BYPASS_RO);
> + }
> + return false;
> +}
> +
> /*
> * Traditional ioctl() based io
> */
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 63f8442865..2a7b0d0c07 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -351,6 +351,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
> VFIOContainer *bcontainer = VFIO_IOMMU(container);
> uint32_t type, flags = 0;
> uint64_t hw_caps;
> + VendorCaps caps;
> VFIOIOASHwpt *hwpt;
> uint32_t hwpt_id;
> int ret;
> @@ -396,7 +397,8 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
> * instead.
> */
> if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev->devid,
> - &type, NULL, 0, &hw_caps, errp)) {
> + &type, &caps, sizeof(caps), &hw_caps,
> + errp)) {
> return false;
> }
>
> @@ -411,6 +413,11 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
> */
> if (vfio_device_get_viommu_flags_want_nesting(vbasedev)) {
> flags |= IOMMU_HWPT_ALLOC_NEST_PARENT;
> +
> + if (vfio_device_get_host_iommu_quirk_bypass_ro(vbasedev, type,
> + &caps, sizeof(caps))) {
> + bcontainer->bypass_ro = true;
> + }
> }
>
> if (cpr_is_incoming()) {
> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
> index ca2377d860..090f935d30 100644
> --- a/hw/vfio/listener.c
> +++ b/hw/vfio/listener.c
> @@ -502,7 +502,8 @@ void vfio_container_region_add(VFIOContainer *bcontainer,
> int ret;
> Error *err = NULL;
>
> - if (!vfio_listener_valid_section(section, false, "region_add")) {
> + if (!vfio_listener_valid_section(section, bcontainer->bypass_ro,
> + "region_add")) {
> return;
> }
>
> @@ -668,7 +669,8 @@ static void vfio_listener_region_del(MemoryListener *listener,
> int ret;
> bool try_unmap = true;
>
> - if (!vfio_listener_valid_section(section, false, "region_del")) {
> + if (!vfio_listener_valid_section(section, bcontainer->bypass_ro,
> + "region_del")) {
> return;
> }
>