On a system influenced by ERRATA_772415, IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17
is repored by IOMMU_DEVICE_GET_HW_INFO. Due to this errata, even the readonly
range mapped on stage-2 page table could still be written.
Reference from 4th Gen Intel Xeon Processor Scalable Family Specification
Update, Errata Details, SPR17.
https://www.intel.com/content/www/us/en/content-details/772415/content-details.html
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
include/hw/vfio/vfio-container-base.h | 1 +
hw/vfio/iommufd.c | 8 +++++++-
hw/vfio/listener.c | 13 +++++++++----
3 files changed, 17 insertions(+), 5 deletions(-)
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index f0232654ee..e5c51a51ac 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -51,6 +51,7 @@ typedef struct VFIOContainerBase {
QLIST_HEAD(, VFIODevice) device_list;
GList *iova_ranges;
NotifierWithReturn cpr_reboot_notifier;
+ bool bypass_ro;
} VFIOContainerBase;
typedef struct VFIOGuestIOMMU {
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 83a632bdee..23839a511a 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -306,6 +306,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
{
ERRP_GUARD();
IOMMUFDBackend *iommufd = vbasedev->iommufd;
+ struct iommu_hw_info_vtd vtd;
uint32_t type, flags = 0;
uint64_t hw_caps;
VFIOIOASHwpt *hwpt;
@@ -345,10 +346,15 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
* instead.
*/
if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev->devid,
- &type, NULL, 0, &hw_caps, errp)) {
+ &type, &vtd, sizeof(vtd), &hw_caps,
+ errp)) {
return false;
}
+ if (vtd.flags & IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17) {
+ container->bcontainer.bypass_ro = true;
+ }
+
if (hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) {
flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
}
diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
index f498e23a93..c64aa4539e 100644
--- a/hw/vfio/listener.c
+++ b/hw/vfio/listener.c
@@ -364,7 +364,8 @@ static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
return true;
}
-static bool vfio_listener_valid_section(MemoryRegionSection *section,
+static bool vfio_listener_valid_section(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section,
const char *name)
{
if (vfio_listener_skipped_section(section)) {
@@ -375,6 +376,10 @@ static bool vfio_listener_valid_section(MemoryRegionSection *section,
return false;
}
+ if (bcontainer && bcontainer->bypass_ro && section->readonly) {
+ return false;
+ }
+
if (unlikely((section->offset_within_address_space &
~qemu_real_host_page_mask()) !=
(section->offset_within_region & ~qemu_real_host_page_mask()))) {
@@ -494,7 +499,7 @@ void vfio_container_region_add(VFIOContainerBase *bcontainer,
int ret;
Error *err = NULL;
- if (!vfio_listener_valid_section(section, "region_add")) {
+ if (!vfio_listener_valid_section(bcontainer, section, "region_add")) {
return;
}
@@ -655,7 +660,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
int ret;
bool try_unmap = true;
- if (!vfio_listener_valid_section(section, "region_del")) {
+ if (!vfio_listener_valid_section(bcontainer, section, "region_del")) {
return;
}
@@ -812,7 +817,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
container_of(listener, VFIODirtyRangesListener, listener);
hwaddr iova, end;
- if (!vfio_listener_valid_section(section, "tracking_update") ||
+ if (!vfio_listener_valid_section(NULL, section, "tracking_update") ||
!vfio_get_section_iova_range(dirty->bcontainer, section,
&iova, &end, NULL)) {
return;
--
2.34.1
Hi Zhenzhong,
On 6/20/25 9:18 AM, Zhenzhong Duan wrote:
> On a system influenced by ERRATA_772415, IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17
> is repored by IOMMU_DEVICE_GET_HW_INFO. Due to this errata, even the readonly
> range mapped on stage-2 page table could still be written.
I would split this patch into a vfio only patch and an iommu one that
sets bcontainer->readonly according to the fetched info.
>
> Reference from 4th Gen Intel Xeon Processor Scalable Family Specification
> Update, Errata Details, SPR17.
> https://www.intel.com/content/www/us/en/content-details/772415/content-details.html
the link does not work for me.
Please could you explain in english what the errata is about and what
actions need to be taken care in VFIO?
Sorry I failed to understand "
Due to this errata, even the readonly
range mapped on stage-2 page table could still be written.
"
>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> ---
> include/hw/vfio/vfio-container-base.h | 1 +
> hw/vfio/iommufd.c | 8 +++++++-
> hw/vfio/listener.c | 13 +++++++++----
> 3 files changed, 17 insertions(+), 5 deletions(-)
>
> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
> index f0232654ee..e5c51a51ac 100644
> --- a/include/hw/vfio/vfio-container-base.h
> +++ b/include/hw/vfio/vfio-container-base.h
> @@ -51,6 +51,7 @@ typedef struct VFIOContainerBase {
> QLIST_HEAD(, VFIODevice) device_list;
> GList *iova_ranges;
> NotifierWithReturn cpr_reboot_notifier;
> + bool bypass_ro;
> } VFIOContainerBase;
>
> typedef struct VFIOGuestIOMMU {
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 83a632bdee..23839a511a 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -306,6 +306,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
> {
> ERRP_GUARD();
> IOMMUFDBackend *iommufd = vbasedev->iommufd;
> + struct iommu_hw_info_vtd vtd;
= {}; ?
> uint32_t type, flags = 0;
> uint64_t hw_caps;
> VFIOIOASHwpt *hwpt;
> @@ -345,10 +346,15 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
> * instead.
> */
> if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev->devid,
> - &type, NULL, 0, &hw_caps, errp)) {
> + &type, &vtd, sizeof(vtd), &hw_caps,
> + errp)) {
> return false;
> }
>
> + if (vtd.flags & IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17) {
> + container->bcontainer.bypass_ro = true;
> + }
> +
> if (hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) {
> flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
> }
> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
> index f498e23a93..c64aa4539e 100644
> --- a/hw/vfio/listener.c
> +++ b/hw/vfio/listener.c
> @@ -364,7 +364,8 @@ static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
> return true;
> }
>
> -static bool vfio_listener_valid_section(MemoryRegionSection *section,
> +static bool vfio_listener_valid_section(VFIOContainerBase *bcontainer,
> + MemoryRegionSection *section,
> const char *name)
> {
> if (vfio_listener_skipped_section(section)) {
> @@ -375,6 +376,10 @@ static bool vfio_listener_valid_section(MemoryRegionSection *section,
> return false;
> }
>
> + if (bcontainer && bcontainer->bypass_ro && section->readonly) {
> + return false;
> + }
> +
> if (unlikely((section->offset_within_address_space &
> ~qemu_real_host_page_mask()) !=
> (section->offset_within_region & ~qemu_real_host_page_mask()))) {
> @@ -494,7 +499,7 @@ void vfio_container_region_add(VFIOContainerBase *bcontainer,
> int ret;
> Error *err = NULL;
>
> - if (!vfio_listener_valid_section(section, "region_add")) {
> + if (!vfio_listener_valid_section(bcontainer, section, "region_add")) {
> return;
> }
>
> @@ -655,7 +660,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
> int ret;
> bool try_unmap = true;
>
> - if (!vfio_listener_valid_section(section, "region_del")) {
> + if (!vfio_listener_valid_section(bcontainer, section, "region_del")) {
> return;
> }
>
> @@ -812,7 +817,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
> container_of(listener, VFIODirtyRangesListener, listener);
> hwaddr iova, end;
>
> - if (!vfio_listener_valid_section(section, "tracking_update") ||
> + if (!vfio_listener_valid_section(NULL, section, "tracking_update") ||
> !vfio_get_section_iova_range(dirty->bcontainer, section,
> &iova, &end, NULL)) {
> return;
Thanks
Eric
>-----Original Message-----
>From: Eric Auger <eric.auger@redhat.com>
>Subject: Re: [PATCH v2 18/19] Workaround for ERRATA_772415_SPR17
>
>Hi Zhenzhong,
>
>On 6/20/25 9:18 AM, Zhenzhong Duan wrote:
>> On a system influenced by ERRATA_772415,
>IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17
>> is repored by IOMMU_DEVICE_GET_HW_INFO. Due to this errata, even the
>readonly
>> range mapped on stage-2 page table could still be written.
>
>I would split this patch into a vfio only patch and an iommu one that
>sets bcontainer->readonly according to the fetched info.
Will do.
>>
>> Reference from 4th Gen Intel Xeon Processor Scalable Family Specification
>> Update, Errata Details, SPR17.
>> https://www.intel.com/content/www/us/en/content-details/772415/content-
>details.html
>the link does not work for me.
That's strange, what about:
https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/eagle-stream/sapphire-rapids-specification-update/
>
>Please could you explain in english what the errata is about and what
>actions need to be taken care in VFIO?
>
>Sorry I failed to understand "
>
>Due to this errata, even the readonly
>range mapped on stage-2 page table could still be written.
Copy the explanation in above link:
"SPR17. Remapping Hardware May Set Access/Dirty Bits in a First-stage
Page-table Entry
Problem: When remapping hardware is configured by system software in
scalable mode as Nested (PGTT=011b) and with PWSNP field Set in the
PASID-table-entry, it may Set Accessed bit and Dirty bit (and Extended
Access bit if enabled) in first-stage page-table entries even when
second-stage mappings indicate that corresponding first-stage page-table
is Read-Only.
Implication: Due to this erratum, pages mapped as Read-only in second-stage
page-tables may be modified by remapping hardware Access/Dirty bit updates.
Workaround: None identified. System software enabling nested translations
for a VM should ensure that there are no read-only pages in the
corresponding second-stage mappings."
>
>"
>>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>> include/hw/vfio/vfio-container-base.h | 1 +
>> hw/vfio/iommufd.c | 8 +++++++-
>> hw/vfio/listener.c | 13 +++++++++----
>> 3 files changed, 17 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-
>container-base.h
>> index f0232654ee..e5c51a51ac 100644
>> --- a/include/hw/vfio/vfio-container-base.h
>> +++ b/include/hw/vfio/vfio-container-base.h
>> @@ -51,6 +51,7 @@ typedef struct VFIOContainerBase {
>> QLIST_HEAD(, VFIODevice) device_list;
>> GList *iova_ranges;
>> NotifierWithReturn cpr_reboot_notifier;
>> + bool bypass_ro;
>> } VFIOContainerBase;
>>
>> typedef struct VFIOGuestIOMMU {
>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>> index 83a632bdee..23839a511a 100644
>> --- a/hw/vfio/iommufd.c
>> +++ b/hw/vfio/iommufd.c
>> @@ -306,6 +306,7 @@ static bool
>iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
>> {
>> ERRP_GUARD();
>> IOMMUFDBackend *iommufd = vbasedev->iommufd;
>> + struct iommu_hw_info_vtd vtd;
>= {}; ?
Will do.
>> uint32_t type, flags = 0;
>> uint64_t hw_caps;
>> VFIOIOASHwpt *hwpt;
>> @@ -345,10 +346,15 @@ static bool
>iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
>> * instead.
>> */
>> if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev-
>>devid,
>> - &type, NULL, 0, &hw_caps, errp)) {
>> + &type, &vtd, sizeof(vtd), &hw_caps,
>> + errp)) {
>> return false;
>> }
>>
>> + if (vtd.flags & IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17) {
>> + container->bcontainer.bypass_ro = true;
>> + }
>> +
>> if (hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) {
>> flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
>> }
>> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
>> index f498e23a93..c64aa4539e 100644
>> --- a/hw/vfio/listener.c
>> +++ b/hw/vfio/listener.c
>> @@ -364,7 +364,8 @@ static bool
>vfio_known_safe_misalignment(MemoryRegionSection *section)
>> return true;
>> }
>>
>> -static bool vfio_listener_valid_section(MemoryRegionSection *section,
>> +static bool vfio_listener_valid_section(VFIOContainerBase *bcontainer,
>> + MemoryRegionSection *section,
>> const char *name)
>> {
>> if (vfio_listener_skipped_section(section)) {
>> @@ -375,6 +376,10 @@ static bool
>vfio_listener_valid_section(MemoryRegionSection *section,
>> return false;
>> }
>>
>> + if (bcontainer && bcontainer->bypass_ro && section->readonly) {
>> + return false;
>> + }
>> +
>> if (unlikely((section->offset_within_address_space &
>> ~qemu_real_host_page_mask()) !=
>> (section->offset_within_region & ~qemu_real_host_page_mask()))) {
>> @@ -494,7 +499,7 @@ void vfio_container_region_add(VFIOContainerBase
>*bcontainer,
>> int ret;
>> Error *err = NULL;
>>
>> - if (!vfio_listener_valid_section(section, "region_add")) {
>> + if (!vfio_listener_valid_section(bcontainer, section, "region_add")) {
>> return;
>> }
>>
>> @@ -655,7 +660,7 @@ static void vfio_listener_region_del(MemoryListener
>*listener,
>> int ret;
>> bool try_unmap = true;
>>
>> - if (!vfio_listener_valid_section(section, "region_del")) {
>> + if (!vfio_listener_valid_section(bcontainer, section, "region_del")) {
>> return;
>> }
>>
>> @@ -812,7 +817,7 @@ static void vfio_dirty_tracking_update(MemoryListener
>*listener,
>> container_of(listener, VFIODirtyRangesListener, listener);
>> hwaddr iova, end;
>>
>> - if (!vfio_listener_valid_section(section, "tracking_update") ||
>> + if (!vfio_listener_valid_section(NULL, section, "tracking_update") ||
>> !vfio_get_section_iova_range(dirty->bcontainer, section,
>> &iova, &end, NULL)) {
>> return;
>Thanks
>
>Eric
On 6/23/25 5:29 AM, Duan, Zhenzhong wrote:
>
>> -----Original Message-----
>> From: Eric Auger <eric.auger@redhat.com>
>> Subject: Re: [PATCH v2 18/19] Workaround for ERRATA_772415_SPR17
>>
>> Hi Zhenzhong,
>>
>> On 6/20/25 9:18 AM, Zhenzhong Duan wrote:
>>> On a system influenced by ERRATA_772415,
>> IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17
>>> is repored by IOMMU_DEVICE_GET_HW_INFO. Due to this errata, even the
>> readonly
>>> range mapped on stage-2 page table could still be written.
>> I would split this patch into a vfio only patch and an iommu one that
>> sets bcontainer->readonly according to the fetched info.
> Will do.
>
>>> Reference from 4th Gen Intel Xeon Processor Scalable Family Specification
>>> Update, Errata Details, SPR17.
>>> https://www.intel.com/content/www/us/en/content-details/772415/content-
>> details.html
>> the link does not work for me.
> That's strange, what about:
>
> https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/eagle-stream/sapphire-rapids-specification-update/
yes this link works for me.
>
>> Please could you explain in english what the errata is about and what
>> actions need to be taken care in VFIO?
>>
>> Sorry I failed to understand "
>>
>> Due to this errata, even the readonly
>> range mapped on stage-2 page table could still be written.
> Copy the explanation in above link:
>
> "SPR17. Remapping Hardware May Set Access/Dirty Bits in a First-stage
> Page-table Entry
>
> Problem: When remapping hardware is configured by system software in
> scalable mode as Nested (PGTT=011b) and with PWSNP field Set in the
> PASID-table-entry, it may Set Accessed bit and Dirty bit (and Extended
> Access bit if enabled) in first-stage page-table entries even when
> second-stage mappings indicate that corresponding first-stage page-table
> is Read-Only.
>
> Implication: Due to this erratum, pages mapped as Read-only in second-stage
> page-tables may be modified by remapping hardware Access/Dirty bit updates.
>
> Workaround: None identified. System software enabling nested translations
> for a VM should ensure that there are no read-only pages in the
> corresponding second-stage mappings."
OK this definitively helps. I will further look at the patch with that
background
Eric
>
>> "
>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>>> ---
>>> include/hw/vfio/vfio-container-base.h | 1 +
>>> hw/vfio/iommufd.c | 8 +++++++-
>>> hw/vfio/listener.c | 13 +++++++++----
>>> 3 files changed, 17 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-
>> container-base.h
>>> index f0232654ee..e5c51a51ac 100644
>>> --- a/include/hw/vfio/vfio-container-base.h
>>> +++ b/include/hw/vfio/vfio-container-base.h
>>> @@ -51,6 +51,7 @@ typedef struct VFIOContainerBase {
>>> QLIST_HEAD(, VFIODevice) device_list;
>>> GList *iova_ranges;
>>> NotifierWithReturn cpr_reboot_notifier;
>>> + bool bypass_ro;
>>> } VFIOContainerBase;
>>>
>>> typedef struct VFIOGuestIOMMU {
>>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>>> index 83a632bdee..23839a511a 100644
>>> --- a/hw/vfio/iommufd.c
>>> +++ b/hw/vfio/iommufd.c
>>> @@ -306,6 +306,7 @@ static bool
>> iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
>>> {
>>> ERRP_GUARD();
>>> IOMMUFDBackend *iommufd = vbasedev->iommufd;
>>> + struct iommu_hw_info_vtd vtd;
>> = {}; ?
> Will do.
>
>>> uint32_t type, flags = 0;
>>> uint64_t hw_caps;
>>> VFIOIOASHwpt *hwpt;
>>> @@ -345,10 +346,15 @@ static bool
>> iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
>>> * instead.
>>> */
>>> if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev-
>>> devid,
>>> - &type, NULL, 0, &hw_caps, errp)) {
>>> + &type, &vtd, sizeof(vtd), &hw_caps,
>>> + errp)) {
>>> return false;
>>> }
>>>
>>> + if (vtd.flags & IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17) {
>>> + container->bcontainer.bypass_ro = true;
>>> + }
>>> +
>>> if (hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) {
>>> flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
>>> }
>>> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
>>> index f498e23a93..c64aa4539e 100644
>>> --- a/hw/vfio/listener.c
>>> +++ b/hw/vfio/listener.c
>>> @@ -364,7 +364,8 @@ static bool
>> vfio_known_safe_misalignment(MemoryRegionSection *section)
>>> return true;
>>> }
>>>
>>> -static bool vfio_listener_valid_section(MemoryRegionSection *section,
>>> +static bool vfio_listener_valid_section(VFIOContainerBase *bcontainer,
>>> + MemoryRegionSection *section,
>>> const char *name)
>>> {
>>> if (vfio_listener_skipped_section(section)) {
>>> @@ -375,6 +376,10 @@ static bool
>> vfio_listener_valid_section(MemoryRegionSection *section,
>>> return false;
>>> }
>>>
>>> + if (bcontainer && bcontainer->bypass_ro && section->readonly) {
>>> + return false;
>>> + }
>>> +
>>> if (unlikely((section->offset_within_address_space &
>>> ~qemu_real_host_page_mask()) !=
>>> (section->offset_within_region & ~qemu_real_host_page_mask()))) {
>>> @@ -494,7 +499,7 @@ void vfio_container_region_add(VFIOContainerBase
>> *bcontainer,
>>> int ret;
>>> Error *err = NULL;
>>>
>>> - if (!vfio_listener_valid_section(section, "region_add")) {
>>> + if (!vfio_listener_valid_section(bcontainer, section, "region_add")) {
>>> return;
>>> }
>>>
>>> @@ -655,7 +660,7 @@ static void vfio_listener_region_del(MemoryListener
>> *listener,
>>> int ret;
>>> bool try_unmap = true;
>>>
>>> - if (!vfio_listener_valid_section(section, "region_del")) {
>>> + if (!vfio_listener_valid_section(bcontainer, section, "region_del")) {
>>> return;
>>> }
>>>
>>> @@ -812,7 +817,7 @@ static void vfio_dirty_tracking_update(MemoryListener
>> *listener,
>>> container_of(listener, VFIODirtyRangesListener, listener);
>>> hwaddr iova, end;
>>>
>>> - if (!vfio_listener_valid_section(section, "tracking_update") ||
>>> + if (!vfio_listener_valid_section(NULL, section, "tracking_update") ||
>>> !vfio_get_section_iova_range(dirty->bcontainer, section,
>>> &iova, &end, NULL)) {
>>> return;
>> Thanks
>>
>> Eric
© 2016 - 2025 Red Hat, Inc.