When doing dirty tracking or calculating dirty tracking range, readonly
regions can be bypassed, because corresponding DMA mappings are readonly
and never become dirty.
This can optimize dirty tracking a bit for passthrough device.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
hw/vfio/listener.c | 46 +++++++++++++++++++++++++++++++++-----------
hw/vfio/trace-events | 1 +
2 files changed, 36 insertions(+), 11 deletions(-)
diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
index 3b48f6796c..ca2377d860 100644
--- a/hw/vfio/listener.c
+++ b/hw/vfio/listener.c
@@ -76,8 +76,13 @@ static bool vfio_log_sync_needed(const VFIOContainer *bcontainer)
return true;
}
-static bool vfio_listener_skipped_section(MemoryRegionSection *section)
+static bool vfio_listener_skipped_section(MemoryRegionSection *section,
+ bool bypass_ro)
{
+ if (bypass_ro && section->readonly) {
+ return true;
+ }
+
return (!memory_region_is_ram(section->mr) &&
!memory_region_is_iommu(section->mr)) ||
memory_region_is_protected(section->mr) ||
@@ -368,9 +373,9 @@ static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
}
static bool vfio_listener_valid_section(MemoryRegionSection *section,
- const char *name)
+ bool bypass_ro, const char *name)
{
- if (vfio_listener_skipped_section(section)) {
+ if (vfio_listener_skipped_section(section, bypass_ro)) {
trace_vfio_listener_region_skip(name,
section->offset_within_address_space,
section->offset_within_address_space +
@@ -497,7 +502,7 @@ void vfio_container_region_add(VFIOContainer *bcontainer,
int ret;
Error *err = NULL;
- if (!vfio_listener_valid_section(section, "region_add")) {
+ if (!vfio_listener_valid_section(section, false, "region_add")) {
return;
}
@@ -663,7 +668,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
int ret;
bool try_unmap = true;
- if (!vfio_listener_valid_section(section, "region_del")) {
+ if (!vfio_listener_valid_section(section, false, "region_del")) {
return;
}
@@ -722,11 +727,11 @@ static void vfio_listener_region_del(MemoryListener *listener,
}
/*
- * Fake an IOTLB entry for identity mapping which is needed by dirty
- * tracking. In fact, in unmap_bitmap, only translated_addr field is
- * used to set dirty bitmap.
+ * Fake an IOTLB entry for writable identity mapping which is needed
+ * by dirty tracking. In fact, in unmap_bitmap, only translated_addr
+ * field is used to set dirty bitmap.
*/
- if (!memory_region_is_iommu(section->mr)) {
+ if (!memory_region_is_iommu(section->mr) && !section->readonly) {
entry.iova = iova;
entry.translated_addr = iova;
iotlb = &entry;
@@ -834,7 +839,8 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
container_of(listener, VFIODirtyRangesListener, listener);
hwaddr iova, end;
- if (!vfio_listener_valid_section(section, "tracking_update") ||
+ /* Bypass readonly section as it never becomes dirty */
+ if (!vfio_listener_valid_section(section, true, "tracking_update") ||
!vfio_get_section_iova_range(dirty->bcontainer, section,
&iova, &end, NULL)) {
return;
@@ -1093,6 +1099,19 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
if (!mr) {
goto out_unlock;
}
+
+ /*
+ * The mapping is readonly when either it's a readonly mapping in guest
+ * or mapped target is readonly, bypass it for dirty tracking as it
+ * never becomes dirty.
+ */
+ if (!(iotlb->perm & IOMMU_WO) || mr->readonly) {
+ trace_vfio_iommu_map_dirty_notify_skip_ro(iova,
+ iova + iotlb->addr_mask);
+ rcu_read_unlock();
+ return;
+ }
+
translated_addr = memory_region_get_ram_addr(mr) + xlat;
ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
@@ -1228,7 +1247,12 @@ static void vfio_listener_log_sync(MemoryListener *listener,
int ret;
Error *local_err = NULL;
- if (vfio_listener_skipped_section(section)) {
+ /*
+ * Bypass readonly section as it never becomes dirty, iommu memory section
+ * is RW and never bypassed. The readonly mappings in iommu MR are bypassed
+ * in vfio_iommu_map_dirty_notify().
+ */
+ if (vfio_listener_skipped_section(section, true)) {
return;
}
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 3c62bab764..180e3d526b 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -103,6 +103,7 @@ vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" -
vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], pci64:[0x%"PRIx64" - 0x%"PRIx64"]"
vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
+vfio_iommu_map_dirty_notify_skip_ro(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
# container.c
vfio_container_query_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t backend_flag, uint64_t bitmap_size, uint64_t translated_addr, uint64_t dirty_pages) "iova=0x%"PRIx64" size=0x%"PRIx64" backend_flag=0x%"PRIx64" bitmap_size=0x%"PRIx64" gpa=0x%"PRIx64" dirty_pages=%"PRIu64
--
2.47.1
Hi Yi, Cedric,
Could you also help comment on this patch? This is a pure VFIO migration-related optimization, I think it's better to let it go with the "vfio: relax the vIOMMU check" series.
I'd like to move it into the next respin of the "vfio: relax the vIOMMU check" series if you think it makes sense.
Thanks
Zhenzhong
>-----Original Message-----
>From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
>Subject: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty
>tracking
>
>When doing ditry tracking or calculating dirty tracking range, readonly
>regions can be bypassed, because corresponding DMA mappings are readonly
>and never become dirty.
>
>This can optimize dirty tracking a bit for passthrough device.
>
>Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>---
> hw/vfio/listener.c | 46 +++++++++++++++++++++++++++++++++-----------
> hw/vfio/trace-events | 1 +
> 2 files changed, 36 insertions(+), 11 deletions(-)
>
>diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
>index 3b48f6796c..ca2377d860 100644
>--- a/hw/vfio/listener.c
>+++ b/hw/vfio/listener.c
>@@ -76,8 +76,13 @@ static bool vfio_log_sync_needed(const VFIOContainer
>*bcontainer)
> return true;
> }
>
>-static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>+static bool vfio_listener_skipped_section(MemoryRegionSection *section,
>+ bool bypass_ro)
> {
>+ if (bypass_ro && section->readonly) {
>+ return true;
>+ }
>+
> return (!memory_region_is_ram(section->mr) &&
> !memory_region_is_iommu(section->mr)) ||
> memory_region_is_protected(section->mr) ||
>@@ -368,9 +373,9 @@ static bool
>vfio_known_safe_misalignment(MemoryRegionSection *section)
> }
>
> static bool vfio_listener_valid_section(MemoryRegionSection *section,
>- const char *name)
>+ bool bypass_ro, const char
>*name)
> {
>- if (vfio_listener_skipped_section(section)) {
>+ if (vfio_listener_skipped_section(section, bypass_ro)) {
> trace_vfio_listener_region_skip(name,
> section->offset_within_address_space,
> section->offset_within_address_space +
>@@ -497,7 +502,7 @@ void vfio_container_region_add(VFIOContainer
>*bcontainer,
> int ret;
> Error *err = NULL;
>
>- if (!vfio_listener_valid_section(section, "region_add")) {
>+ if (!vfio_listener_valid_section(section, false, "region_add")) {
> return;
> }
>
>@@ -663,7 +668,7 @@ static void vfio_listener_region_del(MemoryListener
>*listener,
> int ret;
> bool try_unmap = true;
>
>- if (!vfio_listener_valid_section(section, "region_del")) {
>+ if (!vfio_listener_valid_section(section, false, "region_del")) {
> return;
> }
>
>@@ -722,11 +727,11 @@ static void
>vfio_listener_region_del(MemoryListener *listener,
> }
>
> /*
>- * Fake an IOTLB entry for identity mapping which is needed by
>dirty
>- * tracking. In fact, in unmap_bitmap, only translated_addr field is
>- * used to set dirty bitmap.
>+ * Fake an IOTLB entry for writable identity mapping which is
>needed
>+ * by dirty tracking. In fact, in unmap_bitmap, only
>translated_addr
>+ * field is used to set dirty bitmap.
> */
>- if (!memory_region_is_iommu(section->mr)) {
>+ if (!memory_region_is_iommu(section->mr)
>&& !section->readonly) {
> entry.iova = iova;
> entry.translated_addr = iova;
> iotlb = &entry;
>@@ -834,7 +839,8 @@ static void
>vfio_dirty_tracking_update(MemoryListener *listener,
> container_of(listener, VFIODirtyRangesListener, listener);
> hwaddr iova, end;
>
>- if (!vfio_listener_valid_section(section, "tracking_update") ||
>+ /* Bypass readonly section as it never becomes dirty */
>+ if (!vfio_listener_valid_section(section, true, "tracking_update") ||
> !vfio_get_section_iova_range(dirty->bcontainer, section,
> &iova, &end, NULL)) {
> return;
>@@ -1093,6 +1099,19 @@ static void
>vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> if (!mr) {
> goto out_unlock;
> }
>+
>+ /*
>+ * The mapping is readonly when either it's a readonly mapping in guest
>+ * or mapped target is readonly, bypass it for dirty tracking as it
>+ * never becomes dirty.
>+ */
>+ if (!(iotlb->perm & IOMMU_WO) || mr->readonly) {
>+ trace_vfio_iommu_map_dirty_notify_skip_ro(iova,
>+ iova +
>iotlb->addr_mask);
>+ rcu_read_unlock();
>+ return;
>+ }
>+
> translated_addr = memory_region_get_ram_addr(mr) + xlat;
>
> ret = vfio_container_query_dirty_bitmap(bcontainer, iova,
>iotlb->addr_mask + 1,
>@@ -1228,7 +1247,12 @@ static void
>vfio_listener_log_sync(MemoryListener *listener,
> int ret;
> Error *local_err = NULL;
>
>- if (vfio_listener_skipped_section(section)) {
>+ /*
>+ * Bypass readonly section as it never becomes dirty, iommu memory
>section
>+ * is RW and never bypassed. The readonly mappings in iommu MR are
>bypassed
>+ * in vfio_iommu_map_dirty_notify().
>+ */
>+ if (vfio_listener_skipped_section(section, true)) {
> return;
> }
>
>diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>index 3c62bab764..180e3d526b 100644
>--- a/hw/vfio/trace-events
>+++ b/hw/vfio/trace-events
>@@ -103,6 +103,7 @@ vfio_listener_region_del(uint64_t start, uint64_t end)
>"region_del 0x%"PRIx64" -
> vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min,
>uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" -
>0x%"PRIx64"]"
> vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t
>max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci)
>"nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" -
>0x%"PRIx64"], pci64:[0x%"PRIx64" - 0x%"PRIx64"]"
> vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end)
>"iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
>+vfio_iommu_map_dirty_notify_skip_ro(uint64_t iova_start, uint64_t
>iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
>
> # container.c
> vfio_container_query_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t
>backend_flag, uint64_t bitmap_size, uint64_t translated_addr, uint64_t
>dirty_pages) "iova=0x%"PRIx64" size=0x%"PRIx64"
>backend_flag=0x%"PRIx64" bitmap_size=0x%"PRIx64" gpa=0x%"PRIx64"
>dirty_pages=%"PRIu64
>--
>2.47.1
Hello Zhenzhong,
On 11/28/25 03:08, Duan, Zhenzhong wrote:
> Hi Yi, Cedric,
>
> Could you also help comment on this patch? This is a pure VFIO migration related optimization, I think it's better to let it go with the "vfio: relax the vIOMMU check" series.
> I'd like to move it in next respin of "vfio: relax the vIOMMU check" series if you think it make sense.
IMO, the "vfio: relax the vIOMMU check" is fine as it is.
I would instead introduce a new series to handle ERRATA_772415
since it is a special case of "intel_iommu: Enable first stage
translation for passthrough device".
So we would have 3 series (in order of appearance on the list) :
1. "vfio: relax the vIOMMU check"
2. "intel_iommu: Enable first stage translation for passthrough
device" without quirks
3. "vfio: handle ERRATA_772415" with the quirk part, so that's
patch 17,19,20,21 ?
Series 2 seems the most important, as it sets the foundation
for the other architectures which have a need for nested
IOMMU support (smmu/nvidia). Series 1 is nice to have.
Series 3. is an extension of 2. for broken HW.
For the next iterations (QEMU 11.0), let's get series 2. in
first. I have been including it in my QEMU tree for a while
now I didn't see any regression. Should be fine.
Then, we can merge 1. and 3. through the vfio queue. Shouldn't
be a major task now that we had all these reviews.
How's that ?
btw,
Reference from 4th Gen Intel Xeon Processor Scalable Family Specification
Update, Errata Details, SPR17.
https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/eagle-stream/sapphire-rapids-specification-update/
Url is not accessible (for me).
Thanks,
C.
Hi Cédric, >-----Original Message----- >From: Cédric Le Goater <clg@redhat.com> >Subject: Re: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty >tracking > >Hello Zhenzhong, > >On 11/28/25 03:08, Duan, Zhenzhong wrote: >> Hi Yi, Cedric, >> >> Could you also help comment on this patch? This is a pure VFIO migration >related optimization, I think it's better to let it go with the "vfio: relax the >vIOMMU check" series. >> I'd like to move it in next respin of "vfio: relax the vIOMMU check" series if >you think it make sense. > >IMO, the "vfio: relax the vIOMMU check" is fine as it is. > >I would instead introduce a new series to handle ERRATA_772415 >since it is a special case of "intel_iommu: Enable first stage >translation for passthrough device". > >So we would have 3 series (in order of appearance on the list) : > >1. "vfio: relax the vIOMMU check" >2. "intel_iommu: Enable first stage translation for passthrough > device" without quirks >3. "vfio: handle ERRATA_772415" with the quirk part, so that's > patch 17,19,20,21 ? > > >Series 2 seems the most important, as it sets the foundation >for the other architectures which have a need for nested >IOMMU support (smmu/nvidia). Series 1 is nice to have. >Series 3. is an extension of 2. for broken HW. Good suggestion to split out ERRATA_772415 from the base nesting series. There is some code in patch17 which is needed by ERRATA_772415, it makes sense to move it to ERRATA_772415 series. > >For the next iterations (QEMU 11.0), let's get series 2. in >first. I have been including it in my QEMU tree for a while >now I didn't see any regression. Should be fine. > >Then, we can merge 1. and 3. through the vfio queue. Shouldn't >be a major task now that we had all these reviews. > >How's that ? Good for me. Then I'll send series2, then series1, finally series3 in order. 
I'd like to wait a few days to collect comments before sending v9 of series2 because the split of series2 and series 3 involves only patch reorder and no code changes. > > >btw, > > Reference from 4th Gen Intel Xeon Processor Scalable Family >Specification > Update, Errata Details, SPR17. > >https://edc.intel.com/content/www/us/en/design/products-and-solutions/pr >ocessors-and-chipsets/eagle-stream/sapphire-rapids-specification-update/ > >Url is not accessible (for me). Me too, will update. https://cdrdv2.intel.com/v1/dl/getContent/772415 is accessible for me now. BRs, Zhenzhong
On 2025/11/28 10:08, Duan, Zhenzhong wrote:
> Hi Yi, Cedric,
>
> Could you also help comment on this patch? This is a pure VFIO migration related optimization, I think it's better to let it go with the "vfio: relax the vIOMMU check" series.
> I'd like to move it in next respin of "vfio: relax the vIOMMU check" series if you think it make sense.
It makes sense to me.
> Thanks
> Zhenzhong
>
>> -----Original Message-----
>> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
>> Subject: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty
>> tracking
>>
>> When doing ditry tracking or calculating dirty tracking range, readonly
s/ditry/dirty/
>> regions can be bypassed, because corresponding DMA mappings are readonly
>> and never become dirty.
>>
>> This can optimize dirty tracking a bit for passthrough device.
>>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>> hw/vfio/listener.c | 46 +++++++++++++++++++++++++++++++++-----------
>> hw/vfio/trace-events | 1 +
>> 2 files changed, 36 insertions(+), 11 deletions(-)
>>
>> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
>> index 3b48f6796c..ca2377d860 100644
>> --- a/hw/vfio/listener.c
>> +++ b/hw/vfio/listener.c
>> @@ -76,8 +76,13 @@ static bool vfio_log_sync_needed(const VFIOContainer
>> *bcontainer)
>> return true;
>> }
>>
>> -static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>> +static bool vfio_listener_skipped_section(MemoryRegionSection *section,
>> + bool bypass_ro)
>> {
>> + if (bypass_ro && section->readonly) {
>> + return true;
>> + }
>> +
>> return (!memory_region_is_ram(section->mr) &&
>> !memory_region_is_iommu(section->mr)) ||
>> memory_region_is_protected(section->mr) ||
>> @@ -368,9 +373,9 @@ static bool
>> vfio_known_safe_misalignment(MemoryRegionSection *section)
>> }
>>
>> static bool vfio_listener_valid_section(MemoryRegionSection *section,
>> - const char *name)
>> + bool bypass_ro, const char
>> *name)
>> {
>> - if (vfio_listener_skipped_section(section)) {
>> + if (vfio_listener_skipped_section(section, bypass_ro)) {
>> trace_vfio_listener_region_skip(name,
>> section->offset_within_address_space,
>> section->offset_within_address_space +
>> @@ -497,7 +502,7 @@ void vfio_container_region_add(VFIOContainer
>> *bcontainer,
>> int ret;
>> Error *err = NULL;
>>
>> - if (!vfio_listener_valid_section(section, "region_add")) {
>> + if (!vfio_listener_valid_section(section, false, "region_add")) {
>> return;
>> }
>>
>> @@ -663,7 +668,7 @@ static void vfio_listener_region_del(MemoryListener
>> *listener,
>> int ret;
>> bool try_unmap = true;
>>
>> - if (!vfio_listener_valid_section(section, "region_del")) {
>> + if (!vfio_listener_valid_section(section, false, "region_del")) {
>> return;
>> }
>>
>> @@ -722,11 +727,11 @@ static void
>> vfio_listener_region_del(MemoryListener *listener,
>> }
>>
>> /*
>> - * Fake an IOTLB entry for identity mapping which is needed by
>> dirty
>> - * tracking. In fact, in unmap_bitmap, only translated_addr field is
>> - * used to set dirty bitmap.
>> + * Fake an IOTLB entry for writable identity mapping which is
>> needed
>> + * by dirty tracking. In fact, in unmap_bitmap, only
>> translated_addr
>> + * field is used to set dirty bitmap.
>> */
>> - if (!memory_region_is_iommu(section->mr)) {
>> + if (!memory_region_is_iommu(section->mr)
>> && !section->readonly) {
>> entry.iova = iova;
>> entry.translated_addr = iova;
>> iotlb = &entry;
>> @@ -834,7 +839,8 @@ static void
>> vfio_dirty_tracking_update(MemoryListener *listener,
>> container_of(listener, VFIODirtyRangesListener, listener);
>> hwaddr iova, end;
>>
>> - if (!vfio_listener_valid_section(section, "tracking_update") ||
>> + /* Bypass readonly section as it never becomes dirty */
>> + if (!vfio_listener_valid_section(section, true, "tracking_update") ||
>> !vfio_get_section_iova_range(dirty->bcontainer, section,
>> &iova, &end, NULL)) {
>> return;
>> @@ -1093,6 +1099,19 @@ static void
>> vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>> if (!mr) {
>> goto out_unlock;
>> }
>> +
>> + /*
>> + * The mapping is readonly when either it's a readonly mapping in guest
>> + * or mapped target is readonly, bypass it for dirty tracking as it
>> + * never becomes dirty.
>> + */
>> + if (!(iotlb->perm & IOMMU_WO) || mr->readonly) {
is it possible that guest maps RW, while the backend mr is readonly?
>> + trace_vfio_iommu_map_dirty_notify_skip_ro(iova,
>> + iova +
>> iotlb->addr_mask);
>> + rcu_read_unlock();
>> + return;
>> + }
>> +
>> translated_addr = memory_region_get_ram_addr(mr) + xlat;
>>
>> ret = vfio_container_query_dirty_bitmap(bcontainer, iova,
>> iotlb->addr_mask + 1,
>> @@ -1228,7 +1247,12 @@ static void
>> vfio_listener_log_sync(MemoryListener *listener,
>> int ret;
>> Error *local_err = NULL;
>>
>> - if (vfio_listener_skipped_section(section)) {
>> + /*
>> + * Bypass readonly section as it never becomes dirty, iommu memory
>> section
>> + * is RW and never bypassed. The readonly mappings in iommu MR are
>> bypassed
>> + * in vfio_iommu_map_dirty_notify().
>> + */
>> + if (vfio_listener_skipped_section(section, true)) {
>> return;
>> }
>>
Looks the vfio_iommu_map_notify() is missed. It also has unmap op. is
it?
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 3c62bab764..180e3d526b 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -103,6 +103,7 @@ vfio_listener_region_del(uint64_t start, uint64_t end)
>> "region_del 0x%"PRIx64" -
>> vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min,
>> uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" -
>> 0x%"PRIx64"]"
>> vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t
>> max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci)
>> "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" -
>> 0x%"PRIx64"], pci64:[0x%"PRIx64" - 0x%"PRIx64"]"
>> vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end)
>> "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
>> +vfio_iommu_map_dirty_notify_skip_ro(uint64_t iova_start, uint64_t
>> iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
>>
>> # container.c
>> vfio_container_query_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t
>> backend_flag, uint64_t bitmap_size, uint64_t translated_addr, uint64_t
>> dirty_pages) "iova=0x%"PRIx64" size=0x%"PRIx64"
>> backend_flag=0x%"PRIx64" bitmap_size=0x%"PRIx64" gpa=0x%"PRIx64"
>> dirty_pages=%"PRIu64
>> --
>> 2.47.1
>
>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Subject: Re: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty
>tracking
>
>On 2025/11/28 10:08, Duan, Zhenzhong wrote:
>> Hi Yi, Cedric,
>>
>> Could you also help comment on this patch? This is a pure VFIO migration
>related optimization, I think it's better to let it go with the "vfio: relax the
>vIOMMU check" series.
>> I'd like to move it in next respin of "vfio: relax the vIOMMU check" series if
>you think it make sense.
>
>It makes sense to me.
>
>> Thanks
>> Zhenzhong
>>
>>> -----Original Message-----
>>> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
>>> Subject: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty
>>> tracking
>>>
>>> When doing ditry tracking or calculating dirty tracking range, readonly
>
>s/ditry/dirty/
Will fix, it's strange --codespell didn't catch it.
>
>>> regions can be bypassed, because corresponding DMA mappings are
>readonly
>>> and never become dirty.
>>>
>>> This can optimize dirty tracking a bit for passthrough device.
>>>
>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>>> ---
>>> hw/vfio/listener.c | 46
>+++++++++++++++++++++++++++++++++-----------
>>> hw/vfio/trace-events | 1 +
>>> 2 files changed, 36 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
>>> index 3b48f6796c..ca2377d860 100644
>>> --- a/hw/vfio/listener.c
>>> +++ b/hw/vfio/listener.c
>>> @@ -76,8 +76,13 @@ static bool vfio_log_sync_needed(const
>VFIOContainer
>>> *bcontainer)
>>> return true;
>>> }
>>>
>>> -static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>>> +static bool vfio_listener_skipped_section(MemoryRegionSection
>*section,
>>> + bool bypass_ro)
>>> {
>>> + if (bypass_ro && section->readonly) {
>>> + return true;
>>> + }
>>> +
>>> return (!memory_region_is_ram(section->mr) &&
>>> !memory_region_is_iommu(section->mr)) ||
>>> memory_region_is_protected(section->mr) ||
>>> @@ -368,9 +373,9 @@ static bool
>>> vfio_known_safe_misalignment(MemoryRegionSection *section)
>>> }
>>>
>>> static bool vfio_listener_valid_section(MemoryRegionSection *section,
>>> - const char *name)
>>> + bool bypass_ro, const
>char
>>> *name)
>>> {
>>> - if (vfio_listener_skipped_section(section)) {
>>> + if (vfio_listener_skipped_section(section, bypass_ro)) {
>>> trace_vfio_listener_region_skip(name,
>>> section->offset_within_address_space,
>>> section->offset_within_address_space +
>>> @@ -497,7 +502,7 @@ void vfio_container_region_add(VFIOContainer
>>> *bcontainer,
>>> int ret;
>>> Error *err = NULL;
>>>
>>> - if (!vfio_listener_valid_section(section, "region_add")) {
>>> + if (!vfio_listener_valid_section(section, false, "region_add")) {
>>> return;
>>> }
>>>
>>> @@ -663,7 +668,7 @@ static void
>vfio_listener_region_del(MemoryListener
>>> *listener,
>>> int ret;
>>> bool try_unmap = true;
>>>
>>> - if (!vfio_listener_valid_section(section, "region_del")) {
>>> + if (!vfio_listener_valid_section(section, false, "region_del")) {
>>> return;
>>> }
>>>
>>> @@ -722,11 +727,11 @@ static void
>>> vfio_listener_region_del(MemoryListener *listener,
>>> }
>>>
>>> /*
>>> - * Fake an IOTLB entry for identity mapping which is needed by
>>> dirty
>>> - * tracking. In fact, in unmap_bitmap, only translated_addr field
>is
>>> - * used to set dirty bitmap.
>>> + * Fake an IOTLB entry for writable identity mapping which is
>>> needed
>>> + * by dirty tracking. In fact, in unmap_bitmap, only
>>> translated_addr
>>> + * field is used to set dirty bitmap.
>>> */
>>> - if (!memory_region_is_iommu(section->mr)) {
>>> + if (!memory_region_is_iommu(section->mr)
>>> && !section->readonly) {
>>> entry.iova = iova;
>>> entry.translated_addr = iova;
>>> iotlb = &entry;
>>> @@ -834,7 +839,8 @@ static void
>>> vfio_dirty_tracking_update(MemoryListener *listener,
>>> container_of(listener, VFIODirtyRangesListener, listener);
>>> hwaddr iova, end;
>>>
>>> - if (!vfio_listener_valid_section(section, "tracking_update") ||
>>> + /* Bypass readonly section as it never becomes dirty */
>>> + if (!vfio_listener_valid_section(section, true, "tracking_update") ||
>>> !vfio_get_section_iova_range(dirty->bcontainer, section,
>>> &iova, &end, NULL)) {
>>> return;
>>> @@ -1093,6 +1099,19 @@ static void
>>> vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry
>*iotlb)
>>> if (!mr) {
>>> goto out_unlock;
>>> }
>>> +
>>> + /*
>>> + * The mapping is readonly when either it's a readonly mapping in
>guest
>>> + * or mapped target is readonly, bypass it for dirty tracking as it
>>> + * never becomes dirty.
>>> + */
>>> + if (!(iotlb->perm & IOMMU_WO) || mr->readonly) {
>
>is it possible that guest maps RW, while the backend mr is readonly?
I think a normal OS does not need to map a readonly region as RW, because writing will always fail.
If it does, then shadow mapping on host side is readonly. This emulates same behavior as on baremetal.
>
>>> + trace_vfio_iommu_map_dirty_notify_skip_ro(iova,
>>> + iova +
>>> iotlb->addr_mask);
>>> + rcu_read_unlock();
>>> + return;
>>> + }
>>> +
>>> translated_addr = memory_region_get_ram_addr(mr) + xlat;
>>>
>>> ret = vfio_container_query_dirty_bitmap(bcontainer, iova,
>>> iotlb->addr_mask + 1,
>>> @@ -1228,7 +1247,12 @@ static void
>>> vfio_listener_log_sync(MemoryListener *listener,
>>> int ret;
>>> Error *local_err = NULL;
>>>
>>> - if (vfio_listener_skipped_section(section)) {
>>> + /*
>>> + * Bypass readonly section as it never becomes dirty, iommu
>memory
>>> section
>>> + * is RW and never bypassed. The readonly mappings in iommu MR
>are
>>> bypassed
>>> + * in vfio_iommu_map_dirty_notify().
>>> + */
>>> + if (vfio_listener_skipped_section(section, true)) {
>>> return;
>>> }
>>>
>
>Looks the vfio_iommu_map_notify() is missed. It also has unmap op. is
>it?
Right, because we don't know whether the existing shadow mapping is readonly or not,
vfio_listener_log_sync() also doesn't provide that info.
That means we do dirty tracking in this case, but the returned bitmaps should be all zero.
Thanks
Zhenzhong
© 2016 - 2025 Red Hat, Inc.