[PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty tracking

Zhenzhong Duan posted 23 patches 3 weeks, 5 days ago
[PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty tracking
Posted by Zhenzhong Duan 3 weeks, 5 days ago
When doing ditry tracking or calculating dirty tracking range, readonly
regions can be bypassed, because corresponding DMA mappings are readonly
and never become dirty.

This can optimize dirty tracking a bit for passthrough device.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
 hw/vfio/listener.c   | 46 +++++++++++++++++++++++++++++++++-----------
 hw/vfio/trace-events |  1 +
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
index 3b48f6796c..ca2377d860 100644
--- a/hw/vfio/listener.c
+++ b/hw/vfio/listener.c
@@ -76,8 +76,13 @@ static bool vfio_log_sync_needed(const VFIOContainer *bcontainer)
     return true;
 }
 
-static bool vfio_listener_skipped_section(MemoryRegionSection *section)
+static bool vfio_listener_skipped_section(MemoryRegionSection *section,
+                                          bool bypass_ro)
 {
+    if (bypass_ro && section->readonly) {
+        return true;
+    }
+
     return (!memory_region_is_ram(section->mr) &&
             !memory_region_is_iommu(section->mr)) ||
            memory_region_is_protected(section->mr) ||
@@ -368,9 +373,9 @@ static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
 }
 
 static bool vfio_listener_valid_section(MemoryRegionSection *section,
-                                        const char *name)
+                                        bool bypass_ro, const char *name)
 {
-    if (vfio_listener_skipped_section(section)) {
+    if (vfio_listener_skipped_section(section, bypass_ro)) {
         trace_vfio_listener_region_skip(name,
                 section->offset_within_address_space,
                 section->offset_within_address_space +
@@ -497,7 +502,7 @@ void vfio_container_region_add(VFIOContainer *bcontainer,
     int ret;
     Error *err = NULL;
 
-    if (!vfio_listener_valid_section(section, "region_add")) {
+    if (!vfio_listener_valid_section(section, false, "region_add")) {
         return;
     }
 
@@ -663,7 +668,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
     int ret;
     bool try_unmap = true;
 
-    if (!vfio_listener_valid_section(section, "region_del")) {
+    if (!vfio_listener_valid_section(section, false, "region_del")) {
         return;
     }
 
@@ -722,11 +727,11 @@ static void vfio_listener_region_del(MemoryListener *listener,
         }
 
         /*
-         * Fake an IOTLB entry for identity mapping which is needed by dirty
-         * tracking. In fact, in unmap_bitmap, only translated_addr field is
-         * used to set dirty bitmap.
+         * Fake an IOTLB entry for writable identity mapping which is needed
+         * by dirty tracking. In fact, in unmap_bitmap, only translated_addr
+         * field is used to set dirty bitmap.
          */
-        if (!memory_region_is_iommu(section->mr)) {
+        if (!memory_region_is_iommu(section->mr) && !section->readonly) {
             entry.iova = iova;
             entry.translated_addr = iova;
             iotlb = &entry;
@@ -834,7 +839,8 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
         container_of(listener, VFIODirtyRangesListener, listener);
     hwaddr iova, end;
 
-    if (!vfio_listener_valid_section(section, "tracking_update") ||
+    /* Bypass readonly section as it never becomes dirty */
+    if (!vfio_listener_valid_section(section, true, "tracking_update") ||
         !vfio_get_section_iova_range(dirty->bcontainer, section,
                                      &iova, &end, NULL)) {
         return;
@@ -1093,6 +1099,19 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     if (!mr) {
         goto out_unlock;
     }
+
+    /*
+     * The mapping is readonly when either it's a readonly mapping in guest
+     * or mapped target is readonly, bypass it for dirty tracking as it
+     * never becomes dirty.
+     */
+    if (!(iotlb->perm & IOMMU_WO) || mr->readonly) {
+        trace_vfio_iommu_map_dirty_notify_skip_ro(iova,
+                                                  iova + iotlb->addr_mask);
+        rcu_read_unlock();
+        return;
+    }
+
     translated_addr = memory_region_get_ram_addr(mr) + xlat;
 
     ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
@@ -1228,7 +1247,12 @@ static void vfio_listener_log_sync(MemoryListener *listener,
     int ret;
     Error *local_err = NULL;
 
-    if (vfio_listener_skipped_section(section)) {
+    /*
+     * Bypass readonly section as it never becomes dirty, iommu memory section
+     * is RW and never bypassed. The readonly mappings in iommu MR are bypassed
+     * in vfio_iommu_map_dirty_notify().
+     */
+    if (vfio_listener_skipped_section(section, true)) {
         return;
     }
 
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 3c62bab764..180e3d526b 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -103,6 +103,7 @@ vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" -
 vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
 vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], pci64:[0x%"PRIx64" - 0x%"PRIx64"]"
 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
+vfio_iommu_map_dirty_notify_skip_ro(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
 
 # container.c
 vfio_container_query_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t backend_flag, uint64_t bitmap_size, uint64_t translated_addr, uint64_t dirty_pages) "iova=0x%"PRIx64" size=0x%"PRIx64" backend_flag=0x%"PRIx64" bitmap_size=0x%"PRIx64" gpa=0x%"PRIx64" dirty_pages=%"PRIu64
-- 
2.47.1
RE: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty tracking
Posted by Duan, Zhenzhong 2 weeks, 2 days ago
Hi Yi, Cedric,

Could you also help comment on this patch? This is a pure VFIO migration related optimization, I think it's better to let it go with the "vfio: relax the vIOMMU check" series.
I'd like to move it in next respin of "vfio: relax the vIOMMU check" series if you think it make sense.

Thanks
Zhenzhong

>-----Original Message-----
>From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
>Subject: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty
>tracking
>
>When doing ditry tracking or calculating dirty tracking range, readonly
>regions can be bypassed, because corresponding DMA mappings are readonly
>and never become dirty.
>
>This can optimize dirty tracking a bit for passthrough device.
>
>Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>---
> hw/vfio/listener.c   | 46 +++++++++++++++++++++++++++++++++-----------
> hw/vfio/trace-events |  1 +
> 2 files changed, 36 insertions(+), 11 deletions(-)
>
>diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
>index 3b48f6796c..ca2377d860 100644
>--- a/hw/vfio/listener.c
>+++ b/hw/vfio/listener.c
>@@ -76,8 +76,13 @@ static bool vfio_log_sync_needed(const VFIOContainer
>*bcontainer)
>     return true;
> }
>
>-static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>+static bool vfio_listener_skipped_section(MemoryRegionSection *section,
>+                                          bool bypass_ro)
> {
>+    if (bypass_ro && section->readonly) {
>+        return true;
>+    }
>+
>     return (!memory_region_is_ram(section->mr) &&
>             !memory_region_is_iommu(section->mr)) ||
>            memory_region_is_protected(section->mr) ||
>@@ -368,9 +373,9 @@ static bool
>vfio_known_safe_misalignment(MemoryRegionSection *section)
> }
>
> static bool vfio_listener_valid_section(MemoryRegionSection *section,
>-                                        const char *name)
>+                                        bool bypass_ro, const char
>*name)
> {
>-    if (vfio_listener_skipped_section(section)) {
>+    if (vfio_listener_skipped_section(section, bypass_ro)) {
>         trace_vfio_listener_region_skip(name,
>                 section->offset_within_address_space,
>                 section->offset_within_address_space +
>@@ -497,7 +502,7 @@ void vfio_container_region_add(VFIOContainer
>*bcontainer,
>     int ret;
>     Error *err = NULL;
>
>-    if (!vfio_listener_valid_section(section, "region_add")) {
>+    if (!vfio_listener_valid_section(section, false, "region_add")) {
>         return;
>     }
>
>@@ -663,7 +668,7 @@ static void vfio_listener_region_del(MemoryListener
>*listener,
>     int ret;
>     bool try_unmap = true;
>
>-    if (!vfio_listener_valid_section(section, "region_del")) {
>+    if (!vfio_listener_valid_section(section, false, "region_del")) {
>         return;
>     }
>
>@@ -722,11 +727,11 @@ static void
>vfio_listener_region_del(MemoryListener *listener,
>         }
>
>         /*
>-         * Fake an IOTLB entry for identity mapping which is needed by
>dirty
>-         * tracking. In fact, in unmap_bitmap, only translated_addr field is
>-         * used to set dirty bitmap.
>+         * Fake an IOTLB entry for writable identity mapping which is
>needed
>+         * by dirty tracking. In fact, in unmap_bitmap, only
>translated_addr
>+         * field is used to set dirty bitmap.
>          */
>-        if (!memory_region_is_iommu(section->mr)) {
>+        if (!memory_region_is_iommu(section->mr)
>&& !section->readonly) {
>             entry.iova = iova;
>             entry.translated_addr = iova;
>             iotlb = &entry;
>@@ -834,7 +839,8 @@ static void
>vfio_dirty_tracking_update(MemoryListener *listener,
>         container_of(listener, VFIODirtyRangesListener, listener);
>     hwaddr iova, end;
>
>-    if (!vfio_listener_valid_section(section, "tracking_update") ||
>+    /* Bypass readonly section as it never becomes dirty */
>+    if (!vfio_listener_valid_section(section, true, "tracking_update") ||
>         !vfio_get_section_iova_range(dirty->bcontainer, section,
>                                      &iova, &end, NULL)) {
>         return;
>@@ -1093,6 +1099,19 @@ static void
>vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>     if (!mr) {
>         goto out_unlock;
>     }
>+
>+    /*
>+     * The mapping is readonly when either it's a readonly mapping in guest
>+     * or mapped target is readonly, bypass it for dirty tracking as it
>+     * never becomes dirty.
>+     */
>+    if (!(iotlb->perm & IOMMU_WO) || mr->readonly) {
>+        trace_vfio_iommu_map_dirty_notify_skip_ro(iova,
>+                                                  iova +
>iotlb->addr_mask);
>+        rcu_read_unlock();
>+        return;
>+    }
>+
>     translated_addr = memory_region_get_ram_addr(mr) + xlat;
>
>     ret = vfio_container_query_dirty_bitmap(bcontainer, iova,
>iotlb->addr_mask + 1,
>@@ -1228,7 +1247,12 @@ static void
>vfio_listener_log_sync(MemoryListener *listener,
>     int ret;
>     Error *local_err = NULL;
>
>-    if (vfio_listener_skipped_section(section)) {
>+    /*
>+     * Bypass readonly section as it never becomes dirty, iommu memory
>section
>+     * is RW and never bypassed. The readonly mappings in iommu MR are
>bypassed
>+     * in vfio_iommu_map_dirty_notify().
>+     */
>+    if (vfio_listener_skipped_section(section, true)) {
>         return;
>     }
>
>diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>index 3c62bab764..180e3d526b 100644
>--- a/hw/vfio/trace-events
>+++ b/hw/vfio/trace-events
>@@ -103,6 +103,7 @@ vfio_listener_region_del(uint64_t start, uint64_t end)
>"region_del 0x%"PRIx64" -
> vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min,
>uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" -
>0x%"PRIx64"]"
> vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t
>max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci)
>"nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" -
>0x%"PRIx64"], pci64:[0x%"PRIx64" - 0x%"PRIx64"]"
> vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end)
>"iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
>+vfio_iommu_map_dirty_notify_skip_ro(uint64_t iova_start, uint64_t
>iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
>
> # container.c
> vfio_container_query_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t
>backend_flag, uint64_t bitmap_size, uint64_t translated_addr, uint64_t
>dirty_pages) "iova=0x%"PRIx64" size=0x%"PRIx64"
>backend_flag=0x%"PRIx64" bitmap_size=0x%"PRIx64" gpa=0x%"PRIx64"
>dirty_pages=%"PRIu64
>--
>2.47.1
Re: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty tracking
Posted by Cédric Le Goater 2 weeks, 1 day ago
Hello Zhenzhong,

On 11/28/25 03:08, Duan, Zhenzhong wrote:
> Hi Yi, Cedric,
> 
> Could you also help comment on this patch? This is a pure VFIO migration related optimization, I think it's better to let it go with the "vfio: relax the vIOMMU check" series.
> I'd like to move it in next respin of "vfio: relax the vIOMMU check" series if you think it make sense.

IMO, the "vfio: relax the vIOMMU check" is fine as it is.

I would instead introduce a new series to handle ERRATA_772415
since it is a special case of "intel_iommu: Enable first stage
translation for passthrough device".

So we would have 3 series (in order of appearance on the list) :

1. "vfio: relax the vIOMMU check"
2. "intel_iommu: Enable first stage translation for passthrough
     device" without quirks
3. "vfio: handle ERRATA_772415" with the quirk part, so that's
    patch 17,19,20,21 ?


Series 2 seems the most important, as it sets the foundation
for the other architectures which have a need for nested
IOMMU support (smmu/nvidia). Series 1 is nice to have.
Series 3. is an extension of 2. for broken HW.

For the next iterations (QEMU 11.0), let's get series 2. in
first. I have been including it in my QEMU tree for a while
now I didn't see any regression. Should be fine.

Then, we can merge 1. and 3. through the vfio queue. Shouldn't
be a major task now that we had all these reviews.

How's that ?


btw,

   Reference from 4th Gen Intel Xeon Processor Scalable Family Specification
   Update, Errata Details, SPR17.
   https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/eagle-stream/sapphire-rapids-specification-update/

Url is not accessible (for me).

Thanks,

C.
RE: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty tracking
Posted by Duan, Zhenzhong 1 week, 6 days ago
Hi Cédric,

>-----Original Message-----
>From: Cédric Le Goater <clg@redhat.com>
>Subject: Re: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty
>tracking
>
>Hello Zhenzhong,
>
>On 11/28/25 03:08, Duan, Zhenzhong wrote:
>> Hi Yi, Cedric,
>>
>> Could you also help comment on this patch? This is a pure VFIO migration
>related optimization, I think it's better to let it go with the "vfio: relax the
>vIOMMU check" series.
>> I'd like to move it in next respin of "vfio: relax the vIOMMU check" series if
>you think it make sense.
>
>IMO, the "vfio: relax the vIOMMU check" is fine as it is.
>
>I would instead introduce a new series to handle ERRATA_772415
>since it is a special case of "intel_iommu: Enable first stage
>translation for passthrough device".
>
>So we would have 3 series (in order of appearance on the list) :
>
>1. "vfio: relax the vIOMMU check"
>2. "intel_iommu: Enable first stage translation for passthrough
>     device" without quirks
>3. "vfio: handle ERRATA_772415" with the quirk part, so that's
>    patch 17,19,20,21 ?
>
>
>Series 2 seems the most important, as it sets the foundation
>for the other architectures which have a need for nested
>IOMMU support (smmu/nvidia). Series 1 is nice to have.
>Series 3. is an extension of 2. for broken HW.

Good suggestion to split out ERRATA_772415 from the base nesting series.
There is some code in patch17 which is needed by ERRATA_772415,
it makes sense to move it to ERRATA_772415 series.

>
>For the next iterations (QEMU 11.0), let's get series 2. in
>first. I have been including it in my QEMU tree for a while
>now I didn't see any regression. Should be fine.
>
>Then, we can merge 1. and 3. through the vfio queue. Shouldn't
>be a major task now that we had all these reviews.
>
>How's that ?

Good for me.
Then I'll send series2, then series1, finally series3 in order.

I'd like to wait a few days to collect comments before sending v9 of series2
because the split of series2 and series 3 involves only patch reorder and
no code changes.

>
>
>btw,
>
>   Reference from 4th Gen Intel Xeon Processor Scalable Family
>Specification
>   Update, Errata Details, SPR17.
>
>https://edc.intel.com/content/www/us/en/design/products-and-solutions/pr
>ocessors-and-chipsets/eagle-stream/sapphire-rapids-specification-update/
>
>Url is not accessible (for me).

Me too, will update. https://cdrdv2.intel.com/v1/dl/getContent/772415 is accessible for me now.

BRs,
Zhenzhong
Re: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty tracking
Posted by Yi Liu 2 weeks, 2 days ago
On 2025/11/28 10:08, Duan, Zhenzhong wrote:
> Hi Yi, Cedric,
> 
> Could you also help comment on this patch? This is a pure VFIO migration related optimization, I think it's better to let it go with the "vfio: relax the vIOMMU check" series.
> I'd like to move it in next respin of "vfio: relax the vIOMMU check" series if you think it make sense.

It makes sense to me.

> Thanks
> Zhenzhong
> 
>> -----Original Message-----
>> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
>> Subject: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty
>> tracking
>>
>> When doing ditry tracking or calculating dirty tracking range, readonly

s/ditry/dirty/

>> regions can be bypassed, because corresponding DMA mappings are readonly
>> and never become dirty.
>>
>> This can optimize dirty tracking a bit for passthrough device.
>>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>> ---
>> hw/vfio/listener.c   | 46 +++++++++++++++++++++++++++++++++-----------
>> hw/vfio/trace-events |  1 +
>> 2 files changed, 36 insertions(+), 11 deletions(-)
>>
>> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
>> index 3b48f6796c..ca2377d860 100644
>> --- a/hw/vfio/listener.c
>> +++ b/hw/vfio/listener.c
>> @@ -76,8 +76,13 @@ static bool vfio_log_sync_needed(const VFIOContainer
>> *bcontainer)
>>      return true;
>> }
>>
>> -static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>> +static bool vfio_listener_skipped_section(MemoryRegionSection *section,
>> +                                          bool bypass_ro)
>> {
>> +    if (bypass_ro && section->readonly) {
>> +        return true;
>> +    }
>> +
>>      return (!memory_region_is_ram(section->mr) &&
>>              !memory_region_is_iommu(section->mr)) ||
>>             memory_region_is_protected(section->mr) ||
>> @@ -368,9 +373,9 @@ static bool
>> vfio_known_safe_misalignment(MemoryRegionSection *section)
>> }
>>
>> static bool vfio_listener_valid_section(MemoryRegionSection *section,
>> -                                        const char *name)
>> +                                        bool bypass_ro, const char
>> *name)
>> {
>> -    if (vfio_listener_skipped_section(section)) {
>> +    if (vfio_listener_skipped_section(section, bypass_ro)) {
>>          trace_vfio_listener_region_skip(name,
>>                  section->offset_within_address_space,
>>                  section->offset_within_address_space +
>> @@ -497,7 +502,7 @@ void vfio_container_region_add(VFIOContainer
>> *bcontainer,
>>      int ret;
>>      Error *err = NULL;
>>
>> -    if (!vfio_listener_valid_section(section, "region_add")) {
>> +    if (!vfio_listener_valid_section(section, false, "region_add")) {
>>          return;
>>      }
>>
>> @@ -663,7 +668,7 @@ static void vfio_listener_region_del(MemoryListener
>> *listener,
>>      int ret;
>>      bool try_unmap = true;
>>
>> -    if (!vfio_listener_valid_section(section, "region_del")) {
>> +    if (!vfio_listener_valid_section(section, false, "region_del")) {
>>          return;
>>      }
>>
>> @@ -722,11 +727,11 @@ static void
>> vfio_listener_region_del(MemoryListener *listener,
>>          }
>>
>>          /*
>> -         * Fake an IOTLB entry for identity mapping which is needed by
>> dirty
>> -         * tracking. In fact, in unmap_bitmap, only translated_addr field is
>> -         * used to set dirty bitmap.
>> +         * Fake an IOTLB entry for writable identity mapping which is
>> needed
>> +         * by dirty tracking. In fact, in unmap_bitmap, only
>> translated_addr
>> +         * field is used to set dirty bitmap.
>>           */
>> -        if (!memory_region_is_iommu(section->mr)) {
>> +        if (!memory_region_is_iommu(section->mr)
>> && !section->readonly) {
>>              entry.iova = iova;
>>              entry.translated_addr = iova;
>>              iotlb = &entry;
>> @@ -834,7 +839,8 @@ static void
>> vfio_dirty_tracking_update(MemoryListener *listener,
>>          container_of(listener, VFIODirtyRangesListener, listener);
>>      hwaddr iova, end;
>>
>> -    if (!vfio_listener_valid_section(section, "tracking_update") ||
>> +    /* Bypass readonly section as it never becomes dirty */
>> +    if (!vfio_listener_valid_section(section, true, "tracking_update") ||
>>          !vfio_get_section_iova_range(dirty->bcontainer, section,
>>                                       &iova, &end, NULL)) {
>>          return;
>> @@ -1093,6 +1099,19 @@ static void
>> vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>>      if (!mr) {
>>          goto out_unlock;
>>      }
>> +
>> +    /*
>> +     * The mapping is readonly when either it's a readonly mapping in guest
>> +     * or mapped target is readonly, bypass it for dirty tracking as it
>> +     * never becomes dirty.
>> +     */
>> +    if (!(iotlb->perm & IOMMU_WO) || mr->readonly) {

is it possible that guest maps RW, while the backend mr is readonly?

>> +        trace_vfio_iommu_map_dirty_notify_skip_ro(iova,
>> +                                                  iova +
>> iotlb->addr_mask);
>> +        rcu_read_unlock();
>> +        return;
>> +    }
>> +
>>      translated_addr = memory_region_get_ram_addr(mr) + xlat;
>>
>>      ret = vfio_container_query_dirty_bitmap(bcontainer, iova,
>> iotlb->addr_mask + 1,
>> @@ -1228,7 +1247,12 @@ static void
>> vfio_listener_log_sync(MemoryListener *listener,
>>      int ret;
>>      Error *local_err = NULL;
>>
>> -    if (vfio_listener_skipped_section(section)) {
>> +    /*
>> +     * Bypass readonly section as it never becomes dirty, iommu memory
>> section
>> +     * is RW and never bypassed. The readonly mappings in iommu MR are
>> bypassed
>> +     * in vfio_iommu_map_dirty_notify().
>> +     */
>> +    if (vfio_listener_skipped_section(section, true)) {
>>          return;
>>      }
>>

Looks the vfio_iommu_map_notify() is missed. It also has unmap op. is
it?

>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 3c62bab764..180e3d526b 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -103,6 +103,7 @@ vfio_listener_region_del(uint64_t start, uint64_t end)
>> "region_del 0x%"PRIx64" -
>> vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min,
>> uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" -
>> 0x%"PRIx64"]"
>> vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t
>> max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci)
>> "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" -
>> 0x%"PRIx64"], pci64:[0x%"PRIx64" - 0x%"PRIx64"]"
>> vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end)
>> "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
>> +vfio_iommu_map_dirty_notify_skip_ro(uint64_t iova_start, uint64_t
>> iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
>>
>> # container.c
>> vfio_container_query_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t
>> backend_flag, uint64_t bitmap_size, uint64_t translated_addr, uint64_t
>> dirty_pages) "iova=0x%"PRIx64" size=0x%"PRIx64"
>> backend_flag=0x%"PRIx64" bitmap_size=0x%"PRIx64" gpa=0x%"PRIx64"
>> dirty_pages=%"PRIu64
>> --
>> 2.47.1
>
RE: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty tracking
Posted by Duan, Zhenzhong 2 weeks, 2 days ago

>-----Original Message-----
>From: Liu, Yi L <yi.l.liu@intel.com>
>Subject: Re: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty
>tracking
>
>On 2025/11/28 10:08, Duan, Zhenzhong wrote:
>> Hi Yi, Cedric,
>>
>> Could you also help comment on this patch? This is a pure VFIO migration
>related optimization, I think it's better to let it go with the "vfio: relax the
>vIOMMU check" series.
>> I'd like to move it in next respin of "vfio: relax the vIOMMU check" series if
>you think it make sense.
>
>It makes sense to me.
>
>> Thanks
>> Zhenzhong
>>
>>> -----Original Message-----
>>> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
>>> Subject: [PATCH v8 17/23] vfio/listener: Bypass readonly region for dirty
>>> tracking
>>>
>>> When doing ditry tracking or calculating dirty tracking range, readonly
>
>s/ditry/dirty/

Will fix, it's strange --codespell didn't catch it.

>
>>> regions can be bypassed, because corresponding DMA mappings are
>readonly
>>> and never become dirty.
>>>
>>> This can optimize dirty tracking a bit for passthrough device.
>>>
>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
>>> ---
>>> hw/vfio/listener.c   | 46
>+++++++++++++++++++++++++++++++++-----------
>>> hw/vfio/trace-events |  1 +
>>> 2 files changed, 36 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
>>> index 3b48f6796c..ca2377d860 100644
>>> --- a/hw/vfio/listener.c
>>> +++ b/hw/vfio/listener.c
>>> @@ -76,8 +76,13 @@ static bool vfio_log_sync_needed(const
>VFIOContainer
>>> *bcontainer)
>>>      return true;
>>> }
>>>
>>> -static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>>> +static bool vfio_listener_skipped_section(MemoryRegionSection
>*section,
>>> +                                          bool bypass_ro)
>>> {
>>> +    if (bypass_ro && section->readonly) {
>>> +        return true;
>>> +    }
>>> +
>>>      return (!memory_region_is_ram(section->mr) &&
>>>              !memory_region_is_iommu(section->mr)) ||
>>>             memory_region_is_protected(section->mr) ||
>>> @@ -368,9 +373,9 @@ static bool
>>> vfio_known_safe_misalignment(MemoryRegionSection *section)
>>> }
>>>
>>> static bool vfio_listener_valid_section(MemoryRegionSection *section,
>>> -                                        const char *name)
>>> +                                        bool bypass_ro, const
>char
>>> *name)
>>> {
>>> -    if (vfio_listener_skipped_section(section)) {
>>> +    if (vfio_listener_skipped_section(section, bypass_ro)) {
>>>          trace_vfio_listener_region_skip(name,
>>>                  section->offset_within_address_space,
>>>                  section->offset_within_address_space +
>>> @@ -497,7 +502,7 @@ void vfio_container_region_add(VFIOContainer
>>> *bcontainer,
>>>      int ret;
>>>      Error *err = NULL;
>>>
>>> -    if (!vfio_listener_valid_section(section, "region_add")) {
>>> +    if (!vfio_listener_valid_section(section, false, "region_add")) {
>>>          return;
>>>      }
>>>
>>> @@ -663,7 +668,7 @@ static void
>vfio_listener_region_del(MemoryListener
>>> *listener,
>>>      int ret;
>>>      bool try_unmap = true;
>>>
>>> -    if (!vfio_listener_valid_section(section, "region_del")) {
>>> +    if (!vfio_listener_valid_section(section, false, "region_del")) {
>>>          return;
>>>      }
>>>
>>> @@ -722,11 +727,11 @@ static void
>>> vfio_listener_region_del(MemoryListener *listener,
>>>          }
>>>
>>>          /*
>>> -         * Fake an IOTLB entry for identity mapping which is needed by
>>> dirty
>>> -         * tracking. In fact, in unmap_bitmap, only translated_addr field
>is
>>> -         * used to set dirty bitmap.
>>> +         * Fake an IOTLB entry for writable identity mapping which is
>>> needed
>>> +         * by dirty tracking. In fact, in unmap_bitmap, only
>>> translated_addr
>>> +         * field is used to set dirty bitmap.
>>>           */
>>> -        if (!memory_region_is_iommu(section->mr)) {
>>> +        if (!memory_region_is_iommu(section->mr)
>>> && !section->readonly) {
>>>              entry.iova = iova;
>>>              entry.translated_addr = iova;
>>>              iotlb = &entry;
>>> @@ -834,7 +839,8 @@ static void
>>> vfio_dirty_tracking_update(MemoryListener *listener,
>>>          container_of(listener, VFIODirtyRangesListener, listener);
>>>      hwaddr iova, end;
>>>
>>> -    if (!vfio_listener_valid_section(section, "tracking_update") ||
>>> +    /* Bypass readonly section as it never becomes dirty */
>>> +    if (!vfio_listener_valid_section(section, true, "tracking_update") ||
>>>          !vfio_get_section_iova_range(dirty->bcontainer, section,
>>>                                       &iova, &end, NULL)) {
>>>          return;
>>> @@ -1093,6 +1099,19 @@ static void
>>> vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry
>*iotlb)
>>>      if (!mr) {
>>>          goto out_unlock;
>>>      }
>>> +
>>> +    /*
>>> +     * The mapping is readonly when either it's a readonly mapping in
>guest
>>> +     * or mapped target is readonly, bypass it for dirty tracking as it
>>> +     * never becomes dirty.
>>> +     */
>>> +    if (!(iotlb->perm & IOMMU_WO) || mr->readonly) {
>
>is it possible that guest maps RW, while the backend mr is readonly?

I think a normal OS does not need to map a readonly region as RW, because writing will always fail.
If it does, then shadow mapping on host side is readonly. This emulates same behavior as on baremetal.

>
>>> +        trace_vfio_iommu_map_dirty_notify_skip_ro(iova,
>>> +                                                  iova +
>>> iotlb->addr_mask);
>>> +        rcu_read_unlock();
>>> +        return;
>>> +    }
>>> +
>>>      translated_addr = memory_region_get_ram_addr(mr) + xlat;
>>>
>>>      ret = vfio_container_query_dirty_bitmap(bcontainer, iova,
>>> iotlb->addr_mask + 1,
>>> @@ -1228,7 +1247,12 @@ static void
>>> vfio_listener_log_sync(MemoryListener *listener,
>>>      int ret;
>>>      Error *local_err = NULL;
>>>
>>> -    if (vfio_listener_skipped_section(section)) {
>>> +    /*
>>> +     * Bypass readonly section as it never becomes dirty, iommu
>memory
>>> section
>>> +     * is RW and never bypassed. The readonly mappings in iommu MR
>are
>>> bypassed
>>> +     * in vfio_iommu_map_dirty_notify().
>>> +     */
>>> +    if (vfio_listener_skipped_section(section, true)) {
>>>          return;
>>>      }
>>>
>
>Looks the vfio_iommu_map_notify() is missed. It also has unmap op. is
>it?

Right, because we don't know existing shadow mapping is readonly or not,
vfio_listener_log_sync() also doesn't provide that info.
That means we do dirty traking in this case, but returned bitmaps should be all zero.

Thanks
Zhenzhong