Using the new calls, use an atomic refcount to track how many times
a page is mapped in any of the IOMMUs.
For unmap we need to use iova_to_phys() to get the physical address
of the pages.
We use the smallest supported page size as the granularity of tracking
per domain.
This is important as it is possible to map pages and unmap them with
larger sizes (as in the map_sg() case).
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Mostafa Saleh <smostafa@google.com>
---
drivers/iommu/iommu-debug-pagealloc.c | 91 +++++++++++++++++++++++++++
1 file changed, 91 insertions(+)
diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
index 1d343421da98..86ccb310a4a8 100644
--- a/drivers/iommu/iommu-debug-pagealloc.c
+++ b/drivers/iommu/iommu-debug-pagealloc.c
@@ -29,19 +29,110 @@ struct page_ext_operations page_iommu_debug_ops = {
.need = need_iommu_debug,
};
+static struct page_ext *get_iommu_page_ext(phys_addr_t phys)
+{
+ struct page *page = phys_to_page(phys);
+ struct page_ext *page_ext = page_ext_get(page);
+
+ return page_ext;
+}
+
+static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
+{
+ return page_ext_data(page_ext, &page_iommu_debug_ops);
+}
+
+static void iommu_debug_inc_page(phys_addr_t phys)
+{
+ struct page_ext *page_ext = get_iommu_page_ext(phys);
+ struct iommu_debug_metadata *d = get_iommu_data(page_ext);
+
+ WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0);
+ page_ext_put(page_ext);
+}
+
+static void iommu_debug_dec_page(phys_addr_t phys)
+{
+ struct page_ext *page_ext = get_iommu_page_ext(phys);
+ struct iommu_debug_metadata *d = get_iommu_data(page_ext);
+
+ WARN_ON(atomic_dec_return_relaxed(&d->ref) < 0);
+ page_ext_put(page_ext);
+}
+
+/*
+ * IOMMU page size doesn't have to match the CPU page size. So, we use
+ * the smallest IOMMU page size to refcount the pages in the vmemmap.
+ * That is important as both map and unmap has to use the same page size
+ * to update the refcount to avoid double counting the same page.
+ * And as we can't know from iommu_unmap() what was the original page size
+ * used for map, we just use the minimum supported one for both.
+ */
+static size_t iommu_debug_page_size(struct iommu_domain *domain)
+{
+ return 1UL << __ffs(domain->pgsize_bitmap);
+}
+
void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
{
+ size_t off, end;
+ size_t page_size = iommu_debug_page_size(domain);
+
+ if (WARN_ON(!phys || check_add_overflow(phys, size, &end)))
+ return;
+
+ for (off = 0 ; off < size ; off += page_size) {
+ if (!pfn_valid(__phys_to_pfn(phys + off)))
+ continue;
+ iommu_debug_inc_page(phys + off);
+ }
+}
+
+static void __iommu_debug_update_iova(struct iommu_domain *domain,
+ unsigned long iova, size_t size, bool inc)
+{
+ size_t off, end;
+ size_t page_size = iommu_debug_page_size(domain);
+
+ if (WARN_ON(check_add_overflow(iova, size, &end)))
+ return;
+
+ for (off = 0 ; off < size ; off += page_size) {
+ phys_addr_t phys = iommu_iova_to_phys(domain, iova + off);
+
+ if (!phys || !pfn_valid(__phys_to_pfn(phys)))
+ continue;
+
+ if (inc)
+ iommu_debug_inc_page(phys);
+ else
+ iommu_debug_dec_page(phys);
+ }
}
void __iommu_debug_unmap_begin(struct iommu_domain *domain,
unsigned long iova, size_t size)
{
+ __iommu_debug_update_iova(domain, iova, size, false);
}
void __iommu_debug_unmap_end(struct iommu_domain *domain,
unsigned long iova, size_t size,
size_t unmapped)
{
+ if (unmapped == size)
+ return;
+
+ /*
+ * If unmap failed, re-increment the refcount, but if it unmapped
+ * larger size, decrement the extra part.
+ */
+ if (unmapped < size)
+ __iommu_debug_update_iova(domain, iova + unmapped,
+ size - unmapped, true);
+ else
+ __iommu_debug_update_iova(domain, iova + size,
+ unmapped - size, false);
}
void iommu_debug_init(void)
--
2.52.0.351.gbe84eed79e-goog
On Tue, Jan 06, 2026 at 04:21:59PM +0000, Mostafa Saleh wrote:
> Using the new calls, use an atomic refcount to track how many times
> a page is mapped in any of the IOMMUs.
>
> For unmap we need to use iova_to_phys() to get the physical address
> of the pages.
>
> We use the smallest supported page size as the granularity of tracking
> per domain.
> This is important as it is possible to map pages and unmap them with
> larger sizes (as in map_sg()) cases.
>
> Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
> Signed-off-by: Mostafa Saleh <smostafa@google.com>
> ---
> drivers/iommu/iommu-debug-pagealloc.c | 91 +++++++++++++++++++++++++++
> 1 file changed, 91 insertions(+)
>
> diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
> index 1d343421da98..86ccb310a4a8 100644
> --- a/drivers/iommu/iommu-debug-pagealloc.c
> +++ b/drivers/iommu/iommu-debug-pagealloc.c
> @@ -29,19 +29,110 @@ struct page_ext_operations page_iommu_debug_ops = {
> .need = need_iommu_debug,
> };
>
> +static struct page_ext *get_iommu_page_ext(phys_addr_t phys)
> +{
> + struct page *page = phys_to_page(phys);
> + struct page_ext *page_ext = page_ext_get(page);
> +
> + return page_ext;
> +}
> +
> +static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
> +{
> + return page_ext_data(page_ext, &page_iommu_debug_ops);
> +}
> +
> +static void iommu_debug_inc_page(phys_addr_t phys)
> +{
> + struct page_ext *page_ext = get_iommu_page_ext(phys);
> + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
> +
> + WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0);
> + page_ext_put(page_ext);
> +}
> +
> +static void iommu_debug_dec_page(phys_addr_t phys)
> +{
> + struct page_ext *page_ext = get_iommu_page_ext(phys);
> + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
> +
> + WARN_ON(atomic_dec_return_relaxed(&d->ref) < 0);
> + page_ext_put(page_ext);
> +}
> +
> +/*
> + * IOMMU page size doesn't have to match the CPU page size. So, we use
> + * the smallest IOMMU page size to refcount the pages in the vmemmap.
> + * That is important as both map and unmap has to use the same page size
> + * to update the refcount to avoid double counting the same page.
> + * And as we can't know from iommu_unmap() what was the original page size
> + * used for map, we just use the minimum supported one for both.
> + */
> +static size_t iommu_debug_page_size(struct iommu_domain *domain)
> +{
> + return 1UL << __ffs(domain->pgsize_bitmap);
> +}
> +
> void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
> {
> + size_t off, end;
> + size_t page_size = iommu_debug_page_size(domain);
> +
> + if (WARN_ON(!phys || check_add_overflow(phys, size, &end)))
> + return;
> +
> + for (off = 0 ; off < size ; off += page_size) {
> + if (!pfn_valid(__phys_to_pfn(phys + off)))
> + continue;
> + iommu_debug_inc_page(phys + off);
> + }
> +}
> +
> +static void __iommu_debug_update_iova(struct iommu_domain *domain,
> + unsigned long iova, size_t size, bool inc)
> +{
> + size_t off, end;
> + size_t page_size = iommu_debug_page_size(domain);
> +
> + if (WARN_ON(check_add_overflow(iova, size, &end)))
> + return;
> +
> + for (off = 0 ; off < size ; off += page_size) {
> + phys_addr_t phys = iommu_iova_to_phys(domain, iova + off);
> +
> + if (!phys || !pfn_valid(__phys_to_pfn(phys)))
> + continue;
> +
> + if (inc)
> + iommu_debug_inc_page(phys);
> + else
> + iommu_debug_dec_page(phys);
> + }
This might loop for too long when we're unmapping a big buffer (say 1GB)
that is backed by many 4K mappings (i.e. not mapped using large
mappings). Per the above example, that is 1,073,741,824 / 4096 = 262,144
iterations, each doing an iova_to_phys() walk in a tight loop, which
could hold the CPU for a little too long and could potentially result in
soft lockups (painful to see in a debug kernel).
Since iommu_unmap() can be called in atomic contexts (i.e. interrupts,
spinlocks with preemption disabled), we cannot simply add cond_resched()
here either.
Maybe we can cross that bridge once we get there, but if we can't solve
the latency now, it'd be nice to explicitly document this risk
(potential soft lockups on large unmaps) in the Kconfig or cmdline help text?
> }
>
> void __iommu_debug_unmap_begin(struct iommu_domain *domain,
> unsigned long iova, size_t size)
> {
> + __iommu_debug_update_iova(domain, iova, size, false);
> }
>
> void __iommu_debug_unmap_end(struct iommu_domain *domain,
> unsigned long iova, size_t size,
> size_t unmapped)
> {
> + if (unmapped == size)
> + return;
> +
> + /*
> + * If unmap failed, re-increment the refcount, but if it unmapped
> + * larger size, decrement the extra part.
> + */
> + if (unmapped < size)
> + __iommu_debug_update_iova(domain, iova + unmapped,
> + size - unmapped, true);
> + else
> + __iommu_debug_update_iova(domain, iova + size,
> + unmapped - size, false);
> }
I'm a little concerned about this part: when we unmap more than
requested, __iommu_debug_update_iova() relies on
iommu_iova_to_phys(domain, iova + off) to find the physical page to
decrement. However, __iommu_debug_unmap_end() is called *after* the
IOMMU driver has removed the mapping (in __iommu_unmap), so
iommu_iova_to_phys() returns 0 (failure), causing the loop in
update_iova (`if (!phys ...)`) to silently continue.
As a result, the refcounts for the physical pages in the range
[iova + size, iova + unmapped] are never decremented. Won't this result
in false positives (warnings about page leaks) when those pages are
eventually freed?
For example:
- A driver maps a 2MB region (512 x 4KB). All 512 pgs have refcount = 1.
- A driver / IOMMU-client calls iommu_unmap(iova, 4KB)
- unmap_begin(4KB) calls iova_to_phys, succeeds, and decrements the
refcount for the 1st page to 0.
- __iommu_unmap calls the IOMMU driver. The driver (unable to split the
block) zaps the entire 2MB range and returns unmapped = 2MB.
- unmap_end(size=4KB, unmapped=2MB) sees that more was unmapped than
requested & attempts to decrement refcounts for the remaining 511 pgs
- __iommu_debug_update_iova is called for the remaining range, which
ends up calling iommu_iova_to_phys. Since the mapping was destroyed,
iova_to_phys returns 0.
- The loop skips the decrement causing the remaining 511 pages to leak
with refcount = 1.
Thanks,
Praan
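
Condensed into code, the scenario above boils down to roughly the
following sketch. The helper name is hypothetical, and it assumes a
driver that maps the range as a single 2MB block it cannot split; it is
an illustration of the reported failure mode, not code from the series.

#include <linux/gfp.h>
#include <linux/iommu.h>
#include <linux/sizes.h>

/* Hypothetical helper, only to condense the sequence described above. */
static void demo_block_unmap_leak(struct iommu_domain *domain,
				  unsigned long iova, phys_addr_t phys)
{
	size_t unmapped;

	/*
	 * 2MB mapped as a single block mapping: __iommu_debug_map() bumps
	 * the debug refcount of all 512 4K pages to 1.
	 */
	if (iommu_map(domain, iova, phys, SZ_2M, IOMMU_READ | IOMMU_WRITE,
		      GFP_KERNEL))
		return;

	/*
	 * Ask to unmap only 4K: unmap_begin() decrements the first page
	 * (iova_to_phys() still resolves here), then a driver that cannot
	 * split the block zaps the whole 2MB and returns unmapped == SZ_2M.
	 */
	unmapped = iommu_unmap(domain, iova, SZ_4K);

	/*
	 * unmap_end(size=4K, unmapped=2M) walks [iova + 4K, iova + 2M),
	 * but iommu_iova_to_phys() now returns 0 for every offset, so the
	 * remaining 511 pages keep refcount == 1 and get flagged as still
	 * mapped when they are eventually freed.
	 */
	(void)unmapped;
}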
On Wed, Jan 07, 2026 at 03:21:41PM +0000, Pranjal Shrivastava wrote:
> On Tue, Jan 06, 2026 at 04:21:59PM +0000, Mostafa Saleh wrote:
> > Using the new calls, use an atomic refcount to track how many times
> > a page is mapped in any of the IOMMUs.
> >
> > For unmap we need to use iova_to_phys() to get the physical address
> > of the pages.
> >
> > We use the smallest supported page size as the granularity of tracking
> > per domain.
> > This is important as it is possible to map pages and unmap them with
> > larger sizes (as in map_sg()) cases.
> >
> > Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
> > Signed-off-by: Mostafa Saleh <smostafa@google.com>
> > ---
> > drivers/iommu/iommu-debug-pagealloc.c | 91 +++++++++++++++++++++++++++
> > 1 file changed, 91 insertions(+)
> >
> > diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
> > index 1d343421da98..86ccb310a4a8 100644
> > --- a/drivers/iommu/iommu-debug-pagealloc.c
> > +++ b/drivers/iommu/iommu-debug-pagealloc.c
> > @@ -29,19 +29,110 @@ struct page_ext_operations page_iommu_debug_ops = {
> > .need = need_iommu_debug,
> > };
> >
> > +static struct page_ext *get_iommu_page_ext(phys_addr_t phys)
> > +{
> > + struct page *page = phys_to_page(phys);
> > + struct page_ext *page_ext = page_ext_get(page);
> > +
> > + return page_ext;
> > +}
> > +
> > +static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
> > +{
> > + return page_ext_data(page_ext, &page_iommu_debug_ops);
> > +}
> > +
> > +static void iommu_debug_inc_page(phys_addr_t phys)
> > +{
> > + struct page_ext *page_ext = get_iommu_page_ext(phys);
> > + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
> > +
> > + WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0);
> > + page_ext_put(page_ext);
> > +}
> > +
> > +static void iommu_debug_dec_page(phys_addr_t phys)
> > +{
> > + struct page_ext *page_ext = get_iommu_page_ext(phys);
> > + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
> > +
> > + WARN_ON(atomic_dec_return_relaxed(&d->ref) < 0);
> > + page_ext_put(page_ext);
> > +}
> > +
> > +/*
> > + * IOMMU page size doesn't have to match the CPU page size. So, we use
> > + * the smallest IOMMU page size to refcount the pages in the vmemmap.
> > + * That is important as both map and unmap has to use the same page size
> > + * to update the refcount to avoid double counting the same page.
> > + * And as we can't know from iommu_unmap() what was the original page size
> > + * used for map, we just use the minimum supported one for both.
> > + */
> > +static size_t iommu_debug_page_size(struct iommu_domain *domain)
> > +{
> > + return 1UL << __ffs(domain->pgsize_bitmap);
> > +}
> > +
> > void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
> > {
> > + size_t off, end;
> > + size_t page_size = iommu_debug_page_size(domain);
> > +
> > + if (WARN_ON(!phys || check_add_overflow(phys, size, &end)))
> > + return;
> > +
> > + for (off = 0 ; off < size ; off += page_size) {
> > + if (!pfn_valid(__phys_to_pfn(phys + off)))
> > + continue;
> > + iommu_debug_inc_page(phys + off);
> > + }
> > +}
> > +
> > +static void __iommu_debug_update_iova(struct iommu_domain *domain,
> > + unsigned long iova, size_t size, bool inc)
> > +{
> > + size_t off, end;
> > + size_t page_size = iommu_debug_page_size(domain);
> > +
> > + if (WARN_ON(check_add_overflow(iova, size, &end)))
> > + return;
> > +
> > + for (off = 0 ; off < size ; off += page_size) {
> > + phys_addr_t phys = iommu_iova_to_phys(domain, iova + off);
> > +
> > + if (!phys || !pfn_valid(__phys_to_pfn(phys)))
> > + continue;
> > +
> > + if (inc)
> > + iommu_debug_inc_page(phys);
> > + else
> > + iommu_debug_dec_page(phys);
> > + }
>
> This might loop for too long when we're unmapping a big buffer (say 1GB)
> which is backed by multiple 4K mappings (i.e. not mapped using large
> mappings) it may hold the CPU for too long, per the above example:
>
> 1,073,741,824 / 4096 = 262,144 iterations each with an iova_to_phys walk
> in a tight loop, could hold the CPU for a little too long and could
> potentially result in soft lockups (painful to see in a debug kernel).
> Since, iommu_unmap can be called in atomic contexts (i.e. interrupts,
> spinlocks with pre-emption disabled) we cannot simply add cond_resched()
> here as well.
>
> Maybe we can cross that bridge once we get there, but if we can't solve
> the latency now, it'd be nice to explicitly document this risk
> (potential soft lockups on large unmaps) in the Kconfig or cmdline help text?
>
Yes, I am not sure how bad that would be. Looking at the code, the
closest pattern I see in that path is SWIOTLB: when it's enabled, it
also does a lot of per-page operations on unmap.
There is already a disclaimer in dmesg and in the Kconfig about the
performance overhead, and you need to enable both a config option and a
cmdline parameter to get this, so I'd expect someone enabling it to have
some expectations of what they are doing. But I can add more info to the
Kconfig help text if that makes sense.
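
Purely as an illustration of that last point, such a Kconfig note could
read along these lines. The symbol name and the surrounding entry are
assumptions here; the actual Kconfig entry lives in an earlier patch of
the series and is not shown in this thread.

config IOMMU_DEBUG_PAGEALLOC
	...
	help
	  ...
	  Note that unmapping walks the whole range at the smallest IOMMU
	  page size, doing one iova_to_phys() lookup per page. Unmapping
	  very large, 4K-granular buffers in atomic context can therefore
	  stall the CPU long enough to trigger soft-lockup reports. Only
	  enable this option for debugging.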
> > }
> >
> > void __iommu_debug_unmap_begin(struct iommu_domain *domain,
> > unsigned long iova, size_t size)
> > {
> > + __iommu_debug_update_iova(domain, iova, size, false);
> > }
> >
> > void __iommu_debug_unmap_end(struct iommu_domain *domain,
> > unsigned long iova, size_t size,
> > size_t unmapped)
> > {
> > + if (unmapped == size)
> > + return;
> > +
> > + /*
> > + * If unmap failed, re-increment the refcount, but if it unmapped
> > + * larger size, decrement the extra part.
> > + */
> > + if (unmapped < size)
> > + __iommu_debug_update_iova(domain, iova + unmapped,
> > + size - unmapped, true);
> > + else
> > + __iommu_debug_update_iova(domain, iova + size,
> > + unmapped - size, false);
> > }
>
> I'm a little concerned about this part, when we unmap more than requested,
> the __iommu_debug_update_iova relies on
> iommu_iova_to_phys(domain, iova + off) to find the physical page to
> decrement. However, since __iommu_debug_unmap_end is called *after* the
> IOMMU driver has removed the mapping (in __iommu_unmap). Thus, the
> iommu_iova_to_phys return 0 (fail) causing the loop in update_iova:
> `if (!phys ...)` to silently continue.
>
> Since the refcounts for the physical pages in the range:
> [iova + size, iova + unmapped] are never decremented. Won't this result
> in false positives (warnings about page leaks) when those pages are
> eventually freed?
>
> For example:
>
> - A driver maps a 2MB region (512 x 4KB). All 512 pgs have refcount = 1.
>
> - A driver / IOMMU-client calls iommu_unmap(iova, 4KB)
>
> - unmap_begin(4KB) calls iova_to_phys, succeeds, and decrements the
> refcount for the 1st page to 0.
>
> - __iommu_unmap calls the IOMMU driver. The driver (unable to split the
> block) zaps the entire 2MB range and returns unmapped = 2MB.
>
> - unmap_end(size=4KB, unmapped=2MB) sees that more was unmapped than
> requested & attempts to decrement refcounts for the remaining 511 pgs
>
> - __iommu_debug_update_iova is called for the remaining range, which
> ends up calling iommu_iova_to_phys. Since the mapping was destroyed,
> iova_to_phys returns 0.
>
> - The loop skips the decrement causing the remaining 511 pages to leak
> with refcount = 1.
>
Agh, yes, iova_to_phys() will always return zero there, so
__iommu_debug_update_iova() will do nothing in that case.
I am not aware of which drivers do this; I added this logic because I
saw the IOMMU core allows it. I vaguely remember it had something to do
with splitting blocks, which might be related to VFIO, but I don't think
that is needed anymore.
I am happy to just drop it, or even preemptively warn in that case, as
it is impossible to retrieve the old addresses.
And maybe that's a chance to re-evaluate whether we allow this
behaviour.
Thanks,
Mostafa
> Thanks,
> Praan
On Thu, Jan 8, 2026 at 11:06 AM Mostafa Saleh <smostafa@google.com> wrote:
>
> On Wed, Jan 07, 2026 at 03:21:41PM +0000, Pranjal Shrivastava wrote:
> > On Tue, Jan 06, 2026 at 04:21:59PM +0000, Mostafa Saleh wrote:
> > > Using the new calls, use an atomic refcount to track how many times
> > > a page is mapped in any of the IOMMUs.
> > >
> > > For unmap we need to use iova_to_phys() to get the physical address
> > > of the pages.
> > >
> > > We use the smallest supported page size as the granularity of tracking
> > > per domain.
> > > This is important as it is possible to map pages and unmap them with
> > > larger sizes (as in map_sg()) cases.
> > >
> > > Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
> > > Signed-off-by: Mostafa Saleh <smostafa@google.com>
> > > ---
> > > drivers/iommu/iommu-debug-pagealloc.c | 91 +++++++++++++++++++++++++++
> > > 1 file changed, 91 insertions(+)
> > >
> > > diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
> > > index 1d343421da98..86ccb310a4a8 100644
> > > --- a/drivers/iommu/iommu-debug-pagealloc.c
> > > +++ b/drivers/iommu/iommu-debug-pagealloc.c
> > > @@ -29,19 +29,110 @@ struct page_ext_operations page_iommu_debug_ops = {
> > > .need = need_iommu_debug,
> > > };
> > >
> > > +static struct page_ext *get_iommu_page_ext(phys_addr_t phys)
> > > +{
> > > + struct page *page = phys_to_page(phys);
> > > + struct page_ext *page_ext = page_ext_get(page);
> > > +
> > > + return page_ext;
> > > +}
> > > +
> > > +static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
> > > +{
> > > + return page_ext_data(page_ext, &page_iommu_debug_ops);
> > > +}
> > > +
> > > +static void iommu_debug_inc_page(phys_addr_t phys)
> > > +{
> > > + struct page_ext *page_ext = get_iommu_page_ext(phys);
> > > + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
> > > +
> > > + WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0);
> > > + page_ext_put(page_ext);
> > > +}
> > > +
> > > +static void iommu_debug_dec_page(phys_addr_t phys)
> > > +{
> > > + struct page_ext *page_ext = get_iommu_page_ext(phys);
> > > + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
> > > +
> > > + WARN_ON(atomic_dec_return_relaxed(&d->ref) < 0);
> > > + page_ext_put(page_ext);
> > > +}
> > > +
> > > +/*
> > > + * IOMMU page size doesn't have to match the CPU page size. So, we use
> > > + * the smallest IOMMU page size to refcount the pages in the vmemmap.
> > > + * That is important as both map and unmap has to use the same page size
> > > + * to update the refcount to avoid double counting the same page.
> > > + * And as we can't know from iommu_unmap() what was the original page size
> > > + * used for map, we just use the minimum supported one for both.
> > > + */
> > > +static size_t iommu_debug_page_size(struct iommu_domain *domain)
> > > +{
> > > + return 1UL << __ffs(domain->pgsize_bitmap);
> > > +}
> > > +
> > > void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
> > > {
> > > + size_t off, end;
> > > + size_t page_size = iommu_debug_page_size(domain);
> > > +
> > > + if (WARN_ON(!phys || check_add_overflow(phys, size, &end)))
> > > + return;
> > > +
> > > + for (off = 0 ; off < size ; off += page_size) {
> > > + if (!pfn_valid(__phys_to_pfn(phys + off)))
> > > + continue;
> > > + iommu_debug_inc_page(phys + off);
> > > + }
> > > +}
> > > +
> > > +static void __iommu_debug_update_iova(struct iommu_domain *domain,
> > > + unsigned long iova, size_t size, bool inc)
> > > +{
> > > + size_t off, end;
> > > + size_t page_size = iommu_debug_page_size(domain);
> > > +
> > > + if (WARN_ON(check_add_overflow(iova, size, &end)))
> > > + return;
> > > +
> > > + for (off = 0 ; off < size ; off += page_size) {
> > > + phys_addr_t phys = iommu_iova_to_phys(domain, iova + off);
> > > +
> > > + if (!phys || !pfn_valid(__phys_to_pfn(phys)))
> > > + continue;
> > > +
> > > + if (inc)
> > > + iommu_debug_inc_page(phys);
> > > + else
> > > + iommu_debug_dec_page(phys);
> > > + }
> >
> > This might loop for too long when we're unmapping a big buffer (say 1GB)
> > which is backed by multiple 4K mappings (i.e. not mapped using large
> > mappings) it may hold the CPU for too long, per the above example:
> >
> > 1,073,741,824 / 4096 = 262,144 iterations each with an iova_to_phys walk
> > in a tight loop, could hold the CPU for a little too long and could
> > potentially result in soft lockups (painful to see in a debug kernel).
> > Since, iommu_unmap can be called in atomic contexts (i.e. interrupts,
> > spinlocks with pre-emption disabled) we cannot simply add cond_resched()
> > here as well.
> >
> > Maybe we can cross that bridge once we get there, but if we can't solve
> > the latency now, it'd be nice to explicitly document this risk
> > (potential soft lockups on large unmaps) in the Kconfig or cmdline help text?
> >
>
> Yes, I am not sure how bad that would be, looking at the code, the closest
> pattern I see in that path is for SWIOTLB, when it’s enabled it will do a
> lot of per-page operations on unmap.
> There is a disclaimer already in dmesg and the Kconfig about the performance
> overhead, and you would need to enable a config + cmdline to get this, so
> I’d expect someone enabling it to have some expectations of what they are
> doing. But I can add more info to Kconfig if that makes sense.
>
> > > }
> > >
> > > void __iommu_debug_unmap_begin(struct iommu_domain *domain,
> > > unsigned long iova, size_t size)
> > > {
> > > + __iommu_debug_update_iova(domain, iova, size, false);
> > > }
> > >
> > > void __iommu_debug_unmap_end(struct iommu_domain *domain,
> > > unsigned long iova, size_t size,
> > > size_t unmapped)
> > > {
> > > + if (unmapped == size)
> > > + return;
> > > +
> > > + /*
> > > + * If unmap failed, re-increment the refcount, but if it unmapped
> > > + * larger size, decrement the extra part.
> > > + */
> > > + if (unmapped < size)
> > > + __iommu_debug_update_iova(domain, iova + unmapped,
> > > + size - unmapped, true);
> > > + else
> > > + __iommu_debug_update_iova(domain, iova + size,
> > > + unmapped - size, false);
> > > }
> >
> > I'm a little concerned about this part, when we unmap more than requested,
> > the __iommu_debug_update_iova relies on
> > iommu_iova_to_phys(domain, iova + off) to find the physical page to
> > decrement. However, since __iommu_debug_unmap_end is called *after* the
> > IOMMU driver has removed the mapping (in __iommu_unmap). Thus, the
> > iommu_iova_to_phys return 0 (fail) causing the loop in update_iova:
> > `if (!phys ...)` to silently continue.
> >
> > Since the refcounts for the physical pages in the range:
> > [iova + size, iova + unmapped] are never decremented. Won't this result
> > in false positives (warnings about page leaks) when those pages are
> > eventually freed?
> >
> > For example:
> >
> > - A driver maps a 2MB region (512 x 4KB). All 512 pgs have refcount = 1.
> >
> > - A driver / IOMMU-client calls iommu_unmap(iova, 4KB)
> >
> > - unmap_begin(4KB) calls iova_to_phys, succeeds, and decrements the
> > refcount for the 1st page to 0.
> >
> > - __iommu_unmap calls the IOMMU driver. The driver (unable to split the
> > block) zaps the entire 2MB range and returns unmapped = 2MB.
> >
> > - unmap_end(size=4KB, unmapped=2MB) sees that more was unmapped than
> > requested & attempts to decrement refcounts for the remaining 511 pgs
> >
> > - __iommu_debug_update_iova is called for the remaining range, which
> > ends up calling iommu_iova_to_phys. Since the mapping was destroyed,
> > iova_to_phys returns 0.
> >
> > - The loop skips the decrement causing the remaining 511 pages to leak
> > with refcount = 1.
> >
>
> Agh, yes, iova_to_phys will always return zero, so the
> __iommu_debug_update_iova() will do nothing in that case.
>
> I am not aware which drivers are doing this, I added this logic
> because I saw the IOMMU core allow it. I vaguely remember that
> had something about splitting blocks which might be related to VFIO,
> but I don't think that is needed anymore.
>
> I am happy just to drop it or even preemptively warn in that case, as
> it is impossible to retrieve the old addresses.
>
> And maybe, that's a chance to re-evaluate we allow this behviour.
>
I have this; it should have the same effect plus a WARN. I will include
it in the new version:
diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
index 5353417e64f9..64ec0795fe4c 100644
--- a/drivers/iommu/iommu-debug-pagealloc.c
+++ b/drivers/iommu/iommu-debug-pagealloc.c
@@ -146,16 +146,12 @@ void __iommu_debug_unmap_end(struct iommu_domain *domain,
if (unmapped == size)
return;
- /*
- * If unmap failed, re-increment the refcount, but if it unmapped
- * larger size, decrement the extra part.
- */
+ /* If unmap failed, re-increment the refcount. */
if (unmapped < size)
__iommu_debug_update_iova(domain, iova + unmapped,
size - unmapped, true);
else
- __iommu_debug_update_iova(domain, iova + size,
- unmapped - size, false);
+ WARN_ONCE(1, "iommu: unmap larger than requested is not supported in debug_pagealloc\n");
}
void iommu_debug_init(void)
Thanks,
Mostafa
> Thanks,
> Mostafa
>
> > Thanks,
> > Praan
On 1/8/26 19:33, Mostafa Saleh wrote:
> I have this, it should have the same effect + a WARN, I will include
> it in the new version
>
> diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
> index 5353417e64f9..64ec0795fe4c 100644
> --- a/drivers/iommu/iommu-debug-pagealloc.c
> +++ b/drivers/iommu/iommu-debug-pagealloc.c
> @@ -146,16 +146,12 @@ void __iommu_debug_unmap_end(struct iommu_domain *domain,
>  if (unmapped == size)
>  return;
>
> - /*
> - * If unmap failed, re-increment the refcount, but if it unmapped
> - * larger size, decrement the extra part.
> - */
> + /* If unmap failed, re-increment the refcount. */
>  if (unmapped < size)
>  __iommu_debug_update_iova(domain, iova + unmapped,
>  size - unmapped, true);
>  else
> - __iommu_debug_update_iova(domain, iova + size,
> - unmapped - size, false);
> + WARN_ONCE(1, "iommu: unmap larger than requested is not supported in debug_pagealloc\n");
>  }
>
>  void iommu_debug_init(void)

How about

if ((unmapped == size) || WARN_ON_ONCE(unmapped > size))
	return;

/* If unmap failed, re-increment the refcount. */
__iommu_debug_update_iova(domain, iova + unmapped, size - unmapped, true);

?

Thanks,
baolu
On Fri, Jan 09, 2026 at 11:28:32AM +0800, Baolu Lu wrote:
> On 1/8/26 19:33, Mostafa Saleh wrote:
> > I have this, it should have the same effect + a WARN, I will include
> > it in the new version
> >
> > diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
> > index 5353417e64f9..64ec0795fe4c 100644
> > --- a/drivers/iommu/iommu-debug-pagealloc.c
> > +++ b/drivers/iommu/iommu-debug-pagealloc.c
> > @@ -146,16 +146,12 @@ void __iommu_debug_unmap_end(struct iommu_domain *domain,
> >  if (unmapped == size)
> >  return;
> >
> > - /*
> > - * If unmap failed, re-increment the refcount, but if it unmapped
> > - * larger size, decrement the extra part.
> > - */
> > + /* If unmap failed, re-increment the refcount. */
> >  if (unmapped < size)
> >  __iommu_debug_update_iova(domain, iova + unmapped,
> >  size - unmapped, true);
> >  else
> > - __iommu_debug_update_iova(domain, iova + size,
> > - unmapped - size, false);
> > + WARN_ONCE(1, "iommu: unmap larger than requested is not supported in debug_pagealloc\n");
> >  }
> >
> >  void iommu_debug_init(void)
>
> How about
>
> if ((unmapped == size) || WARN_ON_ONCE(unmapped > size))
> 	return;
>
> /* If unmap failed, re-increment the refcount. */
> __iommu_debug_update_iova(domain, iova + unmapped, size - unmapped, true);
>
> ?

That's nice. We could also print the message as Mostafa mentioned, like:

if ((unmapped == size) || WARN_ON_ONCE(unmapped > size,
    "iommu: unmap larger than requested is not supported in debug_pagealloc\n"))

An explicit "This is unsupported" warning makes it clear IMHO.
If a driver triggers this, at least we know the sanitizer state isn't
supported.

Thanks,
Praan
On Fri, Jan 9, 2026 at 7:34 AM Pranjal Shrivastava <praan@google.com> wrote:
>
> On Fri, Jan 09, 2026 at 11:28:32AM +0800, Baolu Lu wrote:
> > On 1/8/26 19:33, Mostafa Saleh wrote:
> > > I have this, it should have the same effect + a WARN, I will include
> > > it in the new version
> > >
> > > diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
> > > index 5353417e64f9..64ec0795fe4c 100644
> > > --- a/drivers/iommu/iommu-debug-pagealloc.c
> > > +++ b/drivers/iommu/iommu-debug-pagealloc.c
> > > @@ -146,16 +146,12 @@ void __iommu_debug_unmap_end(struct iommu_domain *domain,
> > >  if (unmapped == size)
> > >  return;
> > >
> > > - /*
> > > - * If unmap failed, re-increment the refcount, but if it unmapped
> > > - * larger size, decrement the extra part.
> > > - */
> > > + /* If unmap failed, re-increment the refcount. */
> > >  if (unmapped < size)
> > >  __iommu_debug_update_iova(domain, iova + unmapped,
> > >  size - unmapped, true);
> > >  else
> > > - __iommu_debug_update_iova(domain, iova + size,
> > > - unmapped - size, false);
> > > + WARN_ONCE(1, "iommu: unmap larger than requested is not supported in debug_pagealloc\n");
> > >  }
> > >
> > >  void iommu_debug_init(void)
> >
> > How about
> >
> > if ((unmapped == size) || WARN_ON_ONCE(unmapped > size))
> > 	return;
> >
> > /* If unmap failed, re-increment the refcount. */
> > __iommu_debug_update_iova(domain, iova + unmapped, size - unmapped, true);
> >
> > ?
>
> That's nice. We could also print the message as Mostafa mentioned, like:
>
> if ((unmapped == size) || WARN_ON_ONCE(unmapped > size,
>     "iommu: unmap larger than requested is not supported in debug_pagealloc\n"))
>
> An explicit "This is unsupported" warning makes it clear IMHO.
> If a driver triggers this, at least we know the sanitizer state isn't
> supported.

I guess both are fine; the WARN points to the line, from which it should
be simple to deduce the cause anyway.

Thanks,
Mostafa

> Thanks,
> Praan
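
One small detail when folding the two suggestions together:
WARN_ON_ONCE() takes only a condition, so printing the "unsupported"
message as part of the check means using WARN_ONCE(), which takes a
format string and, like WARN_ON_ONCE(), evaluates to the condition. A
rough sketch of __iommu_debug_unmap_end() combining both ideas is shown
below; it is an illustration only, not the version that was actually
sent for the next revision.

void __iommu_debug_unmap_end(struct iommu_domain *domain,
			     unsigned long iova, size_t size,
			     size_t unmapped)
{
	/*
	 * Bail out on success, and warn once on the unsupported case of a
	 * driver unmapping more than it was asked to.
	 */
	if ((unmapped == size) ||
	    WARN_ONCE(unmapped > size,
		      "iommu: unmap larger than requested is not supported in debug_pagealloc\n"))
		return;

	/* If unmap failed, re-increment the refcount. */
	__iommu_debug_update_iova(domain, iova + unmapped,
				  size - unmapped, true);
}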
On Tue, Jan 6, 2026 at 8:22 AM Mostafa Saleh <smostafa@google.com> wrote:
>
> Using the new calls, use an atomic refcount to track how many times
> a page is mapped in any of the IOMMUs.
>
> For unmap we need to use iova_to_phys() to get the physical address
> of the pages.
>
> We use the smallest supported page size as the granularity of tracking
> per domain.
> This is important as it is possible to map pages and unmap them with
> larger sizes (as in map_sg()) cases.
>
> Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
> Signed-off-by: Mostafa Saleh <smostafa@google.com>
> ---
> drivers/iommu/iommu-debug-pagealloc.c | 91 +++++++++++++++++++++++++++
> 1 file changed, 91 insertions(+)
>
> diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
> index 1d343421da98..86ccb310a4a8 100644
> --- a/drivers/iommu/iommu-debug-pagealloc.c
> +++ b/drivers/iommu/iommu-debug-pagealloc.c
> @@ -29,19 +29,110 @@ struct page_ext_operations page_iommu_debug_ops = {
> .need = need_iommu_debug,
> };
>
> +static struct page_ext *get_iommu_page_ext(phys_addr_t phys)
> +{
> + struct page *page = phys_to_page(phys);
> + struct page_ext *page_ext = page_ext_get(page);
> +
> + return page_ext;
> +}
> +
> +static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
> +{
> + return page_ext_data(page_ext, &page_iommu_debug_ops);
> +}
> +
> +static void iommu_debug_inc_page(phys_addr_t phys)
> +{
> + struct page_ext *page_ext = get_iommu_page_ext(phys);
> + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
> +
> + WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0);
> + page_ext_put(page_ext);
> +}
> +
> +static void iommu_debug_dec_page(phys_addr_t phys)
> +{
> + struct page_ext *page_ext = get_iommu_page_ext(phys);
> + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
> +
> + WARN_ON(atomic_dec_return_relaxed(&d->ref) < 0);
> + page_ext_put(page_ext);
> +}
> +
> +/*
> + * IOMMU page size doesn't have to match the CPU page size. So, we use
> + * the smallest IOMMU page size to refcount the pages in the vmemmap.
> + * That is important as both map and unmap has to use the same page size
> + * to update the refcount to avoid double counting the same page.
> + * And as we can't know from iommu_unmap() what was the original page size
> + * used for map, we just use the minimum supported one for both.
> + */
> +static size_t iommu_debug_page_size(struct iommu_domain *domain)
> +{
> + return 1UL << __ffs(domain->pgsize_bitmap);
> +}
> +
> void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
> {
> + size_t off, end;
> + size_t page_size = iommu_debug_page_size(domain);
> +
> + if (WARN_ON(!phys || check_add_overflow(phys, size, &end)))
> + return;
> +
> + for (off = 0 ; off < size ; off += page_size) {
> + if (!pfn_valid(__phys_to_pfn(phys + off)))
> + continue;
> + iommu_debug_inc_page(phys + off);
> + }
> +}
> +
> +static void __iommu_debug_update_iova(struct iommu_domain *domain,
> + unsigned long iova, size_t size, bool inc)
> +{
> + size_t off, end;
> + size_t page_size = iommu_debug_page_size(domain);
> +
> + if (WARN_ON(check_add_overflow(iova, size, &end)))
> + return;
> +
> + for (off = 0 ; off < size ; off += page_size) {
> + phys_addr_t phys = iommu_iova_to_phys(domain, iova + off);
> +
> + if (!phys || !pfn_valid(__phys_to_pfn(phys)))
> + continue;
> +
> + if (inc)
> + iommu_debug_inc_page(phys);
> + else
> + iommu_debug_dec_page(phys);
> + }
> }
>
> void __iommu_debug_unmap_begin(struct iommu_domain *domain,
> unsigned long iova, size_t size)
> {
> + __iommu_debug_update_iova(domain, iova, size, false);
> }
>
> void __iommu_debug_unmap_end(struct iommu_domain *domain,
> unsigned long iova, size_t size,
> size_t unmapped)
> {
> + if (unmapped == size)
> + return;
> +
> + /*
> + * If unmap failed, re-increment the refcount, but if it unmapped
> + * larger size, decrement the extra part.
> + */
> + if (unmapped < size)
> + __iommu_debug_update_iova(domain, iova + unmapped,
> + size - unmapped, true);
> + else
> + __iommu_debug_update_iova(domain, iova + size,
> + unmapped - size, false);
> }
>
> void iommu_debug_init(void)
> --
> 2.52.0.351.gbe84eed79e-goog
>
>
Reviewed-by: Samiullah Khawaja <skhawaja@google.com>