Using the new calls, use an atomic refcount to track how many times
a page is mapped in any of the IOMMUs.
For unmap, we need to use iommu_iova_to_phys() to get the physical
address of the pages.
We use the smallest supported page size as the granularity of tracking
per domain.
This is important as it is possible to map pages and unmap them with
larger sizes (as in the map_sg() case).
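
A worked example of the granularity rule (illustrative sizes, not taken
from the patch): with pgsize_bitmap = SZ_4K | SZ_2M, the tracking
granularity is 4K, so mapping and unmapping 2M always touches the same
512 per-page refcounts, whatever block sizes the driver used internally:

	iommu_map(domain, iova, phys, SZ_2M, prot, GFP_KERNEL); /* +1 on 512 4K pages */
	iommu_unmap(domain, iova, SZ_2M);                       /* -1 on the same 512 pages */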
Reviewed-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Mostafa Saleh <smostafa@google.com>
---
drivers/iommu/iommu-debug-pagealloc.c | 84 +++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
index 1d343421da98..9eb49e1230ee 100644
--- a/drivers/iommu/iommu-debug-pagealloc.c
+++ b/drivers/iommu/iommu-debug-pagealloc.c
@@ -29,19 +29,103 @@ struct page_ext_operations page_iommu_debug_ops = {
.need = need_iommu_debug,
};
+static struct page_ext *get_iommu_page_ext(phys_addr_t phys)
+{
+ struct page *page = phys_to_page(phys);
+ struct page_ext *page_ext = page_ext_get(page);
+
+ return page_ext;
+}
+
+static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
+{
+ return page_ext_data(page_ext, &page_iommu_debug_ops);
+}
+
+static void iommu_debug_inc_page(phys_addr_t phys)
+{
+ struct page_ext *page_ext = get_iommu_page_ext(phys);
+ struct iommu_debug_metadata *d = get_iommu_data(page_ext);
+
+ WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0);
+ page_ext_put(page_ext);
+}
+
+static void iommu_debug_dec_page(phys_addr_t phys)
+{
+ struct page_ext *page_ext = get_iommu_page_ext(phys);
+ struct iommu_debug_metadata *d = get_iommu_data(page_ext);
+
+ WARN_ON(atomic_dec_return_relaxed(&d->ref) < 0);
+ page_ext_put(page_ext);
+}
+
+/*
+ * IOMMU page size doesn't have to match the CPU page size. So, we use
+ * the smallest IOMMU page size to refcount the pages in the vmemmap.
+ * That is important as both map and unmap have to use the same page size
+ * to update the refcount to avoid double counting the same page.
+ * And as we can't know from iommu_unmap() what was the original page size
+ * used for map, we just use the minimum supported one for both.
+ */
+static size_t iommu_debug_page_size(struct iommu_domain *domain)
+{
+ return 1UL << __ffs(domain->pgsize_bitmap);
+}
+
void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
{
+ size_t off, end;
+ size_t page_size = iommu_debug_page_size(domain);
+
+ if (WARN_ON(!phys || check_add_overflow(phys, size, &end)))
+ return;
+
+ for (off = 0 ; off < size ; off += page_size) {
+ if (!pfn_valid(__phys_to_pfn(phys + off)))
+ continue;
+ iommu_debug_inc_page(phys + off);
+ }
+}
+
+static void __iommu_debug_update_iova(struct iommu_domain *domain,
+ unsigned long iova, size_t size, bool inc)
+{
+ size_t off, end;
+ size_t page_size = iommu_debug_page_size(domain);
+
+ if (WARN_ON(check_add_overflow(iova, size, &end)))
+ return;
+
+ for (off = 0 ; off < size ; off += page_size) {
+ phys_addr_t phys = iommu_iova_to_phys(domain, iova + off);
+
+ if (!phys || !pfn_valid(__phys_to_pfn(phys)))
+ continue;
+
+ if (inc)
+ iommu_debug_inc_page(phys);
+ else
+ iommu_debug_dec_page(phys);
+ }
}
void __iommu_debug_unmap_begin(struct iommu_domain *domain,
unsigned long iova, size_t size)
{
+ __iommu_debug_update_iova(domain, iova, size, false);
}
void __iommu_debug_unmap_end(struct iommu_domain *domain,
unsigned long iova, size_t size,
size_t unmapped)
{
+ if ((unmapped == size) || WARN_ON_ONCE(unmapped > size))
+ return;
+
+ /* If unmap failed, re-increment the refcount. */
+ __iommu_debug_update_iova(domain, iova + unmapped,
+ size - unmapped, true);
}
void iommu_debug_init(void)
--
2.52.0.457.g6b5491de43-goog
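
The call sites for these hooks live in an earlier patch of the series;
a minimal sketch of the assumed wiring in the core map/unmap paths
(helper names and exact placement are assumptions, not from this diff):

	/* map path: account only what was actually mapped */
	ret = __iommu_map(domain, iova, paddr, size, prot, gfp);
	if (!ret)
		__iommu_debug_map(domain, paddr, size);

	/* unmap path: drop refs while iova->phys can still be resolved ... */
	__iommu_debug_unmap_begin(domain, iova, size);
	unmapped = __iommu_unmap(domain, iova, size);
	/* ... and re-add refs for any tail the driver failed to unmap */
	__iommu_debug_unmap_end(domain, iova, size, unmapped);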
On Fri, Jan 09, 2026 at 05:18:04PM +0000, Mostafa Saleh wrote:
> +static struct page_ext *get_iommu_page_ext(phys_addr_t phys)
> +{
> + struct page *page = phys_to_page(phys);
> + struct page_ext *page_ext = page_ext_get(page);
> +
> + return page_ext;
> +}
> +
> +static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
> +{
> + return page_ext_data(page_ext, &page_iommu_debug_ops);
> +}
> +
> +static void iommu_debug_inc_page(phys_addr_t phys)
> +{
> + struct page_ext *page_ext = get_iommu_page_ext(phys);
> + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
You cannot do this - phys_to_page() can only be called if we already
know that phys is a struct page backed item and by the time you get
here that information is lost.
Probably the only way to resolve this is to somehow pass in an iommu
prot flag that can tell the difference between struct page and
non-struct page addresses.
But I have to NAK this approach of blindly calling phys_to_page().
Jason
On Fri, Jan 9, 2026 at 7:51 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Fri, Jan 09, 2026 at 05:18:04PM +0000, Mostafa Saleh wrote:
> > +static struct page_ext *get_iommu_page_ext(phys_addr_t phys)
> > +{
> > + struct page *page = phys_to_page(phys);
> > + struct page_ext *page_ext = page_ext_get(page);
> > +
> > + return page_ext;
> > +}
> > +
> > +static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
> > +{
> > + return page_ext_data(page_ext, &page_iommu_debug_ops);
> > +}
> > +
> > +static void iommu_debug_inc_page(phys_addr_t phys)
> > +{
> > + struct page_ext *page_ext = get_iommu_page_ext(phys);
> > + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
>
> You cannot do this - phys_to_page() can only be called if we already
> know that phys is a struct page backed item and by the time you get
> here that information is lost.
>
> Probably the only way to resolve this is to somehow pass in an iommu
> prot flag that can tell the difference between struct page and
> non-struct page addresses.
>
> But I have to NAK this approach of blindly calling phys_to_page().
>
The callers of this will first check "pfn_valid", which is the right
check AFAICT (looking at similar patterns in page_owner for example).
Thanks,
Mostafa
> Jason
>
On Mon, Jan 12, 2026 at 10:20:14AM +0000, Mostafa Saleh wrote:
> On Fri, Jan 9, 2026 at 7:51 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Fri, Jan 09, 2026 at 05:18:04PM +0000, Mostafa Saleh wrote:
> > > +static struct page_ext *get_iommu_page_ext(phys_addr_t phys)
> > > +{
> > > + struct page *page = phys_to_page(phys);
> > > + struct page_ext *page_ext = page_ext_get(page);
> > > +
> > > + return page_ext;
> > > +}
> > > +
> > > +static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
> > > +{
> > > + return page_ext_data(page_ext, &page_iommu_debug_ops);
> > > +}
> > > +
> > > +static void iommu_debug_inc_page(phys_addr_t phys)
> > > +{
> > > + struct page_ext *page_ext = get_iommu_page_ext(phys);
> > > + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
> >
> > You cannot do this - phys_to_page() can only be called if we already
> > know that phys is a struct page backed item and by the time you get
> > here that information is lost.
> >
> > Probably the only way to resolve this is to somehow pass in an iommu
> > prot flag that can tell the difference between struct page and
> > non-struct page addresses.
> >
> > But I have to NAK this approach of blindly calling phys_to_page().
>
> The callers of this will first check "pfn_valid", which is the right
> check AFAICT (looking at similar patterns in page_owner for example).
I'm not sure pfn_valid really works in all cases, it has a number of
exclusions that can be relevant when phys_addr_t can be an MMIO
address..
So far we haven't been using it in the DMA paths at all, I'm not so
keen to see that start..
Jason
On Mon, Jan 12, 2026 at 1:32 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Mon, Jan 12, 2026 at 10:20:14AM +0000, Mostafa Saleh wrote:
> > On Fri, Jan 9, 2026 at 7:51 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > >
> > > On Fri, Jan 09, 2026 at 05:18:04PM +0000, Mostafa Saleh wrote:
> > > > +static struct page_ext *get_iommu_page_ext(phys_addr_t phys)
> > > > +{
> > > > + struct page *page = phys_to_page(phys);
> > > > + struct page_ext *page_ext = page_ext_get(page);
> > > > +
> > > > + return page_ext;
> > > > +}
> > > > +
> > > > +static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
> > > > +{
> > > > + return page_ext_data(page_ext, &page_iommu_debug_ops);
> > > > +}
> > > > +
> > > > +static void iommu_debug_inc_page(phys_addr_t phys)
> > > > +{
> > > > + struct page_ext *page_ext = get_iommu_page_ext(phys);
> > > > + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
> > >
> > > You cannot do this - phys_to_page() can only be called if we already
> > > know that phys is a struct page backed item and by the time you get
> > > here that information is lost.
> > >
> > > Probably the only way to resolve this is to somehow pass in an iommu
> > > prot flag that can tell the difference between struct page and
> > > non-struct page addresses.
> > >
> > > But I have to NAK this approach of blindly calling phys_to_page().
> >
> > The callers of this will first check "pfn_valid", which is the right
> > check AFAICT (looking at similar patterns in page_owner for example).
>
> I'm not sure pfn_valid really works in all cases, it has a number of
> exclusions that can be relevant when phys_addr_t can be an MMIO
> address..
>
> So far we haven't been using it in the DMA paths at all, I'm not so
> keen to see that start..
>
But I don’t see why not. From the documentation:
/**
* pfn_valid - check if there is a valid memory map entry for a PFN
* @pfn: the page frame number to check
*
* Check if there is a valid memory map entry aka struct page for the @pfn.
* Note, that availability of the memory map entry does not imply that
* there is actual usable memory at that @pfn. The struct page may
* represent a hole or an unusable page frame.
…
That means that struct page exists, which is all we need here.
I can see many places have the same pattern in the kernel already, for example:
- vfio_iommu_type1.c, is_invalid_reserved_pfn() which does the same
check which can include MMIO and then get the page struct.
- kvm_main.c: in __kvm_vcpu_map(), it distinguishes MMIO from memory
and then accesses the page struct.
Thanks,
Mostafa
> Jason
On Mon, Jan 12, 2026 at 01:43:41PM +0000, Mostafa Saleh wrote:
> But I don’t see why not. From the documentation:
> /**
> * pfn_valid - check if there is a valid memory map entry for a PFN
> * @pfn: the page frame number to check
> *
> * Check if there is a valid memory map entry aka struct page for the @pfn.
> * Note, that availability of the memory map entry does not imply that
> * there is actual usable memory at that @pfn. The struct page may
> * represent a hole or an unusable page frame.
> …
>
> That means that struct page exists, which is all we need here.

A struct page that has never been initialized shouldn't ever be read. I
don't know how that relates to page_ext, but are you really sure that
is all you need?

> I can see many places have the same pattern in the kernel already, for example:
> - vfio_iommu_type1.c, is_invalid_reserved_pfn() which does the same
> check which can include MMIO and then get the page struct.

This whole flow is nonsensical and wrong though, I wouldn't point to
it as something reliable.

> - kvm_main.c: in __kvm_vcpu_map(), it distinguishes MMIO from memory
> and then accesses the page struct.

That sure looks sketchy to me.. Eg if CONFIG_WANT_PAGE_VIRTUAL is
set and you try to feed an MMIO address through that kmap() it will
explode.

KVM can argue that it doesn't work with CONFIG_WANT_PAGE_VIRTUAL but
iommu cannot.

So, again, IDK, we are trying not to use pfn_valid() in the DMA code.

Jason
On Mon, Jan 12, 2026 at 1:52 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Mon, Jan 12, 2026 at 01:43:41PM +0000, Mostafa Saleh wrote:
> > But I don’t see why not. From the documentation:
> > /**
> > * pfn_valid - check if there is a valid memory map entry for a PFN
> > * @pfn: the page frame number to check
> > *
> > * Check if there is a valid memory map entry aka struct page for the @pfn.
> > * Note, that availability of the memory map entry does not imply that
> > * there is actual usable memory at that @pfn. The struct page may
> > * represent a hole or an unusable page frame.
> > …
> >
> > That means that struct page exists, which is all we need here.
>
> A struct page that has never been initialized shouldn't ever be read. I
> don't know how that relates to page_ext, but are you really sure that
> is all you need?
>

AFAIU, if pfn_valid() returns true, it means the struct page is valid,
and lookup_page_ext() will check that a valid page_ext exists for this
entry.
So, what is missing is the NULL check for the page_ext returned, as it
can be NULL even if pfn_valid() was true.

But I can't see why we shouldn't use pfn_valid() at all in that path.
I don't like the approach of using the prot to check that, as the
driver can be buggy, which is what the sanitizer is defending against.
If we find some CONFIGs conflicting with it, we can just express that
in Kconfig and disable the sanitizer in that case.

> > I can see many places have the same pattern in the kernel already, for example:
> > - vfio_iommu_type1.c, is_invalid_reserved_pfn() which does the same
> > check which can include MMIO and then get the page struct.
>
> This whole flow is nonsensical and wrong though, I wouldn't point to
> it as something reliable.
>
> > - kvm_main.c: in __kvm_vcpu_map(), it distinguishes MMIO from memory
> > and then accesses the page struct.
>
> That sure looks sketchy to me.. Eg if CONFIG_WANT_PAGE_VIRTUAL is
> set and you try to feed an MMIO address through that kmap() it will
> explode.
>
> KVM can argue that it doesn't work with CONFIG_WANT_PAGE_VIRTUAL but
> iommu cannot.
>

WANT_PAGE_VIRTUAL seems possible in loongarch which supports KVM.

Thanks,
Mostafa

> So, again, IDK, we are trying not to use pfn_valid() in the DMA code.
>
> Jason
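
Concretely, the missing NULL check described above would slot into the
v6 helper roughly like this (a sketch against the posted code, not a
new revision of the patch):

	static void iommu_debug_inc_page(phys_addr_t phys)
	{
		struct page_ext *page_ext = get_iommu_page_ext(phys);
		struct iommu_debug_metadata *d;

		/* page_ext can be NULL even when pfn_valid() was true */
		if (!page_ext)
			return;

		d = get_iommu_data(page_ext);
		WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0);
		page_ext_put(page_ext);
	}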
On 1/12/26 15:58, Mostafa Saleh wrote:
> On Mon, Jan 12, 2026 at 1:52 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>>
>> On Mon, Jan 12, 2026 at 01:43:41PM +0000, Mostafa Saleh wrote:
>>> But I don’t see why not. From the documentation:
>>> /**
>>> * pfn_valid - check if there is a valid memory map entry for a PFN
>>> * @pfn: the page frame number to check
>>> *
>>> * Check if there is a valid memory map entry aka struct page for the @pfn.
>>> * Note, that availability of the memory map entry does not imply that
>>> * there is actual usable memory at that @pfn. The struct page may
>>> * represent a hole or an unusable page frame.
>>> …
>>>
>>> That means that struct page exists, which is all we need here.
>>
>> A struct page that has never been initialized shouldn't ever be read. I
>> don't know how that relates to page_ext, but are you really sure that
>> is all you need?
>>
>
> AFAIU, if pfn_valid() returns true, it means the struct page is valid,
> and lookup_page_ext() will check that a valid page_ext exists for this
> entry.

Not always. Offline memory blocks have a memory map but no page ext. We
allocate the page ext at memory onlining time.

Also, I'm not sure about ZONE_DEVICE memory, very likely we never
allocate a page_ext for them?

I'd assume both cases are not relevant for your use case, though.

--
Cheers

David
On Mon, Jan 12, 2026 at 7:12 PM David Hildenbrand (Red Hat)
<david@kernel.org> wrote:
>
> On 1/12/26 15:58, Mostafa Saleh wrote:
> > On Mon, Jan 12, 2026 at 1:52 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >>
> >> On Mon, Jan 12, 2026 at 01:43:41PM +0000, Mostafa Saleh wrote:
> >>> But I don’t see why not. From the documentation:
> >>> /**
> >>> * pfn_valid - check if there is a valid memory map entry for a PFN
> >>> * @pfn: the page frame number to check
> >>> *
> >>> * Check if there is a valid memory map entry aka struct page for the @pfn.
> >>> * Note, that availability of the memory map entry does not imply that
> >>> * there is actual usable memory at that @pfn. The struct page may
> >>> * represent a hole or an unusable page frame.
> >>> …
> >>>
> >>> That means that struct page exists, which is all we need here.
> >>
> >> A struct page that has never been initialized shouldn't ever be read. I
> >> don't know how that relates to page_ext, but are you really sure that
> >> is all you need?
> >>
> >
> > AFAIU, if pfn_valid() returns true, it means the struct page is valid,
> > and lookup_page_ext() will check that a valid page_ext exists for this
> > entry.
>
> Not always. Offline memory blocks have a memory map but no page ext. We
> allocate the page ext at memory onlining time.
>
> Also, I'm not sure about ZONE_DEVICE memory, very likely we never
> allocate a page_ext for them?
>
> I'd assume both cases are not relevant for your use case, though.
>

From my understanding, in that case, page_ext_get() will return NULL.

So, as long as struct page exists, page_ext_get won't misbehave.

I am not sure about offline memory, but MMIO can be used. We use
pfn_valid() before getting the struct page that we pass to page_ext.
Would that be OK?

Thanks,
Mostafa

> --
> Cheers
>
> David
On 1/13/26 11:22, Mostafa Saleh wrote:
> On Mon, Jan 12, 2026 at 7:12 PM David Hildenbrand (Red Hat)
> <david@kernel.org> wrote:
>>
>> On 1/12/26 15:58, Mostafa Saleh wrote:
>>> On Mon, Jan 12, 2026 at 1:52 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>>>>
>>>> On Mon, Jan 12, 2026 at 01:43:41PM +0000, Mostafa Saleh wrote:
>>>>> But I don’t see why not. From the documentation:
>>>>> /**
>>>>> * pfn_valid - check if there is a valid memory map entry for a PFN
>>>>> * @pfn: the page frame number to check
>>>>> *
>>>>> * Check if there is a valid memory map entry aka struct page for the @pfn.
>>>>> * Note, that availability of the memory map entry does not imply that
>>>>> * there is actual usable memory at that @pfn. The struct page may
>>>>> * represent a hole or an unusable page frame.
>>>>> …
>>>>>
>>>>> That means that struct page exists, which is all we need here.
>>>>
>>>> A struct page that has never been initialized shouldn't ever be read. I
>>>> don't know how that relates to page_ext, but are you really sure that
>>>> is all you need?
>>>>
>>>
>>> AFAIU, if pfn_valid() returns true, it means the struct page is valid,
>>> and lookup_page_ext() will check that a valid page_ext exists for this
>>> entry.
>>
>> Not always. Offline memory blocks have a memory map but no page ext. We
>> allocate the page ext at memory onlining time.
>>
>> Also, I'm not sure about ZONE_DEVICE memory, very likely we never
>> allocate a page_ext for them?
>>
>> I'd assume both cases are not relevant for your use case, though.
>>
>
> From my understanding, in that case, page_ext_get() will return NULL.
>
> So, as long as struct page exists, page_ext_get won't misbehave.
>
> I am not sure about offline memory, but MMIO can be used. We use
> pfn_valid() before getting the struct page that we pass to page_ext.
> Would that be OK?

It's tricky. If you look at lookup_page_ext(), it relies on extracting
the pfn+nid from the "struct page".

If the "struct page" is uninitialized (e.g., offline memory) that cannot
possibly work, as it could just give you random garbage.

(note that there are two implementations of lookup_page_ext(), both
extracting the PFN but only one extracting the NID).

--
Cheers

David
On Tue, Jan 13, 2026 at 10:31 AM David Hildenbrand (Red Hat)
<david@kernel.org> wrote:
>
> On 1/13/26 11:22, Mostafa Saleh wrote:
> > On Mon, Jan 12, 2026 at 7:12 PM David Hildenbrand (Red Hat)
> > <david@kernel.org> wrote:
> >>
> >> On 1/12/26 15:58, Mostafa Saleh wrote:
> >>> On Mon, Jan 12, 2026 at 1:52 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >>>>
> >>>> On Mon, Jan 12, 2026 at 01:43:41PM +0000, Mostafa Saleh wrote:
> >>>>> But I don’t see why not. From the documentation:
> >>>>> /**
> >>>>> * pfn_valid - check if there is a valid memory map entry for a PFN
> >>>>> * @pfn: the page frame number to check
> >>>>> *
> >>>>> * Check if there is a valid memory map entry aka struct page for the @pfn.
> >>>>> * Note, that availability of the memory map entry does not imply that
> >>>>> * there is actual usable memory at that @pfn. The struct page may
> >>>>> * represent a hole or an unusable page frame.
> >>>>> …
> >>>>>
> >>>>> That means that struct page exists, which is all we need here.
> >>>>
> >>>> A struct page that has never been initialized shouldn't ever be read. I
> >>>> don't know how that relates to page_ext, but are you really sure that
> >>>> is all you need?
> >>>>
> >>>
> >>> AFAIU, if pfn_valid() returns true, it means the struct page is valid,
> >>> and lookup_page_ext() will check that a valid page_ext exists for this
> >>> entry.
> >>
> >> Not always. Offline memory blocks have a memory map but no page ext. We
> >> allocate the page ext at memory onlining time.
> >>
> >> Also, I'm not sure about ZONE_DEVICE memory, very likely we never
> >> allocate a page_ext for them?
> >>
> >> I'd assume both cases are not relevant for your use case, though.
> >>
> >
> > From my understanding, in that case, page_ext_get() will return NULL.
> >
> > So, as long as struct page exists, page_ext_get won't misbehave.
> >
> > I am not sure about offline memory, but MMIO can be used. We use
> > pfn_valid() before getting the struct page that we pass to page_ext.
> > Would that be OK?
>
> It's tricky. If you look at lookup_page_ext(), it relies on extracting
> the pfn+nid from the "struct page".
>
> If the "struct page" is uninitialized (e.g., offline memory) that cannot
> possibly work, as it could just give you random garbage.
>
> (note that there are two implementations of lookup_page_ext(), both
> extracting the PFN but only one extracting the NID).
>

I see, thanks for the clarification.

I see that pfn_to_online_page() exists, which seems to handle online
memory and ZONE_DEVICE. Would that be a suitable alternative?

Would you have a problem if we added a new function in page_ext
"page_ext_from_phys()" as Jason suggested?

Thanks,
Mostafa

> --
> Cheers
>
> David
>
On Tue, Jan 13, 2026 at 10:49:28AM +0000, Mostafa Saleh wrote:
> Would you have a problem if we added a new function in page_ext
> "page_ext_from_phys()" as Jason suggested?

Given the hidden complexity that David just pointed out I think it
is essential to encapsulate this in a function.

The function must be able to accept any phys_addr_t and safely return
NULL if page_ext cannot be used for some reason.

Jason
On 1/13/26 16:08, Jason Gunthorpe wrote:
> On Tue, Jan 13, 2026 at 10:49:28AM +0000, Mostafa Saleh wrote:
>> Would you have a problem if we added a new function in page_ext
>> "page_ext_from_phys()" as Jason suggested?
>
> Given the hidden complexity that David just pointed out I think it
> is essential to encapsulate this in a function.
>
> The function must be able to accept any phys_addr_t and safely return
> NULL if page_ext cannot be used for some reason.

Right. I think pfn_to_online_page() is the appropriate check.

@Mostafa, I saw you already sent a v7, is the pfn_to_online_page() stuff
handled in there?

--
Cheers

David
On Wed, Jan 14, 2026 at 5:30 PM David Hildenbrand (Red Hat)
<david@kernel.org> wrote:
>
> On 1/13/26 16:08, Jason Gunthorpe wrote:
> > On Tue, Jan 13, 2026 at 10:49:28AM +0000, Mostafa Saleh wrote:
> >> Would you have a problem if we added a new function in page_ext
> >> "page_ext_from_phys()" as Jason suggested?
> >
> > Given the hidden complexity that David just pointed out I think it
> > is essential to encapsulate this in a function.
> >
> > The function must be able to accept any phys_addr_t and safely return
> > NULL if page_ext cannot be used for some reason.
>
> Right. I think pfn_to_online_page() is the appropriate check.
>
> @Mostafa, I saw you already sent a v7, is the pfn_to_online_page() stuff
> handled in there?
>

Yes, it's in the first patch in function page_ext_get_phys(). After
spending some time looking into it, that was my conclusion also.

Thanks,
Mostafa

> --
> Cheers
>
> David
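
The v7 code itself is not quoted in this thread; based on the agreement
above, page_ext_get_phys() presumably ends up close to the following
sketch (built around pfn_to_online_page() as discussed here; not the
actual v7 patch):

	static struct page_ext *page_ext_get_phys(phys_addr_t phys)
	{
		struct page *page = pfn_to_online_page(PHYS_PFN(phys));

		/* NULL for MMIO, memory map holes, offline sections, ZONE_DEVICE */
		if (!page)
			return NULL;

		/* May still return NULL if no page_ext was allocated for the section. */
		return page_ext_get(page);
	}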
On Mon, Jan 12, 2026 at 08:11:50PM +0100, David Hildenbrand (Red Hat) wrote:
> On 1/12/26 15:58, Mostafa Saleh wrote:
> > On Mon, Jan 12, 2026 at 1:52 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > >
> > > On Mon, Jan 12, 2026 at 01:43:41PM +0000, Mostafa Saleh wrote:
> > > > But I don’t see why not. From the documentation:
> > > > /**
> > > > * pfn_valid - check if there is a valid memory map entry for a PFN
> > > > * @pfn: the page frame number to check
> > > > *
> > > > * Check if there is a valid memory map entry aka struct page for the @pfn.
> > > > * Note, that availability of the memory map entry does not imply that
> > > > * there is actual usable memory at that @pfn. The struct page may
> > > > * represent a hole or an unusable page frame.
> > > > …
> > > >
> > > > That means that struct page exists, which is all we need here.
> > >
> > > A struct page that has never been initialized shouldn't ever be read. I
> > > don't know how that relates to page_ext, but are you really sure that
> > > is all you need?
> > >
> >
> > AFAIU, if pfn_valid() returns true, it means the struct page is valid,
> > and lookup_page_ext() will check that a valid page_ext exists for this
> > entry.
>
> Not always. Offline memory blocks have a memory map but no page ext. We
> allocate the page ext at memory onlining time.
>
> Also, I'm not sure about ZONE_DEVICE memory, very likely we never allocate a
> page_ext for them?
>
> I'd assume both cases are not relevant for your use case, though.

They are in the sense that those PFNs can get into these routines and
still need to be handled in some appropriate way..

Jason
On Mon, Jan 12, 2026 at 02:58:47PM +0000, Mostafa Saleh wrote:
> On Mon, Jan 12, 2026 at 1:52 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Mon, Jan 12, 2026 at 01:43:41PM +0000, Mostafa Saleh wrote:
> > > But I don’t see why not. From the documentation:
> > > /**
> > > * pfn_valid - check if there is a valid memory map entry for a PFN
> > > * @pfn: the page frame number to check
> > > *
> > > * Check if there is a valid memory map entry aka struct page for the @pfn.
> > > * Note, that availability of the memory map entry does not imply that
> > > * there is actual usable memory at that @pfn. The struct page may
> > > * represent a hole or an unusable page frame.
> > > …
> > >
> > > That means that struct page exists, which is all we need here.
> >
> > A struct page that has never been initialized shouldn't ever be read. I
> > don't know how that relates to page_ext, but are you really sure that
> > is all you need?
> >
>
> AFAIU, if pfn_valid() returns true, it means the struct page is valid,
> and lookup_page_ext() will check that a valid page_ext exists for this
> entry.
> So, what is missing is the NULL check for the page_ext returned, as it
> can be NULL even if pfn_valid() was true.
>
> But I can't see why we shouldn't use pfn_valid() at all in that path.
> I don't like the approach of using the prot to check that, as the
> driver can be buggy, which is what the sanitizer is defending against.
> If we find some CONFIGs conflicting with it, we can just express that
> in Kconfig and disable the sanitizer in that case.

That's a fair point. How about adding a page_ext_from_phys() kind of
function that could use pfn_valid internally and has documented
semantics for what it even does for "holes" in the page map?

I'd be happier to see such a well defined API than randomly adding
pfn_valid() and phys_to_page() when we are trying hard to not have
those things in these paths.

> > That sure looks sketchy to me.. Eg if CONFIG_WANT_PAGE_VIRTUAL is
> > set and you try to feed an MMIO address through that kmap() it will
> > explode.
> >
> > KVM can argue that it doesn't work with CONFIG_WANT_PAGE_VIRTUAL but
> > iommu cannot.
>
> WANT_PAGE_VIRTUAL seems possible in loongarch which supports KVM.

Yikes, maybe that configuration doesn't run KVM?

Jason
On Fri, Jan 09, 2026 at 05:18:04PM +0000, Mostafa Saleh wrote:
> Using the new calls, use an atomic refcount to track how many times
> a page is mapped in any of the IOMMUs.
>
> For unmap, we need to use iommu_iova_to_phys() to get the physical address
> of the pages.
>
> We use the smallest supported page size as the granularity of tracking
> per domain.
> This is important as it is possible to map pages and unmap them with
> larger sizes (as in the map_sg() case).
>
> Reviewed-by: Samiullah Khawaja <skhawaja@google.com>
> Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
> Signed-off-by: Mostafa Saleh <smostafa@google.com>
> ---
> drivers/iommu/iommu-debug-pagealloc.c | 84 +++++++++++++++++++++++++++
> 1 file changed, 84 insertions(+)
>
> diff --git a/drivers/iommu/iommu-debug-pagealloc.c b/drivers/iommu/iommu-debug-pagealloc.c
> index 1d343421da98..9eb49e1230ee 100644
> --- a/drivers/iommu/iommu-debug-pagealloc.c
> +++ b/drivers/iommu/iommu-debug-pagealloc.c
> @@ -29,19 +29,103 @@ struct page_ext_operations page_iommu_debug_ops = {
> .need = need_iommu_debug,
> };
>
> +static struct page_ext *get_iommu_page_ext(phys_addr_t phys)
> +{
> + struct page *page = phys_to_page(phys);
> + struct page_ext *page_ext = page_ext_get(page);
> +
> + return page_ext;
> +}
> +
> +static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
> +{
> + return page_ext_data(page_ext, &page_iommu_debug_ops);
> +}
> +
> +static void iommu_debug_inc_page(phys_addr_t phys)
> +{
> + struct page_ext *page_ext = get_iommu_page_ext(phys);
> + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
> +
> + WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0);
> + page_ext_put(page_ext);
> +}
> +
> +static void iommu_debug_dec_page(phys_addr_t phys)
> +{
> + struct page_ext *page_ext = get_iommu_page_ext(phys);
> + struct iommu_debug_metadata *d = get_iommu_data(page_ext);
> +
> + WARN_ON(atomic_dec_return_relaxed(&d->ref) < 0);
> + page_ext_put(page_ext);
> +}
> +
> +/*
> + * IOMMU page size doesn't have to match the CPU page size. So, we use
> + * the smallest IOMMU page size to refcount the pages in the vmemmap.
> + * That is important as both map and unmap have to use the same page size
> + * to update the refcount to avoid double counting the same page.
> + * And as we can't know from iommu_unmap() what was the original page size
> + * used for map, we just use the minimum supported one for both.
> + */
> +static size_t iommu_debug_page_size(struct iommu_domain *domain)
> +{
> + return 1UL << __ffs(domain->pgsize_bitmap);
> +}
> +
> void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
> {
> + size_t off, end;
> + size_t page_size = iommu_debug_page_size(domain);
> +
> + if (WARN_ON(!phys || check_add_overflow(phys, size, &end)))
> + return;
> +
> + for (off = 0 ; off < size ; off += page_size) {
> + if (!pfn_valid(__phys_to_pfn(phys + off)))
> + continue;
> + iommu_debug_inc_page(phys + off);
> + }
> +}
> +
> +static void __iommu_debug_update_iova(struct iommu_domain *domain,
> + unsigned long iova, size_t size, bool inc)
> +{
> + size_t off, end;
> + size_t page_size = iommu_debug_page_size(domain);
> +
> + if (WARN_ON(check_add_overflow(iova, size, &end)))
> + return;
> +
> + for (off = 0 ; off < size ; off += page_size) {
> + phys_addr_t phys = iommu_iova_to_phys(domain, iova + off);
> +
> + if (!phys || !pfn_valid(__phys_to_pfn(phys)))
> + continue;
> +
> + if (inc)
> + iommu_debug_inc_page(phys);
> + else
> + iommu_debug_dec_page(phys);
> + }
> }
>
> void __iommu_debug_unmap_begin(struct iommu_domain *domain,
> unsigned long iova, size_t size)
> {
> + __iommu_debug_update_iova(domain, iova, size, false);
> }
>
> void __iommu_debug_unmap_end(struct iommu_domain *domain,
> unsigned long iova, size_t size,
> size_t unmapped)
> {
> + if ((unmapped == size) || WARN_ON_ONCE(unmapped > size))
> + return;
> +
Looks good!
> + /* If unmap failed, re-increment the refcount. */
> + __iommu_debug_update_iova(domain, iova + unmapped,
> + size - unmapped, true);
> }
>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Thanks!
> void iommu_debug_init(void)
> --
> 2.52.0.457.g6b5491de43-goog
>
>