This is the first patch from the set of patches that enable large IOMMU
DMA registrations. The entire work is available on the master branch of the
git@github.com:cgavrilov/linux.git repo.
Do not consider 32-bit range allocation, and skip iterating the rbtree, if the
allocation size exceeds 32 bits.
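For illustration only (not part of the patch), the new size check can be read as
follows; the values below are examples:

/*
 * (size - 1) <= DMA_BIT_MASK(32) accepts any size up to and including 4 GiB:
 *
 *   size = 1ULL << 32;             // exactly 4 GiB
 *   (size - 1) <= DMA_BIT_MASK(32) // true:  0xffffffff <= 0xffffffff
 *
 *   size = (1ULL << 32) + 1;       // 4 GiB + 1 byte
 *   (size - 1) <= DMA_BIT_MASK(32) // false: 0x100000000 > 0xffffffff
 *
 * Larger sizes cannot fit below the 32-bit boundary, so the 32-bit workaround
 * allocation is skipped and the rbtree walk for that range is avoided.
 */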
commit 1c21c64befe18d626855a828c721eb786dbb69b8
Author: Constantine Gavrilov <cgavrilov@infinidat.com>
Date: Sun Jun 22 13:05:26 2025 +0300
iommu_dma_alloc_iova(): do not try to allocate 32-bit address if
the size is above 32-bit.
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index ea2ef53bd4fe..8280e8864ef3 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -772,7 +772,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
* some inherent bug in handling >32-bit addresses, or not all the
* expected address bits are wired up between the device and the IOMMU.
*/
- if (dma_limit > DMA_BIT_MASK(32) && dev->iommu->pci_32bit_workaround) {
+ if (dma_limit > DMA_BIT_MASK(32) && (size - 1) <= DMA_BIT_MASK(32) && dev->iommu->pci_32bit_workaround) {
iova = alloc_iova_fast(iovad, iova_len,
DMA_BIT_MASK(32) >> shift, false);
if (iova)
--
----------------------------------------
Constantine Gavrilov
System Architect and Platform Engineer
Infinidat
----------------------------------------
This is the second patch from the set of patches that enable large IOMMU
DMA registrations. The entire work is available on the master branch of the
git@github.com:cgavrilov/linux.git repo.
The current implementation aligns DMA allocations to size, which fragments the address
space in the case of large allocations. Introduce an alignment parameter (size,
PMD, PUD, or NONE). This change does not alter the existing behavior, but it
facilitates the next change.
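As a usage sketch (hypothetical caller code; only the ALLOC_IOVA_ALIGN_* values and
the extended alloc_iova() signature come from this patch):

	/* Preserve the old behavior: IOVA naturally aligned to the allocation size. */
	iova = alloc_iova(&my_iovad, size >> shift, limit_pfn, ALLOC_IOVA_ALIGN_SIZE);

	/* Opt out of alignment for a large registration to avoid fragmenting the space. */
	iova = alloc_iova(&my_iovad, size >> shift, limit_pfn, ALLOC_IOVA_ALIGN_NONE);

Here my_iovad, size, shift, and limit_pfn are placeholders for whatever the calling
driver already uses.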
commit 3b1aa27401cb020455854ba6c5343ec618c63067
Author: Constantine Gavrilov <cgavrilov@infinidat.com>
Date: Sun Jun 22 13:13:47 2025 +0300
Large IOMMU registrations: extend alloc_iova() and alloc_iova_fast() to
use the alignment parameter.
This patch does not change existing behavior, it just extends the API.
diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index 4596073fe28f..bf525d59e82e 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -1046,7 +1046,7 @@ void *tegra_drm_alloc(struct tegra_drm *tegra, size_t size, dma_addr_t *dma)
alloc = alloc_iova(&tegra->carveout.domain,
size >> tegra->carveout.shift,
- tegra->carveout.limit, true);
+ tegra->carveout.limit, ALLOC_IOVA_ALIGN_SIZE);
if (!alloc) {
err = -EBUSY;
goto free_pages;
diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index ba2e572567c0..fbd647fc031c 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -97,7 +97,7 @@ static int host1x_pushbuffer_init(struct push_buffer *pb)
shift = iova_shift(&host1x->iova);
alloc = alloc_iova(&host1x->iova, size >> shift,
- host1x->iova_end >> shift, true);
+ host1x->iova_end >> shift, ALLOC_IOVA_ALIGN_SIZE);
if (!alloc) {
err = -ENOMEM;
goto iommu_free_mem;
diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c
index 3ed49e1fd933..ff5325d21fe8 100644
--- a/drivers/gpu/host1x/job.c
+++ b/drivers/gpu/host1x/job.c
@@ -242,7 +242,7 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
shift = iova_shift(&host->iova);
alloc = alloc_iova(&host->iova, gather_size >> shift,
- host->iova_end >> shift, true);
+ host->iova_end >> shift, ALLOC_IOVA_ALIGN_SIZE);
if (!alloc) {
err = -ENOMEM;
goto put;
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 8280e8864ef3..ef5fa3587c3b 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -774,7 +774,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
*/
if (dma_limit > DMA_BIT_MASK(32) && (size - 1) <= DMA_BIT_MASK(32) && dev->iommu->pci_32bit_workaround) {
iova = alloc_iova_fast(iovad, iova_len,
- DMA_BIT_MASK(32) >> shift, false);
+ DMA_BIT_MASK(32) >> shift, false, ALLOC_IOVA_ALIGN_SIZE);
if (iova)
goto done;
@@ -782,7 +782,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
dev_notice(dev, "Using %d-bit DMA addresses\n", bits_per(dma_limit));
}
- iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift, true);
+ iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift, true, ALLOC_IOVA_ALIGN_SIZE);
done:
return (dma_addr_t)iova << shift;
}
@@ -1798,7 +1798,7 @@ bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state,
addr = iommu_dma_alloc_iova(domain,
iova_align(iovad, size + iova_off),
- dma_get_mask(dev), dev);
+ dma_get_mask(dev), dev, ALLOC_IOVA_ALIGN_SIZE);
if (!addr)
return false;
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 18f839721813..41d5d34fcc33 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -163,17 +163,22 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova,
static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
unsigned long size, unsigned long limit_pfn,
- struct iova *new, bool size_aligned)
+ struct iova *new, iova_align_t align)
{
struct rb_node *curr, *prev;
struct iova *curr_iova;
unsigned long flags;
unsigned long new_pfn, retry_pfn;
- unsigned long align_mask = ~0UL;
+ unsigned long align_mask;
unsigned long high_pfn = limit_pfn, low_pfn = iovad->start_pfn;
- if (size_aligned)
- align_mask <<= fls_long(size - 1);
+ switch (align) {
+ case ALLOC_IOVA_ALIGN_NONE: align_mask = ~0UL; break;
+ case ALLOC_IOVA_ALIGN_SIZE: align_mask = (~0UL) << fls_long(size - 1); break;
+ case ALLOC_IOVA_ALIGN_PMD: align_mask = (~0UL) << (PMD_SHIFT - iova_shift(iovad)); break;
+ case ALLOC_IOVA_ALIGN_PUD: align_mask = (~0UL) << (PUD_SHIFT - iova_shift(iovad)); break;
+ default: return -EINVAL;
+ }
/* Walk the tree backwards */
spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
@@ -206,7 +211,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
goto iova32_full;
}
- /* pfn_lo will point to size aligned address if size_aligned is set */
+ /* pfn_lo will point to size aligned address if align is not ALLOC_IOVA_ALIGN_NONE */
new->pfn_lo = new_pfn;
new->pfn_hi = new->pfn_lo + size - 1;
@@ -242,16 +247,19 @@ static void free_iova_mem(struct iova *iova)
* @iovad: - iova domain in question
* @size: - size of page frames to allocate
* @limit_pfn: - max limit address
- * @size_aligned: - set if size_aligned address range is required
+ * @align: - alignment
* This function allocates an iova in the range iovad->start_pfn to limit_pfn,
- * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned
- * flag is set then the allocated address iova->pfn_lo will be naturally
- * aligned on roundup_power_of_two(size).
+ * searching top-down from limit_pfn to iovad->start_pfn.
+ * If align is not set to ALLOC_IOVA_ALIGN_NONE, then the allocated address
+ * iova->pfn_lo will be naturally aligned as follows:
+ * roundup_power_of_two(size) for align == ALLOC_IOVA_ALIGN_SIZE
+ * 1UL << PMD_SHIFT for align == ALLOC_IOVA_ALIGN_PMD
+ * 1UL << PUD_SHIFT for align == ALLOC_IOVA_ALIGN_PUD
*/
struct iova *
alloc_iova(struct iova_domain *iovad, unsigned long size,
unsigned long limit_pfn,
- bool size_aligned)
+ iova_align_t align)
{
struct iova *new_iova;
int ret;
@@ -261,7 +269,7 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
return NULL;
ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1,
- new_iova, size_aligned);
+ new_iova, align);
if (ret) {
free_iova_mem(new_iova);
@@ -369,13 +377,14 @@ EXPORT_SYMBOL_GPL(free_iova);
* @size: - size of page frames to allocate
* @limit_pfn: - max limit address
* @flush_rcache: - set to flush rcache on regular allocation failure
+ * @align: - alignment constraint on DMA address
* This function tries to satisfy an iova allocation from the rcache,
* and falls back to regular allocation on failure. If regular allocation
* fails too and the flush_rcache flag is set then the rcache will be flushed.
*/
unsigned long
alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
- unsigned long limit_pfn, bool flush_rcache)
+ unsigned long limit_pfn, bool flush_rcache, iova_align_t align)
{
unsigned long iova_pfn;
struct iova *new_iova;
@@ -394,7 +403,7 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
return iova_pfn;
retry:
- new_iova = alloc_iova(iovad, size, limit_pfn, true);
+ new_iova = alloc_iova(iovad, size, limit_pfn, align);
if (!new_iova) {
unsigned int cpu;
diff --git a/drivers/media/pci/intel/ipu6/ipu6-dma.c b/drivers/media/pci/intel/ipu6/ipu6-dma.c
index 7296373d36b0..4e2b98c4f348 100644
--- a/drivers/media/pci/intel/ipu6/ipu6-dma.c
+++ b/drivers/media/pci/intel/ipu6/ipu6-dma.c
@@ -172,7 +172,7 @@ void *ipu6_dma_alloc(struct ipu6_bus_device *sys, size_t size,
count = PHYS_PFN(size);
iova = alloc_iova(&mmu->dmap->iovad, count,
- PHYS_PFN(mmu->dmap->mmu_info->aperture_end), 0);
+ PHYS_PFN(mmu->dmap->mmu_info->aperture_end), ALLOC_IOVA_ALIGN_NONE);
if (!iova)
goto out_kfree;
@@ -398,7 +398,7 @@ int ipu6_dma_map_sg(struct ipu6_bus_device *sys, struct scatterlist *sglist,
nents, npages);
iova = alloc_iova(&mmu->dmap->iovad, npages,
- PHYS_PFN(mmu->dmap->mmu_info->aperture_end), 0);
+ PHYS_PFN(mmu->dmap->mmu_info->aperture_end), ALLOC_IOVA_ALIGN_NONE);
if (!iova)
return 0;
diff --git a/drivers/media/pci/intel/ipu6/ipu6-mmu.c b/drivers/media/pci/intel/ipu6/ipu6-mmu.c
index 6d1c0b90169d..4d6f9b8d68bb 100644
--- a/drivers/media/pci/intel/ipu6/ipu6-mmu.c
+++ b/drivers/media/pci/intel/ipu6/ipu6-mmu.c
@@ -422,7 +422,7 @@ static int allocate_trash_buffer(struct ipu6_mmu *mmu)
/* Allocate 8MB in iova range */
iova = alloc_iova(&mmu->dmap->iovad, n_pages,
- PHYS_PFN(mmu->dmap->mmu_info->aperture_end), 0);
+ PHYS_PFN(mmu->dmap->mmu_info->aperture_end), ALLOC_IOVA_ALIGN_NONE);
if (!iova) {
dev_err(mmu->dev, "cannot allocate iova range for trash\n");
return -ENOMEM;
diff --git a/drivers/media/platform/nvidia/tegra-vde/iommu.c b/drivers/media/platform/nvidia/tegra-vde/iommu.c
index b1d9d841d944..ad010ad65735 100644
--- a/drivers/media/platform/nvidia/tegra-vde/iommu.c
+++ b/drivers/media/platform/nvidia/tegra-vde/iommu.c
@@ -30,7 +30,7 @@ int tegra_vde_iommu_map(struct tegra_vde *vde,
size = iova_align(&vde->iova, size);
shift = iova_shift(&vde->iova);
- iova = alloc_iova(&vde->iova, size >> shift, end >> shift, true);
+ iova = alloc_iova(&vde->iova, size >> shift, end >> shift, ALLOC_IOVA_ALIGN_SIZE);
if (!iova)
return -ENOMEM;
diff --git a/drivers/staging/media/ipu3/ipu3-dmamap.c b/drivers/staging/media/ipu3/ipu3-dmamap.c
index 8a19b0024152..330314a3aa94 100644
--- a/drivers/staging/media/ipu3/ipu3-dmamap.c
+++ b/drivers/staging/media/ipu3/ipu3-dmamap.c
@@ -105,7 +105,7 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map,
dev_dbg(dev, "%s: allocating %zu\n", __func__, size);
iova = alloc_iova(&imgu->iova_domain, size >> shift,
- imgu->mmu->aperture_end >> shift, 0);
+ imgu->mmu->aperture_end >> shift, ALLOC_IOVA_ALIGN_NONE);
if (!iova)
return NULL;
@@ -205,7 +205,7 @@ int imgu_dmamap_map_sg(struct imgu_device *imgu, struct scatterlist *sglist,
nents, size >> shift);
iova = alloc_iova(&imgu->iova_domain, size >> shift,
- imgu->mmu->aperture_end >> shift, 0);
+ imgu->mmu->aperture_end >> shift, ALLOC_IOVA_ALIGN_NONE);
if (!iova)
return -ENOMEM;
diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
index 58116f89d8da..96ce209762f9 100644
--- a/drivers/vdpa/vdpa_user/iova_domain.c
+++ b/drivers/vdpa/vdpa_user/iova_domain.c
@@ -362,7 +362,7 @@ vduse_domain_alloc_iova(struct iova_domain *iovad,
unsigned long iova_len = iova_align(iovad, size) >> shift;
unsigned long iova_pfn;
- iova_pfn = alloc_iova_fast(iovad, iova_len, limit >> shift, true);
+ iova_pfn = alloc_iova_fast(iovad, iova_len, limit >> shift, true, ALLOC_IOVA_ALIGN_SIZE);
return (dma_addr_t)iova_pfn << shift;
}
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 55c03e5fe8cb..5cb8e6e49138 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -82,6 +82,15 @@ struct dma_iova_state {
*/
#define DMA_IOVA_USE_SWIOTLB (1ULL << 63)
+typedef enum {
+ ALLOC_IOVA_ALIGN_NONE,
+ ALLOC_IOVA_ALIGN_SIZE,
+ ALLOC_IOVA_ALIGN_PMD,
+ ALLOC_IOVA_ALIGN_PUD,
+ ALLOC_IOVA_ALIGN_INV,
+} iova_align_t;
+
+
static inline size_t dma_iova_size(struct dma_iova_state *state)
{
/* Casting is needed for 32-bits systems */
diff --git a/include/linux/iova.h b/include/linux/iova.h
index d2c4fd923efa..e35762c0acdb 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -90,11 +90,11 @@ void free_iova(struct iova_domain *iovad, unsigned long pfn);
void __free_iova(struct iova_domain *iovad, struct iova *iova);
struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
unsigned long limit_pfn,
- bool size_aligned);
+ iova_align_t align);
void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
unsigned long size);
unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
- unsigned long limit_pfn, bool flush_rcache);
+ unsigned long limit_pfn, bool flush_rcache, iova_align_t align);
struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
unsigned long pfn_hi);
void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
@@ -123,7 +123,7 @@ static inline void __free_iova(struct iova_domain *iovad, struct iova *iova)
static inline struct iova *alloc_iova(struct iova_domain *iovad,
unsigned long size,
unsigned long limit_pfn,
- bool size_aligned)
+ iova_align_t align)
{
return NULL;
}
@@ -137,7 +137,7 @@ static inline void free_iova_fast(struct iova_domain *iovad,
static inline unsigned long alloc_iova_fast(struct iova_domain *iovad,
unsigned long size,
unsigned long limit_pfn,
- bool flush_rcache)
+ bool flush_rcache, iova_align_t align)
{
return 0;
}
--
----------------------------------------
Constantine Gavrilov
System Architect and Platform Engineer
Infinidat
----------------------------------------
This is the third patch from the set of patches that enable large IOMMU
DMA registrations. The entire work is available on the master branch of the
git@github.com:cgavrilov/linux.git repo.
The current implementation aligns DMA allocations to size, which fragments the
address space in the case of large allocations. Extend the use of the previously
added alignment parameter to additional kernel functions. Do not request a
specific alignment in some functions that allocate large DMA areas.
commit 8a758550f6b39392a9cad627f323f8649621e6e2
Author: Constantine Gavrilov <cgavrilov@infinidat.com>
Date: Sun Jun 22 15:44:52 2025 +0300
Large IOMMU registrations: do not align IOMMU allocations to map size by default.
Implemented as follows:
* extend iommu_dma_alloc_iova() function to use alignment parameter
* extend __iommu_dma_map() function to use alignment parameter
* extend dma_iova_try_alloc() function to use alignment parameter
* add DMA_ATTR_IOVA_ALIGN_{PMD, PUD, SIZE} DMA mapping attributes
The following static functions will not request DMA address alignment, unless
one of the DMA_ATTR_IOVA_ALIGN_{PMD, PUD, SIZE} DMA mapping attributes is used.
The previous behavior was to request mapping size alignment:
* __iommu_dma_alloc_noncontiguous()
* iommu_dma_alloc_remap()
* __dma_map_sg_attrs() - calls iommu_dma_map_sg() that changes behavior
The following kernel functions will not request DMA address alignment when
calling the dma_iova_try_alloc() function:
* register_dma_pages() from mlx5 VFIO driver
* hmm_dma_map_alloc()
* blk_rq_dma_map_iter_start()
The following public APIs will not request DMA address alignment, unless one of
the DMA_ATTR_IOVA_ALIGN_{PMD, PUD, SIZE} DMA mapping flags is used. The previous
behavior was to request mapping size alignment:
* iommu_dma_map_page() - calls __iommu_dma_map()
* iommu_dma_map_resource() - calls __iommu_dma_map()
* iommu_dma_alloc() - calls __iommu_dma_map()
* iommu_dma_alloc_noncontiguous() - calls iommu_dma_alloc_remap() or
__iommu_dma_alloc_noncontiguous()
* iommu_dma_map_sg() - calls iommu_dma_alloc_iova()
* iommu_dma_map_sg() - software IOTLB case - calls iommu_dma_map_sg_swiotlb()
that calls iommu_dma_map_page()
* dma_map_sg_attrs() - calls __dma_map_sg_attrs()
* dma_map_sgtable() - calls __dma_map_sg_attrs()
* dma_map_page_attrs() - calls iommu_dma_map_page()
* dma_common_alloc_pages() - calls iommu_dma_map_page()
* dma_map_resource() - calls iommu_dma_map_resource()
* dma_alloc_attrs() - calls iommu_dma_alloc()
* dma_alloc_noncontiguous() - calls iommu_dma_alloc_noncontiguous()
diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst
index 2ad08517e626..b9d3f290b6fe 100644
--- a/Documentation/core-api/dma-api.rst
+++ b/Documentation/core-api/dma-api.rst
@@ -541,7 +541,7 @@ matter. All the considerations from the previous section apply here as well.
::
bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state,
- phys_addr_t phys, size_t size);
+ phys_addr_t phys, size_t size, iova_align_t align);
Is used to try to allocate IOVA space for mapping operation. If it returns
false this API can't be used for the given device and the normal streaming
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index ad283017caef..a9ef7fab2790 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -184,7 +184,7 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
}
if (blk_can_dma_map_iova(req, dma_dev) &&
- dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
+ dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len, ALLOC_IOVA_ALIGN_NONE))
return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index ef5fa3587c3b..0b7537e9812f 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -741,8 +741,28 @@ static int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
}
}
+static iova_align_t dma_info_to_alignment(unsigned long attrs)
+{
+ iova_align_t align = ALLOC_IOVA_ALIGN_NONE;
+
+ if (attrs & DMA_ATTR_IOVA_ALIGN_PMD) {
+ if (attrs & (DMA_ATTR_IOVA_ALIGN_PUD | DMA_ATTR_IOVA_ALIGN_SIZE))
+ return ALLOC_IOVA_ALIGN_INV;
+ return ALLOC_IOVA_ALIGN_PMD;
+ } else if (attrs & DMA_ATTR_IOVA_ALIGN_PUD) {
+ if (attrs & (DMA_ATTR_IOVA_ALIGN_PMD | DMA_ATTR_IOVA_ALIGN_SIZE))
+ return ALLOC_IOVA_ALIGN_INV;
+ return ALLOC_IOVA_ALIGN_PUD;
+ } else if (attrs & DMA_ATTR_IOVA_ALIGN_SIZE) {
+ if (attrs & (DMA_ATTR_IOVA_ALIGN_PMD | DMA_ATTR_IOVA_ALIGN_PUD))
+ return ALLOC_IOVA_ALIGN_INV;
+ return ALLOC_IOVA_ALIGN_SIZE;
+ }
+ return align;
+}
+
static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
- size_t size, u64 dma_limit, struct device *dev)
+ size_t size, u64 dma_limit, struct device *dev, iova_align_t align)
{
struct iommu_dma_cookie *cookie = domain->iova_cookie;
struct iova_domain *iovad = &cookie->iovad;
@@ -774,7 +794,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
*/
if (dma_limit > DMA_BIT_MASK(32) && (size - 1) <= DMA_BIT_MASK(32) && dev->iommu->pci_32bit_workaround) {
iova = alloc_iova_fast(iovad, iova_len,
- DMA_BIT_MASK(32) >> shift, false, ALLOC_IOVA_ALIGN_SIZE);
+ DMA_BIT_MASK(32) >> shift, false, align);
if (iova)
goto done;
@@ -782,7 +802,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
dev_notice(dev, "Using %d-bit DMA addresses\n", bits_per(dma_limit));
}
- iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift, true, ALLOC_IOVA_ALIGN_SIZE);
+ iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift, true, align);
done:
return (dma_addr_t)iova << shift;
}
@@ -828,7 +848,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr,
}
static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
- size_t size, int prot, u64 dma_mask)
+ size_t size, int prot, u64 dma_mask, iova_align_t align)
{
struct iommu_domain *domain = iommu_get_dma_domain(dev);
struct iommu_dma_cookie *cookie = domain->iova_cookie;
@@ -847,7 +867,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
size = iova_align(iovad, size + iova_off);
- iova = iommu_dma_alloc_iova(domain, size, dma_mask, dev);
+ iova = iommu_dma_alloc_iova(domain, size, dma_mask, dev, align);
if (!iova)
return DMA_MAPPING_ERROR;
@@ -933,6 +953,12 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
struct page **pages;
dma_addr_t iova;
ssize_t ret;
+ iova_align_t align = dma_info_to_alignment(attrs);
+
+ if (align == ALLOC_IOVA_ALIGN_INV) {
+ dev_warn_once(dev, "%s: invalid alignment requested\n", __func__);
+ return NULL;
+ }
if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
iommu_deferred_attach(dev, domain))
@@ -955,7 +981,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
return NULL;
size = iova_align(iovad, size);
- iova = iommu_dma_alloc_iova(domain, size, dev->coherent_dma_mask, dev);
+ iova = iommu_dma_alloc_iova(domain, size, dev->coherent_dma_mask, dev, align);
if (!iova)
goto out_free_pages;
@@ -1201,7 +1227,12 @@ dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
struct iommu_dma_cookie *cookie = domain->iova_cookie;
struct iova_domain *iovad = &cookie->iovad;
dma_addr_t iova, dma_mask = dma_get_mask(dev);
+ iova_align_t align = dma_info_to_alignment(attrs);
+ if (align == ALLOC_IOVA_ALIGN_INV) {
+ dev_warn_once(dev, "%s: invalid alignment requested\n", __func__);
+ return DMA_MAPPING_ERROR;
+ }
/*
* If both the physical buffer start address and size are page aligned,
* we don't need to use a bounce page.
@@ -1216,7 +1247,7 @@ dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
arch_sync_dma_for_device(phys, size, dir);
- iova = __iommu_dma_map(dev, phys, size, prot, dma_mask);
+ iova = __iommu_dma_map(dev, phys, size, prot, dma_mask, align);
if (iova == DMA_MAPPING_ERROR)
swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
return iova;
@@ -1389,6 +1420,12 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
unsigned long mask = dma_get_seg_boundary(dev);
ssize_t ret;
int i;
+ iova_align_t align = dma_info_to_alignment(attrs);
+
+ if (align == ALLOC_IOVA_ALIGN_INV) {
+ dev_warn_once(dev, "%s: invalid alignment requested\n", __func__);
+ return -EINVAL;
+ }
if (static_branch_unlikely(&iommu_deferred_attach_enabled)) {
ret = iommu_deferred_attach(dev, domain);
@@ -1470,7 +1507,7 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
if (!iova_len)
return __finalise_sg(dev, sg, nents, 0);
- iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev);
+ iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev, align);
if (!iova) {
ret = -ENOMEM;
goto out_restore_sg;
@@ -1549,9 +1586,15 @@ void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
+ iova_align_t align = dma_info_to_alignment(attrs);
+
+ if (align == ALLOC_IOVA_ALIGN_INV) {
+ dev_warn_once(dev, "%s: invalid alignment requested\n", __func__);
+ return DMA_MAPPING_ERROR;
+ }
return __iommu_dma_map(dev, phys, size,
dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO,
- dma_get_mask(dev));
+ dma_get_mask(dev), align);
}
void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
@@ -1642,6 +1685,12 @@ void *iommu_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
struct page *page = NULL;
void *cpu_addr;
+ iova_align_t align = dma_info_to_alignment(attrs);
+
+ if (align == ALLOC_IOVA_ALIGN_INV) {
+ dev_warn_once(dev, "%s: invalid alignment requested\n", __func__);
+ return NULL;
+ }
gfp |= __GFP_ZERO;
@@ -1660,7 +1709,7 @@ void *iommu_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
return NULL;
*handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot,
- dev->coherent_dma_mask);
+ dev->coherent_dma_mask, align);
if (*handle == DMA_MAPPING_ERROR) {
__iommu_dma_free(dev, size, cpu_addr);
return NULL;
@@ -1753,6 +1802,7 @@ size_t iommu_dma_max_mapping_size(struct device *dev)
* @state: IOVA state
* @phys: physical address
* @size: IOVA size
+ * @align: DMA address alignment
*
* Check if @dev supports the IOVA-based DMA API, and if yes allocate IOVA space
* for the given base address and size.
@@ -1764,7 +1814,7 @@ size_t iommu_dma_max_mapping_size(struct device *dev)
* allocated, or %false if the regular DMA API should be used.
*/
bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state,
- phys_addr_t phys, size_t size)
+ phys_addr_t phys, size_t size, iova_align_t align)
{
struct iommu_dma_cookie *cookie;
struct iommu_domain *domain;
@@ -1798,7 +1848,7 @@ bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state,
addr = iommu_dma_alloc_iova(domain,
iova_align(iovad, size + iova_off),
- dma_get_mask(dev), dev, ALLOC_IOVA_ALIGN_SIZE);
+ dma_get_mask(dev), dev, align);
if (!addr)
return false;
@@ -2161,7 +2211,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
if (!msi_page)
return NULL;
- iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev);
+ iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev, ALLOC_IOVA_ALIGN_NONE);
if (!iova)
goto out_free_page;
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 5b919a0b2524..36d21eec9959 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -387,7 +387,7 @@ static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
- if (dma_iova_try_alloc(mdev->device, state, 0, npages * PAGE_SIZE)) {
+ if (dma_iova_try_alloc(mdev->device, state, 0, npages * PAGE_SIZE, ALLOC_IOVA_ALIGN_NONE)) {
addr = state->addr;
for (i = 0; i < npages; i++) {
err = dma_iova_link(mdev->device, state,
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 5cb8e6e49138..7eef81301755 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -58,6 +58,16 @@
*/
#define DMA_ATTR_PRIVILEGED (1UL << 9)
+/*
+ * Alignment flags when using IOMMU. In the case of direct mapping, DMA address
+ * will typically have the same alignment as the virtual address. So, alignment
+ * expectation works in general case if the virtual address is aligned to the
+ * requested alignment.
+ */
+#define DMA_ATTR_IOVA_ALIGN_PMD (1UL << 10)
+#define DMA_ATTR_IOVA_ALIGN_PUD (1UL << 11)
+#define DMA_ATTR_IOVA_ALIGN_SIZE (1UL << 12)
+
/*
* A dma_addr_t can hold any valid DMA or bus address for the platform. It can
* be given to a device to use as a DMA source or target. It is specific to a
@@ -316,7 +326,7 @@ static inline bool dma_use_iova(struct dma_iova_state *state)
}
bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state,
- phys_addr_t phys, size_t size);
+ phys_addr_t phys, size_t size, iova_align_t align);
void dma_iova_free(struct device *dev, struct dma_iova_state *state);
void dma_iova_destroy(struct device *dev, struct dma_iova_state *state,
size_t mapped_len, enum dma_data_direction dir,
@@ -335,7 +345,7 @@ static inline bool dma_use_iova(struct dma_iova_state *state)
return false;
}
static inline bool dma_iova_try_alloc(struct device *dev,
- struct dma_iova_state *state, phys_addr_t phys, size_t size)
+ struct dma_iova_state *state, phys_addr_t phys, size_t size, iova_align_t align)
{
return false;
}
diff --git a/mm/hmm.c b/mm/hmm.c
index feac86196a65..47a415cfc60c 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -660,7 +660,7 @@ int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
return -ENOMEM;
use_iova = dma_iova_try_alloc(dev, &map->state, 0,
- nr_entries * PAGE_SIZE);
+ nr_entries * PAGE_SIZE, ALLOC_IOVA_ALIGN_NONE);
if (!use_iova && dma_need_unmap(dev)) {
map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list),
GFP_KERNEL | __GFP_NOWARN);
--
----------------------------------------
Constantine Gavrilov
System Architect and Platform Engineer
Infinidat
----------------------------------------
This is the fourth patch from the set of patches that enable large IOMMU
DMA registrations. The entire work is available on the master branch of the
git@github.com:cgavrilov/linux.git repo.
Add a busy_regions sysfs attribute to the IOMMU group. This makes it possible to
see the used addresses and to debug failed allocations.
commit b01feb650dc080f268adb5ff26bda1b9bf2193a1
Author: Constantine Gavrilov <cgavrilov@infinidat.com>
Date: Wed Jun 25 19:49:16 2025 +0300
Add busy_regions sysfs attribute to IOMMU group.
This attribute shows allocated DMA regions for the group.
Add the exported function iovad_show_busy_regions() so that other IOMMU
implementations can provide the same information.
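A minimal sketch of how another implementation might reuse the exported helper
(everything except iovad_show_busy_regions() is hypothetical here):

	static ssize_t busy_regions_show(struct device *dev,
					 struct device_attribute *attr, char *buf)
	{
		struct my_iommu *im = dev_get_drvdata(dev);	/* hypothetical driver data */

		/* Emits one "start-end" line per allocated IOVA range into buf. */
		return iovad_show_busy_regions(&im->iovad, buf);
	}
	static DEVICE_ATTR_RO(busy_regions);

The group attribute added by this patch can then be read from user space, for
example under /sys/kernel/iommu_groups/<id>/busy_regions.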
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 0b7537e9812f..6ba9be4fb64d 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -761,6 +761,14 @@ static iova_align_t dma_info_to_alignment(unsigned long attrs)
return align;
}
+ssize_t iommu_domain_show_busy_regions(struct iommu_domain *domain, char *buf)
+{
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+
+ return iovad_show_busy_regions(iovad, buf);
+}
+
static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
size_t size, u64 dma_limit, struct device *dev, iova_align_t align)
{
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index a4b606c591da..5daeb86a4aef 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -34,7 +34,9 @@
#include <linux/sched/mm.h>
#include <linux/msi.h>
#include <uapi/linux/iommufd.h>
-
+#ifdef CONFIG_IOMMU_DMA
+#include <linux/iommu-dma.h>
+#endif
#include "dma-iommu.h"
#include "iommu-priv.h"
@@ -927,6 +929,19 @@ static ssize_t iommu_group_show_resv_regions(struct iommu_group *group,
return offset;
}
+#ifdef CONFIG_IOMMU_DMA
+static ssize_t iommu_group_show_busy_regions(struct iommu_group *group,
+ char *buf)
+{
+ if (!group->domain)
+ return 0;
+
+ return iommu_domain_show_busy_regions(group->domain, buf);
+}
+
+
+#endif
+
static ssize_t iommu_group_show_type(struct iommu_group *group,
char *buf)
{
@@ -962,6 +977,11 @@ static IOMMU_GROUP_ATTR(name, S_IRUGO, iommu_group_show_name, NULL);
static IOMMU_GROUP_ATTR(reserved_regions, 0444,
iommu_group_show_resv_regions, NULL);
+#ifdef CONFIG_IOMMU_DMA
+static IOMMU_GROUP_ATTR(busy_regions, 0444,
+ iommu_group_show_busy_regions, NULL);
+#endif
+
static IOMMU_GROUP_ATTR(type, 0644, iommu_group_show_type,
iommu_group_store_type);
@@ -1049,6 +1069,15 @@ struct iommu_group *iommu_group_alloc(void)
return ERR_PTR(ret);
}
+#ifdef CONFIG_IOMMU_DMA
+ ret = iommu_group_create_file(group,
+ &iommu_group_attr_busy_regions);
+ if (ret) {
+ kobject_put(group->devices_kobj);
+ return ERR_PTR(ret);
+ }
+#endif
+
ret = iommu_group_create_file(group, &iommu_group_attr_type);
if (ret) {
kobject_put(group->devices_kobj);
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 41d5d34fcc33..96144c58b386 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -280,6 +280,37 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
}
EXPORT_SYMBOL_GPL(alloc_iova);
+/*
+ * Helper function to output allocated regions to a buffer.
+ * Can be used as a show function for a sysfs attribute.
+ * buf is a page-aligned buffer of PAGE_SIZE bytes.
+*/
+ssize_t iovad_show_busy_regions(struct iova_domain *iovad, char *buf)
+{
+ int off = 0;
+ struct rb_node *curr;
+ struct iova *curr_iova;
+ unsigned long flags;
+ unsigned long shift = iova_shift(iovad);
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ curr = &iovad->anchor.node;
+ /* skip the anchor node, it has pfn_hi = pfn_lo = IOVA_ANCHOR = -1LU */
+ curr = rb_prev(curr);
+ while(curr) {
+ curr_iova = rb_entry(curr, struct iova, node);
+ off += sysfs_emit_at(buf, off, "0x%016lx-0x%016lx\n", curr_iova->pfn_lo << shift,
+ ((curr_iova->pfn_hi + 1) << shift) - 1);
+ curr = rb_prev(curr);
+ /* do not iterate further if the page is full */
+ if (off >= (PAGE_SIZE - 38))
+ break;
+ }
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return off;
+}
+EXPORT_SYMBOL_GPL(iovad_show_busy_regions);
+
static struct iova *
private_find_iova(struct iova_domain *iovad, unsigned long pfn)
{
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 156732807994..5fe92c00221d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1511,6 +1511,7 @@ static inline void iommu_debugfs_setup(void) {}
#ifdef CONFIG_IOMMU_DMA
int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
+ssize_t iommu_domain_show_busy_regions(struct iommu_domain *domain, char *buf);
#else /* CONFIG_IOMMU_DMA */
static inline int iommu_get_msi_cookie(struct iommu_domain *domain,
dma_addr_t base)
{
diff --git a/include/linux/iova.h b/include/linux/iova.h
index e35762c0acdb..c09d224cce2b 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -91,8 +91,12 @@ void __free_iova(struct iova_domain *iovad, struct iova *iova);
struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
unsigned long limit_pfn,
iova_align_t align);
+
void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
unsigned long size);
+
+ssize_t iovad_show_busy_regions(struct iova_domain *iovad, char *buf);
+
unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
unsigned long limit_pfn, bool flush_rcache,
iova_align_t align);
struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
@@ -120,6 +124,11 @@ static inline void __free_iova(struct iova_domain *iovad, struct iova *iova)
{
}
+static inline ssize_t iovad_show_busy_regions(struct iova_domain *iovad, char *buf)
+{
+ return -ENOTSUPP;
+}
+
static inline struct iova *alloc_iova(struct iova_domain *iovad,
unsigned long size,
unsigned long limit_pfn,
--
----------------------------------------
Constantine Gavrilov
System Architect and Platform Engineer
Infinidat
----------------------------------------
This is the fifth patch from the set of patches that enable large IOMMU
DMA registrations. The entire work is available on the master branch of the
git@github.com:cgavrilov/linux.git repo.
Some devices (like NTB or GPU devices) allow mapping of system memory to PCIe BARs,
which makes it possible to implement PCIe interconnects when devices are connected
to more than one root complex. After one root complex does the mapping, an application
on another root complex can access the memory using the PCIe bar of the device.
Since a typical system memory mapping uses offset translation (between the
device bar address and the DMA address), the device driver needs to know which
contiguous DMA address range is available to satisfy the device needs before it
can set up the mapping offset. This patch provides APIs to do this.
This patch was developed before the 6.16 kernel that provides functions
dma_iova_try_alloc() and dma_iova_link() to help with this task. With
dma_iova_try_alloc(), the device driver can reserve a DMA address range for its
future use and use dma_iova_link() later to update IOMMU translations on the
reserved range. However, we do not have APIs that would allow allocations of
smaller regions from the reserved area that would provide functionality
similar to iommu_dma_alloc_iova(). This patch makes it possible to query the available
range, set up the offset, and use the standard DMA allocation APIs, after enforcing
the DMA mask constraint on the device.
commit 31b8abf68f5114dc90c1d38bd70e505727383666
Author: Constantine Gavrilov <cgavrilov@infinidat.com>
Date: Thu Jun 26 23:20:40 2025 +0300
Add APIs to query available DMA address range.
This adds two exported functions:
* iommu_domain_get_lowest_free_address_range()
* iovad_get_lowest_free_address_range()
NTB drivers that implement translation by offset can query the available range,
set the first region offset to the returned value, and also set the DMA max
address to the returned value + window size. Since DMA addresses are allocated
from the top addresses down, this allows applications to request a large
IOMMU registration that matches the NTB window size.
Querying iommu_domain_get_lowest_free_address_range() first makes sure that
the required DMA range is available and not used by other devices.
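A rough usage sketch from an NTB-style driver (struct addr_range_query and the
exported helper come from this patch; domain, window_size, and the chosen
alignment are hypothetical):

	struct addr_range_query q = {
		.size     = window_size,	/* hypothetical: size of the NTB memory window */
		.addr_min = 0,
		.addr_max = U64_MAX,
		.align    = SZ_2M,		/* optional; 0 or a power of two */
	};
	u64 base;

	if (!iommu_domain_get_lowest_free_address_range(domain, &q, &base))
		/* base is the lowest free DMA address that can hold the window; the
		 * driver can program its translation offset and DMA limits from it. */
		pr_info("free DMA range of %llu bytes starts at %#llx\n", q.size, base);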
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 6ba9be4fb64d..e78d7f8a2d61 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -769,6 +769,15 @@ ssize_t iommu_domain_show_busy_regions(struct iommu_domain *domain, char *buf)
return iovad_show_busy_regions(iovad, buf);
}
+int iommu_domain_get_lowest_free_address_range(struct iommu_domain *domain, struct addr_range_query *query, u64 *res)
+{
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+
+ return iovad_get_lowest_free_address_range(iovad, query, res);
+}
+EXPORT_SYMBOL(iommu_domain_get_lowest_free_address_range);
+
static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
size_t size, u64 dma_limit, struct device *dev, iova_align_t align)
{
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 96144c58b386..aba58630be12 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -311,6 +311,59 @@ ssize_t iovad_show_busy_regions(struct iova_domain *iovad, char *buf)
}
EXPORT_SYMBOL_GPL(iovad_show_busy_regions);
+/*
+ * Get a hint for lowest available address range.
+*/
+int iovad_get_lowest_free_address_range(struct iova_domain *iovad, struct addr_range_query *query, u64 *res)
+{
+ struct rb_node *curr, *prev;
+ struct iova *curr_iova, *prev_iova;
+ unsigned long flags;
+ unsigned long shift = iova_shift(iovad);
+ int ret = -ENOMEM;
+
+ if (query->align) {
+ if (!is_power_of_2(query->align))
+ return -EINVAL;
+ }
+ if (query->addr_min >= query->addr_max)
+ return -EINVAL;
+
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ curr = &iovad->anchor.node;
+ curr_iova = rb_entry(curr, struct iova, node);
+ while(curr) {
+ prev = rb_prev(curr);
+ curr = prev;
+ if (prev) {
+ u64 free_start;
+ u64 free_end;
+ u64 alloc_end;
+ prev_iova = rb_entry(prev, struct iova, node);
+ free_start = (prev_iova->pfn_hi + 1) << shift;
+ free_end = (curr_iova->pfn_lo) << shift;
+ curr_iova = prev_iova;
+ if (query->align)
+ free_start = ALIGN(free_start, query->align);
+ alloc_end = free_start + query->size;
+
+ if (free_start < query->addr_min)
+ break;
+ if (alloc_end > query->addr_max)
+ continue; //does not match address constraint
+ if (free_start > alloc_end || free_start >= free_end || alloc_end > free_end)
+ continue; //overflow
+
+ ret = 0;
+ *res = free_start;
+ }
+ }
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iovad_get_lowest_free_address_range);
+
static struct iova *
private_find_iova(struct iova_domain *iovad, unsigned long pfn)
{
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5fe92c00221d..96ac4333f727 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/of.h>
#include <linux/iova_bitmap.h>
+#include <linux/iova.h>
#define IOMMU_READ (1 << 0)
#define IOMMU_WRITE (1 << 1)
@@ -1512,6 +1513,7 @@ static inline void iommu_debugfs_setup(void) {}
#ifdef CONFIG_IOMMU_DMA
int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
ssize_t iommu_domain_show_busy_regions(struct iommu_domain *domain, char *buf);
+int iommu_domain_get_lowest_free_address_range(struct iommu_domain *domain, struct addr_range_query *query, u64 *res);
#else /* CONFIG_IOMMU_DMA */
static inline int iommu_get_msi_cookie(struct iommu_domain *domain,
dma_addr_t base)
{
diff --git a/include/linux/iova.h b/include/linux/iova.h
index c09d224cce2b..30ce5ad499d2 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -80,6 +80,13 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova)
return iova >> iova_shift(iovad);
}
+struct addr_range_query {
+ u64 size;
+ u64 addr_min;
+ u64 addr_max;
+ u64 align;
+};
+
#if IS_REACHABLE(CONFIG_IOMMU_IOVA)
int iova_cache_get(void);
void iova_cache_put(void);
@@ -97,6 +104,9 @@ void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
ssize_t iovad_show_busy_regions(struct iova_domain *iovad, char *buf);
+#define IOVAD_HAS_FREE_ADDR_RANGE
+int iovad_get_lowest_free_address_range(struct iova_domain *iovad, struct addr_range_query *query, u64 *res);
+
unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
unsigned long limit_pfn, bool flush_rcache,
iova_align_t align);
struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
@@ -129,6 +139,11 @@ ssize_t iovad_show_busy_regions(struct iova_domain *iovad, char *buf)
return -ENOTSUPP;
}
+static inline int iovad_get_lowest_free_address_range(struct iova_domain *iovad, struct addr_range_query *query, u64 *res)
+{
+ return -ENOTSUPP;
+}
+
static inline struct iova *alloc_iova(struct iova_domain *iovad,
unsigned long size,
unsigned long limit_pfn,
--
----------------------------------------
Constantine Gavrilov
System Architect and Platform Engineer
Infinidat
Ha-Menofim 9, Hertzelia
----------------------------------------
This is the sixth patch from the set of patches that enable large IOMMU
DMA registrations. The entire work is available on the master branch of the
git@github.com:cgavrilov/linux.git repo.
This patch removes the max32_alloc_size field from the iova_domain structure.
This field was introduced to optimize the allocation-failure path (return a
failed allocation without checking for available regions), but it was not
implemented correctly, resulting in failed allocations when space may be
available.
commit cfdad4eb84e8c5dc7aa3f868575007de2e1fc1e4
Author: Constantine Gavrilov <cgavrilov@infinidat.com>
Date: Tue Jul 1 10:05:11 2025 +0300
Removed max32_alloc_size field from the iova_domain structure.
This field exists for the purpose of optimizing a path of failed
allocations in 32-bit DMA space, when the 32-bit range is depleted.
There are a number of issues:
1. max32_alloc_size is updated in __alloc_and_insert_iova_range()
without checking the size after the range allocation fails. This
can wrongly fail a small allocation after a "big" allocation
fails.
2. max32_alloc_size is updated in __cached_rbnode_delete_update()
without checking the top bound of the released range and the size
of the released range. This defeats the purpose of the intended
"optimization".
3. Alignment constraints and DMA address limits are not taken into account
when updating this field after the allocation fails. This results in
future allocation failures that can succeed if looking for available
ranges is not skipped.
Rather than fixing this "optimization", remove it altogether. We should
not optimize for the failed-allocation path, which is rare, and the optimizing
effect of this feature is questionable.
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index aba58630be12..0c436dd35404 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -52,7 +52,6 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
iovad->granule = granule;
iovad->start_pfn = start_pfn;
iovad->dma_32bit_pfn = 1UL << (32 - iova_shift(iovad));
- iovad->max32_alloc_size = iovad->dma_32bit_pfn;
iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node);
rb_insert_color(&iovad->anchor.node, &iovad->rbroot);
@@ -88,9 +87,6 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
free->pfn_lo >= cached_iova->pfn_lo))
iovad->cached32_node = rb_next(&free->node);
- if (free->pfn_lo < iovad->dma_32bit_pfn)
- iovad->max32_alloc_size = iovad->dma_32bit_pfn;
-
cached_iova = to_iova(iovad->cached_node);
if (free->pfn_lo >= cached_iova->pfn_lo)
iovad->cached_node = rb_next(&free->node);
@@ -182,9 +178,6 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
/* Walk the tree backwards */
spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- if (limit_pfn <= iovad->dma_32bit_pfn &&
- size >= iovad->max32_alloc_size)
- goto iova32_full;
curr = __get_cached_rbnode(iovad, limit_pfn);
curr_iova = to_iova(curr);
@@ -207,7 +200,6 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
curr_iova = to_iova(curr);
goto retry;
}
- iovad->max32_alloc_size = size;
goto iova32_full;
}
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 30ce5ad499d2..2800bdc203b1 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -33,7 +33,6 @@ struct iova_domain {
unsigned long granule; /* pfn granularity for this domain */
unsigned long start_pfn; /* Lower limit for this domain */
unsigned long dma_32bit_pfn;
- unsigned long max32_alloc_size; /* Size of last failed allocation */
struct iova anchor; /* rbtree lookup anchor */
struct iova_rcache *rcaches;
--
----------------------------------------
Constantine Gavrilov
System Architect and Platform Engineer
Infinidat
----------------------------------------
This is the seventh patch from the set of patches that enable large IOMMU
DMA registrations. The entire work is available on the master branch of the
git@github.com:cgavrilov/linux.git repo.
This patch ensures that addresses in IOMMU group are allocated from the top
to the bottom of the address space. It fixes some issues with the use of
cached_node and cached32_node fields of the iova_domain structure that
resulted in fragmentation of the address space. Fragmented address space can
lead to failed allocations of DMA ranges.
commit d8bb3c731ff750afc568fa73d770eb1fa3e96c09
Author: Constantine Gavrilov <cgavrilov@infinidat.com>
Date: Tue Jul 1 11:19:08 2025 +0300
Allocate DMA addresses from top to bottom in IOVA domains.
The cached_node and cached32_node fields of the iova_domain structure
are used as the starting point for the address search only if the cached
node starts at the DMA limit or below it, or if the DMA limit is 64 bit
or 32 bit respectively.
The cached_node and cached32_node are updated upon successful allocation
only if the search was performed from the node that does not lie below the
cached values and not above the DMA limit.
For clarity, cached_node field was renamed to cached_top_node.
To preserve the existing optimization for network stack behavior, where
network drivers can allocate more than 250K DMA buffers for network pools
without using SG tables, we add the cached_middle_node and middle_pfn_limit
fields. Without them, the system locks up for minutes at boot time trying
to allocate the network pools.
This ensures contiguous allocations from top to the bottom, with holes
due to alignment or due to lower DMA address requirements for some devices
in the group. Altogether, this avoids fragmentation of DMA address space
and ensures that large DMA ranges are available.
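The resulting search-start selection can be summarized as follows (a simplified
restatement of __get_start_rbnode() from the diff below; the node_covers() helper
is made up for brevity and stands for to_iova(n)->pfn_hi + 1 >= limit_pfn):

	/* Prefer a cached node whose limit matches or still covers limit_pfn,
	 * otherwise fall back to a bounded tree search. */
	if (limit_pfn == iovad->dma_32bit_pfn || node_covers(iovad->cached32_node, limit_pfn))
		start = iovad->cached32_node;
	else if (limit_pfn == iovad->middle_pfn_limit || node_covers(iovad->cached_middle_node, limit_pfn))
		start = iovad->cached_middle_node;
	else if (limit_pfn == IOVA_ANCHOR || node_covers(iovad->cached_top_node, limit_pfn))
		start = iovad->cached_top_node;
	else
		start = iova_find_limit(iovad, limit_pfn);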
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 0c436dd35404..09356d6065ef 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -47,7 +47,9 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
spin_lock_init(&iovad->iova_rbtree_lock);
iovad->rbroot = RB_ROOT;
- iovad->cached_node = &iovad->anchor.node;
+ iovad->cached_top_node = &iovad->anchor.node;
+ iovad->cached_middle_node = &iovad->anchor.node;
+ iovad->middle_pfn_limit = IOVA_ANCHOR;
iovad->cached32_node = &iovad->anchor.node;
iovad->granule = granule;
iovad->start_pfn = start_pfn;
@@ -58,22 +60,63 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
}
EXPORT_SYMBOL_GPL(init_iova_domain);
+static struct rb_node *iova_find_limit(struct iova_domain *iovad, unsigned long limit_pfn);
+
static struct rb_node *
-__get_cached_rbnode(struct iova_domain *iovad, unsigned long limit_pfn)
+__get_start_rbnode(struct iova_domain *iovad, unsigned long limit_pfn)
{
- if (limit_pfn <= iovad->dma_32bit_pfn)
+ struct iova *cached = to_iova(iovad->cached32_node);
+ if (limit_pfn == iovad->dma_32bit_pfn || (cached->pfn_hi + 1) >= limit_pfn)
return iovad->cached32_node;
- return iovad->cached_node;
+ cached = to_iova(iovad->cached_middle_node);
+ if (limit_pfn == iovad->middle_pfn_limit || (cached->pfn_hi + 1) >= limit_pfn)
+ return iovad->cached_middle_node;
+
+ cached = to_iova(iovad->cached_top_node);
+ if (limit_pfn == IOVA_ANCHOR || (cached->pfn_hi + 1) >= limit_pfn)
+ return iovad->cached_top_node;
+
+ return iova_find_limit(iovad, limit_pfn);
}
static void
-__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new)
+__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new, struct iova *start_search, unsigned long limit_pfn)
{
- if (new->pfn_hi < iovad->dma_32bit_pfn)
- iovad->cached32_node = &new->node;
- else
- iovad->cached_node = &new->node;
+ /* Insert the update only if the search started from the cached node or above it.
+ * This way, we attempt to allocate from the top to the bottom, with holes due to
+ * alignment or DMA address limits for individual devices in the group.
+ */
+ struct iova *cached;
+
+ /* update top node */
+ cached = to_iova(iovad->cached_top_node);
+ if (limit_pfn >= start_search->pfn_lo &&
+ start_search->pfn_lo >= cached->pfn_lo &&
+ new->pfn_lo < cached->pfn_lo)
+ iovad->cached_top_node = &new->node;
+
+ /* update middle node */
+ cached = to_iova(iovad->cached_middle_node);
+ if (limit_pfn >= start_search->pfn_lo &&
+ start_search->pfn_lo >= cached->pfn_lo &&
+ new->pfn_lo < cached->pfn_lo) {
+ iovad->cached_middle_node = &new->node;
+ if (limit_pfn != IOVA_ANCHOR && (limit_pfn > iovad->middle_pfn_limit ||
+ iovad->middle_pfn_limit == IOVA_ANCHOR))
+ iovad->middle_pfn_limit = limit_pfn;
+ } else if (limit_pfn != IOVA_ANCHOR) {
+ iovad->middle_pfn_limit = limit_pfn;
+ iovad->cached_middle_node = &new->node;
+ }
+
+ if (new->pfn_lo <= iovad->dma_32bit_pfn) {
+ cached = to_iova(iovad->cached32_node);
+ if (limit_pfn >= start_search->pfn_lo &&
+ start_search->pfn_lo >= cached->pfn_lo &&
+ new->pfn_lo < cached->pfn_lo)
+ iovad->cached32_node = &new->node;
+ }
}
static void
@@ -87,9 +130,13 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
free->pfn_lo >= cached_iova->pfn_lo))
iovad->cached32_node = rb_next(&free->node);
- cached_iova = to_iova(iovad->cached_node);
+ cached_iova = to_iova(iovad->cached_top_node);
if (free->pfn_lo >= cached_iova->pfn_lo)
- iovad->cached_node = rb_next(&free->node);
+ iovad->cached_top_node = rb_next(&free->node);
+
+ cached_iova = to_iova(iovad->cached_middle_node);
+ if (free->pfn_lo >= cached_iova->pfn_lo && free->pfn_lo < iovad->middle_pfn_limit)
+ iovad->cached_middle_node = rb_next(&free->node);
}
static struct rb_node *iova_find_limit(struct iova_domain *iovad, unsigned long limit_pfn)
@@ -161,8 +208,8 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
unsigned long size, unsigned long limit_pfn,
struct iova *new, iova_align_t align)
{
- struct rb_node *curr, *prev;
- struct iova *curr_iova;
+ struct rb_node *curr, *prev, *start_search;
+ struct iova *curr_iova, *start_iova;
unsigned long flags;
unsigned long new_pfn, retry_pfn;
unsigned long align_mask;
@@ -179,8 +226,8 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
/* Walk the tree backwards */
spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- curr = __get_cached_rbnode(iovad, limit_pfn);
- curr_iova = to_iova(curr);
+ curr = start_search = __get_start_rbnode(iovad, limit_pfn);
+ curr_iova = start_iova = to_iova(curr);
retry_pfn = curr_iova->pfn_hi;
retry:
@@ -193,11 +240,11 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
} while (curr && new_pfn <= curr_iova->pfn_hi && new_pfn >= low_pfn);
if (high_pfn < size || new_pfn < low_pfn) {
- if (low_pfn == iovad->start_pfn && retry_pfn < limit_pfn) {
+ if (start_search != &iovad->anchor.node && low_pfn == iovad->start_pfn && retry_pfn < limit_pfn) {
high_pfn = limit_pfn;
low_pfn = retry_pfn + 1;
- curr = iova_find_limit(iovad, limit_pfn);
- curr_iova = to_iova(curr);
+ curr = start_search = iova_find_limit(iovad, limit_pfn);
+ curr_iova = start_iova = to_iova(curr);
goto retry;
}
goto iova32_full;
@@ -209,7 +256,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
/* If we have 'prev', it's a valid place to start the insertion. */
iova_insert_rbtree(&iovad->rbroot, new, prev);
- __cached_rbnode_insert_update(iovad, new);
+ __cached_rbnode_insert_update(iovad, new, start_iova, limit_pfn);
spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
return 0;
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 2800bdc203b1..0780a64e1149 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -26,14 +26,16 @@ struct iova_rcache;
/* holds all the iova translations for a domain */
struct iova_domain {
- spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
- struct rb_root rbroot; /* iova domain rbtree root */
- struct rb_node *cached_node; /* Save last alloced node */
- struct rb_node *cached32_node; /* Save last 32-bit alloced node */
- unsigned long granule; /* pfn granularity for this domain */
- unsigned long start_pfn; /* Lower limit for this domain */
- unsigned long dma_32bit_pfn;
- struct iova anchor; /* rbtree lookup anchor */
+ spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
+ struct rb_root rbroot; /* iova domain rbtree root */
+ struct rb_node *cached_top_node; /* Save last alloced node from the top */
+ struct rb_node *cached_middle_node; /* Saved last alloced node in the middle */
+ struct rb_node *cached32_node; /* Save last 32-bit alloced node */
+ unsigned long granule; /* pfn granularity for this domain */
+ unsigned long start_pfn; /* Lower limit for this domain */
+ unsigned long dma_32bit_pfn; /* 32-bit PFN limit, constant */
+ unsigned long middle_pfn_limit; /* cached_middle_node is for this limit */
+ struct iova anchor; /* rbtree lookup anchor */
struct iova_rcache *rcaches;
struct hlist_node cpuhp_dead;
--
----------------------------------------
Constantine Gavrilov
System Architect and Platform Engineer
Infinidat
----------------------------------------
This is the eighth patch from the set of patches that enable large IOMMU
DMA registrations. The entire work is available on the master branch of the
git@github.com:cgavrilov/linux.git repo.
Some devices (like NTB or GPU devices) allow mapping of system memory to PCIe BARs,
which makes it possible to implement PCIe interconnects when devices are connected to more
than one root complex. After one root complex does the mapping, an application
on another root complex can access the memory using the PCIe bar of the device.
Since a typical system memory mapping uses offset translation (between the
device bar address and the DMA address), the device driver needs to know which
contiguous DMA address range is available to satisfy the device needs before it
can set up the mapping offset. After querying the available range, the device
driver can set up the mapping translation and use the top and low DMA address
constraints to ensure that future DMA allocation APIs will allocate DMA
addresses within the selected range.
This patch was developed before the 6.16 kernel that provides functions
dma_iova_try_alloc() and dma_iova_link() to help with this task. With
dma_iova_try_alloc(), the device driver can reserve a DMA address range for its
future use and use dma_iova_link() later to update IOMMU translations on the
reserved range. However, we do not have APIs that would allow allocations of
smaller regions from the reserved area that would provide functionality
similar to iommu_dma_alloc_iova(). This patch makes it possible to query the available
range, set up the offset, and use the standard DMA allocation APIs, after enforcing
the DMA mask constraints on the device.
This patch does not change the existing behavior but extends some DMA address
allocation APIs to use the low and top address constraints.
commit 00de9ac3bdb6747fbe6b21de78ad11a32c67a71f
Author: Constantine Gavrilov <cgavrilov@infinidat.com>
Date: Tue Jul 1 14:12:27 2025 +0300
Support low address limit in functions that allocate DMA address.
Devices that map memory windows to PCI bars (for example NTB devices)
may need this change. This is because such devices may use very large
memory windows (terabytes in size), and they cannot simply cap the top
DMA address to the size of the window, since the range [0-window size]
may not be available. Such devices can find an available DMA region, set
up the memory window, and request DMA address allocations in the found
range.
This is a preparation step; it does not change existing behavior.
The following public APIs gain a low_limit_pfn parameter:
* alloc_iova() - passes the parameter
* alloc_iova_fast() - passes the parameter
The following static functions called during DMA address allocation gain a
low_limit_pfn parameter:
* iova_rcache_get() - passes the parameter
* __alloc_and_insert_iova_range() - uses the parameter
* iova_magazine_pop() - uses the parameter
* __iova_rcache_get() - uses the parameter
diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index bf525d59e82e..f8b7eadeee05 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -1046,7 +1046,7 @@ void *tegra_drm_alloc(struct tegra_drm *tegra,
size_t size, dma_addr_t *dma)
alloc = alloc_iova(&tegra->carveout.domain,
size >> tegra->carveout.shift,
- tegra->carveout.limit, ALLOC_IOVA_ALIGN_SIZE);
+ 0, tegra->carveout.limit, ALLOC_IOVA_ALIGN_SIZE);
if (!alloc) {
err = -EBUSY;
goto free_pages;
diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index fbd647fc031c..d49b238b1708 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -97,7 +97,7 @@ static int host1x_pushbuffer_init(struct push_buffer *pb)
shift = iova_shift(&host1x->iova);
alloc = alloc_iova(&host1x->iova, size >> shift,
- host1x->iova_end >> shift, ALLOC_IOVA_ALIGN_SIZE);
+ 0, host1x->iova_end >> shift, ALLOC_IOVA_ALIGN_SIZE);
if (!alloc) {
err = -ENOMEM;
goto iommu_free_mem;
diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c
index ff5325d21fe8..715585b6ec67 100644
--- a/drivers/gpu/host1x/job.c
+++ b/drivers/gpu/host1x/job.c
@@ -242,7 +242,7 @@ static unsigned int pin_job(struct host1x *host,
struct host1x_job *job)
shift = iova_shift(&host->iova);
alloc = alloc_iova(&host->iova, gather_size >> shift,
- host->iova_end >> shift, ALLOC_IOVA_ALIGN_SIZE);
+ 0, host->iova_end >> shift, ALLOC_IOVA_ALIGN_SIZE);
if (!alloc) {
err = -ENOMEM;
goto put;
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index e78d7f8a2d61..414d31347fc2 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -810,7 +810,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
* expected address bits are wired up between the device and the IOMMU.
*/
if (dma_limit > DMA_BIT_MASK(32) && (size - 1) <= DMA_BIT_MASK(32) && dev->iommu->pci_32bit_workaround) {
- iova = alloc_iova_fast(iovad, iova_len,
+ iova = alloc_iova_fast(iovad, iova_len, 0,
DMA_BIT_MASK(32) >> shift, false, align);
if (iova)
goto done;
@@ -819,7 +819,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
dev_notice(dev, "Using %d-bit DMA addresses\n", bits_per(dma_limit));
}
- iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift, true, align);
+ iova = alloc_iova_fast(iovad, iova_len, 0, dma_limit >> shift, true, align);
done:
return (dma_addr_t)iova << shift;
}
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 09356d6065ef..e599cfc66bff 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -24,6 +24,7 @@ static bool iova_rcache_insert(struct iova_domain *iovad,
unsigned long size);
static unsigned long iova_rcache_get(struct iova_domain *iovad,
unsigned long size,
+ unsigned long low_limit_pfn,
unsigned long limit_pfn);
static void free_iova_rcaches(struct iova_domain *iovad);
static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
@@ -205,16 +206,18 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova,
}
static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
- unsigned long size, unsigned long limit_pfn,
- struct iova *new, iova_align_t align)
+ unsigned long size, unsigned long low_limit_pfn, unsigned long limit_pfn,
+ struct iova *new, iova_align_t align)
{
struct rb_node *curr, *prev, *start_search;
struct iova *curr_iova, *start_iova;
unsigned long flags;
unsigned long new_pfn, retry_pfn;
unsigned long align_mask;
- unsigned long high_pfn = limit_pfn, low_pfn = iovad->start_pfn;
+ unsigned long high_pfn = limit_pfn;
+ bool retried = false;
+ low_limit_pfn = max(low_limit_pfn, iovad->start_pfn);
switch (align) {
case ALLOC_IOVA_ALIGN_NONE: align_mask = ~0UL; break;
case ALLOC_IOVA_ALIGN_SIZE: align_mask = (~0UL) << fls_long(size - 1); break;
@@ -237,14 +240,15 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
prev = curr;
curr = rb_prev(curr);
curr_iova = to_iova(curr);
- } while (curr && new_pfn <= curr_iova->pfn_hi && new_pfn >= low_pfn);
+ } while (curr && new_pfn <= curr_iova->pfn_hi && new_pfn >= low_limit_pfn);
- if (high_pfn < size || new_pfn < low_pfn) {
- if (start_search != &iovad->anchor.node && low_pfn == iovad->start_pfn && retry_pfn < limit_pfn) {
+ if (high_pfn < size || new_pfn < low_limit_pfn) {
+ if (start_search != &iovad->anchor.node && !retried && retry_pfn < limit_pfn) {
high_pfn = limit_pfn;
- low_pfn = retry_pfn + 1;
+ low_limit_pfn = retry_pfn + 1;
curr = start_search = iova_find_limit(iovad, limit_pfn);
curr_iova = start_iova = to_iova(curr);
+ retried = true;
goto retry;
}
goto iova32_full;
@@ -297,6 +301,7 @@ static void free_iova_mem(struct iova *iova)
*/
struct iova *
alloc_iova(struct iova_domain *iovad, unsigned long size,
+ unsigned long low_limit_pfn,
unsigned long limit_pfn,
iova_align_t align)
{
@@ -307,7 +312,7 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
if (!new_iova)
return NULL;
- ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1,
+ ret = __alloc_and_insert_iova_range(iovad, size, low_limit_pfn, limit_pfn + 1,
new_iova, align);
if (ret) {
@@ -507,7 +512,8 @@ EXPORT_SYMBOL_GPL(free_iova);
*/
unsigned long
alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
- unsigned long limit_pfn, bool flush_rcache, iova_align_t align)
+ unsigned long low_limit_pfn, unsigned long limit_pfn,
+ bool flush_rcache, iova_align_t align)
{
unsigned long iova_pfn;
struct iova *new_iova;
@@ -521,12 +527,12 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
if (size < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
size = roundup_pow_of_two(size);
- iova_pfn = iova_rcache_get(iovad, size, limit_pfn + 1);
+ iova_pfn = iova_rcache_get(iovad, size, low_limit_pfn, limit_pfn + 1);
if (iova_pfn)
return iova_pfn;
retry:
- new_iova = alloc_iova(iovad, size, limit_pfn, align);
+ new_iova = alloc_iova(iovad, size, low_limit_pfn, limit_pfn, align);
if (!new_iova) {
unsigned int cpu;
@@ -780,13 +786,14 @@ static bool iova_magazine_empty(struct iova_magazine *mag)
}
static unsigned long iova_magazine_pop(struct iova_magazine *mag,
+ unsigned long low_limit_pfn,
unsigned long limit_pfn)
{
int i;
unsigned long pfn;
/* Only fall back to the rbtree if we have no suitable pfns at all */
- for (i = mag->size - 1; mag->pfns[i] > limit_pfn; i--)
+ for (i = mag->size - 1; (mag->pfns[i] > limit_pfn) || (mag->pfns[i] < low_limit_pfn); i--)
if (i == 0)
return 0;
@@ -953,6 +960,7 @@ static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
* it from the 'rcache'.
*/
static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
+ unsigned long low_limit_pfn,
unsigned long limit_pfn)
{
struct iova_cpu_rcache *cpu_rcache;
@@ -979,7 +987,7 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
}
if (has_pfn)
- iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
+ iova_pfn = iova_magazine_pop(cpu_rcache->loaded, low_limit_pfn, limit_pfn);
spin_unlock_irqrestore(&cpu_rcache->lock, flags);
@@ -993,6 +1001,7 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
*/
static unsigned long iova_rcache_get(struct iova_domain *iovad,
unsigned long size,
+ unsigned long low_limit_pfn,
unsigned long limit_pfn)
{
unsigned int log_size = order_base_2(size);
@@ -1000,7 +1009,7 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad,
if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
return 0;
- return __iova_rcache_get(&iovad->rcaches[log_size], limit_pfn - size);
+ return __iova_rcache_get(&iovad->rcaches[log_size], low_limit_pfn, limit_pfn - size);
}
/*
diff --git a/drivers/media/pci/intel/ipu6/ipu6-dma.c b/drivers/media/pci/intel/ipu6/ipu6-dma.c
index 4e2b98c4f348..24b677f73992 100644
--- a/drivers/media/pci/intel/ipu6/ipu6-dma.c
+++ b/drivers/media/pci/intel/ipu6/ipu6-dma.c
@@ -172,7 +172,7 @@ void *ipu6_dma_alloc(struct ipu6_bus_device *sys, size_t size,
count = PHYS_PFN(size);
iova = alloc_iova(&mmu->dmap->iovad, count,
- PHYS_PFN(mmu->dmap->mmu_info->aperture_end), ALLOC_IOVA_ALIGN_NONE);
+ 0, PHYS_PFN(mmu->dmap->mmu_info->aperture_end), ALLOC_IOVA_ALIGN_NONE);
if (!iova)
goto out_kfree;
@@ -398,7 +398,7 @@ int ipu6_dma_map_sg(struct ipu6_bus_device *sys, struct scatterlist *sglist,
nents, npages);
iova = alloc_iova(&mmu->dmap->iovad, npages,
- PHYS_PFN(mmu->dmap->mmu_info->aperture_end), ALLOC_IOVA_ALIGN_NONE);
+ 0, PHYS_PFN(mmu->dmap->mmu_info->aperture_end), ALLOC_IOVA_ALIGN_NONE);
if (!iova)
return 0;
diff --git a/drivers/media/pci/intel/ipu6/ipu6-mmu.c b/drivers/media/pci/intel/ipu6/ipu6-mmu.c
index 4d6f9b8d68bb..013d33a0f5dc 100644
--- a/drivers/media/pci/intel/ipu6/ipu6-mmu.c
+++ b/drivers/media/pci/intel/ipu6/ipu6-mmu.c
@@ -422,7 +422,7 @@ static int allocate_trash_buffer(struct ipu6_mmu *mmu)
/* Allocate 8MB in iova range */
iova = alloc_iova(&mmu->dmap->iovad, n_pages,
- PHYS_PFN(mmu->dmap->mmu_info->aperture_end), ALLOC_IOVA_ALIGN_NONE);
+ 0, PHYS_PFN(mmu->dmap->mmu_info->aperture_end), ALLOC_IOVA_ALIGN_NONE);
if (!iova) {
dev_err(mmu->dev, "cannot allocate iova range for trash\n");
return -ENOMEM;
diff --git a/drivers/media/platform/nvidia/tegra-vde/iommu.c b/drivers/media/platform/nvidia/tegra-vde/iommu.c
index ad010ad65735..ec687165e150 100644
--- a/drivers/media/platform/nvidia/tegra-vde/iommu.c
+++ b/drivers/media/platform/nvidia/tegra-vde/iommu.c
@@ -30,7 +30,7 @@ int tegra_vde_iommu_map(struct tegra_vde *vde,
size = iova_align(&vde->iova, size);
shift = iova_shift(&vde->iova);
- iova = alloc_iova(&vde->iova, size >> shift, end >> shift, ALLOC_IOVA_ALIGN_SIZE);
+ iova = alloc_iova(&vde->iova, size >> shift, 0, end >> shift, ALLOC_IOVA_ALIGN_SIZE);
if (!iova)
return -ENOMEM;
diff --git a/drivers/staging/media/ipu3/ipu3-dmamap.c b/drivers/staging/media/ipu3/ipu3-dmamap.c
index 330314a3aa94..fb42a6740f0e 100644
--- a/drivers/staging/media/ipu3/ipu3-dmamap.c
+++ b/drivers/staging/media/ipu3/ipu3-dmamap.c
@@ -105,7 +105,7 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map,
dev_dbg(dev, "%s: allocating %zu\n", __func__, size);
iova = alloc_iova(&imgu->iova_domain, size >> shift,
- imgu->mmu->aperture_end >> shift, ALLOC_IOVA_ALIGN_NONE);
+ 0, imgu->mmu->aperture_end >> shift, ALLOC_IOVA_ALIGN_NONE);
if (!iova)
return NULL;
@@ -205,7 +205,7 @@ int imgu_dmamap_map_sg(struct imgu_device *imgu, struct scatterlist *sglist,
nents, size >> shift);
iova = alloc_iova(&imgu->iova_domain, size >> shift,
- imgu->mmu->aperture_end >> shift, ALLOC_IOVA_ALIGN_NONE);
+ 0, imgu->mmu->aperture_end >> shift, ALLOC_IOVA_ALIGN_NONE);
if (!iova)
return -ENOMEM;
diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
index 96ce209762f9..feb130648888 100644
--- a/drivers/vdpa/vdpa_user/iova_domain.c
+++ b/drivers/vdpa/vdpa_user/iova_domain.c
@@ -362,7 +362,7 @@ vduse_domain_alloc_iova(struct iova_domain *iovad,
unsigned long iova_len = iova_align(iovad, size) >> shift;
unsigned long iova_pfn;
- iova_pfn = alloc_iova_fast(iovad, iova_len, limit >> shift, true, ALLOC_IOVA_ALIGN_SIZE);
+ iova_pfn = alloc_iova_fast(iovad, iova_len, 0, limit >> shift, true, ALLOC_IOVA_ALIGN_SIZE);
return (dma_addr_t)iova_pfn << shift;
}
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 0780a64e1149..d17b4901effc 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -97,8 +97,7 @@ unsigned long iova_rcache_range(void);
void free_iova(struct iova_domain *iovad, unsigned long pfn);
void __free_iova(struct iova_domain *iovad, struct iova *iova);
struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
- unsigned long limit_pfn,
- iova_align_t align);
+ unsigned long low_limit_pfn, unsigned long limit_pfn, iova_align_t align);
void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
unsigned long size);
@@ -109,7 +108,8 @@ ssize_t iovad_show_busy_regions(struct iova_domain *iovad, char *buf);
int iovad_get_lowest_free_address_range(struct iova_domain *iovad,
struct addr_range_query *query, u64 *res);
unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
- unsigned long limit_pfn, bool flush_rcache, iova_align_t align);
+ unsigned long low_limit_pfn, unsigned long limit_pfn,
+ bool flush_rcache, iova_align_t align);
struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
unsigned long pfn_hi);
void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
@@ -147,6 +147,7 @@ int iovad_get_lowest_free_address_range(struct iova_domain *iovad, struct addr_r
static inline struct iova *alloc_iova(struct iova_domain *iovad,
unsigned long size,
+ unsigned long low_limit_pfn,
unsigned long limit_pfn,
iova_align_t align)
{
@@ -161,6 +162,7 @@ static inline void free_iova_fast(struct iova_domain *iovad,
static inline unsigned long alloc_iova_fast(struct iova_domain *iovad,
unsigned long size,
+ unsigned long low_limit_pfn,
unsigned long limit_pfn,
bool flush_rcache, iova_align_t align)
{
--
----------------------------------------
Constantine Gavrilov
System Architect and Platform Engineer
Infinidat
----------------------------------------
This is the ninth patch from the set of patches that enable large IOMMU
DMA registrations. The entire work is available on the master branch of
the git@github.com:cgavrilov/linux.git repo.
Some devices (such as NTB or GPU devices) allow mapping of system memory to
PCIe BARs, which makes it possible to build PCIe interconnects when the devices
are connected to more than one root complex. After one root complex sets up the
mapping, an application on another root complex can access the memory through
the PCIe BAR of the device. Since a typical system memory mapping uses offset
translation (between the device BAR address and the DMA address), the device
driver needs to know which contiguous DMA address range is available to satisfy
the device needs before it can set up the mapping offset. After querying the
available range, the device driver can set up the mapping translation and use
the low and top DMA address constraints to ensure that future DMA allocation
APIs will return DMA addresses within the selected range.
This patch adds min_dma_addr to the device structure and uses the added field
in the alloc_iova_fast() calls made by iommu_dma_alloc_iova().
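As a usage sketch (not part of the patch; the window base, window size, and the
way the free range was found are assumptions), a driver would pin the
allocation window on its device before issuing mappings:

/*
 * Sketch only: constrain future DMA allocations for @dev to
 * [win_dma_base, win_dma_base + win_size). Discovering win_dma_base (for
 * example with the range-query helper added earlier in this series) is
 * assumed to have already happened.
 */
#include <linux/bitops.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>

static int example_pin_dma_window(struct device *dev, u64 win_dma_base,
				  u64 win_size)
{
	/* Lower bound: the per-device field added by this patch. */
	dma_set_min_dma_addr(dev, win_dma_base);

	/*
	 * Upper bound: as the commit message suggests, the DMA mask caps the
	 * top address; the mask here is rounded up to cover the window top.
	 */
	return dma_set_mask_and_coherent(dev,
			DMA_BIT_MASK(fls64(win_dma_base + win_size - 1)));
}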
commit 0bebd4d0829b941fc38c1311efa7309c033968e4 (HEAD -> master, github/master)
Author: Constantine Gavrilov <cgavrilov@infinidat.com>
Date: Tue Jul 1 15:29:40 2025 +0300
Add min_dma_addr to the device structure and use it in
iommu_dma_alloc_iova().
Devices that map memory windows to PCI bars (for example NTB devices)
may need this change. This is because such devices may use very large
memory windows (terabytes in size), and they cannot simply cap the top
DMA address to the size of the window, since the range [0-window size]
may not be available. Such devices can find an available DMA region by
calling iommu_domain_get_lowest_free_address_range(), set up the memory
window, and request DMA address allocations in the found range by calling
dma_set_min_dma_addr() and dma_set_mask_and_coherent().
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 414d31347fc2..077d1cd0939d 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -810,7 +810,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
* expected address bits are wired up between the device and the IOMMU.
*/
if (dma_limit > DMA_BIT_MASK(32) && (size - 1) <= DMA_BIT_MASK(32) && dev->iommu->pci_32bit_workaround) {
- iova = alloc_iova_fast(iovad, iova_len, 0,
+ iova = alloc_iova_fast(iovad, iova_len, dev->min_dma_addr,
DMA_BIT_MASK(32) >> shift, false, align);
if (iova)
goto done;
@@ -819,7 +819,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
dev_notice(dev, "Using %d-bit DMA addresses\n", bits_per(dma_limit));
}
- iova = alloc_iova_fast(iovad, iova_len, 0, dma_limit >> shift, true, align);
+ iova = alloc_iova_fast(iovad, iova_len, dev->min_dma_addr, dma_limit >> shift, true, align);
done:
return (dma_addr_t)iova << shift;
}
diff --git a/include/linux/device.h b/include/linux/device.h
index 0470d19da7f2..339c9187b033 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -621,6 +621,7 @@ struct device {
64 bit addresses for consistent
allocations such descriptors. */
u64 bus_dma_limit; /* upstream dma constraint */
+ u64 min_dma_addr; /* force minimal value for DMA address */
const struct bus_dma_region *dma_range_map;
struct device_dma_parameters *dma_parms;
@@ -690,6 +691,8 @@ struct device {
#endif
};
+#define dma_set_min_dma_addr(__dev__, __val__) __dev__->min_dma_addr = __val__
+
/**
* struct device_link - Device link representation.
* @supplier: The device on the supplier end of the link.
--
----------------------------------------
Constantine Gavrilov
System Architect and Platform Engineer
Infinidat
----------------------------------------