Handle DMA map/unmap operations up to the addressable limit by comparing
against inclusive end-of-range limits, and changing iteration to
perform relative traversals across range sizes, rather than absolute
traversals across addresses.
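To illustrate: a mapping that ends at the top of the iova space, e.g.
iova = ~(dma_addr_t)0 - PAGE_SIZE + 1 with size = PAGE_SIZE, has an
exclusive end iova + size that wraps to zero, so exclusive-end comparisons
and absolute-address loop bounds both misbehave for it. The inclusive
comparisons and size-relative loops used below, e.g.

	if (start + size - 1 < dma->iova)
		node = node->rb_left;

	while (pos < dma->size) {
		dma_addr_t iova = dma->iova + pos;
		/* ... */
		pos += unmapped;
	}

stay well-defined for any non-zero-sized range.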
vfio_link_dma inserts a zero-sized vfio_dma into the rb-tree, and is
only used for that purpose, so discard the size from consideration for
the insertion point.
Signed-off-by: Alex Mastro <amastro@fb.com>
---
drivers/vfio/vfio_iommu_type1.c | 77 ++++++++++++++++++++++-------------------
1 file changed, 42 insertions(+), 35 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 48b84a7af2e1..a65625dcf708 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -166,12 +166,14 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
{
struct rb_node *node = iommu->dma_list.rb_node;
+ WARN_ON(!size);
+
while (node) {
struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
- if (start + size <= dma->iova)
+ if (start + size - 1 < dma->iova)
node = node->rb_left;
- else if (start >= dma->iova + dma->size)
+ else if (start > dma->iova + dma->size - 1)
node = node->rb_right;
else
return dma;
@@ -181,16 +183,19 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
}
static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
- dma_addr_t start, size_t size)
+ dma_addr_t start,
+ dma_addr_t end)
{
struct rb_node *res = NULL;
struct rb_node *node = iommu->dma_list.rb_node;
struct vfio_dma *dma_res = NULL;
+ WARN_ON(end < start);
+
while (node) {
struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
- if (start < dma->iova + dma->size) {
+ if (start <= dma->iova + dma->size - 1) {
res = node;
dma_res = dma;
if (start >= dma->iova)
@@ -200,7 +205,7 @@ static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
node = node->rb_right;
}
}
- if (res && size && dma_res->iova >= start + size)
+ if (res && dma_res->iova > end)
res = NULL;
return res;
}
@@ -210,11 +215,13 @@ static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
struct vfio_dma *dma;
+ WARN_ON(new->size != 0);
+
while (*link) {
parent = *link;
dma = rb_entry(parent, struct vfio_dma, node);
- if (new->iova + new->size <= dma->iova)
+ if (new->iova <= dma->iova)
link = &(*link)->rb_left;
else
link = &(*link)->rb_right;
@@ -1071,12 +1078,12 @@ static size_t unmap_unpin_slow(struct vfio_domain *domain,
static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
bool do_accounting)
{
- dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
struct vfio_domain *domain, *d;
LIST_HEAD(unmapped_region_list);
struct iommu_iotlb_gather iotlb_gather;
int unmapped_region_cnt = 0;
long unlocked = 0;
+ size_t pos = 0;
if (!dma->size)
return 0;
@@ -1100,13 +1107,14 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
}
iommu_iotlb_gather_init(&iotlb_gather);
- while (iova < end) {
+ while (pos < dma->size) {
size_t unmapped, len;
phys_addr_t phys, next;
+ dma_addr_t iova = dma->iova + pos;
phys = iommu_iova_to_phys(domain->domain, iova);
if (WARN_ON(!phys)) {
- iova += PAGE_SIZE;
+ pos += PAGE_SIZE;
continue;
}
@@ -1115,7 +1123,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
* may require hardware cache flushing, try to find the
* largest contiguous physical memory chunk to unmap.
*/
- for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) {
+ for (len = PAGE_SIZE; pos + len < dma->size; len += PAGE_SIZE) {
next = iommu_iova_to_phys(domain->domain, iova + len);
if (next != phys + len)
break;
@@ -1136,7 +1144,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
break;
}
- iova += unmapped;
+ pos += unmapped;
}
dma->iommu_mapped = false;
@@ -1228,7 +1236,7 @@ static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
}
static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
- dma_addr_t iova, size_t size, size_t pgsize)
+ dma_addr_t iova, dma_addr_t iova_end, size_t pgsize)
{
struct vfio_dma *dma;
struct rb_node *n;
@@ -1245,8 +1253,8 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
if (dma && dma->iova != iova)
return -EINVAL;
- dma = vfio_find_dma(iommu, iova + size - 1, 0);
- if (dma && dma->iova + dma->size != iova + size)
+ dma = vfio_find_dma(iommu, iova_end, 1);
+ if (dma && dma->iova + dma->size - 1 != iova_end)
return -EINVAL;
for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
@@ -1255,7 +1263,7 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
if (dma->iova < iova)
continue;
- if (dma->iova > iova + size - 1)
+ if (dma->iova > iova_end)
break;
ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
@@ -1348,7 +1356,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (unmap_all) {
if (iova || size)
goto unlock;
- size = SIZE_MAX;
+ iova_end = ~(dma_addr_t)0;
} else {
if (!size || size & (pgsize - 1))
goto unlock;
@@ -1403,17 +1411,17 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (dma && dma->iova != iova)
goto unlock;
- dma = vfio_find_dma(iommu, iova_end, 0);
- if (dma && dma->iova + dma->size != iova + size)
+ dma = vfio_find_dma(iommu, iova_end, 1);
+ if (dma && dma->iova + dma->size - 1 != iova_end)
goto unlock;
}
ret = 0;
- n = first_n = vfio_find_dma_first_node(iommu, iova, size);
+ n = first_n = vfio_find_dma_first_node(iommu, iova, iova_end);
while (n) {
dma = rb_entry(n, struct vfio_dma, node);
- if (dma->iova >= iova + size)
+ if (dma->iova > iova_end)
break;
if (!iommu->v2 && iova > dma->iova)
@@ -1743,12 +1751,12 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
for (; n; n = rb_next(n)) {
struct vfio_dma *dma;
- dma_addr_t iova;
+ size_t pos = 0;
dma = rb_entry(n, struct vfio_dma, node);
- iova = dma->iova;
- while (iova < dma->iova + dma->size) {
+ while (pos < dma->size) {
+ dma_addr_t iova = dma->iova + pos;
phys_addr_t phys;
size_t size;
@@ -1764,14 +1772,14 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
phys = iommu_iova_to_phys(d->domain, iova);
if (WARN_ON(!phys)) {
- iova += PAGE_SIZE;
+ pos += PAGE_SIZE;
continue;
}
size = PAGE_SIZE;
p = phys + size;
i = iova + size;
- while (i < dma->iova + dma->size &&
+ while (pos + size < dma->size &&
p == iommu_iova_to_phys(d->domain, i)) {
size += PAGE_SIZE;
p += PAGE_SIZE;
@@ -1779,9 +1787,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
}
} else {
unsigned long pfn;
- unsigned long vaddr = dma->vaddr +
- (iova - dma->iova);
- size_t n = dma->iova + dma->size - iova;
+ unsigned long vaddr = dma->vaddr + pos;
+ size_t n = dma->size - pos;
long npage;
npage = vfio_pin_pages_remote(dma, vaddr,
@@ -1812,7 +1819,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
goto unwind;
}
- iova += size;
+ pos += size;
}
}
@@ -1829,29 +1836,29 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
unwind:
for (; n; n = rb_prev(n)) {
struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
- dma_addr_t iova;
+ size_t pos = 0;
if (dma->iommu_mapped) {
iommu_unmap(domain->domain, dma->iova, dma->size);
continue;
}
- iova = dma->iova;
- while (iova < dma->iova + dma->size) {
+ while (pos < dma->size) {
+ dma_addr_t iova = dma->iova + pos;
phys_addr_t phys, p;
size_t size;
dma_addr_t i;
phys = iommu_iova_to_phys(domain->domain, iova);
if (!phys) {
- iova += PAGE_SIZE;
+ pos += PAGE_SIZE;
continue;
}
size = PAGE_SIZE;
p = phys + size;
i = iova + size;
- while (i < dma->iova + dma->size &&
+ while (pos + size < dma->size &&
p == iommu_iova_to_phys(domain->domain, i)) {
size += PAGE_SIZE;
p += PAGE_SIZE;
@@ -2989,7 +2996,7 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
if (iommu->dirty_page_tracking)
ret = vfio_iova_dirty_bitmap(range.bitmap.data,
- iommu, iova, size,
+ iommu, iova, iova_end,
range.bitmap.pgsize);
else
ret = -EINVAL;
--
2.47.3
Hi Alex,
On 10/13/25 1:32 AM, Alex Mastro wrote:
> Handle DMA map/unmap operations up to the addressable limit by comparing
> against inclusive end-of-range limits, and changing iteration to
> perform relative traversals across range sizes, rather than absolute
> traversals across addresses.
>
> vfio_link_dma inserts a zero-sized vfio_dma into the rb-tree, and is
> only used for that purpose, so discard the size from consideration for
> the insertion point.
I made a small comment about this on the corresponding code below.
>
> Signed-off-by: Alex Mastro <amastro@fb.com>
> ---
> drivers/vfio/vfio_iommu_type1.c | 77 ++++++++++++++++++++++-------------------
> 1 file changed, 42 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 48b84a7af2e1..a65625dcf708 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -166,12 +166,14 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
> {
> struct rb_node *node = iommu->dma_list.rb_node;
>
> + WARN_ON(!size);
> +
> while (node) {
> struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
>
> - if (start + size <= dma->iova)
> + if (start + size - 1 < dma->iova)
> node = node->rb_left;
> - else if (start >= dma->iova + dma->size)
> + else if (start > dma->iova + dma->size - 1)
> node = node->rb_right;
> else
> return dma;
> @@ -181,16 +183,19 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
> }
>
> static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
> - dma_addr_t start, size_t size)
> + dma_addr_t start,
> + dma_addr_t end)
> {
> struct rb_node *res = NULL;
> struct rb_node *node = iommu->dma_list.rb_node;
> struct vfio_dma *dma_res = NULL;
>
> + WARN_ON(end < start);
> +
> while (node) {
> struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
>
> - if (start < dma->iova + dma->size) {
> + if (start <= dma->iova + dma->size - 1) {
> res = node;
> dma_res = dma;
> if (start >= dma->iova)
> @@ -200,7 +205,7 @@ static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
> node = node->rb_right;
> }
> }
> - if (res && size && dma_res->iova >= start + size)
> + if (res && dma_res->iova > end)
> res = NULL;
> return res;
> }
> @@ -210,11 +215,13 @@ static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
> struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
> struct vfio_dma *dma;
>
> + WARN_ON(new->size != 0);
> +
> while (*link) {
> parent = *link;
> dma = rb_entry(parent, struct vfio_dma, node);
>
> - if (new->iova + new->size <= dma->iova)
> + if (new->iova <= dma->iova)
It is possible I missed a previous thread where this was already
discussed, but why are we adding this new restriction that
vfio_link_dma() will _always_ be called with dma->size = 0? I know it is
the case now, but is there a reason why future code could not try to
insert a non-zero sized node?
Would it be more fitting to add overflow protection here too, as is done
for other code paths in the file? I know the WARN_ON() above will make us
aware if there is ever another caller that attempts to use size != 0, so
this is more of a nit about consistency than a concern about correctness.
Thank you,
Alejandro
> link = &(*link)->rb_left;
> else
> link = &(*link)->rb_right;
> @@ -1071,12 +1078,12 @@ static size_t unmap_unpin_slow(struct vfio_domain *domain,
> static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> bool do_accounting)
> {
> - dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
> struct vfio_domain *domain, *d;
> LIST_HEAD(unmapped_region_list);
> struct iommu_iotlb_gather iotlb_gather;
> int unmapped_region_cnt = 0;
> long unlocked = 0;
> + size_t pos = 0;
>
> if (!dma->size)
> return 0;
> @@ -1100,13 +1107,14 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> }
>
> iommu_iotlb_gather_init(&iotlb_gather);
> - while (iova < end) {
> + while (pos < dma->size) {
> size_t unmapped, len;
> phys_addr_t phys, next;
> + dma_addr_t iova = dma->iova + pos;
>
> phys = iommu_iova_to_phys(domain->domain, iova);
> if (WARN_ON(!phys)) {
> - iova += PAGE_SIZE;
> + pos += PAGE_SIZE;
> continue;
> }
>
> @@ -1115,7 +1123,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> * may require hardware cache flushing, try to find the
> * largest contiguous physical memory chunk to unmap.
> */
> - for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) {
> + for (len = PAGE_SIZE; pos + len < dma->size; len += PAGE_SIZE) {
> next = iommu_iova_to_phys(domain->domain, iova + len);
> if (next != phys + len)
> break;
> @@ -1136,7 +1144,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> break;
> }
>
> - iova += unmapped;
> + pos += unmapped;
> }
>
> dma->iommu_mapped = false;
> @@ -1228,7 +1236,7 @@ static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
> }
>
> static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
> - dma_addr_t iova, size_t size, size_t pgsize)
> + dma_addr_t iova, dma_addr_t iova_end, size_t pgsize)
> {
> struct vfio_dma *dma;
> struct rb_node *n;
> @@ -1245,8 +1253,8 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
> if (dma && dma->iova != iova)
> return -EINVAL;
>
> - dma = vfio_find_dma(iommu, iova + size - 1, 0);
> - if (dma && dma->iova + dma->size != iova + size)
> + dma = vfio_find_dma(iommu, iova_end, 1);
> + if (dma && dma->iova + dma->size - 1 != iova_end)
> return -EINVAL;
>
> for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
> @@ -1255,7 +1263,7 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
> if (dma->iova < iova)
> continue;
>
> - if (dma->iova > iova + size - 1)
> + if (dma->iova > iova_end)
> break;
>
> ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
> @@ -1348,7 +1356,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> if (unmap_all) {
> if (iova || size)
> goto unlock;
> - size = SIZE_MAX;
> + iova_end = ~(dma_addr_t)0;
> } else {
> if (!size || size & (pgsize - 1))
> goto unlock;
> @@ -1403,17 +1411,17 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> if (dma && dma->iova != iova)
> goto unlock;
>
> - dma = vfio_find_dma(iommu, iova_end, 0);
> - if (dma && dma->iova + dma->size != iova + size)
> + dma = vfio_find_dma(iommu, iova_end, 1);
> + if (dma && dma->iova + dma->size - 1 != iova_end)
> goto unlock;
> }
>
> ret = 0;
> - n = first_n = vfio_find_dma_first_node(iommu, iova, size);
> + n = first_n = vfio_find_dma_first_node(iommu, iova, iova_end);
>
> while (n) {
> dma = rb_entry(n, struct vfio_dma, node);
> - if (dma->iova >= iova + size)
> + if (dma->iova > iova_end)
> break;
>
> if (!iommu->v2 && iova > dma->iova)
> @@ -1743,12 +1751,12 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>
> for (; n; n = rb_next(n)) {
> struct vfio_dma *dma;
> - dma_addr_t iova;
> + size_t pos = 0;
>
> dma = rb_entry(n, struct vfio_dma, node);
> - iova = dma->iova;
>
> - while (iova < dma->iova + dma->size) {
> + while (pos < dma->size) {
> + dma_addr_t iova = dma->iova + pos;
> phys_addr_t phys;
> size_t size;
>
> @@ -1764,14 +1772,14 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> phys = iommu_iova_to_phys(d->domain, iova);
>
> if (WARN_ON(!phys)) {
> - iova += PAGE_SIZE;
> + pos += PAGE_SIZE;
> continue;
> }
>
> size = PAGE_SIZE;
> p = phys + size;
> i = iova + size;
> - while (i < dma->iova + dma->size &&
> + while (pos + size < dma->size &&
> p == iommu_iova_to_phys(d->domain, i)) {
> size += PAGE_SIZE;
> p += PAGE_SIZE;
> @@ -1779,9 +1787,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> }
> } else {
> unsigned long pfn;
> - unsigned long vaddr = dma->vaddr +
> - (iova - dma->iova);
> - size_t n = dma->iova + dma->size - iova;
> + unsigned long vaddr = dma->vaddr + pos;
> + size_t n = dma->size - pos;
> long npage;
>
> npage = vfio_pin_pages_remote(dma, vaddr,
> @@ -1812,7 +1819,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> goto unwind;
> }
>
> - iova += size;
> + pos += size;
> }
> }
>
> @@ -1829,29 +1836,29 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> unwind:
> for (; n; n = rb_prev(n)) {
> struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
> - dma_addr_t iova;
> + size_t pos = 0;
>
> if (dma->iommu_mapped) {
> iommu_unmap(domain->domain, dma->iova, dma->size);
> continue;
> }
>
> - iova = dma->iova;
> - while (iova < dma->iova + dma->size) {
> + while (pos < dma->size) {
> + dma_addr_t iova = dma->iova + pos;
> phys_addr_t phys, p;
> size_t size;
> dma_addr_t i;
>
> phys = iommu_iova_to_phys(domain->domain, iova);
> if (!phys) {
> - iova += PAGE_SIZE;
> + pos += PAGE_SIZE;
> continue;
> }
>
> size = PAGE_SIZE;
> p = phys + size;
> i = iova + size;
> - while (i < dma->iova + dma->size &&
> + while (pos + size < dma->size &&
> p == iommu_iova_to_phys(domain->domain, i)) {
> size += PAGE_SIZE;
> p += PAGE_SIZE;
> @@ -2989,7 +2996,7 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
>
> if (iommu->dirty_page_tracking)
> ret = vfio_iova_dirty_bitmap(range.bitmap.data,
> - iommu, iova, size,
> + iommu, iova, iova_end,
> range.bitmap.pgsize);
> else
> ret = -EINVAL;
>
On Tue, Oct 21, 2025 at 06:18:00PM -0400, Alejandro Jimenez wrote:
> > @@ -210,11 +215,13 @@ static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
> > struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
> > struct vfio_dma *dma;
> > + WARN_ON(new->size != 0);
> > +
> > while (*link) {
> > parent = *link;
> > dma = rb_entry(parent, struct vfio_dma, node);
> > - if (new->iova + new->size <= dma->iova)
> > + if (new->iova <= dma->iova)
> It is possible I missed a previous thread where this was already discussed,
> but why are we adding this new restriction that vfio_link_dma() will
> _always_ be called with dma->size = 0? I know it is the case now, but is
> there a reason why future code could not try to insert a non-zero sized
> node?
Perhaps the WARN_ON is too coddling, but given that this helper is used for
exactly one purpose today, the intent is to make a future user stop and
consider what they're doing before deviating from the current usage.
iommu->dma_list's invariant is that all elements have non-overlapping iova
ranges, which is currently enforced pre-insertion in vfio_dma_do_map by the
vfio_find_dma check. After vfio_pin_map_dma returns, the vfio_dma has either
been grown to its full size or been removed from iommu->dma_list on error
via vfio_remove_dma.
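Roughly, with locking and error handling elided, the only flow today looks
like this (simplified from vfio_dma_do_map):

	if (vfio_find_dma(iommu, iova, size))	/* reject overlapping ranges */
		return -EEXIST;

	dma = kzalloc(sizeof(*dma), GFP_KERNEL);	/* dma->size starts at 0 */
	dma->iova = iova;
	dma->vaddr = vaddr;
	vfio_link_dma(iommu, dma);		/* the only zero-sized insert */

	ret = vfio_pin_map_dma(iommu, dma, size);
	/* grows dma->size as pages are pinned and mapped, or calls
	 * vfio_remove_dma() on failure */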
> Would it be more fitting to add overflow protection here too, as is done
> for other code paths in the file? I know the WARN_ON() above will make us
> aware if there is ever another caller that attempts to use size != 0, so
> this is more of a nit about consistency than a concern about correctness.
The other code paths which check for overflow focus on sanitizing args at
the vfio_iommu_driver_ops boundary. Since this helper is downstream from those
existing checks, and given its specificity, I'm not sure additional checks here
would be helpful.
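For reference, the kind of boundary sanitization I mean is roughly the
following at the ioctl argument checks (sketch only):

	dma_addr_t iova_end;

	/* reject empty ranges and ranges whose inclusive end would wrap */
	if (!size || check_add_overflow(iova, size - 1, &iova_end))
		return -EINVAL;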