vfio-pci supports huge_fault for PCI MMIO BARs and will insert pud and
pmd mappings for well-aligned mappings. follow_pfnmap_start() walks the
page table and therefore knows the page mask of the level where the
address is found and returns this through follow_pfnmap_args.pgmask.
Subsequent pfns from this address until the end of the mapping page are
necessarily consecutive. Use this information to retrieve a range of
pfnmap pfns in a single pass.
With optimal mappings and alignment on systems with 1GB pud and 4KB
page size, this reduces iterations for DMA mapping PCI BARs by a
factor of 256K. In real-world testing, the overhead of iterating
pfns for a VM DMA mapping a 32GB PCI BAR is reduced from ~1s to
sub-millisecond.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
drivers/vfio/vfio_iommu_type1.c | 24 +++++++++++++++++-------
1 file changed, 17 insertions(+), 7 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 939920454da7..6f3e8d981311 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -520,7 +520,7 @@ static void vfio_batch_fini(struct vfio_batch *batch)
static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
unsigned long vaddr, unsigned long *pfn,
- bool write_fault)
+ unsigned long *pgmask, bool write_fault)
{
struct follow_pfnmap_args args = { .vma = vma, .address = vaddr };
int ret;
@@ -544,10 +544,12 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
return ret;
}
- if (write_fault && !args.writable)
+ if (write_fault && !args.writable) {
ret = -EFAULT;
- else
+ } else {
*pfn = args.pfn;
+ *pgmask = args.pgmask;
+ }
follow_pfnmap_end(&args);
return ret;
@@ -590,15 +592,23 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
vma = vma_lookup(mm, vaddr);
if (vma && vma->vm_flags & VM_PFNMAP) {
- ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
+ unsigned long pgmask;
+
+ ret = follow_fault_pfn(vma, mm, vaddr, pfn, &pgmask,
+ prot & IOMMU_WRITE);
if (ret == -EAGAIN)
goto retry;
if (!ret) {
- if (is_invalid_reserved_pfn(*pfn))
- ret = 1;
- else
+ if (is_invalid_reserved_pfn(*pfn)) {
+ unsigned long epfn;
+
+ epfn = (((*pfn << PAGE_SHIFT) + ~pgmask + 1)
+ & pgmask) >> PAGE_SHIFT;
+ ret = min_t(int, npages, epfn - *pfn);
+ } else {
ret = -EFAULT;
+ }
}
}
done:
--
2.47.1
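For illustration, a minimal standalone userspace sketch of the range
arithmetic the new hunk performs, using hypothetical constants and a
made-up pfn (this is not part of the patch):

#include <stdio.h>

#define PAGE_SHIFT      12
#define PUD_SHIFT       30                      /* 1GB mapping level, 4KB pages */
#define PUD_MASK        (~((1UL << PUD_SHIFT) - 1))

int main(void)
{
        unsigned long pfn = 0x180003;           /* arbitrary pfn inside a 1GB PUD mapping */
        unsigned long pgmask = PUD_MASK;        /* as returned via follow_pfnmap_args.pgmask */
        unsigned long npages = 1UL << 20;       /* caller asked for 4GB worth of pages */
        unsigned long epfn, ret;

        /* Round the pfn's address up to the next boundary of its mapping level... */
        epfn = (((pfn << PAGE_SHIFT) + ~pgmask + 1) & pgmask) >> PAGE_SHIFT;
        /* ...every pfn below that boundary is necessarily consecutive. */
        ret = npages < epfn - pfn ? npages : epfn - pfn;

        printf("pfn 0x%lx: %lu consecutive pfns usable\n", pfn, ret);
        return 0;
}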
On Wed, Feb 05, 2025 at 04:17:21PM -0700, Alex Williamson wrote:
> + if (is_invalid_reserved_pfn(*pfn)) {
> + unsigned long epfn;
> +
> + epfn = (((*pfn << PAGE_SHIFT) + ~pgmask + 1)
> + & pgmask) >> PAGE_SHIFT;
> + ret = min_t(int, npages, epfn - *pfn);
You've really made life hard for yourself by passing around a page mask
instead of an order (ie 0/PMD_ORDER/PUD_ORDER). Why not:
epfn = round_up(*pfn + 1, 1 << order);
Although if you insist on passing around a mask, this could be:
unsigned long sz = (~pgmask >> PAGE_SHIFT) + 1;
unsigned long epfn = round_up(*pfn + 1, sz)
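A quick standalone check that the mask-based round_up() variant
suggested here lands on the same end pfn as the expression in the patch
(the order-based variant is the same with 1 << order in place of sz;
hypothetical values, round_up() open-coded the way include/linux/math.h
defines it):

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT      12
/* power-of-two round up, as in the kernel's round_up() */
#define round_up(x, y)  ((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
        unsigned long pgmask = ~((1UL << 21) - 1);      /* e.g. a 2MB PMD mapping */
        unsigned long sz = (~pgmask >> PAGE_SHIFT) + 1; /* mapping size in pfns */
        unsigned long pfn;

        for (pfn = 0x200; pfn < 0x600; pfn++) {
                unsigned long mask_form = (((pfn << PAGE_SHIFT) + ~pgmask + 1)
                                           & pgmask) >> PAGE_SHIFT;

                assert(mask_form == round_up(pfn + 1, sz));
        }
        printf("forms agree\n");
        return 0;
}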
On Fri, 14 Feb 2025 19:46:22 +0000
Matthew Wilcox <willy@infradead.org> wrote:
> On Wed, Feb 05, 2025 at 04:17:21PM -0700, Alex Williamson wrote:
> > + if (is_invalid_reserved_pfn(*pfn)) {
> > + unsigned long epfn;
> > +
> > + epfn = (((*pfn << PAGE_SHIFT) + ~pgmask + 1)
> > + & pgmask) >> PAGE_SHIFT;
> > + ret = min_t(int, npages, epfn - *pfn);
>
> You've really made life hard for yourself by passing around a page mask
> instead of an order (ie 0/PMD_ORDER/PUD_ORDER). Why not:
>
> epfn = round_up(*pfn + 1, 1 << order);
>
> Although if you insist on passing around a mask, this could be:
>
> unsigned long sz = (~pgmask >> PAGE_SHIFT) + 1;
> unsigned long epfn = round_up(*pfn + 1, sz)
>
Hey Willy!
I was wishing I had an order, but I didn't want to mangle
follow_pfnmap_start() and follow_pfnmap_setup() too much. Currently
the latter is doing:
args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
If follow_pfnmap_start() passed an order, this would need to change to
something equivalent to:
args->pfn = pfn_base + ((args->address >> PAGE_SHIFT) & ((1UL << order) - 1));
Looks pretty ugly as well, so maybe I'm just shifting the ugliness
around, or maybe someone can spot a more elegant representation.
Thanks,
Alex
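For what it's worth, a standalone sketch checking that the two offset
computations above are interchangeable when addr_mask covers
order + PAGE_SHIFT bits (hypothetical values, only to illustrate the
equivalence described here):

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT      12

int main(void)
{
        unsigned int order = 18;        /* e.g. 1GB PUD with 4KB pages: 30 - 12 */
        unsigned long addr_mask = ~((1UL << (order + PAGE_SHIFT)) - 1);
        unsigned long address;

        for (address = 0x7f0000000000UL;
             address < 0x7f0000000000UL + (64UL << PAGE_SHIFT);
             address += 1UL << PAGE_SHIFT) {
                unsigned long mask_off  = (address & ~addr_mask) >> PAGE_SHIFT;
                unsigned long order_off = (address >> PAGE_SHIFT) & ((1UL << order) - 1);

                assert(mask_off == order_off);
        }
        printf("offsets agree\n");
        return 0;
}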
On Wed, Feb 05, 2025 at 04:17:21PM -0700, Alex Williamson wrote:
> @@ -590,15 +592,23 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
> vma = vma_lookup(mm, vaddr);
>
> if (vma && vma->vm_flags & VM_PFNMAP) {
> - ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
> + unsigned long pgmask;
> +
> + ret = follow_fault_pfn(vma, mm, vaddr, pfn, &pgmask,
> + prot & IOMMU_WRITE);
> if (ret == -EAGAIN)
> goto retry;
>
> if (!ret) {
> - if (is_invalid_reserved_pfn(*pfn))
> - ret = 1;
> - else
> + if (is_invalid_reserved_pfn(*pfn)) {
> + unsigned long epfn;
> +
> + epfn = (((*pfn << PAGE_SHIFT) + ~pgmask + 1)
> + & pgmask) >> PAGE_SHIFT;
That seems a bit indirect
epfn = ((*pfn) | (~pgmask >> PAGE_SHIFT)) + 1;
?
> + ret = min_t(int, npages, epfn - *pfn);
It is nitty but the int's here should be long, and npages should be
unsigned long..
Jason
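A quick standalone check that the OR-based form suggested here matches
the mask expression from the patch (hypothetical values):

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT      12

int main(void)
{
        unsigned long pgmask = ~((1UL << 30) - 1);      /* e.g. a 1GB PUD mapping */
        unsigned long pfn;

        for (pfn = 0x100000; pfn < 0x140000; pfn += 0x1001) {
                unsigned long original = (((pfn << PAGE_SHIFT) + ~pgmask + 1)
                                          & pgmask) >> PAGE_SHIFT;
                unsigned long simpler  = (pfn | (~pgmask >> PAGE_SHIFT)) + 1;

                assert(original == simpler);
        }
        printf("expressions agree\n");
        return 0;
}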
On Fri, 14 Feb 2025 15:27:04 -0400
Jason Gunthorpe <jgg@ziepe.ca> wrote:
> On Wed, Feb 05, 2025 at 04:17:21PM -0700, Alex Williamson wrote:
> > @@ -590,15 +592,23 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
> > vma = vma_lookup(mm, vaddr);
> >
> > if (vma && vma->vm_flags & VM_PFNMAP) {
> > - ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
> > + unsigned long pgmask;
> > +
> > + ret = follow_fault_pfn(vma, mm, vaddr, pfn, &pgmask,
> > + prot & IOMMU_WRITE);
> > if (ret == -EAGAIN)
> > goto retry;
> >
> > if (!ret) {
> > - if (is_invalid_reserved_pfn(*pfn))
> > - ret = 1;
> > - else
> > + if (is_invalid_reserved_pfn(*pfn)) {
> > + unsigned long epfn;
> > +
> > + epfn = (((*pfn << PAGE_SHIFT) + ~pgmask + 1)
> > + & pgmask) >> PAGE_SHIFT;
>
> That seems a bit indirect
>
> epfn = ((*pfn) | (~pgmask >> PAGE_SHIFT)) + 1;
>
> ?
That is simpler, for sure. Thanks!
> > + ret = min_t(int, npages, epfn - *pfn);
>
> It is nitty but the int's here should be long, and npages should be
> unsigned long..
Added a new patch that uses unsigned long consistently for passed page
counts and long for returns. Now we just need a system with a 16TiB
huge page size. Thanks,
Alex
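(Back-of-the-envelope, assuming 4KiB pages: a signed int counts at most
2^31 - 1 pfns, i.e. just under 2^31 * 4KiB = 8TiB of contiguous mapping,
and an unsigned int just under 2^32 * 4KiB = 16TiB, which is presumably
where the 16TiB figure above comes from.)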
LGTM and completely eliminates guest VM PCI initialization slowdowns
on H100 and A100.
Also not seeing any obvious regressions on my side.
Reported-by: "Mitchell Augustin" <mitchell.augustin@canonical.com>
Reviewed-by: "Mitchell Augustin" <mitchell.augustin@canonical.com>
Tested-by: "Mitchell Augustin" <mitchell.augustin@canonical.com>
--
Mitchell Augustin
Software Engineer - Ubuntu Partner Engineering