From: Ankit Agrawal <ankita@nvidia.com>
On Grace-based systems such as GB200, device memory is exposed as a
BAR, but the actual mappable size is not a power of 2. The previous
algorithm aligned each sparse mmap area based on its individual size
using ctz64(), which prevented efficient huge page usage by the
kernel.

Adjust the VFIO region mapping alignment to use the next power of 2
of the total region size, capped at 1GiB, and place the sparse
subregions at their appropriate offsets. This gives the mapping a
much better chance of huge alignment, allowing the kernel to use
larger page sizes for the VMA. That enables the use of PMD-level huge
pages, which can significantly improve memory access performance and
reduce TLB pressure for large device memory regions.
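
For illustration, the alignment calculation amounts to the standalone
snippet below. It is only a sketch, not the patched QEMU code;
pow2ceil_u64() is a local stand-in for QEMU's pow2ceil():

#include <inttypes.h>
#include <stdio.h>

#define GIB (1ULL << 30)

/* Round val up to the next power of two (stand-in for QEMU's pow2ceil()). */
static uint64_t pow2ceil_u64(uint64_t val)
{
    uint64_t p = 1;

    while (p < val) {
        p <<= 1;
    }
    return p;
}

int main(void)
{
    uint64_t region_size = 0x2F00F00000ULL;   /* size from the GB200 example */
    uint64_t align = pow2ceil_u64(region_size);

    if (align > GIB) {
        align = GIB;    /* cap at the largest huge pfnmap (PUD) size */
    }
    /* Prints: size 0x2f00f00000 -> alignment 0x40000000 */
    printf("size 0x%" PRIx64 " -> alignment 0x%" PRIx64 "\n", region_size, align);
    return 0;
}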

With this change (see the sketch after this list):
- Create a single aligned base mapping for the entire region
- Change the alignment to be based on pow2ceil(region->size), capped at 1GiB
- Unmap the gaps between sparse regions
- Use MAP_FIXED to overlay sparse mmap areas at their offsets
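
Put together, the mapping sequence looks roughly like the following
self-contained sketch. It is an illustration of the approach rather
than the QEMU code itself; struct sparse_area and map_region_sparse()
are made-up names, and cleanup of already-mapped areas on failure is
left to the caller:

#include <stdint.h>
#include <sys/types.h>
#include <sys/mman.h>

struct sparse_area {
    off_t  offset;   /* offset of the mappable area within the region */
    size_t size;
};

void *map_region_sparse(int fd, off_t fd_offset, size_t region_size,
                        size_t align, int prot,
                        const struct sparse_area *areas, int nr_areas)
{
    /* Reserve an oversized anonymous mapping so an aligned window exists. */
    char *base = mmap(NULL, region_size + align, PROT_NONE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    char *aligned;
    off_t next = 0;
    int i;

    if (base == MAP_FAILED) {
        return MAP_FAILED;
    }

    /* Trim head and tail so [aligned, aligned + region_size) is align-aligned. */
    aligned = (char *)(((uintptr_t)base + align - 1) & ~((uintptr_t)align - 1));
    munmap(base, aligned - base);
    munmap(aligned + region_size, align - (aligned - base));

    for (i = 0; i < nr_areas; i++) {
        /* Drop the gap before this sparse area, then overlay it with MAP_FIXED. */
        munmap(aligned + next, areas[i].offset - next);
        if (mmap(aligned + areas[i].offset, areas[i].size, prot,
                 MAP_SHARED | MAP_FIXED, fd,
                 fd_offset + areas[i].offset) == MAP_FAILED) {
            /* Drop the not-yet-mapped remainder; earlier areas stay mapped. */
            munmap(aligned + areas[i].offset, region_size - areas[i].offset);
            return MAP_FAILED;
        }
        next = areas[i].offset + areas[i].size;
    }

    /* Drop any tail of the region not covered by a sparse area. */
    if ((size_t)next < region_size) {
        munmap(aligned + next, region_size - next);
    }
    return aligned;
}

Reserving region_size + align up front guarantees that an aligned
window of the full region size exists somewhere inside the anonymous
reservation, so no retry loop is needed.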

Example VMA for device memory of size 0x2F00F00000 on GB200:

Before (misaligned, no hugepfnmap):
ff88ff000000-ffb7fff00000 rw-s 400000000000 00:06 727 /dev/vfio/devices/vfio1

After (aligned to 1GiB boundary, hugepfnmap enabled):
ff8ac0000000-ffb9c0f00000 rw-s 400000000000 00:06 727 /dev/vfio/devices/vfio1
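
Working through the numbers: pow2ceil(0x2F00F00000) is 0x4000000000,
which the cap reduces to 0x40000000 (1GiB). The new base
0xff8ac0000000 is an exact multiple of 0x40000000 (0x3FE2B *
0x40000000), while the old base 0xff88ff000000 is not.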

This requires the sparse regions to be sorted by offset (done in the
previous patch) so that the gaps between them can be correctly
identified and unmapped.
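
For reference, the required ordering could be produced by a comparator
along these lines (illustrative only, reusing the sparse_area struct
from the sketch above; the actual sorting lives in the previous patch):

#include <stdlib.h>

/* Order sparse mmap areas by ascending offset within the region. */
static int sparse_area_cmp(const void *a, const void *b)
{
    const struct sparse_area *sa = a, *sb = b;

    return (sa->offset > sb->offset) - (sa->offset < sb->offset);
}

/* Usage: qsort(areas, nr_areas, sizeof(*areas), sparse_area_cmp); */
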
cc: Alex Williamson <alex@shazbot.org>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
hw/vfio/region.c | 81 ++++++++++++++++++++++++++++++++----------------
1 file changed, 54 insertions(+), 27 deletions(-)
diff --git a/hw/vfio/region.c b/hw/vfio/region.c
index 7622ae5683..49f9a42e71 100644
--- a/hw/vfio/region.c
+++ b/hw/vfio/region.c
@@ -341,8 +341,11 @@ static bool vfio_region_create_dma_buf(VFIORegion *region, Error **errp)
int vfio_region_mmap(VFIORegion *region)
{
- int i, ret, prot = 0;
+ void *map_base, *map_align;
Error *local_err = NULL;
+ int i, ret, prot = 0;
+ off_t map_offset = 0;
+ size_t align;
char *name;
int fd;
@@ -353,41 +356,56 @@ int vfio_region_mmap(VFIORegion *region)
prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
- for (i = 0; i < region->nr_mmaps; i++) {
- size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB);
- void *map_base, *map_align;
+ /*
+ * Align the mmap for more efficient mapping in the kernel. Ideally
+ * we'd know the PMD and PUD mapping sizes to use as discrete alignment
+ * intervals, but we don't. As of Linux v6.19, the largest PUD size
+ * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set
+ * on x86_64).
+ *
+ * Align by the power-of-two of the size of the entire region, capped
+ * at 1GiB, and place the sparse subregions at their appropriate
+ * offsets. This gets maximum alignment.
+ *
+ * NB. qemu_memalign() and friends actually allocate memory, whereas
+ * the region size here can exceed host memory, therefore we manually
+ * create an oversized anonymous mapping and clean it up for alignment.
+ */
- /*
- * Align the mmap for more efficient mapping in the kernel. Ideally
- * we'd know the PMD and PUD mapping sizes to use as discrete alignment
- * intervals, but we don't. As of Linux v6.12, the largest PUD size
- * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set
- * on x86_64). Align by power-of-two size, capped at 1GiB.
- *
- * NB. qemu_memalign() and friends actually allocate memory, whereas
- * the region size here can exceed host memory, therefore we manually
- * create an oversized anonymous mapping and clean it up for alignment.
- */
- map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (map_base == MAP_FAILED) {
- ret = -errno;
- goto no_mmap;
- }
+ align = MIN(pow2ceil(region->size), 1 * GiB);
- fd = vfio_device_get_region_fd(region->vbasedev, region->nr);
+ map_base = mmap(0, region->size + align, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (map_base == MAP_FAILED) {
+ ret = -errno;
+ trace_vfio_region_mmap_fault(memory_region_name(region->mem), -1,
+ region->fd_offset,
+ region->fd_offset + region->size - 1, ret);
+ return ret;
+ }
+
+ fd = vfio_device_get_region_fd(region->vbasedev, region->nr);
- map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
- munmap(map_base, map_align - map_base);
- munmap(map_align + region->mmaps[i].size,
- align - (map_align - map_base));
+ map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
+ munmap(map_base, map_align - map_base);
+ munmap(map_align + region->size,
+ align - (map_align - map_base));
- region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot,
+ for (i = 0; i < region->nr_mmaps; i++) {
+ munmap(map_align + map_offset, region->mmaps[i].offset - map_offset);
+ region->mmaps[i].mmap = mmap(map_align + region->mmaps[i].offset,
+ region->mmaps[i].size, prot,
MAP_SHARED | MAP_FIXED, fd,
region->fd_offset +
region->mmaps[i].offset);
if (region->mmaps[i].mmap == MAP_FAILED) {
ret = -errno;
+ /*
+ * Only unmap the rest of the region. Any mmaps that were successful
+ * will be unmapped in no_mmap.
+ */
+ munmap(map_align + region->mmaps[i].offset,
+ region->size - region->mmaps[i].offset);
goto no_mmap;
}
@@ -405,6 +423,15 @@ int vfio_region_mmap(VFIORegion *region)
region->mmaps[i].offset,
region->mmaps[i].offset +
region->mmaps[i].size - 1);
+
+ map_offset = region->mmaps[i].offset + region->mmaps[i].size;
+ }
+
+ /*
+ * Unmap the rest of the region not covered by sparse mmap.
+ */
+ if (map_offset < region->size) {
+ munmap(map_align + map_offset, region->size - map_offset);
}
if (!vfio_region_create_dma_buf(region, &local_err)) {
--
2.34.1