The hugepage test cases of iommufd_dirty_tracking have 64MB and 128MB
coverages. Both of them are smaller than the default hugepage size of
512MB when CONFIG_PAGE_SIZE_64KB=y. However, these test cases have a
variant using huge pages, which mmap(MAP_HUGETLB)s these sizes that are
smaller than the system hugepage size. This results in the kernel
aligning the smaller size up to 512MB. If memory was located between
the upper 64/128MB size boundary and the 512MB hugepage boundary, it
would get wiped out:
https://lore.kernel.org/all/aEoUhPYIAizTLADq@nvidia.com/

Given that this align-up behavior is well documented, we have no choice
but to allocate a hugepage-aligned size to avoid this unintended wipe
out. Instead of relying on the kernel's internal force alignment, pass
the same aligned size to posix_memalign() and mmap().

On the other hand, the munmap() handler in the kernel doesn't align up,
so we have to manually fix the munmap length to prevent a size
mismatch.

Also, fix the FIXTURE_TEARDOWN(), as it misuses munmap() for the bitmap
allocated via posix_memalign() and forgets to free self->buffer.
Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP")
Cc: stable@vger.kernel.org
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
tools/testing/selftests/iommu/iommufd.c | 28 ++++++++++++++++++++-----
1 file changed, 23 insertions(+), 5 deletions(-)
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 1a8e85afe9aa..602f8540242b 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -7,6 +7,7 @@
 #include <sys/eventfd.h>
 
 #define __EXPORTED_HEADERS__
+#include <linux/const.h>
 #include <linux/vfio.h>
 
 #include "iommufd_utils.h"
@@ -2022,7 +2023,19 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
         self->fd = open("/dev/iommu", O_RDWR);
         ASSERT_NE(-1, self->fd);
 
-        rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
+        if (variant->hugepages) {
+                /*
+                 * Allocation must be aligned to the HUGEPAGE_SIZE, because the
+                 * following mmap() will automatically align the length to be a
+                 * multiple of the underlying huge page size. Failing to do the
+                 * same at this allocation will result in a memory overwrite by
+                 * the mmap().
+                 */
+                size = __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE);
+        } else {
+                size = variant->buffer_size;
+        }
+        rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, size);
         if (rc || !self->buffer) {
                 SKIP(return, "Skipping buffer_size=%lu due to errno=%d",
                      variant->buffer_size, rc);
@@ -2037,8 +2050,8 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
                 mmap_flags |= MAP_HUGETLB | MAP_POPULATE;
         }
         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
-        vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
-                   mmap_flags, -1, 0);
+        vrc = mmap(self->buffer, size, PROT_READ | PROT_WRITE, mmap_flags, -1,
+                   0);
         assert(vrc == self->buffer);
 
         self->page_size = MOCK_PAGE_SIZE;
@@ -2066,8 +2079,13 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
 
 FIXTURE_TEARDOWN(iommufd_dirty_tracking)
 {
-        munmap(self->buffer, variant->buffer_size);
-        munmap(self->bitmap, DIV_ROUND_UP(self->bitmap_size, BITS_PER_BYTE));
+        unsigned long size = variant->buffer_size;
+
+        if (variant->hugepages)
+                size = __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE);
+        munmap(self->buffer, size);
+        free(self->buffer);
+        free(self->bitmap);
         teardown_iommufd(self->fd, _metadata);
 }
--
2.43.0
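
For reference, the round-up that the commit message describes can be seen
with a tiny standalone program. This is an illustrative sketch, not part
of the patch: ALIGN_UP mirrors what __ALIGN_KERNEL() computes, and the
64MB/512MB values are the example numbers from the commit message.

#include <stdio.h>

/* Same rounding that __ALIGN_KERNEL() performs, spelled out locally */
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long buffer_size = 64UL << 20;   /* 64MB test coverage */
        unsigned long hugepage_sz = 512UL << 20;  /* 512MB hugepage with 64K pages */

        /* mmap(MAP_HUGETLB) with a 64MB length is backed by this much: */
        printf("requested %lu MB, kernel maps %lu MB\n",
               buffer_size >> 20, ALIGN_UP(buffer_size, hugepage_sz) >> 20);
        return 0;
}

With a 512MB huge page size the 64MB request is backed by a full 512MB
mapping, which is what clobbers anything placed in the remaining range up
to the hugepage boundary.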
On Sun, Jun 15, 2025 at 10:02:03PM -0700, Nicolin Chen wrote:
>  FIXTURE_TEARDOWN(iommufd_dirty_tracking)
>  {
> -        munmap(self->buffer, variant->buffer_size);
> -        munmap(self->bitmap, DIV_ROUND_UP(self->bitmap_size, BITS_PER_BYTE));
> +        unsigned long size = variant->buffer_size;
> +
> +        if (variant->hugepages)
> +                size = __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE);
> +        munmap(self->buffer, size);
> +        free(self->buffer);
> +        free(self->bitmap);
>          teardown_iommufd(self->fd, _metadata);

munmap followed by free isn't right..

This code is using the glibc allocator to get a bunch of pages mmap'd
to an aligned location then replacing the pages with MAP_SHARED and
maybe MAP_HUGETLB versions. The glibc allocator is fine to unmap them
via free.

I think something like this will do the job:

diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 1a8e85afe9aa51..e0f4f1541a1f4a 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -2008,6 +2008,7 @@ FIXTURE_VARIANT(iommufd_dirty_tracking)
 
 FIXTURE_SETUP(iommufd_dirty_tracking)
 {
+        size_t mmap_buffer_size;
         unsigned long size;
         int mmap_flags;
         void *vrc;
@@ -2022,12 +2023,7 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
         self->fd = open("/dev/iommu", O_RDWR);
         ASSERT_NE(-1, self->fd);
 
-        rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
-        if (rc || !self->buffer) {
-                SKIP(return, "Skipping buffer_size=%lu due to errno=%d",
-                     variant->buffer_size, rc);
-        }
-
+        mmap_buffer_size = variant->buffer_size;
         mmap_flags = MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED;
         if (variant->hugepages) {
                 /*
@@ -2035,9 +2031,25 @@
                  * not available.
                  */
                 mmap_flags |= MAP_HUGETLB | MAP_POPULATE;
+
+                /*
+                 * Allocation must be aligned to the HUGEPAGE_SIZE, because the
+                 * following mmap() will automatically align the length to be a
+                 * multiple of the underlying huge page size. Failing to do the
+                 * same at this allocation will result in a memory overwrite by
+                 * the mmap().
+                 */
+                if (mmap_buffer_size < HUGEPAGE_SIZE)
+                        mmap_buffer_size = HUGEPAGE_SIZE;
+        }
+
+        rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, mmap_buffer_size);
+        if (rc || !self->buffer) {
+                SKIP(return, "Skipping buffer_size=%lu due to errno=%d",
+                     mmap_buffer_size, rc);
         }
         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
-        vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
+        vrc = mmap(self->buffer, mmap_buffer_size, PROT_READ | PROT_WRITE,
                    mmap_flags, -1, 0);
         assert(vrc == self->buffer);
 
@@ -2066,8 +2078,8 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
 
 FIXTURE_TEARDOWN(iommufd_dirty_tracking)
 {
-        munmap(self->buffer, variant->buffer_size);
-        munmap(self->bitmap, DIV_ROUND_UP(self->bitmap_size, BITS_PER_BYTE));
+        free(self->buffer);
+        free(self->bitmap);
         teardown_iommufd(self->fd, _metadata);
 }
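
The setup/teardown pattern described above can be illustrated in
isolation. This is a minimal sketch under stated assumptions (the glibc
allocator and a 2MB huge page size standing in for the selftest's
HUGEPAGE_SIZE); it is not the selftest code itself:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
        size_t hugepage_size = 2UL << 20;       /* assumed huge page size */
        size_t size = hugepage_size;            /* already hugepage-aligned */
        void *buffer, *vrc;

        /* Step 1: let the allocator own an aligned address range */
        if (posix_memalign(&buffer, hugepage_size, size))
                return 1;
        assert((uintptr_t)buffer % hugepage_size == 0);

        /* Step 2: replace its pages in place with the mapping we want */
        vrc = mmap(buffer, size, PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
        assert(vrc == buffer);

        /* ... use buffer ... */

        /* Step 3: no munmap(); the allocator still owns the range */
        free(buffer);
        return 0;
}

The point is that posix_memalign() is only used to obtain an aligned
range that the allocator owns; the MAP_FIXED mmap() swaps the backing
pages for MAP_SHARED (and, in the selftest, possibly MAP_HUGETLB) ones
without touching the allocator's bookkeeping, so a plain free() is the
matching teardown.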
On Mon, Jun 16, 2025 at 01:25:01PM -0300, Jason Gunthorpe wrote:
> On Sun, Jun 15, 2025 at 10:02:03PM -0700, Nicolin Chen wrote:
> >  FIXTURE_TEARDOWN(iommufd_dirty_tracking)
> >  {
> > -        munmap(self->buffer, variant->buffer_size);
> > -        munmap(self->bitmap, DIV_ROUND_UP(self->bitmap_size, BITS_PER_BYTE));
> > +        unsigned long size = variant->buffer_size;
> > +
> > +        if (variant->hugepages)
> > +                size = __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE);
> > +        munmap(self->buffer, size);
> > +        free(self->buffer);
> > +        free(self->bitmap);
> >          teardown_iommufd(self->fd, _metadata);
>
> munmap followed by free isn't right..

You are right. I re-checked with Copilot. It says the same thing.
I think the whole posix_memalign() + mmap() confuses me..

Yet, should the bitmap pair with free() since it's allocated by a
posix_memalign() call?

> This code is using the glibc allocator to get a bunch of pages mmap'd
> to an aligned location then replacing the pages with MAP_SHARED and
> maybe MAP_HUGETLB versions.

And I studied some use cases from Copilot. It says that, to use
the combination of posix_memalign+mmap, we should do:

aligned_ptr = posix_memalign(pagesize, pagesize);
munmap(aligned_ptr, pagesize);
mapped = mmap(aligned_ptr, pagesize, PROT_READ | PROT_WRITE,
              MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
munmap(mapped, pagesize);
// No free() after munmap().

---breakdown---
Before `posix_memalign()`:
[ heap memory unused ]

After `posix_memalign()`:
[ posix_memalign() memory ]   ← managed by malloc/free
↑ aligned_ptr

After `munmap(aligned_ptr)`:
[ unmapped memory ]           ← allocator no longer owns it

After `mmap(aligned_ptr, ..., MAP_FIXED)`:
[ anonymous mmap region ]     ← fully remapped, under your control
↑ mapped
---end---

It points out that the heap bookkeeping will be silently clobbered
without the munmap() in-between (like we are doing):

---breakdown---
After `posix_memalign()`:
[ posix_memalign() memory ]   ← malloc thinks it owns this

Then `mmap(aligned_ptr, ..., MAP_FIXED)`:
[ anonymous mmap region ]     ← malloc still thinks it owns this (!)
↑ mapped
---end---

It also gives a simpler solution for memory that is not huge
page backed but huge page aligned (our !variant->hugepage case):

---code---
void *ptr;
size_t alignment = 2 * 1024 * 1024;  // or whatever HUGEPAGE_SIZE was
size_t size = variant->buffer_size;

// Step 1: Use posix_memalign to get an aligned pointer
if (posix_memalign(&ptr, alignment, size) != 0) {
    perror("posix_memalign");
    return -1;
}

// Use the memory directly
self->buffer = ptr;

// Access/manipulate the memory as needed...

// Step 2: Clean up when done
free(self->buffer);
---end---

Also, for a huge page case, there is no need for posix_memalign():
"Hugepages are not part of the standard heap, so allocator functions
like posix_memalign() or malloc() don't help and can even get in the
way."

Instead, it suggests a cleaner version without posix_memalign():
---code---
void *addr = mmap(NULL, variant->buffer_size, PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE,
                  -1, 0);
if (addr == MAP_FAILED) { perror("mmap"); return -1; }
---end---

Should we follow?

Thanks
Nicolin
On Mon, Jun 16, 2025 at 07:02:08PM -0700, Nicolin Chen wrote:
> On Mon, Jun 16, 2025 at 01:25:01PM -0300, Jason Gunthorpe wrote:
> > On Sun, Jun 15, 2025 at 10:02:03PM -0700, Nicolin Chen wrote:
> > >  FIXTURE_TEARDOWN(iommufd_dirty_tracking)
> > >  {
> > > -        munmap(self->buffer, variant->buffer_size);
> > > -        munmap(self->bitmap, DIV_ROUND_UP(self->bitmap_size, BITS_PER_BYTE));
> > > +        unsigned long size = variant->buffer_size;
> > > +
> > > +        if (variant->hugepages)
> > > +                size = __ALIGN_KERNEL(variant->buffer_size, HUGEPAGE_SIZE);
> > > +        munmap(self->buffer, size);
> > > +        free(self->buffer);
> > > +        free(self->bitmap);
> > >          teardown_iommufd(self->fd, _metadata);
> >
> > munmap followed by free isn't right..
>
> You are right. I re-checked with Copilot. It says the same thing.
> I think the whole posix_memalign() + mmap() confuses me..
>
> Yet, should the bitmap pair with free() since it's allocated by a
> posix_memalign() call?
>
> > This code is using the glibc allocator to get a bunch of pages mmap'd
> > to an aligned location then replacing the pages with MAP_SHARED and
> > maybe MAP_HUGETLB versions.
>
> And I studied some use cases from Copilot. It says that, to use
> the combination of posix_memalign+mmap, we should do:
>
> aligned_ptr = posix_memalign(pagesize, pagesize);
> munmap(aligned_ptr, pagesize);
> mapped = mmap(aligned_ptr, pagesize, PROT_READ | PROT_WRITE,
>               MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
> munmap(mapped, pagesize);
> // No free() after munmap().
>
> ---breakdown---
> Before `posix_memalign()`:
> [ heap memory unused ]
>
> After `posix_memalign()`:
> [ posix_memalign() memory ]   ← managed by malloc/free
> ↑ aligned_ptr
>
> After `munmap(aligned_ptr)`:
> [ unmapped memory ]           ← allocator no longer owns it

Incorrect. The allocator has no idea about the munmap and munmap
doesn't disturb any of the allocator tracking structures.

> After `mmap(aligned_ptr, ..., MAP_FIXED)`:
> [ anonymous mmap region ]     ← fully remapped, under your control
> ↑ mapped
> ---end---

No, this is wrong.

> It points out that the heap bookkeeping will be silently clobbered
> without the munmap() in-between (like we are doing):

Nope, doesn't work like that.

> ---breakdown---
> After `posix_memalign()`:
> [ posix_memalign() memory ]   ← malloc thinks it owns this
>
> Then `mmap(aligned_ptr, ..., MAP_FIXED)`:
> [ anonymous mmap region ]     ← malloc still thinks it owns this (!)
> ↑ mapped
> ---end---

Yes, this is correct and what we are doing here. The allocator always
owns it and we are just replacing the memory with a different mmap.

> It also gives a simpler solution for memory that is not huge
> page backed but huge page aligned (our !variant->hugepage case):
>
> ---code---
> void *ptr;
> size_t alignment = 2 * 1024 * 1024;  // or whatever HUGEPAGE_SIZE was
> size_t size = variant->buffer_size;
>
> // Step 1: Use posix_memalign to get an aligned pointer
> if (posix_memalign(&ptr, alignment, size) != 0) {
>     perror("posix_memalign");
>     return -1;
> }

Also no, the main point of this is to inject MAP_SHARED which
posix_memalign cannot do.

> Also, for a huge page case, there is no need for posix_memalign():
> "Hugepages are not part of the standard heap, so allocator functions
> like posix_memalign() or malloc() don't help and can even get in the
> way."
>
> Instead, it suggests a cleaner version without posix_memalign():
> ---code---
> void *addr = mmap(NULL, variant->buffer_size, PROT_READ | PROT_WRITE,
>                   MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE,
>                   -1, 0);
> if (addr == MAP_FAILED) { perror("mmap"); return -1; }
> ---end---

Yes, we could do this only for MAP_HUGETLB, but it doesn't help the
normal case with MAP_SHARED.

So I would leave it alone, use the version I showed.

Jason
On Tue, Jun 17, 2025 at 08:59:48AM -0300, Jason Gunthorpe wrote:
> On Mon, Jun 16, 2025 at 07:02:08PM -0700, Nicolin Chen wrote:
> > ---breakdown---
> > After `posix_memalign()`:
> > [ posix_memalign() memory ]   ← malloc thinks it owns this
> >
> > Then `mmap(aligned_ptr, ..., MAP_FIXED)`:
> > [ anonymous mmap region ]     ← malloc still thinks it owns this (!)
> > ↑ mapped
> > ---end---
>
> Yes, this is correct and what we are doing here. The allocator always
> owns it and we are just replacing the memory with a different mmap.

Hmm, if the allocator always owns it, does that mean the munmap() [3]
will release what [1] and [2] do (allocating and replacing)?

[1] posix_memalign()
[2] mmap()
[3] munmap()

> > // Step 1: Use posix_memalign to get an aligned pointer
> > if (posix_memalign(&ptr, alignment, size) != 0) {
> >     perror("posix_memalign");
> >     return -1;
> > }
>
> Also no, the main point of this is to inject MAP_SHARED which
> posix_memalign cannot do.

I see.

> > Instead, it suggests a cleaner version without posix_memalign():
> > ---code---
> > void *addr = mmap(NULL, variant->buffer_size, PROT_READ | PROT_WRITE,
> >                   MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE,
> >                   -1, 0);
> > if (addr == MAP_FAILED) { perror("mmap"); return -1; }
> > ---end---
>
> Yes, we could do this only for MAP_HUGETLB, but it doesn't help the
> normal case with MAP_SHARED.
>
> So I would leave it alone, use the version I showed.

OK. Will respin a v2 with that.

Thanks
Nicolin
On Tue, Jun 17, 2025 at 02:23:41PM -0700, Nicolin Chen wrote:
> On Tue, Jun 17, 2025 at 08:59:48AM -0300, Jason Gunthorpe wrote:
> > On Mon, Jun 16, 2025 at 07:02:08PM -0700, Nicolin Chen wrote:
> > > ---breakdown---
> > > After `posix_memalign()`:
> > > [ posix_memalign() memory ]   ← malloc thinks it owns this
> > >
> > > Then `mmap(aligned_ptr, ..., MAP_FIXED)`:
> > > [ anonymous mmap region ]     ← malloc still thinks it owns this (!)
> > > ↑ mapped
> > > ---end---
> >
> > Yes, this is correct and what we are doing here. The allocator always
> > owns it and we are just replacing the memory with a different mmap.
>
> Hmm, if the allocator always owns it, does that mean the munmap() [3]
> will release what [1] and [2] do (allocating and replacing)?

No, munmap doesn't destroy the allocator meta data.

Jason
On Tue, Jun 17, 2025 at 08:01:36PM -0300, Jason Gunthorpe wrote:
> On Tue, Jun 17, 2025 at 02:23:41PM -0700, Nicolin Chen wrote:
> > On Tue, Jun 17, 2025 at 08:59:48AM -0300, Jason Gunthorpe wrote:
> > > On Mon, Jun 16, 2025 at 07:02:08PM -0700, Nicolin Chen wrote:
> > > > ---breakdown---
> > > > After `posix_memalign()`:
> > > > [ posix_memalign() memory ]   ← malloc thinks it owns this
> > > >
> > > > Then `mmap(aligned_ptr, ..., MAP_FIXED)`:
> > > > [ anonymous mmap region ]     ← malloc still thinks it owns this (!)
> > > > ↑ mapped
> > > > ---end---
> > >
> > > Yes, this is correct and what we are doing here. The allocator always
> > > owns it and we are just replacing the memory with a different mmap.
> >
> > Hmm, if the allocator always owns it, does that mean the munmap() [3]
> > will release what [1] and [2] do (allocating and replacing)?
>
> No, munmap doesn't destroy the allocator meta data.

Should we do something to that meta data?
On Tue, Jun 17, 2025 at 04:46:57PM -0700, Nicolin Chen wrote:
> On Tue, Jun 17, 2025 at 08:01:36PM -0300, Jason Gunthorpe wrote:
> > On Tue, Jun 17, 2025 at 02:23:41PM -0700, Nicolin Chen wrote:
> > > On Tue, Jun 17, 2025 at 08:59:48AM -0300, Jason Gunthorpe wrote:
> > > > On Mon, Jun 16, 2025 at 07:02:08PM -0700, Nicolin Chen wrote:
> > > > > ---breakdown---
> > > > > After `posix_memalign()`:
> > > > > [ posix_memalign() memory ]   ← malloc thinks it owns this
> > > > >
> > > > > Then `mmap(aligned_ptr, ..., MAP_FIXED)`:
> > > > > [ anonymous mmap region ]     ← malloc still thinks it owns this (!)
> > > > > ↑ mapped
> > > > > ---end---
> > > >
> > > > Yes, this is correct and what we are doing here. The allocator always
> > > > owns it and we are just replacing the memory with a different mmap.
> > >
> > > Hmm, if the allocator always owns it, does that mean the munmap() [3]
> > > will release what [1] and [2] do (allocating and replacing)?
> >
> > No, munmap doesn't destroy the allocator meta data.
>
> Should we do something to that meta data?

My patch calls free on it which removes the metadata and might munmap
the range (or re-use it, but that doesn't matter)

Jason
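
The last point can also be checked in isolation with a glibc-specific
sketch (hypothetical demo, not selftest code): malloc_usable_size() reads
the very bookkeeping being discussed, which lives just below the pointer
returned by posix_memalign() and is untouched by the MAP_FIXED
replacement.

#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
        size_t align = 2UL << 20;       /* assumed huge page size */
        void *buf;

        if (posix_memalign(&buf, align, align))
                return 1;

        /* Replace the allocator's pages, as the selftest setup does */
        if (mmap(buf, align, PROT_READ | PROT_WRITE,
                 MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0) != buf)
                return 1;

        /* The chunk bookkeeping below 'buf' survived the remap */
        printf("usable size: %zu\n", malloc_usable_size(buf));

        /* free() is what finally drops it (and may munmap the range) */
        free(buf);
        return 0;
}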