Add one new file operation, get_mapping_order(). It can be used by file
backends to report mapping order hints.
By default, Linux assumes we will map in PAGE_SIZE chunks. With this hint,
the driver can report the possibility of mapping chunks that are larger
than PAGE_SIZE. Then, the VA allocator will try to use that as alignment
when allocating the VA ranges.
This is useful because when chunks to be mapped are larger than PAGE_SIZE,
VA alignment matters and it needs to be aligned with the size of the chunk
to be mapped.
That said, no matter what alignment is used for the VA allocation, the
driver can still decide what size to use when mapping the chunks. It is
also not an issue if it keeps mapping in PAGE_SIZE.
get_mapping_order() is defined to take three parameters. The 1st
parameter is the file object pointer; the 2nd and 3rd parameters are
the pgoff and size of the mmap() request. Its return value is the
order, which must be non-negative to enable the alignment. When zero is
returned, it behaves as if the hint were not provided, IOW, the
alignment will still be PAGE_SIZE.
When the order is too big, ignore the hint. Normally drivers are trusted,
so it's more of an extra layer of safety measure.
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
Documentation/filesystems/vfs.rst | 4 +++
include/linux/fs.h | 1 +
mm/mmap.c | 59 +++++++++++++++++++++++++++----
3 files changed, 57 insertions(+), 7 deletions(-)
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 4f13b01e42eb5..b707ddbebbf52 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -1069,6 +1069,7 @@ This describes how the VFS can manipulate an open file. As of kernel
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ int (*get_mapping_order)(struct file *, unsigned long, size_t);
int (*check_flags)(int);
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
@@ -1165,6 +1166,9 @@ otherwise noted.
``get_unmapped_area``
called by the mmap(2) system call
+``get_mapping_order``
+ called by the mmap(2) system call to get mapping order hint
+
``check_flags``
called by the fcntl(2) system call for F_SETFL command
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dd3b57cfadeeb..5ba373576bfe5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2287,6 +2287,7 @@ struct file_operations {
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ int (*get_mapping_order)(struct file *file, unsigned long pgoff, size_t len);
int (*check_flags)(int);
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
diff --git a/mm/mmap.c b/mm/mmap.c
index 8fa397a18252e..be3dd0623f00c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -808,6 +808,33 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi
return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
}
+static inline bool file_has_mmap_order_hint(struct file *file)
+{
+ return file && file->f_op && file->f_op->get_mapping_order;
+}
+
+static inline bool
+mmap_should_align(struct file *file, unsigned long addr, unsigned long len)
+{
+ /* When THP not enabled at all, skip */
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return false;
+
+ /* Never try any alignment if the mmap() address hint is provided */
+ if (addr)
+ return false;
+
+ /* Anonymous THP could use some better alignment when len aligned */
+ if (!file)
+ return IS_ALIGNED(len, PMD_SIZE);
+
+ /*
+ * It's a file mapping, no address hint provided by caller, try any
+ * alignment if the file backend would provide a hint
+ */
+ return file_has_mmap_order_hint(file);
+}
+
unsigned long
__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
@@ -815,8 +842,9 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long (*get_area)(struct file *, unsigned long,
unsigned long, unsigned long, unsigned long)
= NULL;
-
unsigned long error = arch_mmap_check(addr, len, flags);
+ unsigned long align;
+
if (error)
return error;
@@ -841,13 +869,30 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
if (get_area) {
addr = get_area(file, addr, len, pgoff, flags);
- } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file
- && !addr /* no hint */
- && IS_ALIGNED(len, PMD_SIZE)) {
- /* Ensures that larger anonymous mappings are THP aligned. */
+ } else if (mmap_should_align(file, addr, len)) {
+ if (file_has_mmap_order_hint(file)) {
+ int order;
+ /*
+ * Allow driver to opt-in on the order hint.
+ *
+ * Sanity check on the order returned. Treating
+ * either negative or too big order to be invalid,
+ * where alignment will be skipped.
+ */
+ order = file->f_op->get_mapping_order(file, pgoff, len);
+ if (order < 0)
+ order = 0;
+ if (check_shl_overflow(PAGE_SIZE, order, &align))
+ /* No alignment applied */
+ align = PAGE_SIZE;
+ } else {
+ /* Default alignment for anonymous THPs */
+ align = PMD_SIZE;
+ }
+
addr = thp_get_unmapped_area_vmflags(file, addr, len,
- pgoff, flags, PMD_SIZE,
- vm_flags);
+ pgoff, flags,
+ align, vm_flags);
} else {
addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
pgoff, flags, vm_flags);
--
2.50.1
On Thu, Dec 04, 2025 at 10:10:01AM -0500, Peter Xu wrote: > Add one new file operation, get_mapping_order(). It can be used by file > backends to report mapping order hints. > > By default, Linux assumed we will map in PAGE_SIZE chunks. With this hint, > the driver can report the possibility of mapping chunks that are larger > than PAGE_SIZE. Then, the VA allocator will try to use that as alignment > when allocating the VA ranges. > > This is useful because when chunks to be mapped are larger than PAGE_SIZE, > VA alignment matters and it needs to be aligned with the size of the chunk > to be mapped. > > Said that, no matter what is the alignment used for the VA allocation, the > driver can still decide which size to map the chunks. It is also not an > issue if it keeps mapping in PAGE_SIZE. > > get_mapping_order() is defined to take three parameters. Besides the 1st > parameter which will be the file object pointer, the 2nd + 3rd parameters > being the pgoff + size of the mmap() request. Its retval is defined as the > order, which must be non-negative to enable the alignment. When zero is > returned, it should behave like when the hint is not provided, IOW, > alignment will still be PAGE_SIZE. This should explain how it works when the incoming pgoff is not aligned.. I think for dpdk we want to support mapping around the MSI hole so something like pgoff 0 -> 2M skip 4k 2m + 4k -> 64M Should setup the last VMA to align to 2M + 4k so the first PMD is fragmented to 4k pages but the remaning part is 2M sized or better. We just noticed a bug very similer to this in qemu around it's manual alignment scheme where it would de-align things around the MSI window and spoil the PMDs. I guess ideally the file could return the order assuming an aligned-to-start pgoff and the core code could use that order to compute an adjustment for the actual pgoff so we maintain: va % order = pgoff % order Jason
On Sun, Dec 07, 2025 at 12:21:32PM -0400, Jason Gunthorpe wrote: > On Thu, Dec 04, 2025 at 10:10:01AM -0500, Peter Xu wrote: > > Add one new file operation, get_mapping_order(). It can be used by file > > backends to report mapping order hints. > > > > By default, Linux assumed we will map in PAGE_SIZE chunks. With this hint, > > the driver can report the possibility of mapping chunks that are larger > > than PAGE_SIZE. Then, the VA allocator will try to use that as alignment > > when allocating the VA ranges. > > > > This is useful because when chunks to be mapped are larger than PAGE_SIZE, > > VA alignment matters and it needs to be aligned with the size of the chunk > > to be mapped. > > > > Said that, no matter what is the alignment used for the VA allocation, the > > driver can still decide which size to map the chunks. It is also not an > > issue if it keeps mapping in PAGE_SIZE. > > > > get_mapping_order() is defined to take three parameters. Besides the 1st > > parameter which will be the file object pointer, the 2nd + 3rd parameters > > being the pgoff + size of the mmap() request. Its retval is defined as the > > order, which must be non-negative to enable the alignment. When zero is > > returned, it should behave like when the hint is not provided, IOW, > > alignment will still be PAGE_SIZE. > > This should explain how it works when the incoming pgoff is not > aligned.. Hmm, I thought the charm of this new proposal (based on suggestions of your v1 reviews) is to not need to worry on this.. Or maybe you meant I should add some doc comments in the commit message? If so I can do that. thp_get_unmapped_area_vmflags() should have taken all kinds of pgoff unalignment into account. It's just that this v2 is better than v1 when using this new API because that THP function doesn't need to be exported anymore. 
> > I think for dpdk we want to support mapping around the MSI hole so > something like > > pgoff 0 -> 2M > skip 4k > 2m + 4k -> 64M > > Should setup the last VMA to align to 2M + 4k so the first PMD is > fragmented to 4k pages but the remaning part is 2M sized or better. > > We just noticed a bug very similer to this in qemu around it's manual > alignment scheme where it would de-align things around the MSI window > and spoil the PMDs. Right, IIUC this series should work all fine exactly as you said. Here the driver should only care about what owns the content of (pgoff, len) range, and the proper order to map these chunks. In case of VFIO, it will know what BAR it's mapping, so as to return a proper order for that specific bar pointed by (pgoff, len). The driver doesn't need to worry on anything else like above. Let me know if I misread your question, or if this series doesn't achieve what you're asking here.. Thanks, > > I guess ideally the file could return the order assuming an aligned-to-start > pgoff and the core code could use that order to compute an adjustment > for > the actual pgoff so we maintain: > va % order = pgoff % order > > Jason > -- Peter Xu
On Wed, Dec 10, 2025 at 03:23:02PM -0500, Peter Xu wrote: > On Sun, Dec 07, 2025 at 12:21:32PM -0400, Jason Gunthorpe wrote: > > On Thu, Dec 04, 2025 at 10:10:01AM -0500, Peter Xu wrote: > > > Add one new file operation, get_mapping_order(). It can be used by file > > > backends to report mapping order hints. > > > > > > By default, Linux assumed we will map in PAGE_SIZE chunks. With this hint, > > > the driver can report the possibility of mapping chunks that are larger > > > than PAGE_SIZE. Then, the VA allocator will try to use that as alignment > > > when allocating the VA ranges. > > > > > > This is useful because when chunks to be mapped are larger than PAGE_SIZE, > > > VA alignment matters and it needs to be aligned with the size of the chunk > > > to be mapped. > > > > > > Said that, no matter what is the alignment used for the VA allocation, the > > > driver can still decide which size to map the chunks. It is also not an > > > issue if it keeps mapping in PAGE_SIZE. > > > > > > get_mapping_order() is defined to take three parameters. Besides the 1st > > > parameter which will be the file object pointer, the 2nd + 3rd parameters > > > being the pgoff + size of the mmap() request. Its retval is defined as the > > > order, which must be non-negative to enable the alignment. When zero is > > > returned, it should behave like when the hint is not provided, IOW, > > > alignment will still be PAGE_SIZE. > > > > This should explain how it works when the incoming pgoff is not > > aligned.. > > Hmm, I thought the charm of this new proposal (based on suggestions of your > v1 reviews) is to not need to worry on this.. Or maybe you meant I should > add some doc comments in the commit message? It can't be ignored, I don't think I ever said that. I said the driver shouldn't have to worry about it, the core MM should deal with this. 
> > I think for dpdk we want to support mapping around the MSI hole so > > something like > > > > pgoff 0 -> 2M > > skip 4k > > 2m + 4k -> 64M > > > > Should setup the last VMA to align to 2M + 4k so the first PMD is > > fragmented to 4k pages but the remaning part is 2M sized or better. > > > > We just noticed a bug very similer to this in qemu around it's manual > > alignment scheme where it would de-align things around the MSI window > > and spoil the PMDs. > > Right, IIUC this series should work all fine exactly as you said. Are you sure? I did not see code doing this. The second mapping needs to select a VA such that VA % 2M == 4k And I don't see it doing that. Jason
On Tue, Dec 16, 2025 at 10:44:27AM -0400, Jason Gunthorpe wrote: > On Wed, Dec 10, 2025 at 03:23:02PM -0500, Peter Xu wrote: > > On Sun, Dec 07, 2025 at 12:21:32PM -0400, Jason Gunthorpe wrote: > > > On Thu, Dec 04, 2025 at 10:10:01AM -0500, Peter Xu wrote: > > > > Add one new file operation, get_mapping_order(). It can be used by file > > > > backends to report mapping order hints. > > > > > > > > By default, Linux assumed we will map in PAGE_SIZE chunks. With this hint, > > > > the driver can report the possibility of mapping chunks that are larger > > > > than PAGE_SIZE. Then, the VA allocator will try to use that as alignment > > > > when allocating the VA ranges. > > > > > > > > This is useful because when chunks to be mapped are larger than PAGE_SIZE, > > > > VA alignment matters and it needs to be aligned with the size of the chunk > > > > to be mapped. > > > > > > > > Said that, no matter what is the alignment used for the VA allocation, the > > > > driver can still decide which size to map the chunks. It is also not an > > > > issue if it keeps mapping in PAGE_SIZE. > > > > > > > > get_mapping_order() is defined to take three parameters. Besides the 1st > > > > parameter which will be the file object pointer, the 2nd + 3rd parameters > > > > being the pgoff + size of the mmap() request. Its retval is defined as the > > > > order, which must be non-negative to enable the alignment. When zero is > > > > returned, it should behave like when the hint is not provided, IOW, > > > > alignment will still be PAGE_SIZE. > > > > > > This should explain how it works when the incoming pgoff is not > > > aligned.. > > > > Hmm, I thought the charm of this new proposal (based on suggestions of your > > v1 reviews) is to not need to worry on this.. Or maybe you meant I should > > add some doc comments in the commit message? > > It can't be ignored, I don't think I ever said that. I said the driver > shouldn't have to worry about it, the core MM should deal with this. 
> > > > I think for dpdk we want to support mapping around the MSI hole so > > > something like > > > > > > pgoff 0 -> 2M > > > skip 4k > > > 2m + 4k -> 64M > > > > > > Should setup the last VMA to align to 2M + 4k so the first PMD is > > > fragmented to 4k pages but the remaning part is 2M sized or better. > > > > > > We just noticed a bug very similer to this in qemu around it's manual > > > alignment scheme where it would de-align things around the MSI window > > > and spoil the PMDs. > > > > Right, IIUC this series should work all fine exactly as you said. > > Are you sure? I did not see code doing this. The second mapping needs > to select a VA such that > > VA % 2M == 4k > > And I don't see it doing that. I have an old program tested this, I ran it but I didn't mention it in the cover letter. I'm 99% sure it works like it, unless I'm seriously wrong somewhere. See: https://github.com/xzpeter/clibs/blob/master/misc/vfio-pci-nofix.c mmap BAR with memory ENABLED and read (offset=0x0, size=0x8000000) mmap()=0x7f4395a00000 - 0.000117s read(32768) - 0.085376s mmap BAR with memory ENABLED and read (offset=0x1000, size=0x7fff000) mmap()=0x7f4395a01000 - 0.000012s read(32767) - 0.088642s mmap BAR with memory ENABLED and read (offset=0x0, size=0x7fff000) mmap()=0x7f4395a00000 - 0.000015s read(32767) - 0.093850s mmap BAR with memory ENABLED and read (offset=0x1000, size=0x7ffe000) mmap()=0x7f4395a01000 - 0.000011s read(32766) - 0.093248s Also see __thp_get_unmapped_area() processed such pgoff, it allocates VA with len_pad (not len), and pad the retval at last. Please let me know if it didn't work like it, then it might be a bug. Thanks, -- Peter Xu
On Tue, Dec 16, 2025 at 10:42:39AM -0500, Peter Xu wrote: > Also see __thp_get_unmapped_area() processed such pgoff, it allocates VA > with len_pad (not len), and pad the retval at last. > > Please let me know if it didn't work like it, then it might be a bug. It should all be documented then in the kdoc for the new ops, in this kind of language that the resulting VA flows from pgoff Jason
On Tue, Dec 16, 2025 at 01:19:44PM -0400, Jason Gunthorpe wrote: > On Tue, Dec 16, 2025 at 10:42:39AM -0500, Peter Xu wrote: > > Also see __thp_get_unmapped_area() processed such pgoff, it allocates VA > > with len_pad (not len), and pad the retval at last. > > > > Please let me know if it didn't work like it, then it might be a bug. > > It should all be documented then in the kdoc for the new ops, in this > kind of language that the resulting VA flows from pgoff IMHO that's one of the major benefits of this API, so that there's no need to mention impl details like this. I thought that's also what you wanted as well.. as you're further suggesting to offload order adjustments to core mm, which I tend to agree. Here the point is, the driver should only care about the size of mapping, nothing else like how exactly the alignments will be calculated, and how that interacts with pgoff. The kernel mm manages that. It's done exactly like what anon thp does already when len is pmd aligned. Or maybe I misunderstood what you're suggesting to document? If so, please let me know; some example would be greatly helpful. Thanks, -- Peter Xu
On Tue, Dec 16, 2025 at 12:36:13PM -0500, Peter Xu wrote: > On Tue, Dec 16, 2025 at 01:19:44PM -0400, Jason Gunthorpe wrote: > > On Tue, Dec 16, 2025 at 10:42:39AM -0500, Peter Xu wrote: > > > Also see __thp_get_unmapped_area() processed such pgoff, it allocates VA > > > with len_pad (not len), and pad the retval at last. > > > > > > Please let me know if it didn't work like it, then it might be a bug. > > > > It should all be documented then in the kdoc for the new ops, in this > > kind of language that the resulting VA flows from pgoff > > IMHO that's one of the major benefits of this API, so that there's no need > to mention impl details like this. It needs to be clearly explained exactly how pgoff and the returned order are related because it impacts how the drivers need to manage their pgoff space. > Here the point is, the driver should only care about the size of mapping, > nothing else like how exactly the alignments will be calculated, and how > that interacts with pgoff. The kernel mm manages that. It's done exactly > like what anon thp does already when len is pmd aligned. The driver owns the pgoff number space, it has to care about how that relates to the PTEs. > Or maybe I misunderstood what you're suggesting to document? If so, please > let me know; some example would be greatly helpful. Just document the 'VA % order = pgoff % order' equation in the kdoc for the new op. Jason
On Tue, Dec 16, 2025 at 02:58:50PM -0400, Jason Gunthorpe wrote:
> On Tue, Dec 16, 2025 at 12:36:13PM -0500, Peter Xu wrote:
> > On Tue, Dec 16, 2025 at 01:19:44PM -0400, Jason Gunthorpe wrote:
> > > On Tue, Dec 16, 2025 at 10:42:39AM -0500, Peter Xu wrote:
> > > > Also see __thp_get_unmapped_area() processed such pgoff, it allocates VA
> > > > with len_pad (not len), and pad the retval at last.
> > > >
> > > > Please let me know if it didn't work like it, then it might be a bug.
> > >
> > > It should all be documented then in the kdoc for the new ops, in this
> > > kind of language that the resulting VA flows from pgoff
> >
> > IMHO that's one of the major benefits of this API, so that there's no need
> > to mention impl details like this.
>
> It needs to be clearly explained exactly how pgoff and the returned
> order are related because it impacts how the drivers need to manage
> their pgoff space.
Here "pgoff" plays two roles:
(1) as a range, (pgoff, len) on top of the fd, decides which device blob
to be mapped. This is relevant to the driver, for sure..
(2) as an offset, pgoff is relevant when we want to make sure the mmap()
request's VA will be aligned in a way that lets us maximize huge
mappings. This has, IMHO, nothing to do with the driver, and that's
what I want the new API to keep transparent to the driver.
I agree drivers need to know pgoff for (1) in terms of get_mapping_order(),
not (2).
>
> > Here the point is, the driver should only care about the size of mapping,
> > nothing else like how exactly the alignments will be calculated, and how
> > that interacts with pgoff. The kernel mm manages that. It's done exactly
> > like what anon thp does already when len is pmd aligned.
>
> The driver owns the pgoff number space, it has to care about how that
> relates to the PTEs.
>
> > Or maybe I misunderstood what you're suggesting to document? If so, please
> > let me know; some example would be greatly helpful.
>
> Just document the 'VA % order = pgoff % order' equation in the kdoc
> for the new op.
When it's "related to PTEs", it's talking about (2) above, so that's really
what I want to avoid mentioning.
Documenting anything about the VA is just confusing on its own, especially when
"int get_mapping_order(fd, pgoff, len)" doesn't even have anything in its params
or retval that is relevant to the virtual address space.
If you think missing such info is harder for reviews, I can definitely add
a rich comment when repost explaining how __thp_get_unmapped_area() works
here.
We can also pause this a bit and wait for Matthew's review of the API,
where he raised concerns. If there's a major reason for this API to be
rejected, we don't need to bother with this level of detail either.
Thanks,
--
Peter Xu
On Tue, Dec 16, 2025 at 02:44:29PM -0500, Peter Xu wrote: > > > Or maybe I misunderstood what you're suggesting to document? If so, please > > > let me know; some example would be greatly helpful. > > > > Just document the 'VA % order = pgoff % order' equation in the kdoc > > for the new op. > > When it's "related to PTEs", it's talking about (2) above, so that's really > what I want to avoid mentioning. You can't avoid it. Drivers must ensure that pgoff % order == physical % order And that is something only drivers can do by knowing about this requirement. Jason
On Fri, Dec 19, 2025 at 10:59:57AM -0400, Jason Gunthorpe wrote: > On Tue, Dec 16, 2025 at 02:44:29PM -0500, Peter Xu wrote: > > > > Or maybe I misunderstood what you're suggesting to document? If so, please > > > > let me know; some example would be greatly helpful. > > > > > > Just document the 'VA % order = pgoff % order' equation in the kdoc > > > for the new op. > > > > When it's "related to PTEs", it's talking about (2) above, so that's really > > what I want to avoid mentioning. > > You can't avoid it. Drivers must ensure that > > pgoff % order == physical % order > > And that is something only drivers can do by knowing about this > requirement. This is a current limitation that above must be guaranteed, there's not much the driver can do, IMHO. If you could remember, that's the only reason why I used to suggest (while we were discussing this in v1) to make it *pgoff instead of pgoff, so that drivers can change *pgoff to make it relevant to HPA. I didn't take that approach as I want to make this simple until it's justified to be required. It holds true for vfio-pci, I hope it holds true forever. If not, this API will stop working, afaict. -- Peter Xu
On Fri, Dec 19, 2025 at 10:13:02AM -0500, Peter Xu wrote: > On Fri, Dec 19, 2025 at 10:59:57AM -0400, Jason Gunthorpe wrote: > > On Tue, Dec 16, 2025 at 02:44:29PM -0500, Peter Xu wrote: > > > > > Or maybe I misunderstood what you're suggesting to document? If so, please > > > > > let me know; some example would be greatly helpful. > > > > > > > > Just document the 'VA % order = pgoff % order' equation in the kdoc > > > > for the new op. > > > > > > When it's "related to PTEs", it's talking about (2) above, so that's really > > > what I want to avoid mentioning. > > > > You can't avoid it. Drivers must ensure that > > > > pgoff % order == physical % order > > > > And that is something only drivers can do by knowing about this > > requirement. > > This is a current limitation that above must be guaranteed, there's not > much the driver can do, IMHO. There is alot the driver can do! The driver decides on the pgoff values it is using, it needs to keep the above in mind when it builds its pgoff number space! > If you could remember, that's the only reason why I used to suggest (while > we were discussing this in v1) to make it *pgoff instead of pgoff, so that > drivers can change *pgoff to make it relevant to HPA. What? That's nonsense. The pgoff space is assigned by the driver and needs to remain a fixed relationship to the underlying phys the driver is mapping in. It shouldn't be changing pgoff during mmap! Jason
On Fri, Dec 19, 2025 at 11:20:30AM -0400, Jason Gunthorpe wrote: > On Fri, Dec 19, 2025 at 10:13:02AM -0500, Peter Xu wrote: > > On Fri, Dec 19, 2025 at 10:59:57AM -0400, Jason Gunthorpe wrote: > > > On Tue, Dec 16, 2025 at 02:44:29PM -0500, Peter Xu wrote: > > > > > > Or maybe I misunderstood what you're suggesting to document? If so, please > > > > > > let me know; some example would be greatly helpful. > > > > > > > > > > Just document the 'VA % order = pgoff % order' equation in the kdoc > > > > > for the new op. > > > > > > > > When it's "related to PTEs", it's talking about (2) above, so that's really > > > > what I want to avoid mentioning. > > > > > > You can't avoid it. Drivers must ensure that > > > > > > pgoff % order == physical % order > > > > > > And that is something only drivers can do by knowing about this > > > requirement. > > > > This is a current limitation that above must be guaranteed, there's not > > much the driver can do, IMHO. > > There is alot the driver can do! The driver decides on the pgoff > values it is using, it needs to keep the above in mind when it builds > its pgoff number space! Yeah, if so, it's reassuring. :) > > > If you could remember, that's the only reason why I used to suggest (while > > we were discussing this in v1) to make it *pgoff instead of pgoff, so that > > drivers can change *pgoff to make it relevant to HPA. > > What? That's nonsense. The pgoff space is assigned by the driver and > needs to remain a fixed relationship to the underlying phys the driver > is mapping in. It shouldn't be changing pgoff during mmap! I meant, return *pgoff as a hint, not changing the pgoff to be used.. Only changing the pgoff (as an integer) to be used in the VA calculations. Thanks for sharing above information to ease my mind, if drivers are all smart enough (I'll trust you more than myself on driver knowledges!) I think we're all good.. -- Peter Xu
I forgot to copy mm/fs maintainers for the 1st/2nd patches in this series,
my apologies. Whole series can be found here:
https://lore.kernel.org/r/20251204151003.171039-1-peterx@redhat.com
I'll modify the cc list when repost.
Thanks,
On Thu, Dec 04, 2025 at 10:10:01AM -0500, Peter Xu wrote:
> Add one new file operation, get_mapping_order(). It can be used by file
> backends to report mapping order hints.
>
> By default, Linux assumed we will map in PAGE_SIZE chunks. With this hint,
> the driver can report the possibility of mapping chunks that are larger
> than PAGE_SIZE. Then, the VA allocator will try to use that as alignment
> when allocating the VA ranges.
>
> This is useful because when chunks to be mapped are larger than PAGE_SIZE,
> VA alignment matters and it needs to be aligned with the size of the chunk
> to be mapped.
>
> Said that, no matter what is the alignment used for the VA allocation, the
> driver can still decide which size to map the chunks. It is also not an
> issue if it keeps mapping in PAGE_SIZE.
>
> get_mapping_order() is defined to take three parameters. Besides the 1st
> parameter which will be the file object pointer, the 2nd + 3rd parameters
> being the pgoff + size of the mmap() request. Its retval is defined as the
> order, which must be non-negative to enable the alignment. When zero is
> returned, it should behave like when the hint is not provided, IOW,
> alignment will still be PAGE_SIZE.
>
> When the order is too big, ignore the hint. Normally drivers are trusted,
> so it's more of an extra layer of safety measure.
>
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
> Documentation/filesystems/vfs.rst | 4 +++
> include/linux/fs.h | 1 +
> mm/mmap.c | 59 +++++++++++++++++++++++++++----
> 3 files changed, 57 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
> index 4f13b01e42eb5..b707ddbebbf52 100644
> --- a/Documentation/filesystems/vfs.rst
> +++ b/Documentation/filesystems/vfs.rst
> @@ -1069,6 +1069,7 @@ This describes how the VFS can manipulate an open file. As of kernel
> int (*fasync) (int, struct file *, int);
> int (*lock) (struct file *, int, struct file_lock *);
> unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
> + int (*get_mapping_order)(struct file *, unsigned long, size_t);
> int (*check_flags)(int);
> int (*flock) (struct file *, int, struct file_lock *);
> ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
> @@ -1165,6 +1166,9 @@ otherwise noted.
> ``get_unmapped_area``
> called by the mmap(2) system call
>
> +``get_mapping_order``
> + called by the mmap(2) system call to get mapping order hint
> +
> ``check_flags``
> called by the fcntl(2) system call for F_SETFL command
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index dd3b57cfadeeb..5ba373576bfe5 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2287,6 +2287,7 @@ struct file_operations {
> int (*fasync) (int, struct file *, int);
> int (*lock) (struct file *, int, struct file_lock *);
> unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
> + int (*get_mapping_order)(struct file *file, unsigned long pgoff, size_t len);
> int (*check_flags)(int);
> int (*flock) (struct file *, int, struct file_lock *);
> ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 8fa397a18252e..be3dd0623f00c 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -808,6 +808,33 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi
> return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
> }
>
> +static inline bool file_has_mmap_order_hint(struct file *file)
> +{
> + return file && file->f_op && file->f_op->get_mapping_order;
> +}
> +
> +static inline bool
> +mmap_should_align(struct file *file, unsigned long addr, unsigned long len)
> +{
> + /* When THP not enabled at all, skip */
> + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> + return false;
> +
> + /* Never try any alignment if the mmap() address hint is provided */
> + if (addr)
> + return false;
> +
> + /* Anonymous THP could use some better alignment when len aligned */
> + if (!file)
> + return IS_ALIGNED(len, PMD_SIZE);
> +
> + /*
> + * It's a file mapping, no address hint provided by caller, try any
> + * alignment if the file backend would provide a hint
> + */
> + return file_has_mmap_order_hint(file);
> +}
> +
> unsigned long
> __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
> unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
> @@ -815,8 +842,9 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
> unsigned long (*get_area)(struct file *, unsigned long,
> unsigned long, unsigned long, unsigned long)
> = NULL;
> -
> unsigned long error = arch_mmap_check(addr, len, flags);
> + unsigned long align;
> +
> if (error)
> return error;
>
> @@ -841,13 +869,30 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
>
> if (get_area) {
> addr = get_area(file, addr, len, pgoff, flags);
> - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file
> - && !addr /* no hint */
> - && IS_ALIGNED(len, PMD_SIZE)) {
> - /* Ensures that larger anonymous mappings are THP aligned. */
> + } else if (mmap_should_align(file, addr, len)) {
> + if (file_has_mmap_order_hint(file)) {
> + int order;
> + /*
> + * Allow driver to opt-in on the order hint.
> + *
> + * Sanity check on the order returned. Treating
> + * either negative or too big order to be invalid,
> + * where alignment will be skipped.
> + */
> + order = file->f_op->get_mapping_order(file, pgoff, len);
> + if (order < 0)
> + order = 0;
> + if (check_shl_overflow(PAGE_SIZE, order, &align))
> + /* No alignment applied */
> + align = PAGE_SIZE;
> + } else {
> + /* Default alignment for anonymous THPs */
> + align = PMD_SIZE;
> + }
> +
> addr = thp_get_unmapped_area_vmflags(file, addr, len,
> - pgoff, flags, PMD_SIZE,
> - vm_flags);
> + pgoff, flags,
> + align, vm_flags);
> } else {
> addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
> pgoff, flags, vm_flags);
> --
> 2.50.1
>
--
Peter Xu
On Thu, Dec 04, 2025 at 10:19:44AM -0500, Peter Xu wrote: > > Add one new file operation, get_mapping_order(). It can be used by file > > backends to report mapping order hints. This seems like a terrible idea. I'll look at it after Plumbers.
On Mon, Dec 08, 2025 at 09:21:58AM +0000, Matthew Wilcox wrote: > On Thu, Dec 04, 2025 at 10:19:44AM -0500, Peter Xu wrote: > > > Add one new file operation, get_mapping_order(). It can be used by file > > > backends to report mapping order hints. > > This seems like a terrible idea. I'll look at it after Plumbers. Sure, no rush, please feel free to go through discussion in v1 when it comes, that's where we landed to this API based on suggestions from Jason. I'm open to other suggestions. -- Peter Xu
© 2016 - 2026 Red Hat, Inc.