[PATCH 10/16] mm/hugetlb: update hugetlbfs to use mmap_prepare, mmap_complete

Posted by Lorenzo Stoakes 1 day, 13 hours ago
We can now update hugetlb to make use of the new .mmap_prepare() hook by
deferring the reservation of pages until the VMA is fully established,
handling this in the f_op->mmap_complete() hook.

We hold the VMA write lock throughout so we can't race with faults. rmap
can discover the VMA, but this should not cause a problem.
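
For illustration, the ordering relied upon here looks roughly as follows
(a simplified sketch of the caller side; insert_vma_from_desc() and the
exact call shapes are hypothetical, not the actual mm code):

	/*
	 * Simplified sketch of the mmap() path assumed by this patch.
	 * Helper names are illustrative only.
	 */
	static int mmap_region_sketch(struct file *file, struct vm_area_desc *desc)
	{
		struct vm_area_struct *vma;
		int ret;

		/* 1. No VMA exists yet; the hook may only adjust the descriptor. */
		ret = file->f_op->mmap_prepare(desc);
		if (ret)
			return ret;

		/*
		 * 2. The VMA is built from the descriptor and inserted while
		 *    the VMA write lock is held, so faults cannot race with
		 *    step 3.
		 */
		vma = insert_vma_from_desc(desc);
		if (IS_ERR(vma))
			return PTR_ERR(vma);

		/* 3. Only now are hugetlb pages reserved, in the complete hook. */
		return file->f_op->mmap_complete(file, vma, /* context */ NULL);
	}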

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 fs/hugetlbfs/inode.c | 86 ++++++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 39 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3cfdf4091001..46d1ddc654c2 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -96,39 +96,14 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
 #define PGOFF_LOFFT_MAX \
 	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
 
-static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int hugetlb_file_mmap_complete(struct file *file, struct vm_area_struct *vma,
+				      const void *context)
 {
 	struct inode *inode = file_inode(file);
-	loff_t len, vma_len;
-	int ret;
 	struct hstate *h = hstate_file(file);
-	vm_flags_t vm_flags;
-
-	/*
-	 * vma address alignment (but not the pgoff alignment) has
-	 * already been checked by prepare_hugepage_range.  If you add
-	 * any error returns here, do so after setting VM_HUGETLB, so
-	 * is_vm_hugetlb_page tests below unmap_region go the right
-	 * way when do_mmap unwinds (may be important on powerpc
-	 * and ia64).
-	 */
-	vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
-	vma->vm_ops = &hugetlb_vm_ops;
-
-	/*
-	 * page based offset in vm_pgoff could be sufficiently large to
-	 * overflow a loff_t when converted to byte offset.  This can
-	 * only happen on architectures where sizeof(loff_t) ==
-	 * sizeof(unsigned long).  So, only check in those instances.
-	 */
-	if (sizeof(unsigned long) == sizeof(loff_t)) {
-		if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
-			return -EINVAL;
-	}
-
-	/* must be huge page aligned */
-	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
-		return -EINVAL;
+	vm_flags_t vm_flags = vma->vm_flags;
+	loff_t len, vma_len;
+	int ret = 0;
 
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
@@ -139,9 +114,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	inode_lock(inode);
 	file_accessed(file);
 
-	ret = -ENOMEM;
-
-	vm_flags = vma->vm_flags;
 	/*
 	 * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
 	 * reserving here. Note: only for SHM hugetlbfs file, the inode
@@ -151,20 +123,55 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 		vm_flags |= VM_NORESERVE;
 
 	if (hugetlb_reserve_pages(inode,
-				vma->vm_pgoff >> huge_page_order(h),
-				len >> huge_page_shift(h), vma,
-				vm_flags) < 0)
+			vma->vm_pgoff >> huge_page_order(h),
+			len >> huge_page_shift(h), vma,
+			vm_flags) < 0) {
+		ret = -ENOMEM;
 		goto out;
+	}
 
-	ret = 0;
 	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
 		i_size_write(inode, len);
+
 out:
 	inode_unlock(inode);
-
 	return ret;
 }
 
+static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc)
+{
+	struct file *file = desc->file;
+	struct hstate *h = hstate_file(file);
+
+	/*
+	 * vma address alignment (but not the pgoff alignment) has
+	 * already been checked by prepare_hugepage_range.  If you add
+	 * any error returns here, do so after setting VM_HUGETLB, so
+	 * is_vm_hugetlb_page tests below unmap_region go the right
+	 * way when do_mmap unwinds (may be important on powerpc
+	 * and ia64).
+	 */
+	desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
+	desc->vm_ops = &hugetlb_vm_ops;
+
+	/*
+	 * page based offset in vm_pgoff could be sufficiently large to
+	 * overflow a loff_t when converted to byte offset.  This can
+	 * only happen on architectures where sizeof(loff_t) ==
+	 * sizeof(unsigned long).  So, only check in those instances.
+	 */
+	if (sizeof(unsigned long) == sizeof(loff_t)) {
+		if (desc->pgoff & PGOFF_LOFFT_MAX)
+			return -EINVAL;
+	}
+
+	/* must be huge page aligned */
+	if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
+		return -EINVAL;
+
+	return 0;
+}
+
 /*
  * Called under mmap_write_lock(mm).
  */
@@ -1219,7 +1226,8 @@ static void init_once(void *foo)
 
 static const struct file_operations hugetlbfs_file_operations = {
 	.read_iter		= hugetlbfs_read_iter,
-	.mmap			= hugetlbfs_file_mmap,
+	.mmap_prepare		= hugetlbfs_file_mmap_prepare,
+	.mmap_complete		= hugetlb_file_mmap_complete,
 	.fsync			= noop_fsync,
 	.get_unmapped_area	= hugetlb_get_unmapped_area,
 	.llseek			= default_llseek,
-- 
2.51.0
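
For reference, the struct vm_area_desc consumed by the new
hugetlbfs_file_mmap_prepare() above has roughly the following shape
(paraphrased from the series; see include/linux/mm_types.h in the tree
for the authoritative definition):

	struct vm_area_desc {
		/* Immutable state: where the mapping will be placed. */
		struct mm_struct *mm;
		unsigned long start;
		unsigned long end;

		/* Mutable fields, pre-populated with the initial mapping state. */
		pgoff_t pgoff;
		struct file *file;
		vm_flags_t vm_flags;
		pgprot_t page_prot;

		/* Write-only fields for the hook to fill in. */
		const struct vm_operations_struct *vm_ops;
		void *private_data;
	};
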
Re: [PATCH 10/16] mm/hugetlb: update hugetlbfs to use mmap_prepare, mmap_complete
Posted by Jason Gunthorpe 1 day, 11 hours ago
On Mon, Sep 08, 2025 at 12:10:41PM +0100, Lorenzo Stoakes wrote:
> @@ -151,20 +123,55 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
>  		vm_flags |= VM_NORESERVE;
>  
>  	if (hugetlb_reserve_pages(inode,
> -				vma->vm_pgoff >> huge_page_order(h),
> -				len >> huge_page_shift(h), vma,
> -				vm_flags) < 0)
> +			vma->vm_pgoff >> huge_page_order(h),
> +			len >> huge_page_shift(h), vma,
> +			vm_flags) < 0) {

It was split like this because vma is passed here, right?

But hugetlb_reserve_pages() doesn't do much with the vma:

	hugetlb_vma_lock_alloc(vma);
[..]
	vma->vm_private_data = vma_lock;

This manipulates the private data, which should already exist in prepare.

It checks shared vs non-shared a few times:

	if (!vma || vma->vm_flags & VM_MAYSHARE) {
	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
	if (!vma || vma->vm_flags & VM_MAYSHARE) {

And does this resv_map stuff:

		set_vma_resv_map(vma, resv_map);
		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
[..]
	set_vma_private_data(vma, (unsigned long)map);

Which is also just manipulating the private data.

So it looks to me like it should be refactored so that
hugetlb_reserve_pages() returns the priv pointer to set in the VMA
instead of accepting vma as an argument. Maybe just pass in the desc
instead?
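
Concretely, that sketches out to something like this (hypothetical name
and signature, not code from this series):

	/*
	 * Hypothetical refactor: reserve against the descriptor and hand
	 * back the value for vm_private_data, instead of writing into a
	 * vm_area_struct directly. Returns ERR_PTR() on failure.
	 */
	void *hugetlb_reserve_pages_desc(struct inode *inode,
					 long from, long to,
					 struct vm_area_desc *desc,
					 vm_flags_t vm_flags);

.mmap_prepare() would then stash the result in desc->private_data, which
becomes vma->vm_private_data once the VMA is actually created.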

Then no need to introduce complete. I think it is probably better to
try to avoid using complete except for filling PTEs.

Jason
Re: [PATCH 10/16] mm/hugetlb: update hugetlbfs to use mmap_prepare, mmap_complete
Posted by Lorenzo Stoakes 1 day, 11 hours ago
On Mon, Sep 08, 2025 at 10:11:21AM -0300, Jason Gunthorpe wrote:
> On Mon, Sep 08, 2025 at 12:10:41PM +0100, Lorenzo Stoakes wrote:
> > @@ -151,20 +123,55 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
> >  		vm_flags |= VM_NORESERVE;
> >
> >  	if (hugetlb_reserve_pages(inode,
> > -				vma->vm_pgoff >> huge_page_order(h),
> > -				len >> huge_page_shift(h), vma,
> > -				vm_flags) < 0)
> > +			vma->vm_pgoff >> huge_page_order(h),
> > +			len >> huge_page_shift(h), vma,
> > +			vm_flags) < 0) {
>
> It was split like this because vma is passed here, right?
>
> But hugetlb_reserve_pages() doesn't do much with the vma:
>
> 	hugetlb_vma_lock_alloc(vma);
> [..]
> 	vma->vm_private_data = vma_lock;
>
> This manipulates the private data, which should already exist in prepare.
>
> It checks shared vs non-shared a few times:
>
> 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
> 	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
> 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
>
> And does this resv_map stuff:
>
> 		set_vma_resv_map(vma, resv_map);
> 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
> [..]
> 	set_vma_private_data(vma, (unsigned long)map);
>
> Which is also just manipulating the private data.
>
> So it looks to me like it should be refactored so that
> hugetlb_reserve_pages() returns the priv pointer to set in the VMA
> instead of accepting vma as an argument. Maybe just pass in the desc
> instead?

Well hugetlb_vma_lock_alloc() does:

	vma_lock->vma = vma;

Which we cannot do in prepare.

This is checked in hugetlb_dup_vma_private(), and obviously desc is not a stable
pointer to be used for comparing anything.
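
Schematically, the object and the check in question (as they stand in
current hugetlb code; see include/linux/hugetlb.h and mm/hugetlb.c for
the exact definitions):

	struct hugetlb_vma_lock {
		struct kref refs;
		struct rw_semaphore rw_sema;
		struct vm_area_struct *vma;	/* back-pointer set at alloc time */
	};

	/* hugetlb_dup_vma_private() relies on comparing that back-pointer: */
	if (vma_lock && vma_lock->vma != vma)
		vma->vm_private_data = NULL;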

I'm also trying to do the minimal changes I can here; I'd rather not
majorly refactor things to suit this change if possible.

>
> Then no need to introduce complete. I think it is probably better to
> try to avoid using complete except for filling PTEs.

I'd rather do that, yes. hugetlbfs is the exception to many rules, unfortunately.

>
> Jason

Cheers, Lorenzo
Re: [PATCH 10/16] mm/hugetlb: update hugetlbfs to use mmap_prepare, mmap_complete
Posted by Jason Gunthorpe 1 day, 11 hours ago
On Mon, Sep 08, 2025 at 02:37:44PM +0100, Lorenzo Stoakes wrote:
> On Mon, Sep 08, 2025 at 10:11:21AM -0300, Jason Gunthorpe wrote:
> > On Mon, Sep 08, 2025 at 12:10:41PM +0100, Lorenzo Stoakes wrote:
> > > @@ -151,20 +123,55 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
> > >  		vm_flags |= VM_NORESERVE;
> > >
> > >  	if (hugetlb_reserve_pages(inode,
> > > -				vma->vm_pgoff >> huge_page_order(h),
> > > -				len >> huge_page_shift(h), vma,
> > > -				vm_flags) < 0)
> > > +			vma->vm_pgoff >> huge_page_order(h),
> > > +			len >> huge_page_shift(h), vma,
> > > +			vm_flags) < 0) {
> >
> > It was split like this because vma is passed here, right?
> >
> > But hugetlb_reserve_pages() doesn't do much with the vma:
> >
> > 	hugetlb_vma_lock_alloc(vma);
> > [..]
> > 	vma->vm_private_data = vma_lock;
> >
> > This manipulates the private data, which should already exist in prepare.
> >
> > It checks shared vs non-shared a few times:
> >
> > 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
> > 	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
> > 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
> >
> > And does this resv_map stuff:
> >
> > 		set_vma_resv_map(vma, resv_map);
> > 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
> > [..]
> > 	set_vma_private_data(vma, (unsigned long)map);
> >
> > Which is also just manipulating the private data.
> >
> > So it looks to me like it should be refactored so that
> > hugetlb_reserve_pages() returns the priv pointer to set in the VMA
> > instead of accepting vma as an argument. Maybe just pass in the desc
> > instead?
> 
> Well hugetlb_vma_lock_alloc() does:
> 
> 	vma_lock->vma = vma;
> 
> Which we cannot do in prepare.

Okay, just doing that in complete would be appropriate then.
 
> This is checked in hugetlb_dup_vma_private(), and obviously desc is not a stable
> pointer to be used for comparing anything.
> 
> I'm also trying to do the minimal changes I can here; I'd rather not
> majorly refactor things to suit this change if possible.

It doesn't look like a big refactor: pass the vma desc into
hugetlb_reserve_pages() and lift the vma_lock set out.

Jason
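
A hypothetical sketch of "lift the vma_lock set out" (invented name,
simplified to the VM_MAYSHARE case; not code from this series):

	/*
	 * hugetlb_reserve_pages() would stop setting the back-pointer
	 * itself; the caller attaches it once the VMA actually exists,
	 * e.g. from the complete stage.
	 */
	static void hugetlb_vma_lock_attach(struct vm_area_struct *vma)
	{
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

		if (vma_lock)
			vma_lock->vma = vma;	/* deferred from prepare time */
	}
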
Re: [PATCH 10/16] mm/hugetlb: update hugetlbfs to use mmap_prepare, mmap_complete
Posted by Lorenzo Stoakes 1 day, 10 hours ago
On Mon, Sep 08, 2025 at 10:52:40AM -0300, Jason Gunthorpe wrote:
> On Mon, Sep 08, 2025 at 02:37:44PM +0100, Lorenzo Stoakes wrote:
> > On Mon, Sep 08, 2025 at 10:11:21AM -0300, Jason Gunthorpe wrote:
> > > On Mon, Sep 08, 2025 at 12:10:41PM +0100, Lorenzo Stoakes wrote:
> > > > @@ -151,20 +123,55 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
> > > >  		vm_flags |= VM_NORESERVE;
> > > >
> > > >  	if (hugetlb_reserve_pages(inode,
> > > > -				vma->vm_pgoff >> huge_page_order(h),
> > > > -				len >> huge_page_shift(h), vma,
> > > > -				vm_flags) < 0)
> > > > +			vma->vm_pgoff >> huge_page_order(h),
> > > > +			len >> huge_page_shift(h), vma,
> > > > +			vm_flags) < 0) {
> > >
> > > It was split like this because vma is passed here, right?
> > >
> > > But hugetlb_reserve_pages() doesn't do much with the vma:
> > >
> > > 	hugetlb_vma_lock_alloc(vma);
> > > [..]
> > > 	vma->vm_private_data = vma_lock;
> > >
> > > This manipulates the private data, which should already exist in prepare.
> > >
> > > It checks shared vs non-shared a few times:
> > >
> > > 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
> > > 	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
> > > 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
> > >
> > > And does this resv_map stuff:
> > >
> > > 		set_vma_resv_map(vma, resv_map);
> > > 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
> > > [..]
> > > 	set_vma_private_data(vma, (unsigned long)map);
> > >
> > > Which is also just manipulating the private data.
> > >
> > > So it looks to me like it should be refactored so that
> > > hugetlb_reserve_pages() returns the priv pointer to set in the VMA
> > > instead of accepting vma as an argument. Maybe just pass in the desc
> > > instead?
> >
> > Well hugetlb_vma_lock_alloc() does:
> >
> > 	vma_lock->vma = vma;
> >
> > Which we cannot do in prepare.
>
> Okay, just doing that in complete would be appropriate then.
>
> > This is checked in hugetlb_dup_vma_private(), and obviously desc is not a stable
> > pointer to be used for comparing anything.
> >
> > I'm also trying to do the minimal changes I can here; I'd rather not
> > majorly refactor things to suit this change if possible.
>
> It doesn't look like a big refactor: pass the vma desc into
> hugetlb_reserve_pages() and lift the vma_lock set out.

OK, I'll take a look at refactoring this.

>
> Jason

Cheers, Lorenzo