[PATCH v4 11/14] mm/hugetlbfs: update hugetlbfs to use mmap_prepare

Posted by Lorenzo Stoakes 4 months, 3 weeks ago
Since we can now perform actions after the VMA is established via
mmap_prepare, use desc->action.success_hook to set up the hugetlb VMA lock
once the VMA is in place.

We also make changes throughout hugetlbfs to make this possible.
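
For reference, the general shape of an mmap_prepare implementation that
defers work via the success hook looks roughly like this (a minimal sketch
distilled from the hunks below; the foo_* names are hypothetical and not
part of this patch):

	static int foo_mmap_success(const struct vm_area_struct *vma)
	{
		/*
		 * Runs once the VMA has been inserted; per-VMA state can
		 * safely be established here.
		 */
		return 0;
	}

	static int foo_mmap_prepare(struct vm_area_desc *desc)
	{
		/* Stage flags and ops on the descriptor, not on a live VMA. */
		desc->vm_flags |= VM_DONTEXPAND;
		desc->vm_ops = &foo_vm_ops;

		/* Defer anything that needs the real VMA until it exists. */
		desc->action.success_hook = foo_mmap_success;
		return 0;
	}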

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
---
 fs/hugetlbfs/inode.c           | 36 ++++++++++------
 include/linux/hugetlb.h        |  9 +++-
 include/linux/hugetlb_inline.h | 15 ++++---
 mm/hugetlb.c                   | 77 ++++++++++++++++++++--------------
 4 files changed, 85 insertions(+), 52 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f42548ee9083..9e0625167517 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
 #define PGOFF_LOFFT_MAX \
 	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
 
-static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma)
 {
+	/* Unfortunate we have to reassign vma->vm_private_data. */
+	return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma);
+}
+
+static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc)
+{
+	struct file *file = desc->file;
 	struct inode *inode = file_inode(file);
 	loff_t len, vma_len;
 	int ret;
@@ -112,8 +119,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	 * way when do_mmap unwinds (may be important on powerpc
 	 * and ia64).
 	 */
-	vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
-	vma->vm_ops = &hugetlb_vm_ops;
+	desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
+	desc->vm_ops = &hugetlb_vm_ops;
 
 	/*
 	 * page based offset in vm_pgoff could be sufficiently large to
@@ -122,16 +129,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	 * sizeof(unsigned long).  So, only check in those instances.
 	 */
 	if (sizeof(unsigned long) == sizeof(loff_t)) {
-		if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
+		if (desc->pgoff & PGOFF_LOFFT_MAX)
 			return -EINVAL;
 	}
 
 	/* must be huge page aligned */
-	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
+	if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
 		return -EINVAL;
 
-	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
-	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+	vma_len = (loff_t)vma_desc_size(desc);
+	len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT);
 	/* check for overflow */
 	if (len < vma_len)
 		return -EINVAL;
@@ -141,7 +148,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	ret = -ENOMEM;
 
-	vm_flags = vma->vm_flags;
+	vm_flags = desc->vm_flags;
 	/*
 	 * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
 	 * reserving here. Note: only for SHM hugetlbfs file, the inode
@@ -151,17 +158,20 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 		vm_flags |= VM_NORESERVE;
 
 	if (hugetlb_reserve_pages(inode,
-				vma->vm_pgoff >> huge_page_order(h),
-				len >> huge_page_shift(h), vma,
-				vm_flags) < 0)
+			desc->pgoff >> huge_page_order(h),
+			len >> huge_page_shift(h), desc,
+			vm_flags) < 0)
 		goto out;
 
 	ret = 0;
-	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
+	if ((desc->vm_flags & VM_WRITE) && inode->i_size < len)
 		i_size_write(inode, len);
 out:
 	inode_unlock(inode);
 
+	/* Allocate the VMA lock after we set it up. */
+	if (!ret)
+		desc->action.success_hook = hugetlb_file_mmap_prepare_success;
 	return ret;
 }
 
@@ -1221,7 +1231,7 @@ static void init_once(void *foo)
 
 static const struct file_operations hugetlbfs_file_operations = {
 	.read_iter		= hugetlbfs_read_iter,
-	.mmap			= hugetlbfs_file_mmap,
+	.mmap_prepare		= hugetlbfs_file_mmap_prepare,
 	.fsync			= noop_fsync,
 	.get_unmapped_area	= hugetlb_get_unmapped_area,
 	.llseek			= default_llseek,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8e63e46b8e1f..2387513d6ae5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			     struct folio **foliop);
 #endif /* CONFIG_USERFAULTFD */
 long hugetlb_reserve_pages(struct inode *inode, long from, long to,
-						struct vm_area_struct *vma,
-						vm_flags_t vm_flags);
+			   struct vm_area_desc *desc, vm_flags_t vm_flags);
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 						long freed);
 bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
@@ -280,6 +279,7 @@ bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 void fixup_hugetlb_reservations(struct vm_area_struct *vma);
 void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
+int hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -466,6 +466,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
 
 static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
 
+static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+	return 0;
+}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 
 #ifndef pgd_write
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 0660a03d37d9..a27aa0162918 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -2,22 +2,27 @@
 #ifndef _LINUX_HUGETLB_INLINE_H
 #define _LINUX_HUGETLB_INLINE_H
 
-#ifdef CONFIG_HUGETLB_PAGE
-
 #include <linux/mm.h>
 
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+#ifdef CONFIG_HUGETLB_PAGE
+
+static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
 {
-	return !!(vma->vm_flags & VM_HUGETLB);
+	return !!(vm_flags & VM_HUGETLB);
 }
 
 #else
 
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
 {
 	return false;
 }
 
 #endif
 
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+	return is_vm_hugetlb_flags(vma->vm_flags);
+}
+
 #endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1806685ea326..af28f7fbabb8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -119,7 +119,6 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init;
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
 static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end, bool take_locks);
@@ -427,17 +426,21 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
 	}
 }
 
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+/*
+ * vma specific semaphore used for pmd sharing and fault/truncation
+ * synchronization
+ */
+int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
 {
 	struct hugetlb_vma_lock *vma_lock;
 
 	/* Only establish in (flags) sharable vmas */
 	if (!vma || !(vma->vm_flags & VM_MAYSHARE))
-		return;
+		return 0;
 
 	/* Should never get here with non-NULL vm_private_data */
 	if (vma->vm_private_data)
-		return;
+		return -EINVAL;
 
 	vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
 	if (!vma_lock) {
@@ -452,13 +455,15 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
 		 * allocation failure.
 		 */
 		pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
-		return;
+		return -EINVAL;
 	}
 
 	kref_init(&vma_lock->refs);
 	init_rwsem(&vma_lock->rw_sema);
 	vma_lock->vma = vma;
 	vma->vm_private_data = vma_lock;
+
+	return 0;
 }
 
 /* Helper that removes a struct file_region from the resv_map cache and returns
@@ -1190,20 +1195,28 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 	}
 }
 
-static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
 {
-	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
-	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+	VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma);
+	VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma);
 
-	set_vma_private_data(vma, (unsigned long)map);
+	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
 }
 
-static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map)
 {
-	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
-	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+	VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+	VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
 
-	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+	desc->private_data = map;
+}
+
+static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags)
+{
+	VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+	VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
+
+	desc->private_data = (void *)((unsigned long)desc->private_data | flags);
 }
 
 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
@@ -1213,6 +1226,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
 	return (get_vma_private_data(vma) & flag) != 0;
 }
 
+static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag)
+{
+	VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+
+	return ((unsigned long)desc->private_data) & flag;
+}
+
 bool __vma_private_lock(struct vm_area_struct *vma)
 {
 	return !(vma->vm_flags & VM_MAYSHARE) &&
@@ -7250,9 +7270,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
  */
 
 long hugetlb_reserve_pages(struct inode *inode,
-					long from, long to,
-					struct vm_area_struct *vma,
-					vm_flags_t vm_flags)
+		long from, long to,
+		struct vm_area_desc *desc,
+		vm_flags_t vm_flags)
 {
 	long chg = -1, add = -1, spool_resv, gbl_resv;
 	struct hstate *h = hstate_inode(inode);
@@ -7267,12 +7287,6 @@ long hugetlb_reserve_pages(struct inode *inode,
 		return -EINVAL;
 	}
 
-	/*
-	 * vma specific semaphore used for pmd sharing and fault/truncation
-	 * synchronization
-	 */
-	hugetlb_vma_lock_alloc(vma);
-
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
@@ -7285,9 +7299,9 @@ long hugetlb_reserve_pages(struct inode *inode,
 	 * Shared mappings base their reservation on the number of pages that
 	 * are already allocated on behalf of the file. Private mappings need
 	 * to reserve the full area even if read-only as mprotect() may be
-	 * called to make the mapping read-write. Assume !vma is a shm mapping
+	 * called to make the mapping read-write. Assume !desc is a shm mapping
 	 */
-	if (!vma || vma->vm_flags & VM_MAYSHARE) {
+	if (!desc || desc->vm_flags & VM_MAYSHARE) {
 		/*
 		 * resv_map can not be NULL as hugetlb_reserve_pages is only
 		 * called for inodes for which resv_maps were created (see
@@ -7304,8 +7318,8 @@ long hugetlb_reserve_pages(struct inode *inode,
 
 		chg = to - from;
 
-		set_vma_resv_map(vma, resv_map);
-		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+		set_vma_desc_resv_map(desc, resv_map);
+		set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER);
 	}
 
 	if (chg < 0)
@@ -7315,7 +7329,7 @@ long hugetlb_reserve_pages(struct inode *inode,
 				chg * pages_per_huge_page(h), &h_cg) < 0)
 		goto out_err;
 
-	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
+	if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) {
 		/* For private mappings, the hugetlb_cgroup uncharge info hangs
 		 * of the resv_map.
 		 */
@@ -7349,7 +7363,7 @@ long hugetlb_reserve_pages(struct inode *inode,
 	 * consumed reservations are stored in the map. Hence, nothing
 	 * else has to be done for private mappings here
 	 */
-	if (!vma || vma->vm_flags & VM_MAYSHARE) {
+	if (!desc || desc->vm_flags & VM_MAYSHARE) {
 		add = region_add(resv_map, from, to, regions_needed, h, h_cg);
 
 		if (unlikely(add < 0)) {
@@ -7403,16 +7417,15 @@ long hugetlb_reserve_pages(struct inode *inode,
 	hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
 					    chg * pages_per_huge_page(h), h_cg);
 out_err:
-	hugetlb_vma_lock_free(vma);
-	if (!vma || vma->vm_flags & VM_MAYSHARE)
+	if (!desc || desc->vm_flags & VM_MAYSHARE)
 		/* Only call region_abort if the region_chg succeeded but the
 		 * region_add failed or didn't run.
 		 */
 		if (chg >= 0 && add < 0)
 			region_abort(resv_map, from, to, regions_needed);
-	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+	if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) {
 		kref_put(&resv_map->refs, resv_map_release);
-		set_vma_resv_map(vma, NULL);
+		set_vma_desc_resv_map(desc, NULL);
 	}
 	return chg < 0 ? chg : add < 0 ? add : -EINVAL;
 }
-- 
2.51.0
Re: [PATCH v4 11/14] mm/hugetlbfs: update hugetlbfs to use mmap_prepare
Posted by Sumanth Korikkar 4 months, 2 weeks ago
On Wed, Sep 17, 2025 at 08:11:13PM +0100, Lorenzo Stoakes wrote:
> Since we can now perform actions after the VMA is established via
> mmap_prepare, use desc->action_success_hook to set up the hugetlb lock
> once the VMA is setup.
> 
> We also make changes throughout hugetlbfs to make this possible.
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  fs/hugetlbfs/inode.c           | 36 ++++++++++------
>  include/linux/hugetlb.h        |  9 +++-
>  include/linux/hugetlb_inline.h | 15 ++++---
>  mm/hugetlb.c                   | 77 ++++++++++++++++++++--------------
>  4 files changed, 85 insertions(+), 52 deletions(-)
> 
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index f42548ee9083..9e0625167517 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
>  #define PGOFF_LOFFT_MAX \
>  	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
>  
> -static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
> +static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma)
>  {
> +	/* Unfortunate we have to reassign vma->vm_private_data. */
> +	return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma);
> +}

Hi Lorenzo,

The following test causes the kernel to enter a blocked state,
suggesting an issue related to locking order. I was able to reproduce
this behavior in certain test runs.

Test case:
git clone https://github.com/libhugetlbfs/libhugetlbfs.git
cd libhugetlbfs ; ./configure
make -j32
cd tests
echo 100 > /proc/sys/vm/nr_hugepages
mkdir -p /test-hugepages && mount -t hugetlbfs nodev /test-hugepages
./run_tests.py <in a loop>
...
shm-fork 10 100 (1024K: 64):    PASS
set shmmax limit to 104857600
shm-getraw 100 /dev/full (1024K: 32):
shm-getraw 100 /dev/full (1024K: 64):   PASS
fallocate_stress.sh (1024K: 64):  <blocked>

Blocked task state below:

task:fallocate_stres state:D stack:0     pid:5106  tgid:5106  ppid:5103
task_flags:0x400000 flags:0x00000001
Call Trace:
 [<00000255adc646f0>] __schedule+0x370/0x7f0
 [<00000255adc64bb0>] schedule+0x40/0xc0
 [<00000255adc64d32>] schedule_preempt_disabled+0x22/0x30
 [<00000255adc68492>] rwsem_down_write_slowpath+0x232/0x610
 [<00000255adc68922>] down_write_killable+0x52/0x80
 [<00000255ad12c980>] vm_mmap_pgoff+0xc0/0x1f0
 [<00000255ad164bbe>] ksys_mmap_pgoff+0x17e/0x220
 [<00000255ad164d3c>] __s390x_sys_old_mmap+0x7c/0xa0
 [<00000255adc60e4e>] __do_syscall+0x12e/0x350
 [<00000255adc6cfee>] system_call+0x6e/0x90
task:fallocate_stres state:D stack:0     pid:5109  tgid:5106  ppid:5103
task_flags:0x400040 flags:0x00000001
Call Trace:
 [<00000255adc646f0>] __schedule+0x370/0x7f0
 [<00000255adc64bb0>] schedule+0x40/0xc0
 [<00000255adc64d32>] schedule_preempt_disabled+0x22/0x30
 [<00000255adc68492>] rwsem_down_write_slowpath+0x232/0x610
 [<00000255adc688be>] down_write+0x4e/0x60
 [<00000255ad1c11ec>] __hugetlb_zap_begin+0x3c/0x70
 [<00000255ad158b9c>] unmap_vmas+0x10c/0x1a0
 [<00000255ad180844>] vms_complete_munmap_vmas+0x134/0x2e0
 [<00000255ad1811be>] do_vmi_align_munmap+0x13e/0x170
 [<00000255ad1812ae>] do_vmi_munmap+0xbe/0x140
 [<00000255ad183f86>] __vm_munmap+0xe6/0x190
 [<00000255ad166832>] __s390x_sys_munmap+0x32/0x40
 [<00000255adc60e4e>] __do_syscall+0x12e/0x350
 [<00000255adc6cfee>] system_call+0x6e/0x90


Thanks,
Sumanth
Re: [PATCH v4 11/14] mm/hugetlbfs: update hugetlbfs to use mmap_prepare
Posted by Lorenzo Stoakes 3 months, 3 weeks ago
On Tue, Sep 23, 2025 at 01:52:09PM +0200, Sumanth Korikkar wrote:
> Hi Lorenzo,
>
> The following tests causes the kernel to enter a blocked state,
> suggesting an issue related to locking order. I was able to reproduce
> this behavior in certain test runs.
>
> Test case:
> git clone https://github.com/libhugetlbfs/libhugetlbfs.git
> cd libhugetlbfs ; ./configure
> make -j32
> cd tests
> echo 100 > /proc/sys/vm/nr_hugepages
> mkdir -p /test-hugepages && mount -t hugetlbfs nodev /test-hugepages
> ./run_tests.py <in a loop>
> ...
> shm-fork 10 100 (1024K: 64):    PASS
> set shmmax limit to 104857600
> shm-getraw 100 /dev/full (1024K: 32):
> shm-getraw 100 /dev/full (1024K: 64):   PASS
> fallocate_stress.sh (1024K: 64):  <blocked>
>
> Blocked task state below:
>
> task:fallocate_stres state:D stack:0     pid:5106  tgid:5106  ppid:5103
> task_flags:0x400000 flags:0x00000001
> Call Trace:
>  [<00000255adc646f0>] __schedule+0x370/0x7f0
>  [<00000255adc64bb0>] schedule+0x40/0xc0
>  [<00000255adc64d32>] schedule_preempt_disabled+0x22/0x30
>  [<00000255adc68492>] rwsem_down_write_slowpath+0x232/0x610
>  [<00000255adc68922>] down_write_killable+0x52/0x80
>  [<00000255ad12c980>] vm_mmap_pgoff+0xc0/0x1f0
>  [<00000255ad164bbe>] ksys_mmap_pgoff+0x17e/0x220
>  [<00000255ad164d3c>] __s390x_sys_old_mmap+0x7c/0xa0
>  [<00000255adc60e4e>] __do_syscall+0x12e/0x350
>  [<00000255adc6cfee>] system_call+0x6e/0x90
> task:fallocate_stres state:D stack:0     pid:5109  tgid:5106  ppid:5103
> task_flags:0x400040 flags:0x00000001
> Call Trace:
>  [<00000255adc646f0>] __schedule+0x370/0x7f0
>  [<00000255adc64bb0>] schedule+0x40/0xc0
>  [<00000255adc64d32>] schedule_preempt_disabled+0x22/0x30
>  [<00000255adc68492>] rwsem_down_write_slowpath+0x232/0x610
>  [<00000255adc688be>] down_write+0x4e/0x60
>  [<00000255ad1c11ec>] __hugetlb_zap_begin+0x3c/0x70
>  [<00000255ad158b9c>] unmap_vmas+0x10c/0x1a0
>  [<00000255ad180844>] vms_complete_munmap_vmas+0x134/0x2e0
>  [<00000255ad1811be>] do_vmi_align_munmap+0x13e/0x170
>  [<00000255ad1812ae>] do_vmi_munmap+0xbe/0x140
>  [<00000255ad183f86>] __vm_munmap+0xe6/0x190
>  [<00000255ad166832>] __s390x_sys_munmap+0x32/0x40
>  [<00000255adc60e4e>] __do_syscall+0x12e/0x350
>  [<00000255adc6cfee>] system_call+0x6e/0x90
>
>
> Thanks,
> Sumanth

(been on holiday for a couple weeks and last week was a catch-up! :)

So having looked into this, the issue is that hugetlbfs exposes a per-VMA
hugetlbfs lock which can be taken via the rmap.

So, while faults are disallowed until the VMA is fully set up, rmap walks
are not, and therefore there's a race between setting up the hugetlb VMA
lock and an rmap walker trying to take/release it.

It's a real edge case as it's kind of unusual to have this requirement during
initial custom mmap, but to account for this and for any other users which might
require it, I have resolved this by introducing the ability to hold on to the
rmap lock until the VMA is fully set up.

The window is very very small, but obviously it's one we have to account for :)

This is the most correct solution I think, as it prevents any confusion as
to the state of the lock: rmap users simply cannot access the VMA until it
is established.
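
One plausible interleaving, just to illustrate the window (this is a sketch,
not the exact sequence from the trace above, and assumes the lock/unlock
helpers key off vm_private_data):

	mmap task                           rmap walker (e.g. truncation)
	---------                           -----------------------------
	VMA linked into mm and i_mmap
	                                    finds the VMA via the rmap
	                                    hugetlb_vma_lock_write(vma)
	                                      -> no vm_private_data yet,
	                                         nothing is taken
	success hook runs:
	  hugetlb_vma_lock_alloc(vma)
	  vma->vm_private_data = vma_lock
	                                    hugetlb_vma_unlock_write(vma)
	                                      -> now finds a lock it never
	                                         took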

I am putting the finishing touches to a respin with this fix included, will cc
you on it.

Cheers, Lorenzo
Re: [PATCH v4 11/14] mm/hugetlbfs: update hugetlbfs to use mmap_prepare
Posted by Andrew Morton 4 months, 2 weeks ago
On Tue, 23 Sep 2025 13:52:09 +0200 Sumanth Korikkar <sumanthk@linux.ibm.com> wrote:

> > --- a/fs/hugetlbfs/inode.c
> > +++ b/fs/hugetlbfs/inode.c
> > @@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
> >  #define PGOFF_LOFFT_MAX \
> >  	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
> >  
> > -static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
> > +static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma)
> >  {
> > +	/* Unfortunate we have to reassign vma->vm_private_data. */
> > +	return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma);
> > +}
> 
> Hi Lorenzo,
> 
> The following tests causes the kernel to enter a blocked state,
> suggesting an issue related to locking order. I was able to reproduce
> this behavior in certain test runs.

Thanks.  I pulled this series out of mm.git's mm-stable branch, put it
back into mm-unstable.
Re: [PATCH v4 11/14] mm/hugetlbfs: update hugetlbfs to use mmap_prepare
Posted by Sumanth Korikkar 3 months, 3 weeks ago
On Tue, Sep 23, 2025 at 02:17:04PM -0700, Andrew Morton wrote:
> On Tue, 23 Sep 2025 13:52:09 +0200 Sumanth Korikkar <sumanthk@linux.ibm.com> wrote:
> 
> > > --- a/fs/hugetlbfs/inode.c
> > > +++ b/fs/hugetlbfs/inode.c
> > > @@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
> > >  #define PGOFF_LOFFT_MAX \
> > >  	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
> > >  
> > > -static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
> > > +static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma)
> > >  {
> > > +	/* Unfortunate we have to reassign vma->vm_private_data. */
> > > +	return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma);
> > > +}
> > 
> > Hi Lorenzo,
> > 
> > The following tests causes the kernel to enter a blocked state,
> > suggesting an issue related to locking order. I was able to reproduce
> > this behavior in certain test runs.
> 
> Thanks.  I pulled this series out of mm.git's mm-stable branch, put it
> back into mm-unstable.

Hi all,

The issue is reproducible again in linux-next with the following commit:
5fdb155933fa ("mm/hugetlbfs: update hugetlbfs to use mmap_prepare")

Thanks,
Sumanth
Re: [PATCH v4 11/14] mm/hugetlbfs: update hugetlbfs to use mmap_prepare
Posted by Lorenzo Stoakes 3 months, 3 weeks ago
On Fri, Oct 17, 2025 at 02:27:53PM +0200, Sumanth Korikkar wrote:
> On Tue, Sep 23, 2025 at 02:17:04PM -0700, Andrew Morton wrote:
> > On Tue, 23 Sep 2025 13:52:09 +0200 Sumanth Korikkar <sumanthk@linux.ibm.com> wrote:
> >
> > > > --- a/fs/hugetlbfs/inode.c
> > > > +++ b/fs/hugetlbfs/inode.c
> > > > @@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
> > > >  #define PGOFF_LOFFT_MAX \
> > > >  	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
> > > >
> > > > -static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
> > > > +static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma)
> > > >  {
> > > > +	/* Unfortunate we have to reassign vma->vm_private_data. */
> > > > +	return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma);
> > > > +}
> > >
> > > Hi Lorenzo,
> > >
> > > The following tests causes the kernel to enter a blocked state,
> > > suggesting an issue related to locking order. I was able to reproduce
> > > this behavior in certain test runs.
> >
> > Thanks.  I pulled this series out of mm.git's mm-stable branch, put it
> > back into mm-unstable.
>
> Hi all,
>
> The issue is reproducible again in linux-next with the following commit:
> 5fdb155933fa ("mm/hugetlbfs: update hugetlbfs to use mmap_prepare")

Andrew - I see this series in mm-unstable, not sure what it's doing there
as I need to rework this (when I get a chance - I'm just back from a 2-week
vacation and this week has been difficult :)

Can we please drop this until I have a chance to respin?

Thanks, Lorenzo
Re: [PATCH v4 11/14] mm/hugetlbfs: update hugetlbfs to use mmap_prepare
Posted by Andrew Morton 3 months, 3 weeks ago
On Fri, 17 Oct 2025 13:46:20 +0100 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:

> > The issue is reproducible again in linux-next with the following commit:
> > 5fdb155933fa ("mm/hugetlbfs: update hugetlbfs to use mmap_prepare")
> 
> Andrew - I see this series in mm-unstable, not sure what it's doing there
> as I need to rework this (when I get a chance, back from a 2 week vacation
> and this week has been - difficult :)
> 
> Can we please drop this until I have a chance to respin?

No probs, gone.
Re: [PATCH v4 11/14] mm/hugetlbfs: update hugetlbfs to use mmap_prepare
Posted by Lorenzo Stoakes 4 months, 2 weeks ago
On Tue, Sep 23, 2025 at 02:17:04PM -0700, Andrew Morton wrote:
> On Tue, 23 Sep 2025 13:52:09 +0200 Sumanth Korikkar <sumanthk@linux.ibm.com> wrote:
>
> > > --- a/fs/hugetlbfs/inode.c
> > > +++ b/fs/hugetlbfs/inode.c
> > > @@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
> > >  #define PGOFF_LOFFT_MAX \
> > >  	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
> > >
> > > -static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
> > > +static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma)
> > >  {
> > > +	/* Unfortunate we have to reassign vma->vm_private_data. */
> > > +	return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma);
> > > +}
> >
> > Hi Lorenzo,
> >
> > The following tests causes the kernel to enter a blocked state,
> > suggesting an issue related to locking order. I was able to reproduce
> > this behavior in certain test runs.
>
> Thanks.  I pulled this series out of mm.git's mm-stable branch, put it
> back into mm-unstable.

I'm at a conference right now and after that I'm on leave for a couple of
weeks, returning in the first week of 6.18-rc1, so I think it's best to
delay this series for a cycle so I can properly dig in here and determine
the best way forward then :)

Cheers, Lorenzo