[PATCH 3/3] mm/mremap: check map count under mmap write lock and abstract

Lorenzo Stoakes (Oracle) posted 3 patches 3 weeks, 6 days ago
[PATCH 3/3] mm/mremap: check map count under mmap write lock and abstract
Posted by Lorenzo Stoakes (Oracle) 3 weeks, 6 days ago
We are checking the map count in check_mremap_params(), prior to obtaining
an mmap write lock, which means that accesses to current->mm->map_count
might race with this field being updated.

Resolve this by only checking this field after the mmap write lock is held.

Additionally, abstract this check into a helper function with extensive
ASCII documentation of what's going on.

Reported-by: Jianzhou Zhao <luckd0g@163.com>
Closes: https://lore.kernel.org/all/1a7d4c26.6b46.19cdbe7eaf0.Coremail.luckd0g@163.com/
Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
---
 mm/mremap.c | 88 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 75 insertions(+), 13 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index ba6c690f6c1b..ee46bbb031e6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1028,6 +1028,75 @@ static void vrm_stat_account(struct vma_remap_struct *vrm,
 		mm->locked_vm += pages;
 }
 
+static bool __check_map_count_against_split(struct mm_struct *mm,
+					    bool before_unmaps)
+{
+	const int sys_map_count = get_sysctl_max_map_count();
+	int map_count = mm->map_count;
+
+	mmap_assert_write_locked(mm);
+
+	/*
+	 * At the point of shrinking the VMA, if new_len < old_len, we unmap
+	 * thusly in the worst case:
+	 *
+	 *              old_addr+old_len                    old_addr+old_len
+	 * |---------------.----.---------|    |---------------|    |---------|
+	 * |               .    .         | -> |      +1       | -1 |   +1    |
+	 * |---------------.----.---------|    |---------------|    |---------|
+	 *        old_addr+new_len                     old_addr+new_len
+	 *
+	 * At the point of removing the portion of an existing VMA to make space
+	 * for the moved VMA if MREMAP_FIXED, we unmap thusly in the worst case:
+	 *
+	 *   new_addr   new_addr+new_len         new_addr   new_addr+new_len
+	 * |----.---------------.---------|    |----|               |---------|
+	 * |    .               .         | -> | +1 |      -1       |   +1    |
+	 * |----.---------------.---------|    |----|               |---------|
+	 *
+	 * Therefore, before we consider the move at all, we have to account
+	 * for 2 additional VMAs possibly being created upon these unmappings.
+	 */
+	if (before_unmaps)
+		map_count += 2;
+
+	/*
+	 * At the point of MOVING the VMA:
+	 *
+	 * We start by copying a VMA, which creates an additional VMA if no
+	 * merge occurs, then if not MREMAP_DONTUNMAP, we unmap the source VMA.
+	 * In the worst case we might then observe:
+	 *
+	 *   new_addr   new_addr+new_len         new_addr   new_addr+new_len
+	 * |----|               |---------|    |----|---------------|---------|
+	 * |    |               |         | -> |    |      +1       |         |
+	 * |----|               |---------|    |----|---------------|---------|
+	 *
+	 *   old_addr   old_addr+old_len         old_addr   old_addr+old_len
+	 * |----.---------------.---------|    |----|               |---------|
+	 * |    .               .         | -> | +1 |      -1       |   +1    |
+	 * |----.---------------.---------|    |----|               |---------|
+	 *
+	 * Therefore we must check to ensure we have headroom of 2 additional
+	 * VMAs.
+	 */
+	return map_count + 2 <= sys_map_count;
+}
+
+/* Do we have headroom under the map count limit if we split VMAs when moving the VMA? */
+static bool check_map_count_against_split(void)
+{
+	return __check_map_count_against_split(current->mm,
+					       /*before_unmaps=*/false);
+}
+
+/* Do we have headroom under the map count limit if we split VMAs prior to early unmaps? */
+static bool check_map_count_against_split_early(void)
+{
+	return __check_map_count_against_split(current->mm,
+					       /*before_unmaps=*/true);
+}
+
 /*
  * Perform checks before attempting to write a VMA prior to it being
  * moved.
@@ -1045,7 +1114,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
 	 * which may not merge, then (if MREMAP_DONTUNMAP is not set) unmap the
 	 * source, which may split, causing a net increase of 2 mappings.
 	 */
-	if (current->mm->map_count + 2 > get_sysctl_max_map_count())
+	if (!check_map_count_against_split())
 		return -ENOMEM;
 
 	if (vma->vm_ops && vma->vm_ops->may_split) {
@@ -1804,18 +1873,6 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
 	if (vrm_overlaps(vrm))
 		return -EINVAL;
 
-	/*
-	 * We may unmap twice before invoking move_vma(), that is if new_len <
-	 * old_len (shrinking), and in the MREMAP_FIXED case, unmapping part of
-	 * a VMA located at the destination.
-	 *
-	 * In the worst case, both unmappings will cause splits, resulting in a
-	 * net increased map count of 2. In move_vma() we check for headroom of
-	 * 2 additional mappings, so check early to avoid bailing out then.
-	 */
-	if (current->mm->map_count + 4 > get_sysctl_max_map_count())
-		return -ENOMEM;
-
 	return 0;
 }
 
@@ -1925,6 +1982,11 @@ static unsigned long do_mremap(struct vma_remap_struct *vrm)
 		return -EINTR;
 	vrm->mmap_locked = true;
 
+	if (!check_map_count_against_split_early()) {
+		mmap_write_unlock(mm);
+		return -ENOMEM;
+	}
+
 	if (vrm_move_only(vrm)) {
 		res = remap_move(vrm);
 	} else {
-- 
2.53.0
Re: [PATCH 3/3] mm/mremap: check map count under mmap write lock and abstract
Posted by Pedro Falcato 1 week, 4 days ago
On Wed, Mar 11, 2026 at 05:24:38PM +0000, Lorenzo Stoakes (Oracle) wrote:
> We are checking the mmap count in check_mremap_params(), prior to obtaining
> an mmap write lock, which means that accesses to current->mm->map_count
> might race with this field being updated.
> 
> Resolve this by only checking this field after the mmap write lock is held.
> 
> Additionally, abstract this check into a helper function with extensive
> ASCII documentation of what's going on.
> 
> Reported-by: Jianzhou Zhao <luckd0g@163.com>
> Closes: https://lore.kernel.org/all/1a7d4c26.6b46.19cdbe7eaf0.Coremail.luckd0g@163.com/
> Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>

Reviewed-by: Pedro Falcato <pfalcato@suse.de>

Shouldn't this have a Fixes: and go to stable?

> ---
>  mm/mremap.c | 88 +++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 75 insertions(+), 13 deletions(-)
> 
> diff --git a/mm/mremap.c b/mm/mremap.c
> index ba6c690f6c1b..ee46bbb031e6 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -1028,6 +1028,75 @@ static void vrm_stat_account(struct vma_remap_struct *vrm,
>  		mm->locked_vm += pages;
>  }
>  
> +static bool __check_map_count_against_split(struct mm_struct *mm,
> +					    bool before_unmaps)
> +{
> +	const int sys_map_count = get_sysctl_max_map_count();
> +	int map_count = mm->map_count;
> +
> +	mmap_assert_write_locked(mm);
> +
> +	/*
> +	 * At the point of shrinking the VMA, if new_len < old_len, we unmap
> +	 * thusly in the worst case:
> +	 *
> +	 *              old_addr+old_len                    old_addr+old_len
> +	 * |---------------.----.---------|    |---------------|    |---------|
> +	 * |               .    .         | -> |      +1       | -1 |   +1    |
> +	 * |---------------.----.---------|    |---------------|    |---------|
> +	 *        old_addr+new_len                     old_addr+new_len
> +	 *
> +	 * At the point of removing the portion of an existing VMA to make space
> +	 * for the moved VMA if MREMAP_FIXED, we unmap thusly in the worst case:
> +	 *
> +	 *   new_addr   new_addr+new_len         new_addr   new_addr+new_len
> +	 * |----.---------------.---------|    |----|               |---------|
> +	 * |    .               .         | -> | +1 |      -1       |   +1    |
> +	 * |----.---------------.---------|    |----|               |---------|
> +	 *
> +	 * Therefore, before we consider the move anything, we have to account
> +	 * for 2 additional VMAs possibly being created upon these unmappings.
> +	 */
> +	if (before_unmaps)
> +		map_count += 2;

oooh, shiny shiny diagrams.

-- 
Pedro
Re: [PATCH 3/3] mm/mremap: check map count under mmap write lock and abstract
Posted by Lorenzo Stoakes (Oracle) 1 week, 4 days ago
On Fri, Mar 27, 2026 at 09:22:31AM +0000, Pedro Falcato wrote:
> On Wed, Mar 11, 2026 at 05:24:38PM +0000, Lorenzo Stoakes (Oracle) wrote:
> > We are checking the mmap count in check_mremap_params(), prior to obtaining
> > an mmap write lock, which means that accesses to current->mm->map_count
> > might race with this field being updated.
> >
> > Resolve this by only checking this field after the mmap write lock is held.
> >
> > Additionally, abstract this check into a helper function with extensive
> > ASCII documentation of what's going on.
> >
> > Reported-by: Jianzhou Zhao <luckd0g@163.com>
> > Closes: https://lore.kernel.org/all/1a7d4c26.6b46.19cdbe7eaf0.Coremail.luckd0g@163.com/
> > Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
>
> Reviewed-by: Pedro Falcato <pfalcato@suse.de>

Thanks!

>
> Shouldn't this have a Fixes: and go to stable?

Nah this is really hard to hit and doing the check is just making it so mremap
can bail out early, so it's not worth it.

>
> > ---
> >  mm/mremap.c | 88 +++++++++++++++++++++++++++++++++++++++++++++--------
> >  1 file changed, 75 insertions(+), 13 deletions(-)
> >
> > diff --git a/mm/mremap.c b/mm/mremap.c
> > index ba6c690f6c1b..ee46bbb031e6 100644
> > --- a/mm/mremap.c
> > +++ b/mm/mremap.c
> > @@ -1028,6 +1028,75 @@ static void vrm_stat_account(struct vma_remap_struct *vrm,
> >  		mm->locked_vm += pages;
> >  }
> >
> > +static bool __check_map_count_against_split(struct mm_struct *mm,
> > +					    bool before_unmaps)
> > +{
> > +	const int sys_map_count = get_sysctl_max_map_count();
> > +	int map_count = mm->map_count;
> > +
> > +	mmap_assert_write_locked(mm);
> > +
> > +	/*
> > +	 * At the point of shrinking the VMA, if new_len < old_len, we unmap
> > +	 * thusly in the worst case:
> > +	 *
> > +	 *              old_addr+old_len                    old_addr+old_len
> > +	 * |---------------.----.---------|    |---------------|    |---------|
> > +	 * |               .    .         | -> |      +1       | -1 |   +1    |
> > +	 * |---------------.----.---------|    |---------------|    |---------|
> > +	 *        old_addr+new_len                     old_addr+new_len
> > +	 *
> > +	 * At the point of removing the portion of an existing VMA to make space
> > +	 * for the moved VMA if MREMAP_FIXED, we unmap thusly in the worst case:
> > +	 *
> > +	 *   new_addr   new_addr+new_len         new_addr   new_addr+new_len
> > +	 * |----.---------------.---------|    |----|               |---------|
> > +	 * |    .               .         | -> | +1 |      -1       |   +1    |
> > +	 * |----.---------------.---------|    |----|               |---------|
> > +	 *
> > +	 * Therefore, before we consider the move anything, we have to account
> > +	 * for 2 additional VMAs possibly being created upon these unmappings.
> > +	 */
> > +	if (before_unmaps)
> > +		map_count += 2;
>
> oooh, shiny shiny diagrams.

:)

>
> --
> Pedro

Cheers, Lorenzo