include/linux/mm.h | 11 ++++++++++- mm/mmap.c | 15 ++++++++++++++- mm/mremap.c | 7 ++++--- mm/nommu.c | 2 +- mm/util.c | 1 - mm/vma.c | 6 +++--- 6 files changed, 32 insertions(+), 10 deletions(-)
The check against the max map count (sysctl_max_map_count) was
open-coded in several places. This led to inconsistent enforcement
and subtle bugs where the limit could be exceeded.
For example, some paths would check map_count > sysctl_max_map_count
before allocating a new VMA and incrementing the count, allowing the
process to reach sysctl_max_map_count + 1:
int do_brk_flags(...)
{
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
/* We can get here with mm->map_count == sysctl_max_map_count */
vma = vm_area_alloc(mm);
...
mm->map_count++ /* We've now exceeded the threshold. */
}
To fix this and unify the logic, introduce a new function,
exceeds_max_map_count(), to consolidate the check. All open-coded
checks are replaced with calls to this new function, ensuring the
limit is applied uniformly and correctly.
To improve encapsulation, sysctl_max_map_count is now static to
mm/mmap.c. The new helper also adds a rate-limited warning to make
debugging applications that exhaust their VMA limit easier.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
---
include/linux/mm.h | 11 ++++++++++-
mm/mmap.c | 15 ++++++++++++++-
mm/mremap.c | 7 ++++---
mm/nommu.c | 2 +-
mm/util.c | 1 -
mm/vma.c | 6 +++---
6 files changed, 32 insertions(+), 10 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ae97a0b8ec7..d4e64e6a9814 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -192,7 +192,16 @@ static inline void __mm_zero_struct_page(struct page *page)
#define MAPCOUNT_ELF_CORE_MARGIN (5)
#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
-extern int sysctl_max_map_count;
+/**
+ * exceeds_max_map_count - check if a VMA operation would exceed max_map_count
+ * @mm: The memory descriptor for the process.
+ * @new_vmas: The number of new VMAs the operation will create.
+ *
+ * Returns true if the operation would cause the number of VMAs to exceed
+ * the sysctl_max_map_count limit, false otherwise. A rate-limited warning
+ * is logged if the limit is exceeded.
+ */
+extern bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas);
extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;
diff --git a/mm/mmap.c b/mm/mmap.c
index 7306253cc3b5..693a0105e6a5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -374,7 +374,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
return -EOVERFLOW;
/* Too many mappings? */
- if (mm->map_count > sysctl_max_map_count)
+ if (exceeds_max_map_count(mm, 0))
return -ENOMEM;
/*
@@ -1504,6 +1504,19 @@ struct vm_area_struct *_install_special_mapping(
int sysctl_legacy_va_layout;
#endif
+static int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+
+bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas)
+{
+ if (unlikely(mm->map_count + new_vmas > sysctl_max_map_count)) {
+ pr_warn_ratelimited("%s (%d): Map count limit %u exceeded\n",
+ current->comm, current->pid,
+ sysctl_max_map_count);
+ return true;
+ }
+ return false;
+}
+
static const struct ctl_table mmap_table[] = {
{
.procname = "max_map_count",
diff --git a/mm/mremap.c b/mm/mremap.c
index e618a706aff5..793fad58302c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
* We'd prefer to avoid failure later on in do_munmap:
* which may split one vma into three before unmapping.
*/
- if (current->mm->map_count >= sysctl_max_map_count - 3)
+ if (exceeds_max_map_count(current->mm, 4))
return -ENOMEM;
if (vma->vm_ops && vma->vm_ops->may_split) {
@@ -1811,9 +1811,10 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
* split in 3 before unmapping it.
* That means 2 more maps (1 for each) to the ones we already hold.
* Check whether current map count plus 2 still leads us to 4 maps below
- * the threshold, otherwise return -ENOMEM here to be more safe.
+ * the threshold. In other words, is the current map count + 6 at or
+ * below the threshold? Otherwise return -ENOMEM here to be more safe.
*/
- if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3)
+ if (exceeds_max_map_count(current->mm, 6))
return -ENOMEM;
return 0;
diff --git a/mm/nommu.c b/mm/nommu.c
index 8b819fafd57b..0533e1e3b266 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1316,7 +1316,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
return -ENOMEM;
mm = vma->vm_mm;
- if (mm->map_count >= sysctl_max_map_count)
+ if (exceeds_max_map_count(mm, 1))
return -ENOMEM;
region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
diff --git a/mm/util.c b/mm/util.c
index f814e6a59ab1..b6e83922cafe 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -751,7 +751,6 @@ EXPORT_SYMBOL(folio_mc_copy);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
static int sysctl_overcommit_ratio __read_mostly = 50;
static unsigned long sysctl_overcommit_kbytes __read_mostly;
-int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
diff --git a/mm/vma.c b/mm/vma.c
index 3b12c7579831..f804c8ac8fbb 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -592,7 +592,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
- if (vma->vm_mm->map_count >= sysctl_max_map_count)
+ if (exceeds_max_map_count(vma->vm_mm, 1))
return -ENOMEM;
return __split_vma(vmi, vma, addr, new_below);
@@ -1345,7 +1345,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
* its limit temporarily, to help free resources as expected.
*/
if (vms->end < vms->vma->vm_end &&
- vms->vma->vm_mm->map_count >= sysctl_max_map_count) {
+ exceeds_max_map_count(vms->vma->vm_mm, 1)) {
error = -ENOMEM;
goto map_count_exceeded;
}
@@ -2772,7 +2772,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT))
return -ENOMEM;
- if (mm->map_count > sysctl_max_map_count)
+ if (exceeds_max_map_count(mm, 1))
return -ENOMEM;
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
--
2.51.0.338.gd7d06c2dae-goog
On Wed, Sep 03, 2025 at 04:24:35PM -0700, Kalesh Singh wrote: > The check against the max map count (sysctl_max_map_count) was > open-coded in several places. This led to inconsistent enforcement > and subtle bugs where the limit could be exceeded. > > For example, some paths would check map_count > sysctl_max_map_count > before allocating a new VMA and incrementing the count, allowing the > process to reach sysctl_max_map_count + 1: > > int do_brk_flags(...) > { > if (mm->map_count > sysctl_max_map_count) > return -ENOMEM; > > /* We can get here with mm->map_count == sysctl_max_map_count */ > > vma = vm_area_alloc(mm); > ... > mm->map_count++ /* We've now exceeded the threshold. */ > } > > To fix this and unify the logic, introduce a new function, > exceeds_max_map_count(), to consolidate the check. All open-coded > checks are replaced with calls to this new function, ensuring the > limit is applied uniformly and correctly. > > To improve encapsulation, sysctl_max_map_count is now static to > mm/mmap.c. The new helper also adds a rate-limited warning to make > debugging applications that exhaust their VMA limit easier. > Yeah this is nice thanks! > Cc: Andrew Morton <akpm@linux-foundation.org> > Cc: Minchan Kim <minchan@kernel.org> > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > Signed-off-by: Kalesh Singh <kaleshsingh@google.com> > --- > include/linux/mm.h | 11 ++++++++++- > mm/mmap.c | 15 ++++++++++++++- > mm/mremap.c | 7 ++++--- > mm/nommu.c | 2 +- > mm/util.c | 1 - > mm/vma.c | 6 +++--- > 6 files changed, 32 insertions(+), 10 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 1ae97a0b8ec7..d4e64e6a9814 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -192,7 +192,16 @@ static inline void __mm_zero_struct_page(struct page *page) > #define MAPCOUNT_ELF_CORE_MARGIN (5) > #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) > > -extern int sysctl_max_map_count; > +/** > + * exceeds_max_map_count - check if a VMA operation would exceed max_map_count > + * @mm: The memory descriptor for the process. > + * @new_vmas: The number of new VMAs the operation will create. > + * > + * Returns true if the operation would cause the number of VMAs to exceed > + * the sysctl_max_map_count limit, false otherwise. A rate-limited warning > + * is logged if the limit is exceeded. > + */ > +extern bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas); No new externs please (as Pedro also pointed out! :) > > extern unsigned long sysctl_user_reserve_kbytes; > extern unsigned long sysctl_admin_reserve_kbytes; > diff --git a/mm/mmap.c b/mm/mmap.c > index 7306253cc3b5..693a0105e6a5 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -374,7 +374,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, > return -EOVERFLOW; > > /* Too many mappings? */ > - if (mm->map_count > sysctl_max_map_count) > + if (exceeds_max_map_count(mm, 0)) Yeah this last 'mystery meat' parameter isn't amazing though to be honest, it's kinda hard to read/understand. E.g.: int map_count_capacity(const struct *mm) { const int map_count = mm->map_count; const int max_count = sysctl_max_map_count; return max_count > map_count ? max_count - map_count : 0; } Then this would become; if (!map_count_capacity(mm)) { // blah. } > return -ENOMEM; > > /* > @@ -1504,6 +1504,19 @@ struct vm_area_struct *_install_special_mapping( > int sysctl_legacy_va_layout; > #endif > > +static int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; > + > +bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas) > +{ > + if (unlikely(mm->map_count + new_vmas > sysctl_max_map_count)) { > + pr_warn_ratelimited("%s (%d): Map count limit %u exceeded\n", > + current->comm, current->pid, > + sysctl_max_map_count); Yeah not a fan of this, we aren't warning elsewhere, it's actually perfectly normal for somebody to exceed this, this isn't at the level of a warning. > + return true; > + } > + return false; > +} > + > static const struct ctl_table mmap_table[] = { > { > .procname = "max_map_count", > diff --git a/mm/mremap.c b/mm/mremap.c > index e618a706aff5..793fad58302c 100644 > --- a/mm/mremap.c > +++ b/mm/mremap.c > @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) > * We'd prefer to avoid failure later on in do_munmap: > * which may split one vma into three before unmapping. > */ > - if (current->mm->map_count >= sysctl_max_map_count - 3) > + if (exceeds_max_map_count(current->mm, 4)) > return -ENOMEM; In my version this would be: if (map_count_capacity(current->mm) < 4) return -ENOMEM; And etc. etc. I won't do them all :) I do think this is clearer... > > if (vma->vm_ops && vma->vm_ops->may_split) { > @@ -1811,9 +1811,10 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) > * split in 3 before unmapping it. > * That means 2 more maps (1 for each) to the ones we already hold. > * Check whether current map count plus 2 still leads us to 4 maps below > - * the threshold, otherwise return -ENOMEM here to be more safe. > + * the threshold. In other words, is the current map count + 6 at or > + * below the threshold? Otherwise return -ENOMEM here to be more safe. > */ > - if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) > + if (exceeds_max_map_count(current->mm, 6)) > return -ENOMEM; > > return 0; > diff --git a/mm/nommu.c b/mm/nommu.c > index 8b819fafd57b..0533e1e3b266 100644 > --- a/mm/nommu.c > +++ b/mm/nommu.c > @@ -1316,7 +1316,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, > return -ENOMEM; > > mm = vma->vm_mm; > - if (mm->map_count >= sysctl_max_map_count) > + if (exceeds_max_map_count(mm, 1)) > return -ENOMEM; > > region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); > diff --git a/mm/util.c b/mm/util.c > index f814e6a59ab1..b6e83922cafe 100644 > --- a/mm/util.c > +++ b/mm/util.c > @@ -751,7 +751,6 @@ EXPORT_SYMBOL(folio_mc_copy); > int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; > static int sysctl_overcommit_ratio __read_mostly = 50; > static unsigned long sysctl_overcommit_kbytes __read_mostly; > -int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; > unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ > unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ > > diff --git a/mm/vma.c b/mm/vma.c > index 3b12c7579831..f804c8ac8fbb 100644 > --- a/mm/vma.c > +++ b/mm/vma.c > @@ -592,7 +592,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, > static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, > unsigned long addr, int new_below) > { > - if (vma->vm_mm->map_count >= sysctl_max_map_count) > + if (exceeds_max_map_count(vma->vm_mm, 1)) > return -ENOMEM; > > return __split_vma(vmi, vma, addr, new_below); > @@ -1345,7 +1345,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, > * its limit temporarily, to help free resources as expected. > */ > if (vms->end < vms->vma->vm_end && > - vms->vma->vm_mm->map_count >= sysctl_max_map_count) { > + exceeds_max_map_count(vms->vma->vm_mm, 1)) { > error = -ENOMEM; > goto map_count_exceeded; > } > @@ -2772,7 +2772,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) > return -ENOMEM; > > - if (mm->map_count > sysctl_max_map_count) > + if (exceeds_max_map_count(mm, 1)) > return -ENOMEM; > > if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) > -- > 2.51.0.338.gd7d06c2dae-goog >
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250904 12:02]: > On Wed, Sep 03, 2025 at 04:24:35PM -0700, Kalesh Singh wrote: > > The check against the max map count (sysctl_max_map_count) was > > open-coded in several places. This led to inconsistent enforcement > > and subtle bugs where the limit could be exceeded. > > > > For example, some paths would check map_count > sysctl_max_map_count > > before allocating a new VMA and incrementing the count, allowing the > > process to reach sysctl_max_map_count + 1: > > > > int do_brk_flags(...) > > { > > if (mm->map_count > sysctl_max_map_count) > > return -ENOMEM; > > > > /* We can get here with mm->map_count == sysctl_max_map_count */ > > > > vma = vm_area_alloc(mm); > > ... > > mm->map_count++ /* We've now exceeded the threshold. */ > > } > > > > To fix this and unify the logic, introduce a new function, > > exceeds_max_map_count(), to consolidate the check. All open-coded > > checks are replaced with calls to this new function, ensuring the > > limit is applied uniformly and correctly. > > > > To improve encapsulation, sysctl_max_map_count is now static to > > mm/mmap.c. The new helper also adds a rate-limited warning to make > > debugging applications that exhaust their VMA limit easier. > > > > Yeah this is nice thanks! > > > Cc: Andrew Morton <akpm@linux-foundation.org> > > Cc: Minchan Kim <minchan@kernel.org> > > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > Signed-off-by: Kalesh Singh <kaleshsingh@google.com> > > --- > > include/linux/mm.h | 11 ++++++++++- > > mm/mmap.c | 15 ++++++++++++++- > > mm/mremap.c | 7 ++++--- > > mm/nommu.c | 2 +- > > mm/util.c | 1 - > > mm/vma.c | 6 +++--- > > 6 files changed, 32 insertions(+), 10 deletions(-) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index 1ae97a0b8ec7..d4e64e6a9814 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -192,7 +192,16 @@ static inline void __mm_zero_struct_page(struct page *page) > > #define MAPCOUNT_ELF_CORE_MARGIN (5) > > #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) > > > > -extern int sysctl_max_map_count; > > +/** > > + * exceeds_max_map_count - check if a VMA operation would exceed max_map_count > > + * @mm: The memory descriptor for the process. > > + * @new_vmas: The number of new VMAs the operation will create. > > + * > > + * Returns true if the operation would cause the number of VMAs to exceed > > + * the sysctl_max_map_count limit, false otherwise. A rate-limited warning > > + * is logged if the limit is exceeded. > > + */ > > +extern bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas); > > No new externs please (as Pedro also pointed out! :) > > > > > extern unsigned long sysctl_user_reserve_kbytes; > > extern unsigned long sysctl_admin_reserve_kbytes; > > diff --git a/mm/mmap.c b/mm/mmap.c > > index 7306253cc3b5..693a0105e6a5 100644 > > --- a/mm/mmap.c > > +++ b/mm/mmap.c > > @@ -374,7 +374,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, > > return -EOVERFLOW; > > > > /* Too many mappings? */ > > - if (mm->map_count > sysctl_max_map_count) > > + if (exceeds_max_map_count(mm, 0)) > > Yeah this last 'mystery meat' parameter isn't amazing though to be honest, it's > kinda hard to read/understand. > > E.g.: > > int map_count_capacity(const struct *mm) > { > const int map_count = mm->map_count; > const int max_count = sysctl_max_map_count; > > return max_count > map_count ? max_count - map_count : 0; > } > > Then this would become; > > if (!map_count_capacity(mm)) { > // blah. > } > > > > return -ENOMEM; > > > > /* > > @@ -1504,6 +1504,19 @@ struct vm_area_struct *_install_special_mapping( > > int sysctl_legacy_va_layout; > > #endif > > > > +static int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; > > + > > +bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas) > > +{ > > + if (unlikely(mm->map_count + new_vmas > sysctl_max_map_count)) { > > + pr_warn_ratelimited("%s (%d): Map count limit %u exceeded\n", > > + current->comm, current->pid, > > + sysctl_max_map_count); > > Yeah not a fan of this, we aren't warning elsewhere, it's actually perfectly > normal for somebody to exceed this, this isn't at the level of a warning. > > > + return true; > > + } > > + return false; > > +} > > + > > static const struct ctl_table mmap_table[] = { > > { > > .procname = "max_map_count", > > diff --git a/mm/mremap.c b/mm/mremap.c > > index e618a706aff5..793fad58302c 100644 > > --- a/mm/mremap.c > > +++ b/mm/mremap.c > > @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) > > * We'd prefer to avoid failure later on in do_munmap: > > * which may split one vma into three before unmapping. > > */ > > - if (current->mm->map_count >= sysctl_max_map_count - 3) > > + if (exceeds_max_map_count(current->mm, 4)) > > return -ENOMEM; > > In my version this would be: > > if (map_count_capacity(current->mm) < 4) > return -ENOMEM; > Someone could write map_count_capacity(current->mm) <= 4 and reintroduce what this is trying to solve. And with the way it is written in this patch, someone could pass in the wrong number. I'm not sure this is worth doing. There are places we allow the count to go higher. Certainly fix the brk < to be <= and any other calculations, but the rest seem okay as-is to me. The only real way to be sure we don't cause a bug in the future is to have better testing. Thanks, Liam
On Thu, Sep 04, 2025 at 01:22:51PM -0400, Liam R. Howlett wrote: > > > diff --git a/mm/mremap.c b/mm/mremap.c > > > index e618a706aff5..793fad58302c 100644 > > > --- a/mm/mremap.c > > > +++ b/mm/mremap.c > > > @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) > > > * We'd prefer to avoid failure later on in do_munmap: > > > * which may split one vma into three before unmapping. > > > */ > > > - if (current->mm->map_count >= sysctl_max_map_count - 3) > > > + if (exceeds_max_map_count(current->mm, 4)) > > > return -ENOMEM; > > > > In my version this would be: > > > > if (map_count_capacity(current->mm) < 4) > > return -ENOMEM; > > > > Someone could write map_count_capacity(current->mm) <= 4 and reintroduce > what this is trying to solve. And with the way it is written in this > patch, someone could pass in the wrong number. Right, but I think 'capacity' is pretty clear here, if the caller does something silly then that's on them... > > I'm not sure this is worth doing. There are places we allow the count > to go higher. ...But yeah, it's kinda borderline as to how useful this is. I _do_ however like the 'put map count in one place statically' rather than having a global, so a minimal version of this could be to just have a helper function that gets the sysctl_max_map_count, e.g.: if (current->mm->mmap_count >= max_map_count() - 3) etc. etc. > > Certainly fix the brk < to be <= and any other calculations, but the > rest seem okay as-is to me. The only real way to be sure we don't cause > a bug in the future is to have better testing. Speaking of testing - Kalesh - do make sure to test the VMA tests to make sure this doesn't break those - they live in tools/testing/vma and you just have to do make && ./vma Cheers! > > Thanks, > Liam
On Thu, Sep 4, 2025 at 10:33 AM Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > > On Thu, Sep 04, 2025 at 01:22:51PM -0400, Liam R. Howlett wrote: > > > > diff --git a/mm/mremap.c b/mm/mremap.c > > > > index e618a706aff5..793fad58302c 100644 > > > > --- a/mm/mremap.c > > > > +++ b/mm/mremap.c > > > > @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) > > > > * We'd prefer to avoid failure later on in do_munmap: > > > > * which may split one vma into three before unmapping. > > > > */ > > > > - if (current->mm->map_count >= sysctl_max_map_count - 3) > > > > + if (exceeds_max_map_count(current->mm, 4)) > > > > return -ENOMEM; > > > > > > In my version this would be: > > > > > > if (map_count_capacity(current->mm) < 4) > > > return -ENOMEM; > > > > > > > Someone could write map_count_capacity(current->mm) <= 4 and reintroduce > > what this is trying to solve. And with the way it is written in this > > patch, someone could pass in the wrong number. Hi Liam, I still think there is value to this as it's lot less likely to get the common case incorrectly: if (!map_count_capacity(mm)) return -ENOMEM; It also facilitate us adding the asserts as Pedro suggested (excluding the munmap() case. > > Right, but I think 'capacity' is pretty clear here, if the caller does something > silly then that's on them... > > > > > I'm not sure this is worth doing. There are places we allow the count > > to go higher. > > ...But yeah, it's kinda borderline as to how useful this is. > > I _do_ however like the 'put map count in one place statically' rather than > having a global, so a minimal version of this could be to just have a helper > function that gets the sysctl_max_map_count, e.g.: > > if (current->mm->mmap_count >= max_map_count() - 3) > > etc. etc. > > > > > Certainly fix the brk < to be <= and any other calculations, but the > > rest seem okay as-is to me. The only real way to be sure we don't cause > > a bug in the future is to have better testing. > > Speaking of testing - Kalesh - do make sure to test the VMA tests to make sure > this doesn't break those - they live in tools/testing/vma and you just have to > do make && ./vma Thanks Lorenzo, will do. -- Kalesh > > Cheers! > > > > > Thanks, > > Liam
* Kalesh Singh <kaleshsingh@google.com> [250904 13:44]: > On Thu, Sep 4, 2025 at 10:33 AM Lorenzo Stoakes > <lorenzo.stoakes@oracle.com> wrote: > > > > On Thu, Sep 04, 2025 at 01:22:51PM -0400, Liam R. Howlett wrote: > > > > > diff --git a/mm/mremap.c b/mm/mremap.c > > > > > index e618a706aff5..793fad58302c 100644 > > > > > --- a/mm/mremap.c > > > > > +++ b/mm/mremap.c > > > > > @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) > > > > > * We'd prefer to avoid failure later on in do_munmap: > > > > > * which may split one vma into three before unmapping. > > > > > */ > > > > > - if (current->mm->map_count >= sysctl_max_map_count - 3) > > > > > + if (exceeds_max_map_count(current->mm, 4)) > > > > > return -ENOMEM; > > > > > > > > In my version this would be: > > > > > > > > if (map_count_capacity(current->mm) < 4) > > > > return -ENOMEM; > > > > > > > > > > Someone could write map_count_capacity(current->mm) <= 4 and reintroduce > > > what this is trying to solve. And with the way it is written in this > > > patch, someone could pass in the wrong number. > > Hi Liam, > > I still think there is value to this as it's lot less likely to get > the common case incorrectly: > > if (!map_count_capacity(mm)) > return -ENOMEM; > > It also facilitate us adding the asserts as Pedro suggested (excluding > the munmap() case. And munmap() callers case, I guess. There is still the possibility of us failing to unmap after a split and having shot ourselves, so just don't assert that in the case of the failure and recovery path (ie exit_mmap()). > > > > > Right, but I think 'capacity' is pretty clear here, if the caller does something > > silly then that's on them... Turns out it's on us, not them :) > > > > > > > > I'm not sure this is worth doing. There are places we allow the count > > > to go higher. > > > > ...But yeah, it's kinda borderline as to how useful this is. > > > > I _do_ however like the 'put map count in one place statically' rather than > > having a global, so a minimal version of this could be to just have a helper > > function that gets the sysctl_max_map_count, e.g.: > > > > if (current->mm->mmap_count >= max_map_count() - 3) > > > > etc. etc. > > > > > > > > Certainly fix the brk < to be <= and any other calculations, but the > > > rest seem okay as-is to me. The only real way to be sure we don't cause > > > a bug in the future is to have better testing. > > > > Speaking of testing - Kalesh - do make sure to test the VMA tests to make sure > > this doesn't break those - they live in tools/testing/vma and you just have to > > do make && ./vma Bonus points if you can add some tests for it! Thanks, Liam
On 04.09.25 19:33, Lorenzo Stoakes wrote: > On Thu, Sep 04, 2025 at 01:22:51PM -0400, Liam R. Howlett wrote: >>>> diff --git a/mm/mremap.c b/mm/mremap.c >>>> index e618a706aff5..793fad58302c 100644 >>>> --- a/mm/mremap.c >>>> +++ b/mm/mremap.c >>>> @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) >>>> * We'd prefer to avoid failure later on in do_munmap: >>>> * which may split one vma into three before unmapping. >>>> */ >>>> - if (current->mm->map_count >= sysctl_max_map_count - 3) >>>> + if (exceeds_max_map_count(current->mm, 4)) >>>> return -ENOMEM; >>> >>> In my version this would be: >>> >>> if (map_count_capacity(current->mm) < 4) >>> return -ENOMEM; >>> >> >> Someone could write map_count_capacity(current->mm) <= 4 and reintroduce >> what this is trying to solve. And with the way it is written in this >> patch, someone could pass in the wrong number. > > Right, but I think 'capacity' is pretty clear here, if the caller does something > silly then that's on them... > >> >> I'm not sure this is worth doing. There are places we allow the count >> to go higher. > > ...But yeah, it's kinda borderline as to how useful this is. > > I _do_ however like the 'put map count in one place statically' rather than > having a global, so a minimal version of this could be to just have a helper > function that gets the sysctl_max_map_count, e.g.: > > if (current->mm->mmap_count >= max_map_count() - 3) I enjoy seeing sysctl_max_map_count hidden. But map_count_capacity() is even more readable, so I like it. I don't complete like the "capacity" term, but I cannot think of something better right now. Maybe something around "free" or "remaining", not sure. I also don't completely like "map_count" (I know, I know, we call it like that in structures), because it reminds me of the mapcount ... talking somehow about "vmas" would be quite clear. Anyhow, just as an inspiration my 2 cents ... -- Cheers David / dhildenb
On Thu, Sep 4, 2025 at 10:42 AM David Hildenbrand <david@redhat.com> wrote: > > On 04.09.25 19:33, Lorenzo Stoakes wrote: > > On Thu, Sep 04, 2025 at 01:22:51PM -0400, Liam R. Howlett wrote: > >>>> diff --git a/mm/mremap.c b/mm/mremap.c > >>>> index e618a706aff5..793fad58302c 100644 > >>>> --- a/mm/mremap.c > >>>> +++ b/mm/mremap.c > >>>> @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) > >>>> * We'd prefer to avoid failure later on in do_munmap: > >>>> * which may split one vma into three before unmapping. > >>>> */ > >>>> - if (current->mm->map_count >= sysctl_max_map_count - 3) > >>>> + if (exceeds_max_map_count(current->mm, 4)) > >>>> return -ENOMEM; > >>> > >>> In my version this would be: > >>> > >>> if (map_count_capacity(current->mm) < 4) > >>> return -ENOMEM; > >>> > >> > >> Someone could write map_count_capacity(current->mm) <= 4 and reintroduce > >> what this is trying to solve. And with the way it is written in this > >> patch, someone could pass in the wrong number. > > > > Right, but I think 'capacity' is pretty clear here, if the caller does something > > silly then that's on them... > > > >> > >> I'm not sure this is worth doing. There are places we allow the count > >> to go higher. > > > > ...But yeah, it's kinda borderline as to how useful this is. > > > > I _do_ however like the 'put map count in one place statically' rather than > > having a global, so a minimal version of this could be to just have a helper > > function that gets the sysctl_max_map_count, e.g.: > > > > if (current->mm->mmap_count >= max_map_count() - 3) > > I enjoy seeing sysctl_max_map_count hidden. But map_count_capacity() is > even more readable, so I like it. > > I don't complete like the "capacity" term, but I cannot think of > something better right now. Maybe something around "free" or > "remaining", not sure. > > I also don't completely like "map_count" (I know, I know, we call it > like that in structures), because it reminds me of the mapcount ... > talking somehow about "vmas" would be quite clear. Thanks David, my original implementation started with vma_limit() :). Maybe something like vma_count_remaining() ? -- Kalesh > > Anyhow, just as an inspiration my 2 cents ... > > -- > Cheers > > David / dhildenb >
* Kalesh Singh <kaleshsingh@google.com> [250904 13:51]: > On Thu, Sep 4, 2025 at 10:42 AM David Hildenbrand <david@redhat.com> wrote: > > > > On 04.09.25 19:33, Lorenzo Stoakes wrote: > > > On Thu, Sep 04, 2025 at 01:22:51PM -0400, Liam R. Howlett wrote: > > >>>> diff --git a/mm/mremap.c b/mm/mremap.c > > >>>> index e618a706aff5..793fad58302c 100644 > > >>>> --- a/mm/mremap.c > > >>>> +++ b/mm/mremap.c > > >>>> @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) > > >>>> * We'd prefer to avoid failure later on in do_munmap: > > >>>> * which may split one vma into three before unmapping. > > >>>> */ > > >>>> - if (current->mm->map_count >= sysctl_max_map_count - 3) > > >>>> + if (exceeds_max_map_count(current->mm, 4)) > > >>>> return -ENOMEM; > > >>> > > >>> In my version this would be: > > >>> > > >>> if (map_count_capacity(current->mm) < 4) > > >>> return -ENOMEM; > > >>> > > >> > > >> Someone could write map_count_capacity(current->mm) <= 4 and reintroduce > > >> what this is trying to solve. And with the way it is written in this > > >> patch, someone could pass in the wrong number. > > > > > > Right, but I think 'capacity' is pretty clear here, if the caller does something > > > silly then that's on them... > > > > > >> > > >> I'm not sure this is worth doing. There are places we allow the count > > >> to go higher. > > > > > > ...But yeah, it's kinda borderline as to how useful this is. > > > > > > I _do_ however like the 'put map count in one place statically' rather than > > > having a global, so a minimal version of this could be to just have a helper > > > function that gets the sysctl_max_map_count, e.g.: > > > > > > if (current->mm->mmap_count >= max_map_count() - 3) > > > > I enjoy seeing sysctl_max_map_count hidden. But map_count_capacity() is > > even more readable, so I like it. > > > > I don't complete like the "capacity" term, but I cannot think of > > something better right now. Maybe something around "free" or > > "remaining", not sure. > > > > I also don't completely like "map_count" (I know, I know, we call it > > like that in structures), because it reminds me of the mapcount ... > > talking somehow about "vmas" would be quite clear. > > Thanks David, my original implementation started with vma_limit() :). > Maybe something like vma_count_remaining() ? Yes, reducing this confusion would very much be helpful. In fact, if you put it in its own function we could change the actual name with lower impact. map_count vs mapcount is annoying. vma_headroom() ? additional_vma_space() ? Maybe David would like: remedy_vma_space() which would be !poison_vma_space().. that's pretty clear.. :) Cheers, Liam
On 04.09.25 20:49, Liam R. Howlett wrote: > * Kalesh Singh <kaleshsingh@google.com> [250904 13:51]: >> On Thu, Sep 4, 2025 at 10:42 AM David Hildenbrand <david@redhat.com> wrote: >>> >>> On 04.09.25 19:33, Lorenzo Stoakes wrote: >>>> On Thu, Sep 04, 2025 at 01:22:51PM -0400, Liam R. Howlett wrote: >>>>>>> diff --git a/mm/mremap.c b/mm/mremap.c >>>>>>> index e618a706aff5..793fad58302c 100644 >>>>>>> --- a/mm/mremap.c >>>>>>> +++ b/mm/mremap.c >>>>>>> @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) >>>>>>> * We'd prefer to avoid failure later on in do_munmap: >>>>>>> * which may split one vma into three before unmapping. >>>>>>> */ >>>>>>> - if (current->mm->map_count >= sysctl_max_map_count - 3) >>>>>>> + if (exceeds_max_map_count(current->mm, 4)) >>>>>>> return -ENOMEM; >>>>>> >>>>>> In my version this would be: >>>>>> >>>>>> if (map_count_capacity(current->mm) < 4) >>>>>> return -ENOMEM; >>>>>> >>>>> >>>>> Someone could write map_count_capacity(current->mm) <= 4 and reintroduce >>>>> what this is trying to solve. And with the way it is written in this >>>>> patch, someone could pass in the wrong number. >>>> >>>> Right, but I think 'capacity' is pretty clear here, if the caller does something >>>> silly then that's on them... >>>> >>>>> >>>>> I'm not sure this is worth doing. There are places we allow the count >>>>> to go higher. >>>> >>>> ...But yeah, it's kinda borderline as to how useful this is. >>>> >>>> I _do_ however like the 'put map count in one place statically' rather than >>>> having a global, so a minimal version of this could be to just have a helper >>>> function that gets the sysctl_max_map_count, e.g.: >>>> >>>> if (current->mm->mmap_count >= max_map_count() - 3) >>> >>> I enjoy seeing sysctl_max_map_count hidden. But map_count_capacity() is >>> even more readable, so I like it. >>> >>> I don't complete like the "capacity" term, but I cannot think of >>> something better right now. Maybe something around "free" or >>> "remaining", not sure. >>> >>> I also don't completely like "map_count" (I know, I know, we call it >>> like that in structures), because it reminds me of the mapcount ... >>> talking somehow about "vmas" would be quite clear. >> >> Thanks David, my original implementation started with vma_limit() :). >> Maybe something like vma_count_remaining() ? > > Yes, reducing this confusion would very much be helpful. In fact, if > you put it in its own function we could change the actual name with > lower impact. map_count vs mapcount is annoying. > > vma_headroom() ? > additional_vma_space() ? VMA space might be interpreted as VA space. I think basing it on "vma_count" would be good. vma_count_capacity() vma_count_headroom() vma_count_remaining() vma_count_avail() vma_count_left() > > Maybe David would like: > remedy_vma_space() which would be !poison_vma_space().. that's pretty > clear.. :) :D Careful, sensitive topic; I know that Lorenzo is still crying about this late at night when nobody watches, screaming "DAVID, WHYYYYYYY" ;) -- Cheers David / dhildenb
* David Hildenbrand <david@redhat.com> [250904 15:02]: > On 04.09.25 20:49, Liam R. Howlett wrote: > > * Kalesh Singh <kaleshsingh@google.com> [250904 13:51]: > > > On Thu, Sep 4, 2025 at 10:42 AM David Hildenbrand <david@redhat.com> wrote: > > > > > > > > On 04.09.25 19:33, Lorenzo Stoakes wrote: > > > > > On Thu, Sep 04, 2025 at 01:22:51PM -0400, Liam R. Howlett wrote: > > > > > > > > diff --git a/mm/mremap.c b/mm/mremap.c > > > > > > > > index e618a706aff5..793fad58302c 100644 > > > > > > > > --- a/mm/mremap.c > > > > > > > > +++ b/mm/mremap.c > > > > > > > > @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) > > > > > > > > * We'd prefer to avoid failure later on in do_munmap: > > > > > > > > * which may split one vma into three before unmapping. > > > > > > > > */ > > > > > > > > - if (current->mm->map_count >= sysctl_max_map_count - 3) > > > > > > > > + if (exceeds_max_map_count(current->mm, 4)) > > > > > > > > return -ENOMEM; > > > > > > > > > > > > > > In my version this would be: > > > > > > > > > > > > > > if (map_count_capacity(current->mm) < 4) > > > > > > > return -ENOMEM; > > > > > > > > > > > > > > > > > > > Someone could write map_count_capacity(current->mm) <= 4 and reintroduce > > > > > > what this is trying to solve. And with the way it is written in this > > > > > > patch, someone could pass in the wrong number. > > > > > > > > > > Right, but I think 'capacity' is pretty clear here, if the caller does something > > > > > silly then that's on them... > > > > > > > > > > > > > > > > > I'm not sure this is worth doing. There are places we allow the count > > > > > > to go higher. > > > > > > > > > > ...But yeah, it's kinda borderline as to how useful this is. > > > > > > > > > > I _do_ however like the 'put map count in one place statically' rather than > > > > > having a global, so a minimal version of this could be to just have a helper > > > > > function that gets the sysctl_max_map_count, e.g.: > > > > > > > > > > if (current->mm->mmap_count >= max_map_count() - 3) > > > > > > > > I enjoy seeing sysctl_max_map_count hidden. But map_count_capacity() is > > > > even more readable, so I like it. > > > > > > > > I don't complete like the "capacity" term, but I cannot think of > > > > something better right now. Maybe something around "free" or > > > > "remaining", not sure. > > > > > > > > I also don't completely like "map_count" (I know, I know, we call it > > > > like that in structures), because it reminds me of the mapcount ... > > > > talking somehow about "vmas" would be quite clear. > > > > > > Thanks David, my original implementation started with vma_limit() :). > > > Maybe something like vma_count_remaining() ? > > > > Yes, reducing this confusion would very much be helpful. In fact, if > > you put it in its own function we could change the actual name with > > lower impact. map_count vs mapcount is annoying. > > > > vma_headroom() ? > > additional_vma_space() ? > > VMA space might be interpreted as VA space. Fair enough. > > I think basing it on "vma_count" would be good. > > vma_count_capacity() > > vma_count_headroom() > > vma_count_remaining() headroom or remaining have my vote as the clearest. > > vma_count_avail() > > vma_count_left() >
On Thu, Sep 04, 2025 at 03:11:46PM -0400, Liam R. Howlett wrote: > * David Hildenbrand <david@redhat.com> [250904 15:02]: > > On 04.09.25 20:49, Liam R. Howlett wrote: > > > * Kalesh Singh <kaleshsingh@google.com> [250904 13:51]: > > > > On Thu, Sep 4, 2025 at 10:42 AM David Hildenbrand <david@redhat.com> wrote: > > > > > > > > > > On 04.09.25 19:33, Lorenzo Stoakes wrote: > > > > > > On Thu, Sep 04, 2025 at 01:22:51PM -0400, Liam R. Howlett wrote: > > > > > > > > > diff --git a/mm/mremap.c b/mm/mremap.c > > > > > > > > > index e618a706aff5..793fad58302c 100644 > > > > > > > > > --- a/mm/mremap.c > > > > > > > > > +++ b/mm/mremap.c > > > > > > > > > @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) > > > > > > > > > * We'd prefer to avoid failure later on in do_munmap: > > > > > > > > > * which may split one vma into three before unmapping. > > > > > > > > > */ > > > > > > > > > - if (current->mm->map_count >= sysctl_max_map_count - 3) > > > > > > > > > + if (exceeds_max_map_count(current->mm, 4)) > > > > > > > > > return -ENOMEM; > > > > > > > > > > > > > > > > In my version this would be: > > > > > > > > > > > > > > > > if (map_count_capacity(current->mm) < 4) > > > > > > > > return -ENOMEM; > > > > > > > > > > > > > > > > > > > > > > Someone could write map_count_capacity(current->mm) <= 4 and reintroduce > > > > > > > what this is trying to solve. And with the way it is written in this > > > > > > > patch, someone could pass in the wrong number. > > > > > > > > > > > > Right, but I think 'capacity' is pretty clear here, if the caller does something > > > > > > silly then that's on them... > > > > > > > > > > > > > > > > > > > > I'm not sure this is worth doing. There are places we allow the count > > > > > > > to go higher. > > > > > > > > > > > > ...But yeah, it's kinda borderline as to how useful this is. > > > > > > > > > > > > I _do_ however like the 'put map count in one place statically' rather than > > > > > > having a global, so a minimal version of this could be to just have a helper > > > > > > function that gets the sysctl_max_map_count, e.g.: > > > > > > > > > > > > if (current->mm->mmap_count >= max_map_count() - 3) > > > > > > > > > > I enjoy seeing sysctl_max_map_count hidden. But map_count_capacity() is > > > > > even more readable, so I like it. > > > > > > > > > > I don't complete like the "capacity" term, but I cannot think of > > > > > something better right now. Maybe something around "free" or > > > > > "remaining", not sure. > > > > > > > > > > I also don't completely like "map_count" (I know, I know, we call it > > > > > like that in structures), because it reminds me of the mapcount ... > > > > > talking somehow about "vmas" would be quite clear. > > > > > > > > Thanks David, my original implementation started with vma_limit() :). > > > > Maybe something like vma_count_remaining() ? > > > > > > Yes, reducing this confusion would very much be helpful. In fact, if > > > you put it in its own function we could change the actual name with > > > lower impact. map_count vs mapcount is annoying. > > > > > > vma_headroom() ? > > > additional_vma_space() ? > > > > VMA space might be interpreted as VA space. > > Fair enough. > > > > > I think basing it on "vma_count" would be good. > > > > vma_count_capacity() > > > > vma_count_headroom() > > > > vma_count_remaining() > > headroom or remaining have my vote as the clearest. I like 'remaining' more > > vma_count_avail() > > > > vma_count_left() > > -- Sincerely yours, Mike.
On Thu, Sep 4, 2025 at 9:03 AM Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > > On Wed, Sep 03, 2025 at 04:24:35PM -0700, Kalesh Singh wrote: > > The check against the max map count (sysctl_max_map_count) was > > open-coded in several places. This led to inconsistent enforcement > > and subtle bugs where the limit could be exceeded. > > > > For example, some paths would check map_count > sysctl_max_map_count > > before allocating a new VMA and incrementing the count, allowing the > > process to reach sysctl_max_map_count + 1: > > > > int do_brk_flags(...) > > { > > if (mm->map_count > sysctl_max_map_count) > > return -ENOMEM; > > > > /* We can get here with mm->map_count == sysctl_max_map_count */ > > > > vma = vm_area_alloc(mm); > > ... > > mm->map_count++ /* We've now exceeded the threshold. */ > > } > > > > To fix this and unify the logic, introduce a new function, > > exceeds_max_map_count(), to consolidate the check. All open-coded > > checks are replaced with calls to this new function, ensuring the > > limit is applied uniformly and correctly. > > > > To improve encapsulation, sysctl_max_map_count is now static to > > mm/mmap.c. The new helper also adds a rate-limited warning to make > > debugging applications that exhaust their VMA limit easier. > > > > Yeah this is nice thanks! > > > Cc: Andrew Morton <akpm@linux-foundation.org> > > Cc: Minchan Kim <minchan@kernel.org> > > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > Signed-off-by: Kalesh Singh <kaleshsingh@google.com> > > --- > > include/linux/mm.h | 11 ++++++++++- > > mm/mmap.c | 15 ++++++++++++++- > > mm/mremap.c | 7 ++++--- > > mm/nommu.c | 2 +- > > mm/util.c | 1 - > > mm/vma.c | 6 +++--- > > 6 files changed, 32 insertions(+), 10 deletions(-) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index 1ae97a0b8ec7..d4e64e6a9814 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -192,7 +192,16 @@ static inline void __mm_zero_struct_page(struct page *page) > > #define MAPCOUNT_ELF_CORE_MARGIN (5) > > #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) > > > > -extern int sysctl_max_map_count; > > +/** > > + * exceeds_max_map_count - check if a VMA operation would exceed max_map_count > > + * @mm: The memory descriptor for the process. > > + * @new_vmas: The number of new VMAs the operation will create. > > + * > > + * Returns true if the operation would cause the number of VMAs to exceed > > + * the sysctl_max_map_count limit, false otherwise. A rate-limited warning > > + * is logged if the limit is exceeded. > > + */ > > +extern bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas); > > No new externs please (as Pedro also pointed out! :) > > > > > extern unsigned long sysctl_user_reserve_kbytes; > > extern unsigned long sysctl_admin_reserve_kbytes; > > diff --git a/mm/mmap.c b/mm/mmap.c > > index 7306253cc3b5..693a0105e6a5 100644 > > --- a/mm/mmap.c > > +++ b/mm/mmap.c > > @@ -374,7 +374,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, > > return -EOVERFLOW; > > > > /* Too many mappings? */ > > - if (mm->map_count > sysctl_max_map_count) > > + if (exceeds_max_map_count(mm, 0)) > > Yeah this last 'mystery meat' parameter isn't amazing though to be honest, it's > kinda hard to read/understand. > > E.g.: > > int map_count_capacity(const struct *mm) > { > const int map_count = mm->map_count; > const int max_count = sysctl_max_map_count; > > return max_count > map_count ? max_count - map_count : 0; > } > > Then this would become; > > if (!map_count_capacity(mm)) { > // blah. > } > Hi Lorenzo, I agree your suggestion is a lot clearer and readable. Thanks for the suggestion :) Let me update that in the next revision. --Kalesh > > > return -ENOMEM; > > > > /* > > @@ -1504,6 +1504,19 @@ struct vm_area_struct *_install_special_mapping( > > int sysctl_legacy_va_layout; > > #endif > > > > +static int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; > > + > > +bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas) > > +{ > > + if (unlikely(mm->map_count + new_vmas > sysctl_max_map_count)) { > > + pr_warn_ratelimited("%s (%d): Map count limit %u exceeded\n", > > + current->comm, current->pid, > > + sysctl_max_map_count); > > Yeah not a fan of this, we aren't warning elsewhere, it's actually perfectly > normal for somebody to exceed this, this isn't at the level of a warning. > > > + return true; > > + } > > + return false; > > +} > > + > > static const struct ctl_table mmap_table[] = { > > { > > .procname = "max_map_count", > > diff --git a/mm/mremap.c b/mm/mremap.c > > index e618a706aff5..793fad58302c 100644 > > --- a/mm/mremap.c > > +++ b/mm/mremap.c > > @@ -1040,7 +1040,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) > > * We'd prefer to avoid failure later on in do_munmap: > > * which may split one vma into three before unmapping. > > */ > > - if (current->mm->map_count >= sysctl_max_map_count - 3) > > + if (exceeds_max_map_count(current->mm, 4)) > > return -ENOMEM; > > In my version this would be: > > if (map_count_capacity(current->mm) < 4) > return -ENOMEM; > > And etc. etc. I won't do them all :) > > I do think this is clearer... > > > > > if (vma->vm_ops && vma->vm_ops->may_split) { > > @@ -1811,9 +1811,10 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) > > * split in 3 before unmapping it. > > * That means 2 more maps (1 for each) to the ones we already hold. > > * Check whether current map count plus 2 still leads us to 4 maps below > > - * the threshold, otherwise return -ENOMEM here to be more safe. > > + * the threshold. In other words, is the current map count + 6 at or > > + * below the threshold? Otherwise return -ENOMEM here to be more safe. > > */ > > - if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) > > + if (exceeds_max_map_count(current->mm, 6)) > > return -ENOMEM; > > > > return 0; > > diff --git a/mm/nommu.c b/mm/nommu.c > > index 8b819fafd57b..0533e1e3b266 100644 > > --- a/mm/nommu.c > > +++ b/mm/nommu.c > > @@ -1316,7 +1316,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, > > return -ENOMEM; > > > > mm = vma->vm_mm; > > - if (mm->map_count >= sysctl_max_map_count) > > + if (exceeds_max_map_count(mm, 1)) > > return -ENOMEM; > > > > region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); > > diff --git a/mm/util.c b/mm/util.c > > index f814e6a59ab1..b6e83922cafe 100644 > > --- a/mm/util.c > > +++ b/mm/util.c > > @@ -751,7 +751,6 @@ EXPORT_SYMBOL(folio_mc_copy); > > int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; > > static int sysctl_overcommit_ratio __read_mostly = 50; > > static unsigned long sysctl_overcommit_kbytes __read_mostly; > > -int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; > > unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ > > unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ > > > > diff --git a/mm/vma.c b/mm/vma.c > > index 3b12c7579831..f804c8ac8fbb 100644 > > --- a/mm/vma.c > > +++ b/mm/vma.c > > @@ -592,7 +592,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, > > static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, > > unsigned long addr, int new_below) > > { > > - if (vma->vm_mm->map_count >= sysctl_max_map_count) > > + if (exceeds_max_map_count(vma->vm_mm, 1)) > > return -ENOMEM; > > > > return __split_vma(vmi, vma, addr, new_below); > > @@ -1345,7 +1345,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, > > * its limit temporarily, to help free resources as expected. > > */ > > if (vms->end < vms->vma->vm_end && > > - vms->vma->vm_mm->map_count >= sysctl_max_map_count) { > > + exceeds_max_map_count(vms->vma->vm_mm, 1)) { > > error = -ENOMEM; > > goto map_count_exceeded; > > } > > @@ -2772,7 +2772,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > > if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) > > return -ENOMEM; > > > > - if (mm->map_count > sysctl_max_map_count) > > + if (exceeds_max_map_count(mm, 1)) > > return -ENOMEM; > > > > if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) > > -- > > 2.51.0.338.gd7d06c2dae-goog > >
On 04.09.25 01:24, Kalesh Singh wrote: > The check against the max map count (sysctl_max_map_count) was > open-coded in several places. This led to inconsistent enforcement > and subtle bugs where the limit could be exceeded. > > For example, some paths would check map_count > sysctl_max_map_count > before allocating a new VMA and incrementing the count, allowing the > process to reach sysctl_max_map_count + 1: > > int do_brk_flags(...) > { > if (mm->map_count > sysctl_max_map_count) > return -ENOMEM; > > /* We can get here with mm->map_count == sysctl_max_map_count */ > > vma = vm_area_alloc(mm); > ... > mm->map_count++ /* We've now exceeded the threshold. */ > } > > To fix this and unify the logic, introduce a new function, > exceeds_max_map_count(), to consolidate the check. All open-coded > checks are replaced with calls to this new function, ensuring the > limit is applied uniformly and correctly. > > To improve encapsulation, sysctl_max_map_count is now static to > mm/mmap.c. The new helper also adds a rate-limited warning to make > debugging applications that exhaust their VMA limit easier. > > Cc: Andrew Morton <akpm@linux-foundation.org> > Cc: Minchan Kim <minchan@kernel.org> > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > Signed-off-by: Kalesh Singh <kaleshsingh@google.com> > --- > include/linux/mm.h | 11 ++++++++++- > mm/mmap.c | 15 ++++++++++++++- > mm/mremap.c | 7 ++++--- > mm/nommu.c | 2 +- > mm/util.c | 1 - > mm/vma.c | 6 +++--- > 6 files changed, 32 insertions(+), 10 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 1ae97a0b8ec7..d4e64e6a9814 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -192,7 +192,16 @@ static inline void __mm_zero_struct_page(struct page *page) > #define MAPCOUNT_ELF_CORE_MARGIN (5) > #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) > > -extern int sysctl_max_map_count; > +/** > + * exceeds_max_map_count - check if a VMA operation would exceed max_map_count > + * @mm: The memory descriptor for the process. > + * @new_vmas: The number of new VMAs the operation will create. It's not always a "will" right? At least I remember that this was the worst case scenario in some ("may split"). "The number of new VMAs the operation may create in the worst case. -- Cheers David / dhildenb
On Thu, Sep 4, 2025 at 3:14 AM David Hildenbrand <david@redhat.com> wrote: > > On 04.09.25 01:24, Kalesh Singh wrote: > > The check against the max map count (sysctl_max_map_count) was > > open-coded in several places. This led to inconsistent enforcement > > and subtle bugs where the limit could be exceeded. > > > > For example, some paths would check map_count > sysctl_max_map_count > > before allocating a new VMA and incrementing the count, allowing the > > process to reach sysctl_max_map_count + 1: > > > > int do_brk_flags(...) > > { > > if (mm->map_count > sysctl_max_map_count) > > return -ENOMEM; > > > > /* We can get here with mm->map_count == sysctl_max_map_count */ > > > > vma = vm_area_alloc(mm); > > ... > > mm->map_count++ /* We've now exceeded the threshold. */ > > } > > > > To fix this and unify the logic, introduce a new function, > > exceeds_max_map_count(), to consolidate the check. All open-coded > > checks are replaced with calls to this new function, ensuring the > > limit is applied uniformly and correctly. > > > > To improve encapsulation, sysctl_max_map_count is now static to > > mm/mmap.c. The new helper also adds a rate-limited warning to make > > debugging applications that exhaust their VMA limit easier. > > > > Cc: Andrew Morton <akpm@linux-foundation.org> > > Cc: Minchan Kim <minchan@kernel.org> > > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > Signed-off-by: Kalesh Singh <kaleshsingh@google.com> > > --- > > include/linux/mm.h | 11 ++++++++++- > > mm/mmap.c | 15 ++++++++++++++- > > mm/mremap.c | 7 ++++--- > > mm/nommu.c | 2 +- > > mm/util.c | 1 - > > mm/vma.c | 6 +++--- > > 6 files changed, 32 insertions(+), 10 deletions(-) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index 1ae97a0b8ec7..d4e64e6a9814 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -192,7 +192,16 @@ static inline void __mm_zero_struct_page(struct page *page) > > #define MAPCOUNT_ELF_CORE_MARGIN (5) > > #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) > > > > -extern int sysctl_max_map_count; > > +/** > > + * exceeds_max_map_count - check if a VMA operation would exceed max_map_count > > + * @mm: The memory descriptor for the process. > > + * @new_vmas: The number of new VMAs the operation will create. > > It's not always a "will" right? At least I remember that this was the > worst case scenario in some ("may split"). > > "The number of new VMAs the operation may create in the worst case. > Hi Daivd, You are correct. Cases like mremap account for the worst case (3 way split on both src and dest). I'll update the description. Thanks, Kalesh > > -- > Cheers > > David / dhildenb >
On Wed, Sep 03, 2025 at 04:24:35PM -0700, Kalesh Singh wrote: > The check against the max map count (sysctl_max_map_count) was > open-coded in several places. This led to inconsistent enforcement > and subtle bugs where the limit could be exceeded. > > For example, some paths would check map_count > sysctl_max_map_count > before allocating a new VMA and incrementing the count, allowing the > process to reach sysctl_max_map_count + 1: > > int do_brk_flags(...) > { > if (mm->map_count > sysctl_max_map_count) > return -ENOMEM; > > /* We can get here with mm->map_count == sysctl_max_map_count */ > > vma = vm_area_alloc(mm); > ... > mm->map_count++ /* We've now exceeded the threshold. */ > } > > To fix this and unify the logic, introduce a new function, > exceeds_max_map_count(), to consolidate the check. All open-coded > checks are replaced with calls to this new function, ensuring the > limit is applied uniformly and correctly. > > To improve encapsulation, sysctl_max_map_count is now static to > mm/mmap.c. The new helper also adds a rate-limited warning to make > debugging applications that exhaust their VMA limit easier. > > Cc: Andrew Morton <akpm@linux-foundation.org> > Cc: Minchan Kim <minchan@kernel.org> > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > Signed-off-by: Kalesh Singh <kaleshsingh@google.com> > --- > include/linux/mm.h | 11 ++++++++++- > mm/mmap.c | 15 ++++++++++++++- > mm/mremap.c | 7 ++++--- > mm/nommu.c | 2 +- > mm/util.c | 1 - > mm/vma.c | 6 +++--- > 6 files changed, 32 insertions(+), 10 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 1ae97a0b8ec7..d4e64e6a9814 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -192,7 +192,16 @@ static inline void __mm_zero_struct_page(struct page *page) > #define MAPCOUNT_ELF_CORE_MARGIN (5) > #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) > > -extern int sysctl_max_map_count; > +/** > + * exceeds_max_map_count - check if a VMA operation would exceed max_map_count > + * @mm: The memory descriptor for the process. > + * @new_vmas: The number of new VMAs the operation will create. > + * > + * Returns true if the operation would cause the number of VMAs to exceed > + * the sysctl_max_map_count limit, false otherwise. A rate-limited warning > + * is logged if the limit is exceeded. kernel-doc will be unhappy because of missing Return: description. Please s/Returns/Return:/ > + */ > +extern bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas); No need for 'extern' here. And the declaration should go to mm/internal.h > extern unsigned long sysctl_user_reserve_kbytes; > extern unsigned long sysctl_admin_reserve_kbytes; -- Sincerely yours, Mike.
On Thu, Sep 4, 2025 at 12:29 AM Mike Rapoport <rppt@kernel.org> wrote: > > On Wed, Sep 03, 2025 at 04:24:35PM -0700, Kalesh Singh wrote: > > The check against the max map count (sysctl_max_map_count) was > > open-coded in several places. This led to inconsistent enforcement > > and subtle bugs where the limit could be exceeded. > > > > For example, some paths would check map_count > sysctl_max_map_count > > before allocating a new VMA and incrementing the count, allowing the > > process to reach sysctl_max_map_count + 1: > > > > int do_brk_flags(...) > > { > > if (mm->map_count > sysctl_max_map_count) > > return -ENOMEM; > > > > /* We can get here with mm->map_count == sysctl_max_map_count */ > > > > vma = vm_area_alloc(mm); > > ... > > mm->map_count++ /* We've now exceeded the threshold. */ > > } > > > > To fix this and unify the logic, introduce a new function, > > exceeds_max_map_count(), to consolidate the check. All open-coded > > checks are replaced with calls to this new function, ensuring the > > limit is applied uniformly and correctly. > > > > To improve encapsulation, sysctl_max_map_count is now static to > > mm/mmap.c. The new helper also adds a rate-limited warning to make > > debugging applications that exhaust their VMA limit easier. > > > > Cc: Andrew Morton <akpm@linux-foundation.org> > > Cc: Minchan Kim <minchan@kernel.org> > > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > Signed-off-by: Kalesh Singh <kaleshsingh@google.com> > > --- > > include/linux/mm.h | 11 ++++++++++- > > mm/mmap.c | 15 ++++++++++++++- > > mm/mremap.c | 7 ++++--- > > mm/nommu.c | 2 +- > > mm/util.c | 1 - > > mm/vma.c | 6 +++--- > > 6 files changed, 32 insertions(+), 10 deletions(-) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index 1ae97a0b8ec7..d4e64e6a9814 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -192,7 +192,16 @@ static inline void __mm_zero_struct_page(struct page *page) > > #define MAPCOUNT_ELF_CORE_MARGIN (5) > > #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) > > > > -extern int sysctl_max_map_count; > > +/** > > + * exceeds_max_map_count - check if a VMA operation would exceed max_map_count > > + * @mm: The memory descriptor for the process. > > + * @new_vmas: The number of new VMAs the operation will create. > > + * > > + * Returns true if the operation would cause the number of VMAs to exceed > > + * the sysctl_max_map_count limit, false otherwise. A rate-limited warning > > + * is logged if the limit is exceeded. > > kernel-doc will be unhappy because of missing Return: description. Please > s/Returns/Return:/ Hi Mike, thanks for catching this. I'll fix it in the next version. -- Kalesh > > > + */ > > +extern bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas); > > No need for 'extern' here. And the declaration should go to mm/internal.h > > > extern unsigned long sysctl_user_reserve_kbytes; > > extern unsigned long sysctl_admin_reserve_kbytes; > > -- > Sincerely yours, > Mike.
On Wed, Sep 03, 2025 at 04:24:35PM -0700, Kalesh Singh wrote: > The check against the max map count (sysctl_max_map_count) was > open-coded in several places. This led to inconsistent enforcement > and subtle bugs where the limit could be exceeded. > > For example, some paths would check map_count > sysctl_max_map_count > before allocating a new VMA and incrementing the count, allowing the > process to reach sysctl_max_map_count + 1: > > int do_brk_flags(...) > { > if (mm->map_count > sysctl_max_map_count) > return -ENOMEM; > > /* We can get here with mm->map_count == sysctl_max_map_count */ > > vma = vm_area_alloc(mm); > ... > mm->map_count++ /* We've now exceeded the threshold. */ > } I think this should be fixed separately, and sent for stable. > > To fix this and unify the logic, introduce a new function, > exceeds_max_map_count(), to consolidate the check. All open-coded > checks are replaced with calls to this new function, ensuring the > limit is applied uniformly and correctly. Thanks! In general I like the idea. > > To improve encapsulation, sysctl_max_map_count is now static to > mm/mmap.c. The new helper also adds a rate-limited warning to make > debugging applications that exhaust their VMA limit easier. > > Cc: Andrew Morton <akpm@linux-foundation.org> > Cc: Minchan Kim <minchan@kernel.org> > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > Signed-off-by: Kalesh Singh <kaleshsingh@google.com> > --- > include/linux/mm.h | 11 ++++++++++- > mm/mmap.c | 15 ++++++++++++++- > mm/mremap.c | 7 ++++--- > mm/nommu.c | 2 +- > mm/util.c | 1 - > mm/vma.c | 6 +++--- > 6 files changed, 32 insertions(+), 10 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 1ae97a0b8ec7..d4e64e6a9814 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -192,7 +192,16 @@ static inline void __mm_zero_struct_page(struct page *page) > #define MAPCOUNT_ELF_CORE_MARGIN (5) > #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) > > -extern int sysctl_max_map_count; > +/** > + * exceeds_max_map_count - check if a VMA operation would exceed max_map_count > + * @mm: The memory descriptor for the process. > + * @new_vmas: The number of new VMAs the operation will create. > + * > + * Returns true if the operation would cause the number of VMAs to exceed > + * the sysctl_max_map_count limit, false otherwise. A rate-limited warning > + * is logged if the limit is exceeded. > + */ > +extern bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas); No new "extern" in func declarations please. > > extern unsigned long sysctl_user_reserve_kbytes; > extern unsigned long sysctl_admin_reserve_kbytes; > diff --git a/mm/mmap.c b/mm/mmap.c > index 7306253cc3b5..693a0105e6a5 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -374,7 +374,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, > return -EOVERFLOW; > > /* Too many mappings? */ > - if (mm->map_count > sysctl_max_map_count) > + if (exceeds_max_map_count(mm, 0)) > return -ENOMEM; If the brk example is incorrect, isn't this also wrong? /me is confused > > /* > @@ -1504,6 +1504,19 @@ struct vm_area_struct *_install_special_mapping( > int sysctl_legacy_va_layout; > #endif > > +static int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; > + > +bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas) > +{ > + if (unlikely(mm->map_count + new_vmas > sysctl_max_map_count)) { > + pr_warn_ratelimited("%s (%d): Map count limit %u exceeded\n", > + current->comm, current->pid, > + sysctl_max_map_count); I'm not entirely sold on the map count warn, even if it's rate limited. It sounds like something you can hit in nasty edge cases and nevertheless flood your dmesg (more frustrating if you can't fix the damn program). In any case, if we are to make the checks more strict, we should also add asserts around the place. Though there's a little case in munmap() we can indeed go over temporarily, on purpose. -- Pedro
On Thu, Sep 04, 2025 at 12:46:34AM +0100, Pedro Falcato wrote: > On Wed, Sep 03, 2025 at 04:24:35PM -0700, Kalesh Singh wrote: > > The check against the max map count (sysctl_max_map_count) was > > open-coded in several places. This led to inconsistent enforcement > > and subtle bugs where the limit could be exceeded. > > > > For example, some paths would check map_count > sysctl_max_map_count > > before allocating a new VMA and incrementing the count, allowing the > > process to reach sysctl_max_map_count + 1: > > > > int do_brk_flags(...) > > { > > if (mm->map_count > sysctl_max_map_count) > > return -ENOMEM; > > > > /* We can get here with mm->map_count == sysctl_max_map_count */ > > > > vma = vm_area_alloc(mm); > > ... > > mm->map_count++ /* We've now exceeded the threshold. */ > > } > > I think this should be fixed separately, and sent for stable. > > > > > To fix this and unify the logic, introduce a new function, > > exceeds_max_map_count(), to consolidate the check. All open-coded > > checks are replaced with calls to this new function, ensuring the > > limit is applied uniformly and correctly. > > Thanks! In general I like the idea. > > > > > To improve encapsulation, sysctl_max_map_count is now static to > > mm/mmap.c. The new helper also adds a rate-limited warning to make > > debugging applications that exhaust their VMA limit easier. > > > > Cc: Andrew Morton <akpm@linux-foundation.org> > > Cc: Minchan Kim <minchan@kernel.org> > > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > Signed-off-by: Kalesh Singh <kaleshsingh@google.com> > > --- > > include/linux/mm.h | 11 ++++++++++- > > mm/mmap.c | 15 ++++++++++++++- > > mm/mremap.c | 7 ++++--- > > mm/nommu.c | 2 +- > > mm/util.c | 1 - > > mm/vma.c | 6 +++--- > > 6 files changed, 32 insertions(+), 10 deletions(-) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index 1ae97a0b8ec7..d4e64e6a9814 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -192,7 +192,16 @@ static inline void __mm_zero_struct_page(struct page *page) > > #define MAPCOUNT_ELF_CORE_MARGIN (5) > > #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) > > > > -extern int sysctl_max_map_count; > > +/** > > + * exceeds_max_map_count - check if a VMA operation would exceed max_map_count > > + * @mm: The memory descriptor for the process. > > + * @new_vmas: The number of new VMAs the operation will create. > > + * > > + * Returns true if the operation would cause the number of VMAs to exceed > > + * the sysctl_max_map_count limit, false otherwise. A rate-limited warning > > + * is logged if the limit is exceeded. > > + */ > > +extern bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas); > > No new "extern" in func declarations please. > > > > > extern unsigned long sysctl_user_reserve_kbytes; > > extern unsigned long sysctl_admin_reserve_kbytes; > > diff --git a/mm/mmap.c b/mm/mmap.c > > index 7306253cc3b5..693a0105e6a5 100644 > > --- a/mm/mmap.c > > +++ b/mm/mmap.c > > @@ -374,7 +374,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, > > return -EOVERFLOW; > > > > /* Too many mappings? */ > > - if (mm->map_count > sysctl_max_map_count) > > + if (exceeds_max_map_count(mm, 0)) > > return -ENOMEM; > > If the brk example is incorrect, isn't this also wrong? /me is confused > > > > /* > > @@ -1504,6 +1504,19 @@ struct vm_area_struct *_install_special_mapping( > > int sysctl_legacy_va_layout; > > #endif > > > > +static int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; > > + > > +bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas) > > +{ > > + if (unlikely(mm->map_count + new_vmas > sysctl_max_map_count)) { > > + pr_warn_ratelimited("%s (%d): Map count limit %u exceeded\n", > > + current->comm, current->pid, > > + sysctl_max_map_count); > > I'm not entirely sold on the map count warn, even if it's rate limited. It > sounds like something you can hit in nasty edge cases and nevertheless flood > your dmesg (more frustrating if you can't fix the damn program). How about dynamic_debug? a1394bddf9b6, mm: page_alloc: dump migrate-failed pages
On Fri, Sep 5, 2025 at 12:44 PM Minchan Kim <minchan@kernel.org> wrote: > > On Thu, Sep 04, 2025 at 12:46:34AM +0100, Pedro Falcato wrote: > > On Wed, Sep 03, 2025 at 04:24:35PM -0700, Kalesh Singh wrote: > > > The check against the max map count (sysctl_max_map_count) was > > > open-coded in several places. This led to inconsistent enforcement > > > and subtle bugs where the limit could be exceeded. > > > > > > For example, some paths would check map_count > sysctl_max_map_count > > > before allocating a new VMA and incrementing the count, allowing the > > > process to reach sysctl_max_map_count + 1: > > > > > > int do_brk_flags(...) > > > { > > > if (mm->map_count > sysctl_max_map_count) > > > return -ENOMEM; > > > > > > /* We can get here with mm->map_count == sysctl_max_map_count */ > > > > > > vma = vm_area_alloc(mm); > > > ... > > > mm->map_count++ /* We've now exceeded the threshold. */ > > > } > > > > I think this should be fixed separately, and sent for stable. > > > > > > > > To fix this and unify the logic, introduce a new function, > > > exceeds_max_map_count(), to consolidate the check. All open-coded > > > checks are replaced with calls to this new function, ensuring the > > > limit is applied uniformly and correctly. > > > > Thanks! In general I like the idea. > > > > > > > > To improve encapsulation, sysctl_max_map_count is now static to > > > mm/mmap.c. The new helper also adds a rate-limited warning to make > > > debugging applications that exhaust their VMA limit easier. > > > > > > Cc: Andrew Morton <akpm@linux-foundation.org> > > > Cc: Minchan Kim <minchan@kernel.org> > > > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > > Signed-off-by: Kalesh Singh <kaleshsingh@google.com> > > > --- > > > include/linux/mm.h | 11 ++++++++++- > > > mm/mmap.c | 15 ++++++++++++++- > > > mm/mremap.c | 7 ++++--- > > > mm/nommu.c | 2 +- > > > mm/util.c | 1 - > > > mm/vma.c | 6 +++--- > > > 6 files changed, 32 insertions(+), 10 deletions(-) > > > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > > index 1ae97a0b8ec7..d4e64e6a9814 100644 > > > --- a/include/linux/mm.h > > > +++ b/include/linux/mm.h > > > @@ -192,7 +192,16 @@ static inline void __mm_zero_struct_page(struct page *page) > > > #define MAPCOUNT_ELF_CORE_MARGIN (5) > > > #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) > > > > > > -extern int sysctl_max_map_count; > > > +/** > > > + * exceeds_max_map_count - check if a VMA operation would exceed max_map_count > > > + * @mm: The memory descriptor for the process. > > > + * @new_vmas: The number of new VMAs the operation will create. > > > + * > > > + * Returns true if the operation would cause the number of VMAs to exceed > > > + * the sysctl_max_map_count limit, false otherwise. A rate-limited warning > > > + * is logged if the limit is exceeded. > > > + */ > > > +extern bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas); > > > > No new "extern" in func declarations please. > > > > > > > > extern unsigned long sysctl_user_reserve_kbytes; > > > extern unsigned long sysctl_admin_reserve_kbytes; > > > diff --git a/mm/mmap.c b/mm/mmap.c > > > index 7306253cc3b5..693a0105e6a5 100644 > > > --- a/mm/mmap.c > > > +++ b/mm/mmap.c > > > @@ -374,7 +374,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, > > > return -EOVERFLOW; > > > > > > /* Too many mappings? */ > > > - if (mm->map_count > sysctl_max_map_count) > > > + if (exceeds_max_map_count(mm, 0)) > > > return -ENOMEM; > > > > If the brk example is incorrect, isn't this also wrong? /me is confused > > > > > > /* > > > @@ -1504,6 +1504,19 @@ struct vm_area_struct *_install_special_mapping( > > > int sysctl_legacy_va_layout; > > > #endif > > > > > > +static int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; > > > + > > > +bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas) > > > +{ > > > + if (unlikely(mm->map_count + new_vmas > sysctl_max_map_count)) { > > > + pr_warn_ratelimited("%s (%d): Map count limit %u exceeded\n", > > > + current->comm, current->pid, > > > + sysctl_max_map_count); > > > > I'm not entirely sold on the map count warn, even if it's rate limited. It > > sounds like something you can hit in nasty edge cases and nevertheless flood > > your dmesg (more frustrating if you can't fix the damn program). > > How about dynamic_debug? > > a1394bddf9b6, mm: page_alloc: dump migrate-failed pages Hi Minchan, Thanks for the suggestion to use dynamic_debug. As you may have seen in the discussion, it has moved to a capacity based helper (vma_count_remaining()) based on feedback for better readability at the call sites. Unfortunately, a side effect of that design is that we've lost the single, centralized failure point where a dynamic_debug message could be placed. I'm going to stick with that due to the readability benefits. However, you've raised a good point about observability. For this I am planning to add force increment/decrement via vma_count_* helpers and perhaps we can add trace events in the helpers to get similar observability. Thanks, Kalesh
On Wed, Sep 3, 2025 at 4:46 PM Pedro Falcato <pfalcato@suse.de> wrote: > > On Wed, Sep 03, 2025 at 04:24:35PM -0700, Kalesh Singh wrote: > > The check against the max map count (sysctl_max_map_count) was > > open-coded in several places. This led to inconsistent enforcement > > and subtle bugs where the limit could be exceeded. > > > > For example, some paths would check map_count > sysctl_max_map_count > > before allocating a new VMA and incrementing the count, allowing the > > process to reach sysctl_max_map_count + 1: > > > > int do_brk_flags(...) > > { > > if (mm->map_count > sysctl_max_map_count) > > return -ENOMEM; > > > > /* We can get here with mm->map_count == sysctl_max_map_count */ > > > > vma = vm_area_alloc(mm); > > ... > > mm->map_count++ /* We've now exceeded the threshold. */ > > } > > I think this should be fixed separately, and sent for stable. Hi Pedro, thanks for the review. I can split this out separate. > > > > > To fix this and unify the logic, introduce a new function, > > exceeds_max_map_count(), to consolidate the check. All open-coded > > checks are replaced with calls to this new function, ensuring the > > limit is applied uniformly and correctly. > > Thanks! In general I like the idea. > > > > > To improve encapsulation, sysctl_max_map_count is now static to > > mm/mmap.c. The new helper also adds a rate-limited warning to make > > debugging applications that exhaust their VMA limit easier. > > > > Cc: Andrew Morton <akpm@linux-foundation.org> > > Cc: Minchan Kim <minchan@kernel.org> > > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > Signed-off-by: Kalesh Singh <kaleshsingh@google.com> > > --- > > include/linux/mm.h | 11 ++++++++++- > > mm/mmap.c | 15 ++++++++++++++- > > mm/mremap.c | 7 ++++--- > > mm/nommu.c | 2 +- > > mm/util.c | 1 - > > mm/vma.c | 6 +++--- > > 6 files changed, 32 insertions(+), 10 deletions(-) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index 1ae97a0b8ec7..d4e64e6a9814 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -192,7 +192,16 @@ static inline void __mm_zero_struct_page(struct page *page) > > #define MAPCOUNT_ELF_CORE_MARGIN (5) > > #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) > > > > -extern int sysctl_max_map_count; > > +/** > > + * exceeds_max_map_count - check if a VMA operation would exceed max_map_count > > + * @mm: The memory descriptor for the process. > > + * @new_vmas: The number of new VMAs the operation will create. > > + * > > + * Returns true if the operation would cause the number of VMAs to exceed > > + * the sysctl_max_map_count limit, false otherwise. A rate-limited warning > > + * is logged if the limit is exceeded. > > + */ > > +extern bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas); > > No new "extern" in func declarations please. Ack > > > > > extern unsigned long sysctl_user_reserve_kbytes; > > extern unsigned long sysctl_admin_reserve_kbytes; > > diff --git a/mm/mmap.c b/mm/mmap.c > > index 7306253cc3b5..693a0105e6a5 100644 > > --- a/mm/mmap.c > > +++ b/mm/mmap.c > > @@ -374,7 +374,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, > > return -EOVERFLOW; > > > > /* Too many mappings? */ > > - if (mm->map_count > sysctl_max_map_count) > > + if (exceeds_max_map_count(mm, 0)) > > return -ENOMEM; > > If the brk example is incorrect, isn't this also wrong? /me is confused Ahh you are right, this will also go over by 1 once we return from mmap_region(). I'll batch this with the do_brk_flags() fix. > > > > /* > > @@ -1504,6 +1504,19 @@ struct vm_area_struct *_install_special_mapping( > > int sysctl_legacy_va_layout; > > #endif > > > > +static int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; > > + > > +bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas) > > +{ > > + if (unlikely(mm->map_count + new_vmas > sysctl_max_map_count)) { > > + pr_warn_ratelimited("%s (%d): Map count limit %u exceeded\n", > > + current->comm, current->pid, > > + sysctl_max_map_count); > > I'm not entirely sold on the map count warn, even if it's rate limited. It > sounds like something you can hit in nasty edge cases and nevertheless flood > your dmesg (more frustrating if you can't fix the damn program). I don't feel strongly about this, I can drop it in the next revision. > > In any case, if we are to make the checks more strict, we should also add > asserts around the place. Though there's a little case in munmap() we can indeed > go over temporarily, on purpose. To confirm, do you mean we should WARN_ON() checks where map count is being incremented? > Though there's a little case in munmap() we can indeed > go over temporarily, on purpose. For the 3 way split we need 1 additional VMA after munmap completed as one of the 3 gets unmapped. The check is done in the caller beforehand as __split_vma() explicitly doesn't check map_count. Though if we add asserts we'll need a variant of vma_complete() or the like that doesn't enforce the threshold. Thanks, Kalesh > > -- > Pedro
On Wed, Sep 03, 2025 at 08:01:50PM -0700, Kalesh Singh wrote: > On Wed, Sep 3, 2025 at 4:46 PM Pedro Falcato <pfalcato@suse.de> wrote: > > <snip> > > > > > > /* Too many mappings? */ > > > - if (mm->map_count > sysctl_max_map_count) > > > + if (exceeds_max_map_count(mm, 0)) > > > return -ENOMEM; > > > > If the brk example is incorrect, isn't this also wrong? /me is confused > > Ahh you are right, this will also go over by 1 once we return from > mmap_region(). I'll batch this with the do_brk_flags() fix. > > > > > > > /* > > > @@ -1504,6 +1504,19 @@ struct vm_area_struct *_install_special_mapping( > > > int sysctl_legacy_va_layout; > > > #endif > > > > > > +static int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; > > > + > > > +bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas) > > > +{ > > > + if (unlikely(mm->map_count + new_vmas > sysctl_max_map_count)) { > > > + pr_warn_ratelimited("%s (%d): Map count limit %u exceeded\n", > > > + current->comm, current->pid, > > > + sysctl_max_map_count); > > > > I'm not entirely sold on the map count warn, even if it's rate limited. It > > sounds like something you can hit in nasty edge cases and nevertheless flood > > your dmesg (more frustrating if you can't fix the damn program). > > I don't feel strongly about this, I can drop it in the next revision. FWIW, I don't feel strongly about it either, and I would not mind if there's a way to shut it up (cmdline, or even sysctl knob?). Let's see if anyone has a stronger opinion. > > > > > In any case, if we are to make the checks more strict, we should also add > > asserts around the place. Though there's a little case in munmap() we can indeed > > go over temporarily, on purpose. > > To confirm, do you mean we should WARN_ON() checks where map count is > being incremented? Yes, _possibly_ gated off by CONFIG_DEBUG_VM. > > > Though there's a little case in munmap() we can indeed > > go over temporarily, on purpose. > > For the 3 way split we need 1 additional VMA after munmap completed as > one of the 3 gets unmapped. The check is done in the caller beforehand > as __split_vma() explicitly doesn't check map_count. Though if we add > asserts we'll need a variant of vma_complete() or the like that > doesn't enforce the threshold. Right, it might get a little hairy, which is partly why I'm not super into the idea. But definitely worth considering as a way to help prevent these sorts of problems in the future. -- Pedro
On Thu, Sep 4, 2025 at 8:25 AM Pedro Falcato <pfalcato@suse.de> wrote: > > On Wed, Sep 03, 2025 at 08:01:50PM -0700, Kalesh Singh wrote: > > On Wed, Sep 3, 2025 at 4:46 PM Pedro Falcato <pfalcato@suse.de> wrote: > > > > <snip> > > > > > > > > /* Too many mappings? */ > > > > - if (mm->map_count > sysctl_max_map_count) > > > > + if (exceeds_max_map_count(mm, 0)) > > > > return -ENOMEM; > > > > > > If the brk example is incorrect, isn't this also wrong? /me is confused > > > > Ahh you are right, this will also go over by 1 once we return from > > mmap_region(). I'll batch this with the do_brk_flags() fix. > > > > > > > > > > /* > > > > @@ -1504,6 +1504,19 @@ struct vm_area_struct *_install_special_mapping( > > > > int sysctl_legacy_va_layout; > > > > #endif > > > > > > > > +static int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; > > > > + > > > > +bool exceeds_max_map_count(struct mm_struct *mm, unsigned int new_vmas) > > > > +{ > > > > + if (unlikely(mm->map_count + new_vmas > sysctl_max_map_count)) { > > > > + pr_warn_ratelimited("%s (%d): Map count limit %u exceeded\n", > > > > + current->comm, current->pid, > > > > + sysctl_max_map_count); > > > > > > I'm not entirely sold on the map count warn, even if it's rate limited. It > > > sounds like something you can hit in nasty edge cases and nevertheless flood > > > your dmesg (more frustrating if you can't fix the damn program). > > > > I don't feel strongly about this, I can drop it in the next revision. > > FWIW, I don't feel strongly about it either, and I would not mind if there's a > way to shut it up (cmdline, or even sysctl knob?). Let's see if anyone has a > stronger opinion. > > > > > > > > > In any case, if we are to make the checks more strict, we should also add > > > asserts around the place. Though there's a little case in munmap() we can indeed > > > go over temporarily, on purpose. > > > > To confirm, do you mean we should WARN_ON() checks where map count is > > being incremented? > > Yes, _possibly_ gated off by CONFIG_DEBUG_VM. > > > > > > Though there's a little case in munmap() we can indeed > > > go over temporarily, on purpose. > > > > For the 3 way split we need 1 additional VMA after munmap completed as > > one of the 3 gets unmapped. The check is done in the caller beforehand > > as __split_vma() explicitly doesn't check map_count. Though if we add > > asserts we'll need a variant of vma_complete() or the like that > > doesn't enforce the threshold. > > Right, it might get a little hairy, which is partly why I'm not super into > the idea. But definitely worth considering as a way to help prevent these > sorts of problems in the future. Let me take a stab at this to see how it turns out. Thanks, Kalesh > > -- > Pedro
© 2016 - 2025 Red Hat, Inc.