The needed observability on in-field devices can be collected with
minimal overhead and can be toggled on and off. Event-driven telemetry
can be done with tracepoint BPF programs.

The process comm is provided for aggregation across devices, and the
tgid enables per-process aggregation on a given device.

This allows observing the distribution of such failures in the field,
to deduce whether there are legitimate bugs or whether a bump to the
limit is warranted.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Pedro Falcato <pfalcato@suse.de>
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
---
Changes in v2:
- Add the needed observability for operations failing due to the vma count
  limit, per Minchan.
  (Since there isn't a common point for debug logging, due to the checks
  being external to the capacity-based vma_count_remaining() helper, I used
  a trace event for low overhead and to facilitate event-driven telemetry
  on in-field devices.)
include/trace/events/vma.h | 32 ++++++++++++++++++++++++++++++++
mm/mmap.c | 5 ++++-
mm/mremap.c | 10 ++++++++--
mm/vma.c | 11 +++++++++--
4 files changed, 53 insertions(+), 5 deletions(-)
create mode 100644 include/trace/events/vma.h
diff --git a/include/trace/events/vma.h b/include/trace/events/vma.h
new file mode 100644
index 000000000000..2fed63b0d0a6
--- /dev/null
+++ b/include/trace/events/vma.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vma
+
+#if !defined(_TRACE_VMA_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VMA_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(max_vma_count_exceeded,
+
+ TP_PROTO(struct task_struct *task),
+
+ TP_ARGS(task),
+
+ TP_STRUCT__entry(
+ __string(comm, task->comm)
+ __field(pid_t, tgid)
+ ),
+
+ TP_fast_assign(
+ __assign_str(comm);
+ __entry->tgid = task->tgid;
+ ),
+
+ TP_printk("comm=%s tgid=%d", __get_str(comm), __entry->tgid)
+);
+
+#endif /* _TRACE_VMA_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/mm/mmap.c b/mm/mmap.c
index 30ddd550197e..0bb311bf48f3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -56,6 +56,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/mmap.h>
+#include <trace/events/vma.h>
#include "internal.h"
@@ -374,8 +375,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
return -EOVERFLOW;
/* Too many mappings? */
- if (!vma_count_remaining(mm))
+ if (!vma_count_remaining(mm)) {
+ trace_max_vma_count_exceeded(current);
return -ENOMEM;
+ }
/*
* addr is returned from get_unmapped_area,
diff --git a/mm/mremap.c b/mm/mremap.c
index 14d35d87e89b..f42ac05f0069 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -30,6 +30,8 @@
#include <asm/tlb.h>
#include <asm/pgalloc.h>
+#include <trace/events/vma.h>
+
#include "internal.h"
/* Classify the kind of remap operation being performed. */
@@ -1040,8 +1042,10 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
* We'd prefer to avoid failure later on in do_munmap:
* which may split one vma into three before unmapping.
*/
- if (vma_count_remaining(current->mm) < 4)
+ if (vma_count_remaining(current->mm) < 4) {
+ trace_max_vma_count_exceeded(current);
return -ENOMEM;
+ }
if (vma->vm_ops && vma->vm_ops->may_split) {
if (vma->vm_start != old_addr)
@@ -1817,8 +1821,10 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
* the threshold. In other words, is the current map count + 6 at or
* below the threshold? Otherwise return -ENOMEM here to be more safe.
*/
- if (vma_count_remaining(current->mm) < 6)
+ if (vma_count_remaining(current->mm) < 6) {
+ trace_max_vma_count_exceeded(current);
return -ENOMEM;
+ }
return 0;
}
diff --git a/mm/vma.c b/mm/vma.c
index 0e4fcaebe209..692c33c3e84d 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -7,6 +7,8 @@
#include "vma_internal.h"
#include "vma.h"
+#include <trace/events/vma.h>
+
struct mmap_state {
struct mm_struct *mm;
struct vma_iterator *vmi;
@@ -621,8 +623,10 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
- if (!vma_count_remaining(vma->vm_mm))
+ if (!vma_count_remaining(vma->vm_mm)) {
+ trace_max_vma_count_exceeded(current);
return -ENOMEM;
+ }
return __split_vma(vmi, vma, addr, new_below);
}
@@ -1375,6 +1379,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
*/
if (vms->end < vms->vma->vm_end &&
!vma_count_remaining(vms->vma->vm_mm)) {
+ trace_max_vma_count_exceeded(current);
error = -ENOMEM;
goto vma_count_exceeded;
}
@@ -2801,8 +2806,10 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT))
return -ENOMEM;
- if (!vma_count_remaining(mm))
+ if (!vma_count_remaining(mm)) {
+ trace_max_vma_count_exceeded(current);
return -ENOMEM;
+ }
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
--
2.51.0.384.g4c02a37b29-goog
On Mon, Sep 15, 2025 at 09:36:38AM -0700, Kalesh Singh wrote:
> Needed observability on in field devices can be collected with minimal
> overhead and can be toggled on and off. Event driven telemetry can be
> done with tracepoint BPF programs.
>
> The process comm is provided for aggregation across devices and tgid is
> to enable per-process aggregation per device.
>
> This allows for observing the distribution of such problems in the
> field, to deduce if there are legitimate bugs or if a bump to the limit is
> warranted.

It's not really a bug though is it? It's somebody running out of resources.

I'm not sure how useful this is really. But I'm open to being convinced!

I also wonder if this is better as a statistic? You'd figure out it was a
problem that way too right?

> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Cc: Mike Rapoport <rppt@kernel.org>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Pedro Falcato <pfalcato@suse.de>
> Signed-off-by: Kalesh Singh <kaleshsingh@google.com>

This breaks the VMA tests, please make sure to always check them:

  cc -I../shared -I. -I../../include -I../../arch/x86/include -I../../../lib -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined -c -o vma.o vma.c
  In file included from vma.c:33:
  ../../../mm/vma.c:10:10: fatal error: trace/events/vma.h: No such file or directory
     10 | #include <trace/events/vma.h>
        |          ^~~~~~~~~~~~~~~~~~~~
  compilation terminated.
  make: *** [<builtin>: vma.o] Error 1

See below though, you've included this in the wrong place (I don't blame
you, perhaps we've not made this _very_ clear :)

> ---
>
> Changes in v2:
> - Add needed observability for operations failing due to the vma count limit,
>   per Minchan
>   (Since there isn't a common point for debug logging due to checks being
>   external to the capacity based vma_count_remaining() helper. I used a
>   trace event for low overhead and to facilitate event driven telemetry
>   for in field devices)
>
>  include/trace/events/vma.h | 32 ++++++++++++++++++++++++++++++++

Do we really need a new file? We already have VMA-related tracepoints no?

Also if you add a new file you _have_ to update MAINTAINERS.

>  mm/mmap.c                  |  5 ++++-
>  mm/mremap.c                | 10 ++++++++--
>  mm/vma.c                   | 11 +++++++++--
>  4 files changed, 53 insertions(+), 5 deletions(-)
>  create mode 100644 include/trace/events/vma.h
>
> diff --git a/include/trace/events/vma.h b/include/trace/events/vma.h
> new file mode 100644
> index 000000000000..2fed63b0d0a6
> --- /dev/null
> +++ b/include/trace/events/vma.h
> @@ -0,0 +1,32 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM vma
> +
> +#if !defined(_TRACE_VMA_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_VMA_H
> +
> +#include <linux/tracepoint.h>
> +
> +TRACE_EVENT(max_vma_count_exceeded,
> +
> +	TP_PROTO(struct task_struct *task),
> +
> +	TP_ARGS(task),
> +
> +	TP_STRUCT__entry(
> +		__string(comm, task->comm)
> +		__field(pid_t, tgid)
> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(comm);
> +		__entry->tgid = task->tgid;
> +	),
> +
> +	TP_printk("comm=%s tgid=%d", __get_str(comm), __entry->tgid)
> +);
> +
> +#endif /* _TRACE_VMA_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 30ddd550197e..0bb311bf48f3 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -56,6 +56,7 @@
>
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/mmap.h>
> +#include <trace/events/vma.h>
>
>  #include "internal.h"
>
> @@ -374,8 +375,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>  		return -EOVERFLOW;
>
>  	/* Too many mappings? */
> -	if (!vma_count_remaining(mm))
> +	if (!vma_count_remaining(mm)) {
> +		trace_max_vma_count_exceeded(current);
>  		return -ENOMEM;
> +	}
>
>  	/*
>  	 * addr is returned from get_unmapped_area,
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 14d35d87e89b..f42ac05f0069 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -30,6 +30,8 @@
>  #include <asm/tlb.h>
>  #include <asm/pgalloc.h>
>
> +#include <trace/events/vma.h>
> +
>  #include "internal.h"
>
>  /* Classify the kind of remap operation being performed. */
> @@ -1040,8 +1042,10 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
>  	 * We'd prefer to avoid failure later on in do_munmap:
>  	 * which may split one vma into three before unmapping.
>  	 */
> -	if (vma_count_remaining(current->mm) < 4)
> +	if (vma_count_remaining(current->mm) < 4) {
> +		trace_max_vma_count_exceeded(current);

But we didn't exceed it, we're guessing we might?

>  		return -ENOMEM;
> +	}
>
>  	if (vma->vm_ops && vma->vm_ops->may_split) {
>  		if (vma->vm_start != old_addr)
> @@ -1817,8 +1821,10 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
>  	 * the threshold. In other words, is the current map count + 6 at or
>  	 * below the threshold? Otherwise return -ENOMEM here to be more safe.
>  	 */
> -	if (vma_count_remaining(current->mm) < 6)
> +	if (vma_count_remaining(current->mm) < 6) {
> +		trace_max_vma_count_exceeded(current);

Similar point here.

>  		return -ENOMEM;
> +	}
>
>  	return 0;
>  }
> diff --git a/mm/vma.c b/mm/vma.c
> index 0e4fcaebe209..692c33c3e84d 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -7,6 +7,8 @@
>  #include "vma_internal.h"
>  #include "vma.h"
>
> +#include <trace/events/vma.h>

Nope you don't do this :) vma.c is designed to be userland testable and
_intentionally_ puts all its includes in vma_internal.h.

So you'd need to add this include over there, and then update the vma
tests vma_internal.h to stub out the trace function.

> +
>  struct mmap_state {
>  	struct mm_struct *mm;
>  	struct vma_iterator *vmi;
> @@ -621,8 +623,10 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  		     unsigned long addr, int new_below)
>  {
> -	if (!vma_count_remaining(vma->vm_mm))
> +	if (!vma_count_remaining(vma->vm_mm)) {
> +		trace_max_vma_count_exceeded(current);
>  		return -ENOMEM;
> +	}
>
>  	return __split_vma(vmi, vma, addr, new_below);
>  }
> @@ -1375,6 +1379,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
>  	 */
>  	if (vms->end < vms->vma->vm_end &&
>  	    !vma_count_remaining(vms->vma->vm_mm)) {
> +		trace_max_vma_count_exceeded(current);
>  		error = -ENOMEM;
>  		goto vma_count_exceeded;
>  	}
> @@ -2801,8 +2806,10 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT))
>  		return -ENOMEM;
>
> -	if (!vma_count_remaining(mm))
> +	if (!vma_count_remaining(mm)) {
> +		trace_max_vma_count_exceeded(current);
>  		return -ENOMEM;
> +	}
>
>  	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
>  		return -ENOMEM;
> --
> 2.51.0.384.g4c02a37b29-goog

Cheers, Lorenzo
On Thu, Sep 18, 2025 at 02:42:16PM +0100, Lorenzo Stoakes wrote:
> On Mon, Sep 15, 2025 at 09:36:38AM -0700, Kalesh Singh wrote:
> > Needed observability on in field devices can be collected with minimal
> > overhead and can be toggled on and off. Event driven telemetry can be
> > done with tracepoint BPF programs.
> >
> > The process comm is provided for aggregation across devices and tgid is
> > to enable per-process aggregation per device.
> >
> > This allows for observing the distribution of such problems in the
> > field, to deduce if there are legitimate bugs or if a bump to the limit is
> > warranted.
>
> It's not really a bug though is it? It's somebody running out of resources.
>
> I'm not sure how useful this is really. But I'm open to being convinced!
>
> I also wonder if this is better as a statistic? You'd figure out it was a
> problem that way too right?
>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: David Hildenbrand <david@redhat.com>
> > Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
> > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > Cc: Mike Rapoport <rppt@kernel.org>
> > Cc: Minchan Kim <minchan@kernel.org>
> > Cc: Pedro Falcato <pfalcato@suse.de>
> > Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
>
> This breaks the VMA tests, please make sure to always check them:
>
>   cc -I../shared -I. -I../../include -I../../arch/x86/include -I../../../lib -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined -c -o vma.o vma.c
>   In file included from vma.c:33:
>   ../../../mm/vma.c:10:10: fatal error: trace/events/vma.h: No such file or directory
>      10 | #include <trace/events/vma.h>
>         |          ^~~~~~~~~~~~~~~~~~~~
>   compilation terminated.
>   make: *** [<builtin>: vma.o] Error 1

Trivial build fix:

----8<----
From fe4c30abbd302ccc628ec92381ac10cea31c6d85 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Thu, 18 Sep 2025 14:47:10 +0100
Subject: [PATCH] fix

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 mm/vma.c                         | 2 --
 mm/vma_internal.h                | 2 ++
 tools/testing/vma/vma_internal.h | 4 ++++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/vma.c b/mm/vma.c
index 26046b28cdda..a11d29a2ddc0 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -7,8 +7,6 @@
 #include "vma_internal.h"
 #include "vma.h"

-#include <trace/events/vma.h>
-
 struct mmap_state {
 	struct mm_struct *mm;
 	struct vma_iterator *vmi;
diff --git a/mm/vma_internal.h b/mm/vma_internal.h
index 2f05735ff190..2f5ba679f43d 100644
--- a/mm/vma_internal.h
+++ b/mm/vma_internal.h
@@ -47,6 +47,8 @@
 #include <linux/uprobes.h>
 #include <linux/userfaultfd_k.h>

+#include <trace/events/vma.h>
+
 #include <asm/current.h>
 #include <asm/tlb.h>

diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 07f4108c5e4c..c08c91861b9a 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1661,4 +1661,8 @@ static inline void vma_count_dec(struct mm_struct *mm)
 	vma_count_sub(mm, 1);
 }

+static void trace_max_vma_count_exceeded(struct task_struct *task)
+{
+}
+
 #endif /* __MM_VMA_INTERNAL_H */
--
2.51.0
On Thu, Sep 18, 2025 at 6:52 AM Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > > > On Thu, Sep 18, 2025 at 02:42:16PM +0100, Lorenzo Stoakes wrote: > > On Mon, Sep 15, 2025 at 09:36:38AM -0700, Kalesh Singh wrote: > > > Needed observability on in field devices can be collected with minimal > > > overhead and can be toggled on and off. Event driven telemetry can be > > > done with tracepoint BPF programs. > > > > > > The process comm is provided for aggregation across devices and tgid is > > > to enable per-process aggregation per device. > > > > > > This allows for observing the distribution of such problems in the > > > field, to deduce if there are legitimate bugs or if a bump to the limit is > > > warranted. > > > > It's not really a bug though is it? It's somebody running out of resources. > > > > I'm not sure how useful this is really. But I'm open to being convinced! > > > > I also wonder if this is better as a statistic? You'd figure out it was a > > problem that way too right? > > > > > > > > Cc: Andrew Morton <akpm@linux-foundation.org> > > > Cc: David Hildenbrand <david@redhat.com> > > > Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com> > > > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > > Cc: Mike Rapoport <rppt@kernel.org> > > > Cc: Minchan Kim <minchan@kernel.org> > > > Cc: Pedro Falcato <pfalcato@suse.de> > > > Signed-off-by: Kalesh Singh <kaleshsingh@google.com> > > > > This breaks the VMA tests, please make sure to always check them: > > > > cc -I../shared -I. -I../../include -I../../arch/x86/include -I../../../lib -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined -c -o vma.o vma.c > > In file included from vma.c:33: > > ../../../mm/vma.c:10:10: fatal error: trace/events/vma.h: No such file or directory > > 10 | #include <trace/events/vma.h> > > | ^~~~~~~~~~~~~~~~~~~~ > > compilation terminated. 
> > make: *** [<builtin>: vma.o] Error 1 > > Trivial build fix: > > ----8<---- > From fe4c30abbd302ccc628ec92381ac10cea31c6d85 Mon Sep 17 00:00:00 2001 > From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > Date: Thu, 18 Sep 2025 14:47:10 +0100 > Subject: [PATCH] fix > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > --- > mm/vma.c | 2 -- > mm/vma_internal.h | 2 ++ > tools/testing/vma/vma_internal.h | 4 ++++ > 3 files changed, 6 insertions(+), 2 deletions(-) > > diff --git a/mm/vma.c b/mm/vma.c > index 26046b28cdda..a11d29a2ddc0 100644 > --- a/mm/vma.c > +++ b/mm/vma.c > @@ -7,8 +7,6 @@ > #include "vma_internal.h" > #include "vma.h" > > -#include <trace/events/vma.h> > - > struct mmap_state { > struct mm_struct *mm; > struct vma_iterator *vmi; > diff --git a/mm/vma_internal.h b/mm/vma_internal.h > index 2f05735ff190..2f5ba679f43d 100644 > --- a/mm/vma_internal.h > +++ b/mm/vma_internal.h > @@ -47,6 +47,8 @@ > #include <linux/uprobes.h> > #include <linux/userfaultfd_k.h> > > +#include <trace/events/vma.h> > + > #include <asm/current.h> > #include <asm/tlb.h> > > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h > index 07f4108c5e4c..c08c91861b9a 100644 > --- a/tools/testing/vma/vma_internal.h > +++ b/tools/testing/vma/vma_internal.h > @@ -1661,4 +1661,8 @@ static inline void vma_count_dec(struct mm_struct *mm) > vma_count_sub(mm, 1); > } > > +static void trace_max_vma_count_exceeded(struct task_struct *task) > +{ > +} > + > #endif /* __MM_VMA_INTERNAL_H */ I made a point to build and run your tests, seems I forgot to actually test it with this last patch. Thanks for the fix. --Kalesh > -- > 2.51.0
On Mon, 15 Sep 2025 09:36:38 -0700 Kalesh Singh <kaleshsingh@google.com> wrote:

> Needed observability on in field devices can be collected with minimal
> overhead and can be toggled on and off. Event driven telemetry can be
> done with tracepoint BPF programs.
>
> The process comm is provided for aggregation across devices and tgid is
> to enable per-process aggregation per device.

What do you mean about comm being used for aggregation across devices?
What's special about this trace event that will make it used across devices?

Note, if BPF is being used, can't the BPF program just add the current
comm? Why waste space in the ring buffer for it?

> +
> +TRACE_EVENT(max_vma_count_exceeded,
> +
> +	TP_PROTO(struct task_struct *task),

Why pass in the task if it's always going to be current?

> +
> +	TP_ARGS(task),
> +
> +	TP_STRUCT__entry(
> +		__string(comm, task->comm)

This could be:

	__string(comm, current)

But I still want to know what makes this trace event special over other
trace events to store this, and can't it be retrieved another way,
especially if BPF is being used to hook to it?

-- Steve

> +		__field(pid_t, tgid)
> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(comm);
> +		__entry->tgid = task->tgid;
> +	),
> +
> +	TP_printk("comm=%s tgid=%d", __get_str(comm), __entry->tgid)
> +);
> +
On Mon, Sep 15, 2025 at 4:41 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> What do you mean about comm being used for aggregation across devices?
> What's special about this trace event that will make it used across devices?
>
> Note, if BPF is being used, can't the BPF program just add the current
> comm? Why waste space in the ring buffer for it?
>
> Why pass in the task if it's always going to be current?
>
> This could be:
>
>	__string(comm, current)
>
> But I still want to know what makes this trace event special over other
> trace events to store this, and can't it be retrieved another way,
> especially if BPF is being used to hook to it?

Hi Steve,

Thanks for the comments and the suggestion. You are right, we can use
BPF to get the comm. There is nothing special about this trace event; I
will drop comm in the next revision.

The reason I used the task_struct parameter (current): I believe there
is a limitation that we must specify at least one parameter to the
TRACE_EVENT() PROTO and ARGS macros. Is there some way to use this
without needing a parameter?

I hit the build failure below, with no parameters:

  In file included from mm/vma.c:10:
  ./include/trace/events/vma.h:10:1: error: expected parameter declarator
     10 | TRACE_EVENT(max_vma_count_exceeded,
        | ^
  ...

Below is the code for reference:

/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vma

#if !defined(_TRACE_VMA_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_VMA_H

#include <linux/tracepoint.h>

TRACE_EVENT(max_vma_count_exceeded,

	TP_PROTO(),

	TP_ARGS(),

	TP_STRUCT__entry(
		__field(pid_t, tgid)
	),

	TP_fast_assign(
		__entry->tgid = current->tgid;
	),

	TP_printk("tgid=%d", __entry->tgid)
);

#endif /* _TRACE_VMA_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

Thanks,
Kalesh

> -- Steve
>
> > +		__field(pid_t, tgid)
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__assign_str(comm);
> > +		__entry->tgid = task->tgid;
> > +	),
> > +
> > +	TP_printk("comm=%s tgid=%d", __get_str(comm), __entry->tgid)
> > +);
> > +
On Mon, 15 Sep 2025 18:19:53 -0700 Kalesh Singh <kaleshsingh@google.com> wrote:

> Hi Steve,
>
> Thanks for the comments and suggestion you are right we can use bpf to
> get the comm. There is nothing special about this trace event. I will
> drop comm in the next revision.
>
> The reason I did the task_struct parameter (current): I believe there
> is a limitation that we must specify at least 1 parameter to the
> TRACE_EVENT() PROTO and ARGS macros.

OK, then this is another issue. We don't want tracepoint "markers".
Each tracepoint can take up to 5K in memory due to the code it
generates and the metadata to control it.

For something like that, we highly recommend dynamic probes (fprobes,
kprobes, etc.).

The only purpose of a static tracepoint is to get data within a
function that is too difficult to get via a probe. It should never be
used as a trigger whose purpose is "we hit this path".

-- Steve
On Tue, Sep 16, 2025 at 11:52:20AM -0400, Steven Rostedt wrote:
> On Mon, 15 Sep 2025 18:19:53 -0700
> Kalesh Singh <kaleshsingh@google.com> wrote:
>
> > Hi Steve,
> >
> > Thanks for the comments and suggestion you are right we can use bpf to
> > get the comm. There is nothing special about this trace event. I will
> > drop comm in the next revision.
> >
> > The reason I did the task_struct parameter (current): I believe there
> > is a limitation that we must specify at least 1 parameter to the
> > TRACE_EVENT() PROTO and ARGS macros.
>
> OK, then this is another issue. We don't want tracepoint "markers".
> Each tracepoint can take up to 5K in memory due to the code it
> generates and the meta data to control it.
>
> For something like that, we highly recommend dynamic probes (fprobes,
> kprobes, etc).
>
> The only purpose of a static tracepoint is to get data within a
> function that is too difficult to get via a probe. It should never be
> used as a trigger where its purpose is "we hit this path".

Isn't the usual problem with that approach that of static functions/static
inline functions? I was tracing through a problem a few months ago, and man,
I really did think "wouldn't it be nice to have a tracepoint instead of
fishing around for kprobe spots".

Not that I particularly think a tracepoint is super worth it in this case,
but, y'know.

-- Pedro
On Thu, 18 Sep 2025 12:38:56 +0100 Pedro Falcato <pfalcato@suse.de> wrote:

> Isn't the usual problem with that approach, that of static functions/static
> inline functions? I was tracing through a problem a few months ago, and man
> I really did think "wouldn't it be nice to have a tracepoint instead of
> fishing around for kprobe spots".
>
> Not that I particularly think a tracepoint is super worth it in this case,
> but, y'know.

Yes, it would be useful. The issue is that tracepoints are not free. They
do increase the I$ hit and take up memory. If you're going to inject a
tracepoint somewhere, at least extract some useful information from that
spot. If you can't think of anything to track, then it's not worth a
tracepoint.

If one really wants a way to track something, they could add a static
branch that would call a function when enabled that could be traced. They
would also need a way to enable that static branch.

-- Steve
On Tue, Sep 16, 2025 at 8:52 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 15 Sep 2025 18:19:53 -0700
> Kalesh Singh <kaleshsingh@google.com> wrote:
>
> > Hi Steve,
> >
> > Thanks for the comments and suggestion you are right we can use bpf to
> > get the comm. There is nothing special about this trace event. I will
> > drop comm in the next revision.
> >
> > The reason I did the task_struct parameter (current): I believe there
> > is a limitation that we must specify at least 1 parameter to the
> > TRACE_EVENT() PROTO and ARGS macros.
>
> OK, then this is another issue. We don't want tracepoint "markers".
> Each tracepoint can take up to 5K in memory due to the code it
> generates and the meta data to control it.
>
> For something like that, we highly recommend dynamic probes (fprobes,
> kprobes, etc).
>
> The only purpose of a static tracepoint is to get data within a
> function that is too difficult to get via a probe. It should never be
> used as a trigger where its purpose is "we hit this path".

Hi Steve,

I completely agree with the principle that static tracepoints shouldn't
be used as markers if a dynamic probe will suffice. The intent here is
to avoid introducing overhead in the common case (so as not to regress
mmap, munmap, and other syscall latencies) while still providing
observability for the max-vma-count-exceeded failure condition.

The original centralized check (before previous review rounds) was
indeed in a dedicated function, exceeds_max_map_count(), where a
kprobe/fprobe could have been easily attached without impacting the
common path. This was changed, due to previous review feedback, to the
capacity-based vma_count_remaining(), which necessitated the checks
being done externally by the callers:

https://lore.kernel.org/r/20250903232437.1454293-1-kaleshsingh@google.com/

Would you be ok with something like:

	trace_max_vma_count_exceeded(mm);

	TP_STRUCT__entry(
		__field(unsigned int, mm_id)
		__field(unsigned int, vma_count)
	)

mm_id would be the hash of the mm_struct pointer, similar to rss_stat,
and vma_count is the current vma count (some syscalls have different
requirements on the capacity remaining: mremap requires 6 available
slots, other syscalls require 1).

Thanks,
Kalesh

> -- Steve
On Tue, 16 Sep 2025 10:36:57 -0700 Kalesh Singh <kaleshsingh@google.com> wrote:

> I completely agree with the principle that static tracepoints
> shouldn't be used as markers if a dynamic probe will suffice. The
> intent here is to avoid introducing overhead in the common case to
> avoid regressing mmap, munmap, and other syscall latencies; while
> still providing observability for the max vma_count exceeded failure
> condition.
>
> The original centralized check (before previous review rounds) was
> indeed in a dedicated function, exceeds_max_map_count(), where a
> kprobe/fprobe could have been easily attached without impacting the
> common path. This was changed due to previous review feedback to the
> capacity based vma_count_remaining() which necessitated the check to
> be done externally by the callers:
>
> https://lore.kernel.org/r/20250903232437.1454293-1-kaleshsingh@google.com/
>
> Would you be ok with something like:
>
> trace_max_vma_count_exceeded(mm);
>
> TP_STRUCT__entry(
>	__field(unsigned int, mm_id)
>	__field(unsigned int, vma_count)
> )
>
> mm_id would be the hash of the mm_struct ptr similar to rss_stat and
> the vma_count is the current vma count (some syscalls have different
> requirements on the capacity remaining: mremap requires 6 available
> slots, other syscalls require 1).

BTW, why the hash of the mm pointer and not the pointer itself? We save
pointers in lots of places, and if it is the pointer, you could use an
eprobe to attach to the trace event to dereference its fields.

-- Steve
On Tue, Sep 16, 2025 at 10:47 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Tue, 16 Sep 2025 10:36:57 -0700
> Kalesh Singh <kaleshsingh@google.com> wrote:
>
> > I completely agree with the principle that static tracepoints
> > shouldn't be used as markers if a dynamic probe will suffice. The
> > intent here is to avoid introducing overhead in the common case to
> > avoid regressing mmap, munmap, and other syscall latencies; while
> > still providing observability for the max vma_count exceeded failure
> > condition.
> >
> > The original centralized check (before previous review rounds) was
> > indeed in a dedicated function, exceeds_max_map_count(), where a
> > kprobe/fprobe could have been easily attached without impacting the
> > common path. This was changed due to previous review feedback to the
> > capacity based vma_count_remaining() which necessitated the check to
> > be done externally by the callers:
> >
> > https://lore.kernel.org/r/20250903232437.1454293-1-kaleshsingh@google.com/
> >
> > Would you be ok with something like:
> >
> > trace_max_vma_count_exceeded(mm);
> >
> > TP_STRUCT__entry(
> >	__field(unsigned int, mm_id)
> >	__field(unsigned int, vma_count)
> > )
> >
> > mm_id would be the hash of the mm_struct ptr similar to rss_stat and
> > the vma_count is the current vma count (some syscalls have different
> > requirements on the capacity remaining: mremap requires 6 available
> > slots, other syscalls require 1).
>
> BTW, why the hash of the mm pointer and not the pointer itself? We save
> pointers in lots of places, and if it is the pointer, you could use an
> eprobe to attach to the trace event to dereference its fields.

In Android we try to avoid exposing raw kernel pointers to userspace
for security reasons: raising /proc/sys/kernel/kptr_restrict to 2
immediately after symbols are resolved for necessary telemetry tooling
during early boot. I believe this is also why rss_stat uses the hash
and not the raw pointer.
Thanks,
Kalesh

> -- Steve
On Tue, 16 Sep 2025 10:57:43 -0700 Kalesh Singh <kaleshsingh@google.com> wrote:

> > BTW, why the hash of the mm pointer and not the pointer itself? We save
> > pointers in lots of places, and if it is the pointer, you could use an
> > eprobe to attach to the trace event to dereference its fields.
>
> In Android we try to avoid exposing raw kernel pointers to userspace
> for security reasons: raising /proc/sys/kernel/kptr_restrict to 2
> immediately after symbols are resolved for necessary telemetry tooling
> during early boot. I believe this is also why rss_stat uses the hash
> and not the raw pointer.

When it comes to tracing, you already lost. If it goes into the ring buffer
it's a raw pointer. BPF doesn't use the output of the trace event, so you
are exposing nothing from that. It uses the proto directly.

Heck, if you enable function tracing, you are exposing every function
address it traces via the raw data output.

-- Steve
On Tue, Sep 16, 2025 at 11:01 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Tue, 16 Sep 2025 10:57:43 -0700
> Kalesh Singh <kaleshsingh@google.com> wrote:
>
> > > BTW, why the hash of the mm pointer and not the pointer itself? We save
> > > pointers in lots of places, and if it is the pointer, you could use an
> > > eprobe to attach to the trace event to dereference its fields.
> >
> > In Android we try to avoid exposing raw kernel pointers to userspace
> > for security reasons: we raise /proc/sys/kernel/kptr_restrict to 2
> > immediately after symbols are resolved for necessary telemetry tooling
> > during early boot. I believe this is also why rss_stat uses the hash
> > and not the raw pointer.
>
> When it comes to tracing, you already lost. If it goes into the ring
> buffer it's a raw pointer. BPF doesn't use the output of the trace
> event, so you are exposing nothing from that. It uses the proto
> directly.

My understanding is that the BPF tracepoint program type uses the trace
event fields from TP_STRUCT__entry(), whereas the raw tracepoint type
has access to the proto arguments. Please CMIW: isn't what we'd be
adding to the trace buffer the hashed mm_id value?

> Heck, if you enable function tracing, you are exposing every function
> address it traces via the raw data output.

Right, security doesn't allow compiling CONFIG_FUNCTION_TRACER into
Android production kernels.

Thanks,
Kalesh

> -- Steve
On Tue, 16 Sep 2025 11:23:03 -0700
Kalesh Singh <kaleshsingh@google.com> wrote:

> > When it comes to tracing, you already lost. If it goes into the ring
> > buffer it's a raw pointer. BPF doesn't use the output of the trace
> > event, so you are exposing nothing from that. It uses the proto
> > directly.
>
> My understanding is that the BPF tracepoint program type uses the trace
> event fields from TP_STRUCT__entry(), whereas the raw tracepoint type
> has access to the proto arguments. Please CMIW: isn't what we'd be
> adding to the trace buffer the hashed mm_id value?

Ah, right. Can't the BPF infrastructure protect against it?

> > Heck, if you enable function tracing, you are exposing every function
> > address it traces via the raw data output.
>
> Right, security doesn't allow compiling CONFIG_FUNCTION_TRACER into
> Android production kernels.

Does it block all the other trace events that share pointers?

Like nmi handler tracepoints, x86_fpu tracepoints, the page_fault kernel
tracepoint, tasklet tracepoints, cpu hotplug tracepoints, timer
tracepoints, workqueue tracepoints, ipi tracepoints, scheduling
tracepoints, locking tracepoints, rcu tracepoints, dma tracepoints,
module tracepoints, xdp tracepoints, filemap tracepoints, page map
tracepoints, vmscan tracepoints, percpu tracepoints, kmem_cache
tracepoints, mmap lock tracepoints, file lock tracepoints, and many
more! (I got tired of looking them up.)

Again, are you really protecting anything if this one tracepoint hashes
the pointer? Most other tracepoints expose this. If BPF can access a
tracepoint entry struct, it can access the raw address and break KASLR.

-- Steve
On Tue, Sep 16, 2025 at 11:51 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Tue, 16 Sep 2025 11:23:03 -0700
> Kalesh Singh <kaleshsingh@google.com> wrote:
>
> > > When it comes to tracing, you already lost. If it goes into the ring
> > > buffer it's a raw pointer. BPF doesn't use the output of the trace
> > > event, so you are exposing nothing from that. It uses the proto
> > > directly.
> >
> > My understanding is that the BPF tracepoint program type uses the trace
> > event fields from TP_STRUCT__entry(), whereas the raw tracepoint type
> > has access to the proto arguments. Please CMIW: isn't what we'd be
> > adding to the trace buffer the hashed mm_id value?
>
> Ah, right. Can't the BPF infrastructure protect against it?
>
> > > Heck, if you enable function tracing, you are exposing every function
> > > address it traces via the raw data output.
> >
> > Right, security doesn't allow compiling CONFIG_FUNCTION_TRACER into
> > Android production kernels.
>
> Does it block all the other trace events that share pointers?
>
> Like nmi handler tracepoints, x86_fpu tracepoints, the page_fault kernel
> tracepoint, tasklet tracepoints, cpu hotplug tracepoints, timer
> tracepoints, workqueue tracepoints, ipi tracepoints, scheduling
> tracepoints, locking tracepoints, rcu tracepoints, dma tracepoints,
> module tracepoints, xdp tracepoints, filemap tracepoints, page map
> tracepoints, vmscan tracepoints, percpu tracepoints, kmem_cache
> tracepoints, mmap lock tracepoints, file lock tracepoints, and many
> more! (I got tired of looking them up.)

Hi Steve, I see your point :) I'll use the raw pointer and handle not
exposing it from the BPF side. Thanks for discussing.

--Kalesh

> Again, are you really protecting anything if this one tracepoint hashes
> the pointer? Most other tracepoints expose this. If BPF can access a
> tracepoint entry struct, it can access the raw address and break KASLR.
>
> -- Steve
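[Editor's note: given the resolution above — record the raw mm pointer and let the BPF side avoid exposing it — a follow-up revision of the event might look like the sketch below. This is a hypothetical illustration, not the actual v3 patch; it is a kernel header fragment and does not compile standalone. The vma_count field reads mm->map_count, per the capacity discussion earlier in the thread.]

```c
/* Hypothetical sketch of the event after this discussion: carry the raw
 * mm_struct pointer (BPF consumers are responsible for not exposing it)
 * plus the current vma count, instead of hashing in the kernel. */
TRACE_EVENT(max_vma_count_exceeded,

	TP_PROTO(struct mm_struct *mm),

	TP_ARGS(mm),

	TP_STRUCT__entry(
		__field(struct mm_struct *, mm)
		__field(int, vma_count)
	),

	TP_fast_assign(
		__entry->mm = mm;
		__entry->vma_count = mm->map_count;
	),

	/* %p in the printk-formatted output is hashed by vsprintf anyway;
	 * the raw value is only in the binary entry struct. */
	TP_printk("mm=%p vma_count=%d", __entry->mm, __entry->vma_count)
);
```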