[v2] percpu: improve percpu_alloc_percpu event trace

[PATCH v2] percpu: improve percpu_alloc_percpu event trace

Posted by Vasily Averin 4 years ago

Added call_site, bytes_alloc and gfp_flags fields to the output
of the percpu_alloc_percpu ftrace event:

mkdir-4393  [001]   169.334788: percpu_alloc_percpu:
 call_site=mem_cgroup_css_alloc+0xa6 reserved=0 is_atomic=0 size=2408 align=8
  base_addr=0xffffc7117fc00000 off=402176 ptr=0x3dc867a62300 bytes_alloc=14448
   gfp_flags=GFP_KERNEL_ACCOUNT

This is required to track memcg-accounted percpu allocations.

Signed-off-by: Vasily Averin <vvs@openvz.org>
---
v2: added call_site, improved patch description
---
 include/trace/events/percpu.h | 23 +++++++++++++++++------
 mm/percpu-internal.h          |  8 ++++----
 mm/percpu.c                   |  5 +++--
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h
index df112a64f6c9..e989cefc0def 100644
--- a/include/trace/events/percpu.h
+++ b/include/trace/events/percpu.h
@@ -6,15 +6,20 @@
 #define _TRACE_PERCPU_H
 
 #include <linux/tracepoint.h>
+#include <trace/events/mmflags.h>
 
 TRACE_EVENT(percpu_alloc_percpu,
 
-	TP_PROTO(bool reserved, bool is_atomic, size_t size,
-		 size_t align, void *base_addr, int off, void __percpu *ptr),
+	TP_PROTO(unsigned long call_site,
+		 bool reserved, bool is_atomic, size_t size,
+		 size_t align, void *base_addr, int off,
+		 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),
 
-	TP_ARGS(reserved, is_atomic, size, align, base_addr, off, ptr),
+	TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off,
+		ptr, bytes_alloc, gfp_flags),
 
 	TP_STRUCT__entry(
+		__field(	unsigned long,		call_site	)
 		__field(	bool,			reserved	)
 		__field(	bool,			is_atomic	)
 		__field(	size_t,			size		)
@@ -22,9 +27,11 @@ TRACE_EVENT(percpu_alloc_percpu,
 		__field(	void *,			base_addr	)
 		__field(	int,			off		)
 		__field(	void __percpu *,	ptr		)
+		__field(	size_t,			bytes_alloc	)
+		__field(	gfp_t,			gfp_flags	)
 	),
-
 	TP_fast_assign(
+		__entry->call_site	= call_site;
 		__entry->reserved	= reserved;
 		__entry->is_atomic	= is_atomic;
 		__entry->size		= size;
@@ -32,12 +39,16 @@ TRACE_EVENT(percpu_alloc_percpu,
 		__entry->base_addr	= base_addr;
 		__entry->off		= off;
 		__entry->ptr		= ptr;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
 	),
 
-	TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p",
+	TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s",
+		  (void *)__entry->call_site,
 		  __entry->reserved, __entry->is_atomic,
 		  __entry->size, __entry->align,
-		  __entry->base_addr, __entry->off, __entry->ptr)
+		  __entry->base_addr, __entry->off, __entry->ptr,
+		  __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags))
 );
 
 TRACE_EVENT(percpu_free_percpu,
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 411d1593ef23..70b1ea23f4d2 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -113,7 +113,6 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
 	return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
 }
 
-#ifdef CONFIG_MEMCG_KMEM
 /**
  * pcpu_obj_full_size - helper to calculate size of each accounted object
  * @size: size of area to allocate in bytes
@@ -123,13 +122,14 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
  */
 static inline size_t pcpu_obj_full_size(size_t size)
 {
-	size_t extra_size;
+	size_t extra_size = 0;
 
-	extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+#ifdef CONFIG_MEMCG_KMEM
+	extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+#endif
 
 	return size * num_possible_cpus() + extra_size;
 }
-#endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_PERCPU_STATS
 
diff --git a/mm/percpu.c b/mm/percpu.c
index ea28db283044..3633eeefaa0d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1884,8 +1884,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
 	kmemleak_alloc_percpu(ptr, size, gfp);
 
-	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
-			chunk->base_addr, off, ptr);
+	trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
+				  chunk->base_addr, off, ptr,
+				  pcpu_obj_full_size(size), gfp);
 
 	pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
 
-- 
2.31.1

Re: [PATCH v2] percpu: improve percpu_alloc_percpu event trace

Posted by Roman Gushchin 3 years, 12 months ago

On Fri, May 06, 2022 at 10:29:25PM +0300, Vasily Averin wrote:
> Added call_site, bytes_alloc and gfp_flags fields to the output
> of the percpu_alloc_percpu ftrace event:
> 
> mkdir-4393  [001]   169.334788: percpu_alloc_percpu:
>  call_site=mem_cgroup_css_alloc+0xa6 reserved=0 is_atomic=0 size=2408 align=8
>   base_addr=0xffffc7117fc00000 off=402176 ptr=0x3dc867a62300 bytes_alloc=14448
>    gfp_flags=GFP_KERNEL_ACCOUNT
> 
> This is required to track memcg-accounted percpu allocations.
> 
> Signed-off-by: Vasily Averin <vvs@openvz.org>

Acked-by: Roman Gushchin <roman.gushchin@linux.dev>

LGTM, thanks Vasily!

One minor thing below.

> ---
> v2: added call_site, improved patch description
> ---
>  include/trace/events/percpu.h | 23 +++++++++++++++++------
>  mm/percpu-internal.h          |  8 ++++----
>  mm/percpu.c                   |  5 +++--
>  3 files changed, 24 insertions(+), 12 deletions(-)
> 
> diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h
> index df112a64f6c9..e989cefc0def 100644
> --- a/include/trace/events/percpu.h
> +++ b/include/trace/events/percpu.h
> @@ -6,15 +6,20 @@
>  #define _TRACE_PERCPU_H
>  
>  #include <linux/tracepoint.h>
> +#include <trace/events/mmflags.h>
>  
>  TRACE_EVENT(percpu_alloc_percpu,
>  
> -	TP_PROTO(bool reserved, bool is_atomic, size_t size,
> -		 size_t align, void *base_addr, int off, void __percpu *ptr),
> +	TP_PROTO(unsigned long call_site,
> +		 bool reserved, bool is_atomic, size_t size,
> +		 size_t align, void *base_addr, int off,
> +		 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),

Don't we want to preserve the order and add the call_site at the end?
Trace events are not ABI, but if we don't have a strong reason to break it,
I'd preserve the old order.

Thanks!

Re: [PATCH v2] percpu: improve percpu_alloc_percpu event trace

Posted by Steven Rostedt 3 years, 11 months ago

On Tue, 10 May 2022 19:33:17 -0700
Roman Gushchin <roman.gushchin@linux.dev> wrote:

>  --- a/include/trace/events/percpu.h
> > +++ b/include/trace/events/percpu.h
> > @@ -6,15 +6,20 @@
> >  #define _TRACE_PERCPU_H
> >  
> >  #include <linux/tracepoint.h>
> > +#include <trace/events/mmflags.h>
> >  
> >  TRACE_EVENT(percpu_alloc_percpu,
> >  
> > -	TP_PROTO(bool reserved, bool is_atomic, size_t size,
> > -		 size_t align, void *base_addr, int off, void __percpu *ptr),
> > +	TP_PROTO(unsigned long call_site,
> > +		 bool reserved, bool is_atomic, size_t size,
> > +		 size_t align, void *base_addr, int off,
> > +		 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),  
> 
> Don't we want to preserve the order and add the call_site at the end?
> Trace events are not ABI, but if we don't have a strong reason to break it,
> I'd preserve the old order.

Ideally everyone should be using libtraceevent which will parse the format
file for the needed entries.

Nothing (important) should be parsing the raw ascii from the trace files.
It's slow and unreliable. The raw format (trace_pipe_raw) files, along with
libtraceevent will handle fining the fields you are looking for, even if
the fields move around (internally or externally).

Then there's trace-cruncher (a python script that uses libtracefs and
libtraceevent) that will work too.

  https://github.com/vmware/trace-cruncher

-- Steve

Re: [PATCH v2] percpu: improve percpu_alloc_percpu event trace

Posted by Vasily Averin 3 years, 12 months ago

On 5/11/22 05:33, Roman Gushchin wrote:
> On Fri, May 06, 2022 at 10:29:25PM +0300, Vasily Averin wrote:
>>  TRACE_EVENT(percpu_alloc_percpu,
>>  
>> -	TP_PROTO(bool reserved, bool is_atomic, size_t size,
>> -		 size_t align, void *base_addr, int off, void __percpu *ptr),
>> +	TP_PROTO(unsigned long call_site,
>> +		 bool reserved, bool is_atomic, size_t size,
>> +		 size_t align, void *base_addr, int off,
>> +		 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),
> 
> Don't we want to preserve the order and add the call_site at the end?
> Trace events are not ABI, but if we don't have a strong reason to break it,
> I'd preserve the old order.

I checked recent trace patches and found that order changes is acceptable.

commit 8c39b8bc82aafcc8dd378bd79c76fac8e8a89c8d
Author: David Howells <dhowells@redhat.com>
Date:   Fri Jan 14 11:44:54 2022 +0000

    cachefiles: Make some tracepoint adjustments

-           TP_printk("o=%08x i=%lx e=%d",
-                     __entry->obj, __entry->ino, __entry->error)
+           TP_printk("o=%08x dB=%lx B=%lx e=%d",
+                     __entry->obj, __entry->dino, __entry->ino, __entry->error)

On the other hand I'm agree to keep old order by default.
that's why I added bytes_alloc and gfp_flags to end of output.
However I think call_site is an exception. In all cases found, 
call_site is output first.
For me personally it simplified output parsing.

So I would like to know Steven's position on this question.

Thank you,
	Vasily Averin

Re: [PATCH v2] percpu: improve percpu_alloc_percpu event trace

Posted by Roman Gushchin 3 years, 12 months ago

On Wed, May 11, 2022 at 08:11:54AM +0300, Vasily Averin wrote:
> On 5/11/22 05:33, Roman Gushchin wrote:
> > On Fri, May 06, 2022 at 10:29:25PM +0300, Vasily Averin wrote:
> >>  TRACE_EVENT(percpu_alloc_percpu,
> >>  
> >> -	TP_PROTO(bool reserved, bool is_atomic, size_t size,
> >> -		 size_t align, void *base_addr, int off, void __percpu *ptr),
> >> +	TP_PROTO(unsigned long call_site,
> >> +		 bool reserved, bool is_atomic, size_t size,
> >> +		 size_t align, void *base_addr, int off,
> >> +		 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),
> > 
> > Don't we want to preserve the order and add the call_site at the end?
> > Trace events are not ABI, but if we don't have a strong reason to break it,
> > I'd preserve the old order.
> 
> I checked recent trace patches and found that order changes is acceptable.
> 
> commit 8c39b8bc82aafcc8dd378bd79c76fac8e8a89c8d
> Author: David Howells <dhowells@redhat.com>
> Date:   Fri Jan 14 11:44:54 2022 +0000
> 
>     cachefiles: Make some tracepoint adjustments
> 
> -           TP_printk("o=%08x i=%lx e=%d",
> -                     __entry->obj, __entry->ino, __entry->error)
> +           TP_printk("o=%08x dB=%lx B=%lx e=%d",
> +                     __entry->obj, __entry->dino, __entry->ino, __entry->error)
> 
> On the other hand I'm agree to keep old order by default.
> that's why I added bytes_alloc and gfp_flags to end of output.
> However I think call_site is an exception. In all cases found, 
> call_site is output first.
> For me personally it simplified output parsing.
> 
> So I would like to know Steven's position on this question.

Ok, not a strong opinion, I think both options are acceptable.

Thanks!