 include/trace/events/percpu.h | 23 +++++++++++++++++------
 mm/percpu-internal.h          |  8 ++++----
 mm/percpu.c                   |  5 +++--
 3 files changed, 24 insertions(+), 12 deletions(-)
Added call_site, bytes_alloc and gfp_flags fields to the output
of the percpu_alloc_percpu ftrace event:
mkdir-4393 [001] 169.334788: percpu_alloc_percpu:
call_site=mem_cgroup_css_alloc+0xa6 reserved=0 is_atomic=0 size=2408 align=8
base_addr=0xffffc7117fc00000 off=402176 ptr=0x3dc867a62300 bytes_alloc=14448
gfp_flags=GFP_KERNEL_ACCOUNT
This is required to track memcg-accounted percpu allocations.
Signed-off-by: Vasily Averin <vvs@openvz.org>
---
v2: added call_site, improved patch description
---
include/trace/events/percpu.h | 23 +++++++++++++++++------
mm/percpu-internal.h | 8 ++++----
mm/percpu.c | 5 +++--
3 files changed, 24 insertions(+), 12 deletions(-)
diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h
index df112a64f6c9..e989cefc0def 100644
--- a/include/trace/events/percpu.h
+++ b/include/trace/events/percpu.h
@@ -6,15 +6,20 @@
#define _TRACE_PERCPU_H
#include <linux/tracepoint.h>
+#include <trace/events/mmflags.h>
TRACE_EVENT(percpu_alloc_percpu,
- TP_PROTO(bool reserved, bool is_atomic, size_t size,
- size_t align, void *base_addr, int off, void __percpu *ptr),
+ TP_PROTO(unsigned long call_site,
+ bool reserved, bool is_atomic, size_t size,
+ size_t align, void *base_addr, int off,
+ void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),
- TP_ARGS(reserved, is_atomic, size, align, base_addr, off, ptr),
+ TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off,
+ ptr, bytes_alloc, gfp_flags),
TP_STRUCT__entry(
+ __field( unsigned long, call_site )
__field( bool, reserved )
__field( bool, is_atomic )
__field( size_t, size )
@@ -22,9 +27,11 @@ TRACE_EVENT(percpu_alloc_percpu,
__field( void *, base_addr )
__field( int, off )
__field( void __percpu *, ptr )
+ __field( size_t, bytes_alloc )
+ __field( gfp_t, gfp_flags )
),
-
TP_fast_assign(
+ __entry->call_site = call_site;
__entry->reserved = reserved;
__entry->is_atomic = is_atomic;
__entry->size = size;
@@ -32,12 +39,16 @@ TRACE_EVENT(percpu_alloc_percpu,
__entry->base_addr = base_addr;
__entry->off = off;
__entry->ptr = ptr;
+ __entry->bytes_alloc = bytes_alloc;
+ __entry->gfp_flags = gfp_flags;
),
- TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p",
+ TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s",
+ (void *)__entry->call_site,
__entry->reserved, __entry->is_atomic,
__entry->size, __entry->align,
- __entry->base_addr, __entry->off, __entry->ptr)
+ __entry->base_addr, __entry->off, __entry->ptr,
+ __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags))
);
TRACE_EVENT(percpu_free_percpu,
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 411d1593ef23..70b1ea23f4d2 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -113,7 +113,6 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
}
-#ifdef CONFIG_MEMCG_KMEM
/**
* pcpu_obj_full_size - helper to calculate size of each accounted object
* @size: size of area to allocate in bytes
@@ -123,13 +122,14 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
*/
static inline size_t pcpu_obj_full_size(size_t size)
{
- size_t extra_size;
+ size_t extra_size = 0;
- extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+#ifdef CONFIG_MEMCG_KMEM
+ extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+#endif
return size * num_possible_cpus() + extra_size;
}
-#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_PERCPU_STATS
diff --git a/mm/percpu.c b/mm/percpu.c
index ea28db283044..3633eeefaa0d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1884,8 +1884,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
kmemleak_alloc_percpu(ptr, size, gfp);
- trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
- chunk->base_addr, off, ptr);
+ trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
+ chunk->base_addr, off, ptr,
+ pcpu_obj_full_size(size), gfp);
pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
--
2.31.1
On Fri, May 06, 2022 at 10:29:25PM +0300, Vasily Averin wrote: > Added call_site, bytes_alloc and gfp_flags fields to the output > of the percpu_alloc_percpu ftrace event: > > mkdir-4393 [001] 169.334788: percpu_alloc_percpu: > call_site=mem_cgroup_css_alloc+0xa6 reserved=0 is_atomic=0 size=2408 align=8 > base_addr=0xffffc7117fc00000 off=402176 ptr=0x3dc867a62300 bytes_alloc=14448 > gfp_flags=GFP_KERNEL_ACCOUNT > > This is required to track memcg-accounted percpu allocations. > > Signed-off-by: Vasily Averin <vvs@openvz.org> Acked-by: Roman Gushchin <roman.gushchin@linux.dev> LGTM, thanks Vasily! One minor thing below. > --- > v2: added call_site, improved patch description > --- > include/trace/events/percpu.h | 23 +++++++++++++++++------ > mm/percpu-internal.h | 8 ++++---- > mm/percpu.c | 5 +++-- > 3 files changed, 24 insertions(+), 12 deletions(-) > > diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h > index df112a64f6c9..e989cefc0def 100644 > --- a/include/trace/events/percpu.h > +++ b/include/trace/events/percpu.h > @@ -6,15 +6,20 @@ > #define _TRACE_PERCPU_H > > #include <linux/tracepoint.h> > +#include <trace/events/mmflags.h> > > TRACE_EVENT(percpu_alloc_percpu, > > - TP_PROTO(bool reserved, bool is_atomic, size_t size, > - size_t align, void *base_addr, int off, void __percpu *ptr), > + TP_PROTO(unsigned long call_site, > + bool reserved, bool is_atomic, size_t size, > + size_t align, void *base_addr, int off, > + void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags), Don't we want to preserve the order and add the call_site at the end? Trace events are not ABI, but if we don't have a strong reason to break it, I'd preserve the old order. Thanks!
On Tue, 10 May 2022 19:33:17 -0700 Roman Gushchin <roman.gushchin@linux.dev> wrote: > --- a/include/trace/events/percpu.h > > +++ b/include/trace/events/percpu.h > > @@ -6,15 +6,20 @@ > > #define _TRACE_PERCPU_H > > > > #include <linux/tracepoint.h> > > +#include <trace/events/mmflags.h> > > > > TRACE_EVENT(percpu_alloc_percpu, > > > > - TP_PROTO(bool reserved, bool is_atomic, size_t size, > > - size_t align, void *base_addr, int off, void __percpu *ptr), > > + TP_PROTO(unsigned long call_site, > > + bool reserved, bool is_atomic, size_t size, > > + size_t align, void *base_addr, int off, > > + void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags), > > Don't we want to preserve the order and add the call_site at the end? > Trace events are not ABI, but if we don't have a strong reason to break it, > I'd preserve the old order. Ideally everyone should be using libtraceevent which will parse the format file for the needed entries. Nothing (important) should be parsing the raw ascii from the trace files. It's slow and unreliable. The raw format (trace_pipe_raw) files, along with libtraceevent will handle fining the fields you are looking for, even if the fields move around (internally or externally). Then there's trace-cruncher (a python script that uses libtracefs and libtraceevent) that will work too. https://github.com/vmware/trace-cruncher -- Steve
On 5/11/22 05:33, Roman Gushchin wrote:
> On Fri, May 06, 2022 at 10:29:25PM +0300, Vasily Averin wrote:
>> TRACE_EVENT(percpu_alloc_percpu,
>>
>> - TP_PROTO(bool reserved, bool is_atomic, size_t size,
>> - size_t align, void *base_addr, int off, void __percpu *ptr),
>> + TP_PROTO(unsigned long call_site,
>> + bool reserved, bool is_atomic, size_t size,
>> + size_t align, void *base_addr, int off,
>> + void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),
>
> Don't we want to preserve the order and add the call_site at the end?
> Trace events are not ABI, but if we don't have a strong reason to break it,
> I'd preserve the old order.
I checked recent trace patches and found that order changes are acceptable.
commit 8c39b8bc82aafcc8dd378bd79c76fac8e8a89c8d
Author: David Howells <dhowells@redhat.com>
Date: Fri Jan 14 11:44:54 2022 +0000
cachefiles: Make some tracepoint adjustments
- TP_printk("o=%08x i=%lx e=%d",
- __entry->obj, __entry->ino, __entry->error)
+ TP_printk("o=%08x dB=%lx B=%lx e=%d",
+ __entry->obj, __entry->dino, __entry->ino, __entry->error)
On the other hand, I agree to keep the old order by default;
that's why I added bytes_alloc and gfp_flags to the end of the output.
However I think call_site is an exception. In all cases found,
call_site is output first.
For me personally it simplified output parsing.
So I would like to know Steven's position on this question.
Thank you,
Vasily Averin
On Wed, May 11, 2022 at 08:11:54AM +0300, Vasily Averin wrote:
> On 5/11/22 05:33, Roman Gushchin wrote:
> > On Fri, May 06, 2022 at 10:29:25PM +0300, Vasily Averin wrote:
> >> TRACE_EVENT(percpu_alloc_percpu,
> >>
> >> - TP_PROTO(bool reserved, bool is_atomic, size_t size,
> >> - size_t align, void *base_addr, int off, void __percpu *ptr),
> >> + TP_PROTO(unsigned long call_site,
> >> + bool reserved, bool is_atomic, size_t size,
> >> + size_t align, void *base_addr, int off,
> >> + void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),
> >
> > Don't we want to preserve the order and add the call_site at the end?
> > Trace events are not ABI, but if we don't have a strong reason to break it,
> > I'd preserve the old order.
>
> I checked recent trace patches and found that order changes is acceptable.
>
> commit 8c39b8bc82aafcc8dd378bd79c76fac8e8a89c8d
> Author: David Howells <dhowells@redhat.com>
> Date: Fri Jan 14 11:44:54 2022 +0000
>
> cachefiles: Make some tracepoint adjustments
>
> - TP_printk("o=%08x i=%lx e=%d",
> - __entry->obj, __entry->ino, __entry->error)
> + TP_printk("o=%08x dB=%lx B=%lx e=%d",
> + __entry->obj, __entry->dino, __entry->ino, __entry->error)
>
> On the other hand I'm agree to keep old order by default.
> that's why I added bytes_alloc and gfp_flags to end of output.
> However I think call_site is an exception. In all cases found,
> call_site is output first.
> For me personally it simplified output parsing.
>
> So I would like to know Steven's position on this question.
Ok, not a strong opinion, I think both options are acceptable.
Thanks!
© 2016 - 2026 Red Hat, Inc.