[PATCH v2 1/4] perf lock: Add bpf maps for owner stack tracing

Chun-Tse Shao posted 4 patches 1 year ago
There is a newer version of this series
[PATCH v2 1/4] perf lock: Add bpf maps for owner stack tracing
Posted by Chun-Tse Shao 1 year ago
Add a few BPF maps in order to trace the owner stack.

Signed-off-by: Chun-Tse Shao <ctshao@google.com>
---
 tools/perf/util/bpf_lock_contention.c         | 17 ++++++--
 .../perf/util/bpf_skel/lock_contention.bpf.c  | 40 +++++++++++++++++--
 tools/perf/util/bpf_skel/lock_data.h          |  6 +++
 3 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index 41a1ad087895..c9c58f243ceb 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -41,9 +41,20 @@ int lock_contention_prepare(struct lock_contention *con)
 	else
 		bpf_map__set_max_entries(skel->maps.task_data, 1);
 
-	if (con->save_callstack)
-		bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
-	else
+	if (con->save_callstack) {
+		bpf_map__set_max_entries(skel->maps.stacks,
+					 con->map_nr_entries);
+		if (con->owner) {
+			bpf_map__set_value_size(skel->maps.owner_stacks_entries,
+						con->max_stack * sizeof(u64));
+			bpf_map__set_value_size(
+				skel->maps.contention_owner_stacks,
+				con->max_stack * sizeof(u64));
+			bpf_map__set_key_size(skel->maps.owner_lock_stat,
+						con->max_stack * sizeof(u64));
+			skel->rodata->max_stack = con->max_stack;
+		}
+	} else
 		bpf_map__set_max_entries(skel->maps.stacks, 1);
 
 	if (target__has_cpu(target)) {
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 1069bda5d733..05da19fdab23 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -19,13 +19,37 @@
 #define LCB_F_PERCPU	(1U << 4)
 #define LCB_F_MUTEX	(1U << 5)
 
-/* callstack storage  */
+ /* tmp buffer for owner callstack */
 struct {
-	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 	__uint(key_size, sizeof(__u32));
 	__uint(value_size, sizeof(__u64));
+	__uint(max_entries, 1);
+} owner_stacks_entries SEC(".maps");
+
+/* a map for tracing lock address to owner data */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64)); // lock address
+	__uint(value_size, sizeof(cotd));
 	__uint(max_entries, MAX_ENTRIES);
-} stacks SEC(".maps");
+} contention_owner_tracing SEC(".maps");
+
+/* a map for tracing lock address to owner stacktrace */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64)); // lock address
+	__uint(value_size, sizeof(__u64)); // straktrace
+	__uint(max_entries, MAX_ENTRIES);
+} contention_owner_stacks SEC(".maps");
+
+/* owner callstack to contention data storage */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64));
+	__uint(value_size, sizeof(struct contention_data));
+	__uint(max_entries, MAX_ENTRIES);
+} owner_lock_stat SEC(".maps");
 
 /* maintain timestamp at the beginning of contention */
 struct {
@@ -43,6 +67,14 @@ struct {
 	__uint(max_entries, 1);
 } tstamp_cpu SEC(".maps");
 
+/* callstack storage  */
+struct {
+	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u64));
+	__uint(max_entries, MAX_ENTRIES);
+} stacks SEC(".maps");
+
 /* actual lock contention statistics */
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
@@ -126,6 +158,7 @@ const volatile int needs_callstack;
 const volatile int stack_skip;
 const volatile int lock_owner;
 const volatile int use_cgroup_v2;
+const volatile int max_stack;
 
 /* determine the key of lock stat */
 const volatile int aggr_mode;
@@ -436,7 +469,6 @@ int contention_end(u64 *ctx)
 			return 0;
 		need_delete = true;
 	}
-
 	duration = bpf_ktime_get_ns() - pelem->timestamp;
 	if ((__s64)duration < 0) {
 		__sync_fetch_and_add(&time_fail, 1);
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index de12892f992f..1ef0bca9860e 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -3,6 +3,12 @@
 #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
 #define UTIL_BPF_SKEL_LOCK_DATA_H
 
+typedef struct contention_owner_tracing_data {
+	u32 pid; // Who has the lock.
+	u64 timestamp; // The time while the owner acquires lock and contention is going on.
+	u32 count; // How many waiters for this lock.
+} cotd;
+
 struct tstamp_data {
 	u64 timestamp;
 	u64 lock;
-- 
2.47.1.688.g23fc6f90ad-goog
Re: [PATCH v2 1/4] perf lock: Add bpf maps for owner stack tracing
Posted by Namhyung Kim 1 year ago
Hello,

On Sun, Jan 12, 2025 at 09:20:14PM -0800, Chun-Tse Shao wrote:
> Add few bpf maps in order to tracing owner stack.

If you want to split this code as a separate commit, I think you'd
better explain what these maps do and why you need them.

> 
> Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> ---
>  tools/perf/util/bpf_lock_contention.c         | 17 ++++++--
>  .../perf/util/bpf_skel/lock_contention.bpf.c  | 40 +++++++++++++++++--
>  tools/perf/util/bpf_skel/lock_data.h          |  6 +++
>  3 files changed, 56 insertions(+), 7 deletions(-)
> 
> diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> index 41a1ad087895..c9c58f243ceb 100644
> --- a/tools/perf/util/bpf_lock_contention.c
> +++ b/tools/perf/util/bpf_lock_contention.c
> @@ -41,9 +41,20 @@ int lock_contention_prepare(struct lock_contention *con)
>  	else
>  		bpf_map__set_max_entries(skel->maps.task_data, 1);
>  
> -	if (con->save_callstack)
> -		bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
> -	else
> +	if (con->save_callstack) {
> +		bpf_map__set_max_entries(skel->maps.stacks,
> +					 con->map_nr_entries);
> +		if (con->owner) {
> +			bpf_map__set_value_size(skel->maps.owner_stacks_entries,
> +						con->max_stack * sizeof(u64));
> +			bpf_map__set_value_size(
> +				skel->maps.contention_owner_stacks,
> +				con->max_stack * sizeof(u64));
> +			bpf_map__set_key_size(skel->maps.owner_lock_stat,
> +						con->max_stack * sizeof(u64));
> +			skel->rodata->max_stack = con->max_stack;
> +		}
> +	} else
>  		bpf_map__set_max_entries(skel->maps.stacks, 1);
>  
>  	if (target__has_cpu(target)) {
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 1069bda5d733..05da19fdab23 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -19,13 +19,37 @@
>  #define LCB_F_PERCPU	(1U << 4)
>  #define LCB_F_MUTEX	(1U << 5)
>  

Can we rename these to something shorter and save some typing?

> -/* callstack storage  */
> + /* tmp buffer for owner callstack */
>  struct {
> -	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
> +	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
>  	__uint(key_size, sizeof(__u32));
>  	__uint(value_size, sizeof(__u64));
> +	__uint(max_entries, 1);
> +} owner_stacks_entries SEC(".maps");

I think this can be 'stack_buf'.

> +
> +/* a map for tracing lock address to owner data */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_HASH);
> +	__uint(key_size, sizeof(__u64)); // lock address
> +	__uint(value_size, sizeof(cotd));
>  	__uint(max_entries, MAX_ENTRIES);
> -} stacks SEC(".maps");
> +} contention_owner_tracing SEC(".maps");

owner_data.

> +
> +/* a map for tracing lock address to owner stacktrace */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_HASH);
> +	__uint(key_size, sizeof(__u64)); // lock address
> +	__uint(value_size, sizeof(__u64)); // straktrace

Typo.

> +	__uint(max_entries, MAX_ENTRIES);
> +} contention_owner_stacks SEC(".maps");

owner_stack.

> +
> +/* owner callstack to contention data storage */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_HASH);
> +	__uint(key_size, sizeof(__u64));
> +	__uint(value_size, sizeof(struct contention_data));
> +	__uint(max_entries, MAX_ENTRIES);
> +} owner_lock_stat SEC(".maps");

owner_stat.  What do you think?

By the way, I have an idea for implementing a stackid map in BPF using a
hash map.  For the owner stack, you can use the stacktrace as the key and
make the value a unique integer.  Then the return value can be used as a
stack id (like the one from bpf_get_stackid) for the owner_data and owner_stat.

Something like:

  s32 get_stack_id(struct owner_stack *owner_stack, u64 stacktrace[])
  {
	s32 *id, new_id;
	static s32 id_gen = 1;

	id = bpf_map_lookup_elem(owner_stack, stacktrace);
	if (id)
		return *id;
	
	new_id = __sync_fetch_and_add(&id_gen, 1);
	bpf_map_update_elem(owner_stack, stacktrace, &new_id, BPF_NOEXIST);

	id = bpf_map_lookup_elem(owner_stack, stacktrace);
	if (id)
		return *id;
	
	return -1;
  }

Later, in user space, you can traverse the owner_stack map to build
reverse mapping from id to stacktrace.

>  
>  /* maintain timestamp at the beginning of contention */
>  struct {
> @@ -43,6 +67,14 @@ struct {
>  	__uint(max_entries, 1);
>  } tstamp_cpu SEC(".maps");
>  
> +/* callstack storage  */
> +struct {
> +	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
> +	__uint(key_size, sizeof(__u32));
> +	__uint(value_size, sizeof(__u64));
> +	__uint(max_entries, MAX_ENTRIES);
> +} stacks SEC(".maps");
> +
>  /* actual lock contention statistics */
>  struct {
>  	__uint(type, BPF_MAP_TYPE_HASH);
> @@ -126,6 +158,7 @@ const volatile int needs_callstack;
>  const volatile int stack_skip;
>  const volatile int lock_owner;
>  const volatile int use_cgroup_v2;
> +const volatile int max_stack;
>  
>  /* determine the key of lock stat */
>  const volatile int aggr_mode;
> @@ -436,7 +469,6 @@ int contention_end(u64 *ctx)
>  			return 0;
>  		need_delete = true;
>  	}
> -
>  	duration = bpf_ktime_get_ns() - pelem->timestamp;
>  	if ((__s64)duration < 0) {
>  		__sync_fetch_and_add(&time_fail, 1);
> diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
> index de12892f992f..1ef0bca9860e 100644
> --- a/tools/perf/util/bpf_skel/lock_data.h
> +++ b/tools/perf/util/bpf_skel/lock_data.h
> @@ -3,6 +3,12 @@
>  #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
>  #define UTIL_BPF_SKEL_LOCK_DATA_H
>  
> +typedef struct contention_owner_tracing_data {
> +	u32 pid; // Who has the lock.
> +	u64 timestamp; // The time while the owner acquires lock and contention is going on.
> +	u32 count; // How many waiters for this lock.

Switching the order of timestamp and count would remove padding.

> +} cotd;

Usually we don't use typedef to remove the struct tag.

Thanks,
Namhyung

> +
>  struct tstamp_data {
>  	u64 timestamp;
>  	u64 lock;
> -- 
> 2.47.1.688.g23fc6f90ad-goog
>
Re: [PATCH v2 1/4] perf lock: Add bpf maps for owner stack tracing
Posted by Chun-Tse Shao 1 year ago
Hi Namhyung, thanks for your reply!

On Mon, Jan 13, 2025 at 7:05 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hello,
>
> On Sun, Jan 12, 2025 at 09:20:14PM -0800, Chun-Tse Shao wrote:
> > Add few bpf maps in order to tracing owner stack.
>
> If you want to split this code as a separate commit, I think you'd
> better explain what these maps do and why you need them.
>
> >
> > Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> > ---
> >  tools/perf/util/bpf_lock_contention.c         | 17 ++++++--
> >  .../perf/util/bpf_skel/lock_contention.bpf.c  | 40 +++++++++++++++++--
> >  tools/perf/util/bpf_skel/lock_data.h          |  6 +++
> >  3 files changed, 56 insertions(+), 7 deletions(-)
> >
> > diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> > index 41a1ad087895..c9c58f243ceb 100644
> > --- a/tools/perf/util/bpf_lock_contention.c
> > +++ b/tools/perf/util/bpf_lock_contention.c
> > @@ -41,9 +41,20 @@ int lock_contention_prepare(struct lock_contention *con)
> >       else
> >               bpf_map__set_max_entries(skel->maps.task_data, 1);
> >
> > -     if (con->save_callstack)
> > -             bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
> > -     else
> > +     if (con->save_callstack) {
> > +             bpf_map__set_max_entries(skel->maps.stacks,
> > +                                      con->map_nr_entries);
> > +             if (con->owner) {
> > +                     bpf_map__set_value_size(skel->maps.owner_stacks_entries,
> > +                                             con->max_stack * sizeof(u64));
> > +                     bpf_map__set_value_size(
> > +                             skel->maps.contention_owner_stacks,
> > +                             con->max_stack * sizeof(u64));
> > +                     bpf_map__set_key_size(skel->maps.owner_lock_stat,
> > +                                             con->max_stack * sizeof(u64));
> > +                     skel->rodata->max_stack = con->max_stack;
> > +             }
> > +     } else
> >               bpf_map__set_max_entries(skel->maps.stacks, 1);
> >
> >       if (target__has_cpu(target)) {
> > diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > index 1069bda5d733..05da19fdab23 100644
> > --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > @@ -19,13 +19,37 @@
> >  #define LCB_F_PERCPU (1U << 4)
> >  #define LCB_F_MUTEX  (1U << 5)
> >
>
> Can we rename these shorter and save some typings?

I tend to use longer variable names with full descriptions and some
easy-to-understand abbreviations. Would a shorter name be preferable
in the Linux kernel?

>
> > -/* callstack storage  */
> > + /* tmp buffer for owner callstack */
> >  struct {
> > -     __uint(type, BPF_MAP_TYPE_STACK_TRACE);
> > +     __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> >       __uint(key_size, sizeof(__u32));
> >       __uint(value_size, sizeof(__u64));
> > +     __uint(max_entries, 1);
> > +} owner_stacks_entries SEC(".maps");
>
> I think this can be 'stack_buf'.
>
> > +
> > +/* a map for tracing lock address to owner data */
> > +struct {
> > +     __uint(type, BPF_MAP_TYPE_HASH);
> > +     __uint(key_size, sizeof(__u64)); // lock address
> > +     __uint(value_size, sizeof(cotd));
> >       __uint(max_entries, MAX_ENTRIES);
> > -} stacks SEC(".maps");
> > +} contention_owner_tracing SEC(".maps");
>
> owner_data.
>
> > +
> > +/* a map for tracing lock address to owner stacktrace */
> > +struct {
> > +     __uint(type, BPF_MAP_TYPE_HASH);
> > +     __uint(key_size, sizeof(__u64)); // lock address
> > +     __uint(value_size, sizeof(__u64)); // straktrace
>
> Typo.
>
> > +     __uint(max_entries, MAX_ENTRIES);
> > +} contention_owner_stacks SEC(".maps");
>
> owner_stack.
>
> > +
> > +/* owner callstack to contention data storage */
> > +struct {
> > +     __uint(type, BPF_MAP_TYPE_HASH);
> > +     __uint(key_size, sizeof(__u64));
> > +     __uint(value_size, sizeof(struct contention_data));
> > +     __uint(max_entries, MAX_ENTRIES);
> > +} owner_lock_stat SEC(".maps");
>
> owner_stat.  What do you think?
>
> By the way, I got an idea to implement stackid map in BPF using hash
> map.  For owner stack, you can use the stacktrace as a key and make a
> value an unique integer.  Then the return value can be used as a stack
> id (like from bpf_get_stackid) for the owner_data and owner_stat.
>
> Something like:
>
>   s32 get_stack_id(struct owner_stack *owner_stack, u64 stacktrace[])
>   {
>         s32 *id, new_id;
>         static s32 id_gen = 1;
>
>         id = bpf_map_lookup_elem(owner_stack, stacktrace);
>         if (id)
>                 return *id;
>
>         new_id = __sync_fetch_and_add(&id_gen, 1);
>         bpf_map_update_elem(owner_stack, stacktrace, &new_id, BPF_NOEXIST);
>
>         id = bpf_map_lookup_elem(owner_stack, stacktrace);
>         if (id)
>                 return *id;
>
>         return -1;
>   }
>
> Later, in user space, you can traverse the owner_stack map to build
> reverse mapping from id to stacktrace.

I wonder if stack_id is necessary here. So far I have three bpf maps.
2 bpf maps for tracing owner stack on given lock address in bpf program:
  key: lock_address, value: a struct for tracing owner pid, count of
waiters and contention begin timestamp.
  key: lock_address, value: owner stack, which is variable length so I
have to put it in a separate bpf map.

1 bpf map for reporting owner stack in user mode:
  key: owner stack, value: struct lock_stat.

With stackid I think there will still be 3 bpf maps, one for
lock_address to owner's info with stackid, one for stackid to stack,
and one for contention_key (has stackid inside) to lock_stat. I think
it is just another way to implement and does not simplify the
implementation. WDYT?

>
> >
> >  /* maintain timestamp at the beginning of contention */
> >  struct {
> > @@ -43,6 +67,14 @@ struct {
> >       __uint(max_entries, 1);
> >  } tstamp_cpu SEC(".maps");
> >
> > +/* callstack storage  */
> > +struct {
> > +     __uint(type, BPF_MAP_TYPE_STACK_TRACE);
> > +     __uint(key_size, sizeof(__u32));
> > +     __uint(value_size, sizeof(__u64));
> > +     __uint(max_entries, MAX_ENTRIES);
> > +} stacks SEC(".maps");
> > +
> >  /* actual lock contention statistics */
> >  struct {
> >       __uint(type, BPF_MAP_TYPE_HASH);
> > @@ -126,6 +158,7 @@ const volatile int needs_callstack;
> >  const volatile int stack_skip;
> >  const volatile int lock_owner;
> >  const volatile int use_cgroup_v2;
> > +const volatile int max_stack;
> >
> >  /* determine the key of lock stat */
> >  const volatile int aggr_mode;
> > @@ -436,7 +469,6 @@ int contention_end(u64 *ctx)
> >                       return 0;
> >               need_delete = true;
> >       }
> > -
> >       duration = bpf_ktime_get_ns() - pelem->timestamp;
> >       if ((__s64)duration < 0) {
> >               __sync_fetch_and_add(&time_fail, 1);
> > diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
> > index de12892f992f..1ef0bca9860e 100644
> > --- a/tools/perf/util/bpf_skel/lock_data.h
> > +++ b/tools/perf/util/bpf_skel/lock_data.h
> > @@ -3,6 +3,12 @@
> >  #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
> >  #define UTIL_BPF_SKEL_LOCK_DATA_H
> >
> > +typedef struct contention_owner_tracing_data {
> > +     u32 pid; // Who has the lock.
> > +     u64 timestamp; // The time while the owner acquires lock and contention is going on.
> > +     u32 count; // How many waiters for this lock.
>
> Switching the order of timestamp and count would remove padding.

Thanks for the nit!

>
> > +} cotd;
>
> Usually we don't use typedef to remove the struct tag.
>
> Thanks,
> Namhyung
>
> > +
> >  struct tstamp_data {
> >       u64 timestamp;
> >       u64 lock;
> > --
> > 2.47.1.688.g23fc6f90ad-goog
> >
Re: [PATCH v2 1/4] perf lock: Add bpf maps for owner stack tracing
Posted by Namhyung Kim 1 year ago
On Tue, Jan 21, 2025 at 02:02:57PM -0800, Chun-Tse Shao wrote:
> Hi Namhyung, thanks for your reply!
> 
> On Mon, Jan 13, 2025 at 7:05 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > Hello,
> >
> > On Sun, Jan 12, 2025 at 09:20:14PM -0800, Chun-Tse Shao wrote:
> > > Add few bpf maps in order to tracing owner stack.
> >
> > If you want to split this code as a separate commit, I think you'd
> > better explain what these maps do and why you need them.
> >
> > >
> > > Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> > > ---
> > >  tools/perf/util/bpf_lock_contention.c         | 17 ++++++--
> > >  .../perf/util/bpf_skel/lock_contention.bpf.c  | 40 +++++++++++++++++--
> > >  tools/perf/util/bpf_skel/lock_data.h          |  6 +++
> > >  3 files changed, 56 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> > > index 41a1ad087895..c9c58f243ceb 100644
> > > --- a/tools/perf/util/bpf_lock_contention.c
> > > +++ b/tools/perf/util/bpf_lock_contention.c
> > > @@ -41,9 +41,20 @@ int lock_contention_prepare(struct lock_contention *con)
> > >       else
> > >               bpf_map__set_max_entries(skel->maps.task_data, 1);
> > >
> > > -     if (con->save_callstack)
> > > -             bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
> > > -     else
> > > +     if (con->save_callstack) {
> > > +             bpf_map__set_max_entries(skel->maps.stacks,
> > > +                                      con->map_nr_entries);
> > > +             if (con->owner) {
> > > +                     bpf_map__set_value_size(skel->maps.owner_stacks_entries,
> > > +                                             con->max_stack * sizeof(u64));
> > > +                     bpf_map__set_value_size(
> > > +                             skel->maps.contention_owner_stacks,
> > > +                             con->max_stack * sizeof(u64));
> > > +                     bpf_map__set_key_size(skel->maps.owner_lock_stat,
> > > +                                             con->max_stack * sizeof(u64));
> > > +                     skel->rodata->max_stack = con->max_stack;
> > > +             }
> > > +     } else
> > >               bpf_map__set_max_entries(skel->maps.stacks, 1);
> > >
> > >       if (target__has_cpu(target)) {
> > > diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > > index 1069bda5d733..05da19fdab23 100644
> > > --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > > +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > > @@ -19,13 +19,37 @@
> > >  #define LCB_F_PERCPU (1U << 4)
> > >  #define LCB_F_MUTEX  (1U << 5)
> > >
> >
> > Can we rename these shorter and save some typings?
> 
> I tend to use longer variable names with full descriptions with some
> easy to understand abbreviations. Would a shorter name be preferable
> in Linux kernel?

Well, I think it's a matter of preference.  I don't know what others
think, but I prefer shorter names.

> 
> >
> > > -/* callstack storage  */
> > > + /* tmp buffer for owner callstack */
> > >  struct {
> > > -     __uint(type, BPF_MAP_TYPE_STACK_TRACE);
> > > +     __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> > >       __uint(key_size, sizeof(__u32));
> > >       __uint(value_size, sizeof(__u64));
> > > +     __uint(max_entries, 1);
> > > +} owner_stacks_entries SEC(".maps");
> >
> > I think this can be 'stack_buf'.
> >
> > > +
> > > +/* a map for tracing lock address to owner data */
> > > +struct {
> > > +     __uint(type, BPF_MAP_TYPE_HASH);
> > > +     __uint(key_size, sizeof(__u64)); // lock address
> > > +     __uint(value_size, sizeof(cotd));
> > >       __uint(max_entries, MAX_ENTRIES);
> > > -} stacks SEC(".maps");
> > > +} contention_owner_tracing SEC(".maps");
> >
> > owner_data.
> >
> > > +
> > > +/* a map for tracing lock address to owner stacktrace */
> > > +struct {
> > > +     __uint(type, BPF_MAP_TYPE_HASH);
> > > +     __uint(key_size, sizeof(__u64)); // lock address
> > > +     __uint(value_size, sizeof(__u64)); // straktrace
> >
> > Typo.
> >
> > > +     __uint(max_entries, MAX_ENTRIES);
> > > +} contention_owner_stacks SEC(".maps");
> >
> > owner_stack.
> >
> > > +
> > > +/* owner callstack to contention data storage */
> > > +struct {
> > > +     __uint(type, BPF_MAP_TYPE_HASH);
> > > +     __uint(key_size, sizeof(__u64));
> > > +     __uint(value_size, sizeof(struct contention_data));
> > > +     __uint(max_entries, MAX_ENTRIES);
> > > +} owner_lock_stat SEC(".maps");
> >
> > owner_stat.  What do you think?
> >
> > By the way, I got an idea to implement stackid map in BPF using hash
> > map.  For owner stack, you can use the stacktrace as a key and make a
> > value an unique integer.  Then the return value can be used as a stack
> > id (like from bpf_get_stackid) for the owner_data and owner_stat.
> >
> > Something like:
> >
> >   s32 get_stack_id(struct owner_stack *owner_stack, u64 stacktrace[])
> >   {
> >         s32 *id, new_id;
> >         static s32 id_gen = 1;
> >
> >         id = bpf_map_lookup_elem(owner_stack, stacktrace);
> >         if (id)
> >                 return *id;
> >
> >         new_id = __sync_fetch_and_add(&id_gen, 1);
> >         bpf_map_update_elem(owner_stack, stacktrace, &new_id, BPF_NOEXIST);
> >
> >         id = bpf_map_lookup_elem(owner_stack, stacktrace);
> >         if (id)
> >                 return *id;
> >
> >         return -1;
> >   }
> >
> > Later, in user space, you can traverse the owner_stack map to build
> > reverse mapping from id to stacktrace.
> 
> I wonder if stack_id is necessary here. So far I have three bpf maps.
> 2 bpf maps for tracing owner stack on given lock address in bpf program:
>   key: lock_address, value: a struct for tracing owner pid, count of
> waiters and contention begin timestamp.
>   key: lock_address, value: owner stack, which is variable length so I
> have to put it in a separate bpf map.
> 
> 1 bpf map for reporting owner stack in user mode:
>   key: owner stack, value: struct lock_stat.
> 
> With stackid I think there will still be 3 bpf maps, one for
> lock_address to owner's info with stackid, one for stackid to stack,
> and one for contention_key (has stackid inside) to lock_stat. I think
> it is just another way to implement and does not simplify the
> implementation. WDYT?

With stackid, I think we can have these 3 maps:
* key: stack-trace, value: stack-id
* key: lock-addr, value: owner-pid, counter, timestamp, stack-id
* key: stack-id, value: struct lock_stat

Then at least you can save some space for the key in the 3rd map (and
simplify the comparison of the key).

> 
> >
> > >
> > >  /* maintain timestamp at the beginning of contention */
> > >  struct {
> > > @@ -43,6 +67,14 @@ struct {
> > >       __uint(max_entries, 1);
> > >  } tstamp_cpu SEC(".maps");
> > >
> > > +/* callstack storage  */
> > > +struct {
> > > +     __uint(type, BPF_MAP_TYPE_STACK_TRACE);
> > > +     __uint(key_size, sizeof(__u32));
> > > +     __uint(value_size, sizeof(__u64));
> > > +     __uint(max_entries, MAX_ENTRIES);
> > > +} stacks SEC(".maps");
> > > +
> > >  /* actual lock contention statistics */
> > >  struct {
> > >       __uint(type, BPF_MAP_TYPE_HASH);
> > > @@ -126,6 +158,7 @@ const volatile int needs_callstack;
> > >  const volatile int stack_skip;
> > >  const volatile int lock_owner;
> > >  const volatile int use_cgroup_v2;
> > > +const volatile int max_stack;
> > >
> > >  /* determine the key of lock stat */
> > >  const volatile int aggr_mode;
> > > @@ -436,7 +469,6 @@ int contention_end(u64 *ctx)
> > >                       return 0;
> > >               need_delete = true;
> > >       }
> > > -
> > >       duration = bpf_ktime_get_ns() - pelem->timestamp;
> > >       if ((__s64)duration < 0) {
> > >               __sync_fetch_and_add(&time_fail, 1);
> > > diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
> > > index de12892f992f..1ef0bca9860e 100644
> > > --- a/tools/perf/util/bpf_skel/lock_data.h
> > > +++ b/tools/perf/util/bpf_skel/lock_data.h
> > > @@ -3,6 +3,12 @@
> > >  #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
> > >  #define UTIL_BPF_SKEL_LOCK_DATA_H
> > >
> > > +typedef struct contention_owner_tracing_data {
> > > +     u32 pid; // Who has the lock.
> > > +     u64 timestamp; // The time while the owner acquires lock and contention is going on.
> > > +     u32 count; // How many waiters for this lock.
> >
> > Switching the order of timestamp and count would remove padding.
> 
> Thanks for the nit!

No problem. :)

Thanks,
Namhyung

> 
> >
> > > +} cotd;
> >
> > Usually we don't use typedef to remove the struct tag.
> >
> > Thanks,
> > Namhyung
> >
> > > +
> > >  struct tstamp_data {
> > >       u64 timestamp;
> > >       u64 lock;
> > > --
> > > 2.47.1.688.g23fc6f90ad-goog
> > >