[PATCH v2 2/4] perf lock: Retrieve owner callstack in bpf program

Chun-Tse Shao posted 4 patches 1 year ago
There is a newer version of this series
[PATCH v2 2/4] perf lock: Retrieve owner callstack in bpf program
Posted by Chun-Tse Shao 1 year ago
Tracing owner callstack in `contention_begin()` and `contention_end()`,
and storing in `owner_lock_stat` bpf map.

Signed-off-by: Chun-Tse Shao <ctshao@google.com>
---
 .../perf/util/bpf_skel/lock_contention.bpf.c  | 152 +++++++++++++++++-
 1 file changed, 151 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 05da19fdab23..3f47fbfa237c 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -7,6 +7,7 @@
 #include <asm-generic/errno-base.h>
 
 #include "lock_data.h"
+#include <time.h>
 
 /* for collect_lock_syms().  4096 was rejected by the verifier */
 #define MAX_CPUS  1024
@@ -178,6 +179,9 @@ int data_fail;
 int task_map_full;
 int data_map_full;
 
+struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+
 static inline __u64 get_current_cgroup_id(void)
 {
 	struct task_struct *task;
@@ -407,6 +411,60 @@ int contention_begin(u64 *ctx)
 	pelem->flags = (__u32)ctx[1];
 
 	if (needs_callstack) {
+		u32 i = 0;
+		int owner_pid;
+		unsigned long *entries;
+		struct task_struct *task;
+		cotd *data;
+
+		if (!lock_owner)
+			goto contention_begin_skip_owner_callstack;
+
+		task = get_lock_owner(pelem->lock, pelem->flags);
+		if (!task)
+			goto contention_begin_skip_owner_callstack;
+
+		owner_pid = BPF_CORE_READ(task, pid);
+
+		entries = bpf_map_lookup_elem(&owner_stacks_entries, &i);
+		if (!entries)
+			goto contention_begin_skip_owner_callstack;
+		for (i = 0; i < max_stack; i++)
+			entries[i] = 0x0;
+
+		task = bpf_task_from_pid(owner_pid);
+		if (task) {
+			bpf_get_task_stack(task, entries,
+					   max_stack * sizeof(unsigned long),
+					   0);
+			bpf_task_release(task);
+		}
+
+		data = bpf_map_lookup_elem(&contention_owner_tracing,
+					   &(pelem->lock));
+
+		// Contention just happens, or corner case `lock` is owned by
+		// process not `owner_pid`.
+		if (!data || data->pid != owner_pid) {
+			cotd first = {
+				.pid = owner_pid,
+				.timestamp = pelem->timestamp,
+				.count = 1,
+			};
+			bpf_map_update_elem(&contention_owner_tracing,
+					    &(pelem->lock), &first, BPF_ANY);
+			bpf_map_update_elem(&contention_owner_stacks,
+					    &(pelem->lock), entries, BPF_ANY);
+		}
+		// Contention is going on and new waiter joins.
+		else {
+			__sync_fetch_and_add(&data->count, 1);
+			// TODO: Since for owner the callstack would change at
+			// different time, We should check and report if the
+			// callstack is different with the recorded one in
+			// `contention_owner_stacks`.
+		}
+contention_begin_skip_owner_callstack:
 		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
 						  BPF_F_FAST_STACK_CMP | stack_skip);
 		if (pelem->stack_id < 0)
@@ -443,6 +501,7 @@ int contention_end(u64 *ctx)
 	struct tstamp_data *pelem;
 	struct contention_key key = {};
 	struct contention_data *data;
+	__u64 timestamp;
 	__u64 duration;
 	bool need_delete = false;
 
@@ -469,12 +528,103 @@ int contention_end(u64 *ctx)
 			return 0;
 		need_delete = true;
 	}
-	duration = bpf_ktime_get_ns() - pelem->timestamp;
+	timestamp = bpf_ktime_get_ns();
+	duration = timestamp - pelem->timestamp;
 	if ((__s64)duration < 0) {
 		__sync_fetch_and_add(&time_fail, 1);
 		goto out;
 	}
 
+	if (needs_callstack && lock_owner) {
+		u64 owner_contention_time;
+		unsigned long *owner_stack;
+		struct contention_data *cdata;
+		cotd *otdata;
+
+		otdata = bpf_map_lookup_elem(&contention_owner_tracing,
+					     &(pelem->lock));
+		owner_stack = bpf_map_lookup_elem(&contention_owner_stacks,
+						  &(pelem->lock));
+		if (!otdata || !owner_stack)
+			goto contention_end_skip_owner_callstack;
+
+		owner_contention_time = timestamp - otdata->timestamp;
+
+		// Update `owner_lock_stat` if `owner_stack` is
+		// available.
+		if (owner_stack[0] != 0x0) {
+			cdata = bpf_map_lookup_elem(&owner_lock_stat,
+						    owner_stack);
+			if (!cdata) {
+				struct contention_data first = {
+					.total_time = owner_contention_time,
+					.max_time = owner_contention_time,
+					.min_time = owner_contention_time,
+					.count = 1,
+					.flags = pelem->flags,
+				};
+				bpf_map_update_elem(&owner_lock_stat,
+						    owner_stack, &first,
+						    BPF_ANY);
+			} else {
+				__sync_fetch_and_add(&cdata->total_time,
+						     owner_contention_time);
+				__sync_fetch_and_add(&cdata->count, 1);
+
+				/* FIXME: need atomic operations */
+				if (cdata->max_time < owner_contention_time)
+					cdata->max_time = owner_contention_time;
+				if (cdata->min_time > owner_contention_time)
+					cdata->min_time = owner_contention_time;
+			}
+		}
+
+		//  No contention is going on, delete `lock` in
+		//  `contention_owner_tracing` and
+		//  `contention_owner_stacks`
+		if (otdata->count <= 1) {
+			bpf_map_delete_elem(&contention_owner_tracing,
+					    &(pelem->lock));
+			bpf_map_delete_elem(&contention_owner_stacks,
+					    &(pelem->lock));
+		}
+		// Contention is still going on, with a new owner
+		// (current task). `otdata` should be updated accordingly.
+		else {
+			(otdata->count)--;
+
+			// If ctx[1] is not 0, the current task terminates lock
+			// waiting without acquiring it. Owner is not changed.
+			if (ctx[1] == 0) {
+				u32 i = 0;
+				unsigned long *entries = bpf_map_lookup_elem(
+					&owner_stacks_entries, &i);
+				if (entries) {
+					for (i = 0; i < (u32)max_stack; i++)
+						entries[i] = 0x0;
+
+					bpf_get_task_stack(
+						bpf_get_current_task_btf(),
+						entries,
+						max_stack *
+							sizeof(unsigned long),
+						0);
+					bpf_map_update_elem(
+						&contention_owner_stacks,
+						&(pelem->lock), entries,
+						BPF_ANY);
+				}
+
+				otdata->pid = pid;
+				otdata->timestamp = timestamp;
+			}
+
+			bpf_map_update_elem(&contention_owner_tracing,
+					    &(pelem->lock), otdata, BPF_ANY);
+		}
+	}
+contention_end_skip_owner_callstack:
+
 	switch (aggr_mode) {
 	case LOCK_AGGR_CALLER:
 		key.stack_id = pelem->stack_id;
-- 
2.47.1.688.g23fc6f90ad-goog
Re: [PATCH v2 2/4] perf lock: Retrieve owner callstack in bpf program
Posted by Namhyung Kim 1 year ago
On Sun, Jan 12, 2025 at 09:20:15PM -0800, Chun-Tse Shao wrote:
> Tracing owner callstack in `contention_begin()` and `contention_end()`,
> and storing in `owner_lock_stat` bpf map.
> 
> Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> ---
>  .../perf/util/bpf_skel/lock_contention.bpf.c  | 152 +++++++++++++++++-
>  1 file changed, 151 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 05da19fdab23..3f47fbfa237c 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -7,6 +7,7 @@
>  #include <asm-generic/errno-base.h>
>  
>  #include "lock_data.h"
> +#include <time.h>
>  
>  /* for collect_lock_syms().  4096 was rejected by the verifier */
>  #define MAX_CPUS  1024
> @@ -178,6 +179,9 @@ int data_fail;
>  int task_map_full;
>  int data_map_full;
>  
> +struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
> +void bpf_task_release(struct task_struct *p) __ksym;

To support old (ancient?) kernels, you can declare them as __weak and
check if one of them is defined and ignore owner stacks on them.  Also
you can check them in user space and turn off the option before loading.

> +
>  static inline __u64 get_current_cgroup_id(void)
>  {
>  	struct task_struct *task;
> @@ -407,6 +411,60 @@ int contention_begin(u64 *ctx)
>  	pelem->flags = (__u32)ctx[1];
>  
>  	if (needs_callstack) {
> +		u32 i = 0;
> +		int owner_pid;
> +		unsigned long *entries;
> +		struct task_struct *task;
> +		cotd *data;
> +
> +		if (!lock_owner)
> +			goto contention_begin_skip_owner_callstack;

Can be it 'skip_owner'?

> +
> +		task = get_lock_owner(pelem->lock, pelem->flags);
> +		if (!task)
> +			goto contention_begin_skip_owner_callstack;
> +
> +		owner_pid = BPF_CORE_READ(task, pid);
> +
> +		entries = bpf_map_lookup_elem(&owner_stacks_entries, &i);
> +		if (!entries)
> +			goto contention_begin_skip_owner_callstack;
> +		for (i = 0; i < max_stack; i++)
> +			entries[i] = 0x0;
> +
> +		task = bpf_task_from_pid(owner_pid);
> +		if (task) {
> +			bpf_get_task_stack(task, entries,
> +					   max_stack * sizeof(unsigned long),
> +					   0);
> +			bpf_task_release(task);
> +		}
> +
> +		data = bpf_map_lookup_elem(&contention_owner_tracing,
> +					   &(pelem->lock));

No need for parenthesis.

> +
> +		// Contention just happens, or corner case `lock` is owned by
> +		// process not `owner_pid`.
> +		if (!data || data->pid != owner_pid) {
> +			cotd first = {
> +				.pid = owner_pid,
> +				.timestamp = pelem->timestamp,
> +				.count = 1,
> +			};
> +			bpf_map_update_elem(&contention_owner_tracing,
> +					    &(pelem->lock), &first, BPF_ANY);
> +			bpf_map_update_elem(&contention_owner_stacks,
> +					    &(pelem->lock), entries, BPF_ANY);

Hmm.. it just discard the old owner data if it comes from a new owner?
Why not save the data into the result for the old lock/callstack?


> +		}
> +		// Contention is going on and new waiter joins.
> +		else {
> +			__sync_fetch_and_add(&data->count, 1);
> +			// TODO: Since for owner the callstack would change at
> +			// different time, We should check and report if the
> +			// callstack is different with the recorded one in
> +			// `contention_owner_stacks`.
> +		}
> +contention_begin_skip_owner_callstack:
>  		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
>  						  BPF_F_FAST_STACK_CMP | stack_skip);
>  		if (pelem->stack_id < 0)
> @@ -443,6 +501,7 @@ int contention_end(u64 *ctx)
>  	struct tstamp_data *pelem;
>  	struct contention_key key = {};
>  	struct contention_data *data;
> +	__u64 timestamp;
>  	__u64 duration;
>  	bool need_delete = false;
>  
> @@ -469,12 +528,103 @@ int contention_end(u64 *ctx)
>  			return 0;
>  		need_delete = true;
>  	}
> -	duration = bpf_ktime_get_ns() - pelem->timestamp;
> +	timestamp = bpf_ktime_get_ns();
> +	duration = timestamp - pelem->timestamp;
>  	if ((__s64)duration < 0) {
>  		__sync_fetch_and_add(&time_fail, 1);
>  		goto out;
>  	}
>  
> +	if (needs_callstack && lock_owner) {
> +		u64 owner_contention_time;
> +		unsigned long *owner_stack;
> +		struct contention_data *cdata;
> +		cotd *otdata;
> +
> +		otdata = bpf_map_lookup_elem(&contention_owner_tracing,
> +					     &(pelem->lock));
> +		owner_stack = bpf_map_lookup_elem(&contention_owner_stacks,
> +						  &(pelem->lock));
> +		if (!otdata || !owner_stack)
> +			goto contention_end_skip_owner_callstack;
> +
> +		owner_contention_time = timestamp - otdata->timestamp;
> +
> +		// Update `owner_lock_stat` if `owner_stack` is
> +		// available.
> +		if (owner_stack[0] != 0x0) {
> +			cdata = bpf_map_lookup_elem(&owner_lock_stat,
> +						    owner_stack);
> +			if (!cdata) {
> +				struct contention_data first = {
> +					.total_time = owner_contention_time,
> +					.max_time = owner_contention_time,
> +					.min_time = owner_contention_time,
> +					.count = 1,
> +					.flags = pelem->flags,
> +				};
> +				bpf_map_update_elem(&owner_lock_stat,
> +						    owner_stack, &first,
> +						    BPF_ANY);
> +			} else {
> +				__sync_fetch_and_add(&cdata->total_time,
> +						     owner_contention_time);
> +				__sync_fetch_and_add(&cdata->count, 1);
> +
> +				/* FIXME: need atomic operations */
> +				if (cdata->max_time < owner_contention_time)
> +					cdata->max_time = owner_contention_time;
> +				if (cdata->min_time > owner_contention_time)
> +					cdata->min_time = owner_contention_time;
> +			}
> +		}
> +
> +		//  No contention is going on, delete `lock` in
> +		//  `contention_owner_tracing` and
> +		//  `contention_owner_stacks`
> +		if (otdata->count <= 1) {
> +			bpf_map_delete_elem(&contention_owner_tracing,
> +					    &(pelem->lock));
> +			bpf_map_delete_elem(&contention_owner_stacks,
> +					    &(pelem->lock));
> +		}
> +		// Contention is still going on, with a new owner
> +		// (current task). `otdata` should be updated accordingly.
> +		else {
> +			(otdata->count)--;

No need for parenthesis, and it needs to be atomic dec.

> +
> +			// If ctx[1] is not 0, the current task terminates lock
> +			// waiting without acquiring it. Owner is not changed.

Please add a comment that ctx[1] has the return code of the lock
function.  Maybe it's better to use a local variable.

Also I think you need to say about the normal case too.  Returing 0
means the waiter now gets the lock and becomes a new owner.  So it needs
to update the owner information.


> +			if (ctx[1] == 0) {
> +				u32 i = 0;
> +				unsigned long *entries = bpf_map_lookup_elem(
> +					&owner_stacks_entries, &i);
> +				if (entries) {
> +					for (i = 0; i < (u32)max_stack; i++)
> +						entries[i] = 0x0;
> +
> +					bpf_get_task_stack(
> +						bpf_get_current_task_btf(),

Same as bpf_get_stack(), right?

> +						entries,
> +						max_stack *
> +							sizeof(unsigned long),
> +						0);
> +					bpf_map_update_elem(
> +						&contention_owner_stacks,
> +						&(pelem->lock), entries,
> +						BPF_ANY);

Please factor out the code if it indents too much.  Or you can use goto
or something to reduce the indentation level.

  if (ret != 0)
  	goto skip_update;

  ...

  if (entries == NULL)
  	goto skip_stack;

  ...

Thanks,
Namhyung

> +				}
> +
> +				otdata->pid = pid;
> +				otdata->timestamp = timestamp;
> +			}
> +
> +			bpf_map_update_elem(&contention_owner_tracing,
> +					    &(pelem->lock), otdata, BPF_ANY);
> +		}
> +	}
> +contention_end_skip_owner_callstack:
> +
>  	switch (aggr_mode) {
>  	case LOCK_AGGR_CALLER:
>  		key.stack_id = pelem->stack_id;
> -- 
> 2.47.1.688.g23fc6f90ad-goog
>
Re: [PATCH v2 2/4] perf lock: Retrieve owner callstack in bpf program
Posted by Chun-Tse Shao 1 year ago
On Mon, Jan 13, 2025 at 7:35 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Sun, Jan 12, 2025 at 09:20:15PM -0800, Chun-Tse Shao wrote:
> > Tracing owner callstack in `contention_begin()` and `contention_end()`,
> > and storing in `owner_lock_stat` bpf map.
> >
> > Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> > ---
> >  .../perf/util/bpf_skel/lock_contention.bpf.c  | 152 +++++++++++++++++-
> >  1 file changed, 151 insertions(+), 1 deletion(-)
> >
> > diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > index 05da19fdab23..3f47fbfa237c 100644
> > --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > @@ -7,6 +7,7 @@
> >  #include <asm-generic/errno-base.h>
> >
> >  #include "lock_data.h"
> > +#include <time.h>
> >
> >  /* for collect_lock_syms().  4096 was rejected by the verifier */
> >  #define MAX_CPUS  1024
> > @@ -178,6 +179,9 @@ int data_fail;
> >  int task_map_full;
> >  int data_map_full;
> >
> > +struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
> > +void bpf_task_release(struct task_struct *p) __ksym;
>
> To support old (ancient?) kernels, you can declare them as __weak and
> check if one of them is defined and ignore owner stacks on them.  Also
> you can check them in user space and turn off the option before loading.
>
> > +
> >  static inline __u64 get_current_cgroup_id(void)
> >  {
> >       struct task_struct *task;
> > @@ -407,6 +411,60 @@ int contention_begin(u64 *ctx)
> >       pelem->flags = (__u32)ctx[1];
> >
> >       if (needs_callstack) {
> > +             u32 i = 0;
> > +             int owner_pid;
> > +             unsigned long *entries;
> > +             struct task_struct *task;
> > +             cotd *data;
> > +
> > +             if (!lock_owner)
> > +                     goto contention_begin_skip_owner_callstack;
>
> Can be it 'skip_owner'?
>
> > +
> > +             task = get_lock_owner(pelem->lock, pelem->flags);
> > +             if (!task)
> > +                     goto contention_begin_skip_owner_callstack;
> > +
> > +             owner_pid = BPF_CORE_READ(task, pid);
> > +
> > +             entries = bpf_map_lookup_elem(&owner_stacks_entries, &i);
> > +             if (!entries)
> > +                     goto contention_begin_skip_owner_callstack;
> > +             for (i = 0; i < max_stack; i++)
> > +                     entries[i] = 0x0;
> > +
> > +             task = bpf_task_from_pid(owner_pid);
> > +             if (task) {
> > +                     bpf_get_task_stack(task, entries,
> > +                                        max_stack * sizeof(unsigned long),
> > +                                        0);
> > +                     bpf_task_release(task);
> > +             }
> > +
> > +             data = bpf_map_lookup_elem(&contention_owner_tracing,
> > +                                        &(pelem->lock));
>
> No need for parenthesis.
>
> > +
> > +             // Contention just happens, or corner case `lock` is owned by
> > +             // process not `owner_pid`.
> > +             if (!data || data->pid != owner_pid) {
> > +                     cotd first = {
> > +                             .pid = owner_pid,
> > +                             .timestamp = pelem->timestamp,
> > +                             .count = 1,
> > +                     };
> > +                     bpf_map_update_elem(&contention_owner_tracing,
> > +                                         &(pelem->lock), &first, BPF_ANY);
> > +                     bpf_map_update_elem(&contention_owner_stacks,
> > +                                         &(pelem->lock), entries, BPF_ANY);
>
> Hmm.. it just discard the old owner data if it comes from a new owner?
> Why not save the data into the result for the old lock/callstack?

There are two conditions which enter this if statement:
1. (!data) contention just started, `&pelem->lock` entry in
`contetion_owner_tracing` is empty.
2. (data->pid != owner_pid) Some internal errors so `data->pid` is
misaligned with `owner_pid`. In this case the timestamp would be
incorrect so I prefer to drop it.
WDYT?

>
>
> > +             }
> > +             // Contention is going on and new waiter joins.
> > +             else {
> > +                     __sync_fetch_and_add(&data->count, 1);
> > +                     // TODO: Since for owner the callstack would change at
> > +                     // different time, We should check and report if the
> > +                     // callstack is different with the recorded one in
> > +                     // `contention_owner_stacks`.
> > +             }
> > +contention_begin_skip_owner_callstack:
> >               pelem->stack_id = bpf_get_stackid(ctx, &stacks,
> >                                                 BPF_F_FAST_STACK_CMP | stack_skip);
> >               if (pelem->stack_id < 0)
> > @@ -443,6 +501,7 @@ int contention_end(u64 *ctx)
> >       struct tstamp_data *pelem;
> >       struct contention_key key = {};
> >       struct contention_data *data;
> > +     __u64 timestamp;
> >       __u64 duration;
> >       bool need_delete = false;
> >
> > @@ -469,12 +528,103 @@ int contention_end(u64 *ctx)
> >                       return 0;
> >               need_delete = true;
> >       }
> > -     duration = bpf_ktime_get_ns() - pelem->timestamp;
> > +     timestamp = bpf_ktime_get_ns();
> > +     duration = timestamp - pelem->timestamp;
> >       if ((__s64)duration < 0) {
> >               __sync_fetch_and_add(&time_fail, 1);
> >               goto out;
> >       }
> >
> > +     if (needs_callstack && lock_owner) {
> > +             u64 owner_contention_time;
> > +             unsigned long *owner_stack;
> > +             struct contention_data *cdata;
> > +             cotd *otdata;
> > +
> > +             otdata = bpf_map_lookup_elem(&contention_owner_tracing,
> > +                                          &(pelem->lock));
> > +             owner_stack = bpf_map_lookup_elem(&contention_owner_stacks,
> > +                                               &(pelem->lock));
> > +             if (!otdata || !owner_stack)
> > +                     goto contention_end_skip_owner_callstack;
> > +
> > +             owner_contention_time = timestamp - otdata->timestamp;
> > +
> > +             // Update `owner_lock_stat` if `owner_stack` is
> > +             // available.
> > +             if (owner_stack[0] != 0x0) {
> > +                     cdata = bpf_map_lookup_elem(&owner_lock_stat,
> > +                                                 owner_stack);
> > +                     if (!cdata) {
> > +                             struct contention_data first = {
> > +                                     .total_time = owner_contention_time,
> > +                                     .max_time = owner_contention_time,
> > +                                     .min_time = owner_contention_time,
> > +                                     .count = 1,
> > +                                     .flags = pelem->flags,
> > +                             };
> > +                             bpf_map_update_elem(&owner_lock_stat,
> > +                                                 owner_stack, &first,
> > +                                                 BPF_ANY);
> > +                     } else {
> > +                             __sync_fetch_and_add(&cdata->total_time,
> > +                                                  owner_contention_time);
> > +                             __sync_fetch_and_add(&cdata->count, 1);
> > +
> > +                             /* FIXME: need atomic operations */
> > +                             if (cdata->max_time < owner_contention_time)
> > +                                     cdata->max_time = owner_contention_time;
> > +                             if (cdata->min_time > owner_contention_time)
> > +                                     cdata->min_time = owner_contention_time;
> > +                     }
> > +             }
> > +
> > +             //  No contention is going on, delete `lock` in
> > +             //  `contention_owner_tracing` and
> > +             //  `contention_owner_stacks`
> > +             if (otdata->count <= 1) {
> > +                     bpf_map_delete_elem(&contention_owner_tracing,
> > +                                         &(pelem->lock));
> > +                     bpf_map_delete_elem(&contention_owner_stacks,
> > +                                         &(pelem->lock));
> > +             }
> > +             // Contention is still going on, with a new owner
> > +             // (current task). `otdata` should be updated accordingly.
> > +             else {
> > +                     (otdata->count)--;
>
> No need for parenthesis, and it needs to be atomic dec.
>
> > +
> > +                     // If ctx[1] is not 0, the current task terminates lock
> > +                     // waiting without acquiring it. Owner is not changed.
>
> Please add a comment that ctx[1] has the return code of the lock
> function.  Maybe it's better to use a local variable.
>
> Also I think you need to say about the normal case too.  Returing 0
> means the waiter now gets the lock and becomes a new owner.  So it needs
> to update the owner information.
>
>
> > +                     if (ctx[1] == 0) {
> > +                             u32 i = 0;
> > +                             unsigned long *entries = bpf_map_lookup_elem(
> > +                                     &owner_stacks_entries, &i);
> > +                             if (entries) {
> > +                                     for (i = 0; i < (u32)max_stack; i++)
> > +                                             entries[i] = 0x0;
> > +
> > +                                     bpf_get_task_stack(
> > +                                             bpf_get_current_task_btf(),
>
> Same as bpf_get_stack(), right?
>
> > +                                             entries,
> > +                                             max_stack *
> > +                                                     sizeof(unsigned long),
> > +                                             0);
> > +                                     bpf_map_update_elem(
> > +                                             &contention_owner_stacks,
> > +                                             &(pelem->lock), entries,
> > +                                             BPF_ANY);
>
> Please factor out the code if it indents too much.  Or you can use goto
> or something to reduce the indentation level.

I will reindent it with `ColumnLimit=100`. I was using 80 since it was
predefined in `.clang-format`, looks outdated but no one updated it..

>
>   if (ret != 0)
>         goto skip_update;
>
>   ...
>
>   if (entries == NULL)
>         goto skip_stack;
>
>   ...
>
> Thanks,
> Namhyung
>
> > +                             }
> > +
> > +                             otdata->pid = pid;
> > +                             otdata->timestamp = timestamp;
> > +                     }
> > +
> > +                     bpf_map_update_elem(&contention_owner_tracing,
> > +                                         &(pelem->lock), otdata, BPF_ANY);
> > +             }
> > +     }
> > +contention_end_skip_owner_callstack:
> > +
> >       switch (aggr_mode) {
> >       case LOCK_AGGR_CALLER:
> >               key.stack_id = pelem->stack_id;
> > --
> > 2.47.1.688.g23fc6f90ad-goog
> >
Re: [PATCH v2 2/4] perf lock: Retrieve owner callstack in bpf program
Posted by Namhyung Kim 1 year ago
On Tue, Jan 21, 2025 at 02:35:46PM -0800, Chun-Tse Shao wrote:
> On Mon, Jan 13, 2025 at 7:35 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Sun, Jan 12, 2025 at 09:20:15PM -0800, Chun-Tse Shao wrote:
> > > Tracing owner callstack in `contention_begin()` and `contention_end()`,
> > > and storing in `owner_lock_stat` bpf map.
> > >
> > > Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> > > ---
> > >  .../perf/util/bpf_skel/lock_contention.bpf.c  | 152 +++++++++++++++++-
> > >  1 file changed, 151 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > > index 05da19fdab23..3f47fbfa237c 100644
> > > --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > > +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > > @@ -7,6 +7,7 @@
> > >  #include <asm-generic/errno-base.h>
> > >
> > >  #include "lock_data.h"
> > > +#include <time.h>
> > >
> > >  /* for collect_lock_syms().  4096 was rejected by the verifier */
> > >  #define MAX_CPUS  1024
> > > @@ -178,6 +179,9 @@ int data_fail;
> > >  int task_map_full;
> > >  int data_map_full;
> > >
> > > +struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
> > > +void bpf_task_release(struct task_struct *p) __ksym;
> >
> > To support old (ancient?) kernels, you can declare them as __weak and
> > check if one of them is defined and ignore owner stacks on them.  Also
> > you can check them in user space and turn off the option before loading.
> >
> > > +
> > >  static inline __u64 get_current_cgroup_id(void)
> > >  {
> > >       struct task_struct *task;
> > > @@ -407,6 +411,60 @@ int contention_begin(u64 *ctx)
> > >       pelem->flags = (__u32)ctx[1];
> > >
> > >       if (needs_callstack) {
> > > +             u32 i = 0;
> > > +             int owner_pid;
> > > +             unsigned long *entries;
> > > +             struct task_struct *task;
> > > +             cotd *data;
> > > +
> > > +             if (!lock_owner)
> > > +                     goto contention_begin_skip_owner_callstack;
> >
> > Can be it 'skip_owner'?
> >
> > > +
> > > +             task = get_lock_owner(pelem->lock, pelem->flags);
> > > +             if (!task)
> > > +                     goto contention_begin_skip_owner_callstack;
> > > +
> > > +             owner_pid = BPF_CORE_READ(task, pid);
> > > +
> > > +             entries = bpf_map_lookup_elem(&owner_stacks_entries, &i);
> > > +             if (!entries)
> > > +                     goto contention_begin_skip_owner_callstack;
> > > +             for (i = 0; i < max_stack; i++)
> > > +                     entries[i] = 0x0;
> > > +
> > > +             task = bpf_task_from_pid(owner_pid);
> > > +             if (task) {
> > > +                     bpf_get_task_stack(task, entries,
> > > +                                        max_stack * sizeof(unsigned long),
> > > +                                        0);
> > > +                     bpf_task_release(task);
> > > +             }
> > > +
> > > +             data = bpf_map_lookup_elem(&contention_owner_tracing,
> > > +                                        &(pelem->lock));
> >
> > No need for parenthesis.
> >
> > > +
> > > +             // Contention just happens, or corner case `lock` is owned by
> > > +             // process not `owner_pid`.
> > > +             if (!data || data->pid != owner_pid) {
> > > +                     cotd first = {
> > > +                             .pid = owner_pid,
> > > +                             .timestamp = pelem->timestamp,
> > > +                             .count = 1,
> > > +                     };
> > > +                     bpf_map_update_elem(&contention_owner_tracing,
> > > +                                         &(pelem->lock), &first, BPF_ANY);
> > > +                     bpf_map_update_elem(&contention_owner_stacks,
> > > +                                         &(pelem->lock), entries, BPF_ANY);
> >
> > Hmm.. it just discard the old owner data if it comes from a new owner?
> > Why not save the data into the result for the old lock/callstack?
> 
> There are two conditions which enter this if statement:
> 1. (!data) contention just started, `&pelem->lock` entry in
> `contetion_owner_tracing` is empty.
> 2. (data->pid != owner_pid) Some internal errors so `data->pid` is
> misaligned with `owner_pid`. In this case the timestamp would be
> incorrect so I prefer to drop it.
> WDYT?

Ok, now I think that it should see contention_end from the earlier
waiter for the second case so it should be rare.  Probably ok to drop
it for now.

> 
> >
> >
> > > +             }
> > > +             // Contention is going on and new waiter joins.
> > > +             else {
> > > +                     __sync_fetch_and_add(&data->count, 1);
> > > +                     // TODO: Since for owner the callstack would change at
> > > +                     // different time, We should check and report if the
> > > +                     // callstack is different with the recorded one in
> > > +                     // `contention_owner_stacks`.
> > > +             }
> > > +contention_begin_skip_owner_callstack:
> > >               pelem->stack_id = bpf_get_stackid(ctx, &stacks,
> > >                                                 BPF_F_FAST_STACK_CMP | stack_skip);
> > >               if (pelem->stack_id < 0)
> > > @@ -443,6 +501,7 @@ int contention_end(u64 *ctx)
> > >       struct tstamp_data *pelem;
> > >       struct contention_key key = {};
> > >       struct contention_data *data;
> > > +     __u64 timestamp;
> > >       __u64 duration;
> > >       bool need_delete = false;
> > >
> > > @@ -469,12 +528,103 @@ int contention_end(u64 *ctx)
> > >                       return 0;
> > >               need_delete = true;
> > >       }
> > > -     duration = bpf_ktime_get_ns() - pelem->timestamp;
> > > +     timestamp = bpf_ktime_get_ns();
> > > +     duration = timestamp - pelem->timestamp;
> > >       if ((__s64)duration < 0) {
> > >               __sync_fetch_and_add(&time_fail, 1);
> > >               goto out;
> > >       }
> > >
> > > +     if (needs_callstack && lock_owner) {
> > > +             u64 owner_contention_time;
> > > +             unsigned long *owner_stack;
> > > +             struct contention_data *cdata;
> > > +             cotd *otdata;
> > > +
> > > +             otdata = bpf_map_lookup_elem(&contention_owner_tracing,
> > > +                                          &(pelem->lock));
> > > +             owner_stack = bpf_map_lookup_elem(&contention_owner_stacks,
> > > +                                               &(pelem->lock));
> > > +             if (!otdata || !owner_stack)
> > > +                     goto contention_end_skip_owner_callstack;
> > > +
> > > +             owner_contention_time = timestamp - otdata->timestamp;
> > > +
> > > +             // Update `owner_lock_stat` if `owner_stack` is
> > > +             // available.
> > > +             if (owner_stack[0] != 0x0) {
> > > +                     cdata = bpf_map_lookup_elem(&owner_lock_stat,
> > > +                                                 owner_stack);
> > > +                     if (!cdata) {
> > > +                             struct contention_data first = {
> > > +                                     .total_time = owner_contention_time,
> > > +                                     .max_time = owner_contention_time,
> > > +                                     .min_time = owner_contention_time,
> > > +                                     .count = 1,
> > > +                                     .flags = pelem->flags,
> > > +                             };
> > > +                             bpf_map_update_elem(&owner_lock_stat,
> > > +                                                 owner_stack, &first,
> > > +                                                 BPF_ANY);
> > > +                     } else {
> > > +                             __sync_fetch_and_add(&cdata->total_time,
> > > +                                                  owner_contention_time);
> > > +                             __sync_fetch_and_add(&cdata->count, 1);
> > > +
> > > +                             /* FIXME: need atomic operations */
> > > +                             if (cdata->max_time < owner_contention_time)
> > > +                                     cdata->max_time = owner_contention_time;
> > > +                             if (cdata->min_time > owner_contention_time)
> > > +                                     cdata->min_time = owner_contention_time;
> > > +                     }
> > > +             }
> > > +
> > > +             //  No contention is going on, delete `lock` in
> > > +             //  `contention_owner_tracing` and
> > > +             //  `contention_owner_stacks`
> > > +             if (otdata->count <= 1) {
> > > +                     bpf_map_delete_elem(&contention_owner_tracing,
> > > +                                         &(pelem->lock));
> > > +                     bpf_map_delete_elem(&contention_owner_stacks,
> > > +                                         &(pelem->lock));
> > > +             }
> > > +             // Contention is still going on, with a new owner
> > > +             // (current task). `otdata` should be updated accordingly.
> > > +             else {
> > > +                     (otdata->count)--;
> >
> > No need for parenthesis, and it needs to be atomic dec.
> >
> > > +
> > > +                     // If ctx[1] is not 0, the current task terminates lock
> > > +                     // waiting without acquiring it. Owner is not changed.
> >
> > Please add a comment that ctx[1] has the return code of the lock
> > function.  Maybe it's better to use a local variable.
> >
> > Also I think you need to say about the normal case too.  Returing 0
> > means the waiter now gets the lock and becomes a new owner.  So it needs
> > to update the owner information.
> >
> >
> > > +                     if (ctx[1] == 0) {
> > > +                             u32 i = 0;
> > > +                             unsigned long *entries = bpf_map_lookup_elem(
> > > +                                     &owner_stacks_entries, &i);
> > > +                             if (entries) {
> > > +                                     for (i = 0; i < (u32)max_stack; i++)
> > > +                                             entries[i] = 0x0;
> > > +
> > > +                                     bpf_get_task_stack(
> > > +                                             bpf_get_current_task_btf(),
> >
> > Same as bpf_get_stack(), right?
> >
> > > +                                             entries,
> > > +                                             max_stack *
> > > +                                                     sizeof(unsigned long),
> > > +                                             0);
> > > +                                     bpf_map_update_elem(
> > > +                                             &contention_owner_stacks,
> > > +                                             &(pelem->lock), entries,
> > > +                                             BPF_ANY);
> >
> > Please factor out the code if it indents too much.  Or you can use goto
> > or something to reduce the indentation level.
> 
> I will reindent it with `ColumnLimit=100`. I was using 80 since it was
> predefined in `.clang-format`, looks outdated but no one updated it..

Probably you can send a patch. :)

But it still holds the same, please try not to indent a lot (and user
shorter names).

Thanks,
Namhyung

> 
> >
> >   if (ret != 0)
> >         goto skip_update;
> >
> >   ...
> >
> >   if (entries == NULL)
> >         goto skip_stack;
> >
> >   ...
> >
> > Thanks,
> > Namhyung
> >
> > > +                             }
> > > +
> > > +                             otdata->pid = pid;
> > > +                             otdata->timestamp = timestamp;
> > > +                     }
> > > +
> > > +                     bpf_map_update_elem(&contention_owner_tracing,
> > > +                                         &(pelem->lock), otdata, BPF_ANY);
> > > +             }
> > > +     }
> > > +contention_end_skip_owner_callstack:
> > > +
> > >       switch (aggr_mode) {
> > >       case LOCK_AGGR_CALLER:
> > >               key.stack_id = pelem->stack_id;
> > > --
> > > 2.47.1.688.g23fc6f90ad-goog
> > >